Python pre_process_data示例

编程语言: Python

命名空间/包名称: preprocess

方法/功能: pre_process_data

hotexamples.com的示例: 4

Python pre_process_data - 共找到4个示例。以下是从开源项目中提取的、评价最高的 preprocess.pre_process_data 真实 Python 代码示例。您可以为示例评分，帮助我们提高示例质量。

示例#1

显示文件

def get_strings(FILE_NAMES, b_dep):
    """Build one padded item list per user from that user's text files.

    Each file is read in full as UTF-8, passed through
    ``ppc.pre_process_data``, and the resulting items of all files of one
    user are concatenated into a single list.

    Args:
        FILE_NAMES: Iterable of users; each element is an iterable of file
            paths belonging to one user.
        b_dep: If equal to 1, the pre-padding length of a short user is
            appended to ``LENGTHS_DEPRESSION``; for any other value it is
            appended to ``LENGTHS_CONTROL``.

    Returns:
        A list containing one item list per kept user. Users with fewer
        than 1000 items are dropped (a message is printed), and collection
        stops once 1000 users have been kept.
    """
    FILE_STRINGS = []
    for files in FILE_NAMES:
        string_user = []
        for file in files:
            # The context manager closes the file; no explicit close needed.
            with open(file, 'r', encoding="utf-8") as f:
                all_lines = f.read()
            string_user.extend(ppc.pre_process_data(all_lines))
        if len(string_user) < 1000:
            print("User Omitted: Too Few Records")
            continue
        if len(string_user) < MAX_TENSOR_LENGTH:
            # Record the real length before padding with empty strings.
            if b_dep == 1:
                LENGTHS_DEPRESSION.append(len(string_user))
            else:
                LENGTHS_CONTROL.append(len(string_user))
            string_user += [''] * (MAX_TENSOR_LENGTH - len(string_user))
        # NOTE(review): users with MAX_TENSOR_LENGTH items or more are kept
        # untruncated and their length is not recorded — confirm this is
        # intended.
        FILE_STRINGS.append(string_user)
        # Cap the result at 1000 users.
        if len(FILE_STRINGS) > 999:
            break
    return FILE_STRINGS

示例#2

显示文件

文件： cnn_toxic.py 项目： dimoynwa/NLPTasks

    x = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dense(conf['targets'].shape[1],
                              activation='sigmoid')(x)

    model = tf.keras.Model(input, x)

    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model


# Script entry point: preprocess the data, build the model, train it, save it
# and plot the training/validation loss.
if __name__ == "__main__":
    # `config` is indexed below with the keys 'data' and 'targets'.
    config = pre_process_data()
    model = build_model(config)
    print('Training the model...')

    # EPOCHS, VALIDATION_SPLIT and BATCH_SIZE are not defined in this snippet;
    # presumably module-level constants — TODO confirm.
    history = model.fit(x=config['data'],
                        y=config['targets'],
                        epochs=EPOCHS,
                        validation_split=VALIDATION_SPLIT,
                        batch_size=BATCH_SIZE)
    print('Training done.')

    # NOTE(review): presumably ./saved_models must exist before saving —
    # confirm, or create it beforehand.
    model.save('./saved_models/cnn_toxic.h5')

    # Plot the 'loss' and 'val_loss' series from the fit history.
    # NOTE(review): `plt` is presumably matplotlib.pyplot; no plt.show() or
    # savefig is visible in this snippet — confirm the figure is displayed.
    plt.plot(history.history['loss'], label='Loss')
    plt.plot(history.history['val_loss'], label='Validation loss')
    plt.legend()

示例#3

显示文件

# All of a user's tweets are flattened into one sequence. The loop below
# currently builds that sequence as a list of pre-processed items; the
# string join is commented out.
depression_lengths = []  # pre-padding lengths of users shorter than MAX_TENSOR_LENGTH
control_lengths = []  # not written in the visible part of this script
FILE_STRINGS = []  # one sequence per kept depression user
FILE_STRINGS_CONTROL = []  # not written in the visible part of this script
count = 0  # number of users appended to FILE_STRINGS so far
for files in FILE_NAMES_DEPRESSION:
    string_user = []
    for file in files:
        tweet = ""
        all_lines = ""
        with open(file, 'r', encoding="utf-8") as f:
            for line in f:
                all_lines += line
            f.close()
        pre_processed_tokens = ppc.pre_process_data(all_lines)
        #print(pre_processed_tokens) #debugging line
        for ppt in pre_processed_tokens:
            string_user.append(ppt)
        #string_user.append(pre_processed_tokens)
    #string_user = ''.join(string_user)
    if len(string_user) < 1000:
        print("User Omitted: Too Few Records")
        continue
    if len(string_user) < MAX_TENSOR_LENGTH:
        depression_lengths.append(len(string_user))
        string_user += [''] * (MAX_TENSOR_LENGTH - len(string_user))
    #print(len(string_user))
    count = count + 1
    FILE_STRINGS.append(string_user)
    if count > 999:

示例#4

显示文件

文件： single_tweet.py 项目： BeastlyBrosif/dissertation1920

# Load every usable tweet line from the depression users' files, pre-process
# it and store it in TWEETS_DEPRESSION as a list padded with '' up to 280
# entries. `count` tracks how many tweets have been stored.
FILE_STRINGS = []
FILE_STRINGS_CONTROL = []
count = 0
for files in FILE_NAMES_DEPRESSION:
    # NOTE(review): string_user, tweet, all_lines and all_tweets are never
    # read in this loop; they are kept only because later module-level code
    # may read them — confirm and delete.
    string_user = []
    for file in files:
        #print("Loaded tweets from depression: ", count, end='\r')
        tweet = ""
        all_lines = ""
        all_tweets = []
        # The context manager closes the file, including on `break`.
        with open(file, 'r', encoding="utf-8") as f:
            for line in f:
                # Keep only lines longer than 5 and shorter than 280 characters.
                if not 5 < len(line) < 280:
                    continue
                # Pre-process the lines as they come in.
                pps = ppc.pre_process_data(line)
                if len(pps) > 1:
                    # Pad to a fixed width of 280 entries.
                    pps += [''] * (280 - len(pps))
                    TWEETS_DEPRESSION.append(pps)
                    count = count + 1
        # NOTE(review): this only leaves the per-file loop; the outer loop
        # still reads the first file of every remaining user — confirm
        # whether debug mode should stop loading entirely.
        if DEBUG_MODE == 1 and count > 10000:
            break
# Reset the tweet counter before the control users are loaded.
count = 0
# print('\n') emits the newline in the string plus print's own line ending.
print('\n')
print("Importing Control Users")
for files in FILE_NAMES_CONTROL:
    string_user = []
    for file in files: