Example #1
    # Return the padded sequences, one-hot encoded labels, and the fitted tokenizer's word index
    print("preprocessing done...")
    return sequences, onehot_labels, tokenizer.word_index


if __name__ == '__main__':
    for num in TOPIC_NUM:
        descriptions, labels = make_dataset(data_type=DATA_TYPE,
                                            topic_num=num,
                                            sample_balance=False)

        if MODEL_GROUP == 'keras':
            sequences, onehot_labels, word_index = data_transform(descriptions, labels,
                                                                  MAX_NUM_WORDS, SEQ_MAXLEN)
            # load glove embedding
            glove = h.load_glove(EMBEDDING_DIM)
            embedding_matrix = h.get_embedding_matrix(word_index, glove,
                                                      MAX_NUM_WORDS, EMBEDDING_DIM)

        # train/test split
        X_train, X_val, y_train, y_val = \
            train_test_split(descriptions, labels, test_size=0.2, random_state=42)
        # save the validation split for later evaluation
        validation_file_name = f'validation_split_{num:02d}_topics'
        validation_path = os.path.join(d.DATA_DIR, validation_file_name)
        with open(validation_path, 'wb') as f:
            pickle.dump([X_val, y_val], f)

        # if MODEL_GROUP == 'keras':
        #     for (model_type, layer_num) in Keras_models:
        #         print(f'{model_type} training')
        #         model_name = f'{model_type}_{num:02d}_{DATA_TYPE}'
        #         model = k.build_DNN(embedding_matrix, SEQ_MAXLEN,
        #                             num, model_type=model_type)
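
# Hedged sketch (not the project's actual code): the h.load_glove / h.get_embedding_matrix
# helpers used above are assumed to behave roughly like this, mapping each word index
# produced by the tokenizer to its GloVe vector and leaving out-of-vocabulary rows as zeros.
import numpy as np

def get_embedding_matrix_sketch(word_index, glove, max_num_words, embedding_dim):
    """Build a (num_words, embedding_dim) matrix aligned with the tokenizer indices."""
    num_words = min(max_num_words, len(word_index) + 1)
    matrix = np.zeros((num_words, embedding_dim))
    for word, i in word_index.items():
        if i < num_words and word in glove:  # glove: dict mapping word -> vector
            matrix[i] = glove[word]
    return matrix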
Example #2

# Imports required by this snippet (Config, train_df, X_train, Y_train, get_embedding_matrix
# are assumed to be defined elsewhere in the project)
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=Config.MAX_FEATURES)
tokenizer.fit_on_texts(list(train_df["question1"]) + list(train_df["question2"]))

list_tokenized_question1 = tokenizer.texts_to_sequences(X_train["question1"])
list_tokenized_question2 = tokenizer.texts_to_sequences(X_train["question2"])
X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=Config.MAX_TEXT_LENGTH)
X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=Config.MAX_TEXT_LENGTH)
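
# Quick sanity check of the tokenization + padding step above on toy input (illustrative
# only): texts_to_sequences maps each word to the integer id learned by fit_on_texts, and
# pad_sequences pads/truncates every row to Config.MAX_TEXT_LENGTH, so both question
# matrices share one fixed shape.
toy_sequences = tokenizer.texts_to_sequences(["how are you", "what is your name"])
print(pad_sequences(toy_sequences, maxlen=Config.MAX_TEXT_LENGTH).shape)  # -> (2, Config.MAX_TEXT_LENGTH)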

# list_tokenized_question1 = tokenizer.texts_to_sequences(X_validation.question1)
# list_tokenized_question2 = tokenizer.texts_to_sequences(X_validation.question2)
#
# X_val_q1 = pad_sequences(list_tokenized_question1, maxlen=Config.MAX_TEXT_LENGTH)
# X_val_q2 = pad_sequences(list_tokenized_question2, maxlen=Config.MAX_TEXT_LENGTH)
embedding_matrix1 = get_embedding_matrix(tokenizer.word_index, Config.w2vpath, Config.embedding_matrix_path)


# Split to dicts
X_train = {'left': X_train_q1, 'right': X_train_q2}
# Convert labels to their numpy representations
Y_train = np.array(Y_train)
print(Y_train)
print(X_train['left'][0])
# X_validation = {'left':X_val_q1, 'right': X_val_q2}
# X_test = {'left': build_vocab_data(test_df.question1, word_to_id), 'right': build_vocab_data(test_df.question2, word_to_id)}

# Make sure everything is ok
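# A plausible completion of the check hinted at above: the two padded question matrices
# should line up row-for-row with each other and with the labels.
assert X_train['left'].shape == X_train['right'].shape
assert X_train['left'].shape[0] == Y_train.shape[0]

# Hedged sketch of how the {'left', 'right'} dict is assumed to be consumed downstream:
# a two-input (siamese-style) Keras model whose Input layers are named 'left' and 'right'
# can take X_train directly in fit(). The layers and sizes below are placeholders, not the
# project's actual architecture.
from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

left_in = Input(shape=(Config.MAX_TEXT_LENGTH,), name='left')
right_in = Input(shape=(Config.MAX_TEXT_LENGTH,), name='right')
embed = Embedding(input_dim=embedding_matrix1.shape[0],
                  output_dim=embedding_matrix1.shape[1],
                  weights=[embedding_matrix1],
                  trainable=False)
encoder = LSTM(50)  # shared encoder: the same weights process both questions
merged = concatenate([encoder(embed(left_in)), encoder(embed(right_in))])
output = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[left_in, right_in], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(X_train, Y_train, validation_split=0.1, epochs=1, batch_size=64)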