print("preprocessing done...") return sequences, onehot_labels, tokenizer.word_index if __name__ == '__main__': for num in TOPIC_NUM: descriptions, labels = make_dataset(data_type=DATA_TYPE, topic_num=num, sample_balance=False) if MODEL_GROUP is 'keras': sequences, onehot_labels, word_index = data_transform(descriptions, labels, MAX_NUM_WORDS, SEQ_MAXLEN) # load glove embedding glove = h.load_glove(EMBEDDING_DIM) embedding_matrix = h.get_embedding_matrix(word_index, glove, MAX_NUM_WORDS, EMBEDDING_DIM) # # train test split X_train, X_val, y_train, y_val = \ train_test_split(descriptions, labels, test_size=0.2, random_state=42) # save validation_file_name = f'validation_split_{num:02d}_topics' validation_path = os.path.join(d.DATA_DIR, validation_file_name) pickle.dump([X_val, y_val], open(validation_path, 'wb')) # if MODEL_GROUP is 'keras': # for (model_type, layer_num) in Keras_models: # print(f'{model_type} training') # model_name = f'{model_type}_{num:02d}_{DATA_TYPE}' # model = k.build_DNN(embedding_matrix, SEQ_MAXLEN, # num, model_type=model_type)
# Fit a single tokenizer on the full training corpus (both question columns)
# so question1 and question2 share one vocabulary.
tokenizer = Tokenizer(num_words=Config.MAX_FEATURES)
tokenizer.fit_on_texts(list(train_df["question1"]) + list(train_df["question2"]))

# Convert the training questions to integer-id sequences.
list_tokenized_question1 = tokenizer.texts_to_sequences(X_train["question1"])
list_tokenized_question2 = tokenizer.texts_to_sequences(X_train["question2"])

# Pad/truncate every sequence to a fixed length for batching.
X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=Config.MAX_TEXT_LENGTH)
X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=Config.MAX_TEXT_LENGTH)

# list_tokenized_question1 = tokenizer.texts_to_sequences(X_validation.question1)
# list_tokenized_question2 = tokenizer.texts_to_sequences(X_validation.question2)
#
# X_val_q1 = pad_sequences(list_tokenized_question1, maxlen=Config.MAX_TEXT_LENGTH)
# X_val_q2 = pad_sequences(list_tokenized_question2, maxlen=Config.MAX_TEXT_LENGTH)

# Build (or load) the pretrained word-embedding matrix for this vocabulary.
embedding_matrix1 = get_embedding_matrix(tokenizer.word_index,
                                         Config.w2vpath,
                                         Config.embedding_matrix_path)

# Pack the two padded question tensors into the 'left'/'right' dict shape the
# siamese model expects.
X_train = {'left': X_train_q1, 'right': X_train_q2}

# Convert labels to their numpy representations.
Y_train = np.array(Y_train)

# Make sure everything is ok.
print(Y_train)
print(X_train['left'][0])

# X_validation = {'left':X_val_q1, 'right': X_val_q2}
# X_test = {'left': build_vocab_data(test_df.question1, word_to_id), 'right': build_vocab_data(test_df.question2, word_to_id)}