train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str, tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str, tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)

logging.info("Dataset loaded. Generating candidate keyphrases...")

train_candidates = chunker.extract_candidates_from_set(train_doc_str, tokenizer)
test_candidates = chunker.extract_candidates_from_set(test_doc_str, tokenizer)
val_candidates = chunker.extract_candidates_from_set(val_doc_str, tokenizer)

logging.debug("Candidates recall on training set   : %.4f", metrics.recall(train_answer, train_candidates))
logging.debug("Candidates recall on test set       : %.4f", metrics.recall(test_answer, test_candidates))
logging.debug("Candidates recall on validation set : %.4f", metrics.recall(val_answer, val_candidates))

train_pos = []
for answers in train_answer.values():
    for answer in answers:
        train_pos.append(nltk.pos_tag(answer))

test_pos = []
for answers in test_answer.values():
    for answer in answers:
        test_pos.append(nltk.pos_tag(answer))
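For reference, nltk.pos_tag operates on an already-tokenized phrase, so each gold keyphrase above yields a list of (token, tag) pairs. A standalone illustration (the tagger needs the averaged_perceptron_tagger model downloaded once; the exact tags may vary with the NLTK version):

import nltk

# One-time download of the default English POS tagger model.
nltk.download('averaged_perceptron_tagger', quiet=True)

# A tokenized keyphrase, as produced by the tokenizer above.
answer = ['recurrent', 'neural', 'networks']

# pos_tag returns one (token, tag) pair per token, e.g.
# [('recurrent', 'JJ'), ('neural', 'JJ'), ('networks', 'NNS')]
print(nltk.pos_tag(answer))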
train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix = preprocessing. \
    prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
                       max_document_length=MAX_DOCUMENT_LENGTH,
                       max_vocabulary_size=MAX_VOCABULARY_SIZE,
                       embeddings_size=EMBEDDINGS_SIZE)

check_randomness("After preprocess")

# weigh training examples: everything that's not class 0 (not kp)
# gets a heavier score
train_y_weights = np.argmax(train_y, axis=2)  # this removes the one-hot representation
train_y_weights[train_y_weights > 0] = KP_WEIGHT
train_y_weights[train_y_weights < 1] = 1

logging.info("Data preprocessing complete.")
logging.info("Maximum possible recall: %s",
             metrics.recall(test_answer,
                            postprocessing.get_words(test_doc, postprocessing.undo_sequential(test_y))))

if not SAVE_MODEL or not os.path.isfile(MODEL_PATH):
    logging.debug("Building the network...")
    model = Sequential()

    embedding_layer = Embedding(np.shape(embedding_matrix)[0],
                                EMBEDDINGS_SIZE,
                                weights=[embedding_matrix],
                                input_length=MAX_DOCUMENT_LENGTH,
                                trainable=False)

    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(150, activation='tanh', recurrent_activation='hard_sigmoid',
                                 return_sequences=True)))
    model.add(Dropout(0.25))
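A minimal, self-contained sketch of the per-token weighting computed above, with a toy train_y (10 stands in for the script's KP_WEIGHT constant). Note that Keras only accepts such 2-D, per-timestep sample weights when the model is compiled with sample_weight_mode='temporal':

import numpy as np

KP_WEIGHT = 10  # placeholder; the real value is a configuration constant of the script

# Toy one-hot labels: 1 document, 4 timesteps, 3 classes (class 0 = not a keyphrase token)
train_y = np.array([[[1, 0, 0],
                     [0, 1, 0],
                     [0, 0, 1],
                     [1, 0, 0]]])

weights = np.argmax(train_y, axis=2)   # collapse one-hot -> class ids: [[0, 1, 2, 0]]
weights[weights > 0] = KP_WEIGHT       # keyphrase tokens get the heavier weight
weights[weights < 1] = 1               # non-keyphrase tokens keep weight 1
print(weights)                         # [[ 1 10 10  1]]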
train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str, tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str, tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)

logging.info("Dataset loaded. Generating candidate keyphrases...")

train_candidates = chunker.extract_candidates_from_set(train_doc_str, tokenizer)
test_candidates = chunker.extract_candidates_from_set(test_doc_str, tokenizer)
val_candidates = chunker.extract_candidates_from_set(val_doc_str, tokenizer)

logging.debug("Candidates recall on training set   : %.4f", metrics.recall(train_answer, train_candidates))
logging.debug("Candidates recall on test set       : %.4f", metrics.recall(test_answer, test_candidates))
logging.debug("Candidates recall on validation set : %.4f", metrics.recall(val_answer, val_candidates))

logging.info("Candidates generated. Preprocessing data...")

train_x, train_y, test_x, test_y, val_x, val_y, val_x_b, val_y_b, embedding_matrix, dictionary = preprocessing. \
    prepare_answer_2(train_doc, train_answer, train_candidates,
                     test_doc, test_answer, test_candidates,
                     val_doc, val_answer, val_candidates,
                     max_document_length=MAX_DOCUMENT_LENGTH,
                     max_answer_length=MAX_ANSWER_LENGTH,
                     max_vocabulary_size=MAX_VOCABULARY_SIZE,
                     embeddings_size=EMBEDDINGS_SIZE)
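metrics.recall itself is defined elsewhere in the repository; the "candidates recall" it logs is an upper bound on the recall any model can reach, since a gold keyphrase missed by candidate generation can never be predicted. Under the assumption that it is a plain set-based recall micro-averaged over documents, it would look roughly like this (recall_set is a hypothetical name, and the real implementation may stem or normalize phrases first):

def recall_set(answers, candidates):
    """Fraction of gold keyphrases that also appear among the generated
    candidates, micro-averaged over all documents. Both arguments are
    dicts mapping a document id to a list of tokenized phrases."""
    matched = total = 0
    for doc_id, gold in answers.items():
        cand = {tuple(c) for c in candidates.get(doc_id, [])}
        matched += sum(1 for phrase in gold if tuple(phrase) in cand)
        total += len(gold)
    return matched / total if total else 0.0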
# weigh training examples: everything that's not class 0 (not kp)
# gets a heavier score
from sklearn.utils import class_weight

train_y_weights = np.argmax(train_y, axis=2)
train_y_weights = np.reshape(
    class_weight.compute_sample_weight('balanced', train_y_weights.flatten()),
    np.shape(train_y_weights))

logging.info("Data preprocessing complete.")
logging.info("Maximum possible recall: %s",
             metrics.recall(test_answer,
                            postprocessing.get_words(test_doc, postprocessing.undo_sequential(test_y)),
                            STEM_MODE))

if not SAVE_MODEL or not os.path.isfile(MODEL_PATH):
    logging.debug("Building the network...")
    model = Sequential()

    embedding_layer = Embedding(np.shape(embedding_matrix)[0],
                                EMBEDDINGS_SIZE,
                                weights=[embedding_matrix],
                                input_length=MAX_DOCUMENT_LENGTH,
                                trainable=False)

    model.add(embedding_layer)
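Unlike the fixed KP_WEIGHT scheme used earlier, scikit-learn's 'balanced' mode derives each class's weight as n_samples / (n_classes * count(class)), so the rare keyphrase tokens automatically receive larger weights. A toy demonstration:

import numpy as np
from sklearn.utils import class_weight

# Toy per-token class ids for 2 documents of 4 tokens each (0 = not a keyphrase)
labels = np.array([[0, 0, 1, 0],
                   [0, 1, 0, 0]])

flat = labels.flatten()
# 'balanced' assigns each class the weight n_samples / (n_classes * count(class)):
# class 0 -> 8 / (2 * 6) = 0.67, class 1 -> 8 / (2 * 2) = 2.0
sample_weights = class_weight.compute_sample_weight('balanced', flat)
print(np.reshape(sample_weights, labels.shape))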
train_doc_str, train_answer_str = data.load_train()
test_doc_str, test_answer_str = data.load_test()
val_doc_str, val_answer_str = data.load_validation()

train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str, tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str, tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)

logging.info("Dataset loaded. Generating candidate keyphrases...")

train_candidates = chunker.extract_candidates_from_set(train_doc_str, tokenizer)
test_candidates = chunker.extract_candidates_from_set(test_doc_str, tokenizer)
val_candidates = chunker.extract_candidates_from_set(val_doc_str, tokenizer)

logging.debug("Candidates recall on training set   : %.4f", metrics.recall(train_answer, train_candidates))
logging.debug("Candidates recall on test set       : %.4f", metrics.recall(test_answer, test_candidates))
logging.debug("Candidates recall on validation set : %.4f", metrics.recall(val_answer, val_candidates))

logging.info("Candidates generated. Preprocessing data...")

train_x, train_y, test_x, test_y, val_x, val_y, val_x_b, val_y_b, embedding_matrix, dictionary = preprocessing. \
    prepare_answer_2(train_doc, train_answer, train_candidates,
                     test_doc, test_answer, test_candidates,
                     val_doc, val_answer, val_candidates,
                     max_document_length=MAX_DOCUMENT_LENGTH,
                     max_answer_length=MAX_ANSWER_LENGTH,
                     max_vocabulary_size=MAX_VOCABULARY_SIZE,
                     embeddings_size=EMBEDDINGS_SIZE)

logging.info("Data preprocessing complete.")
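Whatever else prepare_answer_2 does internally, it has to cut or pad every document to MAX_DOCUMENT_LENGTH and every candidate answer to MAX_ANSWER_LENGTH before they can be batched. A minimal sketch of that step using Keras utilities (the values and sequences here are purely illustrative):

import numpy as np
from keras.preprocessing.sequence import pad_sequences

MAX_DOCUMENT_LENGTH = 8  # illustrative; the real values are configuration constants
MAX_ANSWER_LENGTH = 3

# Toy word-index sequences, as produced once the dictionary has been built
docs = [[4, 7, 2, 9, 1], [3, 3, 8]]
answers = [[7, 2], [8]]

# Pad short sequences with zeros and truncate long ones to the fixed lengths
doc_x = pad_sequences(docs, maxlen=MAX_DOCUMENT_LENGTH, padding='post', truncating='post')
ans_x = pad_sequences(answers, maxlen=MAX_ANSWER_LENGTH, padding='post', truncating='post')
print(doc_x.shape, ans_x.shape)  # (2, 8) (2, 3)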