def test(weights_file=None):
    """Evaluate the saved model on the test set and print accuracy metrics.

    Args:
        weights_file: Path of the weights file to load. When None, the file
            returned by pick_best_model_from_dir() is used.

    Prints the exact-class accuracy and two error measures derived from the
    expectation of each output row over the class indices 0..5. Returns
    nothing.
    """
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    raw_entries = read_data_from_json(TEST_DATA_PATH)
    if DEBUG:
        print_data_stats(raw_entries, "Test")

    # The vocabulary is fitted on the test sentences themselves.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(raw_entries))
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Turn the raw entries into the model's input dict and label rows.
    word_features = WordFeatures()
    word_features.train_PMI(raw_entries)
    inputs, targets, _ = preprocess_data(
        raw_entries, tokenizer, word_features, "test")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    vocab_size = len(tokenizer.word_counts)
    model = define_model(
        vocab_size, embeddings_matrix, "test", WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = inputs["question_input"].shape[0]
    y = model.predict(inputs)
    assert y.shape[0] == num_tests

    # Each output row is treated as a distribution over these class indices.
    class_indices = [0, 1, 2, 3, 4, 5]
    hits = 0
    seen = 0
    exp_error_sum = 0.0
    abs_error_sum = 0.0
    for row in range(num_tests):
        distribution = y[row]
        target = targets[row]

        # Distance between predicted and labelled expected class index.
        gap = abs(np.dot(distribution, class_indices)
                  - np.dot(target, class_indices))
        exp_error_sum += (np.exp(gap) - 1.0)
        abs_error_sum += gap

        hits += int(np.argmax(distribution) == np.argmax(target))
        seen += 1
    assert seen == num_tests

    print("\nEvaluated on {} terms.".format(seen))
    print("Accuracy: {0:.3f}%".format(100 * hits / seen))
    print("Exp accuracy: {0:.3f}".format(exp_error_sum / seen))
    print("Linear accuracy: {0:.3f}".format(abs_error_sum / seen))
def predict(entry, sort=False, weights_file=None, show_plot=True):
    """Draw a bar chart of the predicted essentiality of one question's terms.

    Args:
        entry: Dict that must contain a "question" key. If it has no "terms"
            key, one is added IN PLACE: the question is tokenized with the
            spaCy "en_core_web_sm" pipeline and every token text is mapped
            to 0.
        sort: When true, bars are ordered by decreasing predicted value.
        weights_file: Path of the weights file to load. When None, the file
            returned by pick_best_model_from_dir() is used.
        show_plot: When false the chart is drawn on the current matplotlib
            figure but plt.show() is not called.

    Returns nothing; the result is only plotted.
    """
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    assert "question" in entry
    question = entry["question"]
    if "terms" not in entry:
        # No annotated terms supplied: score every token of the question.
        parsed = spacy.load("en_core_web_sm")(question)
        entry["terms"] = {token.text: 0 for token in parsed}

    batch = [entry]

    # The vocabulary is fitted on this single entry's sentences.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(batch))

    word_features = WordFeatures()
    word_features.train_PMI(batch)
    inputs, _, words = preprocess_data(
        batch, tokenizer, word_features, "predict")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    vocab_size = len(tokenizer.word_counts)
    model = define_model(
        vocab_size, embeddings_matrix, "predict", WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)

    y = model.predict(inputs)

    # Expected class index of each output row, rescaled by the top index 5.
    class_indices = [0, 1, 2, 3, 4, 5]
    essentiality = [
        np.dot(y[row], class_indices) / 5.0
        for row in range(len(entry["terms"]))
    ]

    if sort:
        ranked = sorted(zip(words, essentiality),
                        key=lambda pair: pair[1], reverse=True)
        words = [pair[0] for pair in ranked]
        essentiality = [pair[1] for pair in ranked]

    import matplotlib.pyplot as plt
    positions = range(len(words))
    plt.bar(positions, essentiality, align='center')
    plt.title("Predicted values")
    plt.xticks(positions, words, rotation=45, horizontalalignment='right')
    plt.tight_layout()
    if show_plot:
        plt.show()
def plot_pmi_values(dataset="val", index=None, sort=False):
    """Plot the model prediction and the PMI values for one dataset entry.

    Args:
        dataset: One of "train", "val" or "test"; selects which JSON file
            is read.
        index: Position of the entry inside the dataset. When None a random
            entry is picked.
        sort: When true, bars in both charts are ordered by decreasing value.

    The prediction chart is drawn first via predict(..., show_plot=False);
    the PMI chart goes on a new figure and a single plt.show() follows.
    """
    assert dataset in ["train", "val", "test"]
    path_by_dataset = {
        "train": TRAIN_DATA_PATH,
        "val": VALIDATION_DATA_PATH,
        "test": TEST_DATA_PATH,
    }
    data = read_data_from_json(path_by_dataset[dataset])
    assert data is not None

    entry = random.choice(data) if index is None else data[index]

    # predict() receives the original object; the PMI computation below
    # works on a copy taken before that call.
    pristine = deepcopy(entry)
    predict(entry, sort=sort, show_plot=False)
    entry = pristine

    word_features = WordFeatures()
    word_features.train_PMI([entry])

    from pmi_utils import reduce_positive_avg
    words = [word for word in entry["terms"]]
    # Element [1] of the get_PMI result is the value that gets plotted.
    values = [
        word_features.get_PMI(word, entry, use_question=True,
                              reduce_f=reduce_positive_avg)[1]
        for word in words
    ]

    if sort:
        ranked = sorted(zip(words, values),
                        key=lambda pair: pair[1], reverse=True)
        words = [pair[0] for pair in ranked]
        values = [pair[1] for pair in ranked]

    print("\nQuestion: {}\n".format(entry["question"]))

    import matplotlib.pyplot as plt
    positions = range(len(words))
    plt.figure()
    plt.bar(positions, values, align='center')
    plt.title("PMI values")
    plt.xticks(positions, words, rotation=45, horizontalalignment='right')
    plt.tight_layout()
    plt.show()
def predict_batch(data, weights_file=None):
    """Score the terms of many entries in a single model call.

    Args:
        data: Sequence of entries; each entry's "terms" collection determines
            how many consecutive output rows belong to it.
        weights_file: Path of the weights file to load. When None, the file
            returned by pick_best_model_from_dir() is used.

    Returns:
        A list with one dict per entry, in input order, mapping each word to
        the expected class index of its output row divided by 5.0.
    """
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    # The vocabulary is fitted on the sentences of the batch itself.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(data))

    word_features = WordFeatures()
    word_features.train_PMI(data)
    model_inputs, _, words = preprocess_data(
        data, tokenizer, word_features, "predict_batch")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    vocab_size = len(tokenizer.word_counts)
    model = define_model(
        vocab_size, embeddings_matrix, "predict_batch", WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)

    y = model.predict(model_inputs, batch_size=128)

    # Output rows and `words` are consumed in lockstep through one flat
    # cursor, entry by entry.
    class_indices = [0, 1, 2, 3, 4, 5]
    cursor = 0
    out = []
    for entry in data:
        scores = {}
        for _ in entry["terms"]:
            expectation = np.dot(y[cursor], class_indices)
            word = words[cursor]
            assert word not in scores
            scores[word] = expectation / 5.0
            cursor += 1
        out.append(scores)

    # Sanity checks: every term was consumed exactly once and each output
    # dict has as many words as its entry has terms.
    term_total = sum(len(entry['terms']) for entry in data)
    assert term_total == cursor
    assert len(data) == len(out)
    for entry, scores in zip(data, out):
        assert len(scores) == len(entry["terms"])
    assert term_total - sum(len(scores) for scores in out) == 0

    return out
def plot_F1_scores(dataset, weights_file=None):
    """Sweep the decision threshold over a dataset and plot the F1 curve.

    Each output row is reduced to a score in [0, 1] (its expectation over
    the class values 0.0, 0.2, ..., 1.0). A term is predicted positive when
    that score reaches the threshold, and is labelled positive when the
    argmax of its label row is class 3 or higher.

    Args:
        dataset: "val" or "test"; selects which JSON file is evaluated.
        weights_file: Path of the weights file to load. When None, the file
            returned by pick_best_model_from_dir() is used.

    Prints the best F1, its threshold and the accuracy at that threshold,
    then shows the F1-vs-threshold plot. If no threshold yields a defined
    F1 (no positive predictions or no positive labels anywhere), a message
    is printed and nothing is plotted.
    """
    assert (dataset in ["val", "test"])
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    data = None
    if dataset == "val":
        data = read_data_from_json(VALIDATION_DATA_PATH)
    elif dataset == "test":
        data = read_data_from_json(TEST_DATA_PATH)
    assert (data is not None)
    if DEBUG:
        print_data_stats(data, "F1 scores data")

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(data))
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(data)
    data, labels, _ = preprocess_data(data, tokenizer, wf, "F1 scores")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "F1_scores_data",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = data["question_input"].shape[0]
    y = model.predict(data)
    assert (y.shape[0] == num_tests)

    # Per-sample score and binary label do not depend on the threshold, so
    # compute them once instead of once per threshold.
    class_values = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    values = np.array(
        [np.dot(y[i], class_values) for i in range(num_tests)])
    # argmax is an integer class index; classes 3..5 count as positive.
    is_positive = np.array(
        [np.argmax(labels[i]) >= 3 for i in range(num_tests)], dtype=bool)

    f1 = []
    thresholds = []
    best_f1 = None
    best_threshold = None
    acc_at_max_f1 = None

    # Derive each threshold from an integer step: repeatedly adding 0.001
    # accumulates rounding error and makes the 1.0 endpoint unreliable.
    for step in range(1001):
        threshold = step / 1000.0
        predicted = values >= threshold

        true_positive = int(np.count_nonzero(predicted & is_positive))
        false_positive = int(np.count_nonzero(predicted & ~is_positive))
        false_negative = int(np.count_nonzero(~predicted & is_positive))
        true_negative = (num_tests - true_positive - false_positive
                         - false_negative)
        correct = true_positive + true_negative

        # Precision or recall undefined at this threshold: skip the point.
        if true_positive + false_positive == 0:
            continue
        if true_positive + false_negative == 0:
            continue

        precision = 1.0 * true_positive / (true_positive + false_positive)
        recall = 1.0 * true_positive / (true_positive + false_negative)
        if true_positive == 0:
            # precision == recall == 0; the harmonic mean would divide by
            # zero, so report F1 as 0 for this threshold.
            f1_score = 0.0
        else:
            f1_score = 2.0 * precision * recall / (precision + recall)

        if best_f1 is None or f1_score > best_f1:
            best_f1 = f1_score
            best_threshold = threshold
            acc_at_max_f1 = 1.0 * correct / max(num_tests, 1.0)

        f1.append(f1_score)
        thresholds.append(threshold)

    if best_f1 is None:
        print("No threshold produced a defined F1 score; nothing to plot.")
        return

    print("Best F1 score: {}, at t = {}".format(round(best_f1, 3),
                                                round(best_threshold, 4)))
    print("Accuracy at max F1: {}".format(round(acc_at_max_f1, 3)))

    import matplotlib.pyplot as plt
    plt.title("F1 score")
    plt.xlabel("Threshold")
    plt.ylabel("F1")
    plt.plot(thresholds, f1)
    plt.show()
def binary_test(weights_file=None):
    """Evaluate the model on the test set as a binary classifier.

    Each output row is reduced to a score in [0, 1] (its expectation over
    the class values 0.0, 0.2, ..., 1.0); a score of at least 0.5 is a
    positive prediction. A term is labelled positive when the argmax of its
    label row is class 3 or higher. "Confidence" is the output mass on the
    predicted side (classes 3..5 for positive, 0..2 for negative).

    Args:
        weights_file: Path of the weights file to load. When None, the file
            returned by pick_best_model_from_dir() is used.

    Prints the confusion matrix, accuracy, precision, recall, F1, mean
    confidences and up to ten randomly chosen false-positive words. Any
    metric whose denominator is zero is reported as 0 instead of raising
    ZeroDivisionError.
    """
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    test_data = read_data_from_json(TEST_DATA_PATH)
    # test_data = undersample_dataset(test_data, prob=0.84)
    if DEBUG:
        print_data_stats(test_data, "Test")

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(test_data)
    test_data, test_labels, words = preprocess_data(test_data, tokenizer, wf,
                                                    "test")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "test",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = test_data["question_input"].shape[0]
    y = model.predict(test_data)
    assert (y.shape[0] == num_tests)

    # Loop-invariant weight vectors, hoisted out of the per-sample loop.
    class_values = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    positive_mass = [0, 0, 0, 1, 1, 1]
    negative_mass = [1, 1, 1, 0, 0, 0]

    correct = 0
    total = 0
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    correct_confidence = 0.0
    wrong_confidence = 0.0
    false_positive_words = []
    for i in range(num_tests):
        # Expected value (treat y[i] as a random variable).
        value = np.dot(y[i], class_values)
        predicted = 1 if value >= 0.5 else 0

        if predicted == 1:
            confidence = np.dot(y[i], positive_mass)
        else:
            confidence = np.dot(y[i], negative_mass)

        # argmax is an integer class index; classes 3..5 count as positive.
        expected = 1 if np.argmax(test_labels[i]) >= 3 else 0

        if predicted == expected:
            correct += 1
            correct_confidence += confidence
        else:
            wrong_confidence += confidence

        if predicted == 1:
            if expected == 1:
                true_positive += 1
            else:
                false_positive += 1
                false_positive_words.append(words[i])
        else:
            if expected == 0:
                true_negative += 1
            else:
                false_negative += 1
        total += 1

    assert (total == num_tests)
    assert (correct == true_positive + true_negative)

    # Guard every ratio: a model that never predicts positive, a test set
    # with no positive labels, or an empty test set would otherwise divide
    # by zero.
    predicted_positives = true_positive + false_positive
    actual_positives = true_positive + false_negative
    if predicted_positives > 0:
        precision = 100.0 * true_positive / predicted_positives
    else:
        precision = 0.0
    if actual_positives > 0:
        recall = 100.0 * true_positive / actual_positives
    else:
        recall = 0.0
    if precision + recall > 0:
        f1 = 2.0 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    accuracy = 100.0 * correct / total if total > 0 else 0.0

    print("")
    print(" | Correct class |")
    print(" | 1 | 0 |")
    print("Predicted 1 |{} |{} |".format(
        str(true_positive).rjust(6), str(false_positive).rjust(6)))
    print("Predicted 0 |{} |{} |".format(
        str(false_negative).rjust(6), str(true_negative).rjust(6)))
    print("\nEvaluated on {} terms.".format(total))
    print("Binary accuracy: {0:.3f}%".format(accuracy))
    print("Precision: {0:.3f}%".format(precision))
    print("Recall: {0:.3f}%".format(recall))
    # precision and recall are percentages, so rescale F1 back to [0, 1].
    print("F1: {0:.3f}".format(f1 / 100.0))
    if correct >= 1:
        print("Correct confidence {0:.3f}%".format(
            100.0 * correct_confidence / correct))
    if correct < total:
        print("Wrong confidence {0:.3f}%".format(
            100.0 * wrong_confidence / (total - correct)))
    print("")

    random.shuffle(false_positive_words)
    print("Some false positive words: ", str(false_positive_words[:10]))
def train():
    """Train the model on the training set and report test loss/accuracy.

    The tokenizer vocabulary and the PMI statistics are both built from the
    train, validation and test entries together. During fitting, a weights
    checkpoint is written under "models/" whenever the monitored 'val_acc'
    improves, and a diagram of the model is written to 'model.png'.
    """
    raw_train = read_data_from_json(TRAIN_DATA_PATH)
    raw_val = read_data_from_json(VALIDATION_DATA_PATH)
    raw_test = read_data_from_json(TEST_DATA_PATH)

    # Optional experiments, disabled:
    # raw_train = undersample_dataset(raw_train, prob=0.68)
    # raw_val = undersample_dataset(raw_val, prob=0.68)
    # raw_test = undersample_dataset(raw_test, prob=0.68)
    # raw_train = raw_train[:2]
    # raw_val = raw_val[:2]
    # raw_test = raw_test[:1]

    if DEBUG:
        for entries, title in ((raw_train, "Train"),
                               (raw_val, "Validation"),
                               (raw_test, "Test")):
            print_data_stats(entries, title)

    if False:
        print(dataset_similarity(raw_val, raw_train))  # 0.5714%
        print(dataset_similarity(raw_test, raw_train))  # 2.112%

    # One shared vocabulary over all three splits.
    tokenizer = Tokenizer()
    corpus = (all_sentences(raw_train)
              + all_sentences(raw_val)
              + all_sentences(raw_test))
    tokenizer.fit_on_texts(corpus)
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # PMI statistics are likewise computed over all three splits.
    word_features = WordFeatures()
    word_features.train_PMI(raw_train + raw_val + raw_test)
    train_x, train_y, _ = preprocess_data(
        raw_train, tokenizer, word_features, "train")
    val_x, val_y, _ = preprocess_data(
        raw_val, tokenizer, word_features, "validation")
    test_x, test_y, _ = preprocess_data(
        raw_test, tokenizer, word_features, "test")

    # Disabled experiment: equalize training labels to the same frequency.
    if False:
        from utils import equalize
        train_x, train_y = equalize(train_x, train_y)
        if DEBUG:
            print("Train data has been equalized. New freq: {}.".format(
                np.asarray(np.sum(train_y, axis=0), dtype=np.int32)))

    # Disabled experiment: oversample the training set.
    if False:
        from utils import oversample_dataset
        train_x, train_y = oversample_dataset(train_x, train_y, [6000, 8000])
        if DEBUG:
            print("Train data has been oversampled. New freq: {}.".format(
                np.asarray(np.sum(train_y, axis=0), dtype=np.int32)))

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    vocab_size = len(tokenizer.word_counts)
    model = define_model(
        vocab_size, embeddings_matrix, "train", WORD_EMBEDDINGS_DIM)
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)

    # The checkpoint file name embeds the validation accuracy and epoch.
    checkpoint_dir = "models/"
    checkpoint_path = checkpoint_dir + "model.{val_acc:.3f}-{epoch:03d}.hdf5"
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_acc',
        verbose=0,
        mode='max',
        save_best_only=True,
        save_weights_only=True,
    )

    model.fit(
        train_x,
        train_y,
        batch_size=4000,
        epochs=450,
        verbose=2,
        validation_data=(val_x, val_y),
        callbacks=[checkpoint],
    )

    score = model.evaluate(test_x, test_y, verbose=0)
    if score:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])