class TestWordEmbeddings(unittest.TestCase):
    wf = WordFeatures()

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_is_science_term(self):
        self.assertFalse(DEBUG)
        science_terms = [
            "aardvarks",
            "ab initio",
            "center of curvature",
            "force",
            "gravity",
            "geo-science",
            "origins of the solar system",
            "atom",
            "protons",
            "seahorses",
            "newton's law of universal gravitation",
            "nucleus",
            "zwitterion",
            "zygomorphic",
            "zygomycetes",
            "zygospore",
        ]
        for word in science_terms:
            self.assertTrue(self.wf.is_science_term(word))

        not_science_terms = ["love", "beauty", "nice", "language", "glasses"]
        for word in not_science_terms:
            self.assertFalse(self.wf.is_science_term(word))

    def test_concreteness_ratings(self):
        self.assertFalse(DEBUG)
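        # Ratings follow a 1-5 concreteness scale; out-of-vocabulary words
        # ("sebi", "fasole") appear to fall back to the 2.5 midpoint.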
        to_check = {
            "roadsweeper": 4.85,
            "treeless": 4.24,
            "divisional": 2.04,
            "hopeful": 1.7,
            "essentialness": 1.04,
            "interpretively": 1.21,
            "traindriver": 4.54,
            "chocolaty": 3.45,
            "mathematical": 2.9,
            "baking soda": 5.0,
            "beach ball": 5.0,
            "birth certificate": 5.0,
            "adaptive": 1.97,
            "bucharest": 2.5,
            "soccer": 4.76,
            "sebi": 2.5,
            "fasole": 2.5
        }
        for word, r in to_check.items():
            self.assertAlmostEqual(self.wf.get_concretness_rating(word), r)

# Note: the enclosing class of this __init__ is not shown in the snippet;
# "FeatureWrapper" is an assumed name added so the method parses as written.
class FeatureWrapper:

    def __init__(self, data):

        # Word-level features module
        self.feat_word = WordFeatures()

        # Only run GENIA tagger if module is available
        if data and enabled['GENIA']:
            tagger = enabled['GENIA']
            self.feat_genia = GeniaFeatures(tagger, data)

        # Only create UMLS cache if module is available
        if enabled['UMLS']:
            self.feat_umls = UMLSFeatures()

        self.enabled_IOB_nonprose_sentence_features = []
        #self.enabled_IOB_nonprose_sentence_features.append('pos')
        #self.enabled_IOB_nonprose_sentence_features.append('pos_context')
        self.enabled_IOB_nonprose_sentence_features.append('prev')
        self.enabled_IOB_nonprose_sentence_features.append('next')
        self.enabled_IOB_nonprose_sentence_features.append('unigram_context')
        self.enabled_IOB_nonprose_sentence_features.append('UMLS')

        self.enabled_IOB_prose_sentence_features = []
        self.enabled_IOB_prose_sentence_features.append('unigram_context')
        self.enabled_IOB_prose_sentence_features.append('pos')
        self.enabled_IOB_prose_sentence_features.append('pos_context')
        self.enabled_IOB_prose_sentence_features.append('prev')
        self.enabled_IOB_prose_sentence_features.append('prev2')
        self.enabled_IOB_prose_sentence_features.append('next')
        self.enabled_IOB_prose_sentence_features.append('next2')
        self.enabled_IOB_prose_sentence_features.append('GENIA')
        self.enabled_IOB_prose_sentence_features.append('UMLS')
def test(weights_file=None):
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    test_data = read_data_from_json(TEST_DATA_PATH)

    if DEBUG:
        print_data_stats(test_data, "Test")

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))

    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(test_data)
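    # PMI statistics are fit on the test questions themselves; no separate
    # corpus is used here.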

    test_data, test_labels, _ = preprocess_data(test_data, tokenizer, wf,
                                                "test")

    embeddings_matrix = build_embeddings_matrix(tokenizer)

    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "test",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = test_data["question_input"].shape[0]
    y = model.predict(test_data)
    assert (y.shape[0] == num_tests)

    correct = 0
    total = 0
    exp_acc = 0.0
    lin_acc = 0.0
    for i in range(0, num_tests):
        predicted = np.argmax(y[i])
        expected = np.argmax(test_labels[i])

        # Treat y[i] as a distribution over the six essentiality classes
        # (0..5) and compare its expected value with the label's.
        # exp_acc penalizes deviations exponentially, lin_acc linearly;
        # both are 0 for a perfect prediction.
        value = np.dot(y[i], [0, 1, 2, 3, 4, 5])
        expected_value = np.dot(test_labels[i], [0, 1, 2, 3, 4, 5])
        exp_acc += (np.exp(abs(value - expected_value)) - 1.0)
        lin_acc += abs(value - expected_value)
        if predicted == expected:
            correct += 1
        total += 1
    assert (total == num_tests)
    print("\nEvaluated on {} terms.".format(total))
    print("Accuracy: {0:.3f}%".format(100 * correct / total))
    print("Exp accuracy: {0:.3f}".format(exp_acc / total))
    print("Linear accuracy: {0:.3f}".format(lin_acc / total))
def predict(entry, sort=False, weights_file=None, show_plot=True):
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    assert ("question" in entry)
    question = entry["question"]
    if "terms" not in entry:
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(question)
        entry["terms"] = {}
        for token in doc:
            entry["terms"][token.text] = 0
    data = [entry]

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(data))

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(data)
    data, _, words = preprocess_data(data, tokenizer, wf, "predict")

    embeddings_matrix = build_embeddings_matrix(tokenizer)

    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "predict",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)

    y = model.predict(data)

    # Expected essentiality per term, normalized from the 0..5 class scale
    # to [0, 1].
    essentiality = []
    for idx in range(len(entry["terms"])):
        value = np.dot(y[idx], [0, 1, 2, 3, 4, 5])
        essentiality.append(value / 5.0)

    if sort:
        zipped = list(zip(words, essentiality))
        zipped.sort(key=lambda x: x[1], reverse=True)
        words = [x[0] for x in zipped]
        essentiality = [x[1] for x in zipped]

    import matplotlib.pyplot as plt
    plt.bar(range(len(words)), essentiality, align='center')
    plt.title("Predicted values")
    plt.xticks(range(len(words)),
               words,
               rotation=45,
               horizontalalignment='right')
    plt.tight_layout()
    if show_plot:
        plt.show()
def plot_pmi_values(dataset="val", index=None, sort=False):
    assert (dataset in ["train", "val", "test"])
    data = None
    if dataset == "train":
        data = read_data_from_json(TRAIN_DATA_PATH)
    elif dataset == "val":
        data = read_data_from_json(VALIDATION_DATA_PATH)
    elif dataset == "test":
        data = read_data_from_json(TEST_DATA_PATH)
    assert (data is not None)
    if index is None:
        entry = random.choice(data)
    else:
        entry = data[index]

    # predict() may mutate the entry (e.g. add a "terms" dict), so save a
    # copy first and restore it afterwards for the PMI computation below.
    entry_copy = deepcopy(entry)
    predict(entry, sort=sort, show_plot=False)
    entry = entry_copy

    wf = WordFeatures()
    wf.train_PMI([entry])

    from pmi_utils import reduce_positive_avg
    values = []
    words = []
    for word in entry["terms"]:
        pmi_values = wf.get_PMI(word,
                                entry,
                                use_question=True,
                                reduce_f=reduce_positive_avg)
        values.append(pmi_values[1])
        words.append(word)

    if sort:
        zipped = list(zip(words, values))
        zipped.sort(key=lambda x: x[1], reverse=True)
        words = [x[0] for x in zipped]
        values = [x[1] for x in zipped]

    print("\nQuestion: {}\n".format(entry["question"]))

    import matplotlib.pyplot as plt
    plt.figure()
    plt.bar(range(len(words)), values, align='center')
    plt.title("PMI values")
    plt.xticks(range(len(words)),
               words,
               rotation=45,
               horizontalalignment='right')
    plt.tight_layout()
    plt.show()
def predict_batch(data, weights_file=None):
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(data))

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(data)
    data2, _, words = preprocess_data(data, tokenizer, wf, "predict_batch")

    embeddings_matrix = build_embeddings_matrix(tokenizer)

    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "predict_batch",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)

    y = model.predict(data2, batch_size=128)

    idx = 0
    out = []
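    # For each input entry, map every one of its terms to its expected
    # essentiality (class expectation normalized to [0, 1]).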
    for entry in data:
        out_set = {}
        for _ in entry["terms"]:
            value = np.dot(y[idx], [0, 1, 2, 3, 4, 5])
            word = words[idx]
            assert (word not in out_set)
            out_set[word] = value / 5.0
            idx += 1
        out.append(out_set)

    num_entries = 0
    for entry in data:
        num_entries += len(entry['terms'])
    assert (num_entries == idx)

    assert (len(data) == len(out))
    for i in range(0, len(data)):
        assert (len(out[i]) == len(data[i]["terms"]))
    for out_set in out:
        num_entries -= len(out_set)
    assert (num_entries == 0)

    return out
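# Example usage sketch for predict_batch (assumes the JSON data files and a
# trained checkpoint in models/ exist):
#   data = read_data_from_json(TEST_DATA_PATH)
#   scores = predict_batch(data)
#   # scores[i] maps each term of data[i] to an essentiality in [0, 1].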
def plot_F1_scores(dataset, weights_file=None):
    assert (dataset in ["val", "test"])
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))
    data = None
    if dataset == "val":
        data = read_data_from_json(VALIDATION_DATA_PATH)
    elif dataset == "test":
        data = read_data_from_json(TEST_DATA_PATH)
    assert (data is not None)

    if DEBUG:
        print_data_stats(data, "F1 scores data")

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(data))

    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(data)
    data, labels, words = preprocess_data(data, tokenizer, wf, "F1 scores")

    embeddings_matrix = build_embeddings_matrix(tokenizer)

    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "F1_scores_data",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = data["question_input"].shape[0]
    y = model.predict(data)
    assert (y.shape[0] == num_tests)

    threshold = 0.0
    f1 = []
    thresholds = []
    best_f1 = None
    best_threshold = None
    acc_at_max_f1 = None
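    # Sweep the binarization threshold over [0, 1] in 0.001 steps and
    # record the F1 score at each point.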
    while threshold <= 1.0:
        correct = 0
        total = 0
        true_positive = 0
        false_positive = 0
        true_negative = 0
        false_negative = 0
        for i in range(0, num_tests):
            # Expected value (treat y[i] as a random variable).
            value = np.dot(y[i], [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])

            if value >= threshold:
                predicted = 1
            else:
                predicted = 0

            # argmax over the one-hot label recovers the 0..5 class;
            # classes 3..5 count as essential (binary label 1).
            expected_value = np.argmax(labels[i])
            if expected_value >= 2.5:
                expected = 1
            else:
                expected = 0

            if predicted == expected:
                correct += 1

            if predicted == 1:
                if expected == 1:
                    true_positive += 1
                else:
                    false_positive += 1
            else:
                if expected == 0:
                    true_negative += 1
                else:
                    false_negative += 1
            total += 1

        assert (total == num_tests)
        assert (correct == true_positive + true_negative)
        # Precision or recall is undefined at this threshold; skip it.
        if (true_positive + false_positive == 0
                or true_positive + false_negative == 0):
            threshold += 0.001
            continue
        precision = 1.0 * true_positive / (true_positive + false_positive)
        recall = 1.0 * true_positive / (true_positive + false_negative)
        f1_score = 2.0 * precision * recall / (precision + recall)
        if best_f1 is None or f1_score > best_f1:
            best_f1 = f1_score
            best_threshold = threshold
            acc_at_max_f1 = 1.0 * correct / max(total, 1.0)
        f1.append(f1_score)
        thresholds.append(threshold)
        threshold += 0.001
    print("Best F1 score: {}, at t = {}".format(round(best_f1, 3),
                                                round(best_threshold, 4)))
    print("Accuracy at max F1: {}".format(round(acc_at_max_f1, 3)))

    import matplotlib.pyplot as plt
    plt.title("F1 score")
    plt.xlabel("Threshold")
    plt.ylabel("F1")
    plt.plot(thresholds, f1)
    plt.show()
def binary_test(weights_file=None):
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))

    test_data = read_data_from_json(TEST_DATA_PATH)
    # test_data = undersample_dataset(test_data, prob=0.84)

    if DEBUG:
        print_data_stats(test_data, "Test")

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))

    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(test_data)

    test_data, test_labels, words = preprocess_data(test_data, tokenizer, wf,
                                                    "test")

    embeddings_matrix = build_embeddings_matrix(tokenizer)

    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "test",
                         WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = test_data["question_input"].shape[0]
    y = model.predict(test_data)
    assert (y.shape[0] == num_tests)

    correct = 0
    total = 0
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    correct_confidence = 0.0
    wrong_confidence = 0.0
    false_positive_words = []
    for i in range(0, num_tests):
        # Expected value (treat y[i] as a random variable).
        value = np.dot(y[i], [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])

        if value >= 0.5:
            predicted = 1
        else:
            predicted = 0
        # Confidence is the probability mass the model placed on the
        # predicted side of the 6-class distribution.
        if predicted == 1:
            confidence = np.dot(y[i], [0, 0, 0, 1, 1, 1])
        else:
            confidence = np.dot(y[i], [1, 1, 1, 0, 0, 0])

        # Classes 3..5 of the one-hot label count as essential.
        expected_value = np.argmax(test_labels[i])
        if expected_value >= 2.5:
            expected = 1
        else:
            expected = 0

        if predicted == expected:
            correct += 1
            correct_confidence += confidence
        else:
            wrong_confidence += confidence

        if predicted == 1:
            if expected == 1:
                true_positive += 1
            else:
                false_positive += 1
                false_positive_words.append(words[i])
        else:
            if expected == 0:
                true_negative += 1
            else:
                false_negative += 1
        total += 1
    assert (total == num_tests)
    assert (correct == true_positive + true_negative)
    precision = 100.0 * true_positive / (true_positive + false_positive)
    recall = 100.0 * true_positive / (true_positive + false_negative)
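    # precision and recall are percentages here, so f1 is too; it is
    # rescaled to [0, 1] when printed below.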
    f1 = 2.0 * precision * recall / (precision + recall)
    print("")
    print("            |  Correct class  |")
    print("            |    1   |    0   |")
    print("Predicted 1 |{}  |{}  |".format(
        str(true_positive).rjust(6),
        str(false_positive).rjust(6)))
    print("Predicted 0 |{}  |{}  |".format(
        str(false_negative).rjust(6),
        str(true_negative).rjust(6)))

    print("\nEvaluated on {} terms.".format(total))
    print("Binary accuracy: {0:.3f}%".format(100 * correct / total))
    print("Precision: {0:.3f}%".format(precision))
    print("Recall: {0:.3f}%".format(recall))
    print("F1: {0:.3f}".format(f1 / 100.0))
    if correct >= 1:
        print("Correct confidence {0:.3f}%".format(100.0 * correct_confidence /
                                                   correct))
    if correct < total:
        print("Wrong confidence {0:.3f}%".format(100.0 * wrong_confidence /
                                                 (total - correct)))
    print("")
    random.shuffle(false_positive_words)
    print("Some false positive words: ", str(false_positive_words[:10]))
def train():
    train_data = read_data_from_json(TRAIN_DATA_PATH)
    val_data = read_data_from_json(VALIDATION_DATA_PATH)
    test_data = read_data_from_json(TEST_DATA_PATH)

    # train_data = undersample_dataset(train_data, prob=0.68)
    # val_data = undersample_dataset(val_data, prob=0.68)
    # test_data = undersample_dataset(test_data, prob=0.68)

    # train_data = train_data[:2]
    # val_data = val_data[:2]
    # test_data = test_data[:1]

    if DEBUG:
        print_data_stats(train_data, "Train")
        print_data_stats(val_data, "Validation")
        print_data_stats(test_data, "Test")
        if False:  # manual toggle: prints train/val and train/test overlap
            print(dataset_similarity(val_data, train_data))  # 0.5714%
            print(dataset_similarity(test_data, train_data))  # 2.112%

    # Tokenize data (rudimentary tokenizer).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(
        all_sentences(train_data) + all_sentences(val_data) +
        all_sentences(test_data))

    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict).
    wf = WordFeatures()
    wf.train_PMI(train_data + val_data + test_data)
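    # Note: PMI statistics are fit on train, validation and test questions
    # together.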

    train_data, train_labels, _ = preprocess_data(train_data, tokenizer, wf,
                                                  "train")
    val_data, val_labels, _ = preprocess_data(val_data, tokenizer, wf,
                                              "validation")
    test_data, test_labels, _ = preprocess_data(test_data, tokenizer, wf,
                                                "test")

    # Optionally equalize training label frequencies (disabled by default;
    # flip to True to enable).
    if False:
        from utils import equalize
        train_data, train_labels = equalize(train_data, train_labels)
        if DEBUG:
            print("Train data has been equalized. New freq: {}.".format(
                np.asarray(np.sum(train_labels, axis=0), dtype=np.int32)))
    # Optionally oversample under-represented labels (also disabled by
    # default).
    if False:
        from utils import oversample_dataset
        train_data, train_labels = oversample_dataset(train_data, train_labels,
                                                      [6000, 8000])
        if DEBUG:
            print("Train data has been oversampled. New freq: {}.".format(
                np.asarray(np.sum(train_labels, axis=0), dtype=np.int32)))

    embeddings_matrix = build_embeddings_matrix(tokenizer)

    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "train",
                         WORD_EMBEDDINGS_DIM)
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)

    filepath = "models/" + "model.{val_acc:.3f}-{epoch:03d}.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=0,
                                 mode='max',
                                 save_best_only=True,
                                 save_weights_only=True)
    model.fit(train_data,
              train_labels,
              batch_size=4000,
              epochs=450,
              verbose=2,
              validation_data=(val_data, val_labels),
              callbacks=[checkpoint])
    score = model.evaluate(test_data, test_labels, verbose=0)
    if score:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])