def test_compute_elmo_embeddings_contains_empty_sentence(self):
    """Check the ELMo output shape when one of the two input sentences is empty."""
    sentences = ['Sentence one.', '']
    elmo_embedder = Embedder(sentences)
    with warnings.catch_warnings():
        # Averaging an empty vector emits an expected RuntimeWarning;
        # silence it so it is not printed while the test runs.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        sentence_vectors, _ = elmo_embedder.compute_elmo_embeddings()
    expected_shape = (2, 1024)
    assert sentence_vectors.shape == expected_shape
 def test_add_embeddings_to_corpus_df_from_csv(self, list_of_sentences, expected_df_with_embeddings):
     """Add an in-memory embeddings array to a corpus that is given as a CSV file path."""
     corpus_embedder = Embedder(list_of_sentences)
     csv_path = os.path.join(os.getenv("FIXTURES_DIR"), 'dummy_sentences.csv')
     dummy_matrix = np.array(((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)))
     actual_df = corpus_embedder.add_embeddings_to_corpus_df(csv_path, dummy_matrix, 'dummy_embeddings')
     # Columns are sorted on both sides so that column order cannot fail the comparison.
     expected_sorted = expected_df_with_embeddings.sort_index(axis=1)
     actual_sorted = actual_df.sort_index(axis=1)
     pdt.assert_frame_equal(expected_sorted, actual_sorted)
 def test_add_embeddings_to_corpus_df_with_emb_from_list_raises_error(self, list_of_sentences):
     """Passing the embeddings as a plain Python list must raise ``TypeError``.

     All arguments are built *before* entering ``pytest.raises`` so that only the
     call under test can satisfy the expectation. ``os.getenv("FIXTURES_DIR")``
     returns ``None`` when the variable is unset, and ``os.path.join(None, ...)``
     raises ``TypeError`` on its own; evaluated inside the context manager, that
     unrelated error would let the test pass without ever reaching
     ``add_embeddings_to_corpus_df``.
     """
     embedder = Embedder(list_of_sentences)
     corpus_path = os.path.join(os.getenv("FIXTURES_DIR"), 'dummy_sentences.csv')
     # A list wrapping a tuple of arrays, i.e. neither an ndarray nor a file path.
     embeddings_as_list = [(np.array((1.0, 2.0, 3.0)), np.array((4.0, 5.0, 6.0)), np.array((7.0, 8.0, 9.0)))]
     with pytest.raises(TypeError):
         embedder.add_embeddings_to_corpus_df(corpus_path, embeddings_as_list, 'dummy_embeddings')
 def test_add_embeddings_to_corpus_df_with_emb_from_npy(self, list_of_sentences, expected_df_with_embeddings):
     """Add embeddings given as a path to a ``.npy`` file to a corpus given as a DataFrame."""
     corpus_embedder = Embedder(list_of_sentences)
     corpus_df = pd.DataFrame({'dummy_sentences': ['First sentence.', 'Second sentence.', 'Third sentence.']})
     npy_path = os.path.join(os.getenv("FIXTURES_DIR"), 'dummy_embeddings.npy')
     actual_df = corpus_embedder.add_embeddings_to_corpus_df(corpus_df, npy_path, 'dummy_embeddings')
     # Columns are sorted on both sides so that column order cannot fail the comparison.
     expected_sorted = expected_df_with_embeddings.sort_index(axis=1)
     actual_sorted = actual_df.sort_index(axis=1)
     pdt.assert_frame_equal(expected_sorted, actual_sorted)
 def test_add_embeddings_to_corpus_df_from_df(self, list_of_sentences, expected_df_with_embeddings):
     """Add an in-memory embeddings array to a corpus that is given as a DataFrame."""
     corpus_embedder = Embedder(list_of_sentences)
     corpus_df = pd.DataFrame({'dummy_sentences': ['First sentence.', 'Second sentence.', 'Third sentence.']})
     dummy_matrix = np.array(((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)))
     actual_df = corpus_embedder.add_embeddings_to_corpus_df(corpus_df, dummy_matrix, 'dummy_embeddings')
     # Columns are sorted on both sides so that column order cannot fail the comparison.
     expected_sorted = expected_df_with_embeddings.sort_index(axis=1)
     actual_sorted = actual_df.sort_index(axis=1)
     pdt.assert_frame_equal(expected_sorted, actual_sorted)
    def test_compute_word2vec_embeddings_when_tfidf_weights_is_false(
            self, list_of_sentences, expected_w2v_embeddings_tfidf_false
    ):
        """Compare unweighted Word2Vec embeddings against the stored expectation, element for element.

        The comparison is exact, so the run has to be reproducible: PYTHONHASHSEED
        is checked up front and the call pins ``workers=1``, ``seed=42`` and the
        built-in ``hash`` as hash function.
        """
        hash_seed = os.getenv("PYTHONHASHSEED")
        assert hash_seed == "123", (
            'Please set PYTHONHASHSEED environment variable to 123, or else the test will not be deterministically '
            'reproducible.'
        )

        w2v_embedder = Embedder(list_of_sentences)
        vectors, _, _ = w2v_embedder.compute_word2vec_embeddings(
            tfidf_weights=False, workers=1, seed=42, hashfxn=hash
        )
        n_sentences = len(list_of_sentences)
        assert n_sentences == len(vectors)
        assert vectors.shape == (n_sentences, 300)
        np.testing.assert_array_equal(expected_w2v_embeddings_tfidf_false, vectors)
def test_end_to_end_runner():
    """Run scraping, sentence splitting, embedding and querying on the fixture documents.

    Four embedding columns (TF-IDF weighted Word2Vec, plain Word2Vec, BERT and
    ELMo) are added to the sentence DataFrame. Two queries are then scored with
    every model, and for each resulting distance column the sentence that ranks
    first after an ascending sort is checked.
    """
    first_sentence = 'Mr Michael went to the store to buy some eggs.'
    new_york_sentence = "Take a look, then, at Tuesday's elections in New York City, New Jersey and Virginia:"

    # --- Build the sentence-level corpus from the fixture documents. ---
    fixtures_dir = os.getenv("FIXTURES_DIR")
    scraper = DocumentScraper(fixtures_dir, os.path.join(fixtures_dir, 'words_to_replace.json'))
    df_by_sentence = CorpusGenerator(scraper.document_corpus_to_pandas_df()).df_by_page_to_df_by_sentence()
    list_of_sentences = df_by_sentence['sentence'].values.tolist()
    assert list_of_sentences == [
        first_sentence,
        'Joel rolled down the street on his skateboard.',
        'test / this is a first sentence',
        new_york_sentence,
    ]

    # --- Compute every embedding and append it as a column (same order as before). ---
    embedder = Embedder(list_of_sentences)

    vectors, w2v_tfidf, tfidf_vectorizer = embedder.compute_word2vec_embeddings(tfidf_weights=True)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(df_by_sentence, vectors, 'Word2Vec_with_TfIdf_weights')

    vectors, w2v, _ = embedder.compute_word2vec_embeddings(tfidf_weights=False)
    df_by_sentence = embedder.add_embeddings_to_corpus_df(df_by_sentence, vectors, 'Word2Vec')

    vectors, bert = embedder.compute_bert_embeddings('bert-base-nli-stsb-mean-tokens')
    df_by_sentence = embedder.add_embeddings_to_corpus_df(df_by_sentence, vectors, 'BERT')

    vectors, elmo = embedder.compute_elmo_embeddings()
    df_by_sentence = embedder.add_embeddings_to_corpus_df(df_by_sentence, vectors, 'ELMo_layer_3')

    # (embeddings column, model name, model object, extra keyword arguments)
    model_specs = (
        ('Word2Vec', 'Word2Vec', w2v, {}),
        ('Word2Vec_with_TfIdf_weights', 'Word2Vec_TfIdf_weighted', w2v_tfidf,
         {'tfidf_vectorizer': tfidf_vectorizer}),
        ('ELMo_layer_3', 'ELMo', elmo, {}),
        ('BERT', 'BERT', bert, {}),
    )
    # Distance column names, listed in the same order as ``model_specs``.
    metrics_test1 = (
        'w2v_distance_test1',
        'w2v_tfidf_weighted_distance_test1',
        'elmo_distance_test1',
        'bert_distance_test1',
    )
    metrics_test2 = (
        'w2v_distance_test2',
        'w2v_tfidf_weighted_distance_test2',
        'elmo_distance_test2',
        'bert_distance_test2',
    )

    def closest_sentence(frame, metric):
        """Sort ``frame`` in place by ``metric`` (ascending), renumber its rows, return the top sentence."""
        frame.sort_values(metric, ascending=True, inplace=True)
        frame.reset_index(inplace=True, drop=True)
        return frame['sentence'][0]

    # --- Query 1: the first corpus sentence itself. ---
    query_vectors = {}
    for (column, model_name, model_obj, extra), metric in zip(model_specs, metrics_test1):
        query_vector, df_by_sentence = query_embeddings(
            list_of_sentences[0], df_by_sentence, column, model_name, model_obj,
            metric_colname=metric, **extra)
        query_vectors[metric] = query_vector

    assert closest_sentence(df_by_sentence, 'w2v_distance_test1') == first_sentence
    np.testing.assert_array_equal(query_vectors['w2v_distance_test1'], df_by_sentence['Word2Vec'][0])

    assert closest_sentence(df_by_sentence, 'w2v_tfidf_weighted_distance_test1') == first_sentence
    np.testing.assert_array_equal(
        query_vectors['w2v_tfidf_weighted_distance_test1'],
        df_by_sentence['Word2Vec_with_TfIdf_weights'][0])

    assert closest_sentence(df_by_sentence, 'elmo_distance_test1') == first_sentence
    # The ELMo query embedding is deliberately not compared with the stored one:
    # that comparison does not work, see https://github.com/allenai/allennlp/issues/3995

    assert closest_sentence(df_by_sentence, 'bert_distance_test1') == first_sentence
    np.testing.assert_array_almost_equal(query_vectors['bert_distance_test1'], df_by_sentence['BERT'][0])

    # --- Query 2: a phrase that appears in only one sentence. ---
    for (column, model_name, model_obj, extra), metric in zip(model_specs, metrics_test2):
        _, df_by_sentence = query_embeddings(
            "New York", df_by_sentence, column, model_name, model_obj,
            metric_colname=metric, **extra)

    for metric in metrics_test2:
        assert closest_sentence(df_by_sentence, metric) == new_york_sentence
示例#8
0
    MODELS_DIR = os.getenv('MODELS_DIR')
    CONFIG_DIR = os.getenv('CONFIG_DIR')
    LOGGING_CONFIG = os.getenv('LOGGING_CONFIG')

    with open(LOGGING_CONFIG, 'r') as f:
        config = yaml.safe_load(f)
    logging.config.dictConfig(config)

    with open(os.path.join(CONFIG_DIR, 'filenames.json'), 'r') as f:
        file_names = json.load(f)

    corpus_filename = "corpus_by_sentence.csv"
    corpus_by_sentence = pd.read_csv(os.path.join(DATA_DIR, "processed", corpus_filename))
    list_of_sentences = corpus_by_sentence['sentence'].values.tolist()
    print("Instantiating Embedder class.")
    embedder = Embedder(list_of_sentences)

    for model in models_to_be_run:
        print(f"Calculating {model} embeddings.")
        if model == 'Word2Vec_tfidf_weighted':
            sentence_embeddings, model_obj, tfidf_vectorizer = embedder.compute_word2vec_embeddings(tfidf_weights=True)
            embedder.save_model(tfidf_vectorizer, MODELS_DIR, file_names[model]['vectorizer_filename'])
            # the line above is specific to Word2Vec with TfIdf vectorizer and cannot be generalized to other models
        elif model == 'Word2Vec':
            sentence_embeddings, model_obj, _ = embedder.compute_word2vec_embeddings(tfidf_weights=False)
        elif model == 'BERT':
            bert_model = 'bert-base-nli-stsb-mean-tokens'  # This line is specific to BERT
            sentence_embeddings, model_obj = embedder.compute_bert_embeddings(bert_model)
        elif model == 'ELMo':
            sentence_embeddings, model_obj = embedder.compute_elmo_embeddings()
        else:
 def test_compute_bert_embeddings_contains_empty_sentence(self):
     """Check the BERT output shape when one of the two input sentences is empty."""
     sentences = ['Sentence one.', '']
     bert_embedder = Embedder(sentences)
     sentence_vectors, _ = bert_embedder.compute_bert_embeddings(model='bert-base-nli-stsb-mean-tokens')
     expected_shape = (2, 768)
     assert sentence_vectors.shape == expected_shape
示例#10
0
 def test_compute_bert_embeddings_is_empty_sentence(self):
     """With no sentences at all, the BERT embeddings must compare equal to an empty array."""
     bert_embedder = Embedder([])
     sentence_vectors, _ = bert_embedder.compute_bert_embeddings(model='bert-base-nli-stsb-mean-tokens')
     assert len(sentence_vectors) == 0
     empty_expected = np.array([], dtype=np.float64)
     np.testing.assert_array_equal(empty_expected, sentence_vectors)
示例#11
0
 def test_compute_bert_embeddings(self, list_of_sentences, expected_bert_embeddings):
     """Compare BERT embeddings with the stored expectation to five decimal places."""
     bert_embedder = Embedder(list_of_sentences)
     sentence_vectors, _ = bert_embedder.compute_bert_embeddings(model='bert-base-nli-stsb-mean-tokens')
     n_sentences = len(list_of_sentences)
     assert n_sentences == len(sentence_vectors)
     assert sentence_vectors.shape == (n_sentences, 768)
     np.testing.assert_array_almost_equal(expected_bert_embeddings, sentence_vectors, decimal=5)
示例#12
0
 def test_compute_elmo_embeddings_is_empty_sentence(self):
     """With no sentences at all, the ELMo embeddings must compare equal to an empty array."""
     elmo_embedder = Embedder([])
     sentence_vectors, _ = elmo_embedder.compute_elmo_embeddings()
     assert len(sentence_vectors) == 0
     empty_expected = np.array([], dtype=np.float64)
     np.testing.assert_array_equal(empty_expected, sentence_vectors)
示例#13
0
 def test_compute_elmo_embeddings(self, list_of_sentences, expected_elmo_embeddings):
     """Compare ELMo embeddings with the stored expectation using numpy's default precision."""
     elmo_embedder = Embedder(list_of_sentences)
     sentence_vectors, _ = elmo_embedder.compute_elmo_embeddings()
     n_sentences = len(list_of_sentences)
     assert n_sentences == len(sentence_vectors)
     assert sentence_vectors.shape == (n_sentences, 1024)
     np.testing.assert_array_almost_equal(expected_elmo_embeddings, sentence_vectors)
示例#14
0
 def test_compute_word2vec_embeddings_is_empty_sentence_raises_error(self):
     """Computing Word2Vec embeddings for an empty sentence list must raise ``RuntimeError``."""
     empty_embedder = Embedder([])
     expect_runtime_error = pytest.raises(RuntimeError)
     with expect_runtime_error:
         empty_embedder.compute_word2vec_embeddings()
示例#15
0
 def test_class_instantiation(self, list_of_sentences):
     """The constructor must expose the given sentences as ``list_of_sentences``."""
     instance = Embedder(list_of_sentences)
     stored_sentences = instance.list_of_sentences
     assert stored_sentences == list_of_sentences