Exemplo n.º 1
0
 def __init__(self,
              vocab,
              C=2,
              dim=100,
              lr_opt=None,
              sm_opt=SMOpt.none,
              nr_neg_samples=5):
     """Initialize via the Word2Vec base class, then compile the Theano graph.

     Args:
         vocab: vocabulary passed through to Word2Vec.__init__.
         C: presumably the context-window size — TODO confirm against Word2Vec.
         dim: embedding dimensionality.
         lr_opt: learning-rate option; None selects the base-class default.
         sm_opt: softmax/sampling option (SMOpt enum), defaults to SMOpt.none.
         nr_neg_samples: number of negative samples.
     """
     # Explicit base-class call (not super()); all arguments are forwarded unchanged.
     Word2Vec.__init__(self, vocab, C, dim, lr_opt, sm_opt, nr_neg_samples)
     # Builds the Theano computation graph for this subclass.
     self.__theano_build__()
Exemplo n.º 2
0
 def test_create_similarity_dict(self):
     """create_similarity_dict maps word pairs to their cosine similarity."""
     w2v = Word2Vec()
     w2v.load_word2vec_format('testword2vec.txt')
     # Renamed from `dict`, which shadowed the builtin.
     similarities = w2v.create_similarity_dict(
         [('test01', 'test01'), ('test01', 'test02'), ('test14', 'test02'), ('test14', 'test01')])
     self.assertAlmostEqual(1.0, similarities.get(('test01', 'test01')))
     self.assertAlmostEqual(0.0, similarities.get(('test01', 'test02')))
     # NOTE(review): this 3-tuple key does not match any pair passed in above,
     # so .get() presumably returns None and this assertion fails — verify the
     # intended key (likely a 2-tuple such as ('test14', 'test02')).
     self.assertAlmostEqual(0.23570226, similarities.get(('test01', 'test02', 'test14')))
Exemplo n.º 3
0
 def test_get_negative_score_for_group(self):
     """get_negative_score_for_group scores a word group against negative words."""
     w2v = Word2Vec()
     w2v.load_word2vec_format('testword2vec.txt')
     # (expected score, positive group, negative group) — checked in order.
     cases = [
         (1.0, ['test01'], ['test01']),
         (0.0, ['test01'], ['test02']),
         (0.50000006, ['test01', 'test02'], ['test14']),
         (0.77459663, ['test04', 'test04', 'test05'], ['test245']),
     ]
     for expected, group, negatives in cases:
         self.assertAlmostEqual(expected, w2v.get_negative_score_for_group(group, negatives))
Exemplo n.º 4
0
 def test_cosine_similarity(self):
     """cosine_similarity returns NaN for zero vectors and cosine values otherwise.

     Fix: `assertAlmostEquals` is a deprecated alias that was removed in
     Python 3.12; use `assertAlmostEqual`.
     """
     w2v = Word2Vec()
     w2v.load_word2vec_format('testword2vec.txt')
     # test00 is the all-zeros vector, so the similarity is undefined (NaN).
     self.assertTrue(np.isnan(w2v.cosine_similarity(w2v.kv['test00'], w2v.kv['test00'])))
     self.assertEqual(1, w2v.cosine_similarity(w2v.kv['test01'], w2v.kv['test01']))
     self.assertEqual(0, w2v.cosine_similarity(w2v.kv['test01'], w2v.kv['test02']))
     self.assertTrue(np.isclose(0.70710677, w2v.cosine_similarity(w2v.kv['test14'], w2v.kv['test01'])))
     self.assertAlmostEqual(0.70710677, w2v.cosine_similarity(w2v.kv['test14'], w2v.kv['test01']))
     self.assertAlmostEqual(0.4082483, w2v.cosine_similarity(w2v.kv['test245'], w2v.kv['test24']))
Exemplo n.º 5
0
def main(embedding_path="models/embedding.pth", emb_dim=64):
    """Build a vocabulary from the corpus and train word embeddings.

    Args:
        embedding_path: where the trained embedding weights are saved.
        emb_dim: dimensionality of the embedding vectors.
    """
    full_corpus = dataset_reader.main()
    # Keep every 10th document only — the full dataset takes ~6 hours per epoch.
    sampled_corpus = full_corpus[::10]
    bigrams = Bigrammer(sampled_corpus)
    vocab_size = len(bigrams.word2idx)
    print("Vocabulary size: {}".format(vocab_size))
    model = Word2Vec(vocab_size, emb_dim)
    train(model, bigrams, embedding_path)
Exemplo n.º 6
0
 def init_codenames(self):
     """Load embeddings and translation config, and build a CodeNames game.

     Returns:
         A CodeNames instance, or None if any step fails (the error is logged).

     Fix: removed the unused local `lib` (its os.getenv result was never used
     after the surrounding code was commented out) and deleted the dead
     commented-out code.
     """
     try:
         w2v = Word2Vec()
         w2v.load_word2vec_format('data/wiki-news/words100en')
         trans = Translate('src/my_config.json')
         return CodeNames(trans, w2v)
     except Exception as e:
         # Deliberate best-effort boundary: log the failure and fall through,
         # implicitly returning None to the caller.
         traceback.print_exc()
         print(e)
Exemplo n.º 7
0
 def test_word_similarity(self):
     """word_similarity yields (word, score) pairs for the given candidates.

     Fix: call the builtin next() instead of invoking .__next__() directly.
     """
     w2v = Word2Vec()
     w2v.load_word2vec_format('testword2vec.txt')
     self.assertSequenceEqual((('test01', 1.0), ('test02', 0.0)), tuple(w2v.word_similarity('test01', ['test01','test02'])))
     # word_similarity returns an iterator; take its first (word, score) pair.
     self.assertAlmostEqual(0.70710677,
                            next(w2v.word_similarity('test01', ['test14']))[1])
Exemplo n.º 8
0
 def test_get_similar_for_groups(self):
     """get_similar_for_groups ranks candidate words for a group of cue words."""
     w2v = Word2Vec()
     w2v.load_word2vec_format('testword2vec.txt')
     # The top-ranked result is the first element of the first (word, score) entry.
     best_for_pair = w2v.get_similar_for_groups(['test01', 'test04'])[0][0]
     self.assertEqual('test14', best_for_pair)
     best_for_triple = w2v.get_similar_for_groups(['test02', 'test04', 'test05'])[0][0]
     self.assertEqual('test245', best_for_triple)
Exemplo n.º 9
0
 def test_load_word2vec_file(self):
     """Loading the word2vec file exposes vectors via kv.get_vector."""
     w2v = Word2Vec()
     w2v.load_word2vec_format('testword2vec.txt')
     # test00 is stored as the 5-dimensional zero vector.
     expected = tuple(np.zeros(5))
     actual = tuple(w2v.kv.get_vector('test00'))
     self.assertTupleEqual(expected, actual)
Exemplo n.º 10
0
    def __init__(self):
        """Register lazy loader callables for every named feature resource.

        Each entry in _map_name_to_handler maps a resource name to a
        zero-argument callable that loads the resource on demand (lexicons,
        embeddings, cached n-gram dictionaries, crawled URL data).
        """
        super().__init__()
        self._map_name_to_handler = {
            # Sentiment lexicons, grouped by loader variant (_0/_1/_2 —
            # presumably different file formats; confirm in the loader defs).
            "sent_BL":
            lambda: self.__dict_Senti_Lexi_0(config.LEXI_BL),
            "sent_GI":
            lambda: self.__dict_Senti_Lexi_0(config.LEXI_GI),
            "sent_IMDB":
            lambda: self.__dict_Senti_Lexi_0(config.LEXI_IMDB),
            "sent_MPQA":
            lambda: self.__dict_Senti_Lexi_0(config.LEXI_MPQA),
            "sent_NRCE":
            lambda: self.__dict_Senti_Lexi_0(config.LEXI_NRCEMOTION),
            "sent_AF":
            lambda: self.__dict_Senti_Lexi_1(config.LEXI_AFINN),
            "sent_NRC140_U":
            lambda: self.__dict_Senti_Lexi_1(config.LEXI_NRC140_U),
            "sent_NRCH_U":
            lambda: self.__dict_Senti_Lexi_1(config.LEXI_NRCHASHTAG_U),
            "sent_NRC140_B":
            lambda: self.__dict_Senti_Lexi_2(config.LEXI_NRC140_B),
            "sent_NRCH_B":
            lambda: self.__dict_Senti_Lexi_2(config.LEXI_NRCHASHTAG_B),
            # Word embeddings and crawled data.
            "embed_Word2Vec":
            lambda: Word2Vec(config.WORD2VEC_GOOGLE),
            "embed_GloVe":
            self.__get_glove_handler,
            "url_crawled_data":
            lambda: self.__get_url_creeper_data(config.URL_CACHE_PATH),
        }
        # GloVe is too large, make cache for it.

        # Register n-gram dictionaries for frequency thresholds 1..5.
        # NOTE: `freq=freq` binds the loop variable as a default argument —
        # without it every lambda would late-bind to the final value of freq.
        for freq in range(1, 6):
            self._map_name_to_handler[
                "nltk_unigram_t%d" %
                freq] = lambda freq=freq: load_dict_from_file(
                    config.DICT_NLTK_UNIGRAM_TU % freq)
            self._map_name_to_handler[
                "nltk_bigram_t%d" %
                freq] = lambda freq=freq: load_dict_from_file(
                    config.DICT_NLTK_BIGRAM_TU % freq)
            self._map_name_to_handler[
                "nltk_trigram_t%d" %
                freq] = lambda freq=freq: load_dict_from_file(
                    config.DICT_NLTK_TRIGRAM_TU % freq)
            self._map_name_to_handler[
                "hashtag_t%d" % freq] = lambda freq=freq: load_dict_from_file(
                    config.DICT_HASHTAG_TU % freq)
            self._map_name_to_handler["hashtag_unigram_t%d" % freq] = lambda freq=freq: \
                load_dict_from_file(config.DICT_HASHTAG_UNIGRAM_TU % freq)
            self._map_name_to_handler[
                "url_unigram_t%d" %
                freq] = lambda freq=freq: load_dict_from_file(
                    config.DICT_URL_UNIGRAM_TU % freq)


            self._map_name_to_handler["nltk_unigram_for_test_t%d" % freq] = lambda freq=freq: \
                load_dict_from_file(config.DICT_NLTK_UNIGRAM_TU_TEST % freq)

        # Give every handler a descriptive __name__ for debugging/logging;
        # some callables (e.g. bound methods) reject assignment, hence the
        # best-effort try/except.
        for k, v in self._map_name_to_handler.items():
            try:
                v.__name__ = "%s_handler" % k
            except Exception:
                pass