def __init__(self, vocab, C=2, dim=100, lr_opt=None, sm_opt=SMOpt.none, nr_neg_samples=5):
    """Initialize the model, then compile its Theano computation graph.

    All hyper-parameter setup is delegated to ``Word2Vec.__init__``; this
    subclass only adds the graph build step.

    Args:
        vocab: vocabulary object (project type; semantics defined by Word2Vec).
        C: context window size — presumably symmetric; TODO confirm in Word2Vec.
        dim: embedding dimensionality.
        lr_opt: learning-rate option; None selects the base-class default.
        sm_opt: softmax/output strategy (``SMOpt`` enum member).
        nr_neg_samples: number of negative samples per positive example.
    """
    Word2Vec.__init__(self, vocab, C, dim, lr_opt, sm_opt, nr_neg_samples)
    self.__theano_build__()
def test_create_similarity_dict(self):
    """create_similarity_dict maps each requested word pair to its similarity."""
    w2v = Word2Vec()
    w2v.load_word2vec_format('testword2vec.txt')
    # Renamed from `dict` — the original shadowed the builtin.
    sim_dict = w2v.create_similarity_dict(
        [('test01', 'test01'), ('test01', 'test02'),
         ('test14', 'test02'), ('test14', 'test01')])
    self.assertAlmostEqual(1.0, sim_dict.get(('test01', 'test01')))
    self.assertAlmostEqual(0.0, sim_dict.get(('test01', 'test02')))
    # NOTE(review): this 3-tuple key was never among the requested pairs
    # above — it looks like it should be one of the 2-tuples (e.g.
    # ('test14', 'test02')); confirm against create_similarity_dict's keys.
    self.assertAlmostEqual(0.23570226, sim_dict.get(('test01', 'test02', 'test14')))
def test_get_negative_score_for_group(self):
    """Exercise get_negative_score_for_group on identical, orthogonal and
    mixed word groups against the fixture vectors."""
    model = Word2Vec()
    model.load_word2vec_format('testword2vec.txt')
    # (expected score, word group, negative group), checked in order.
    cases = [
        (1.0, ['test01'], ['test01']),
        (0.0, ['test01'], ['test02']),
        (0.50000006, ['test01', 'test02'], ['test14']),
        (0.77459663, ['test04', 'test04', 'test05'], ['test245']),
    ]
    for expected, group, negatives in cases:
        self.assertAlmostEqual(
            expected, model.get_negative_score_for_group(group, negatives))
def test_cosine_similarity(self):
    """cosine_similarity: NaN for the zero vector, 1 for identical vectors,
    0 for orthogonal ones, and the expected values for partial overlap."""
    w2v = Word2Vec()
    w2v.load_word2vec_format('testword2vec.txt')
    # test00 is the all-zeros fixture vector, so similarity is 0/0 -> NaN.
    self.assertTrue(np.isnan(w2v.cosine_similarity(w2v.kv['test00'], w2v.kv['test00'])))
    self.assertEqual(1, w2v.cosine_similarity(w2v.kv['test01'], w2v.kv['test01']))
    self.assertEqual(0, w2v.cosine_similarity(w2v.kv['test01'], w2v.kv['test02']))
    self.assertTrue(np.isclose(0.70710677, w2v.cosine_similarity(w2v.kv['test14'], w2v.kv['test01'])))
    # Fixed: assertAlmostEquals is a deprecated alias (removed in Py 3.12);
    # use assertAlmostEqual.
    self.assertAlmostEqual(0.70710677, w2v.cosine_similarity(w2v.kv['test14'], w2v.kv['test01']))
    self.assertAlmostEqual(0.4082483, w2v.cosine_similarity(w2v.kv['test245'], w2v.kv['test24']))
def main(embedding_path="models/embedding.pth", emb_dim=64):
    """Load the corpus, build the bigram vocabulary and train an embedding.

    Args:
        embedding_path: where the trained embedding is persisted.
        emb_dim: embedding dimensionality passed to Word2Vec.
    """
    full_corpus = dataset_reader.main()
    # because full dataset is 6 hours per epoch
    sampled = full_corpus[::10]
    bigrams = Bigrammer(sampled)
    vocab_size = len(bigrams.word2idx)
    print("Vocabulary size: {}".format(vocab_size))
    model = Word2Vec(vocab_size, emb_dim)
    train(model, bigrams, embedding_path)
def init_codenames(self):
    """Construct a CodeNames game backed by a word2vec model and a translator.

    Returns:
        A ``CodeNames`` instance, or None (implicitly) when any step fails —
        errors are printed rather than propagated.
    """
    # Removed: dead commented-out vsmlib code and the unused
    # ``lib = os.getenv('LIB')`` lookup whose only consumer was commented out.
    try:
        w2v = Word2Vec()
        w2v.load_word2vec_format('data/wiki-news/words100en')
        trans = Translate('src/my_config.json')
        return CodeNames(trans, w2v)
    except Exception as e:
        # Top-level boundary handler: report and fall through (returns None).
        traceback.print_exc()
        print(e)
def test_word_similarity(self):
    """word_similarity yields (word, similarity) pairs in request order."""
    w2v = Word2Vec()
    w2v.load_word2vec_format('testword2vec.txt')
    self.assertSequenceEqual(
        (('test01', 1.0), ('test02', 0.0)),
        tuple(w2v.word_similarity('test01', ['test01', 'test02'])))
    # Fixed: next(gen) is the idiomatic way to pull one item from a
    # generator, instead of calling gen.__next__() directly.
    self.assertAlmostEqual(
        0.70710677, next(w2v.word_similarity('test01', ['test14']))[1])
def test_get_similar_for_groups(self):
    """The top-ranked suggestion for each word group is the expected word."""
    model = Word2Vec()
    model.load_word2vec_format('testword2vec.txt')

    def top_word(group):
        # Word of the highest-ranked (word, score) suggestion.
        return model.get_similar_for_groups(group)[0][0]

    self.assertEqual('test14', top_word(['test01', 'test04']))
    self.assertEqual('test245', top_word(['test02', 'test04', 'test05']))
def test_load_word2vec_file(self):
    """Loading the fixture exposes 'test00' as a 5-dimensional zero vector."""
    model = Word2Vec()
    model.load_word2vec_format('testword2vec.txt')
    expected = tuple(np.zeros(5))
    actual = tuple(model.kv.get_vector('test00'))
    self.assertTupleEqual(expected, actual)
def __init__(self):
    """Build the lazy name->loader registry for lexicon, embedding and
    n-gram dictionary resources.

    Each value is a zero-argument callable so nothing is loaded until a
    feature actually requests it.
    """
    super().__init__()
    # Static resources: sentiment lexicons (three loader variants),
    # embeddings, and crawled URL data.
    self._map_name_to_handler = {
        "sent_BL": lambda: self.__dict_Senti_Lexi_0(config.LEXI_BL),
        "sent_GI": lambda: self.__dict_Senti_Lexi_0(config.LEXI_GI),
        "sent_IMDB": lambda: self.__dict_Senti_Lexi_0(config.LEXI_IMDB),
        "sent_MPQA": lambda: self.__dict_Senti_Lexi_0(config.LEXI_MPQA),
        "sent_NRCE": lambda: self.__dict_Senti_Lexi_0(config.LEXI_NRCEMOTION),
        "sent_AF": lambda: self.__dict_Senti_Lexi_1(config.LEXI_AFINN),
        "sent_NRC140_U": lambda: self.__dict_Senti_Lexi_1(config.LEXI_NRC140_U),
        "sent_NRCH_U": lambda: self.__dict_Senti_Lexi_1(config.LEXI_NRCHASHTAG_U),
        "sent_NRC140_B": lambda: self.__dict_Senti_Lexi_2(config.LEXI_NRC140_B),
        "sent_NRCH_B": lambda: self.__dict_Senti_Lexi_2(config.LEXI_NRCHASHTAG_B),
        "embed_Word2Vec": lambda: Word2Vec(config.WORD2VEC_GOOGLE),
        "embed_GloVe": self.__get_glove_handler,
        "url_crawled_data": lambda: self.__get_url_creeper_data(config.URL_CACHE_PATH),
    }
    # GloVe is too large, make cache for it.
    # Frequency-thresholded dictionaries, one handler per threshold 1..5.
    # NOTE: the `freq=freq` default argument is deliberate — it binds the
    # current loop value into each lambda (late-binding closure workaround).
    for freq in range(1, 6):
        self._map_name_to_handler[
            "nltk_unigram_t%d" % freq] = lambda freq=freq: load_dict_from_file(
            config.DICT_NLTK_UNIGRAM_TU % freq)
        self._map_name_to_handler[
            "nltk_bigram_t%d" % freq] = lambda freq=freq: load_dict_from_file(
            config.DICT_NLTK_BIGRAM_TU % freq)
        self._map_name_to_handler[
            "nltk_trigram_t%d" % freq] = lambda freq=freq: load_dict_from_file(
            config.DICT_NLTK_TRIGRAM_TU % freq)
        self._map_name_to_handler[
            "hashtag_t%d" % freq] = lambda freq=freq: load_dict_from_file(
            config.DICT_HASHTAG_TU % freq)
        self._map_name_to_handler["hashtag_unigram_t%d" % freq] = lambda freq=freq: \
            load_dict_from_file(config.DICT_HASHTAG_UNIGRAM_TU % freq)
        self._map_name_to_handler[
            "url_unigram_t%d" % freq] = lambda freq=freq: load_dict_from_file(
            config.DICT_URL_UNIGRAM_TU % freq)
        self._map_name_to_handler["nltk_unigram_for_test_t%d" % freq] = lambda freq=freq: \
            load_dict_from_file(config.DICT_NLTK_UNIGRAM_TU_TEST % freq)
    # Stamp a readable __name__ on every handler for debugging/logging;
    # best-effort only, since some callables may not accept the attribute.
    for k, v in self._map_name_to_handler.items():
        try:
            v.__name__ = "%s_handler" % k
        except Exception:
            pass