def testTransformSerialized(self):
    # Same as testTransform, using serialized corpora.
    #
    # Training sometimes gets stuck at a local minimum; in that case the model
    # is re-trained from scratch, for at most 25 attempts in total.
    # NOTE(review): random_state=0 is passed on every attempt, so it is unclear
    # whether a retry can actually produce a different initialization - confirm.
    serialization_file = datapath('testcorpus_serialization.mm')
    expected = [0.91, 0.08]
    passed = False
    for attempt in range(25):  # restart at most 25 times
        # create the transformation model
        model = self.class_(
            id2word=dictionary,
            num_topics=2,
            passes=100,
            random_state=0,
            serialized=True,
            serialization_path=serialization_file,
        )
        model.update(self.corpus, author2doc)

        # NOTE: this test may easily fail if the author-topic model is altered in any way.
        # Its output is sensitive to many things, such as the scheduling of the updates or
        # author2id (the random initialization changes when author2id changes). If it fails,
        # check whether something was broken or the output merely shifted slightly.
        # Dense vector makes the equality test easier.
        vec = matutils.sparse2full(model.get_author_topics('jill'), 2)

        # must contain the same values, up to re-ordering
        passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1)

        # Delete the MmCorpus used for serialization inside the author-topic model.
        remove(serialization_file)

        if passed:
            break
        logging.warning(
            "Author-topic model failed to converge on attempt %i (got %s, expected %s)",
            attempt, sorted(vec), sorted(expected)
        )
    self.assertTrue(passed)
def testLoadOldModel(self):
    """Test loading fasttext models from previous version"""
    # 'fasttext_old' is a single-file model; 'fasttext_old_sep' is the model
    # stored in multiple files. Both must expose identical shapes and sizes.
    for model_file in ('fasttext_old', 'fasttext_old_sep'):
        model = FT_gensim.load(datapath(model_file))
        # assertEqual (rather than assertTrue(a == b)) reports both values on failure.
        self.assertEqual(model.wv.vectors.shape, (12, 100))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(len(model.wv.index2word), 12)
        self.assertEqual(model.syn1neg.shape, (len(model.wv.vocab), model.vector_size))
        self.assertEqual(model.trainables.vectors_lockf.shape, (12, ))
        self.assertEqual(model.vocabulary.cum_table.shape, (12, ))

        self.assertEqual(len(model.wv.hash2index), 202)
        self.assertEqual(model.wv.vectors_vocab.shape, (12, 100))
        self.assertEqual(model.wv.vectors_ngrams.shape, (202, 100))
def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    corpus_path = datapath('head500.noblanks.cor')
    # The same file is opened twice: once as the reference, once as LineSentence input.
    with utils.smart_open(corpus_path) as reference, utils.smart_open(corpus_path) as fin:
        for words in word2vec.LineSentence(fin):
            reference_words = utils.to_unicode(reference.readline()).split()
            self.assertEqual(words, reference_words)
def testLoadOldModel(self):
    """Test loading word2vec models from previous version"""
    # 'word2vec_old' is a single-file model; 'word2vec_old_sep' is the model
    # stored in multiple files. Both must pass the same checks.
    for model_file in ('word2vec_old', 'word2vec_old_sep'):
        model = word2vec.Word2Vec.load(datapath(model_file))
        # assertEqual (rather than assertTrue(a == b)) reports both values on failure.
        self.assertEqual(model.wv.vectors.shape, (12, 100))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(len(model.wv.index2word), 12)
        self.assertEqual(model.syn1neg.shape, (len(model.wv.vocab), model.wv.vector_size))
        self.assertEqual(model.trainables.vectors_lockf.shape, (12,))
        self.assertEqual(model.vocabulary.cum_table.shape, (12,))
        self.onlineSanity(model, trained_model=True)

    # load really old model
    model = word2vec.Word2Vec.load(datapath('w2v-lee-v0.12.0'))
    self.onlineSanity(model, trained_model=True)

    # test for max_final_vocab for model saved in 3.3
    model = word2vec.Word2Vec.load(datapath('word2vec_3.3'))
    self.assertIsNone(model.max_final_vocab)
    self.assertIsNone(model.vocabulary.max_final_vocab)

    # Test loading word2vec models from all previous versions
    old_versions = [
        '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
        '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
        '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
        '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
    ]
    saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl')

    for old_version in old_versions:
        model = word2vec.Word2Vec.load(saved_models_dir.format(old_version))
        self.assertEqual(len(model.wv.vocab), 3)
        self.assertEqual(model.wv.vectors.shape, (3, 4))

        # check if similarity search and online training works.
        self.assertEqual(len(model.wv.most_similar('sentence')), 2)
        model.build_vocab(list_corpus, update=True)
        model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)

        # check if similarity search and online training works after saving and loading back the model.
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model.save(tmpf)
        loaded_model = word2vec.Word2Vec.load(tmpf)
        loaded_model.build_vocab(list_corpus, update=True)
        # total_examples/epochs are deliberately read from `model`, as in the original test.
        loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
def testSerialized(self):
    # Test the model using serialized corpora. Basic tests, plus test of update functionality.
    mm_path = datapath('testcorpus_serialization.mm')
    model = self.class_(
        self.corpus,
        author2doc=author2doc,
        id2word=dictionary,
        num_topics=2,
        serialized=True,
        serialization_path=mm_path,
    )

    def dense_topics(author):
        # Author's topic distribution as a dense vector of length num_topics.
        return matutils.sparse2full(model.get_author_topics(author), model.num_topics)

    jill_before = dense_topics('jill')
    self.assertTrue(all(jill_before > 0))

    model.update()
    jill_after = dense_topics('jill')

    # Did we learn more about Jill?
    self.assertFalse(all(np.equal(jill_before, jill_after)))

    model.update(corpus_new, author2doc_new)

    # Did we learn something about Sally?
    sally = dense_topics('sally')
    self.assertTrue(all(sally > 0))

    # Delete the MmCorpus used for serialization inside the author-topic model.
    remove(mm_path)
def get_corpus():
    """Load the 'ldavowpalwabbit' test data.

    Returns a ``(corpus, dictionary)`` pair: the dictionary is read from
    'ldavowpalwabbit.dict.txt', and the corpus holds one bag-of-words per
    line of 'ldavowpalwabbit.txt' (each line stripped and split on whitespace).
    """
    dictionary = Dictionary.load_from_text(datapath('ldavowpalwabbit.dict.txt'))
    corpus = []
    with open(datapath('ldavowpalwabbit.txt')) as fhandle:
        for line in fhandle:
            tokens = line.strip().split()
            corpus.append(dictionary.doc2bow(tokens))
    return corpus, dictionary
def testLoadOldModel(self):
    """Test loading doc2vec models from previous version"""
    # 'doc2vec_old' is a single-file model; 'doc2vec_old_sep' is the model
    # stored in multiple files. Both must expose identical shapes and sizes.
    for model_file in ('doc2vec_old', 'doc2vec_old_sep'):
        model = doc2vec.Doc2Vec.load(datapath(model_file))
        # assertEqual (rather than assertTrue(a == b)) reports both values on failure.
        self.assertEqual(model.wv.vectors.shape, (3955, 100))
        self.assertEqual(len(model.wv.vocab), 3955)
        self.assertEqual(len(model.wv.index2word), 3955)
        self.assertEqual(model.syn1neg.shape, (len(model.wv.vocab), model.vector_size))
        self.assertEqual(model.trainables.vectors_lockf.shape, (3955, ))
        self.assertEqual(model.vocabulary.cum_table.shape, (3955, ))

        self.assertEqual(model.docvecs.vectors_docs.shape, (300, 100))
        self.assertEqual(model.trainables.vectors_docs_lockf.shape, (300, ))
        self.assertEqual(model.docvecs.max_rawint, 299)
        self.assertEqual(model.docvecs.count, 300)
def setUp(self):
    """Resolve the FastText binary location from $FT_HOME and load the pre-trained test model."""
    ft_home = os.environ.get('FT_HOME', None)
    if ft_home:
        self.ft_path = os.path.join(ft_home, 'fasttext')
    else:
        self.ft_path = None

    self.corpus_file = datapath('lee_background.cor')
    self.test_model_file = datapath('lee_fasttext')
    self.test_new_model_file = datapath('lee_fasttext_new')

    # Load pre-trained model to perform tests in case FastText binary isn't available in test environment
    self.test_model = fasttext.FastText.load_fasttext_format(self.test_model_file)
def setUp(self):
    """Load the DTM test corpus and dictionary; skip the test when $DTM_PATH is unset."""
    self.time_slices = [3, 7]
    self.corpus = corpora.mmcorpus.MmCorpus(datapath('dtm_test.mm'))
    self.id2word = corpora.Dictionary.load(datapath('dtm_test.dict'))

    # first you need to setup the environment variable $DTM_PATH for the dtm executable file
    self.dtm_path = os.environ.get('DTM_PATH', None)
    if self.dtm_path:
        return
    self.skipTest("$DTM_PATH is not properly set up.")
def setUp(self):
    """Read the sentiment training documents and load the source/target Doc2Vec models."""
    self.train_docs = read_sentiment_docs(datapath("alldata-id-10.txt"))

    self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
    self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")

    self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
    self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
def test_type_conversion(self):
    # Load as float16, save in binary format, reload as float64, then compare
    # the first component of one vector and both dtypes.
    text_path = datapath('high_precision.kv.txt')
    binary_path = datapath('high_precision.kv.bin')
    word = "horse.n.01"

    half_precision = KeyedVectors.load_word2vec_format(text_path, datatype=np.float16)
    half_precision.save_word2vec_format(binary_path, binary=True)
    double_precision = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True)

    self.assertAlmostEqual(half_precision[word][0], np.float16(double_precision[word][0]))
    self.assertEqual(half_precision[word][0].dtype, np.float16)
    self.assertEqual(double_precision[word][0].dtype, np.float64)
def testPathLineSentences(self):
    """Does PathLineSentences work with a path argument?"""
    corpus_dir = datapath('PathLineSentences')
    first_file = os.path.join(corpus_dir, '1.txt')
    second_file = os.path.join(corpus_dir, '2.txt.bz2')
    with utils.smart_open(first_file) as orig1, utils.smart_open(second_file) as orig2:
        sentences = word2vec.PathLineSentences(corpus_dir)
        # Reference lines, in the order the two files are expected to be read.
        reference_lines = orig1.readlines() + orig2.readlines()
        for position, words in enumerate(sentences):
            self.assertEqual(words, utils.to_unicode(reference_lines[position]).split())
def test_encoding_handling(self):
    """Tests whether utf8 and non-utf8 data loaded correctly."""
    first_relation = (u'tímto', u'budeš')

    # cp852-encoded file: the encoding is passed explicitly.
    cp852_relations = list(PoincareRelations(datapath('poincare_cp852.tsv'), encoding='cp852'))
    self.assertEqual(len(cp852_relations), 2)
    self.assertEqual(cp852_relations[0], first_relation)

    # utf8-encoded file: loaded without an explicit encoding argument.
    utf8_relations = list(PoincareRelations(datapath('poincare_utf8.tsv')))
    self.assertEqual(len(utf8_relations), 2)
    self.assertEqual(utf8_relations[0], first_relation)
def testEvaluateWordPairs(self):
    """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets"""
    sentences = word2vec.LineSentence(datapath('head500.noblanks.cor.bz2'))
    model = word2vec.Word2Vec(sentences, min_count=3, iter=10)
    result = model.evaluate_word_pairs(datapath('wordsim353.tsv'))

    pearson_coefficient = result[0][0]
    spearman_coefficient = result[1][0]
    oov_ratio = result[2]

    self.assertTrue(0.1 < pearson_coefficient < 1.0)
    self.assertTrue(0.1 < spearman_coefficient < 1.0)
    self.assertTrue(0.0 <= oov_ratio < 90.0)
def testModelCompatibilityWithPythonVersions(self):
    # Models saved as 'ldamodel_python_2_7' and 'ldamodel_python_3_5' must agree
    # on topic count, expElogbeta, projection of an empty vector, and id2word keys.
    model_2_7 = self.class_.load(datapath('ldamodel_python_2_7'))
    model_3_5 = self.class_.load(datapath('ldamodel_python_3_5'))

    self.assertEqual(model_2_7.num_topics, model_3_5.num_topics)
    self.assertTrue(np.allclose(model_2_7.expElogbeta, model_3_5.expElogbeta))

    empty_doc = []
    self.assertTrue(np.allclose(model_2_7[empty_doc], model_3_5[empty_doc]))  # try projecting an empty vector

    keys_2_7 = set(dict(model_2_7.id2word.iteritems()).keys())
    keys_3_5 = set(dict(model_3_5.id2word.iteritems()).keys())
    self.assertEqual(keys_2_7, keys_3_5)
def test_persistence(self):
    # Documents projected through every pair of models compared below.
    tstvec = [corpus[1], corpus[2]]

    def assert_same_projections(first, second, check_empty):
        # Both models must transform the test documents to (numerically) the same vectors.
        for doc in tstvec:
            self.assertTrue(np.allclose(first[doc], second[doc]))
        if check_empty:
            self.assertTrue(np.allclose(first[[]], second[[]]))  # try projecting an empty vector

    def sorted_idfs(tfidf):
        # idf values ordered by key, so two models can be compared element-wise.
        return [tfidf.idfs[key] for key in sorted(tfidf.idfs.keys())]

    # Test persistence without using `smartirs`
    fname = get_tmpfile('gensim_models.tst')
    model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
    model.save(fname)
    model2 = tfidfmodel.TfidfModel.load(fname)
    # assertEqual reports the differing entries, unlike assertTrue(a == b).
    self.assertEqual(model.idfs, model2.idfs)
    assert_same_projections(model, model2, check_empty=True)

    # Test persistence with using `smartirs`
    fname = get_tmpfile('gensim_models_smartirs.tst')
    model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
    model.save(fname)
    model2 = tfidfmodel.TfidfModel.load(fname)
    self.assertEqual(model.idfs, model2.idfs)
    assert_same_projections(model, model2, check_empty=True)

    # Test persistence between Gensim v3.2.0 and current model.
    model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
    model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
    self.assertTrue(np.allclose(sorted_idfs(model3), sorted_idfs(model4)))
    assert_same_projections(model3, model4, check_empty=True)

    # Test persistence with using pivoted normalization
    # (the empty-vector projection is not checked for pivoted models, as in the original test)
    fname = get_tmpfile('gensim_models_smartirs.tst')
    model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
    model.save(fname)
    model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
    self.assertEqual(model.idfs, model2.idfs)
    assert_same_projections(model, model2, check_empty=False)

    # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
    model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
    model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
    self.assertTrue(np.allclose(sorted_idfs(model3), sorted_idfs(model4)))
    assert_same_projections(model3, model4, check_empty=False)
def test_ft_kv_backward_compat_w_360(self):
    # The same 3.6.0 file is loaded through both keyed-vector classes; each must
    # return the same five nearest neighbours of "human".
    model_path = datapath("ft_kv_3.6.0.model.gz")
    kv = EuclideanKeyedVectors.load(model_path)
    ft_kv = FastTextKeyedVectors.load(model_path)

    expected = ['trees', 'survey', 'system', 'graph', 'interface']

    def top_words(vectors):
        # Keep only the words, dropping the similarity scores.
        return [word for (word, similarity) in vectors.most_similar("human", topn=5)]

    self.assertEqual(top_words(kv), expected)
    self.assertEqual(top_words(ft_kv), expected)
def setUp(self):
    """Prepare Wordrank fixtures; train a test model only when $WR_HOME is set."""
    # An unset or empty $WR_HOME is normalised to None.
    self.wr_path = os.environ.get('WR_HOME', None) or None
    self.corpus_file = datapath('lee.cor')
    self.out_name = 'testmodel'
    self.wr_file = datapath('test_glove.txt')

    if self.wr_path:
        self.test_model = wordrank.Wordrank.train(
            self.wr_path,
            self.corpus_file,
            self.out_name,
            iter=6,
            dump_period=5,
            period=5,
            np=4,
            cleanup_files=True,
        )
def testCompatibilty(self):
    # Phraser and Phrases models saved with 3.6.0 must both still merge
    # 'graph' and 'minors' into a single 'graph_minors' token.
    phraser = Phraser.load(datapath("phraser-3.6.0.model"))
    phrases = Phrases.load(datapath("phrases-3.6.0.model"))

    sentence = ['trees', 'graph', 'minors']
    expected = ['trees', 'graph_minors']

    phraser_result = phraser[sentence]
    phrases_result = phrases[sentence]

    self.assertEqual(phraser_result, expected)
    self.assertEqual(phrases_result, expected)
def setUp(self):
    """Load the EN/IT word vectors and define the training and test word pairs."""
    self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
    self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")

    # (source word, target word) pairs.
    self.word_pairs = [
        ("one", "uno"),
        ("two", "due"),
        ("three", "tre"),
        ("four", "quattro"),
        ("five", "cinque"),
        ("seven", "sette"),
        ("eight", "otto"),
        ("dog", "cane"),
        ("pig", "maiale"),
        ("fish", "cavallo"),
        ("birds", "uccelli"),
        ("apple", "mela"),
        ("orange", "arancione"),
        ("grape", "acino"),
        ("banana", "banana"),
    ]
    self.test_word_pairs = [("ten", "dieci"), ("cat", "gatto")]

    load_vectors = KeyedVectors.load_word2vec_format
    self.source_word_vec = load_vectors(self.source_word_vec_file, binary=False)
    self.target_word_vec = load_vectors(self.target_word_vec_file, binary=False)
def setUp(self):
    """Load the test corpus, pick one document, and build l1 and l2 norm models."""
    test_corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
    self.corpus = test_corpus

    # Choose doc to be normalized. [3] chosen to demonstrate different results for l1 and l2 norm.
    # doc is [(1, 1.0), (5, 2.0), (8, 1.0)]
    documents = list(test_corpus)
    self.doc = documents[3]

    self.model_l1 = normmodel.NormModel(test_corpus, norm='l1')
    self.model_l2 = normmodel.NormModel(test_corpus, norm='l2')
def test_load_model_with_non_ascii_vocab(self):
    # A non-ascii word must be in the loaded model, and its vector must be
    # retrievable without a UnicodeDecodeError.
    word = u'který'
    model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
    self.assertTrue(word in model)
    try:
        model[word]
    except UnicodeDecodeError:
        self.fail('Unable to access vector for utf8 encoded non-ascii word')
def test_load_model_non_utf8_encoding(self):
    # A model loaded with encoding='cp852' must contain the word, and its
    # vector must be retrievable without a KeyError.
    word = u'který'
    model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
    self.assertTrue(word in model)
    try:
        model[word]
    except KeyError:
        self.fail('Unable to access vector for cp-852 word')
def test_sg_neg_training(self):
    # Train a skip-gram model with negative sampling (sg=1, hs=0, negative=5)
    # and sanity-check the nearest neighbours of 'night'.
    model_gensim = FT_gensim(
        size=50,
        sg=1,
        cbow_mean=1,
        alpha=0.025,
        window=5,
        hs=0,
        negative=5,
        min_count=5,
        iter=5,
        batch_words=1000,
        word_ngrams=1,
        sample=1e-3,
        min_n=3,
        max_n=6,
        sorted_vocab=1,
        workers=1,
        min_alpha=0.0,
    )

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    vector_before = np.copy(model_gensim.wv.vectors[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)
    self.assertFalse((vector_before == model_gensim.wv.vectors[0]).all())  # vector should vary after training

    # Only the neighbouring words matter here, not their scores.
    similar_words = {word for (word, distance) in model_gensim.wv.most_similar('night', topn=10)}
    expected_sims_words = [
        u'night.',
        u'night,',
        u'eight',
        u'overnight',
        u'overnight.',
        u'month',
        u'land',
        u'firm',
        u'singles',
        u'death']
    overlap_count = len(similar_words.intersection(expected_sims_words))
    self.assertGreaterEqual(overlap_count, 2)
def testPathLineSentencesOneFile(self):
    """Does PathLineSentences work with a single file argument?"""
    single_file = os.path.join(datapath('PathLineSentences'), '1.txt')
    with utils.smart_open(single_file) as reference:
        for words in word2vec.PathLineSentences(single_file):
            reference_words = utils.to_unicode(reference.readline()).split()
            self.assertEqual(words, reference_words)
def test_load(self):
    # Build the corpus from 'testcorpus.<extension>' and count its documents.
    extension = self.file_extension.lstrip('.')
    corpus = self.corpus_class(datapath('testcorpus.' + extension))

    # the deerwester corpus always has nine documents
    self.assertEqual(len(list(corpus)), 9)
def test_persistence_old_model(self):
    """Tests whether model from older gensim version is loaded correctly."""
    old_model = PoincareModel.load(datapath('poincare_test_3.4.0'))
    keyed_vectors = old_model.kv

    self.assertEqual(keyed_vectors.syn0.shape, (239, 2))
    self.assertEqual(len(keyed_vectors.vocab), 239)
    self.assertEqual(old_model.size, 2)
    self.assertEqual(len(old_model.all_relations), 200)
def test_line2doc(self):
    # case with metadata=False (by default)
    super(TestMalletCorpus, self).test_line2doc()

    # case with metadata=True
    fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
    id2word = {1: 'mom', 2: 'window'}
    corpus = self.corpus_class(fname, id2word=id2word, metadata=True)

    scenarios = (
        # use_wordids=False: should return all words in doc
        (False, [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)]),
        # use_wordids=True: should return words in word2id
        (True, [(1, 1), (2, 2)]),
    )
    for use_wordids, expected_doc in scenarios:
        corpus.use_wordids = use_wordids
        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
        self.assertEqual(docid, '#3')
        self.assertEqual(doclang, 'lang')
        self.assertEqual(sorted(doc), expected_doc)
def testSaveLoadNoCommonTerms(self):
    """ Ensure backwards compatibility with old versions of Phrases, before common_terms"""
    old_bigram = Phrases.load(datapath("phrases-no-common-terms.pkl"))
    self.assertEqual(old_bigram.common_terms, frozenset())

    # can make a phraser, cf #1751
    phraser = Phraser(old_bigram)  # does not raise
    sentence = ["human", "interface", "survey"]
    phraser[sentence]  # does not raise
def test_closed_file_object(self):
    # MmCorpus is handed an already-open file object; the file must be open
    # both before and after the corpus is constructed from it.
    # The `with` block closes the handle when the test is done, which the
    # original version never did.
    with open(datapath('testcorpus.mm')) as file_obj:
        closed_before = file_obj.closed
        mmcorpus.MmCorpus(file_obj)
        closed_after = file_obj.closed
    self.assertFalse(closed_before)
    self.assertFalse(closed_after)
# Set up the Java-backed analyzers before any text processing.
install_java()
initialize(
    java_options="-Xmx4g -Dfile.encoding=utf-8",
    KKMA="2.0.2",
    RHINO="2.0.5",
    EUNJEON="2.0.2",
    ETRI="2.0.2",
)


def line():
    """Print a horizontal separator surrounded by blank lines."""
    print("\n________________________\n\n")


print("Saving cleaned words complete")
cleaned_koala_words = koala_bow()

topics = [3, 4, 5]

dictionary, vocab = split_train(cleaned_koala_words, 1)

# create model
print("Generating and saving LDA models")
# For each topic count: build, save, evaluate and visualise a model,
# printing a separator between stages.
for num_topic in topics:
    line()
    ldamodel, corpus, id2word = create_lda(num_topic, dictionary)
    line()

    temp_file = datapath("../models/koala_model_" + str(num_topic))
    ldamodel.save(temp_file)
    line()

    model_evaluate(ldamodel, dictionary, id2word, num_topic)
    line()

    lda_visualize(ldamodel, dictionary, num_topic, "koala")
    line()
def test_save_load_no_common_terms(self):
    """Ensure backwards compatibility with old versions of FrozenPhrases, before connector_words."""
    legacy_path = datapath("phraser-no-common-terms.pkl")
    bigram_loaded = FrozenPhrases.load(legacy_path)
    self.assertEqual(bigram_loaded.connector_words, frozenset())
# Convert the Mallet model to a regular gensim LDA model so that pyLDAvis can visualise it.
mallet_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

# sort_topics=False keeps the topic order of this visualisation in sync with
# the word-cloud results.
vis = pyLDAvis.gensim.prepare(
    mallet_lda_model,
    corpus,
    id2word,
    sort_topics=False,
)
pyLDAvis.save_html(
    vis,
    'LDA_Mallet_' + str(len(mallet_lda_model.get_topics())) + '_topics.html',
)

# Save the model to a file so that other Python files can load and use it.
temp_file = datapath("mallet_lda_model")
mallet_lda_model.save(temp_file)

# The regular gensim LDA model below produced worse results than Mallet.
# Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                             id2word=id2word,
#                                             num_topics=14,
#                                             random_state=100,
#                                             update_every=1,
#                                             chunksize=100,
#                                             passes=10,
#                                             alpha='auto',
#                                             minimum_probability=0.0,
#                                             per_word_topics=True)
# Update both LDA models (bag-of-words and TF-IDF) with every chunk in `pd`.
# NOTE(review): a fresh Dictionary is built for each chunk, so token ids may
# differ between updates - confirm this is intended.
# NOTE(review): confirm that `update` accepts `num_topics` / `id2word` for the
# model type used here; that type is not visible in this part of the file.
for p in pd:
    processed_docs = p[u'message'].map(n_all)

    dictionary = gensim.corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    lda_model.update(bow_corpus, num_topics=25, id2word=dictionary, passes=2)
    lda_model_tfidf.update(corpus_tfidf, num_topics=25, id2word=dictionary, passes=2)

# Persist both models once all chunks have been processed.
temp_file = datapath("/data/06333/aroraish/models/ModEModelBOW")
lda_model.save(temp_file)

temp_file = datapath("/data/06333/aroraish/models/ModEModelTFIDF")
lda_model_tfidf.save(temp_file)

# Dump every topic of each model to a text report.
with open("/data/06333/aroraish/outputs/lda_bag_of_words_overall.txt", 'w') as bw:
    bw.writelines(
        'Topic: {} \nWords: {}\n\n'.format(idx, topic.encode('utf-8'))
        for idx, topic in lda_model.print_topics(-1)
    )

with open("/data/06333/aroraish/outputs/lda_tfidf_overall.txt", 'w') as tf:
    tf.writelines(
        'Topic: {} \nWord: {}\n\n'.format(idx, topic.encode('utf-8'))
        for idx, topic in lda_model_tfidf.print_topics(-1)
    )
def setUp(self):
    """Load the binary test vectors as float64."""
    vectors_path = datapath('euclidean_vectors.bin')
    self.vectors = EuclideanKeyedVectors.load_word2vec_format(
        vectors_path,
        binary=True,
        datatype=np.float64,
    )
def setUp(self):
    """Build the ``self.ldaseq`` fixture from a fixed, in-line corpus.

    The 31 tokenised documents below are turned into a bag-of-words corpus
    and split into three time slices (10, 10 and 11 documents). The model is
    initialised from sufficient statistics stored in ``DTM/sstats_test.txt``.
    """
    # Fixed corpus: 20 job-posting style documents followed by 11 short
    # "bank"/"river"/"money" documents.
    texts = [
        [
            u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile',
            u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated',
            u'creative', u'senior'
        ],
        [
            u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily',
            u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams',
            u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional',
            u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions',
            u'engineering', u'participates', u'participates', u'reviews', u'participates',
            u'hiring', u'conducting', u'interviews'
        ],
        [
            u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams',
            u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones',
            u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs',
            u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'
        ],
        [
            u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines',
            u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting',
            u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation',
            u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management',
            u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment',
            u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location',
            u'resumes', u'jobs', u'candidates', u'openings', u'jobs'
        ],
        [
            u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage',
            u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland',
            u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing',
            u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering',
            u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands',
            u'objectives', u'operates', u'responsibilities',
            u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality',
            u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel',
            u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports',
            u'procedure', u'operating', u'units', u'develops', u'simulations',
            u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates',
            u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize',
            u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability',
            u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate',
            u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets',
            u'schedules', u'technical', u'management', u'environmental', u'awareness',
            u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input',
            u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers',
            u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality',
            u'engineering'
        ],
        [
            u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies',
            u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership',
            u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills',
            u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation',
            u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'
        ],
        [
            u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment',
            u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning',
            u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer',
            u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies',
            u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'
        ],
        [
            u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually',
            u'electrical', u'developments', u'institutional', u'utilities',
            u'technical', u'experts', u'relationships', u'credibility', u'contractors',
            u'utility', u'customers', u'customer', u'relationships', u'consistently',
            u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex',
            u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering',
            u'infrastructures', u'engineers', u'documented', u'management', u'engineering',
            u'quality', u'engineering', u'electrical', u'engineers', u'complex',
            u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage',
            u'engineering'
        ],
        [
            u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification',
            u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates',
            u'electrical', u'internships', u'oral', u'skills', u'organizational',
            u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation',
            u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships',
            u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative',
            u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled',
            u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse',
            u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity',
            u'communities', u'reviews', u'electrical', u'contracting', u'southwest',
            u'electrical', u'contractors'
        ],
        [
            u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating',
            u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug',
            u'validation', u'engineers', u'validation', u'methodologies', u'healthcare',
            u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology',
            u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams',
            u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate',
            u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops',
            u'relevant', u'planning', u'topics', u'intern',
            u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory',
            u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation',
            u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance',
            u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide',
            u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx',
            u'laboratories', u'employs', u'customers', u'qualifications', u'applicants',
            u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio',
            u'recommendation', u'resumes', u'marketing', u'location', u'americas',
            u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories',
            u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation',
            u'practicing', u'veterinarians'
        ],
        [
            u'location', u'duration', u'temp', u'verification', u'validation', u'tester',
            u'verification', u'validation', u'middleware', u'specifically', u'testing',
            u'applications', u'clinical', u'laboratory', u'regulated', u'environment',
            u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers',
            u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing',
            u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely',
            u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate',
            u'ongoing'
        ],
        [
            u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation',
            u'corrections', u'monitor', u'implementation', u'recurrence', u'operating',
            u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel',
            u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging',
            u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware',
            u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality',
            u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive',
            u'deadlines', u'ambiguity', u'analyze', u'complex',
            u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively',
            u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry',
            u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing',
            u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology',
            u'availability', u'click', u'attach'
        ],
        [
            u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology',
            u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'
        ],
        [
            u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel',
            u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management',
            u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation',
            u'evaluation', u'engineering', u'management', u'technical', u'terminal',
            u'surveillance', u'programs', u'currently', u'scientist', u'travel',
            u'responsibilities', u'develops', u'technology', u'modifies', u'technical',
            u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing',
            u'interface', u'hardware', u'regression', u'impact', u'reliability',
            u'maintainability', u'factors', u'standardization', u'skills', u'travel',
            u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal',
            u'environment', u'clearance', u'clearance', u'input', u'output', u'digital',
            u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing',
            u'evaluating', u'policies', u'procedure', u'interface', u'installation',
            u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge',
            u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact',
            u'reliability', u'maintainability', u'factors', u'standardization', u'missions',
            u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants',
            u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor',
            u'schedule', u'travel', u'contributor', u'management', u'asrc',
            u'reviews'
        ],
        [
            u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving',
            u'customer', u'clients', u'expanding', u'engineers', u'developers',
            u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment',
            u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities',
            u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers',
            u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely',
            u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase',
            u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment',
            u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell',
            u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge',
            u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites',
            u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills',
            u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance',
            u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers',
            u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants',
            u'orientation', u'disability', u'veteran', u'careerrookie'
        ],
        [
            u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition',
            u'networking', u'capabilities', u'classified', u'customer', u'motivated',
            u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision',
            u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi',
            u'disciplined', u'challenging', u'focused', u'embedded', u'developments',
            u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement',
            u'applications', u'embedded', u'freescale', u'applications', u'android',
            u'platforms', u'interface', u'customers', u'developers', u'refine',
            u'specifications', u'architectures'
        ],
        [
            u'java', u'programming', u'scripts', u'python', u'debug', u'debugging',
            u'emulators', u'regression',
            u'revisions', u'specialized', u'setups', u'capabilities', u'subversion',
            u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa',
            u'reviews'
        ],
        [
            u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology',
            u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse',
            u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models',
            u'interface', u'specifications', u'documentation', u'harmonization', u'mappings',
            u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder',
            u'communities', u'impact', u'domains', u'relationships', u'interdependencies',
            u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases',
            u'architectural', u'alignment', u'customer', u'expertise', u'harmonization',
            u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models',
            u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications',
            u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills',
            u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema',
            u'technologies', u'unified', u'modeling', u'technologies', u'schemas',
            u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills',
            u'customers', u'clearance', u'applicants', u'eligibility', u'classified',
            u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership',
            u'solutions', u'integration'
        ],
        [
            u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex',
            u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation',
            u'management', u'documented', u'individually', u'reviews', u'tests', u'components',
            u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input',
            u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis',
            u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs',
            u'corrects', u'integrates', u'operating', u'environments',
            u'develops', u'queries', u'databases', u'repositories', u'recommendations',
            u'improving', u'documentation', u'develops', u'implements', u'algorithms',
            u'functional', u'assists', u'developing', u'executing', u'procedures',
            u'components', u'reviews', u'documentation', u'solutions', u'analyzing',
            u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas',
            u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome',
            u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries',
            u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec',
            u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss',
            u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer',
            u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting',
            u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures',
            u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script',
            u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots',
            u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus',
            u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus',
            u'holidays', u'insurance', u'flexible', u'disability', u'insurance'
        ],
        [
            u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'
        ],
        ['bank', 'river', 'shore', 'water'],
        ['river', 'water', 'flow', 'fast', 'tree'],
        ['bank', 'water', 'fall', 'flow'],
        ['bank', 'bank', 'water', 'rain', 'river'],
        ['river', 'water', 'mud', 'tree'],
        ['money', 'transaction', 'bank', 'finance'],
        ['bank', 'borrow', 'money'],
        ['bank', 'finance'],
        ['finance', 'money', 'sell', 'bank'],
        ['borrow', 'sell'],
        ['bank', 'loan', 'sell']
    ]
    # initializing using own LDA sufficient statistics so that we get same results each time.
    sstats = np.loadtxt(datapath('DTM/sstats_test.txt'))
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # time_slice sums to 31, i.e. one entry per document in `texts`.
    self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats, passes=2, lda_inference_max_iter=10, em_min_iter=1, em_max_iter=4)
def setUp(self):
    """Resolve the fastText binary location and load the model fixtures.

    ``self.ft_path`` is ``<FT_HOME>/fasttext`` when the ``FT_HOME``
    environment variable holds a non-empty value, otherwise ``None``.
    """
    ft_home = os.environ.get('FT_HOME')
    if ft_home:
        self.ft_path = os.path.join(ft_home, 'fasttext')
    else:
        self.ft_path = None

    model_file = datapath('lee_fasttext')
    self.test_model_file = model_file
    self.test_model = FT_gensim.load_fasttext_format(model_file)
    self.test_new_model_file = datapath('lee_fasttext_new')
mean_jaccard.append(np.mean(jacc_np)) mean_bleu.append(np.mean(bleu_np)) mean_cos.append(np.mean(cos_np)) mean_fscore.append(np.mean(fscore_np)) return np.max(np.asarray(mean_bleu)), np.max( np.asarray(mean_jaccard)), np.max(np.asarray(mean_cos)), np.max( np.asarray(mean_fscore)) GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union = load_annotations( ) path = "/home/norberteke/PycharmProjects/Thesis/data/" dictionary = Dictionary.load(path + 'GH_full_processed_Dictionary.dict') corpus = MmCorpus(datapath(path + 'corpus_processed_GH_full.mm')) texts = [] with open(path + 'GH_full_processed_corpus.csv', 'r') as f: reader = csv.reader(f) texts = list(reader) terms = [] for (key, value) in dictionary.iteritems(): terms.append(value) def write_results_to_file(path, lda_model, max_bleu, max_jaccard, max_cos, max_fscore): with open(path, 'a') as f: writer = csv.writer(f,
def __init__(
        self,
        data_path='data/soccer/',
        vec_dim=300,
        fasttext_model='/data/dchaudhu/soccerbot_acl/vocab/wiki.en.bin'):
    """Load embeddings, the KG and all data splits, then build the vocabulary.

    :param data_path: dataset root; the knowledge graph is read from
        ``data_path + 'KG/'``.
    :param vec_dim: number of columns of the ``self.vectors`` matrix.
    :param fasttext_model: path handed (through ``datapath``) to
        ``load_facebook_model``.
    """
    self.data_path = data_path
    self.max_similarity = 85
    self.vec_dim = vec_dim
    self.args = get_args()
    self.word_emb = load_facebook_model(datapath(fasttext_model))

    self.stop = set(stopwords.words('english'))
    self.punc = string.punctuation
    self.ent_d, self.ent_list = self.get_kg(self.data_path + 'KG/')
    self.train_dataset = self.get_data('train')
    self.val_dataset = self.get_data('val')
    self.test_dataset = self.get_data('test')

    # For each training sample: total number of values over all 'kgER' entries.
    self.max_er_vec = [
        sum(len(values) for _, values in sample['kgER'].items())
        for sample in self.train_dataset
    ]
    self.max_out_reln = np.max(self.max_er_vec)
    graph_sizes = [len(getER_vec(sample['kgER'])) for sample in self.train_dataset]
    self.inp_graph_max_size = np.max(graph_sizes)
    print('input graph size:' + str(self.inp_graph_max_size))
    print(self.max_out_reln)

    # Placeholder object tokens o0 .. o<max_out_reln - 1>.
    self.objects = ['o' + str(j) for j in range(self.max_out_reln)]

    # Vocabulary counts: corpus words first, then special tokens, then the
    # object placeholders.  Insertion order matters because it fixes the ids.
    self.vocab = defaultdict(float)
    for split in (self.train_dataset, self.test_dataset, self.val_dataset):
        self.get_vocab(split)
    special_tokens = (
        self.args.unk_tok,
        self.args.sos_tok,
        self.args.eou_tok,
        self.args.mem_tok,
        self.args.eos_tok,
    )
    for token in special_tokens:
        self.vocab[token] += 1.0
    for obj_token in self.objects:
        self.vocab[obj_token] += 1.0

    # Ids start at 1; id 0 is reserved for the padding token.
    self.stoi = {word: idx for idx, word in enumerate(self.vocab.keys(), start=1)}
    self.stoi[self.args.pad_tok] = 0
    self.itos = {idx: word for word, idx in self.stoi.items()}
    print(len(self.stoi))
    self.n_words = len(self.stoi)

    # Only ids below the eos token's id receive a fastText vector; the
    # remaining rows stay zero.
    self.vectors = np.zeros((len(self.stoi), vec_dim))
    eos_index = self.stoi[self.args.eos_tok]
    for word, idx in self.stoi.items():
        if idx < eos_index:
            self.vectors[idx] = self.word_emb.wv[word]
return sim ''' Author:衣介书生 Link:https://www.jianshu.com/p/0c33c17770a0 ''' def multi_vec(vector_a, x): vector_a = np.mat(vector_a) return vector_a * x for model in models: embeddings = KeyedVectors.load_word2vec_format(datapath( (path / "models/{}.vector".format(model))), binary=False) cilin = open('../files/cilin_hier_perword', 'rb') fileout = open('../results/resem_comp_' + model, 'w', encoding='utf-8') scores = 0.0 cnt = 0 for line in cilin: line = line.decode('utf-8').split() word = line[5] sem = line[:5] length1 = len(sem) semvec1 = np.mat(0) for i in range(len(sem)): tmp = multi_vec(embeddings[sem[i]], 1 / (2**(len(sem) - i))) if (
def setUp(self):
    """Open the ``testcorpus.mm`` fixture and fit a two-topic LSI model on it."""
    corpus_path = datapath('testcorpus.mm')
    self.corpus = MmCorpus(corpus_path)
    self.model = lsimodel.LsiModel(
        self.corpus,
        num_topics=2,
    )
def run_training_batch(self, batch, batch_idx):
    """Co-train the GSM topic model on ``batch`` and run one training step.

    The batch is decoded back to text, cleaned and converted to bag-of-words
    documents for the GSM topic model.  The resulting topic distribution and
    topic loss are then forwarded to ``optimizer_closure`` for every
    (split, optimizer) pair, followed by the usual backward/bookkeeping steps.

    :param batch: dict; contains three keys: input_ids, attention_mask,
        decoder_input_ids.  Example::

            {'input_ids': tensor([[0, 36, 230, ..., 8, 41, 2]]),
             'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1]]),
             'decoder_input_ids': tensor([[0, 287, 10, ..., 2170, 1318, 2]])}

        ``None`` is accepted and short-circuits the whole step.
    :param batch_idx: number of batch
    :return: ``AttributeDict`` with ``signal`` and ``grad_norm_dic``; on a
        completed step also ``batch_log_metrics`` and
        ``training_step_output_for_epoch_end``.  ``signal`` is -1 when the
        model's ``on_batch_start`` hook returns -1, otherwise 0.
    """
    # track grad norms
    grad_norm_dic = {}

    # The guard must run before anything indexes into `batch`; the topic-model
    # preprocessing below would otherwise fail on a None batch.
    if batch is None:
        return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

    # track all metrics for callbacks
    batch_callback_metrics = []

    # track metrics to log
    batch_log_metrics = []

    # NOTE(review): tokenizer, config and dictionary are re-loaded on every
    # batch; consider loading them once and caching them on the trainer.
    # load tokenizer
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

    # load config for GSM
    config = yaml_load(f"{self.default_root_dir}/data/config/gsm.yaml")

    # load dict
    dictionary = Dictionary.load(datapath('dict-www-cnndm-unigram'))

    # separator tokens that are dropped before topic modeling
    sep_list = [
        '[SEP_0]', '[SEP_1]', '[SEP_2]', '[SEP_3]', '[SEP_4]', '[SEP_5]',
        '[SEP_6]', '[SEP_7]', '[SEP_8]', '[SEP_9]', '<S_SEP>'
    ]

    # vocab size for topic modeling
    vocab_size = len(dictionary)

    # model
    config['hidden']['features'][0] = vocab_size

    # trainer batch
    config['trainer_batch']['test_sample'] = 1

    config = extend_config_reference(config)
    gsm_trainer = config['GSMtrainer']
    gsm_trainer['base_dir'] = f"{self.default_root_dir}/log/bart-large-cnn-finetune"
    gsm_trainer = GSMTrainer.from_config(gsm_trainer)

    # number of topics
    K = config['gsmtopic']['k']

    # -----------------------------------------
    # Topic Modeling - GSM
    # -----------------------------------------
    batch_size = batch['input_ids'].size()[0]

    docs = []
    for batch_num in range(batch_size):
        # extract the batch_sentence
        batch_sentence = tokenizer.decode(
            batch['input_ids'][batch_num].tolist(),
            skip_special_tokens=True)
        # split to a token list
        batch_sentence_list = batch_sentence.split(" ")
        # remove [SEP]
        batch_sentence_list_nosep = [
            item for item in batch_sentence_list if item not in sep_list
        ]
        text = ' '.join(batch_sentence_list_nosep)
        # re-join word pieces and lowercase
        fine_text = text.replace(' ##', '').lower()
        # strip everything that is neither a word character nor whitespace
        batch_sentence = re.sub(r'[^\w\s]', '', fine_text)
        # batch_sentence: change to the cleaned news for topic modeling
        # change to training data format in topic modeling
        gsm_data_bow = dictionary.doc2bow(batch_sentence.split(" "))
        docs.append(gsm_data_bow)

    # gsm_data: data for topic modeling
    gsm_data = DataLoader(DocDataset(docs, len(dictionary), device='cuda'),
                          batch_size=config['dataset']['batch_size'],
                          drop_last=False,
                          num_workers=0)

    gsm_trainer.__dict__['train_iterator'] = gsm_data

    gsm_loss, gsm_p = gsm_trainer.co_train(vocab_size, training=True)

    del gsm_data

    # Batch start events
    with self.profiler.profile('on_batch_start'):
        # callbacks
        self.on_batch_start()
        # hooks
        if self.is_function_implemented('on_batch_start'):
            response = self.get_model().on_batch_start(batch)
            if response == -1:
                return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    splits = [batch]
    if self.truncated_bptt_steps is not None:
        model_ref = self.get_model()
        with self.profiler.profile('tbptt_split_batch'):
            splits = model_ref.tbptt_split_batch(batch, self.truncated_bptt_steps)

    self.hiddens = None
    for split_idx, split_batch in enumerate(splits):
        self.split_idx = split_idx

        for opt_idx, optimizer in self._get_optimizers_iterable():
            # make sure only the gradients of the current optimizer's parameters are calculated
            # in the training step to prevent dangling gradients in multiple-optimizer setup.
            if len(self.optimizers) > 1:
                for param in self.get_model().parameters():
                    param.requires_grad = False
                for group in optimizer.param_groups:
                    for param in group['params']:
                        param.requires_grad = True

            # -------------------
            # calculate loss
            # -------------------
            beta = 0.01
            opt_closure_result = self.optimizer_closure(
                split_batch,
                batch_idx,
                opt_idx,
                optimizer,
                self.hiddens,
                gsm_p,  # topic distribution
                gsm_loss,  # loss for topic modeling
                K,  # number of topics
                beta,
            )

            # ------------------------------
            # POST forward bookkeeping
            # ------------------------------
            batch_callback_metrics.append(
                opt_closure_result.training_step_output.callback_metrics)
            batch_log_metrics.append(
                opt_closure_result.training_step_output.log_metrics)

            self.add_progress_bar_metrics(
                opt_closure_result.training_step_output.pbar_on_batch_end)

            # track hiddens
            self.hiddens = opt_closure_result.hiddens

            # check if loss or model weights are nan
            if self.terminate_on_nan:
                self.detect_nan_tensors(opt_closure_result.loss)

            # track total loss for logging (avoid mem leaks)
            self.batch_loss_value.append(opt_closure_result.loss)

            # ------------------------------
            # BACKWARD PASS
            # ------------------------------
            # gradient update with accumulated gradients
            if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
                # backward
                grad_norm_dic = self.run_batch_backward_pass(
                    split_batch, batch_idx, opt_idx, optimizer)

                # calculate running loss for display
                self.running_loss.append(self.batch_loss_value.mean())

                # reset for next set of accumulated grads
                self.batch_loss_value.reset()

    # Batch end events
    with self.profiler.profile('on_batch_end'):
        # callbacks
        self.on_batch_end()
        # model hooks
        if self.is_function_implemented('on_batch_end'):
            self.get_model().on_batch_end()

    # collapse all metrics into one dict
    batch_log_metrics = {
        k: v for d in batch_log_metrics for k, v in d.items()
    }

    # track all metrics for callbacks
    self.callback_metrics.update(
        {k: v for d in batch_callback_metrics for k, v in d.items()})

    result = AttributeDict(
        signal=0,
        grad_norm_dic=grad_norm_dic,
        batch_log_metrics=batch_log_metrics,
        training_step_output_for_epoch_end=opt_closure_result.
        training_step_output_for_epoch_end)
    return result
from gensim.test.utils import datapath
from gensim.models.ldamodel import LdaModel

# Absolute location of the trained model.  It is passed to load() directly:
# `datapath` resolves names inside gensim's own test-data directory and only
# returned this path unchanged because joining onto an absolute path discards
# the prefix.
model_location = 'D:/HazMat/Projects/ML/Models/model_130'
model = LdaModel.load(model_location)
# Show the first 10 topics of the loaded model.
print(model.print_topics(10))
def ALeA(json_semanticSelection_One,
         json_semanticSelection_Two,
         pathModel,
         pathOutput,
         scoreAlignPhon="09_Aver_Score_Sem-Phon_Corr",
         verbose=False,
         semanticLevel="Level_01",
         dividers=None,
         selectBest="07_Sim_Score_Phon_Corr_Match",
         selectBestThreshold=0.65,
         parseVow=True):
    """
    Pair the entries of two semantically tagged lexical lists, score every pair
    semantically (WmdSimilarity) and phonetically (FAAL), rank the matches and
    write the results to disk.

    :param json_semanticSelection_One: first semantically tagged lexical list -- format: json string - output of the ASeT algorithm
    :param json_semanticSelection_Two: second semantically tagged lexical list -- format: json string - output of the ASeT algorithm
    :param pathModel: path to saved semantic model (string)
    :param pathOutput: path to save the results (string - no extension; e.g. /my/folder/name_file_with_my_results)
    :param scoreAlignPhon: select type of score according to which the phonetic alignments are organized (string)
        -- default: "09_Aver_Score_Sem-Phon_Corr"
        -- options: "07_Sim_Score_Phon_Corr_Match", "08_Sim_Score_Phon_Glob_Match", "09_Aver_Score_Sem-Phon_Corr", or "10_Aver_Score_Sem-Phon_Glob"
        -- "07_Sim_Score_Phon_Corr_Match" uses the function "(((SumFeat) / (NrFeat * 7.71)) / (LenAlign * 4.77117)"
        -- "09_Aver_Score_Sem-Phon_Corr" is the average between the semantic score and the "07_Sim_Score_Phon_Corr_Match"
        -- "10_Aver_Score_Sem-Phon_Glob" is the average between the semantic score and the "08_Sim_Score_Phon_Glob_Match"
        -- see FAAL documentation for details ( https://github.com/MKilani/FAAL )
    :param verbose: print data during execution (boolean) -- default: False
    :param semanticLevel: level of the semantic tags according to which the comparison is performed.
        The options, for now, are: "Level_01", "Level_02", "Level_03" (see ASeT algorithm for details)
    :param dividers: dividers used to split meanings (array of strings [string, string]) -- default: [","]
    :param selectBest: parameter according to which the algorithm selects the best matches among those identified by the ALeA on the basis of the other parameters
        -- default: "07_Sim_Score_Phon_Corr_Match"
        -- options: "07_Sim_Score_Phon_Corr_Match", "08_Sim_Score_Phon_Glob_Match", "09_Aver_Score_Sem-Phon_Corr", or "10_Aver_Score_Sem-Phon_Glob"
        -- "07_Sim_Score_Phon_Corr_Match" uses the function "(((SumFeat) / (NrFeat * 7.71)) / (LenAlign * 4.77117)"
        -- "09_Aver_Score_Sem-Phon_Corr" is the average between the semantic score and the "07_Sim_Score_Phon_Corr_Match"
        -- "10_Aver_Score_Sem-Phon_Glob" is the average between the semantic score and the "08_Sim_Score_Phon_Glob_Match"
        -- see FAAL documentation for details ( https://github.com/MKilani/FAAL )
    :param selectBestThreshold: threshold for the parameter selectBest -- default: 0.65
    :param parseVow: this allows to decide if the phonetic comparison should take into consideration vowels or not.
        Ignoring vowels can be useful when dealing with unrelated or relatively distant languages, or with
        languages in which vowels are rather unstable and semantically secondary (e.g. Semitic languages)
        -- default: True
    :return: tuple ``(json_all, json_best, simplified)``: the JSON string with all
        ranked matches, the JSON string restricted to the best matches, and a
        plain-text summary of the best matches. The same three payloads are
        written to ``pathOutput + ".json"``,
        ``pathOutput + "_best_<threshold>.json"`` and
        ``pathOutput + "_bestSimplified_<threshold>.txt"``.
    """
    import os  # local import so the file's import block does not need to change

    if dividers is None:
        dividers = [","]

    gateway = JavaGateway()
    addition_app = gateway.entry_point

    semanticSelectionDict_One = json.loads(json_semanticSelection_One)
    semanticSelectionDict_Two = json.loads(json_semanticSelection_Two)

    # ------------------------------------------------------------------
    # Index the matches of list two by semantic field.
    # ------------------------------------------------------------------
    SemanticIndex_ListTwo = {}
    for key_Two, entryTwo in semanticSelectionDict_Two.items():
        ID_Token = entryTwo["00_ID_token"]
        for match_ID, matchTwo in entryTwo["03_Matches"][semanticLevel].items():
            semantic_field = matchTwo["11_Semantic_Field"]
            ID_Cluster = matchTwo["05_ID_Cluster"]
            SemanticIndex_ListTwo.setdefault(semantic_field, []).append({
                "Key": key_Two,
                "ID_token": ID_Token,
                "ID_match": match_ID,
                "ID_Cluster": ID_Cluster
            })
    # (A leftover debug lookup of SemanticIndex_ListTwo["hurry"] used to sit
    # here; it raised KeyError for every input without that semantic field.)

    # ------------------------------------------------------------------
    # Combine the lists: one new entry per (token of list one, cluster ID),
    # holding every token of list two that shares a semantic field within
    # that cluster level.
    # ------------------------------------------------------------------
    semanticSelectionDict = {}
    counterNewPairs = 0

    print("*- Phonetic comparison -*")
    print("-> Start")
    print("Progress:")

    lastIndex_One = len(semanticSelectionDict_One) - 1
    for indexBar, entry in enumerate(semanticSelectionDict_One.values()):
        matches_One = entry["03_Matches"][semanticLevel]

        # The cluster ID of the last listed match is taken as the highest one.
        last_match = list(matches_One.keys())[-1]
        max_cluster_ID = matches_One[last_match]["05_ID_Cluster"]

        for new_ID_cluster in range(0, max_cluster_ID + 1):
            new_matches = {}
            for match_One in matches_One.values():
                if match_One["05_ID_Cluster"] > new_ID_cluster:
                    continue
                semanticToMatch = match_One["11_Semantic_Field"]
                for matchTwo in SemanticIndex_ListTwo.get(semanticToMatch, ()):
                    progbar(indexBar, lastIndex_One, 20)
                    if matchTwo["ID_Cluster"] > new_ID_cluster:
                        continue
                    entry_Two = semanticSelectionDict_Two[matchTwo["Key"]]
                    new_matches[len(new_matches)] = {
                        "00_ID_Match": entry_Two["00_ID_token"],
                        "01_Meaning_Match": entry_Two["01_Meaning_token"],
                        "02_Form_Match": entry_Two["02_Form_token"],
                        "03_Best_Match_Sem": [semanticToMatch, semanticToMatch],
                        "05_ID_Cluster": new_ID_cluster,
                        "06_Sim_Score_Sem_Match": 1.0,
                        "11_Semantic_Field": semanticToMatch,
                    }

            semanticSelectionDict[counterNewPairs] = {
                new_ID_cluster: {
                    "00_ID_token": entry["00_ID_token"],
                    "01_Meaning_token": entry["01_Meaning_token"],
                    "02_Form_token": entry["02_Form_token"],
                    "03_Matches": {semanticLevel: new_matches},
                }
            }
            counterNewPairs = counterNewPairs + 1

    # ------------------------------------------------------------------
    # Semantic comparison.
    # ------------------------------------------------------------------
    print()
    print("-> Load Model")
    # Use the path exactly as given when it exists; fall back to the legacy
    # behaviour (resolving inside gensim's test-data folder via datapath)
    # only when it does not, so existing callers keep working.
    model_file = pathModel if os.path.exists(pathModel) else datapath(pathModel)
    model = KeyedVectors.load(model_file)
    print("-> Model loaded")

    counter = 0
    for clusters in semanticSelectionDict.values():
        for tokenEntry in clusters.values():
            listMeanings = [
                meaning.split(" ")
                for meaning in _split_meanings(tokenEntry['01_Meaning_token'], dividers)
            ]

            print("-> Compile semantic index")
            print(str(counter + 1) + " of " + str(len(semanticSelectionDict)))
            counter = counter + 1
            index = WmdSimilarity(listMeanings, model, len(listMeanings))
            print("-> Semantic index compiled")

            for match in tokenEntry["03_Matches"][semanticLevel].values():
                bestResult = 0.0
                bestMatch = ["", ""]
                for meaning in _split_meanings(match["01_Meaning_Match"], dividers):
                    # NOTE(review): the query is the whole meaning as a single
                    # token, while the index is built from space-split tokens;
                    # kept as in the original -- confirm this is intended.
                    resultsQuery = list(index[[meaning]])
                    # Each result is indexed as (meaning index, similarity);
                    # only the first one is considered.
                    if resultsQuery and resultsQuery[0][1] > bestResult:
                        bestResult = resultsQuery[0][1]
                        bestMatch = [
                            " ".join(listMeanings[resultsQuery[0][0]]),
                            meaning,
                        ]
                match["06_Sim_Score_Sem_Match"] = bestResult
                match["03_Best_Match_Sem"] = bestMatch

    # ------------------------------------------------------------------
    # Phonetic comparison (FAAL). When a token has several forms, keep the
    # pair of forms giving the best alignment according to scoreAlignPhon.
    # ------------------------------------------------------------------
    print("*- Phonetic comparison -*")
    print("-> Start")
    print("Progress:")

    indexBar = -1
    for clusters in semanticSelectionDict.values():
        for tokenEntry in clusters.values():
            indexBar = indexBar + 1
            progbar(indexBar, len(semanticSelectionDict) - 1, 20)

            matches = tokenEntry["03_Matches"][semanticLevel]
            if matches == {}:
                continue

            word_A_list = tokenEntry['02_Form_token']
            for match in matches.values():
                word_B_list = match["02_Form_Match"]

                resultsComparison = {}
                IDBestMatch = []
                for index_WordA, word_A in enumerate(word_A_list):
                    for index_WordB, word_B in enumerate(word_B_list):
                        if not parseVow:
                            resultsComparisonTemp = interfaceFAAL(
                                removeVow(word_A), removeVow(word_B),
                                addition_app)
                        else:
                            # NOTE(review): unlike the branch above, this call
                            # does not pass addition_app; kept as in the
                            # original -- confirm against interfaceFAAL.
                            resultsComparisonTemp = interfaceFAAL(
                                word_A, word_B)

                        if (resultsComparison == {}
                                or resultsComparisonTemp[scoreAlignPhon] >
                                resultsComparison[scoreAlignPhon]):
                            resultsComparison = resultsComparisonTemp
                            IDBestMatch = [
                                index_WordA, word_A, index_WordB, word_B
                            ]

                match['12_ResultsComp'] = resultsComparison
                match['04_Best_Match_Phon'] = IDBestMatch
                match['07_Sim_Score_Phon_Corr_Match'] = resultsComparison[
                    "bestAlignCorrected"]
                match['08_Sim_Score_Phon_Glob_Match'] = resultsComparison[
                    "bestAlignGlobal"]
                match['09_Aver_Score_Sem-Phon_Corr'] = (
                    match["07_Sim_Score_Phon_Corr_Match"] +
                    match["06_Sim_Score_Sem_Match"]) / 2
                match['10_Aver_Score_Sem-Phon_Glob'] = (
                    match["08_Sim_Score_Phon_Glob_Match"] +
                    match["06_Sim_Score_Sem_Match"]) / 2

    # ------------------------------------------------------------------
    # Rank the matches of every entry by scoreAlignPhon (descending) and drop
    # repeated "00_ID_Match" values, keeping the best-ranked occurrence.
    # ------------------------------------------------------------------
    print()
    print("Progress:")

    semanticSelectionDict_ordered = {}
    for indexBar, (key_A, clusters) in enumerate(semanticSelectionDict.items()):
        progbar(indexBar, len(semanticSelectionDict) - 1, 20)

        orderedClusters = semanticSelectionDict_ordered.setdefault(key_A, {})
        temporaryEntries = []

        for sem_Cluster, tokenEntry in clusters.items():
            orderedEntry = orderedClusters.setdefault(sem_Cluster, {})
            orderedEntry["00_ID_token"] = tokenEntry["00_ID_token"]
            orderedEntry["01_Meaning_token"] = tokenEntry["01_Meaning_token"]
            orderedEntry["02_Form_token"] = tokenEntry["02_Form_token"]

            matches = tokenEntry["03_Matches"][semanticLevel]
            if matches == {}:
                orderedEntry["03_Matches"] = {semanticLevel: matches}
                continue

            for n in range(0, len(matches)):
                _insert_by_score_desc(temporaryEntries, matches[n],
                                      scoreAlignPhon)

            temporaryEntriesCleaned = []
            for temporaryEntry in temporaryEntries:
                doubleEntry = any(
                    temporaryEntry["00_ID_Match"] == kept["00_ID_Match"]
                    for kept in temporaryEntriesCleaned)
                if not doubleEntry:
                    temporaryEntriesCleaned.append(
                        copy.deepcopy(temporaryEntry))

            orderedEntry["03_Matches"] = {
                semanticLevel: dict(enumerate(temporaryEntriesCleaned))
            }

    json_semanticSelectionDict = json.dumps(semanticSelectionDict_ordered,
                                            sort_keys=True,
                                            indent=3,
                                            ensure_ascii=False)

    print()
    print("-> End")
    print()

    # ------------------------------------------------------------------
    # Select the top matches: the best-ranked match is always kept, every
    # further match only if its selectBest score exceeds the threshold.
    # ------------------------------------------------------------------
    print("Select top matches - Progress:")

    # Round trip through JSON: from here on all dictionary keys are strings.
    semanticSelectionDict = json.loads(json_semanticSelectionDict)
    semanticSelectionDict_ordered_best = {}
    simplifiedParts = []

    for indexBar, (key_A, clusters) in enumerate(semanticSelectionDict.items()):
        progbar(indexBar, len(semanticSelectionDict) - 1, 20)

        bestClusters = semanticSelectionDict_ordered_best.setdefault(key_A, {})
        temporaryEntries = []

        for sem_Cluster, tokenEntry in clusters.items():
            bestEntry = bestClusters.setdefault(sem_Cluster, {})
            bestEntry["00_ID_token"] = tokenEntry["00_ID_token"]
            bestEntry["01_Meaning_token"] = tokenEntry["01_Meaning_token"]
            bestEntry["02_Form_token"] = tokenEntry["02_Form_token"]

            matches = tokenEntry["03_Matches"][semanticLevel]
            if matches == {}:
                bestEntry["03_Matches"] = {semanticLevel: matches}
                continue

            for n in range(0, len(matches)):
                candidate = matches[str(n)]
                if (not temporaryEntries
                        or candidate[selectBest] > selectBestThreshold):
                    temporaryEntries.append(candidate)

            bestMatches = {}
            bestEntry["03_Matches"] = {semanticLevel: bestMatches}
            for ID, kept in enumerate(temporaryEntries):
                keptCopy = copy.deepcopy(kept)
                bestMatches[str(ID)] = keptCopy
                simplifiedParts.append(
                    "Cluster: {} :: {} - '{}' - {} :: {} - '{}' - {} :: {}\n".format(
                        sem_Cluster,
                        bestEntry["00_ID_token"],
                        ", ".join(tokenEntry["02_Form_token"]),
                        tokenEntry["01_Meaning_token"],
                        keptCopy["00_ID_Match"],
                        ", ".join(keptCopy["02_Form_Match"]),
                        keptCopy["01_Meaning_Match"],
                        keptCopy[selectBest]))
            simplifiedParts.append("---------\n")

    resultsSimplifiedString = "".join(simplifiedParts)

    if verbose:
        print()
        print()
        print(resultsSimplifiedString)

    json_semanticSelectionDict_best = json.dumps(
        semanticSelectionDict_ordered_best,
        sort_keys=True,
        indent=3,
        ensure_ascii=False)

    # The payloads are produced with ensure_ascii=False, so write them as
    # UTF-8 explicitly instead of relying on the platform default encoding.
    outputs = (
        (pathOutput + ".json", json_semanticSelectionDict),
        (pathOutput + "_best_" + str(selectBestThreshold) + ".json",
         json_semanticSelectionDict_best),
        (pathOutput + "_bestSimplified_" + str(selectBestThreshold) + ".txt",
         resultsSimplifiedString),
    )
    for outputPath, payload in outputs:
        with open(outputPath, "w", encoding="utf-8") as outputFile:
            outputFile.write(payload)

    return json_semanticSelectionDict, json_semanticSelectionDict_best, resultsSimplifiedString


def _split_meanings(raw_meaning, dividers):
    """Split a raw meaning string into its individual meanings.

    Every divider is replaced by the sentinel "£", runs of spaces are
    collapsed to a single space, spaces next to a sentinel are removed, and
    the string is finally split on the sentinel.
    """
    for divider in dividers:
        raw_meaning = raw_meaning.replace(divider, "£")
    while "  " in raw_meaning:
        raw_meaning = raw_meaning.replace("  ", " ")
    raw_meaning = raw_meaning.replace("£ ", "£")
    raw_meaning = raw_meaning.replace(" £", "£")
    return raw_meaning.split("£")


def _insert_by_score_desc(ranked, match, score_key):
    """Insert match into ranked, which is kept in descending score_key order.

    A match whose score ties with existing entries is placed in front of them.
    """
    if not ranked:
        ranked.append(match)
        return
    score = match[score_key]
    if score >= ranked[0][score_key]:
        ranked.insert(0, match)
    elif score < ranked[-1][score_key]:
        ranked.append(match)
    else:
        for position in range(1, len(ranked)):
            if ranked[position - 1][score_key] > score >= ranked[position][score_key]:
                ranked.insert(position, match)
                break
esac else echo "'$1' is not a valid file!" fi } """ import pprint from gensim.test.utils import common_texts, get_tmpfile, datapath from gensim.models import Word2Vec, KeyedVectors import gensim.matutils binPath = "/root/GoogleNews-vectors-negative300.bin" binPath = "/Users/liruqi/GoogleNews-vectors-negative300.bin" print(binPath) # Gensim can load word vectors in the “word2vec C format”, as a KeyedVectors instance: wv_from_bin = KeyedVectors.load_word2vec_format(datapath(binPath), binary=True) v1 = wv_from_bin.wv['man'] v2 = wv_from_bin.wv['woman'] pprint.pprint(v1) pprint.pprint(v2) pprint.pprint(wv_from_bin.similarity('man','woman')) pprint.pprint(wv_from_bin.distance('man','woman')) pprint.pprint(1 - wv_from_bin.n_similarity( "National tragedy Trump begins border wall construction in Unesco reserve".split(" "), "Trump administration enters new phase for border wall sets ambitious timetable after securing land".split(" ") ) ) """ # python3 page3.py Traceback (most recent call last):
def __iter__(self):
    """Yield every line of the 'lee_background.cor' data file, run through
    utils.simple_preprocess."""
    corpus_path = datapath('lee_background.cor')
    with open(corpus_path) as corpus_file:
        for raw_line in corpus_file:
            tokens = utils.simple_preprocess(raw_line)
            yield tokens
p.add_argument("--dependency", action="append") p.add_argument("--language", action="append") args = p.parse_args() embeddings_folder = args.embeddings_folder output_folder = args.output_folder vaa_pairs_folder = args.vaa_pairs_folder avv_pairs_folder = args.avv_pairs_folder save_path = args.save_path dependency = args.dependency language = args.language dim = 300 batch = 32 epoch = 5 for lang in language: embeddings = KeyedVectors.load_word2vec_format(datapath( (embeddings_folder / "embeddings_{}".format(lang))), binary=False) file0 = open((embeddings_folder / "embeddings_{}".format(lang)), "rb") vocabulary = [] file0.readline() for line in file0: line = line.decode('utf-8').split() vocabulary.append(line[0]) file0.close() model = nn.DotProductModel(embeddings, vocabulary) for dep in dependency: vaa_train_f = open( (vaa_pairs_folder / "{}/v_a1_a2_filtered_pairs_{}_{}.train".format(lang, dep, lang)), "rb") vaa_test_f = open(
text = file_handle.read() doc = nlp(text.lower()) texts, article = [], [] for w in doc: if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and w.text != 'I': article.append(w.lemma_) if w.text == '\n': texts.append(article) article = [] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] if len(corpus) == 0: print() else: ldamodel = LdaModel(corpus=corpus, num_topics=50, id2word=dictionary) temp_topic_file = datapath("model") ldamodel.save(temp_topic_file) ldamodel = LdaModel.load(temp_topic_file) # print(ldamodel) # for i in ldamodel.show_topics(): # print(i) file = "sample.csv" print("issue_id,topic") file_handle = open(file, 'r') file_handle.readline() for line in file_handle: a, text = line.split("~", 1) text = text[:-1] text = ' '.join(text.split()) text = text + "\n"
def test_load_model_supervised(self):
    """Loading the 'pang_lee_polarity_fasttext' fixture through
    load_fasttext_format must raise NotImplementedError."""
    fixture_path = datapath('pang_lee_polarity_fasttext')
    self.assertRaises(
        NotImplementedError,
        FT_gensim.load_fasttext_format,
        fixture_path,
    )
# Demo script: builds / loads several sets of word vectors and prints the
# nearest neighbour of a few words.
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

# Getting the word2vec model to train the other vectors;
# as given in the problem, either polyglot or word2vec embeddings can be chosen.
# NOTE(review): `path` is assigned but not used anywhere in this chunk.
path = get_tmpfile("word2vec.model")
# Train a Word2Vec model on gensim's bundled `common_texts` and keep its vectors.
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
myvector = model.wv

from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

filename = get_tmpfile("vectors.kv")
# NOTE(review): the save is commented out, so the load below presumably only
# works when "vectors.kv" already exists at that location from an earlier
# run -- confirm.
#myvector.save(filename)
myvector = KeyedVectors.load(filename, mmap='r')

from gensim.test.utils import datapath

# Load vectors stored in word2vec text format (binary=False).
# `wv_from_text` is not used again in this chunk.
wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False)

import gensim.downloader as api

# `myvector` is rebound once more: every query below runs against whatever
# api.load("glove-twitter-50") returns, not against the vectors loaded above.
myvector = api.load("glove-twitter-50")

# For each word, print its single most similar word (second argument is 1).
print("similar word for Dog :")
print(myvector.similar_by_word('dog', 1))
print("-------------------------")
print("similar word for Whale :")
print(myvector.similar_by_word('whale', 1))
print("-------------------------")
print("similar word for before :")
print(myvector.similar_by_word('before', 1))
print("-------------------------")
print("similar word for however :")
passes=10, alpha='auto', per_word_topics=True) num_topics = 20 print("\n\n\n Total topics with word probabilities are: \n") pprint(lda_model.print_topics()) doc_lda = lda_model[corpus] print("\n\n Percentage of a topic in a document \n\n") print(lda_model[corpus[0]]) print("\n\n") from gensim.test.utils import datapath temp_file = datapath("model") lda_model.save(temp_file) # Load a potentially pretrained model from disk. lda_model = gensim.models.ldamodel.LdaModel.load(temp_file) other_texts = [[ 'He do not cook well ', 'why does he annoy me so much', 'this lockdown is getting on my nerve' ], [ 'Good activities', 'i am really feeling very energetic', 'she snores so much' ], ['he is so nice and kind', 'system panic', 'nice person']] print(other_texts) other_corpus = [id2word.doc2bow(text) for text in other_texts]
# 1.3 训练得到word embedding word2vector = model.wv # KeyedVectors vector = word2vector['computer'] # numpy vector of shape (100, ) path = get_tmpfile("wordvectors.kv") word2vector.save(path) word2vector = KeyedVectors.load(path, mmap='r') # 1.4 自动检测并训练词组Phrase bigram_transformer = Phrases(common_texts) model = Word2Vec(bigram_transformer[common_texts], min_count=1) # 2 处理word embedding # 2.1 加载现成的word embedding word2vector1 = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) # C text format word2vector2 = KeyedVectors.load_word2vec_format( datapath("euclidean_vectors.bin"), binary=True) # C bin format # 2.2 转化普通glove文件为Gensim支持的word2vec格式,即 C text format # 普通glove文件格式:没有header,从第一行开始就是word及其vector,空格分隔 # word2vec文件格式:第一行是vector个数和vector维度,其他行同普通txt文件,空格分隔 glove_file = './data/normal_glove.txt' word2vec_file = './data/normal_word2vec.txt' glove2word2vec(glove_file, word2vec_file) word2vector = KeyedVectors.load_word2vec_format(word2vec_file, binary=False) # Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings. embedding_layer = word2vector.get_keras_embedding(train_embeddings=False)
def glove2w2v(glove_path, w2v_path):
    """Run glove2word2vec on a GloVe file.

    The input name is resolved with ``datapath`` and the output name with
    ``get_tmpfile`` before the conversion is invoked. Nothing is returned.
    """
    source_file = datapath(glove_path)
    target_file = get_tmpfile(w2v_path)
    glove2word2vec(source_file, target_file)
import json import gensim import markovify from gensim.test.utils import datapath from ModelMaker import process_body if __name__ == '__main__': dictionary = None model_choice = 10 # Can be either 5, 10, 15, 20 markov_models = [] with open('Raw Data/dictionary', 'rb') as fp: dictionary = pickle.load(fp) fname = datapath( 'C:/Users/User/Desktop/Love Advice Bot/Raw Data/ldaModel' + str(model_choice)) lda_model = gensim.models.LdaModel.load(fname) num_topics = lda_model.get_topics().shape[0] print('****LOADING MARKOV MODELS****\n') for x in range(num_topics): with open('Raw Data/MarkovModels/markov' + str(model_choice) + '_' + str(x) + '.json') as fp: model_json = json.load(fp) markov_models.append(markovify.Text.from_json(model_json)) print('****PROCESSING I-O/Input.txt ****\n') question = "" with open('I-O/Input.txt', encoding="utf8") as fp:
class Recommendation:
    """Pickle-backed singleton that keeps one ``User`` object per user id and
    produces article recommendations through the shared ``Model`` instance.

    Obtain the instance with ``Recommendation.getInstance()``. Every method
    that changes ``self.users`` re-pickles the whole object to ``__path``.
    """

    __instance = None  # the singleton, set by __init__
    __name = "Recommendation"
    __path = datapath(__name)  # file the singleton is pickled to
    __model = Model.getInstance()
    __alpha = 5.730e-7  # passed to User.get_preference_vector
    __min_val = 0.1  # passed to User.get_preference_vector

    @staticmethod
    def getInstance():
        """Return the singleton, creating (or unpickling) it on first use."""
        if Recommendation.__instance is None:
            Recommendation()
        return Recommendation.__instance

    def __init__(self):
        """Restore the singleton from its pickle file, or build it from the
        ``users`` table when the file cannot be loaded."""
        try:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # the file at __path must only ever be written by this class.
            with open(Recommendation.__path, 'rb') as file:
                Recommendation.__instance = pickle.load(file)
        except Exception as e:
            # Deliberately broad: any failure to restore (missing file,
            # corrupt pickle, ...) falls back to rebuilding from the database.
            print(e)
            conn = connect()
            self.users = {}
            if conn is not None:
                try:
                    cursor = conn.cursor()
                    try:
                        cursor.execute("SELECT id FROM users")
                        for row in cursor.fetchall():
                            self.users[row[0]] = User(
                                row[0], Recommendation.__model.dimensions)
                    finally:
                        cursor.close()
                finally:
                    conn.close()
            self._save()
            Recommendation.__instance = self

    def _save(self):
        """Pickle this object to ``__path``, closing the file even on error."""
        with open(Recommendation.__path, 'wb') as file:
            pickle.dump(self, file, pickle.HIGHEST_PROTOCOL)

    def add_user(self, id):
        """Register a new ``User`` under ``id`` and persist the change."""
        self.users[id] = User(id, Recommendation.__model.dimensions)
        self._save()

    def show_users(self):
        """Print every user id together with that user's ``spikes``."""
        for index, user in self.users.items():
            print(index, user.spikes)

    def recommend_articles(self, id):
        """Return the model's recommendation for user ``id``, based on the
        user's preference vector at the current time (whole seconds)."""
        curr_time = int(round(time.time()))
        preference = self.users[id].get_preference_vector(
            curr_time, Recommendation.__alpha, Recommendation.__min_val)
        return Recommendation.__model.getReccommendation(preference)

    def read_articles(self, ids, uid):
        """Record that user ``uid`` read the articles ``ids``.

        For every article the (up to) five highest-weighted entries of its
        model vector are collected; the user's spikes are then updated with
        the collected indices and the object is persisted.

        Returns True on success, False when no database connection could be
        obtained.
        """
        conn = connect()
        if conn is None:
            return False

        indices_set = set()
        try:
            cursor = conn.cursor()
            try:
                for article_id in ids:
                    # NOTE(review): the query is built by string
                    # concatenation; switch to a parameterized query using the
                    # placeholder style of the driver behind connect().
                    cursor.execute(
                        "SELECT content from articles where id=" +
                        str(article_id))
                    rows = cursor.fetchall()
                    l = Recommendation.__model.getVector(rows[0][0])
                    l.sort(key=lambda x: x[1], reverse=True)
                    # Slice instead of indexing 0..4 so that vectors with
                    # fewer than five entries do not raise IndexError.
                    for entry in l[:5]:
                        indices_set.add(entry[0])
            finally:
                cursor.close()
        finally:
            # The connection was previously never closed on this path.
            conn.close()

        self.users[uid].update_spike(indices_set)
        self._save()
        return True

    def initialize_vec(self, tags, id):
        """Initialise the spikes of user ``id`` from a list of tags.

        A topic index is selected when any tag is among the five words that
        ``show_topics`` lists for that topic.
        """
        l = Recommendation.__model.model.show_topics(num_topics=50,
                                                     num_words=5,
                                                     log=False,
                                                     formatted=False)
        sets = {
            i: {l[i][1][j][0] for j in range(0, 5)}
            for i in range(0, Recommendation.__model.dimensions)
        }
        ans = {
            k for k, val in sets.items() if any(tag in val for tag in tags)
        }
        self.users[id].update_spike(ans)
        self._save()
def test_open_file_existent_file_object(self):
    """utils.open_file must accept an already opened file object and let the
    caller iterate over it ('testcorpus.mm' is expected to yield 30 lines)."""
    expected_line_count = 30
    opened_file = open(datapath('testcorpus.mm'))
    with utils.open_file(opened_file) as infile:
        line_count = 0
        for _ in infile:
            line_count += 1
        self.assertEqual(line_count, expected_line_count)
# H*! from flask import Flask, request from flask_cors import CORS, cross_origin from flask_restful import Resource, Api from json import dumps from flask_jsonpify import jsonify app = Flask(__name__) api = Api(app) import time t0 = time.time() from gensim.test.utils import datapath cap_path = datapath("/home/haoran/cc.en.300.bin") from gensim.models.wrappers import FastText wv = FastText.load_fasttext_format(cap_path) t1 = time.time() CORS(app) print('app started') print('time to load:', t1 - t0) def most_similar(word): return wv.most_similar(word) def similarity(word1, word2):
def setUp(self):
    """Prepare the fixtures: the 'testcorpus.mm' corpus, the model class under
    test and one model instance built from the module-level corpus."""
    self.class_ = hdpmodel.HdpModel
    self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
    # np.random.seed() seeds NumPy's global generator and returns None, so the
    # model receives random_state=None (not 0).
    seed_result = np.random.seed(0)
    self.model = self.class_(
        corpus,
        id2word=dictionary,
        random_state=seed_result,
    )
bleu_np = np.asarray(BLEU_scores) jacc_np = np.asarray(jacc_sim) cos_np = np.asarray(cos_sim) fscore_np = np.asarray(fscore) mean_jaccard.append( np.mean(jacc_np) ) mean_bleu.append( np.mean(bleu_np) ) mean_cos.append( np.mean(cos_np) ) mean_fscore.append( np.mean(fscore_np) ) return np.max( np.asarray(mean_bleu) ), np.max( np.asarray(mean_jaccard) ), np.max( np.asarray(mean_cos) ), np.max( np.asarray(mean_fscore) ) GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union = load_annotations() path = '/home/norberteke/PycharmProjects/Thesis/data/' dictionary = Dictionary.load(path + 'SO_full_processed_Dictionary.dict') corpus = MmCorpus(datapath(path + 'corpus_processed_SO_full.mm')) texts = [] with open(path + 'new_SO_full_processed_corpus.csv', 'r') as f: reader = csv.reader(f) texts = list(reader) terms = [] for (key, value) in dictionary.iteritems(): terms.append(value) def write_results_to_file(path, lda_model, max_bleu, max_jaccard, max_cos, max_fscore): with open(path, 'a') as f: writer = csv.writer(f, delimiter = ',', quotechar='"', quoting = csv.QUOTE_MINIMAL) writer.writerow([str(lda_model.num_topics), str(lda_model.eta), str(max_bleu), str(max_jaccard), str(max_cos), str(max_fscore)])
def test_save_load_no_scoring(self):
    """Load a FrozenPhrases object that was saved without a scoring parameter.

    Guards backwards compatibility with old versions of FrozenPhrases: the
    loaded object must report the original scorer.
    """
    legacy_pickle = datapath("phraser-no-scoring.pkl")
    bigram_loaded = FrozenPhrases.load(legacy_pickle)
    # Nothing else is done with the scoring; only its identity is verified.
    loaded_scorer = bigram_loaded.scoring
    self.assertEqual(loaded_scorer, original_scorer)