def create_evaluation_perplexity(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    try:
        id2word = Dictionary.load(corpus_fname + '.dict')
        corpus = MalletCorpus(corpus_fname, id2word=id2word)
    except Exception:
        error('Corpora not built yet -- cannot evaluate')

    held_out = list()
    training = list()
    target_len = int(0.1 * len(corpus))
    logger.info('Calculating perplexity with held-out %d of %d documents' %
                (target_len, len(corpus)))

    # sample document ids for the held-out set; randint's upper bound is
    # inclusive, so subtract one to stay within range
    ids = set()
    while len(ids) < target_len:
        ids.add(random.randint(0, len(corpus) - 1))

    for doc_id, doc in enumerate(corpus):
        if doc_id in ids:
            held_out.append(doc)
        else:
            training.append(doc)

    model = LdaModel(training,
                     id2word=corpus.id2word,
                     alpha=config.alpha,
                     passes=config.passes,
                     num_topics=config.num_topics)

    pwb = model.log_perplexity(held_out)

    with open(config.path + 'evaluate-perplexity-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, pwb])
class TestLdaCallback(unittest.TestCase):

    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback)
        self.host = "http://localhost"
        self.port = 8097

    def testCallbackUpdateGraph(self):
        # Popen has no context manager in Python 2.7, hence the try/finally.
        try:
            # spawn the visdom server
            proc = subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(self.port)])
            # wait for visdom server startup (any better way?)
            time.sleep(3)
            viz = Visdom(server=self.host, port=self.port)
            assert viz.check_connection()
            # clear screen
            viz.close()
            self.model.update(self.corpus)
        finally:
            proc.kill()
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level
    model_fname += '.lda.gz'

    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every = None  # run in batch if we have a pre-supplied corpus
        else:
            update_every = 1
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None,  # disable perplexity tests for speed
                         update_every=update_every,
                         )
        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
def perform_lda(dictionary, corpus, num_topics, wiki_path=None, passes=1, iterations=50, chunksize=200):
    """
    :param dictionary: gensim Dictionary mapping token ids to tokens
    :param corpus: bag-of-words corpus to train on
    :param wiki_path: optional path to pickled wiki documents appended to the corpus
    :param num_topics: number of topics to fit
    :param passes: number of passes over the corpus
    :param iterations: maximum inference iterations per document
    :param chunksize: number of documents per training chunk
    :return: (lda_model, doc_vectors, doc_vector_ids)
    """
    if wiki_path is not None:
        logging.info('Generating wiki corpus...')
        wikis = unpickle(wiki_path)
        wiki_corpus = [dictionary.doc2bow(wiki) for wiki in wikis]

        logging.info('Combining original corpus and wiki corpus...')
        corpus = corpus + wiki_corpus  # wiki_corpus is merged after the original corpus

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                         passes=passes, iterations=iterations, alpha='auto', chunksize=chunksize)

    corpus_ids = get_corpus_ids(dictionary.corpus_id2orig_id)
    doc_vector_ids = [dictionary.corpus_id2orig_id[corpus_id] for corpus_id in corpus_ids]

    doc_vectors = lda_model.inference(corpus)[0]
    doc_vectors = doc_vectors[corpus_ids, :]
    doc_vectors = doc_vectors / doc_vectors.sum(axis=1).reshape(doc_vectors.shape[0], 1)

    return lda_model, doc_vectors, doc_vector_ids
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    # scikit-learn returns a documents-by-terms matrix, so tell gensim
    # that documents are rows, not columns
    corpus = Sparse2Corpus(corpus, documents_columns=False)
    model = LdaModel(corpus=corpus, num_topics=k)

    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
class TestLdaDiff(unittest.TestCase):
    def setUp(self):
        self.dictionary = common_dictionary
        self.corpus = common_corpus
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

    def testBasic(self):
        # test for matrix case
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

        # test for diagonal case
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, diagonal=True)

        self.assertEqual(mdiff.shape, (self.num_topics,))
        self.assertEqual(len(annotation), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            # test for matrix case
            mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(np.diag(mdiff), np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            # test for diagonal case
            mdiff, annotation = \
                self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True)

            for (int_tokens, diff_tokens) in annotation:
                self.assertEqual(diff_tokens, [])
                self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something')
        self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something')
def extract_topics(words):
    word_id_map = Dictionary([words])
    # drop tokens that occur exactly twice, then re-assign ids
    word_id_map.filter_tokens([id for id, occurrence in word_id_map.dfs.items() if occurrence == 2])
    word_id_map.compactify()
    deals_corpus = [word_id_map.doc2bow(words)]
    lda = LdaModel(corpus=deals_corpus,
                   id2word=word_id_map,
                   num_topics=15,
                   update_every=1,
                   chunksize=1000,
                   passes=1)
    topics = []
    for i in range(15):
        tokens = lda.print_topic(i).split('+')
        topic_scores = []
        for token in tokens:
            score, token_val = token.split('*')
            topic_scores.append((token_val, score))
        topics.append(topic_scores)
    return topics
def __init__(self, destination, fileName, modelName='', ldaPasses='', topicNum=''):
    '''
    Constructor
    '''
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    self.__destination = destination
    self.__fileName = fileName
    self.__modelName = modelName
    self.__ldaPasses = ldaPasses
    self.__topicNum = topicNum

    #=======================================================================
    # STOP WORDS AND CHARACTERS
    #=======================================================================
    self.__stopwords = stopwords.words('english')  # + string.punctuation
    self.__chars_to_remove = [u'[', u']', u'(', u')', u'*', u'%', u'{', u'}', u'\n', u'\n\n', u'\t', u';', u'/', u'^', u'--', u'\\', u'+', u'-', u'.', u'?', u'&', u'#', u'', u'']
    self.__stopwords.extend(self.__chars_to_remove)
    self.__stopwords.extend([item for item in string.punctuation])

    #=======================================================================
    # DATABASE
    #=======================================================================
    self.__db = connectMySQL(db='xpath', port=3366)
    self.__queryResults = None
    self.__cleanedCorpus = []

    if modelName != '' and os.path.exists(self.__destination + modelName + '.lda'):
        self.__ldaModel = LdaModel.load(self.__destination + modelName + '.lda', mmap='r')

    if fileName != '' and os.path.exists(self.__destination + fileName + '.dict'):
        self.__modelDict = corpora.Dictionary.load(self.__destination + fileName + '.dict')
def write_topics(model_path, csv_name, k):
    model = LdaModel.load(model_path)
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))

    dictionary = Dictionary.load('data/dictionary/tweets.dict')
    word_indices = dictionary.id2token

    writer = csv.writer(open(csv_name, 'w'))

    output = [[0 for i in range(model.num_topics)] for j in range(k)]
    for topic_id, topic in enumerate(topics):
        # keep the top-k terms of each topic, highest probability first
        for rank, index in enumerate(topic.argsort()[::-1]):
            output[rank][topic_id] = {}
            output[rank][topic_id]['word'] = word_indices[index]
            output[rank][topic_id]['p'] = topic[index]
            if rank + 1 >= k:
                break

    for topic_id in range(model.num_topics):
        row = ['z = ' + str(topic_id)]
        for rank in range(k):
            row.append(output[rank][topic_id]['word'] + ':' + str(output[rank][topic_id]['p']))
        writer.writerow(row)
def load(self):
    '''Load the trained LDA model and the dictionary from disk.'''
    lda_file = config.get('dmp', 'lda_file')
    dic_file = config.get('dmp', 'dic_file')
    self.lda = LdaModel.load(lda_file)
    self.dic = Dictionary.load(dic_file)
def create_evaluation_distinctiveness(config, Kind):
    model_fname = config.model_fname % Kind.__name__

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except Exception:
        error('Cannot evaluate LDA models that have not been built yet!')

    scores = utils.score(model, utils.kullback_leibler_divergence)
    total = sum([x[1] for x in scores])

    logger.info("%s model KL: %f" % (model_fname, total))
    with open(config.path + 'evaluate-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, total])

    etas = list()
    for topic in model.state.get_lambda():
        topic_eta = list()
        for p_w in topic:
            topic_eta.append(p_w * numpy.log2(p_w))
        etas.append(-sum(topic_eta))

    entropy = sum(etas) / len(etas)

    logger.info("%s model entropy mean: %f" % (model_fname, entropy))
    with open(config.path + 'evaluate-entropy-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, entropy])
class CorpusLdaModelWrapper:
    def __init__(self, corpus, dictionary, doc_labels, preprocessing_pipeline, numtopics):
        self.corpus = corpus
        self.dictionary = dictionary
        self.doc_labels = doc_labels
        self.pipeline = preprocessing_pipeline
        self.numtopics = numtopics
        self.trained = False

    def train(self):
        # training
        self.model = LdaModel(self.corpus, id2word=self.dictionary, num_topics=self.numtopics)
        self.index = MatrixSimilarity(self.model[self.corpus])

        # flag
        self.trained = True

    def convertTextToReducedVector(self, text):
        if not self.trained:
            raise exceptions.ModelNotTrainedException()
        tokens = word_tokenize(prep.preprocess_text(text, self.pipeline))
        tokens = [token for token in tokens if token in self.dictionary.token2id]
        bow = self.dictionary.doc2bow(tokens)
        return self.model[bow]

    def queryDoc(self, text):
        reducedVec = self.convertTextToReducedVector(text)
        sims = self.index[reducedVec]
        simtuples = zip(range(len(sims)), sims) if self.doc_labels is None else zip(self.doc_labels, sims)
        simtuples = sorted(simtuples, key=lambda item: item[1], reverse=True)
        return simtuples

    def show_topic(self, id):
        return self.model.show_topic(id)
def calculateLDADistance(self, modelName='', topNSimilar='', topicList=''): if modelName=='': modelName=self.__fileName if topNSimilar=='': topNSimilar=5 write2file = self.__destination+modelName+"_results_LDA_similarTopics.csv" resultsCSV = open(write2file, "wb") print 'Reading model data' gensimDict = corpora.Dictionary.load(self.__destination+self.__fileName+'.dict') ldaModel = LdaModel.load(self.__destination+modelName+'.lda', mmap=None) topics = ldaModel.show_topics(num_topics=ldaModel.num_topics, num_words=len(gensimDict),formatted=False) #======================================================================= # num_topics=ldaModel.num_topics # num_words=len(gensimDict) #======================================================================= #======================================================================= # GET SIMILARITY VECTORS #======================================================================= print 'Extractig vectors' topicsSorted = [sorted(x, key=lambda x: x[1]) for x in topics] vectors = [] for topic in topicsSorted: vector = [item[0] for item in topic] vectors.append(vector) #======================================================================= # CALCULATE SIMILARITIES BETWEEN TOPICS #======================================================================= print 'Calculating distances between LDA topics\n' results = [] for topicListItem in topicList: distances = [] for j in range (0, len(vectors)): dist = euclidean(vectors[topicListItem], vectors[j]) #=============================================================== # print topicListItem, j, dist #=============================================================== distances.append(dist) results.append(distances) #======================================================================= # EXPORT TOP N SIMILAR TOPICS NAD PRINT OUT QUERY TERMS #======================================================================= print 'Writing found similar topics to file\n' for resultItem in range(0,len(results)): similarLDATopics = np.argsort(results[resultItem])[::-1] for similarItem in similarLDATopics[:topNSimilar]: #=============================================================== # print topicList[resultItem],similarItem #=============================================================== resultsCSV.write(str(topicList[resultItem])+'; '+str(similarItem)+'; '+', '.join(x[1].lstrip().rstrip() for x in topics[similarItem][:100])+'\n\n') resultsCSV.write('*******************************************\n\n')
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname, id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname, id2word=changeset_id2word)
    except Exception:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except Exception:
        error('Cannot evaluate LDA models that have not been built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    first_shared = dict()
    for id_ in commit_doc_topic:
        i = 0
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except KeyError:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
def __init__(self):
    cwd = os.path.dirname(__file__)
    dictionary_path = os.path.abspath(os.path.join(cwd, 'models/dictionary.dict'))
    lda_model_path = os.path.abspath(os.path.join(cwd, 'models/lda_model_10_topics.lda'))

    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.lda = LdaModel.load(lda_model_path)
def load_lda_model(lda_model_name=None, mallet=False):
    if os.path.isfile(lda_model_name):
        if mallet:
            lda_model = LdaMallet.load(lda_model_name)
        else:
            lda_model = LdaModel.load(lda_model_name)
        return lda_model
    return None
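A minimal usage sketch for the loader above; the model path is hypothetical and only for illustration, while LdaMallet.load and LdaModel.load are the standard gensim loaders the function already relies on.

# Hedged usage sketch; 'models/news.lda' is a hypothetical path.
lda = load_lda_model('models/news.lda', mallet=False)
if lda is not None:
    print(lda.num_topics)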
def topicsLDA(self, num_topics=10, num_iterations=10000, num_words=10):
    # LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1,
    #          update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
    #          iterations=50, gamma_threshold=0.001)
    try:
        lda = LdaModel(corpus=self.corpus, num_topics=num_topics, id2word=self.id2word, iterations=num_iterations)
        result = {}
        tpd = lda[self.corpus]  # topic probability distribution per document
        for topics in tpd:
            for elem in topics:
                if result.get(elem[0], -1) == -1:
                    words = lda.show_topic(elem[0], topn=num_words)
                    result[elem[0]] = {'weight': elem[1], 'words': words}
                else:
                    result[elem[0]]['weight'] += elem[1]
        return result
    except Exception as e:
        print(e)
        return None
def analyzeLDA(self, modelName='', numberOfTerms=''): ''' modelName -> name of model to read in to memory without the extension ''' if modelName=='': modelName=self.__fileName if numberOfTerms == '': numberOfTerms=100 write2file = self.__destination+modelName+"_results_%s_SW.csv"%(numberOfTerms) #======================================================================= # allTopicsFile = self.__destination+modelName+"_results_AllTopics.csv" #======================================================================= resultsCSV = open(write2file, "wb") model = LdaModel.load(self.__destination+modelName+'.lda', mmap=None) #and another way, only prints top words for t in range(0, model.num_topics-1): #=================================================================== # print 'topic {}: '.format(t) + ', '.join([v[1] for v in model.show_topic(t, numberOfTerms)]) #=================================================================== topicSet = [v[1].lstrip().rstrip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords] listSet = set(topicSet) for key in self.__queryWords: difference = set(topicSet).intersection(self.__queryWords[key]) if len(difference) > 0: self.__overlapingTopics[key][t]=topicSet try: for key in self.__overlapingTopics: if self.__overlapingTopics[key]: for topicKey in self.__overlapingTopics[key]: topicTerms = [w.lstrip().rstrip() for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords][:100] #======================================================= # topicTerms = [w.translate(None, ''.join(self.__chars_to_remove)) for w in topicTerms if w !=''] #======================================================= resultsCSV.write(key+';'+str(topicKey)+';'+', '.join(topicTerms)+'\n\n') print key,'\t',topicKey,'\t', topicTerms resultsCSV.write('***************************************\n') print '*************************\n' write2fileJSON = self.__destination+modelName+"_results_%s_SW.json"%(numberOfTerms) with open(write2fileJSON, 'w') as fp: json.dump(self.__overlapingTopics, fp) except KeyError as e: print e pass resultsCSV.close()
def analyzeUniqueLDA(self, modelName='', numberOfTerms=''): ''' modelName -> name of model to read in to memory without the extension ''' if modelName=='': modelName=self.__fileName if numberOfTerms=='': numberOfTerms=100 write2File = self.__destination+modelName+"_results_unique_%sTerms.csv"%(numberOfTerms) resultsCSV = open(write2File, "wb") model = LdaModel.load(self.__destination+modelName+'.lda', mmap=None) #and another way, only prints top words for t in range(0, model.num_topics-1): #=================================================================== # print 'topic {}: '.format(t) + ', '.join([v[1] for v in model.show_topic(t, 500)]) #=================================================================== # raw_input('prompt') topicSet = [v[1].lstrip().rstrip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords] #=================================================================== # print type(topicSet), topicSet #=================================================================== listSet = set(topicSet) #print listSet #print type(topicSet), topicSet for key in self.__queryWords: #print self.__queryWords[key] difference = set(topicSet).intersection(self.__queryWords[key]) if len(difference) > 0: self.__overlapingTopics[key][t]=topicSet try: for key in self.__overlapingTopics: uniqueQueryTerms = [] if self.__overlapingTopics[key]: for topicKey in self.__overlapingTopics[key]: topicTerms = [w for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords] uniqueQueryTerms.extend(topicTerms) uniqueQueryTerms = [x for x in set(uniqueQueryTerms)] resultsCSV.write(key+';'+str(topicKey)+';'+', '.join(uniqueQueryTerms)+'\n\n') resultsCSV.write('***************************************\n') print key, uniqueQueryTerms print '*************************\n' except KeyError as e: print e pass resultsCSV.close()
class TestLdaDiff(unittest.TestCase):
    def setUp(self):
        texts = [
            ['human', 'interface', 'computer'],
            ['survey', 'user', 'computer', 'system', 'response', 'time'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
            ['graph', 'minors', 'trees'],
            ['graph', 'minors', 'survey'],
        ]
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

    def testBasic(self):
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(np.diag(mdiff), np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something')
        self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something')
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. LDA is a little better than LSA as it provides a
    reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only. So for a particular topic modeling task,
    it is a lighter option to install and run. Also it can be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)

    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000, passes=1)

    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)

    return topics
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except Exception:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus,
                              id2word=corpus.id2word,
                              alpha=config.alpha,
                              passes=config.passes,
                              num_topics=config.num_topics)

        file_model.save(model_fname)
def get_keywords(threshold=0.01, model_path='result/model.lda'):
    lda_model = LdaModel.load(model_path)
    topic_num = lda_model.num_topics
    keywords = set()
    for topic_id in range(topic_num):
        topic = lda_model.state.get_lambda()[topic_id]
        topic = topic / topic.sum()  # normalize to a probability distribution
        signif_word_ids = np.where(topic > threshold)[0]
        keywords = keywords.union([lda_model.id2word[word_id] for word_id in signif_word_ids])
    return keywords
def train(self):
    '''Train the model. Produces two objects: a dictionary (dic) and a model (lda).

    dic: stores the vocabulary; each word has a numeric id and can be looked up via dic[id]
    lda: the model, holding the list of topics; each topic has an id, and its word list can be
         printed via lda.print_topic(id)
    '''
    docs = self.__load_corpus()
    self.dic = Dictionary(docs)
    bow = [self.dic.doc2bow(doc) for doc in docs]
    self.lda = LdaModel(bow, id2word=self.dic, num_topics=self.topic_num)
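A short sketch, not from the original code, of how the dic and lda produced by train() could score a new document. It assumes an instance of the owning class (called trainer here, a hypothetical name) on which train() has already run; Dictionary.doc2bow and LdaModel.get_document_topics are standard gensim calls, and the token list is illustrative only.

# Hedged usage sketch: `trainer` is assumed to be an instance of the class
# above, with train() already called; the tokens are illustrative only.
new_doc = ['market', 'stock', 'price']
bow = trainer.dic.doc2bow(new_doc)              # tokens -> (token_id, count) pairs
print(trainer.lda.get_document_topics(bow))     # [(topic_id, probability), ...]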
def __init__(self):
    # current_working_dir = '/home/etu/eason/nodejs/Semantic_Aware_RecSys'
    current_working_dir = '.'
    os.chdir(current_working_dir)
    lda_model_path = "./LDAmodel/final_ldamodel"

    self.lda = LdaModel.load(lda_model_path)
    self.no_of_recommendation = 10
    self.omit_topic_below_this_fraction = 0.1
    self.mapping = self.__init_mapping()
    self.linkMapping = self.__init_Link_mapping()
    self.doc_topic_matrix = loadPickleFile('doc_topic_matrix')
def getAllTopics(self, modelName='', numberOfTerms=100):
    '''
    modelName -> name of the model to read into memory, without the extension
    '''
    returningData = {}

    if modelName == '':
        modelName = self.__fileName

    model = LdaModel.load(self.__destination + modelName + '.lda', mmap=None)
    return model.show_topics(num_topics=model.num_topics, num_words=numberOfTerms, formatted=False)
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    """
    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)

    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)

    return topics
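For reference, a hedged sketch of unpacking what top_topics returns: in current gensim it yields pairs of a topic's term list and a coherence score. The loop below is illustrative and not part of the original snippet; cv and train_data are assumed to be the same objects passed to the function above.

# Hedged sketch: each element of `topics` is (topic_terms, coherence_score),
# where topic_terms is a list of (probability, term) tuples.
topics = get_topics(cv, train_data)
for terms, coherence in topics:
    print(round(coherence, 3), [term for _, term in terms])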
def train(self): data = [] entity2id = {} id2entity = [] for obj in self.data: doc = [] obj_sents = obj["text_data"] entity = obj["prod"] if entity not in entity2id: entity2id[entity] = len(entity2id) id2entity.append(entity) doc_id = entity2id[entity] for obj_sent in obj_sents: for pair in obj_sent: if pair[0] >= 0: doc.append((pair[0], doc_id)) data.append(doc) self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic) f_entity = open("lda/prod.txt", "w") f_model = open("lda/model.txt", "w") f_model.write(str(len(entity2id))) f_model.write(" ") f_model.write(str(self.n_topic)) f_model.write("\n") for entity in id2entity: f_entity.write(entity) f_entity.write("\n") f_model.write(entity) f_model.write(" ") distr = self.ldamodel.get_document_topics(data[1], minimum_phi_value=0, minimum_probability=0) distr = [pair[1] for pair in distr] for prod in distr: f_model.write(str(prod)) f_model.write(" ") f_model.write("\n") self.ldamodel.save("lda/model_200")
def LDALoad(self):
    self.ldamodel = LdaModel.load("fixed_time_window_lda.model")
    self.dictionary = Dictionary.load("lda_dictionary.model")
    print(self.dictionary)
from gensim import models

train = []
stopwords = codecs.open('../../corpus/English_StopWords.txt', 'r', encoding='utf8').readlines()
stopwords = [w.strip() for w in stopwords]
fp = codecs.open('../../corpus/test.lsnp', 'r', encoding='utf8')
for line in fp:
    line = line.split()
    train.append([w for w in line if w not in stopwords])

print(train)

dictionary = Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print(corpus[0])
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# print the word distributions of the first 20 topics
print(lda.print_topics(20))
# print the word distribution of the topic with id 20
print(lda.print_topic(20))

# saving / loading the model
lda.save('zhwiki_lda.model')
lda = models.ldamodel.LdaModel.load('zhwiki_lda.model')

# tt = 'loss of energy , motivation and no interest in work anymore - be it time to through it all in'
# test_doc = list(i for i in tt.split())
# doc_bow = id2word.doc2bow(test_doc)   # convert the new document to bag-of-words
# doc_lda = lda[doc_bow]                # topic distribution of the new document
# # print the topic distribution of the new document
def gensim_lda_topic_modelling(path, documents, num_of_topics=6, passes=50, verbose=True, plotTopicsResults=True): dictionary = Dictionary(documents) corpus = [dictionary.doc2bow(doc) for doc in documents] if verbose: print("Cleaned documents:\n", documents) print("\nDictionary:\n", dictionary) print("\nCorpus in BoW form: \n", corpus) start = time.time() ldamodel = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary) end = time.time() print("Completion time for building LDA model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0)) ldatopics = ldamodel.show_topics(formatted=False) ldatopics_words = [[[word, prob] for word, prob in topic] for topicid, topic in ldatopics] if verbose: print("\nList of words associated with each topic:\n") for i in range(len(ldatopics_words)): print("\nTopic %d:\n" % i) for w, p in ldatopics_words[i]: print(p, " - ", w) if plotTopicsResults: plot_top_10_words_per_topic(path, ldatopics_words, num_topics=6, num_top_words=10) all_documents_topics = [ (doc_topics, word_topics, word_phis) for doc_topics, word_topics, word_phis in ldamodel.get_document_topics( corpus, per_word_topics=True) ] all_doc_topics = [] for i in range(len(all_documents_topics)): doc_topics, word_topics, phi_values = all_documents_topics[i] all_doc_topics.append( [doc_topics[i][1] for i in range(len(doc_topics))]) if verbose: print('Document topics:', doc_topics) print('Word topics:', word_topics) print('Phi values:', phi_values) print('-------------- \n') if plotTopicsResults: plot_share_of_topics(path, all_doc_topics, no_random_tweets=10) # Plot words coloured differently depending on the topic for doc in documents[0:100]: if len(doc) > 4: color_words(ldamodel, doc)
# we add some words to the stop word list
texts, article = [], []
for w in doc:
    # if it's not a stop word or punctuation mark, add it to our article
    if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and w.text != 'I':
        # we add the lemmatized version of the word
        article.append(w.lemma_)
    # if it's a new line, it means we're onto our next document
    if w.text == '\n':
        texts.append(article)
        article = []

bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

if len(corpus) == 0:
    print("empty corpus -- nothing to model")
else:
    ldamodel = LdaModel(corpus=corpus, num_topics=3, id2word=dictionary)
    for i in ldamodel.show_topics():
        ans = ' '.join(i[1].split(" + "))
        ans = ''.join(ans.split('"'))
        print(a + "~" + ans)
class LDATagger: _lda_model = None _dictionary = None _lda_model_path = None _dictionary_path = None DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model") DEFAULT_NUM_TOPICS = 1000 def __init__(self, model_path=DEFAULT_MODEL_PATH, num_topics=DEFAULT_NUM_TOPICS, lock=threading.Lock()): self.save_model_lock = lock if os.path.isfile(model_path): raise Exception("Invalid Model Path; Should Be a Directory") if not os.path.exists(model_path): os.makedirs(model_path) self._lda_model_path = os.path.join(model_path, "lda.model") self._dictionary_path = os.path.join(model_path, "tokens.dict") self.num_topics = num_topics self.model_folder_lock = FileLock(model_path) def topics_for_documents(self, doc_tokens_map): self.check_and_load_model() doc_topics_map = defaultdict(list) for document_id, document_tokens in doc_tokens_map.iteritems(): doc_topics_map[document_id] = self.topics_for_document( document_tokens) return doc_topics_map def topics_for_document(self, tokens): self.check_and_load_model() bow_tokens = self._dictionary.doc2bow(tokens) topics = self._lda_model[bow_tokens] return topics def build_topics(self, tokens_list): self._dictionary = Dictionary(tokens_list) corpus = [ self._dictionary.doc2bow(document_tokens) for document_tokens in tokens_list ] self._lda_model = LdaModel(corpus=corpus, id2word=self._dictionary, num_topics=self.num_topics, passes=100) self.save_model() def save_model(self, sleep_for_test=False, mock_datastruct=None): self.save_model_lock.acquire() self.model_folder_lock.acquire() if mock_datastruct: mock_datastruct.acquire() if sleep_for_test: import time time.sleep(1) print "Acquired Lock " try: self._lda_model.save(self._lda_model_path) self._dictionary.save(self._dictionary_path) finally: print "Released Lock" if mock_datastruct: mock_datastruct.release() self.model_folder_lock.release() self.save_model_lock.release() def check_and_load_model(self): if self._lda_model and self._dictionary: return if os.path.exists(self._lda_model_path): self._lda_model = LdaModel.load(self._lda_model_path) else: raise Exception("LDA Model Not found in the path") if os.path.exists(self._dictionary_path): self._dictionary = Dictionary.load(self._dictionary_path) else: raise Exception("Tokens Dictionary Not found in the path") def update_model(self, tokens_list): self.check_and_load_model() corpus = [ self._dictionary.doc2bow(document_tokens) for document_tokens in tokens_list ] self._lda_model.update(corpus=corpus) self.save_model() def build_or_update_model(self, tokens_list): if not self.does_model_exist(): self.build_topics(tokens_list) else: self.update_model(tokens_list) def does_model_exist(self): if os.path.exists(self._lda_model_path) and os.path.exists( self._dictionary_path): return True return False def get_model(self): self.check_and_load_model() model_hash = { "lda_model": cPickle.dumps(self._lda_model), "dictionary": cPickle.dumps(self._dictionary) } return model_hash def restore_model(self, model_hash): self._lda_model = cPickle.loads( model_hash["lda_model"].encode('utf-8')) self._dictionary = cPickle.loads( model_hash["dictionary"].encode('utf-8')) self.save_model() def topics_to_tokens(self): topics_tokens_map = defaultdict(list) if not self.does_model_exist(): return [] else: model = self._lda_model topics_to_tokens = model.show_topics( topics=self.DEFAULT_NUM_TOPICS, topn=25, log=False, formatted=False) for topic_id, tokens in enumerate(topics_to_tokens): topics_tokens_map[topic_id] = self.list_of_tuples_to_hash( tokens) return topics_tokens_map def 
list_of_tuples_to_hash(self, tokens): tokens_hash = defaultdict(float) for token_probability, token in tokens: tokens_hash[token] = token_probability return tokens_hash
dtm = vectorizer.fit_transform(docs)
sparse.save_npz(dtm_path, dtm)
tokens = vectorizer.get_feature_names()
vocab_size = len(tokens)
pd.Series(tokens).to_csv(token_path, index=False)

id2word = pd.Series(tokens).to_dict()
corpus = Sparse2Corpus(dtm, documents_columns=False)
# dictionary = Dictionary.from_corpus(corpus=train_corpus, id2word=id2word)

# for n_topics in [3, 5, 7, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 75, 100]:
for n_topics in [5, 10, 15, 20, 30]:
    print(n_topics, end=' ', flush=True)
    lda = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)

    doc_topics = pd.DataFrame()
    for i, topics in enumerate(lda.get_document_topics(corpus)):
        doc_topics = pd.concat([
            doc_topics,
            pd.DataFrame(topics, columns=['topic', 'value']).assign(doc=i)
        ])
    doc_topics.to_csv(model_path / f'doc_topics_{key}_{n_topics}.csv', index=False)

    model_file = datapath((model_path / f'{key}_{n_topics}').resolve())
    lda.save(model_file)
    train_lda = LdaModel(corpus=train_corpus, num_topics=n_topics,
                         id2word=pd.Series(train_tokens).to_dict())
id2word_nouns = dict_nouns.id2token

# Display
# pp.pprint(id2word_nouns)

# Display results of Corpus
# print(corpus_nouns)
# print('Number of unique tokens: {}'.format(len(dict_nouns)))
# print('Number of documents: {}'.format(len(corpus_nouns)))

# TODO: save corpus and dictionary to disk and load them back
# save to path_lda_data

lda_nouns = LdaModel(corpus=corpus_nouns, id2word=id2word_nouns, num_topics=10, iterations=300, eval_every=1)
lda_nouns.print_topics(-1)

# Print the keywords in the 10 topics
pp.pprint(lda_nouns.print_topics())

########################
########################

# u_mass coherence measure
from gensim.models.coherencemodel import CoherenceModel
lda_nouns_cm = CoherenceModel(model=lda_nouns, corpus=corpus_nouns, coherence='u_mass')
def run_lda_with_entropy(industry_lda, token_dict, max_k=5): common_dictionary = corpora.Dictionary(industry_lda) common_corpus = [common_dictionary.doc2bow(text) for text in industry_lda] ldamodel = LdaModel(corpus=common_corpus, num_topics=max_k + 1, id2word=common_dictionary) result = ldamodel.print_topics(num_topics=max_k + 1, num_words=10) center_lst = [] for i in range(max_k + 1): result2 = ldamodel.get_topic_terms(topicid=i) sum_word = 0 center = 0 length = len(result2) for v in result2: if common_dictionary[v[0]] in token_dict.keys(): center += token_dict[common_dictionary[v[0]]] center_lst.append(center / length) industry_with_center_distance = [] sum_temp5_lst = [] for i in industry_lda: temp2 = [] for k in i: temp = [] if k in token_dict.keys(): for j in center_lst: temp.append((cal_sim(np.array(token_dict[k]), j))) if len(temp) > 0: temp2.append(temp) if len(temp2) > 0: temp3 = np.array(temp2) temp4 = np.mean(temp3, axis=0) temp5 = np.sum(temp3) else: temp4 = [] for i in range(0, max_k + 1): temp4.append(0.0) temp5 = temp4 industry_with_center_distance.append(temp4) sum_temp5_lst.append(temp5) entro_result_final = {} for number, i in enumerate(industry_lda): entro_result_2 = [] for k in i: entro_result = [] if k in token_dict.keys(): for j in center_lst: temp = cal_sim(np.array(token_dict[k]), j) temp_value = temp / sum_temp5_lst[number] entro_result.append(temp_value * math.log(temp_value)) entro_result_2.append(entro_result) if len(entro_result_2) > 0: temp5 = np.zeros(shape=(1, max_k + 1), dtype=float) for w in entro_result_2: if len(w) > 0: temp4 = np.array(w) temp5 += temp4 list_temp5 = list(temp5[0]) entro_result_final[number] = list_temp5.index(max(list_temp5)) final_result = defaultdict(list) for i in range(0, max_k + 1): for key, value in entro_result_final.items(): if value == i: final_result[i].append(industry_lda[key])
import numpy as np
from gensim.models import LdaMulticore as LdaModel
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train LDA model')
    parser.add_argument('tweet_file', help=('path to twitter downloader dump where each line is a cleaned tweet'))
    parser.add_argument('out_dir', help=('path output file to save the model'))
    parser.add_argument('--include_list', nargs='?', default=None)  # see preprocess for default list
    parser.add_argument('--exclude_list', nargs='?', default=None)  # see preprocess for default list
    parser.add_argument('num_topics', type=int)
    parser.add_argument('--npasses', type=int, default=50)
    parser.add_argument('--decay', type=float, default=.5)
    parser.add_argument('--chunksize', type=int, default=2000)

    lda_filename = 'lda/middle_east_100.lda'
    args = parser.parse_args()

    corpus = corpora.MmCorpus('lda/lda_middle_east.mm')
    dictionary = corpora.Dictionary.load('lda/lda_middle_east.dict')

    lda = LdaModel(corpus, num_topics=100, alpha=1. / 100, eta=.2,
                   chunksize=10000, workers=5, passes=100, decay=0.75,
                   id2word=dictionary)

    print('Saving model')
    lda.print_topics()
    lda.save(lda_filename)
    print("lda saved in %s " % lda_filename)
def main(): global args taskname = args.taskname no_below = args.no_below no_above = args.no_above num_iters = args.num_iters n_topic = args.n_topic n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2 bkpt_continue = args.bkpt_continue use_tfidf = args.use_tfidf rebuild = args.rebuild auto_adj = args.auto_adj docSet = DocDataset(taskname, no_below=no_below, no_above=no_above, rebuild=rebuild) if auto_adj: no_above = docSet.topk_dfs(topk=20) docSet = DocDataset(taskname, no_below=no_below, no_above=no_above, rebuild=rebuild, use_tfidf=False) model_name = 'LDA' msg = 'bow' if not use_tfidf else 'tfidf' run_name = '{}_K{}_{}_{}'.format(model_name, n_topic, taskname, msg) if not os.path.exists('logs'): os.mkdir('logs') if not os.path.exists('ckpt'): os.mkdir('ckpt') loghandler = [ logging.FileHandler(filename=f'logs/{run_name}.log', encoding="utf-8") ] logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', handlers=loghandler) logger = logging.getLogger(__name__) if bkpt_continue: print('loading model ckpt ...') lda_model = gensim.models.ldamodel.LdaModel.load( 'ckpt/{}.model'.format(run_name)) # Training print('Start Training ...') if use_tfidf: tfidf = TfidfModel(docSet.bows) corpus_tfidf = tfidf[docSet.bows] #lda_model = LdaMulticore(list(corpus_tfidf),num_topics=n_topic,id2word=docSet.dictionary,alpha='asymmetric',passes=num_iters,workers=n_cpu,minimum_probability=0.0) lda_model = LdaModel(list(corpus_tfidf), num_topics=n_topic, id2word=docSet.dictionary, alpha='asymmetric', passes=num_iters) else: #lda_model = LdaMulticore(list(docSet.bows),num_topics=n_topic,id2word=docSet.dictionary,alpha='asymmetric',passes=num_iters,workers=n_cpu) lda_model = LdaModel(list(docSet.bows), num_topics=n_topic, id2word=docSet.dictionary, alpha='asymmetric', passes=num_iters) save_name = f'./ckpt/LDA_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt' lda_model.save(save_name) # Evaluation print('Evaluation ...') topic_words = get_topic_words(model=lda_model, n_topic=n_topic, topn=15, vocab=docSet.dictionary) (cv_score, w2v_score, c_uci_score, c_npmi_score), _ = calc_topic_coherence(topic_words, docs=docSet.docs, dictionary=docSet.dictionary) topic_diversity = calc_topic_diversity(topic_words) result_dict = { 'cv': cv_score, 'w2v': w2v_score, 'c_uci': c_uci_score, 'c_npmi': c_npmi_score } logger.info('Topics:') for idx, words in enumerate(topic_words): logger.info(f'##{idx:>3d}:{words}') print(f'##{idx:>3d}:{words}') for measure, score in result_dict.items(): logger.info(f'{measure} score: {score}') print(f'{measure} score: {score}') logger.info(f'topic diversity: {topic_diversity}') print(f'topic diversity: {topic_diversity}')
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word=dictionary, passes=50)
print('used: {:.2f}s'.format(time() - start))

print(ldamodel.print_topics(num_topics=2, num_words=4))
for i in ldamodel.print_topics():
    for j in i:
        print(j)

ldamodel.save(MODEL_FILE)

from gensim.models import LdaModel
loading = LdaModel.load(MODEL_FILE)

print(loading.print_topics(num_topics=2, num_words=4))


def pre_new(doc):
    one = cleaning(doc).split()
    two = dictionary.doc2bow(one)
    return two


belong = loading[pre_new('new article that to be classified by trained model!')]
print(belong)
import logging
from gensim.models import LdaModel
from gensim import corpora

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary_path = "topics_labels/models/dictionary.dict"
corpus_path = "topics_labels/models/corpus.lda-c"
lda_num_topics = 25
lda_model_path = "topics_labels/models/lda_model_50_topics.lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

topics = [5, 10, 15, 20, 25]
for num_topics in topics:
    print("number of topics:", num_topics)
    i = 0
    for topic in lda.show_topics(num_topics):
        print('#' + str(i) + ': ' + topic)
        i += 1
""" Classifies all articles according to the LDA model """ import os from multiprocessing.pool import Pool import psycopg2 from gensim import corpora from gensim.models import LdaModel from src.features.text import article_tokenizer from src.visualization.console import StatusVisualization dictionary = corpora.Dictionary.load(os.environ['MODEL_PATH'] + 'articles.dict') model = LdaModel.load(os.environ['MODEL_PATH'] + 'articles.lda') def classify(article): source_url, text = article tokens = article_tokenizer.tokenize(text) """ doc_bow = [dictionary.doc2bow(token) for token in [tokens]] doc_lda = model.get_document_topics(doc_bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False) topics = doc_lda[0] topics_ret = dict() for topic in topics: print(topics)
for i, (min_df, max_df, binary) in enumerate(dtm_params, 1): print(min_df, max_df, binary) vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary)) try: dtm = sparse.load_npz(vocab_path / f'dtm.npz') tokens = pd.read_csv(vocab_path / f'tokens.csv', header=None, squeeze=True) except FileNotFoundError: print('missing') continue corpus = Sparse2Corpus(dtm, documents_columns=False) id2word = tokens.to_dict() dictionary = Dictionary.from_corpus(corpus, id2word) for num_topics in topics: print(num_topics, end=' ') model_path = vocab_path / str(num_topics) / str(passes) / 'lda' if model_path.exists(): lda = LdaModel.load(model_path.as_posix()) else: continue start = time() vis = prepare(lda, corpus, dictionary, mds='tsne') terms = vis.topic_info terms = terms[terms.Category != 'Default'] pyLDAvis.save_html(vis, (model_path / 'ldavis.html').as_posix()) terms.to_csv(model_path / 'relevant_terms.csv', index=False) duration = time() - start print(format_time(duration))
    type=int,
)
args = parser.parse_args()

print('Reading dataset')
data = pd.read_parquet(args.input_filepath)

print('Normalizing text')
data.text = data.text.map(nlp.normalize_text)

print('Building docterm matrix')
docterm, dictionary = nlp.get_docterm_matrix(data.text)
doclength = np.array([sum(x[1] for x in doc) for doc in docterm])

print('Training LDA model')
lda = LdaModel(docterm, num_topics=args.n_topics)

print('Getting document topics')
doctopics = corpus2csc([lda.get_document_topics(doc) for doc in docterm])
termtopics = lda.get_topics()

print('Computing topic volume time series')
topic_volume_over_time = nlp.get_topic_volume_over_time(data, doctopics, 20)

print('Computing topic coordinates')
topic_coordinates = nlp.get_topic_coordinates(termtopics, method='mds')
topic_proportions = nlp.get_topic_proportions(doctopics, doclength)

print('Computing term frequencies')
term_frequencies = nlp.get_term_frequencies(docterm, termtopics, topic_proportions, doclength)
class TopicModel: def __init__(self, topicCollection, string): if string.lower() == "nmf": self.model = "NMF" print("Topic Extraction Model: sklearn.NMF") else: self.model = "LDA" print("Topic Extraction Model: gensim.LDAModel") self.stemmer = PorterStemmer() #Train the LDA model on the current discussion def train(self, sentences): if self.model == "NMF": self.sentenceData = [] for sentence in sentences: self.sentenceData.append(preprocess(sentence, self.stemmer)) self.tfidf_vectorizer = TfidfVectorizer( max_features=1500, ngram_range=(1, 2), preprocessor=' '.join, stop_words='english' ) tfidf = self.tfidf_vectorizer.fit_transform(self.sentenceData) self.nmf = NMF(n_components=2, solver="mu") self.W = self.nmf.fit_transform(tfidf) self.H = self.nmf.components_ else: sentenceData = [] for sentence in sentences: sentenceData.append(preprocess(sentence, self.stemmer)) self.dictionary = Dictionary(sentenceData) bow_corpus = [self.dictionary.doc2bow(doc) for doc in sentenceData] self.lda_model = LdaModel(bow_corpus, num_topics=2, id2word=self.dictionary, passes=10) #Classify a given sentence to one of the topics found in training def classify(self, sentence): if self.model == "NMF": index = self.sentenceData.index(preprocess(sentence, self.stemmer)) topic = self.W.argmax(axis=1)[index] return "Topic " + str(topic) else: bow_vector = self.dictionary.doc2bow(preprocess(sentence, self.stemmer)) return "Topic " + str(sorted(self.lda_model[bow_vector], key=lambda tup: -1*tup[1])[0][0]) #Shows the terms of a given topic def showTerms(self, topic): if self.model == "NMF": terms = "" top_features = [] tfidf_feature_names = self.tfidf_vectorizer.get_feature_names() for topic_idx, topicID in enumerate(self.H): if topic_idx == int(topic.split(' ')[-1]): top_features_ind = topicID.argsort()[:-20 - 1:-1] top_features = [tfidf_feature_names[i] for i in top_features_ind] weights = topicID[top_features_ind] for term in top_features: terms += term + ", " print(topic.split(' ')[-1] + " " + terms) return terms else: terms = "" topic = int(topic.split(" ")[-1]) for term in self.lda_model.show_topic(topic): terms += term[0] + ", " print(str(topic) + " " + terms) return terms #Gets the probability or the coefficient of the given term in the topic def getCoeff(self, topic, term): if self.model == "NMF": weights = [] top_features = [] tfidf_feature_names = self.tfidf_vectorizer.get_feature_names() for topic_idx, topicID in enumerate(self.H): if topic_idx == topic: top_features_ind = topicID.argsort()[:-20 - 1:-1] top_features = [tfidf_feature_names[i] for i in top_features_ind] weights = topicID[top_features_ind] for coeff, terms in zip(weights, top_features): if terms == term: return coeff else: topic = int(topic.split(" ")[-1]) for terms in self.lda_model.show_topic(topic): if terms[0] == term: return terms[1] #Shows all the topics found in training def showTopics(self): if self.model == "NMF": ret = [] for topic_idx, topicID in enumerate(self.H): ret.append("Topic " + str(topic_idx)) return ret else: topics = self.lda_model.print_topics() ret = [] for topic in topics: ret.append("Topic " + str(topic[0])) return ret #Returns a flag to check what model is deployed at the moment def getModel(self): return self.model
def __init__(self):
    self.dictionary = corpora.Dictionary.load("models/dictionary.dict")
    self.lda = LdaModel.load("models/lda_model.lda")
# to check
print("Wordlist from the dictionary lookup:",
      dictionary[21], dictionary[22], dictionary[23], dictionary[24],
      dictionary[25], dictionary[26], dictionary[27])

# In[ ]:

# scale it to all text
corpus = [dictionary.doc2bow(text) for text in all_text]
end_corpus = time.time()
print("Time till corpus creation:", end_corpus - start_time, "s")

# In[ ]:

# create the LDA model
ldamodel = LdaModel(corpus=corpus, num_topics=15, id2word=dictionary)
end_lda = time.time()
print("Time till LDA model creation:", end_lda - start_time, "s")

# In[ ]:

pyLDAvis.enable_notebook()

# In[ ]:

pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

# In[ ]:

end_viz = time.time()
print("Time till viz:", end_viz - start_time, "s")
            coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values


# Code starts here
topic_list, coherence_value_list = compute_coherence_values(dictionary=dictionary,
                                                            corpus=doc_term_matrix,
                                                            texts=doc_clean,
                                                            start=1, limit=41, step=5)

# Finding the index associated with the maximum coherence value
max_index = coherence_value_list.index(max(coherence_value_list))
opt_topic = topic_list[max_index]
print("optimum no of topics:", opt_topic)

# Implementing LDA with the optimum no. of topics
lda_model = LdaModel(corpus=doc_term_matrix,
                     num_topics=opt_topic,
                     id2word=dictionary,
                     iterations=10,
                     passes=30,
                     random_state=0)

lda_model.print_topics(5)
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel
from gensim.models import LdaModel

db = connectMySQL(db='xpath', port=3366)
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName = 'fullModel_100P_20T'
model = LdaModel.load(modelDestination + modelName + '.lda', mmap=None)
topicModel = techDashTopicModel(destination='/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/',
                                fileName='fullModel', modelName='fullModel_100P_20T')

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================
sqlQuery = """SELECT `xpathValuesXPath`.`xpathValuesID`, `xpathValuesXPath`.`xpathValuesContent` FROM `xpath`.`xpathValuesXPath`; """

db.executeQuery(sqlQuery)

for item in db._connectMySQL__results:
    topicModelCat = topicModel.getDocumentTopics(item[1])
corpus = load_obj('LDABOWcorpus-application')

#%%
# lda model training (ETA 10 mins)
num_topics = 6    # topics declared (based on MATLAB tut)
chunk_size = 300
t1 = time.time()

# low alpha => each doc is represented by only a few topics, and vice versa
# low eta => each topic is represented by only a few words, and vice versa
lda = LdaModel(corpus=corpus,
               num_topics=num_topics,
               id2word=dct,
               alpha='auto',
               random_state=100,
               # eta=None,
               update_every=1,
               chunksize=chunk_size,
               minimum_probability=0.0,
               # iterations=100,
               # gamma_threshold=0.001,
               passes=10,
               per_word_topics=True)

lda.get_document_topics(bow=corpus, per_word_topics=True)
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)

t2 = time.time()
print("Time to train LDA model on", len(df), "articles:", (t2 - t1) / 60, "min")
from gensim import corpora, similarities
from gensim.models import LdaModel, LsiModel

lda_model = LdaModel.load('./data/lda_model')
lsi_model = LsiModel.load('./data/lsi_model')
id2word = corpora.Dictionary.load('./data/id2word')
index = similarities.MatrixSimilarity.load('./data/index')