def LDA(self, num_topics, num_words):
    """Train an LDA topic model (Vowpal Wabbit backend) on the cleaned paragraphs.

    Requires ``self.para_list`` (a list of token lists) to have been built
    beforehand. Saves the fitted model to ``model/lda_model`` and prints the
    top ``num_words`` words of each topic.

    :param num_topics: number of topics to fit.
    :param num_words: words to show per topic when printing.
    """
    dictionary = corpora.Dictionary(self.para_list)
    doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
    # NOTE(review): hard-coded, machine-specific VW binary path — should be
    # promoted to configuration rather than baked in here.
    path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
    self.ldamodel = LdaVowpalWabbit(path, doc_term_matrix,
                                    num_topics=num_topics,
                                    id2word=dictionary)
    self.ldamodel.save('model/lda_model')
    # Bug fix: previously printed a hard-coded 10 topics, silently ignoring
    # the num_topics argument the model was actually trained with.
    print(self.ldamodel.print_topics(num_topics=num_topics, num_words=num_words))
def setUp(self):
    """Build fixture topic sets and, where the backends exist, LDA models."""
    # Two hand-picked topic sets from two hypothetical LdaModels. `topics1`
    # cleanly separates human-computer interaction from graph theory, while
    # `topics2` is muddled — so every coherence measure should rank
    # `topics1` higher.
    self.topics1 = [
        ['human', 'computer', 'system', 'interface'],
        ['graph', 'minors', 'trees', 'eps'],
    ]
    self.topics2 = [
        ['user', 'graph', 'minors', 'system'],
        ['time', 'graph', 'survey', 'minors'],
    ]
    # Deliberately untrained (0 passes / 0 iterations): only the model
    # object is needed, not a good fit.
    self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary,
                             num_topics=2, passes=0, iterations=0)

    # Mallet sanity checks run only when MALLET_HOME points at an install.
    mallet_home = os.environ.get('MALLET_HOME', None)
    if mallet_home:
        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet')
        self.malletmodel = LdaMallet(mallet_path=self.mallet_path,
                                     corpus=corpus, id2word=dictionary,
                                     num_topics=2, iterations=0)
    else:
        self.mallet_path = None

    # Likewise, Vowpal Wabbit checks require VOWPAL_WABBIT_PATH.
    vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
    if vw_path:
        self.vw_path = vw_path
        self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus,
                                       id2word=dictionary, num_topics=2,
                                       passes=0)
    else:
        logging.info("Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model")
        self.vw_path = None
class Contract_Reader():
    """Load a plaintext contract corpus, clean it, summarize it, and fit an
    LDA topic model, saving per-topic word-cloud plots.

    The constructor drives the whole pipeline from a ``config`` object that
    provides ``textpath``, ``clean_paragraphs``, ``clean_sentences``,
    ``num_topics`` and ``num_words``.
    """

    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        # UTF-16 encoded .txt files; paragraphs delimited by blank lines.
        # Raw string for the fileid regex (was '.*\.txt', whose '\.' relies
        # on Python not treating it as an escape).
        self.corpus = PCR(config.textpath, r'.*\.txt', encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        # Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    @staticmethod
    def _normalize(text, stop, exclude, lemma):
        """Lowercase ``text``, drop stopwords and punctuation, lemmatize.

        Shared by both branches of :meth:`clean` (was duplicated inline).
        Returns the cleaned text as a single space-joined string.
        """
        stop_free = " ".join(w for w in text.lower().split() if w not in stop)
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        return " ".join(lemma.lemmatize(word) for word in punc_free.split())

    def clean(self, config, mode='sent'):
        """Normalize the corpus into token lists.

        :param mode: ``'para'`` builds ``self.para_list`` (one token list per
            paragraph); ``'sent'`` builds ``self.sents_list`` (one token list
            per sentence). Prints the first item before and after splitting
            as a spot check.
        """
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            # Paragraphs are lists of sentences, each a list of tokens;
            # flatten to one token list per paragraph.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                self.para_list[index] = self._normalize(
                    " ".join(paragraph), stop, exclude, lemma)
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            # Obtain list of strings each one a sentence rather than list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                self.sents_list[index] = self._normalize(
                    sentence, stop, exclude, lemma)
            print(self.sents_list[0])
            self.sents_list = [
                sentence.split() for sentence in self.sents_list
            ]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        """Train an LDA model (Vowpal Wabbit backend) on ``self.para_list``.

        Saves the fitted model to ``model/lda_model`` and prints the top
        ``num_words`` words of each of the ``num_topics`` topics.
        """
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        # NOTE(review): hard-coded, machine-specific VW binary path — should
        # come from config.
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path, doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        # Bug fix: previously printed a hard-coded 10 topics, ignoring the
        # num_topics the model was trained with.
        print(self.ldamodel.print_topics(num_topics=num_topics,
                                         num_words=num_words))

    def plot(self, num_words):
        """Save one word-cloud image per topic under ``plots/``."""
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            # show_topic yields pairs that are reversed into (word, weight)
            # for WordCloud.fit_words — presumably (weight, word) in this
            # gensim version; TODO confirm against the installed release.
            tuples = [
                reversed(x) for x in self.ldamodel.show_topic(t, num_words)
            ]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        # Average sentences per paragraph.
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        # Lexical diversity: total tokens per distinct word.
        lexdiv = float(count) / float(vocab)
        print(
            ("Text corpus contains {} files\n"
             "Composed of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "Word count of {} with a vocabulary of {}\n"
             "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                    count, vocab, lexdiv))