def build_model():
    # vectorizer = vsm.Vectorizer(weighting='tfidf', normalize=True,
    #                             smooth_idf=True, min_df=3, max_df=0.95,
    #                             max_n_terms=100000)
    # doc_term_matrix = vectorizer.fit_transform(build_comp_termlist())
    doc_term_matrix, id2term = vsm.doc_term_matrix(build_comp_termlist())
    # ... or with tf-idf weighting: weighting='tfidf', normalize=True,
    #     smooth_idf=True, min_df=3, max_df=0.95
    # print(repr(doc_term_matrix))
    model = textacy.tm.TopicModel('lda', n_topics=40)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    for topic_idx, top_terms in model.top_topic_terms(id2term):
        print('topic', topic_idx, ':', ' '.join(top_terms))
    for topic_idx, top_docs in model.top_topic_docs(
            doc_topic_matrix, topics=[0, 1], top_n=2, weights=True):
        print(str(topic_idx) + '\t' + str(top_docs))
        # for j in top_docs:
        #     print(corpus[j].metadata['title'])
    model.termite_plot(
        doc_term_matrix, id2term, topics=-1, n_terms=40,
        sort_terms_by='seriation', save='./lda-topics.png')
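# build_model() above assumes a build_comp_termlist() helper that is not shown
# in this file. A minimal, hypothetical sketch of what it presumably returns --
# one list of term strings per document -- using the same textacy (~0.3-era)
# API as the rest of this code; load_texts() is a stand-in name for whatever
# actually loads the raw documents.
def build_comp_termlist():
    corpus = Corpus('en', texts=load_texts())  # load_texts() is assumed here
    return [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
            for doc in corpus]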
def setUp(self):
    texts = [
        "Mary had a little lamb. Its fleece was white as snow.",
        "Everywhere that Mary went the lamb was sure to go.",
        "It followed her to school one day, which was against the rule.",
        "It made the children laugh and play to see a lamb at school.",
        "And so the teacher turned it out, but still it lingered near.",
        "It waited patiently about until Mary did appear.",
        "Why does the lamb love Mary so? The eager children cry.",
        "Mary loves the lamb, you know, the teacher did reply."
    ]
    corpus = Corpus('en', texts=texts)
    term_lists = [
        doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
        for doc in corpus
    ]
    self.doc_term_matrix, self.id_to_term = vsm.doc_term_matrix(
        term_lists, weighting='tf', normalize=False, sublinear_tf=False,
        smooth_idf=True, min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
    self.idx_lamb = [k for k, v in self.id_to_term.items() if v == 'lamb'][0]
    self.idx_child = [k for k, v in self.id_to_term.items() if v == 'child'][0]
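# An illustrative test built on the fixture above (hypothetical, not from the
# original suite): 'lamb' appears in five of the eight documents, while
# 'child' (the lemma of "children") appears in only two, so its document
# frequency should be strictly greater.
def test_doc_freqs(self):
    doc_freqs = vsm.get_doc_freqs(self.doc_term_matrix, normalized=False)
    self.assertGreater(doc_freqs[self.idx_lamb], doc_freqs[self.idx_child])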
def setUp(self):
    texts = [
        "Mary had a little lamb. Its fleece was white as snow.",
        "Everywhere that Mary went the lamb was sure to go.",
        "It followed her to school one day, which was against the rule.",
        "It made the children laugh and play to see a lamb at school.",
        "And so the teacher turned it out, but still it lingered near.",
        "It waited patiently about until Mary did appear.",
        "Why does the lamb love Mary so? The eager children cry.",
        "Mary loves the lamb, you know, the teacher did reply."
    ]
    textcorpus = Corpus('en', texts=texts)
    term_lists = [
        doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
        for doc in textcorpus
    ]
    self.doc_term_matrix, self.id2term = doc_term_matrix(
        term_lists, weighting='tf', normalize=False, sublinear_tf=False,
        smooth_idf=True, min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
    self.model = TopicModel('nmf', n_topics=5)
    self.model.fit(self.doc_term_matrix)
    self.tempdir = tempfile.mkdtemp(
        prefix='test_topic_model',
        dir=os.path.dirname(os.path.abspath(__file__)))
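# Since setUp() above creates a temporary directory, a matching tearDown() is
# the idiomatic counterpart. A minimal sketch (the original cleanup code is
# not shown here; requires `import shutil`):
def tearDown(self):
    shutil.rmtree(self.tempdir, ignore_errors=True)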
def setUp(self): texts = ["Mary had a little lamb. Its fleece was white as snow.", "Everywhere that Mary went the lamb was sure to go.", "It followed her to school one day, which was against the rule.", "It made the children laugh and play to see a lamb at school.", "And so the teacher turned it out, but still it lingered near.", "It waited patiently about until Mary did appear.", "Why does the lamb love Mary so? The eager children cry.", "Mary loves the lamb, you know, the teacher did reply."] corpus = Corpus('en', texts=texts) term_lists = [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True) for doc in corpus] self.doc_term_matrix, self.id_to_term = vsm.doc_term_matrix( term_lists, weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True, min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None) self.idx_lamb = [k for k, v in self.id_to_term.items() if v == 'lamb'][0] self.idx_child = [k for k, v in self.id_to_term.items() if v == 'child'][0]
def setUp(self): texts = ["Mary had a little lamb. Its fleece was white as snow.", "Everywhere that Mary went the lamb was sure to go.", "It followed her to school one day, which was against the rule.", "It made the children laugh and play to see a lamb at school.", "And so the teacher turned it out, but still it lingered near.", "It waited patiently about until Mary did appear.", "Why does the lamb love Mary so? The eager children cry.", "Mary loves the lamb, you know, the teacher did reply."] textcorpus = Corpus('en', texts=texts) term_lists = [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True) for doc in textcorpus] self.doc_term_matrix, self.id2term = doc_term_matrix( term_lists, weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True, min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None) self.model = TopicModel('nmf', n_topics=5) self.model.fit(self.doc_term_matrix) self.tempdir = tempfile.mkdtemp( prefix='test_topic_model', dir=os.path.dirname(os.path.abspath(__file__)))
def most_discriminating_terms(terms_lists, bool_array_grp1,
                              max_n_terms=1000, top_n_terms=25):
    """
    Given a collection of documents assigned to 1 of 2 exclusive groups, get
    the `top_n_terms` most discriminating terms for group1-and-not-group2 and
    group2-and-not-group1.

    Args:
        terms_lists (Iterable[Iterable[str]]): a sequence of documents, each
            as a sequence of (str) terms; used as input to
            :func:`doc_term_matrix()`
        bool_array_grp1 (Iterable[bool]): an ordered sequence of True/False
            values, where True corresponds to documents falling into "group 1"
            and False corresponds to those in "group 2"
        max_n_terms (int): only consider terms whose document frequency is
            within the top `max_n_terms` out of all distinct terms;
            must be > 0
        top_n_terms (int or float): if int (must be > 0), the total number of
            most discriminating terms to return for each group; if float
            (must be in the interval (0, 1)), the fraction of `max_n_terms`
            to return for each group

    Returns:
        List[str]: top `top_n_terms` most discriminating terms for
            grp1-not-grp2
        List[str]: top `top_n_terms` most discriminating terms for
            grp2-not-grp1

    References:
        King, Gary, Patrick Lam, and Margaret Roberts. "Computer-Assisted
        Keyword and Document Set Discovery from Unstructured Text." (2014).
        http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.458.1445&rep=rep1&type=pdf
    """
    alpha_grp1 = 1
    alpha_grp2 = 1
    # a float top_n_terms is a fraction of max_n_terms; convert it to an int
    # so it can be used as a slice bound below
    if isinstance(top_n_terms, float):
        top_n_terms = int(round(top_n_terms * max_n_terms))
    bool_array_grp1 = np.array(bool_array_grp1)
    bool_array_grp2 = np.invert(bool_array_grp1)

    dtm, id2term = vsm.doc_term_matrix(
        terms_lists, weighting='tf', normalize=False, sublinear_tf=False,
        smooth_idf=True, min_df=3, max_df=0.95, min_ic=0.0,
        max_n_terms=max_n_terms)

    # get doc freqs for all terms in grp1 documents
    dtm_grp1 = dtm[bool_array_grp1, :]
    n_docs_grp1 = dtm_grp1.shape[0]
    doc_freqs_grp1 = vsm.get_doc_freqs(dtm_grp1, normalized=False)

    # get doc freqs for all terms in grp2 documents
    dtm_grp2 = dtm[bool_array_grp2, :]
    n_docs_grp2 = dtm_grp2.shape[0]
    doc_freqs_grp2 = vsm.get_doc_freqs(dtm_grp2, normalized=False)

    # get terms that occur in a larger fraction of grp1 docs than grp2 docs
    term_ids_grp1 = np.where(
        doc_freqs_grp1 / n_docs_grp1 > doc_freqs_grp2 / n_docs_grp2)[0]

    # get terms that occur in a larger fraction of grp2 docs than grp1 docs
    term_ids_grp2 = np.where(
        doc_freqs_grp1 / n_docs_grp1 < doc_freqs_grp2 / n_docs_grp2)[0]

    # get grp1 terms' doc freqs in grp1 and grp2 docs
    grp1_terms_grp1_df = doc_freqs_grp1[term_ids_grp1]
    grp1_terms_grp2_df = doc_freqs_grp2[term_ids_grp1]

    # get grp2 terms' doc freqs in grp2 and grp1 docs
    grp2_terms_grp2_df = doc_freqs_grp2[term_ids_grp2]
    grp2_terms_grp1_df = doc_freqs_grp1[term_ids_grp2]

    # get grp1 terms' likelihoods, then sort for most discriminating
    # grp1-not-grp2 terms
    grp1_terms_likelihoods = {}
    for idx, term_id in enumerate(term_ids_grp1):
        term1 = (
            Decimal(math.factorial(grp1_terms_grp1_df[idx] + alpha_grp1 - 1)) *
            Decimal(math.factorial(grp1_terms_grp2_df[idx] + alpha_grp2 - 1)) /
            Decimal(math.factorial(
                grp1_terms_grp1_df[idx] + grp1_terms_grp2_df[idx] +
                alpha_grp1 + alpha_grp2 - 1)))
        term2 = (
            Decimal(math.factorial(
                n_docs_grp1 - grp1_terms_grp1_df[idx] + alpha_grp1 - 1)) *
            Decimal(math.factorial(
                n_docs_grp2 - grp1_terms_grp2_df[idx] + alpha_grp2 - 1)) /
            Decimal(math.factorial(
                n_docs_grp1 + n_docs_grp2 -
                grp1_terms_grp1_df[idx] - grp1_terms_grp2_df[idx] +
                alpha_grp1 + alpha_grp2 - 1)))
        grp1_terms_likelihoods[id2term[term_id]] = term1 * term2
    top_grp1_terms = [
        term for term, likelihood in sorted(
            grp1_terms_likelihoods.items(),
            key=itemgetter(1), reverse=True)[:top_n_terms]]

    # get grp2 terms' likelihoods, then sort for most discriminating
    # grp2-not-grp1 terms
    grp2_terms_likelihoods = {}
    for idx, term_id in enumerate(term_ids_grp2):
        term1 = (
            Decimal(math.factorial(grp2_terms_grp2_df[idx] + alpha_grp2 - 1)) *
            Decimal(math.factorial(grp2_terms_grp1_df[idx] + alpha_grp1 - 1)) /
            Decimal(math.factorial(
                grp2_terms_grp2_df[idx] + grp2_terms_grp1_df[idx] +
                alpha_grp2 + alpha_grp1 - 1)))
        term2 = (
            Decimal(math.factorial(
                n_docs_grp2 - grp2_terms_grp2_df[idx] + alpha_grp2 - 1)) *
            Decimal(math.factorial(
                n_docs_grp1 - grp2_terms_grp1_df[idx] + alpha_grp1 - 1)) /
            Decimal(math.factorial(
                n_docs_grp2 + n_docs_grp1 -
                grp2_terms_grp2_df[idx] - grp2_terms_grp1_df[idx] +
                alpha_grp2 + alpha_grp1 - 1)))
        grp2_terms_likelihoods[id2term[term_id]] = term1 * term2
    top_grp2_terms = [
        term for term, likelihood in sorted(
            grp2_terms_likelihoods.items(),
            key=itemgetter(1), reverse=True)[:top_n_terms]]

    return (top_grp1_terms, top_grp2_terms)
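# Hypothetical usage sketch for most_discriminating_terms() -- `corpus`
# (a textacy Corpus) and `labels` (one string label per document) are assumed
# to exist; everything here is illustrative, not part of the original module.
terms_lists = [doc.to_terms_list(ngrams=1, named_entities=False,
                                 as_strings=True)
               for doc in corpus]
bool_array_grp1 = [label == 'spam' for label in labels]
spam_terms, ham_terms = most_discriminating_terms(
    terms_lists, bool_array_grp1, max_n_terms=1000, top_n_terms=25)
print('spam-not-ham:', spam_terms)
print('ham-not-spam:', ham_terms)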