def test_get_doc_freqs(self): doc_freqs = vsm.get_doc_freqs(self.doc_term_matrix, normalized=False) self.assertEqual(len(doc_freqs), self.doc_term_matrix.shape[1]) self.assertEqual(doc_freqs.max(), 5) self.assertEqual(doc_freqs.min(), 1) self.assertEqual(doc_freqs[self.idx_lamb], 5) self.assertEqual(doc_freqs[self.idx_child], 2)
def test_get_doc_freqs_normalized(self): doc_freqs = vsm.get_doc_freqs(self.doc_term_matrix, normalized=True) self.assertEqual(len(doc_freqs), self.doc_term_matrix.shape[1]) self.assertAlmostEqual(doc_freqs.max(), 0.625, places=3) self.assertAlmostEqual(doc_freqs.min(), 0.125, places=3) self.assertAlmostEqual(doc_freqs[self.idx_lamb], 0.625, places=3) self.assertAlmostEqual(doc_freqs[self.idx_child], 0.250, places=3)
def test_get_doc_freqs(vectorizer_and_dtm, lamb_and_child_idxs): _, doc_term_matrix = vectorizer_and_dtm idx_lamb, idx_child = lamb_and_child_idxs doc_freqs = vsm.get_doc_freqs(doc_term_matrix) assert len(doc_freqs) == doc_term_matrix.shape[1] assert doc_freqs.max() == 5 assert doc_freqs.min() == 1 assert doc_freqs[idx_lamb] == 5 assert doc_freqs[idx_child] == 2
def test_get_doc_freqs_normalized(vectorizer_and_dtm, lamb_and_child_idxs): _, doc_term_matrix = vectorizer_and_dtm idx_lamb, idx_child = lamb_and_child_idxs doc_freqs = vsm.get_doc_freqs(doc_term_matrix, normalized=True) assert len(doc_freqs) == doc_term_matrix.shape[1] assert doc_freqs.max() == pytest.approx(0.625, rel=1e-3) assert doc_freqs.min() == pytest.approx(0.125, rel=1e-3) assert doc_freqs[idx_lamb] == pytest.approx(0.625, rel=1e-3) assert doc_freqs[idx_child] == pytest.approx(0.250, rel=1e-3)
def test_get_doc_freqs_exception(): with pytest.raises(ValueError): _ = vsm.get_doc_freqs(coo_matrix((1, 1)).tocsr())
def most_discriminating_terms(terms_lists, bool_array_grp1, max_n_terms=1000, top_n_terms=25): """ Given a collection of documents assigned to 1 of 2 exclusive groups, get the `top_n_terms` most discriminating terms for group1-and-not-group2 and group2-and-not-group1. Args: terms_lists (Iterable[Iterable[str]]): a sequence of documents, each as a sequence of (str) terms; used as input to :func:`doc_term_matrix()` bool_array_grp1 (Iterable[bool]): an ordered sequence of True/False values, where True corresponds to documents falling into "group 1" and False corresponds to those in "group 2" max_n_terms (int): only consider terms whose document frequency is within the top `max_n_terms` out of all distinct terms; must be > 0 top_n_terms (int or float): if int (must be > 0), the total number of most discriminating terms to return for each group; if float (must be in the interval (0, 1)), the fraction of `max_n_terms` to return for each group Returns: List[str]: top `top_n_terms` most discriminating terms for grp1-not-grp2 List[str]: top `top_n_terms` most discriminating terms for grp2-not-grp1 References: King, Gary, Patrick Lam, and Margaret Roberts. "Computer-Assisted Keyword and Document Set Discovery from Unstructured Text." (2014). http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.458.1445&rep=rep1&type=pdf """ alpha_grp1 = 1 alpha_grp2 = 1 if isinstance(top_n_terms, float): top_n_terms = top_n_terms * max_n_terms bool_array_grp1 = np.array(bool_array_grp1) bool_array_grp2 = np.invert(bool_array_grp1) dtm, id2term = vsm.doc_term_matrix( terms_lists, weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True, min_df=3, max_df=0.95, min_ic=0.0, max_n_terms=max_n_terms) # get doc freqs for all terms in grp1 documents dtm_grp1 = dtm[bool_array_grp1, :] n_docs_grp1 = dtm_grp1.shape[0] doc_freqs_grp1 = vsm.get_doc_freqs(dtm_grp1, normalized=False) # get doc freqs for all terms in grp2 documents dtm_grp2 = dtm[bool_array_grp2, :] n_docs_grp2 = dtm_grp2.shape[0] doc_freqs_grp2 = vsm.get_doc_freqs(dtm_grp2, normalized=False) # get terms that occur in a larger fraction of grp1 docs than grp2 docs term_ids_grp1 = np.where(doc_freqs_grp1 / n_docs_grp1 > doc_freqs_grp2 / n_docs_grp2)[0] # get terms that occur in a larger fraction of grp2 docs than grp1 docs term_ids_grp2 = np.where(doc_freqs_grp1 / n_docs_grp1 < doc_freqs_grp2 / n_docs_grp2)[0] # get grp1 terms doc freqs in and not-in grp1 and grp2 docs, plus marginal totals grp1_terms_grp1_df = doc_freqs_grp1[term_ids_grp1] grp1_terms_grp2_df = doc_freqs_grp2[term_ids_grp1] # grp1_terms_grp1_not_df = n_docs_grp1 - grp1_terms_grp1_df # grp1_terms_grp2_not_df = n_docs_grp2 - grp1_terms_grp2_df # grp1_terms_total_df = grp1_terms_grp1_df + grp1_terms_grp2_df # grp1_terms_total_not_df = grp1_terms_grp1_not_df + grp1_terms_grp2_not_df # get grp2 terms doc freqs in and not-in grp2 and grp1 docs, plus marginal totals grp2_terms_grp2_df = doc_freqs_grp2[term_ids_grp2] grp2_terms_grp1_df = doc_freqs_grp1[term_ids_grp2] # grp2_terms_grp2_not_df = n_docs_grp2 - grp2_terms_grp2_df # grp2_terms_grp1_not_df = n_docs_grp1 - grp2_terms_grp1_df # grp2_terms_total_df = grp2_terms_grp2_df + grp2_terms_grp1_df # grp2_terms_total_not_df = grp2_terms_grp2_not_df + grp2_terms_grp1_not_df # get grp1 terms likelihoods, then sort for most discriminating grp1-not-grp2 terms grp1_terms_likelihoods = {} for idx, term_id in enumerate(term_ids_grp1): term1 = Decimal(math.factorial(grp1_terms_grp1_df[idx] + alpha_grp1 - 1)) * Decimal(math.factorial(grp1_terms_grp2_df[idx] + alpha_grp2 - 1)) / Decimal(math.factorial(grp1_terms_grp1_df[idx] + grp1_terms_grp2_df[idx] + alpha_grp1 + alpha_grp2 - 1)) term2 = Decimal(math.factorial(n_docs_grp1 - grp1_terms_grp1_df[idx] + alpha_grp1 - 1)) * Decimal(math.factorial(n_docs_grp2 - grp1_terms_grp2_df[idx] + alpha_grp2 - 1)) / Decimal((math.factorial(n_docs_grp1 + n_docs_grp2 - grp1_terms_grp1_df[idx] - grp1_terms_grp2_df[idx] + alpha_grp1 + alpha_grp2 - 1))) grp1_terms_likelihoods[id2term[term_id]] = term1 * term2 top_grp1_terms = [term for term, likelihood in sorted(grp1_terms_likelihoods.items(), key=itemgetter(1), reverse=True)[:top_n_terms]] # get grp2 terms likelihoods, then sort for most discriminating grp2-not-grp1 terms grp2_terms_likelihoods = {} for idx, term_id in enumerate(term_ids_grp2): term1 = Decimal(math.factorial(grp2_terms_grp2_df[idx] + alpha_grp2 - 1)) * Decimal(math.factorial(grp2_terms_grp1_df[idx] + alpha_grp1 - 1)) / Decimal(math.factorial(grp2_terms_grp2_df[idx] + grp2_terms_grp1_df[idx] + alpha_grp2 + alpha_grp1 - 1)) term2 = Decimal(math.factorial(n_docs_grp2 - grp2_terms_grp2_df[idx] + alpha_grp2 - 1)) * Decimal(math.factorial(n_docs_grp1 - grp2_terms_grp1_df[idx] + alpha_grp1 - 1)) / Decimal((math.factorial(n_docs_grp2 + n_docs_grp1 - grp2_terms_grp2_df[idx] - grp2_terms_grp1_df[idx] + alpha_grp2 + alpha_grp1 - 1))) grp2_terms_likelihoods[id2term[term_id]] = term1 * term2 top_grp2_terms = [term for term, likelihood in sorted(grp2_terms_likelihoods.items(), key=itemgetter(1), reverse=True)[:top_n_terms]] return (top_grp1_terms, top_grp2_terms)