import math
from decimal import Decimal
from operator import itemgetter

import numpy as np
import pytest

from textacy import vsm


def test_vectorizer_weighting_combinations(tokenized_docs):
    init_params = [
        dict(tf_type='linear'),
        dict(tf_type='sqrt'),
        dict(tf_type='sqrt', apply_dl=True),
        dict(tf_type='sqrt', apply_dl=True, dl_type='sqrt'),
        dict(tf_type='linear', apply_idf=True),
        dict(tf_type='linear', apply_idf=True, idf_type='bm25'),
        dict(tf_type='linear', apply_idf=True, idf_type='standard', norm='l1'),
        dict(tf_type='linear', apply_idf=True, idf_type='standard', apply_dl=True),
        dict(tf_type='linear', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='log'),
        dict(tf_type='bm25', apply_idf=True, idf_type='bm25'),
        dict(tf_type='bm25', apply_idf=True, apply_dl=False),
        dict(tf_type='bm25', apply_idf=True, idf_type='bm25'),
        dict(tf_type='bm25', apply_idf=True, idf_type='smooth', norm='l2'),
    ]
    for ip in init_params:
        vectorizer = vsm.Vectorizer(**ip)
        doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
        # smoke-check that a weighting description can be built for each combination
        _ = vectorizer.weighting
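# The combinations above only smoke-test that each tf/idf/dl configuration can be
# fit. A minimal standalone sketch of one such configuration, using hypothetical
# toy docs and the same Vectorizer API exercised by these tests:
def example_single_weighting_config():
    toy_docs = [
        ['the', 'quick', 'brown', 'fox'],
        ['the', 'lazy', 'dog'],
        ['the', 'quick', 'dog', 'barks'],
    ]
    vectorizer = vsm.Vectorizer(
        tf_type='linear', apply_idf=True, idf_type='smooth', norm='l2')
    doc_term_matrix = vectorizer.fit_transform(toy_docs)
    # rows correspond to documents, columns to terms in the fitted vocabulary
    return doc_term_matrix.shape, vectorizer.weighting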
def test_vectorizer_weighting_combinations(tokenized_docs): init_params = [ dict(tf_type="linear"), dict(tf_type="sqrt"), dict(tf_type="sqrt", apply_dl=True), dict(tf_type="sqrt", apply_dl=True, dl_type="sqrt"), dict(tf_type="linear", apply_idf=True), dict(tf_type="linear", apply_idf=True, idf_type="bm25"), dict(tf_type="linear", apply_idf=True, idf_type="standard", norm="l1"), dict(tf_type="linear", apply_idf=True, idf_type="standard", apply_dl=True), dict(tf_type="linear", apply_idf=True, idf_type="smooth", apply_dl=True, dl_type="log"), dict(tf_type="bm25", apply_idf=True, idf_type="bm25"), dict(tf_type="bm25", apply_idf=True, apply_dl=False), dict(tf_type="bm25", apply_idf=True, idf_type="bm25"), dict(tf_type="bm25", apply_idf=True, idf_type="smooth", norm="l2"), ] for ip in init_params: vectorizer = vsm.Vectorizer(**ip) doc_term_matrix = vectorizer.fit(tokenized_docs) vectorizer.weighting
def test_vectorizer_bad_init_params():
    bad_init_params = (
        {'min_df': -1},
        {'max_df': -1},
        {'max_n_terms': -1},
        {'min_ic': -1.0},
        {'vocabulary': 'foo bar bat baz'},
    )
    for bad_init_param in bad_init_params:
        with pytest.raises(ValueError):
            vsm.Vectorizer(**bad_init_param)
def test_vectorizer_bad_transform(tokenized_docs):
    vectorizer = vsm.Vectorizer()
    # calling transform() before fit() must fail: there is no fitted vocabulary
    # to map terms to matrix columns
    with pytest.raises(ValueError):
        _ = vectorizer.transform(tokenized_docs)
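# For contrast with the failure case above, the expected call order is to fit on
# one corpus and then transform another. A sketch with hypothetical train/new
# token lists, using the same fit/transform API as the tests:
def example_fit_then_transform(train_docs, new_docs):
    vectorizer = vsm.Vectorizer(tf_type='linear', apply_idf=True)
    vectorizer.fit(train_docs)  # learns the vocabulary and document frequencies
    # reuses the fitted vocabulary; unseen terms in new_docs are ignored
    return vectorizer.transform(new_docs)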
def most_discriminating_terms(terms_lists, bool_array_grp1,
                              max_n_terms=1000, top_n_terms=25):
    """
    Given a collection of documents assigned to 1 of 2 exclusive groups, get the
    `top_n_terms` most discriminating terms for group1-and-not-group2 and
    group2-and-not-group1.

    Args:
        terms_lists (Iterable[Iterable[str]]): a sequence of documents, each as a
            sequence of (str) terms; used as input to :func:`doc_term_matrix()`
        bool_array_grp1 (Iterable[bool]): an ordered sequence of True/False values,
            where True corresponds to documents falling into "group 1" and False
            corresponds to those in "group 2"
        max_n_terms (int): only consider terms whose document frequency is within
            the top `max_n_terms` out of all distinct terms; must be > 0
        top_n_terms (int or float): if int (must be > 0), the total number of most
            discriminating terms to return for each group; if float (must be in
            the interval (0, 1)), the fraction of `max_n_terms` to return for each group

    Returns:
        List[str]: top `top_n_terms` most discriminating terms for grp1-not-grp2
        List[str]: top `top_n_terms` most discriminating terms for grp2-not-grp1

    References:
        King, Gary, Patrick Lam, and Margaret Roberts. "Computer-Assisted Keyword
        and Document Set Discovery from Unstructured Text." (2014).
        http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.458.1445&rep=rep1&type=pdf
    """
    alpha_grp1 = 1
    alpha_grp2 = 1
    if isinstance(top_n_terms, float):
        top_n_terms = int(top_n_terms * max_n_terms)
    bool_array_grp1 = np.array(bool_array_grp1)
    bool_array_grp2 = np.invert(bool_array_grp1)

    vectorizer = vsm.Vectorizer(
        weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
        min_df=3, max_df=0.95, min_ic=0.0, max_n_terms=max_n_terms)
    dtm = vectorizer.fit_transform(terms_lists)
    id2term = vectorizer.id_to_term

    # get doc freqs for all terms in grp1 documents
    dtm_grp1 = dtm[bool_array_grp1, :]
    n_docs_grp1 = dtm_grp1.shape[0]
    doc_freqs_grp1 = vsm.get_doc_freqs(dtm_grp1, normalized=False)

    # get doc freqs for all terms in grp2 documents
    dtm_grp2 = dtm[bool_array_grp2, :]
    n_docs_grp2 = dtm_grp2.shape[0]
    doc_freqs_grp2 = vsm.get_doc_freqs(dtm_grp2, normalized=False)

    # get terms that occur in a larger fraction of grp1 docs than grp2 docs
    term_ids_grp1 = np.where(
        doc_freqs_grp1 / n_docs_grp1 > doc_freqs_grp2 / n_docs_grp2)[0]

    # get terms that occur in a larger fraction of grp2 docs than grp1 docs
    term_ids_grp2 = np.where(
        doc_freqs_grp1 / n_docs_grp1 < doc_freqs_grp2 / n_docs_grp2)[0]

    # get grp1 terms doc freqs in and not-in grp1 and grp2 docs, plus marginal totals
    grp1_terms_grp1_df = doc_freqs_grp1[term_ids_grp1]
    grp1_terms_grp2_df = doc_freqs_grp2[term_ids_grp1]
    # grp1_terms_grp1_not_df = n_docs_grp1 - grp1_terms_grp1_df
    # grp1_terms_grp2_not_df = n_docs_grp2 - grp1_terms_grp2_df
    # grp1_terms_total_df = grp1_terms_grp1_df + grp1_terms_grp2_df
    # grp1_terms_total_not_df = grp1_terms_grp1_not_df + grp1_terms_grp2_not_df

    # get grp2 terms doc freqs in and not-in grp2 and grp1 docs, plus marginal totals
    grp2_terms_grp2_df = doc_freqs_grp2[term_ids_grp2]
    grp2_terms_grp1_df = doc_freqs_grp1[term_ids_grp2]
    # grp2_terms_grp2_not_df = n_docs_grp2 - grp2_terms_grp2_df
    # grp2_terms_grp1_not_df = n_docs_grp1 - grp2_terms_grp1_df
    # grp2_terms_total_df = grp2_terms_grp2_df + grp2_terms_grp1_df
    # grp2_terms_total_not_df = grp2_terms_grp2_not_df + grp2_terms_grp1_not_df

    # get grp1 terms likelihoods, then sort for most discriminating grp1-not-grp2 terms
    grp1_terms_likelihoods = {}
    for idx, term_id in enumerate(term_ids_grp1):
        # term1 * term2 is the (unnormalized) likelihood of the term's in-group vs
        # out-of-group document counts under symmetric priors alpha_grp1 = alpha_grp2 = 1,
        # per King, Lam, and Roberts (2014); Decimal avoids overflow in the factorials
        term1 = (
            Decimal(math.factorial(grp1_terms_grp1_df[idx] + alpha_grp1 - 1))
            * Decimal(math.factorial(grp1_terms_grp2_df[idx] + alpha_grp2 - 1))
            / Decimal(math.factorial(
                grp1_terms_grp1_df[idx] + grp1_terms_grp2_df[idx]
                + alpha_grp1 + alpha_grp2 - 1)))
        term2 = (
            Decimal(math.factorial(n_docs_grp1 - grp1_terms_grp1_df[idx] + alpha_grp1 - 1))
            * Decimal(math.factorial(n_docs_grp2 - grp1_terms_grp2_df[idx] + alpha_grp2 - 1))
            / Decimal(math.factorial(
                n_docs_grp1 + n_docs_grp2
                - grp1_terms_grp1_df[idx] - grp1_terms_grp2_df[idx]
                + alpha_grp1 + alpha_grp2 - 1)))
        grp1_terms_likelihoods[id2term[term_id]] = term1 * term2
    top_grp1_terms = [
        term for term, likelihood
        in sorted(grp1_terms_likelihoods.items(),
                  key=itemgetter(1), reverse=True)[:top_n_terms]]

    # get grp2 terms likelihoods, then sort for most discriminating grp2-not-grp1 terms
    grp2_terms_likelihoods = {}
    for idx, term_id in enumerate(term_ids_grp2):
        term1 = (
            Decimal(math.factorial(grp2_terms_grp2_df[idx] + alpha_grp2 - 1))
            * Decimal(math.factorial(grp2_terms_grp1_df[idx] + alpha_grp1 - 1))
            / Decimal(math.factorial(
                grp2_terms_grp2_df[idx] + grp2_terms_grp1_df[idx]
                + alpha_grp2 + alpha_grp1 - 1)))
        term2 = (
            Decimal(math.factorial(n_docs_grp2 - grp2_terms_grp2_df[idx] + alpha_grp2 - 1))
            * Decimal(math.factorial(n_docs_grp1 - grp2_terms_grp1_df[idx] + alpha_grp1 - 1))
            / Decimal(math.factorial(
                n_docs_grp2 + n_docs_grp1
                - grp2_terms_grp2_df[idx] - grp2_terms_grp1_df[idx]
                + alpha_grp2 + alpha_grp1 - 1)))
        grp2_terms_likelihoods[id2term[term_id]] = term1 * term2
    top_grp2_terms = [
        term for term, likelihood
        in sorted(grp2_terms_likelihoods.items(),
                  key=itemgetter(1), reverse=True)[:top_n_terms]]

    return (top_grp1_terms, top_grp2_terms)
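# Usage sketch for most_discriminating_terms(), with hypothetical inputs:
# `party_docs` is a list of tokenized documents and `is_group1` is a parallel
# sequence of booleans marking which documents belong to "group 1". Names are
# illustrative only; any corpus split into two exclusive groups works.
def example_most_discriminating_terms(party_docs, is_group1):
    grp1_terms, grp2_terms = most_discriminating_terms(
        party_docs, is_group1, max_n_terms=500, top_n_terms=10)
    # grp1_terms: terms that best distinguish group-1 docs from group-2 docs
    # grp2_terms: the reverse
    return grp1_terms, grp2_terms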
@pytest.fixture
def vectorizer_and_dtm(tokenized_docs):
    vectorizer = vsm.Vectorizer(
        tf_type='linear', idf_type='smooth', norm=None,
        min_df=1, max_df=1.0, max_n_terms=None)
    doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
    return vectorizer, doc_term_matrix
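# The tests and fixture above all assume a `tokenized_docs` fixture that is not
# shown in this excerpt. A minimal sketch of what such a fixture might look like,
# using a hypothetical toy corpus and simple whitespace tokenization (the real
# fixture likely tokenizes spaCy docs):
@pytest.fixture
def tokenized_docs():
    texts = [
        'Mary had a little lamb.',
        'Its fleece was white as snow.',
        'And everywhere that Mary went the lamb was sure to go.',
    ]
    return [text.lower().replace('.', '').split() for text in texts]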