예제 #1
0
def build_model():
    """Fit a 40-topic LDA model on the corpus term lists, print the top
    terms and top documents per topic, and save a termite plot.

    Relies on module-level names: ``vsm`` (textacy's vector-space module),
    ``textacy``, and ``build_comp_termlist()`` which yields one term list
    per document.
    """
    # Plain term counts; weighting/filtering options were experimented with
    # previously but are not applied here.
    doc_term_matrix, id2term = vsm.doc_term_matrix(build_comp_termlist())

    model = textacy.tm.TopicModel('lda', n_topics=40)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    # Summarize each topic by its highest-weighted terms.
    for topic_idx, top_terms in model.top_topic_terms(id2term):
        print('topic', topic_idx, ':', '   '.join(top_terms))

    # Show the two most representative documents (with weights) for the
    # first two topics only.
    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix,
                                                    topics=[0, 1],
                                                    top_n=2,
                                                    weights=True):
        print(str(topic_idx) + '\t' + str(top_docs))

    # Visualize all topics (topics=-1) against their top 40 terms.
    model.termite_plot(doc_term_matrix,
                       id2term,
                       topics=-1,
                       n_terms=40,
                       sort_terms_by='seriation',
                       save='./lda-topics.png')
예제 #2
0
 def setUp(self):
     """Build a tf-weighted document-term matrix over a tiny rhyme corpus
     and record the column ids of the terms 'lamb' and 'child'."""
     rhyme_lines = [
         "Mary had a little lamb. Its fleece was white as snow.",
         "Everywhere that Mary went the lamb was sure to go.",
         "It followed her to school one day, which was against the rule.",
         "It made the children laugh and play to see a lamb at school.",
         "And so the teacher turned it out, but still it lingered near.",
         "It waited patiently about until Mary did appear.",
         "Why does the lamb love Mary so? The eager children cry.",
         "Mary loves the lamb, you know, the teacher did reply.",
     ]
     rhyme_corpus = Corpus('en', texts=rhyme_lines)
     unigram_lists = [
         doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
         for doc in rhyme_corpus
     ]
     # Raw term frequencies with no filtering so every term survives.
     self.doc_term_matrix, self.id_to_term = vsm.doc_term_matrix(
         unigram_lists,
         weighting='tf',
         normalize=False,
         sublinear_tf=False,
         smooth_idf=True,
         min_df=1,
         max_df=1.0,
         min_ic=0.0,
         max_n_terms=None)
     # Column indices of the two terms the tests below inspect.
     lamb_ids = [k for k, v in self.id_to_term.items() if v == 'lamb']
     self.idx_lamb = lamb_ids[0]
     child_ids = [k for k, v in self.id_to_term.items() if v == 'child']
     self.idx_child = child_ids[0]
예제 #3
0
 def setUp(self):
     """Fit a small NMF topic model on a rhyme corpus and create a temp
     directory for model-persistence tests."""
     rhyme_lines = [
         "Mary had a little lamb. Its fleece was white as snow.",
         "Everywhere that Mary went the lamb was sure to go.",
         "It followed her to school one day, which was against the rule.",
         "It made the children laugh and play to see a lamb at school.",
         "And so the teacher turned it out, but still it lingered near.",
         "It waited patiently about until Mary did appear.",
         "Why does the lamb love Mary so? The eager children cry.",
         "Mary loves the lamb, you know, the teacher did reply.",
     ]
     rhyme_corpus = Corpus('en', texts=rhyme_lines)
     unigram_lists = [
         doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
         for doc in rhyme_corpus
     ]
     # Raw term frequencies, no filtering.
     self.doc_term_matrix, self.id2term = doc_term_matrix(
         unigram_lists, weighting='tf', normalize=False,
         sublinear_tf=False, smooth_idf=True, min_df=1, max_df=1.0,
         min_ic=0.0, max_n_terms=None)
     # Five topics is plenty for eight one-line documents.
     self.model = TopicModel('nmf', n_topics=5)
     self.model.fit(self.doc_term_matrix)
     # Temp dir lives next to this test file (cleaned up elsewhere).
     here = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_topic_model', dir=here)
예제 #4
0
 def setUp(self):
     """Prepare a tf-weighted doc-term matrix from eight rhyme lines and
     look up the term ids for 'lamb' and 'child'."""
     sample_texts = [
         "Mary had a little lamb. Its fleece was white as snow.",
         "Everywhere that Mary went the lamb was sure to go.",
         "It followed her to school one day, which was against the rule.",
         "It made the children laugh and play to see a lamb at school.",
         "And so the teacher turned it out, but still it lingered near.",
         "It waited patiently about until Mary did appear.",
         "Why does the lamb love Mary so? The eager children cry.",
         "Mary loves the lamb, you know, the teacher did reply.",
     ]
     sample_corpus = Corpus('en', texts=sample_texts)
     docs_as_terms = [
         doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
         for doc in sample_corpus
     ]
     # No df/ic filtering and no normalization: plain counts.
     self.doc_term_matrix, self.id_to_term = vsm.doc_term_matrix(
         docs_as_terms,
         weighting='tf', normalize=False, sublinear_tf=False,
         smooth_idf=True, min_df=1, max_df=1.0, min_ic=0.0,
         max_n_terms=None)
     # Resolve each target term to its (unique) matrix column index.
     self.idx_lamb = [
         term_id for term_id, term in self.id_to_term.items()
         if term == 'lamb'
     ][0]
     self.idx_child = [
         term_id for term_id, term in self.id_to_term.items()
         if term == 'child'
     ][0]
예제 #5
0
 def setUp(self):
     """Train a 5-topic NMF model over a toy corpus and make a scratch
     directory beside this file for save/load tests."""
     sample_texts = [
         "Mary had a little lamb. Its fleece was white as snow.",
         "Everywhere that Mary went the lamb was sure to go.",
         "It followed her to school one day, which was against the rule.",
         "It made the children laugh and play to see a lamb at school.",
         "And so the teacher turned it out, but still it lingered near.",
         "It waited patiently about until Mary did appear.",
         "Why does the lamb love Mary so? The eager children cry.",
         "Mary loves the lamb, you know, the teacher did reply.",
     ]
     sample_corpus = Corpus('en', texts=sample_texts)
     docs_as_terms = [
         doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
         for doc in sample_corpus
     ]
     # Plain term counts; all filtering effectively disabled.
     self.doc_term_matrix, self.id2term = doc_term_matrix(
         docs_as_terms,
         weighting='tf', normalize=False, sublinear_tf=False,
         smooth_idf=True, min_df=1, max_df=1.0, min_ic=0.0,
         max_n_terms=None)
     self.model = TopicModel('nmf', n_topics=5)
     self.model.fit(self.doc_term_matrix)
     # Scratch directory next to this test module.
     self.tempdir = tempfile.mkdtemp(
         prefix='test_topic_model',
         dir=os.path.dirname(os.path.abspath(__file__)))
예제 #6
0
def most_discriminating_terms(terms_lists, bool_array_grp1,
                              max_n_terms=1000, top_n_terms=25):
    """
    Given a collection of documents assigned to 1 of 2 exclusive groups, get the
    `top_n_terms` most discriminating terms for group1-and-not-group2 and
    group2-and-not-group1.

    Args:
        terms_lists (Iterable[Iterable[str]]): a sequence of documents, each as a
            sequence of (str) terms; used as input to :func:`doc_term_matrix()`
        bool_array_grp1 (Iterable[bool]): an ordered sequence of True/False values,
            where True corresponds to documents falling into "group 1" and False
            corresponds to those in "group 2"
        max_n_terms (int): only consider terms whose document frequency is within
            the top `max_n_terms` out of all distinct terms; must be > 0
        top_n_terms (int or float): if int (must be > 0), the total number of most
            discriminating terms to return for each group; if float (must be in
            the interval (0, 1)), the fraction of `max_n_terms` to return for each group

    Returns:
        List[str]: top `top_n_terms` most discriminating terms for grp1-not-grp2
        List[str]: top `top_n_terms` most discriminating terms for grp2-not-grp1

    References:
        King, Gary, Patrick Lam, and Margaret Roberts. "Computer-Assisted Keyword
            and Document Set Discovery from Unstructured Text." (2014).
            http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.458.1445&rep=rep1&type=pdf
    """
    # Uniform pseudo-count priors for each group.
    alpha_grp1 = 1
    alpha_grp2 = 1
    if isinstance(top_n_terms, float):
        # A float is a fraction of `max_n_terms`; slicing requires an int,
        # so convert here (the original float value would raise TypeError
        # when used as a slice bound below).
        top_n_terms = int(round(top_n_terms * max_n_terms))
    bool_array_grp1 = np.array(bool_array_grp1)
    bool_array_grp2 = np.invert(bool_array_grp1)

    dtm, id2term = vsm.doc_term_matrix(
        terms_lists, weighting='tf', normalize=False,
        sublinear_tf=False, smooth_idf=True,
        min_df=3, max_df=0.95, min_ic=0.0, max_n_terms=max_n_terms)

    # get doc freqs for all terms in grp1 documents
    dtm_grp1 = dtm[bool_array_grp1, :]
    n_docs_grp1 = dtm_grp1.shape[0]
    doc_freqs_grp1 = vsm.get_doc_freqs(dtm_grp1, normalized=False)

    # get doc freqs for all terms in grp2 documents
    dtm_grp2 = dtm[bool_array_grp2, :]
    n_docs_grp2 = dtm_grp2.shape[0]
    doc_freqs_grp2 = vsm.get_doc_freqs(dtm_grp2, normalized=False)

    # candidate terms: those occurring in a larger *fraction* of one
    # group's docs than the other's (ties are discriminating for neither)
    frac_grp1 = doc_freqs_grp1 / n_docs_grp1
    frac_grp2 = doc_freqs_grp2 / n_docs_grp2
    term_ids_grp1 = np.where(frac_grp1 > frac_grp2)[0]
    term_ids_grp2 = np.where(frac_grp1 < frac_grp2)[0]

    # likelihoods of each candidate term discriminating its group;
    # the computation is symmetric, so reuse one helper with roles swapped
    grp1_terms_likelihoods = _group_term_likelihoods(
        term_ids_grp1, doc_freqs_grp1, doc_freqs_grp2,
        n_docs_grp1, n_docs_grp2, alpha_grp1, alpha_grp2, id2term)
    grp2_terms_likelihoods = _group_term_likelihoods(
        term_ids_grp2, doc_freqs_grp2, doc_freqs_grp1,
        n_docs_grp2, n_docs_grp1, alpha_grp2, alpha_grp1, id2term)

    top_grp1_terms = [term for term, _ in
                      sorted(grp1_terms_likelihoods.items(),
                             key=itemgetter(1), reverse=True)[:top_n_terms]]
    top_grp2_terms = [term for term, _ in
                      sorted(grp2_terms_likelihoods.items(),
                             key=itemgetter(1), reverse=True)[:top_n_terms]]

    return (top_grp1_terms, top_grp2_terms)


def _group_term_likelihoods(term_ids, doc_freqs_in, doc_freqs_out,
                            n_docs_in, n_docs_out, alpha_in, alpha_out,
                            id2term):
    """Map each candidate term (by string) to the likelihood that it
    discriminates the "in" group from the "out" group, following King,
    Lam & Roberts (2014). Factorials can be huge, so intermediate ratios
    are taken in :class:`decimal.Decimal` for exactness.
    """
    likelihoods = {}
    for term_id in term_ids:
        df_in = int(doc_freqs_in[term_id])
        df_out = int(doc_freqs_out[term_id])
        # contribution from docs that contain the term
        term1 = (
            Decimal(math.factorial(df_in + alpha_in - 1)) *
            Decimal(math.factorial(df_out + alpha_out - 1)) /
            Decimal(math.factorial(
                df_in + df_out + alpha_in + alpha_out - 1)))
        # contribution from docs that do NOT contain the term
        term2 = (
            Decimal(math.factorial(n_docs_in - df_in + alpha_in - 1)) *
            Decimal(math.factorial(n_docs_out - df_out + alpha_out - 1)) /
            Decimal(math.factorial(
                n_docs_in + n_docs_out - df_in - df_out +
                alpha_in + alpha_out - 1)))
        likelihoods[id2term[term_id]] = term1 * term2
    return likelihoods
예제 #7
0
def most_discriminating_terms(terms_lists, bool_array_grp1,
                              max_n_terms=1000, top_n_terms=25):
    """
    Given a collection of documents assigned to 1 of 2 exclusive groups, get the
    `top_n_terms` most discriminating terms for group1-and-not-group2 and
    group2-and-not-group1.

    Args:
        terms_lists (Iterable[Iterable[str]]): a sequence of documents, each as a
            sequence of (str) terms; used as input to :func:`doc_term_matrix()`
        bool_array_grp1 (Iterable[bool]): an ordered sequence of True/False values,
            where True corresponds to documents falling into "group 1" and False
            corresponds to those in "group 2"
        max_n_terms (int): only consider terms whose document frequency is within
            the top `max_n_terms` out of all distinct terms; must be > 0
        top_n_terms (int or float): if int (must be > 0), the total number of most
            discriminating terms to return for each group; if float (must be in
            the interval (0, 1)), the fraction of `max_n_terms` to return for each group

    Returns:
        List[str]: top `top_n_terms` most discriminating terms for grp1-not-grp2
        List[str]: top `top_n_terms` most discriminating terms for grp2-not-grp1

    References:
        King, Gary, Patrick Lam, and Margaret Roberts. "Computer-Assisted Keyword
            and Document Set Discovery from Unstructured Text." (2014).
            http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.458.1445&rep=rep1&type=pdf
    """
    # Uniform pseudo-count priors for each group.
    alpha_grp1 = 1
    alpha_grp2 = 1
    if isinstance(top_n_terms, float):
        # Interpret a float as a fraction of `max_n_terms`. Must be an int:
        # the value is later used as a slice bound, and a float there would
        # raise TypeError (the bug in the previous version).
        top_n_terms = int(round(top_n_terms * max_n_terms))
    bool_array_grp1 = np.array(bool_array_grp1)
    bool_array_grp2 = np.invert(bool_array_grp1)

    dtm, id2term = vsm.doc_term_matrix(
        terms_lists, weighting='tf', normalize=False,
        sublinear_tf=False, smooth_idf=True,
        min_df=3, max_df=0.95, min_ic=0.0, max_n_terms=max_n_terms)

    # get doc freqs for all terms in grp1 documents
    dtm_grp1 = dtm[bool_array_grp1, :]
    n_docs_grp1 = dtm_grp1.shape[0]
    doc_freqs_grp1 = vsm.get_doc_freqs(dtm_grp1, normalized=False)

    # get doc freqs for all terms in grp2 documents
    dtm_grp2 = dtm[bool_array_grp2, :]
    n_docs_grp2 = dtm_grp2.shape[0]
    doc_freqs_grp2 = vsm.get_doc_freqs(dtm_grp2, normalized=False)

    # get terms that occur in a larger fraction of grp1 docs than grp2 docs
    term_ids_grp1 = np.where(
        doc_freqs_grp1 / n_docs_grp1 > doc_freqs_grp2 / n_docs_grp2)[0]

    # get terms that occur in a larger fraction of grp2 docs than grp1 docs
    term_ids_grp2 = np.where(
        doc_freqs_grp1 / n_docs_grp1 < doc_freqs_grp2 / n_docs_grp2)[0]

    # doc freqs of grp1's candidate terms within each group's documents
    grp1_terms_grp1_df = doc_freqs_grp1[term_ids_grp1]
    grp1_terms_grp2_df = doc_freqs_grp2[term_ids_grp1]

    # doc freqs of grp2's candidate terms within each group's documents
    grp2_terms_grp2_df = doc_freqs_grp2[term_ids_grp2]
    grp2_terms_grp1_df = doc_freqs_grp1[term_ids_grp2]

    # get grp1 terms likelihoods (King, Lam & Roberts 2014), then sort for
    # most discriminating grp1-not-grp2 terms; Decimal keeps the huge
    # factorial ratios exact
    grp1_terms_likelihoods = {}
    for idx, term_id in enumerate(term_ids_grp1):
        # contribution from docs containing the term
        term1 = (
            Decimal(math.factorial(
                grp1_terms_grp1_df[idx] + alpha_grp1 - 1)) *
            Decimal(math.factorial(
                grp1_terms_grp2_df[idx] + alpha_grp2 - 1)) /
            Decimal(math.factorial(
                grp1_terms_grp1_df[idx] + grp1_terms_grp2_df[idx] +
                alpha_grp1 + alpha_grp2 - 1)))
        # contribution from docs NOT containing the term
        term2 = (
            Decimal(math.factorial(
                n_docs_grp1 - grp1_terms_grp1_df[idx] + alpha_grp1 - 1)) *
            Decimal(math.factorial(
                n_docs_grp2 - grp1_terms_grp2_df[idx] + alpha_grp2 - 1)) /
            Decimal(math.factorial(
                n_docs_grp1 + n_docs_grp2 - grp1_terms_grp1_df[idx] -
                grp1_terms_grp2_df[idx] + alpha_grp1 + alpha_grp2 - 1)))
        grp1_terms_likelihoods[id2term[term_id]] = term1 * term2
    top_grp1_terms = [term for term, likelihood
                      in sorted(grp1_terms_likelihoods.items(),
                                key=itemgetter(1), reverse=True)[:top_n_terms]]

    # get grp2 terms likelihoods, then sort for most discriminating
    # grp2-not-grp1 terms (same formula with group roles swapped)
    grp2_terms_likelihoods = {}
    for idx, term_id in enumerate(term_ids_grp2):
        term1 = (
            Decimal(math.factorial(
                grp2_terms_grp2_df[idx] + alpha_grp2 - 1)) *
            Decimal(math.factorial(
                grp2_terms_grp1_df[idx] + alpha_grp1 - 1)) /
            Decimal(math.factorial(
                grp2_terms_grp2_df[idx] + grp2_terms_grp1_df[idx] +
                alpha_grp2 + alpha_grp1 - 1)))
        term2 = (
            Decimal(math.factorial(
                n_docs_grp2 - grp2_terms_grp2_df[idx] + alpha_grp2 - 1)) *
            Decimal(math.factorial(
                n_docs_grp1 - grp2_terms_grp1_df[idx] + alpha_grp1 - 1)) /
            Decimal(math.factorial(
                n_docs_grp2 + n_docs_grp1 - grp2_terms_grp2_df[idx] -
                grp2_terms_grp1_df[idx] + alpha_grp2 + alpha_grp1 - 1)))
        grp2_terms_likelihoods[id2term[term_id]] = term1 * term2
    top_grp2_terms = [term for term, likelihood
                      in sorted(grp2_terms_likelihoods.items(),
                                key=itemgetter(1), reverse=True)[:top_n_terms]]

    return (top_grp1_terms, top_grp2_terms)