Пример #1
0
def test_add_text(model=classify.Bo1Model):
    # Build a small index and expand terms from the sample text against it.
    ix = create_index()
    with ix.reader() as reader:
        # The "content" field should yield the three dominant terms.
        expander = classify.Expander(reader, "content", model=model)
        expander.add_text(text)
        top_terms = {term for term, _score in expander.expanded_terms(3)}
        assert top_terms == {"particles", "velocity", "field"}

        # A field with no indexed terms expands to nothing.
        expander = classify.Expander(reader, "extra", model=model)
        expander.add_text(text)
        assert expander.expanded_terms(3) == []
Пример #2
0
    def key_terms(self,
                  fieldname,
                  docs=10,
                  numterms=5,
                  model=classify.Bo1Model,
                  normalize=True):
        """Returns the 'numterms' most important terms from the top 'docs'
        documents in these results. "Most important" is generally defined as
        terms that occur frequently in the top hits but relatively
        infrequently in the collection as a whole.

        :param fieldname: Look at the terms in this field. This field must
            store vectors.
        :param docs: Look at this many of the top documents of the results.
        :param numterms: Return this number of important terms.
        :param model: The classify.ExpansionModel to use. See the classify
            module.
        :param normalize: Whether to normalize the returned term scores.
        :returns: list of unicode strings.
        """

        docs = min(docs, self.scored_length())
        # Return an empty list (not None) when there are no scored documents,
        # so callers can compare against [] or iterate the result safely.
        if docs <= 0:
            return []

        reader = self.searcher.reader()
        fieldnum = self.searcher.fieldname_to_num(fieldname)

        expander = classify.Expander(reader, fieldname, model=model)
        for docnum in self.scored_list[:docs]:
            expander.add(reader.vector_as("weight", docnum, fieldnum))

        return expander.expanded_terms(numterms, normalize=normalize)
Пример #3
0
    def key_terms(self,
                  docnums,
                  fieldname,
                  numterms=5,
                  model=classify.Bo1Model,
                  normalize=True):
        """Returns the 'numterms' most important terms from the documents
        listed (by number) in 'docnums'. You can get document numbers for the
        documents you're interested in with the document_number() and
        document_numbers() methods.

        >>> docnum = searcher.document_number(path=u"/a/b")
        >>> keywords = list(searcher.key_terms([docnum], "content"))

        "Most important" is generally defined as terms that occur frequently
        in the top hits but relatively infrequently in the collection as a
        whole.

        :param fieldname: Look at the terms in this field. This field must
            store vectors.
        :param docnums: A sequence of document numbers specifying which
            documents to extract key terms from.
        :param numterms: Return this number of important terms.
        :param model: The classify.ExpansionModel to use. See the classify
            module.
        """

        reader = self.ixreader
        fieldnum = self.fieldname_to_num(fieldname)

        # Feed each requested document's term-weight vector to the expander.
        expander = classify.Expander(self, fieldname, model=model)
        for dn in docnums:
            expander.add(reader.vector_as(dn, fieldnum, "weight"))
        return expander.expanded_terms(numterms, normalize=normalize)
Пример #4
0
def test_add_text():
    # Expanding the sample text against the index should produce the three
    # dominant content terms, in order of importance.
    ix = create_index()
    with ix.reader() as reader:
        expander = classify.Expander(reader, "content")
        expander.add_text(text)
        terms = [term for term, _score in expander.expanded_terms(3)]
        assert terms == ["particles", "velocity", "field"]
Пример #5
0
 def key_terms_from_text(self, fieldname, text, numterms=5,
                         model=classify.Bo1Model, normalize=True):
     """Return the 'numterms' most important terms from the given text.

     :param fieldname: Analyze the text as if it were in this field.
     :param text: The text to extract key terms from.
     :param numterms: Return this number of important terms.
     :param model: The classify.ExpansionModel to use. See the classify
         module.
     """

     exp = classify.Expander(self.ixreader, fieldname, model=model)
     exp.add_text(text)
     return exp.expanded_terms(numterms, normalize=normalize)
Пример #6
0
def test_empty_more_like(model=classify.Bo1Model):
    # An index with zero documents must produce empty key-term results
    # rather than raising.
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "emptymore") as ix:
        with ix.searcher() as searcher:
            assert searcher.doc_count() == 0

            results = searcher.search(query.Term("a", u("b")))
            assert results.scored_length() == 0
            assert results.key_terms("text", model=model) == []

            expander = classify.Expander(searcher.reader(), "text",
                                         model=model)
            assert expander.expanded_terms(1) == []
Пример #7
0
def test_fake_more_like(model=classify.Bo1Model):
    # An EmptyReader (no backing index at all) should also expand to nothing.
    schema = fields.Schema(text=fields.TEXT)
    empty_reader = reading.EmptyReader(schema)
    expander = classify.Expander(empty_reader, "text", model=model)
    assert expander.expanded_terms(1) == []