Example #1
    def prepare_indices(self, build_index, path):
        if build_index:
            print("Indexing corpus...")
            schema = None
            if self.lang == "ja":
                schema = Schema(path=ID(stored=True),
                                content=NGRAM(stored=True))
            else:
                ana = analysis.StandardAnalyzer(stoplist=None, minsize=0)
                schema = Schema(path=ID(stored=True),
                                content=TEXT(analyzer=ana))
            index_directory = os.path.dirname(path) + "/tmp/indices/indexdir"
            if not os.path.exists(index_directory):
                os.makedirs(index_directory)
            self.ix = create_in(index_directory, schema)
            with self.ix.writer(limitmb=2048, multisegment=True) as writer:
                i = 0
                for utterance in log_progress(self.utterances):
                    writer.add_document(path=str(i), content=utterance.text)
                    i += 1

            print("Indexing corpus by lemma...")
            if self.lang == "ja":
                schema = Schema(path=ID(stored=True),
                                content=NGRAM(stored=True))
            else:
                ana = analysis.StandardAnalyzer(stoplist=None, minsize=0)
                schema = Schema(path=ID(stored=True),
                                content=TEXT(analyzer=ana))
            lemma_index_directory = os.path.dirname(path) + \
                "/tmp/indices/lemmaindexdir"
            if not os.path.exists(lemma_index_directory):
                os.makedirs(lemma_index_directory)
            self.ix_lemma = create_in(lemma_index_directory, schema)
            with self.ix_lemma.writer(limitmb=2048,
                                      multisegment=True) as writer:
                i = 0
                for utterance in log_progress(self.utterances):
                    lemmas = [token.lemma_ for token in utterance.spacy]
                    writer.add_document(path=str(i), content=" ".join(lemmas))
                    i += 1
        else:
            print("Loading indices...")
            index_directory = os.path.dirname(path) + "/tmp/indices/indexdir"
            if not os.path.exists(index_directory):
                raise IOError('No existing indices! You should build ' +
                              'indices before trying to load them.')
            self.ix = open_dir(index_directory)

            print("Loading lemma indices...")
            index_directory = os.path.dirname(path) + \
                "/tmp/indices/lemmaindexdir"
            if not os.path.exists(index_directory):
                raise IOError('No existing indices! You should build ' +
                              'indices before trying to load them.')
            self.ix_lemma = open_dir(index_directory)
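
prepare_indices only builds or loads the two Whoosh indices; a minimal sketch of how they might be queried afterwards. The helper name search_text is hypothetical and not part of the original class; the query flow uses Whoosh's standard QueryParser/searcher API.

from whoosh.qparser import QueryParser

def search_text(corpus, text):
    # Hypothetical helper: query the surface-form index built by
    # prepare_indices; the same pattern works against corpus.ix_lemma
    # for lemma-based search.
    parser = QueryParser("content", schema=corpus.ix.schema)
    with corpus.ix.searcher() as searcher:
        results = searcher.search(parser.parse(text))
        return [hit["path"] for hit in results]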
Example #2
    def test_vector_unicode(self):
        a = analysis.StandardAnalyzer()
        schema = fields.Schema(content=fields.TEXT(vector=formats.Frequency(
            analyzer=a)))
        ix = self.make_index("testindex", schema, "vector_unicode")
        try:
            writer = ix.writer()
            writer.add_document(
                content=u"\u1234\u2345\u3456 \u4567\u5678\u6789")
            writer.add_document(
                content=u"\u0123\u1234\u4567 \u4567\u5678\u6789")
            writer.commit()

            writer = ix.writer()
            writer.add_document(
                content=u"\u2345\u3456\u4567 \u789a\u789b\u789c")
            writer.add_document(
                content=u"\u0123\u1234\u4567 \u2345\u3456\u4567")
            writer.commit()

            reader = ix.reader()
            vec = list(reader.vector_as("frequency", 0, 0))
            self.assertEqual(vec, [(u'\u3456\u4567', 1),
                                   (u'\u789a\u789b\u789c', 1)])
        finally:
            pass
Example #3
    def test_vector_merge(self):
        a = analysis.StandardAnalyzer()
        schema = fields.Schema(
            title=fields.TEXT,
            content=fields.TEXT(vector=formats.Frequency(analyzer=a)))
        ix = self.make_index("testindex", schema, "vector_merge")
        try:
            writer = ix.writer()
            writer.add_document(
                title=u"one",
                content=u"This is the story of the black hole story")
            writer.commit()

            writer = ix.writer()
            writer.add_document(title=u"two",
                                content=u"You can read along in your book")
            writer.commit()

            reader = ix.reader()
            searcher = Searcher(reader)

            docnum = searcher.document_number(title=u"one")
            vec = list(reader.vector(docnum, "content").items_as("frequency"))
            self.assertEqual(vec, [(u'black', 1), (u'hole', 1), (u'story', 2)])

            docnum = searcher.document_number(title=u"two")
            vec = list(searcher.vector_as("frequency", docnum, "content"))
            self.assertEqual(vec, [(u'along', 1), (u'book', 1), (u'read', 1),
                                   (u'your', 1)])
        finally:
            pass
Example #4
    def test_simple_fragment(self):
        terms = frozenset(("bravo", "india"))
        sa = analysis.StandardAnalyzer()
        sf = highlight.SimpleFragmenter(size=20)
        uc = highlight.UppercaseFormatter()
        htext = highlight.highlight(self._doc, terms, sa, sf, uc)
        self.assertEqual(htext, "alfa BRAVO charlie...hotel INDIA juliet kilo")
Example #5
def test_maxclasses():
    terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == '<b class="match t0">alfa</b> <b class="match t1">bravo</b> <b class="match t0">charlie</b>...<b class="match t1">delta</b> <b class="match t0">echo</b> foxtrot'
Example #6
def test_context_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "alfa BRAVO charlie...hotel INDIA juliet"
Example #7
    def whoosh_schema(self):
        #ana = analysis.StemmingAnalyzer()
        ana = analysis.StandardAnalyzer()
        schema = fields.Schema(id=fields.ID(stored=True),
                               headline=fields.STORED,
                               text=fields.TEXT(analyzer=ana, stored=True))
        return schema
Example #8
def test_sequence_complex():
    ana = analysis.StandardAnalyzer(stoplist=None)
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           path=fields.ID(stored=True),
                           content=fields.TEXT(stored=True,
                                               phrase=True,
                                               analyzer=ana))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        w.add_document(title=u"First document",
                       path=u"/a",
                       content=u"This is the first document we've added!")
        w.add_document(title=u"Second document",
                       path=u"/b",
                       content=(u"In truth, he said, I would like to combine "
                                u"logical operators with proximity-based "
                                u"search in Whoosh!"))

    with ix.searcher() as s:
        qp = qparser.QueryParser("content", ix.schema)
        qp.remove_plugin_class(plugins.PhrasePlugin)
        qp.add_plugin(plugins.SequencePlugin())
        qp.add_plugin(plugins.FuzzyTermPlugin())

        q = qp.parse(u'"(he OR she OR we~) would*"~3')
        r = s.search(q)
        assert r.scored_length()
Example #9
def test_more_like_this(model=classify.Bo2Model):
    docs = [
        u("alfa bravo charlie delta echo foxtrot golf"),
        u("delta echo foxtrot golf hotel india juliet"),
        u("echo foxtrot golf hotel india juliet kilo"),
        u("foxtrot golf hotel india juliet kilo lima"),
        u("golf hotel india juliet kilo lima mike"),
        u("foxtrot golf hotel india alfa bravo charlie")
    ]

    def _check(schema, **kwargs):
        ix = RamStorage().create_index(schema)
        with ix.writer() as w:
            for i, text in enumerate(docs):
                w.add_document(id=text_type(i + 1), text=text)

        with ix.searcher() as s:
            docnum = s.document_number(id=u("1"))
            r = s.more_like(docnum, "text", model=model, **kwargs)
            assert [hit["id"] for hit in r] == ["6", "2", "3"]

    schema = fields.Schema(id=fields.ID(stored=True),
                           text=fields.TEXT(stored=True))
    _check(schema)

    ana = analysis.StandardAnalyzer()
    schema = fields.Schema(id=fields.ID(stored=True),
                           text=fields.TEXT(analyzer=ana,
                                            vector=formats.Frequency()))
    _check(schema)

    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    _check(schema, text=docs[0])
Example #10
def test_null_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    nf = highlight.WholeFragmenter()
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, nf, uc)
    assert htext == "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima"
Example #11
def test_context_at_start():
    terms = frozenset(["alfa"])
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=15)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "ALFA bravo charlie delta echo foxtrot"
Example #12
File: whooshindex.py Project: visar/montag
def build(db_dir):
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)

    index_dir = os.path.join(db_dir, "whoosh")

    if os.path.exists(index_dir):
        index = open_dir(index_dir)
    else:
        all_words_ana = analysis.StandardAnalyzer(stoplist=None, minsize=0)

        schema = Schema(any_field=TEXT(analyzer=all_words_ana),
                        title=TEXT(analyzer=all_words_ana),
                        subtitle=TEXT(analyzer=all_words_ana),
                        author=TEXT(analyzer=all_words_ana),
                        edition=TEXT(analyzer=all_words_ana),
                        principal_language=TEXT(analyzer=all_words_ana),
                        publication_year=NUMERIC(),
                        tag=KEYWORD(commas=True, scorable=True,
                                    lowercase=True),
                        guid=ID(stored=True, unique=True),
                        merge_db_id=NUMERIC(stored=True),
                        type=NUMERIC())

        os.mkdir(index_dir)
        index = create_in(index_dir, schema)

    return WhooshIndex(index)
Example #13
def test_html_format():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == 'alfa <strong class="match term0">bravo</strong> charlie...hotel <strong class="match term1">india</strong> juliet'
Example #14
def make_index():
    ana = analysis.StandardAnalyzer(stoplist=None)
    sc = fields.Schema(id=fields.ID(stored=True),
                       text=fields.TEXT(analyzer=ana,
                                        vector=formats.Frequency()),
                       subs=fields.NUMERIC(int, stored=True))
    ix = RamIndex(sc)
    ix.add_document(
        id=u("fieldtype"),
        text=u("The FieldType object supports the following attributes"),
        subs=56)
    ix.add_document(id=u("format"),
                    text=u("the storage format for the field contents"),
                    subs=100)
    ix.add_document(
        id=u("vector"),
        text=u("the storage format for the field vectors (forward index)"),
        subs=23)
    ix.add_document(
        id=u("scorable"),
        text=u("whether searches against this field may be scored."),
        subs=34)
    ix.add_document(
        id=u("stored"),
        text=u(
            "whether the content of this field is stored for each document."),
        subs=575)
    ix.add_document(
        id=u("unique"),
        text=u("whether this field value is unique to each document."),
        subs=2)
    ix.add_document(id=u("const"),
                    text=u("The constructor for the base field type simply"),
                    subs=58204)
    return ix
Example #15
def test_charset_pickeability():
    from whoosh.support import charset
    charmap = charset.charset_table_to_dict(charset.default_charset)
    ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)
    _ = dumps(ana, -1)

    ana = analysis.CharsetTokenizer(charmap)
    _ = dumps(ana, -1)
Example #16
def test_html_escape():
    terms = frozenset(["bravo"])
    sa = analysis.StandardAnalyzer()
    wf = highlight.WholeFragmenter()
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(u('alfa <bravo "charlie"> delta'), terms, sa,
                                wf, hf)
    assert htext == 'alfa &lt;<strong class="match term0">bravo</strong> "charlie"&gt; delta'
Example #17
def test_sentence_fragment():
    text = u("This is the first sentence. This one doesn't have the word. " +
             "This sentence is the second. Third sentence here.")
    terms = ("sentence", )
    sa = analysis.StandardAnalyzer(stoplist=None)
    sf = highlight.SentenceFragmenter()
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(text, terms, sa, sf, uc)
    assert htext == "This is the first SENTENCE...This SENTENCE is the second...Third SENTENCE here"
Example #18
def test_free_dates():
    a = analysis.StandardAnalyzer(stoplist=None)
    schema = fields.Schema(text=fields.TEXT(analyzer=a), date=fields.DATETIME)
    qp = qparser.QueryParser("text", schema)
    basedate = datetime(2010, 9, 20, 15, 16, 6, 454000)
    qp.add_plugin(dateparse.DateParserPlugin(basedate, free=True))

    q = qp.parse(u("hello date:last tuesday"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.Term)
    assert_equal(q[0].text, "hello")
    assert_equal(q[1].__class__, query.DateRange)
    assert_equal(q[1].startdate, adatetime(2010, 9, 14).floor())
    assert_equal(q[1].enddate, adatetime(2010, 9, 14).ceil())

    q = qp.parse(u("date:mar 29 1972 hello"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.DateRange)
    assert_equal(q[0].startdate, adatetime(1972, 3, 29).floor())
    assert_equal(q[0].enddate, adatetime(1972, 3, 29).ceil())
    assert_equal(q[1].__class__, query.Term)
    assert_equal(q[1].text, "hello")

    q = qp.parse(u("date:2005 march 2"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2005, 3, 2).floor())
    assert_equal(q.enddate, adatetime(2005, 3, 2).ceil())

    q = qp.parse(u("date:'2005' march 2"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 3)
    assert_equal(q[0].__class__, query.DateRange)
    assert_equal(q[0].startdate, adatetime(2005).floor())
    assert_equal(q[0].enddate, adatetime(2005).ceil())
    assert_equal(q[1].__class__, query.Term)
    assert_equal(q[1].fieldname, "text")
    assert_equal(q[1].text, "march")

    q = qp.parse(u("date:march 24 to dec 12"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2010, 3, 24).floor())
    assert_equal(q.enddate, adatetime(2010, 12, 12).ceil())

    q = qp.parse(u("date:5:10pm"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2010, 9, 20, 17, 10).floor())
    assert_equal(q.enddate, adatetime(2010, 9, 20, 17, 10).ceil())

    q = qp.parse(u("(date:30 june OR date:10 july) quick"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.Or)
    assert_equal(q[0][0].__class__, query.DateRange)
    assert_equal(q[0][1].__class__, query.DateRange)
Example #19
    def test_null_fragment(self):
        terms = frozenset(("bravo", "india"))
        sa = analysis.StandardAnalyzer()
        nf = highlight.NullFragmenter
        uc = highlight.UppercaseFormatter()
        htext = highlight.highlight(self._doc, terms, sa, nf, uc)
        self.assertEqual(
            htext,
            "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima"
        )
Example #20
def test_url():
    sample = u("Visit http://bitbucket.org/mchaput/whoosh or " +
               "urn:isbn:5930502 or http://www.apple.com/.")

    anas = [analysis.SimpleAnalyzer(analysis.url_pattern),
            analysis.StandardAnalyzer(analysis.url_pattern, stoplist=None)]
    for ana in anas:
        ts = [t.text for t in ana(sample)]
        assert ts == [u('visit'), u('http://bitbucket.org/mchaput/whoosh'),
                      u('or'), u('urn:isbn:5930502'), u('or'),
                      u('http://www.apple.com/')]
Example #21
def test_dash():
    ana = analysis.StandardAnalyzer("[ \t\r\n()*?]+")
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           text=fields.TEXT(analyzer=ana), time=fields.ID)
    qtext = u("*Ben-Hayden*")

    qp = default.QueryParser("text", schema)
    q = qp.parse(qtext)
    assert_equal(repr(q), "Wildcard('text', u'*ben-hayden*')")

    qp = default.MultifieldParser(["title", "text", "time"], schema)
    q = qp.parse(qtext)
    assert_equal(repr(q), "Or([Wildcard('title', u'*ben-hayden*'), Wildcard('text', u'*ben-hayden*'), Wildcard('time', u'*Ben-Hayden*')])")
Example #22
class SearchEngine(object):
    ana = analysis.StandardAnalyzer()
    schema = whoosh.fields.Schema(
        url=whoosh.fields.ID(unique=True, stored=True),
        title=whoosh.fields.TEXT(stored=True, phrase=False),
        content=whoosh.fields.TEXT(spelling=True, stored=True, phrase=False), 
        modtime=whoosh.fields.ID())

    def __init__(self, index_path):
        self.path = index_path
        if not os.path.exists(index_path):
            os.makedirs(index_path)

    def create_index(self):
        whoosh.index.create_in(self.path, self.schema)
        self.open_index()

    def open_index(self):
        self._index = whoosh.index.open_dir(self.path)
        self._writer = self._index.writer()
        return self._index

    def add_document(self, url, title, content, modtime):
        self._writer.add_document(
            url=unicode(url),
            title=unicode(title),
            content=unicode(content, errors='ignore'),
            modtime=unicode(modtime))

    def update_document(self, url, title, content, modtime):
        self._writer.update_document(
            url=unicode(url),
            title=unicode(title),
            content=unicode(content),
            modtime=unicode(modtime))

    def delete_document(self, url):
        self._index.delete_by_term('url', unicode(url))

    _queryparser = whoosh.qparser.QueryParser('content', schema=schema)

    def find(self):
        searcher = self._index.searcher()
        return searcher, self._queryparser

    def commit(self):
        self._writer.commit()

    def cancel(self):
        self._writer.cancel()
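
The class never shows its intended call order; a hedged usage sketch follows. The path and document values are made up, and the order create_index → add_document → commit → find is an assumption about how the class is meant to be driven.

engine = SearchEngine("/tmp/search-index")   # hypothetical location
engine.create_index()                        # creates the index, then opens it and a writer
engine.add_document(url="http://example.com/",
                    title="Example page",
                    content="Some page text to index",
                    modtime="2014-01-01")
engine.commit()

searcher, parser = engine.find()
for hit in searcher.search(parser.parse(u"page")):
    print(hit["url"])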
Example #23
def test_fuzzy_plugin():
    ana = analysis.StandardAnalyzer("\\S+")
    schema = fields.Schema(f=fields.TEXT(analyzer=ana))
    qp = default.QueryParser("f", schema)
    qp.add_plugin(plugins.FuzzyTermPlugin())

    q = qp.parse("bob~")
    assert q.__class__ == query.FuzzyTerm
    assert q.field() == "f"
    assert q.text == "bob"
    assert q.maxdist == 1

    q = qp.parse("Alfa Bravo~ Charlie")
    assert q.__class__ == query.And
    assert q[0].__class__ == query.Term
    assert q[0].text == "alfa"
    assert q[1].__class__ == query.FuzzyTerm
    assert q[1].field() == "f"
    assert q[1].text == "bravo"
    assert q[1].maxdist == 1
    assert q[2].__class__ == query.Term
    assert q[2].text == "charlie"

    q = qp.parse("Alfa Bravo~2 Charlie")
    assert q.__class__ == query.And
    assert q[0].__class__ == query.Term
    assert q[0].text == "alfa"
    assert q[1].__class__ == query.FuzzyTerm
    assert q[1].field() == "f"
    assert q[1].text == "bravo"
    assert q[1].maxdist == 2
    assert q[2].__class__ == query.Term
    assert q[2].text == "charlie"

    q = qp.parse("alfa ~2 bravo")
    assert q.__class__ == query.And
    assert q[0].__class__ == query.Term
    assert q[0].text == "alfa"
    assert q[1].__class__ == query.Term
    assert q[1].text == "~2"
    assert q[2].__class__ == query.Term
    assert q[2].text == "bravo"

    qp = default.QueryParser("f", None)
    q = qp.parse("'bob~'")
    assert q.__class__ == query.Term
    assert q.field() == "f"
    assert q.text == "bob~"
Example #24
def create_index():
    analyzer = analysis.StandardAnalyzer()
    vector_format = formats.Frequency()
    schema = fields.Schema(path=fields.ID(stored=True),
                           content=fields.TEXT(analyzer=analyzer,
                                               vector=vector_format))

    ix = RamStorage().create_index(schema)

    w = ix.writer()
    from string import ascii_lowercase
    for letter, content in zip(ascii_lowercase, domain):
        w.add_document(path=u("/%s") % letter, content=content)
    w.commit()

    return ix
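
A hedged follow-up for the index returned above (`domain` is a module-level list of unicode strings in the original test and is not shown here); it reads back a stored term vector the same way other examples in this listing do.

ix = create_index()
r = ix.reader()
try:
    # term vector of the first document's "content" field as (term, frequency) pairs
    print(list(r.vector_as("frequency", 0, "content")))
finally:
    r.close()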
Example #25
def test_dash():
    ana = analysis.StandardAnalyzer("[^ \t\r\n()*?]+")
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           text=fields.TEXT(analyzer=ana),
                           time=fields.ID)
    qtext = u("*Ben-Hayden*")

    qp = default.QueryParser("text", schema)
    q = qp.parse(qtext)
    assert q.__class__ == query.Wildcard
    assert q.fieldname == "text"
    assert q.text == "*ben-hayden*"

    qp = default.MultifieldParser(["title", "text", "time"], schema)
    q = qp.parse(qtext)
    assert q.__unicode__() == "(title:*ben-hayden* OR text:*ben-hayden* OR time:*Ben-Hayden*)"
Example #26
def test_vector_postings():
    s = fields.Schema(id=fields.ID(stored=True, unique=True),
                      content=fields.TEXT(vector=formats.Positions(
                          analyzer=analysis.StandardAnalyzer())))
    st = RamStorage()
    ix = st.create_index(s)

    writer = ix.writer()
    writer.add_document(
        id=u('1'), content=u('the quick brown fox jumped over the lazy dogs'))
    writer.commit()
    r = ix.reader()

    terms = list(r.vector_as("weight", 0, "content"))
    assert_equal(terms, [(u('brown'), 1.0), (u('dogs'), 1.0), (u('fox'), 1.0),
                         (u('jumped'), 1.0), (u('lazy'), 1.0),
                         (u('over'), 1.0), (u('quick'), 1.0)])
Example #27
File: index.py Project: molson77/Kive
def get_schema():
    '''
    Specifies what fields are stored in the index and returns to be passed to newly created index.
    '''

    # analyzer = analysis.SpaceSeparatedTokenizer() | analysis.LowercaseFilter() | analysis.CharsetFilter(accent_map)
    analyzer = analysis.StandardAnalyzer(
        stoplist=None, minsize=1) | analysis.CharsetFilter(accent_map)
    return Schema(name=TEXT(analyzer=analyzer, stored=True),
                  path=TEXT(stored=True),
                  content=TEXT(analyzer=analyzer, stored=True),
                  legacy_ingest=TEXT,
                  ingest=TEXT,
                  last_accessed=TEXT,
                  media_files=TEXT(analyzer=analyzer, stored=True),
                  indexed_time=DATETIME(stored=True),
                  id=ID(stored=True, unique=True))
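
A short, hedged usage sketch for the schema above; the directory name and document values are illustrative, while index.create_in and the writer context manager are standard Whoosh calls.

import os
from datetime import datetime
from whoosh import index

index_dir = "search_index"  # hypothetical location
if not os.path.exists(index_dir):
    os.mkdir(index_dir)
ix = index.create_in(index_dir, get_schema())

with ix.writer() as writer:
    # the writer commits automatically when the with-block exits
    writer.add_document(id=u"doc-1",
                        name=u"Example dataset",
                        path=u"/datasets/example",
                        content=u"Searchable text for the example dataset",
                        indexed_time=datetime.now())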
Example #28
    def _get_index(self):
        if self.index is None:
            path = os.path.join(self.db_path, 'whoosh-index')
            if not os.path.exists(path):
                # StandardAnalyzer lowercases all words; configure it to drop
                # stopwords and any word whose length is not between
                # self.minlength and self.maxlength (from indexer_common).
                stopfilter = analysis.StandardAnalyzer(  # stoplist=self.stopwords,
                    minsize=self.minlength,
                    maxsize=self.maxlength)
                os.mkdir(path)
                schema = fields.Schema(
                    identifier=fields.ID(stored=True, unique=True),
                    content=fields.TEXT(analyzer=stopfilter))
                index.create_in(path, schema)
            self.index = index.open_dir(path)
        return self.index
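
The comment in _get_index describes length-based filtering; here is a tiny, hedged demonstration with made-up bounds (the real values come from self.minlength and self.maxlength in indexer_common).

from whoosh import analysis

ana = analysis.StandardAnalyzer(minsize=3, maxsize=8)
tokens = [t.text for t in ana(u"a an the extraordinarily long words stay")]
# stopwords and tokens outside the 3..8 character range are dropped
print(tokens)  # ['long', 'words', 'stay']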
Example #29
    def test_vector_phrase(self):
        ana = analysis.StandardAnalyzer()
        ftype = fields.FieldType(formats.Frequency(ana),
                                 formats.Positions(ana),
                                 scorable=True)
        schema = fields.Schema(name=fields.ID(stored=True), value=ftype)
        storage = RamStorage()
        ix = storage.create_index(schema)
        writer = ix.writer()
        writer.add_document(name=u"A",
                            value=u"Little Miss Muffet sat on a tuffet")
        writer.add_document(name=u"B", value=u"Miss Little Muffet tuffet")
        writer.add_document(name=u"C", value=u"Miss Little Muffet tuffet sat")
        writer.add_document(
            name=u"D",
            value=u"Gibberish blonk falunk miss muffet sat tuffet garbonzo")
        writer.add_document(name=u"E", value=u"Blah blah blah pancakes")
        writer.commit()

        searcher = ix.searcher()

        def names(results):
            return sorted([fields['name'] for fields in results])

        q = query.Phrase("value",
                         [u"little", u"miss", u"muffet", u"sat", u"tuffet"])
        sc = q.scorer(searcher)
        self.assertEqual(sc.__class__.__name__, "VectorPhraseScorer")

        self.assertEqual(names(searcher.search(q)), ["A"])

        q = query.Phrase("value", [u"miss", u"muffet", u"sat", u"tuffet"])
        self.assertEqual(names(searcher.search(q)), ["A", "D"])

        q = query.Phrase("value", [u"falunk", u"gibberish"])
        self.assertEqual(names(searcher.search(q)), [])

        q = query.Phrase("value", [u"gibberish", u"falunk"], slop=2)
        self.assertEqual(names(searcher.search(q)), ["D"])

        #q = query.Phrase("value", [u"blah"] * 4)
        #self.assertEqual(names(searcher.search(q)), []) # blah blah blah blah

        q = query.Phrase("value", [u"blah"] * 3)
        self.assertEqual(names(searcher.search(q)), ["E"])
Example #30
    def test_vector_reading(self):
        a = analysis.StandardAnalyzer()
        schema = fields.Schema(
            title=fields.TEXT,
            content=fields.TEXT(vector=formats.Frequency(analyzer=a)))
        ix = self.make_index("testindex", schema, "vector_reading")
        try:
            writer = ix.writer()
            writer.add_document(
                title=u"one",
                content=u"This is the story of the black hole story")
            writer.commit()

            reader = ix.reader()
            self.assertEqual(list(reader.vector_as("frequency", 0, "content")),
                             [(u'black', 1), (u'hole', 1), (u'story', 2)])
        finally:
            pass