示例#1
0
File: ir.py  Project: BinbinBian/qb
    def __init__(self, location, mean, var, num_results, time_limit):
        """Open the Whoosh index at *location* and set up query parsers.

        location: directory holding the index (also used as the name).
        mean/var: score-normalization statistics; NaN values fall back
            to 0.0 / 1.0 respectively.
        num_results: maximum number of hits to request per query.
        time_limit: per-search time limit.
        """
        self._name = location
        self._index = index.open_dir(location)
        self._query_hash = 0
        self._query_terms = None

        # Cache of previously fetched documents.
        self._document_cache = {}

        self._limit = num_results
        self._time = time_limit

        # TODO(jbg): This is a parameter that can be optimized
        og = qparser.OrGroup.factory(0.9)
        self._text_parser = qparser.QueryParser("content",
                                                self._index.schema, group=og)
        self._id_parser = qparser.QueryParser("id",
                                              self._index.schema,
                                              group=og)

        # Guard against NaN statistics (e.g. from an empty sample).
        self._mean = mean if not isnan(mean) else 0.0
        # BUG FIX: the original re-assigned self._var = var *after* this
        # guard, clobbering the NaN fallback; the redundant assignment
        # has been removed.
        self._var = var if not isnan(var) else 1.0

        self._misses = 0
        self._hits = 0
示例#2
0
    def mostrar_lista(event):
        """Run a boolean search built from the entry widget's text.

        The entry is expected to hold "<contenido> <operador> <titulo>";
        the operator word selects AND ("Y"), OR, or AND-NOT combination
        of the 'contenido' and 'titulo' field queries.
        """
        ix = open_dir(dirindex)
        with ix.searcher() as searcher:
            # FIX: parse the widget text once instead of calling
            # str(en.get()).split(" ") three separate times.
            parts = str(en.get()).split(" ")
            contenido = parts[0]
            operador = parts[1]
            titulo = parts[2]
            print(contenido)
            print(operador)
            print(titulo)

            if "Y" in operador:
                query = qparser.QueryParser(
                    'contenido', ix.schema, group=qparser.OrGroup
                ).parse(contenido) & qparser.QueryParser(
                    'titulo', ix.schema, group=qparser.OrGroup).parse(titulo)
            elif "OR" in operador:
                query = QueryParser('contenido',
                                    ix.schema).parse(contenido) | QueryParser(
                                        'titulo', ix.schema).parse(titulo)
            else:
                query = QueryParser('contenido',
                                    ix.schema).parse(contenido) - QueryParser(
                                        'titulo', ix.schema).parse(titulo)

            results = searcher.search(query)

            imprimir_b_a(results)
示例#3
0
    def test_escaping(self):
        """Backslash escaping in query strings should yield plain Terms."""
        qp = qparser.QueryParser("text")

        # A single backslash escapes (here: removes) the next character.
        q = qp.parse(r'big\small')
        self.assertEqual(q.__class__, query.Term, q)
        self.assertEqual(q.text, "bigsmall")

        # A doubled backslash yields a literal backslash.
        q = qp.parse(r'big\\small')
        self.assertEqual(q.__class__, query.Term)
        self.assertEqual(q.text, r'big\small')

        # An escaped colon prevents field-prefix parsing.
        q = qp.parse(r'http\:example')
        self.assertEqual(q.__class__, query.Term)
        self.assertEqual(q.fieldname, "text")
        self.assertEqual(q.text, "http:example")

        # An escaped space keeps both words in one term.
        q = qp.parse(r'hello\ there')
        self.assertEqual(q.__class__, query.Term)
        self.assertEqual(q.text, "hello there")

        # Escaped brackets/spaces suppress range-query parsing.
        q = qp.parse(r'\[start\ TO\ end\]')
        self.assertEqual(q.__class__, query.Term)
        self.assertEqual(q.text, "[start TO end]")

        # FIX: the original built a fields.Schema here but never passed it
        # to the parser; the dead local has been removed.
        qp = qparser.QueryParser("text")
        q = qp.parse(r"http\:\/\/www\.example\.com")
        self.assertEqual(q.__class__.__name__, "Term")
        self.assertEqual(q.text, "http://www.example.com")

        # Two escaped backslashes collapse to a single literal backslash.
        q = qp.parse(u"\\\\")
        self.assertEqual(q.__class__.__name__, "Term")
        self.assertEqual(q.text, "\\")
示例#4
0
def test_pseudofield():
    """PseudoFieldPlugin should let fake field names rewrite query nodes."""
    schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT)

    def regex_maker(node):
        # Rewrite text under the pseudo-field "regex" into a regex query
        # on the real "content" field.
        if node.has_text:
            rx_node = qparser.RegexPlugin.RegexNode(node.text)
            rx_node.set_fieldname("content")
            return rx_node

    qp = qparser.QueryParser("a", schema)
    qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker}))
    q = qp.parse(u("alfa regex:br.vo"))
    assert_equal(q.__unicode__(), '(a:alfa AND content:r"br.vo")')

    def rev_text(node):
        if node.has_text:
            # Build a word node holding the reversed text.
            backwards = node.text[::-1]
            rnode = qparser.WordNode(backwards)
            # Copy the original node's character range onto the new node.
            rnode.set_range(node.startchar, node.endchar)

            # Match either the original or the reversed word.
            group = qparser.OrGroup([node, rnode])

            # The PseudoFieldPlugin strips the field-name syntax, so the
            # real field name has to be re-attached here.
            group.set_fieldname("reverse")

            return group

    qp = qparser.QueryParser("content", schema)
    qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text}))
    q = qp.parse(u("alfa reverse:bravo"))
    assert_equal(q.__unicode__(), '(content:alfa AND (reverse:bravo OR reverse:ovarb))')
示例#5
0
def get_video_ids(query):
    """Return titles of videos whose content fuzzily matches *query*.

    Each query term is turned into a fuzzy term with Levenshtein
    distance 1; the final query is the union of an OR-grouped and an
    AND-grouped parse of those fuzzy terms, searched with TF-IDF.
    """
    levenshtein_distance = 1
    index = open_dir(corpus_index_dir)

    fuzzy_query_string = " ".join(
        "{0}~{1}".format(qt, levenshtein_distance)
        for qt in query.split(" ")
    )

    def _parse_with_group(group):
        # FIX: the two parsers only differed in their grouping; build
        # them through one helper instead of duplicating the setup.
        parser = qparser.QueryParser("content", index.schema, group=group)
        parser.add_plugin(qparser.FuzzyTermPlugin())
        return parser.parse(fuzzy_query_string)

    # FIX (naming): this object is a combined query, not a parser as the
    # original variable name ("fuzzy_query_parser") suggested.
    combined_query = Or([_parse_with_group(qparser.OrGroup),
                         _parse_with_group(qparser.AndGroup)])

    with index.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(combined_query, limit=None)
        video_ids = [result.fields()["title"] for result in results]

    return video_ids
示例#6
0
def test_multi_language():
    """Index pre-analyzed tokens per language, then search each language
    by swapping the analyzer on the searcher's schema copy."""
    # Analyzer for English
    ana_eng = analysis.StemmingAnalyzer()

    # analyzer for Pig Latin
    def stem_piglatin(w):
        # Strip the trailing "ay" suffix as a fake stemming step.
        if w.endswith("ay"):
            w = w[:-2]
        return w

    ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"],
                                        stemfn=stem_piglatin)

    # Dictionary mapping languages to analyzers
    analyzers = {"eng": ana_eng, "pig": ana_pig}

    # Fake documents
    corpus = [(u("eng"), u("Such stuff as dreams are made on")),
              (u("pig"), u("Otay ebay, roay otnay otay ebay"))]

    schema = fields.Schema(content=fields.TEXT(stored=True),
                           lang=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        for doclang, content in corpus:
            ana = analyzers[doclang]
            # "Pre-analyze" the field into token strings
            words = [token.text for token in ana(content)]
            # Note we store the original value but index the pre-analyzed words
            w.add_document(lang=doclang,
                           content=words,
                           _stored_content=content)

    with ix.searcher() as s:
        schema = s.schema

        # Modify the schema to fake the correct analyzer for the language
        # we're searching in
        schema["content"].analyzer = analyzers["eng"]

        # "dreaming" should stem to match the English document's "dreams".
        qp = qparser.QueryParser("content", schema)
        q = qp.parse("dreaming")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Such stuff as dreams are made on"

        # Switching to the Pig Latin analyzer should find the other doc.
        schema["content"].analyzer = analyzers["pig"]
        qp = qparser.QueryParser("content", schema)
        q = qp.parse("otnay")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Otay ebay, roay otnay otay ebay"
示例#7
0
def test_workflow_easy():
    """End-to-end workflow: index five titles, search, highlight hits."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           title=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    documents = [
        (u("1"), u("The man who wasn't there")),
        (u("2"), u("The dog who barked at midnight")),
        (u("3"), u("The invisible man")),
        (u("4"), u("The girl with the dragon tattoo")),
        (u("5"), u("The woman who disappeared")),
    ]
    for doc_id, title in documents:
        writer.add_document(id=doc_id, title=title)
    writer.commit()

    with ix.searcher() as s:
        # Parse the user query against the title field.
        parser = qparser.QueryParser("title", schema=ix.schema)
        q = parser.parse(u("man"))
        r = s.search(q, terms=True)
        assert_equal(len(r), 2)

        # Uppercase the matched term within the whole (unfragmented) title.
        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        outputs = [hit.highlights("title") for hit in r]
        assert_equal(outputs, ["The invisible MAN",
                               "The MAN who wasn't there"])
示例#8
0
def queryparse():
    """Prompt for a query string and offer a spelling-corrected version.

    Returns the corrected string if the user accepts it with "Y";
    returns the original string when no correction is suggested.
    Any other answer re-prompts for a new query.
    """
    # FIX: the original drove the loop with a redundant boolean flag
    # ("c") even though every exit path already returns.
    while True:
        print("Enter string")
        qstring = input()
        qp = qparser.QueryParser("content", ix.schema)
        q = qp.parse(qstring)

        with ix.searcher() as s:
            corrected = s.correct_query(q, qstring)
            if corrected.query != q:
                print("Did you mean:", corrected.string)
                print(corrected.string)
                print("Enter yes or no [Y/N]")
                if input() == "Y":
                    return corrected.string
                # Otherwise fall through and prompt again.
            else:
                print(qstring)
                return qstring
示例#9
0
def base_search(searcher, field, schema, query):
    '''
    Wrapper for fulltext search.

    Parses *query* against *field* using *schema* and returns the
    'checksum' stored field of every hit.
    '''
    parsed_query = qparser.QueryParser(field, schema).parse(query)
    hits = searcher.search(parsed_query)
    return [hit['checksum'] for hit in hits]
示例#10
0
def test_boolean():
    """BOOLEAN fields should match true/yes and false/no query words."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for doc_id, flag in (("a", True), ("b", False), ("c", True),
                         ("d", False), ("e", True)):
        writer.add_document(id=u(doc_id), done=flag)
    writer.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        # Both "true" and "yes" select the done=True documents.
        for true_word in ("done:true", "done:yes"):
            r = s.search(qp.parse(true_word))
            assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
            assert all(d["done"] for d in r)

        # "done:false" parses to a Term whose text is the boolean False.
        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")
        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        # "no" behaves the same as "false".
        r = s.search(qp.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)
示例#11
0
def test_decimal_ranges():
    """Range queries on a decimal NUMERIC field honor open/closed ends."""
    from decimal import Decimal

    schema = fields.Schema(id=fields.STORED,
                           num=fields.NUMERIC(int, decimal_places=2))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    value = Decimal("0.0")
    step = Decimal("0.2")
    for _ in xrange(500):
        w.add_document(id=str(value), num=value)
        value += step
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, start, end):
            # Matched ids must equal the inclusive range [start, end]
            # walked in the same 0.2 steps used at indexing time.
            matched = [s.stored_fields(d)["id"] for d in qp.parse(qs).docs(s)]

            expected = []
            cursor = Decimal(start)
            upper = Decimal(end)
            while cursor <= upper:
                expected.append(str(cursor))
                cursor += step

            assert matched == expected

        # Curly braces exclude the endpoint; square brackets include it.
        check("[10.2 to 80.8]", "10.2", "80.8")
        check("{10.2 to 80.8]", "10.4", "80.8")
        check("[10.2 to 80.8}", "10.2", "80.6")
        check("{10.2 to 80.8}", "10.4", "80.6")
示例#12
0
def test_decimal_numeric():
    """Exact-match queries against a decimal NUMERIC field."""
    from decimal import Decimal

    deci_field = fields.NUMERIC(int, decimal_places=4)
    schema = fields.Schema(id=fields.ID(stored=True), deci=deci_field)
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    for doc_id, value in (("a", "123.56"), ("b", "0.536255"),
                          ("c", "2.5255"), ("d", "58")):
        w.add_document(id=u(doc_id), deci=Decimal(value))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("deci", schema)

        r = s.search(qp.parse(u("123.56")))
        assert len(r) == 1
        assert r[0]["id"] == "a"

        r = s.search(qp.parse(u("0.536255")))
        assert len(r) == 1
        assert r[0]["id"] == "b"
示例#13
0
def test_andmaybe_quality():
    """ANDMAYBE must keep the year-boosted hit in the top-2, whether the
    limit comes from slicing or from the search call itself."""
    schema = fields.Schema(id=fields.STORED, title=fields.TEXT(stored=True),
                           year=fields.NUMERIC)
    ix = RamStorage().create_index(schema)

    corpus = [(u('Alpha Bravo Charlie Delta'), 2000),
              (u('Echo Bravo Foxtrot'), 2000), (u('Bravo Golf Hotel'), 2002),
              (u('Bravo India'), 2002), (u('Juliet Kilo Bravo'), 2004),
              (u('Lima Bravo Mike'), 2004)]
    writer = ix.writer()
    for title, year in corpus:
        writer.add_document(title=title, year=year)
    writer.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("title", ix.schema)
        q = qp.parse(u("title:bravo ANDMAYBE year:2004"))

        # Unlimited search, sliced down to two afterwards.
        titles = [hit["title"] for hit in s.search(q, limit=None)[:2]]
        print("titles1=", titles)
        assert "Juliet Kilo Bravo" in titles

        # Searching with limit=2 should agree with the sliced version.
        titles = [hit["title"] for hit in s.search(q, limit=2)]
        print("titles2=", titles)
        assert "Juliet Kilo Bravo" in titles
示例#14
0
def test_not2():
    """NOT queries exclude matching docs, before and after a deletion."""
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    ix = RamStorage().create_index(schema)

    corpus = [("a", "alfa bravo charlie delta echo"),
              ("b", "bravo charlie delta echo foxtrot"),
              ("c", "charlie delta echo foxtrot golf"),
              ("d", "delta echo golf hotel india"),
              ("e", "echo golf hotel india juliet")]
    writer = ix.writer()
    for name, value in corpus:
        writer.add_document(name=u(name), value=u(value))
    writer.commit()

    with ix.searcher() as s:
        p = qparser.QueryParser("value", None)
        results = s.search(p.parse("echo NOT golf"))
        assert_equal(sorted([d["name"] for d in results]), ["a", "b"])

        results = s.search(p.parse("echo NOT bravo"))
        assert_equal(sorted([d["name"] for d in results]), ["c", "d", "e"])

    # Drop every document containing "bravo", then re-check exclusion.
    ix.delete_by_term("value", u("bravo"))

    with ix.searcher() as s:
        results = s.search(p.parse("echo NOT charlie"))
        assert_equal(sorted([d["name"] for d in results]), ["d", "e"])
示例#15
0
def test_gtlt():
    """GtLtPlugin should turn >/</>=/<= field prefixes into range queries,
    and degrade gracefully when its supporting plugins are removed."""
    schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC,
                           c=fields.KEYWORD,
                           d=fields.NUMERIC(float), e=fields.DATETIME)
    qp = qparser.QueryParser("a", schema)
    qp.add_plugin(plugins.GtLtPlugin())
    qp.add_plugin(dateparse.DateParserPlugin())

    # ">" becomes an open-ended numeric range, "<=" a term range.
    q = qp.parse(u("a:hello b:>100 c:<=z there"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 4)
    assert_equal(q[0], query.Term("a", "hello"))
    assert_equal(q[1], query.NumericRange("b", 100, None, startexcl=True))
    assert_equal(q[2], query.TermRange("c", None, 'z'))
    assert_equal(q[3], query.Term("a", "there"))

    # A quoted date after ">" becomes an open-ended date range.
    q = qp.parse(u("hello e:>'29 mar 2001' there"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 3)
    assert_equal(q[0], query.Term("a", "hello"))
    # As of this writing, date ranges don't support startexcl/endexcl
    assert_equal(q[1], query.DateRange("e", datetime(2001, 3, 29, 0, 0), None))
    assert_equal(q[2], query.Term("a", "there"))

    # A space after the operator breaks the syntax: the prefixes fall
    # back to plain terms on the default field.
    q = qp.parse(u("a:> alfa c:<= bravo"))
    assert_equal(text_type(q), "(a:a: AND a:alfa AND a:c: AND a:bravo)")

    # Without FieldsPlugin/RangePlugin, GtLt syntax is inert as well.
    qp.remove_plugin_class(plugins.FieldsPlugin)
    qp.remove_plugin_class(plugins.RangePlugin)
    q = qp.parse(u("hello a:>500 there"))
    assert_equal(text_type(q), "(a:hello AND a:a: AND a:500 AND a:there)")
示例#16
0
def test_custom_tokens():
    """OperatorsPlugin accepts custom operator syntax patterns."""
    qp = qparser.QueryParser("text", None)
    qp.remove_plugin_class(plugins.OperatorsPlugin)

    custom_ops = plugins.OperatorsPlugin(And="&", Or="\\|", AndNot="&!",
                                         AndMaybe="&~", Not="-")
    qp.add_plugin(custom_ops)

    # "|" is now the Or operator.
    q = qp.parse("this | that")
    assert_equal(q.__class__, query.Or)
    for idx, expected_text in ((0, "this"), (1, "that")):
        assert_equal(q[idx].__class__, query.Term)
        assert_equal(q[idx].text, expected_text)

    # "&!" is now the AndNot operator.
    q = qp.parse("this&!that")
    assert_equal(q.__class__, query.AndNot)
    assert_equal(q.a.__class__, query.Term)
    assert_equal(q.a.text, "this")
    assert_equal(q.b.__class__, query.Term)
    assert_equal(q.b.text, "that")

    # "-" negates; the word "NOT" is no longer special and parses as a term.
    q = qp.parse("alfa -bravo NOT charlie")
    assert_equal(len(q), 4)
    assert_equal(q[1].__class__, query.Not)
    assert_equal(q[1].query.text, "bravo")
    assert_equal(q[2].text, "NOT")
示例#17
0
def test_combos():
    """Every ordered combination of qparser plugins should parse a complex
    query string without raising."""
    qs = 'w:a "hi there"^4.2 AND x:b^2.3 OR c AND (y:d OR e) (apple ANDNOT bear)^2.3'

    # Constructor argument tuples for plugins that need them.
    # BUG FIX: the original dict listed plugins.MultifieldPlugin twice
    # (the first, boost-carrying entry was silently discarded by the
    # duplicate key), and gave PseudoFieldPlugin a bare dict instead of
    # a one-element tuple, so *args unpacking would have passed the
    # dict's keys as positional arguments.
    init_args = {plugins.MultifieldPlugin: (["content", "title"],
                                            {"content": 1.0, "title": 1.2}),
                 plugins.FieldAliasPlugin: ({"content": ("text", "body")},),
                 plugins.CopyFieldPlugin: ({"name": "phone"},),
                 plugins.PseudoFieldPlugin: ({"name": lambda x: x},),
                 }

    pis = _plugin_classes(())
    for i, plugin in enumerate(pis):
        try:
            pis[i] = plugin(*init_args.get(plugin, ()))
        except TypeError:
            raise TypeError("Error instantiating %s" % plugin)

    count = 0
    for i, first in enumerate(pis):
        for j in xrange(len(pis)):
            if i == j:
                continue
            # Put `first` last after a prefix of the other plugins.
            plist = [p for p in pis[:j] if p is not first] + [first]
            qp = qparser.QueryParser("text", None, plugins=plist)
            try:
                qp.parse(qs)
            except Exception:
                e = sys.exc_info()[1]
                raise Exception(str(e) + " combo: %s %r" % (count, plist))
            count += 1
示例#18
0
def test_globfield_length_merge():
    """Field lengths for glob-matched dynamic fields must survive a
    segment merge (regression test for issue 343)."""
    # Issue 343

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           path=fields.ID(stored=True))
    # Any field matching *_text is dynamically treated as TEXT.
    schema.add("*_text", fields.TEXT, glob=True)

    with TempIndex(schema, "globlenmerge") as ix:
        # Two separate writers -> two segments that get merged.
        with ix.writer() as w:
            w.add_document(
                title=u("First document"),
                path=u("/a"),
                content_text=u("This is the first document we've added!"))

        with ix.writer() as w:
            w.add_document(
                title=u("Second document"),
                path=u("/b"),
                content_text=u(
                    "The second document is even more interesting!"))

        with ix.searcher() as s:
            # The dynamic field's length must still be recorded post-merge.
            docnum = s.document_number(path="/a")
            assert s.doc_field_length(docnum, "content_text") is not None

            qp = qparser.QueryParser("content", schema)
            q = qp.parse("content_text:document")
            r = s.search(q)
            paths = sorted(hit["path"] for hit in r)
            assert paths == ["/a", "/b"]
示例#19
0
def test_numeric():
    """Numeric fields: exact matches, '*' expansion, and no wildcards."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           integer=fields.NUMERIC(int),
                           floating=fields.NUMERIC(float))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    rows = [("a", 5820, 1.2), ("b", 22, 2.3), ("c", 78, 3.4),
            ("d", 13, 4.5), ("e", 9, 5.6)]
    for doc_id, int_val, float_val in rows:
        writer.add_document(id=u(doc_id), integer=int_val,
                            floating=float_val)
    writer.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("integer", schema)

        r = s.search(qp.parse(u("5820")))
        assert len(r) == 1
        assert r[0]["id"] == "a"

    with ix.searcher() as s:
        r = s.search(qp.parse("floating:4.5"))
        assert len(r) == 1
        assert r[0]["id"] == "d"

    # "*" on a numeric field expands to an Every query...
    q = qp.parse("integer:*")
    assert q.__class__ == query.Every
    assert q.field() == "integer"

    # ...but character wildcards cannot match numeric terms at all.
    q = qp.parse("integer:5?6")
    assert q == query.NullQuery
示例#20
0
def test_finalweighting():
    """A weighting model with use_final replaces scores via final()."""
    from whoosh.scoring import Frequency

    schema = fields.Schema(id=fields.ID(stored=True),
                           summary=fields.TEXT,
                           n_comments=fields.STORED)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    rows = [("1", "alfa bravo", 5), ("2", "alfa", 12),
            ("3", "bravo", 2), ("4", "bravo bravo", 7)]
    for doc_id, summary, ncomments in rows:
        writer.add_document(id=u(doc_id), summary=u(summary),
                            n_comments=ncomments)
    writer.commit()

    class CommentWeighting(Frequency):
        use_final = True

        def final(self, searcher, docnum, score):
            # Rank purely by the stored comment count.
            return searcher.stored_fields(docnum).get("n_comments", 0)

    with ix.searcher(weighting=CommentWeighting()) as s:
        r = s.search(qparser.QueryParser("summary", None).parse("alfa OR bravo"))
        ids = [fs["id"] for fs in r]
        # Ordered by comment count: 12, 7, 5, 2.
        assert_equal(["2", "4", "1", "3"], ids)
示例#21
0
def test_numeric_ranges():
    """Numeric range syntax with open, closed, and missing endpoints."""
    schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    writer = ix.writer()

    for n in xrange(400):
        writer.add_document(id=n, num=n)
    writer.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, expected):
            parsed = qp.parse(qs)
            matched = [s.stored_fields(d)["id"] for d in parsed.docs(s)]
            assert matched == expected

        # Note that range() is always inclusive-exclusive
        check("[10 to 390]", list(range(10, 390 + 1)))
        check("[100 to]", list(range(100, 400)))
        check("[to 350]", list(range(0, 350 + 1)))
        check("[16 to 255]", list(range(16, 255 + 1)))
        check("{10 to 390]", list(range(11, 390 + 1)))
        check("[10 to 390}", list(range(10, 390)))
        check("{10 to 390}", list(range(11, 390)))
        check("{16 to 255}", list(range(17, 255)))
示例#22
0
 def searcher(self):
     """Open this benchmark's Whoosh index and prepare for querying.

     Side effects: stores an open searcher in self.srch and a
     QueryParser over the spec's main field in self.parser.
     """
     # Index directory is "<dir>/<indexname>_whoosh".
     path = os.path.join(self.options.dir,
                         "%s_whoosh" % self.options.indexname)
     ix = index.open_dir(path)
     self.srch = ix.searcher()
     self.parser = qparser.QueryParser(self.bench.spec.main_field,
                                       schema=ix.schema)
示例#23
0
def test_datetime():
    """DATETIME fields: exact-day query, month prefix query, and the
    numeric range a date range parses into."""
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    # One document per (month, day) pair, all at 14:00 on that day.
    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        # Compact YYYYMMDD form matches exactly one stored datetime.
        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        # A year-month query matches all 27 days indexed for that month.
        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        # A month-to-month range widens to whole-month boundaries and is
        # represented internally as a NumericRange over long timestamps.
        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)
示例#24
0
def score_to_file():
    """Search the index with BM25F for every loaded query and write
    TREC-style result lines ("<qid> Q0 <docid> <score>") to output_file."""

    # Open index
    ix = index.open_dir(index_dir)

    queries = load_queries()

    # FIX: manage the output file with a context manager (the original
    # called outfile.close() inside the searcher block) and drop the
    # ix.reader() handle that was acquired but never used or closed.
    with open(output_file, "w") as outfile:
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            qp = qparser.QueryParser(field, schema=ix.schema)
            for query in queries:
                print("Processing query number", query['id'])

                # Retrieve and score documents for this query.
                q = qp.parse(query['text'])
                res = searcher.search(q)
                for r in res:
                    outfile.write(query['id'] + " Q0 " + r['id'] + " " +
                                  str(r.score) + "\n")
    ix.close()
示例#25
0
def test_boolean_strings():
    """String values stored in a BOOLEAN field are parsed as booleans."""
    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        # Both native strings and u() strings, in both capitalizations.
        values = ["true", "True", "false", "False",
                  u("true"), u("True"), u("false"), u("False")]
        for num, value in enumerate(values):
            w.add_document(i=num, b=value)

    with ix.searcher() as s:
        qp = qparser.QueryParser("b", ix.schema)

        def check(qs, nums):
            hits = s.search(qp.parse(qs), limit=None)
            assert [hit["i"] for hit in hits] == nums

        trues = [0, 1, 4, 5]
        falses = [2, 3, 6, 7]
        # Full words and the single-letter abbreviations all work.
        for true_word in ("true", "True", "t"):
            check(true_word, trues)
        for false_word in ("false", "False", "f"):
            check(false_word, falses)
示例#26
0
 def _make_query_parser(self):
     """Build a QueryParser over the 'path' field with >/< comparison
     and natural-date syntax enabled."""
     schema = self._make_schema()
     qp = qparser.QueryParser('path', schema=schema)
     qp.add_plugin(qparser.GtLtPlugin())
     # Imported lazily: dateparse is only needed when building the parser.
     from whoosh.qparser.dateparse import DateParserPlugin
     qp.add_plugin(DateParserPlugin())
     return qp
示例#27
0
 def test_boost(self):
     """The ^N suffix should set the boost on individual subqueries."""
     qp = qparser.QueryParser("content")
     q = qp.parse("this^3 fn:that^0.5 5.67")
     self.assertEqual(q.subqueries[0].boost, 3.0)
     self.assertEqual(q.subqueries[1].boost, 0.5)
     self.assertEqual(q.subqueries[1].fieldname, "fn")
     # A bare number is a term, not a boost on the preceding word.
     self.assertEqual(q.subqueries[2].text, "5.67")
 def n_gram_query(self, query_string):
     """Parse *query_string* against the n-gram field.

     Uses an OrGroup with a 0.9^n-style scaling factor of 0.8, disables
     field-prefix and wildcard syntax, and enables ~ fuzzy-term syntax.
     Returns the parsed query object.
     """
     og = qparser.OrGroup.factory(0.8)
     parser = qparser.QueryParser(_N_GRAM_FIELD, self._schema, group=og)
     parser.remove_plugin_class(qparser.FieldsPlugin)
     parser.remove_plugin_class(qparser.WildcardPlugin)
     parser.add_plugin(qparser.FuzzyTermPlugin())
     return parser.parse(query_string)
示例#29
0
File: markov.py  Project: norosa/mimicbot
    def search(self, query, number):
        """Yield the text of tweets matching keywords extracted from *query*.

        query: free text from which key terms are extracted.
        number: how many key terms to extract for the keyword query.
        Yields at most MAX_TWEETS_NUMBER tweet texts, masked so that
        retweets are excluded.
        """
        self.get_index()

        # FIX: removed the pprint.PrettyPrinter setup that was never used.
        with self.index.searcher() as searcher:
            # improve relevance! form query from keywords
            keywords = searcher.key_terms_from_text("text", query,
                                                    numterms=number)
            keyword_query = " ".join(
                [keyword for keyword, score in keywords])

            # if we don't find any keywords. for example, we're not actually
            # looking up context tweets
            # TODO find better way of doing this
            if not keyword_query:
                keyword_query = "*"

            print("keyword query: %s" % keyword_query)

            # FIX: parse into a new name instead of shadowing the *query*
            # parameter with the parsed query object.
            parser = qparser.QueryParser(
                "text", self.index.schema, group=qparser.OrGroup)
            parsed_query = parser.parse(keyword_query)

            restrict_retweets = whoosh.query.Term("retweet", True)

            results = searcher.search(parsed_query, mask=restrict_retweets,
                                      limit=MAX_TWEETS_NUMBER)
            for result in results:
                yield result["text"]
示例#30
0
def getQparser(index):
    """Return an OR-grouped parser over the "body" field with the
    field-prefix and wildcard syntaxes disabled."""
    parser = qparser.QueryParser("body", schema=index.schema,
                                 group=qparser.OrGroup)
    for plugin_class in (qparser.FieldsPlugin, qparser.WildcardPlugin):
        parser.remove_plugin_class(plugin_class)
    return parser