示例#1
0
    def visit_query(self, query, n=False):
        query_dict = query.query

        qp = xapian.QueryParser()
        qp.set_database(self.database)

        field_prefix = {}
        field_type = {}
        field_col = {}

        for field_dict in self.schema['idx_fields']:
            fname = field_dict['field_name']
            field_col[fname] = field_dict['column']
            field_type[fname] = field_dict['type']
            field_prefix[fname] = DOCUMENT_CUSTOM_TERM_PREFIX + fname.upper()
        field_prefix[self.schema['obj_id']] = DOCUMENT_ID_TERM_PREFIX
        pre_query = None
        new_query = None
        for field in query_dict:
            if field in field_prefix:
                prefix = field_prefix[field]
                col = field_col.get(field)
                value = query_dict[field]

                if isinstance(value, dict):
                    ftype = field_type[field]
                    if ftype == 'int' or ftype == 'long':
                        begin = value.get('$gt', 0)
                        end = value.get('$lt', sys.maxint)
                        qp.add_valuerangeprocessor(xapian.NumberValueRangeProcessor(col, prefix))
                        new_query = qp.parse_query('%s%s..%s' % (prefix, begin, end))
                elif not isinstance(value, basestring) and hasattr(value, '__getitem__') or hasattr(value, '__iter__'):
                    value = ['%s%s' % (prefix, v) for v in value]
                    #De Morgan's laws, if we want the intersection of negation sets,
                    #Firstly, we obtain the disjunction of this sets, then get negation of them
                    # (AND_NOT [U, (OR, [a, b, c])])
                    # NOT (a OR B OR C)
                    # NOT a AND not b AND not C
                    if not n:
                        new_query = xapian.Query(xapian.Query.OP_AND, value)
                    else:
                        new_query = xapian.Query(xapian.Query.OP_OR, value)
                else:
                    new_query = xapian.Query('%s%s' % (prefix, value))
                if pre_query:
                    if not n:
                        new_query = xapian.Query(xapian.Query.OP_AND, [pre_query, new_query])
                    else:
                        # and_not , [U, a or b])
                        # not a and not b
                        new_query = xapian.Query(xapian.Query.OP_OR, [pre_query, new_query])
                pre_query = new_query

        return new_query
示例#2
0
def search(dbpath, querystring, offset=0, pagesize=10):
    # offset - defines starting point within result set
    # pagesize - defines number of records to retrieve

    # Open the database we're going to search.
    db = xapian.Database(dbpath)

    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("title", "S")
    queryparser.add_prefix("description", "XD")
    # and add in value range processors
    queryparser.add_valuerangeprocessor(
        xapian.NumberValueRangeProcessor(0, 'mm', False))
    queryparser.add_valuerangeprocessor(xapian.NumberValueRangeProcessor(
        1, ''))

    # And parse the query
    query = queryparser.parse_query(querystring)

    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    # And print out something about each match
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data())
        print u"%(rank)i: #%(docid)3.3i (%(date)s) %(measurements)s\n        %(title)s" % {
            'rank': match.rank + 1,
            'docid': match.docid,
            'measurements': fields.get('MEASUREMENTS', u''),
            'date': fields.get('DATE_MADE', u''),
            'title': fields.get('TITLE', u''),
        }
        matches.append(match.docid)

    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, matches)
示例#3
0
def main(args):
    keyword = args.get('keyword')
    title = args.get('title')
    rated_list = args.get('rated')
    year_range = args.get('year_range')
    show_facets = args.get('show_facets')

    with closing(_x.Database('./xdb/movies.db')) as x_db:
        # get a query parser
        qp = _query_parser(x_db)

        if keyword:
            x_query = qp.parse_query(keyword)
        else:
            x_query = _x.Query.MatchAll

        if title:
            title_query = qp.parse_query(title, 0, 'S')
            x_query = _joinq(_x.Query.OP_FILTER, x_query, title_query)

        if rated_list:
            rated_queries = [
                _x.Query('XRATED:{}'.format(rated)) for rated in rated_list
            ]
            rated_query = _x.Query(_x.Query.OP_OR, rated_queries)
            x_query = _joinq(_x.Query.OP_FILTER, x_query, rated_query)

        if year_range:
            qp.add_valuerangeprocessor(_x.NumberValueRangeProcessor(SLOT_YEAR))
            year_range_query = qp.parse_query(year_range)
            x_query = _joinq(_x.Query.OP_FILTER, x_query, year_range_query)

        # setup the enquire object to perform the query
        enq = _x.Enquire(x_db)
        print str(x_query)
        enq.set_query(x_query)

        # Set up a spy to inspect value slots on matched documents
        spy = _x.ValueCountMatchSpy(SLOT_RATED)
        enq.add_matchspy(spy)

        # iterate through the matched set and display the stored json dup
        for res in enq.get_mset(0, x_db.get_doccount(), None, None):
            print json.dumps(json.loads(res.document.get_data()),
                             indent=4,
                             sort_keys=True)

        # Fetch and display the spy values
        if show_facets:
            facets = {item.term: int(item.termfreq) for item in spy.values()}
            print "Facets:{}, Total:{} ".format(facets, sum(facets.values()))
示例#4
0
def test_all():
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print("Unhandled constants: ", res)
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    #
    # Python 3.5 generates a different exception message here to earlier
    # versions, so we need a check which matches both.
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'cvar'") != -1,
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [t.encode('utf-8') for t in terms]),
        "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE,
                          (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [s.encode('utf-8') for s in subqs]),
        "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected (or not wrapped
    # in the first cases):

    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb"))
    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb", xapian.DB_OPEN))

    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB))
    expect_exception(
        xapian.DatabaseOpeningError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_OPEN | xapian.DB_BACKEND_STUB))

    expect_exception(
        xapian.DatabaseOpeningError, None, lambda: xapian.Database(
            b"nosuchdir/nosuchdb", xapian.DB_BACKEND_GLASS))
    expect_exception(
        xapian.DatabaseCreateError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_GLASS))

    expect_exception(
        xapian.FeatureUnavailableError, None, lambda: xapian.Database(
            b"nosuchdir/nosuchdb", xapian.DB_BACKEND_CHERT))
    expect_exception(
        xapian.FeatureUnavailableError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_CHERT))

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"/bin/false", b"")

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>", qp.parse_query,
                     b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR o) OR Zo@2))")

    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR outside) OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')

    expect_query(
        qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()], [(b'bar', 1, [2]), (b'baz', 1, [3]),
                                          (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    # FIXME: This doesn't currently work:
    # expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata, b'',
                     b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query(b'foo'), 5),
        "5 * foo")
示例#5
0
    def _enquire(self, request, query, order_by, group_by):
        enquire = xapian.Enquire(self._db)
        queries = []
        and_not_queries = []
        boolean_queries = []

        if query:
            query = self._extract_exact_search_terms(query, request)

        if query:
            parser = xapian.QueryParser()
            parser.set_database(self._db)
            for name, prop in self._props.items():
                if not prop.prefix:
                    continue
                if prop.boolean:
                    parser.add_boolean_prefix(name, prop.prefix)
                else:
                    parser.add_prefix(name, prop.prefix)
                parser.add_prefix('', prop.prefix)
                if prop.slot is not None and \
                        prop.typecast in [int, float, bool]:
                    value_range = xapian.NumberValueRangeProcessor(
                        prop.slot, name + ':')
                    parser.add_valuerangeprocessor(value_range)
            parser.add_prefix('', '')
            query = parser.parse_query(
                query, xapian.QueryParser.FLAG_PHRASE
                | xapian.QueryParser.FLAG_BOOLEAN
                | xapian.QueryParser.FLAG_LOVEHATE
                | xapian.QueryParser.FLAG_PARTIAL
                | xapian.QueryParser.FLAG_WILDCARD
                | xapian.QueryParser.FLAG_PURE_NOT, '')
            queries.append(query)

        for name, value in request.items():
            prop = self._props.get(name)
            if prop is None or not prop.prefix:
                continue

            sub_queries = []
            not_queries = []
            for needle in value if type(value) in (tuple, list) else [value]:
                if needle is None:
                    continue
                needle = prop.to_string(needle)[0]
                if needle.startswith('!'):
                    term = _term(prop.prefix, needle[1:])
                    not_queries.append(xapian.Query(term))
                elif needle.startswith('-'):
                    term = _term(prop.prefix, needle[1:])
                    and_not_queries.append(xapian.Query(term))
                else:
                    term = _term(prop.prefix, needle)
                    sub_queries.append(xapian.Query(term))

            if not_queries:
                not_query = xapian.Query(xapian.Query.OP_AND_NOT, [
                    xapian.Query(''),
                    xapian.Query(xapian.Query.OP_OR, not_queries)
                ])
                sub_queries.append(not_query)

            if sub_queries:
                if len(sub_queries) == 1:
                    query = sub_queries[0]
                else:
                    query = xapian.Query(xapian.Query.OP_OR, sub_queries)
                if prop.boolean:
                    boolean_queries.append(query)
                else:
                    queries.append(query)

        final = None
        if queries:
            final = xapian.Query(xapian.Query.OP_AND, queries)
        if boolean_queries:
            query = xapian.Query(xapian.Query.OP_AND, boolean_queries)
            if final is None:
                final = query
            else:
                final = xapian.Query(xapian.Query.OP_FILTER, [final, query])
        if final is None:
            final = xapian.Query('')
        for i in and_not_queries:
            final = xapian.Query(xapian.Query.OP_AND_NOT, [final, i])
        enquire.set_query(final)

        if hasattr(xapian, 'MultiValueKeyMaker'):
            sorter = xapian.MultiValueKeyMaker()
            if order_by:
                if order_by.startswith('+'):
                    reverse = False
                    order_by = order_by[1:]
                elif order_by.startswith('-'):
                    reverse = True
                    order_by = order_by[1:]
                else:
                    reverse = False
                prop = self._props.get(order_by)
                enforce(prop is not None and prop.slot is not None,
                        'Cannot sort using %r property of %r', order_by,
                        self.metadata.name)
                sorter.add_value(prop.slot, reverse)
            # Sort by ascending GUID to make order predictable all time
            sorter.add_value(0, False)
            enquire.set_sort_by_key(sorter, reverse=False)
        else:
            _logger.warning('In order to support sorting, '
                            'Xapian should be at least 1.2.0')

        if group_by:
            prop = self._props.get(group_by)
            enforce(prop is not None and prop.slot is not None,
                    'Cannot group by %r property of %r', group_by,
                    self.metadata.name)
            enquire.set_collapse_key(prop.slot)

        return enquire
示例#6
0
    def query(self,
              querystring=None,
              qtype=None,
              begin=None,
              end=None,
              keywords=[],
              hashtags=[],
              synonymslist=[],
              emotiononly=False):
        if qtype == 'hy':
            self.qp.add_valuerangeprocessor(
                xapian.NumberValueRangeProcessor(self.timestampvi, ''))
            querystring = begin + '..' + end

            if emotiononly:
                self.qp.add_valuerangeprocessor(
                    xapian.NumberValueRangeProcessor(self.emotiononlyvi, 'f',
                                                     False))
                querystring += ' 1.0..1.0f'

            query = self.qp.parse_query(querystring)
            print "Parsed query is: %s" % [str(query)]

            self.enquire.set_query(query)
            #matches = self.enquire.get_mset(0, self.maxitems)
            matches = self.enquire.get_mset(0, 10000)
            # Display the results.
            print "%i results found." % matches.size()

            if not self.lowkeywords_proc(matches):
                return
            emotions_list, keywords_list = self.keywords_and_emotions_list_proc(
                matches)

            return emotions_list, keywords_list

        if qtype == 'yq':
            self.qp.add_valuerangeprocessor(
                xapian.NumberValueRangeProcessor(self.timestampvi, ''))
            querystring = begin + '..' + end
            query = self.qp.parse_query(querystring)
            print "Parsed query is: %s" % [str(query)]

            self.enquire.set_query(query)
            #matches = self.enquire.get_mset(0,10)
            matches = self.enquire.get_mset(0, self.maxitems)

            # Display the results.
            print "%i results found." % matches.size()

            keywords_arr = []
            for m in matches:
                #hashtag
                hashtags = json.loads(m.document.get_value(self.hashtagsvi))

                #keywords
                keywords_hash = json.loads(
                    m.document.get_value(self.keywordsvi))
                keywords_arr.append(keywords_hash)
                #keywords_counter += Counter(json.loads(m.document.get_value(self.keywordsvi)))

            print 'mapreduce begin: ', str(
                time.strftime("%H:%M:%S", time.gmtime()))
            mapper = SimpleMapReduce(hasharr_to_list, count_words)
            word_counts = mapper(keywords_arr)
            keywords_hash = {}
            for word, count in word_counts:
                keywords_hash[word] = count
            for synonyms in synonymslist:
                if len(synonyms) >= 2 and synonyms[0] in keywords_hash:
                    for word in synonyms[1:]:
                        if word in keywords_hash:
                            keywords_hash[synonyms[0]] += keywords_hash[word]
                            del keywords_hash[word]
            print 'mapreduce end: ', str(
                time.strftime("%H:%M:%S", time.gmtime()))

            #print keywords_counter
            return hashtags, keywords_hash

        if qtype == 'lh':
            self.qp.add_valuerangeprocessor(
                xapian.NumberValueRangeProcessor(self.timestampvi, ''))
            timequerystr = begin + '..' + end
            timequery = self.qp.parse_query(timequerystr)

            hashtags = ['H' + hashtag.lower() for hashtag in hashtags]
            keywords = [keyword.lower() for keyword in keywords]
            keywords.extend(hashtags)
            if len(keywords) > 0:
                wordsquery = xapian.Query(xapian.Query.OP_OR, keywords)
            else:
                return None

            query = xapian.Query(xapian.Query.OP_AND, [timequery, wordsquery])
            print "Parsed query is: %s" % [str(query)]

            self.enquire.set_query(query)
            self.enquire.set_sort_by_value(self.timestampvi, False)
            #matches = self.enquire.get_mset(0,10)
            matches = self.enquire.get_mset(0, self.maxitems)

            # Display the results.
            print "%i results found." % matches.size()

            results = []
            for m in matches:
                result = {}
                result['location'] = m.document.get_value(self.loctvi)
                result['repost_location'] = m.document.get_value(
                    self.reploctvi)
                result['timestamp'] = xapian.sortable_unserialise(
                    m.document.get_value(self.timestampvi))
                results.append(result)

            return results
示例#7
0
    def __init__(self, root, writable=False, create=False, force=False):
        # xapers root
        self.root = os.path.abspath(os.path.expanduser(root))

        # xapers db directory
        xapers_path = os.path.join(self.root, '.xapers')

        # xapes directory initialization
        if not os.path.exists(xapers_path):
            if create:
                if os.path.exists(self.root):
                    if os.listdir(self.root) and not force:
                        raise DatabaseInitializationError(
                            'Uninitialized Xapers root directory exists but is not empty.'
                        )
                os.makedirs(xapers_path)
            else:
                if os.path.exists(self.root):
                    raise DatabaseInitializationError(
                        "Xapers directory '%s' does not contain a database." %
                        (self.root))
                else:
                    raise DatabaseUninitializedError(
                        "Xapers directory '%s' not found." % (self.root))

        # the Xapian db
        xapian_path = os.path.join(xapers_path, 'xapian')
        if writable:
            try:
                self.xapian = xapian.WritableDatabase(xapian_path,
                                                      xapian.DB_CREATE_OR_OPEN)
            except xapian.DatabaseLockError:
                raise DatabaseLockError("Xapers database locked.")
        else:
            self.xapian = xapian.Database(xapian_path)

        stemmer = xapian.Stem("english")

        # The Xapian TermGenerator
        # http://trac.xapian.org/wiki/FAQ/TermGenerator
        self.term_gen = xapian.TermGenerator()
        self.term_gen.set_stemmer(stemmer)

        # The Xapian QueryParser
        self.query_parser = xapian.QueryParser()
        self.query_parser.set_database(self.xapian)
        self.query_parser.set_stemmer(stemmer)
        self.query_parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        self.query_parser.set_default_op(xapian.Query.OP_AND)

        # add boolean internal prefixes
        for name, prefix in self.BOOLEAN_PREFIX.items():
            self.query_parser.add_boolean_prefix(name, prefix)
        # for prefixes that can be applied multiply to the same
        # document (like tags) set the filter grouping to use AND:
        # https://xapian.org/docs/apidoc/html/classXapian_1_1QueryParser.html#a67d25f9297bb98c2101a03ff3d60cf30
        for name, prefix in self.BOOLEAN_PREFIX_MULTI.items():
            self.query_parser.add_boolean_prefix(name, prefix, False)

        # add probabalistic prefixes
        for name, prefix in self.PROBABILISTIC_PREFIX.items():
            self.query_parser.add_prefix(name, prefix)

        # add value facets
        for name, facet in self.NUMBER_VALUE_FACET.items():
            self.query_parser.add_valuerangeprocessor(
                xapian.NumberValueRangeProcessor(facet, name + ':'))

        # register known source prefixes
        # FIXME: can we do this by just finding all XSOURCE terms in
        #        db?  Would elliminate dependence on source modules at
        #        search time.
        for source in Sources():
            name = source.name
            self.query_parser.add_boolean_prefix(
                name, self._make_source_prefix(name))
示例#8
0
def test_all():
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    def access_cvar():
        return xapian.cvar

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.inmemory_open()
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [term[xapian.ESET_TNAME] for term in eset.items]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect(eset.items[-1][xapian.ESET_WT] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect(eset.items[-1][xapian.ESET_WT] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>", qp.parse_query,
                     "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query("NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test:(pos=1))")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(
        qp.parse_query("foo o", qp.FLAG_PARTIAL),
        "(Zfoo:(pos=1) AND ((out:(pos=2) SYNONYM outsid:(pos=2)) OR Zo:(pos=2)))"
    )

    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo:(pos=1) AND Zoutsid:(pos=2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')

    expect_query(
        qp.parse_query(u"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\xc3\xa9st:(pos=1))")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2) AND Za:(pos=3))")

    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2))")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2) AND Za:(pos=3))")

    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2))")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()], [('bar', 1, [2]), ('baz', 1, [3]),
                                          ('foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Xapian::Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, '$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b)
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata, '',
                     'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query('foo'), 5),
        "5 * foo")
示例#9
0
def search(dbpath, querystring, offset=0, pagesize=10):
    # offset - defines starting point within result set
    # pagesize - defines number of records to retrieve

    # Open the database we're going to search.
    db = xapian.Database(dbpath)

    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stem_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("title", "S")
    queryparser.add_prefix("description", "XD")

    # and add in value range processors
    # Start of custom VRP code
    class PopulationValueRangeProcessor(xapian.ValueRangeProcessor):
        def __init__(self, value, low, high):
            super(PopulationValueRangeProcessor, self).__init__()
            self.nvrp = xapian.NumberValueRangeProcessor(value)
            self.low = low
            self.high = high

        def __call__(self, begin, end):
            if begin != u"":
                try:
                    _begin = int(begin)
                    if _begin < self.low or _begin > self.high:
                        raise ValueError()
                except:
                    return (xapian.BAD_VALUENO, begin, end)
            if end != u"":
                try:
                    _end = int(end)
                    if _end < self.low or _end > self.high:
                        raise ValueError()
                except:
                    return (xapian.BAD_VALUENO, begin, end)
            return self.nvrp(begin, end)

    queryparser.add_valuerangeprocessor(
        PopulationValueRangeProcessor(3, 500000, 50000000))
    # End of custom VRP code
    # Start of date example code
    queryparser.add_valuerangeprocessor(
        xapian.DateValueRangeProcessor(2, True, 1860))
    queryparser.add_valuerangeprocessor(xapian.NumberValueRangeProcessor(
        1, ''))
    # End of date example code
    # And parse the query
    query = queryparser.parse_query(querystring)

    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    # And print out something about each match
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data())
        print u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n        Population %(pop)s" % {
            'rank': match.rank + 1,
            'docid': match.docid,
            'name': fields.get('name', u''),
            'date': fields.get('admitted', u''),
            'pop': fields.get('population', u''),
            'lat': fields.get('latitude', u''),
            'lon': fields.get('longitude', u''),
        }
        matches.append(match.docid)

    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, matches)
示例#10
0
 def __init__(self, value, low, high):
     super(PopulationValueRangeProcessor, self).__init__()
     self.nvrp = xapian.NumberValueRangeProcessor(value)
     self.low = low
     self.high = high