def visit_query(self, query, n=False):
    """Translate a mongo-style query dict into a single xapian.Query.

    :param query: object whose ``.query`` attribute is a dict mapping
        field names to values; a value may be a scalar, a list/iterable
        of scalars, or a range dict using ``$gt``/``$lt``.
    :param n: when True, build the De Morgan dual (OR instead of AND)
        so the caller can negate the whole result with one AND_NOT.
    :returns: the combined xapian.Query, or None if no field in the
        query dict matched a known schema field.
    """
    query_dict = query.query
    qp = xapian.QueryParser()
    qp.set_database(self.database)
    # Build per-field lookup tables (term prefix, value column, type)
    # from the schema's indexed fields.
    field_prefix = {}
    field_type = {}
    field_col = {}
    for field_dict in self.schema['idx_fields']:
        fname = field_dict['field_name']
        field_col[fname] = field_dict['column']
        field_type[fname] = field_dict['type']
        field_prefix[fname] = DOCUMENT_CUSTOM_TERM_PREFIX + fname.upper()
    # The object-id field gets the dedicated ID term prefix.
    field_prefix[self.schema['obj_id']] = DOCUMENT_ID_TERM_PREFIX
    pre_query = None
    new_query = None
    for field in query_dict:
        if field not in field_prefix:
            continue
        prefix = field_prefix[field]
        col = field_col.get(field)
        value = query_dict[field]
        if isinstance(value, dict):
            # Range query: {'$gt': a, '$lt': b} on a numeric field.
            # NOTE(review): for a non-int/long type new_query is left
            # unchanged, so a stale query from a previous field would be
            # re-combined below -- confirm ranges only occur on numerics.
            ftype = field_type[field]
            if ftype == 'int' or ftype == 'long':
                begin = value.get('$gt', 0)
                end = value.get('$lt', sys.maxint)
                qp.add_valuerangeprocessor(
                    xapian.NumberValueRangeProcessor(col, prefix))
                new_query = qp.parse_query('%s%s..%s' % (prefix, begin, end))
        elif not isinstance(value, basestring) and (
                hasattr(value, '__getitem__') or hasattr(value, '__iter__')):
            # BUGFIX: the original condition was
            #     not isinstance(...) and hasattr(A) or hasattr(B)
            # which by precedence parses as (not X and A) or B, letting
            # any object with __iter__ skip the string check entirely;
            # parenthesized so the string guard always applies.
            value = ['%s%s' % (prefix, v) for v in value]
            # De Morgan's laws: to intersect negation sets we first take
            # the disjunction of the sets, then negate the whole thing:
            #     (AND_NOT [U, (OR, [a, b, c])])
            #     NOT (a OR b OR c) == NOT a AND NOT b AND NOT c
            if not n:
                new_query = xapian.Query(xapian.Query.OP_AND, value)
            else:
                new_query = xapian.Query(xapian.Query.OP_OR, value)
        else:
            # Scalar value: a single prefixed term.
            new_query = xapian.Query('%s%s' % (prefix, value))
        if pre_query:
            if not n:
                new_query = xapian.Query(xapian.Query.OP_AND,
                                         [pre_query, new_query])
            else:
                # (AND_NOT [U, a OR b]) == NOT a AND NOT b
                new_query = xapian.Query(xapian.Query.OP_OR,
                                         [pre_query, new_query])
        pre_query = new_query
    return new_query
def search(dbpath, querystring, offset=0, pagesize=10): # offset - defines starting point within result set # pagesize - defines number of records to retrieve # Open the database we're going to search. db = xapian.Database(dbpath) # Set up a QueryParser with a stemmer and suitable prefixes queryparser = xapian.QueryParser() queryparser.set_stemmer(xapian.Stem("en")) queryparser.set_stemming_strategy(queryparser.STEM_SOME) queryparser.add_prefix("title", "S") queryparser.add_prefix("description", "XD") # and add in value range processors queryparser.add_valuerangeprocessor( xapian.NumberValueRangeProcessor(0, 'mm', False)) queryparser.add_valuerangeprocessor(xapian.NumberValueRangeProcessor( 1, '')) # And parse the query query = queryparser.parse_query(querystring) # Use an Enquire object on the database to run the query enquire = xapian.Enquire(db) enquire.set_query(query) # And print out something about each match matches = [] for match in enquire.get_mset(offset, pagesize): fields = json.loads(match.document.get_data()) print u"%(rank)i: #%(docid)3.3i (%(date)s) %(measurements)s\n %(title)s" % { 'rank': match.rank + 1, 'docid': match.docid, 'measurements': fields.get('MEASUREMENTS', u''), 'date': fields.get('DATE_MADE', u''), 'title': fields.get('TITLE', u''), } matches.append(match.docid) # Finally, make sure we log the query and displayed results support.log_matches(querystring, offset, pagesize, matches)
def main(args): keyword = args.get('keyword') title = args.get('title') rated_list = args.get('rated') year_range = args.get('year_range') show_facets = args.get('show_facets') with closing(_x.Database('./xdb/movies.db')) as x_db: # get a query parser qp = _query_parser(x_db) if keyword: x_query = qp.parse_query(keyword) else: x_query = _x.Query.MatchAll if title: title_query = qp.parse_query(title, 0, 'S') x_query = _joinq(_x.Query.OP_FILTER, x_query, title_query) if rated_list: rated_queries = [ _x.Query('XRATED:{}'.format(rated)) for rated in rated_list ] rated_query = _x.Query(_x.Query.OP_OR, rated_queries) x_query = _joinq(_x.Query.OP_FILTER, x_query, rated_query) if year_range: qp.add_valuerangeprocessor(_x.NumberValueRangeProcessor(SLOT_YEAR)) year_range_query = qp.parse_query(year_range) x_query = _joinq(_x.Query.OP_FILTER, x_query, year_range_query) # setup the enquire object to perform the query enq = _x.Enquire(x_db) print str(x_query) enq.set_query(x_query) # Set up a spy to inspect value slots on matched documents spy = _x.ValueCountMatchSpy(SLOT_RATED) enq.add_matchspy(spy) # iterate through the matched set and display the stored json dup for res in enq.get_mset(0, x_db.get_doccount(), None, None): print json.dumps(json.loads(res.document.get_data()), indent=4, sort_keys=True) # Fetch and display the spy values if show_facets: facets = {item.term: int(item.termfreq) for item in spy.values()} print "Facets:{}, Total:{} ".format(facets, sum(facets.values()))
def test_all():
    """Smoke-test the xapian Python 3 bindings end to end.

    Covers: version reporting, Document/WritableDatabase basics, Query
    construction, database factory error wrapping, the various iterator
    protocols, Match/Expand deciders, QueryParser features, stoppers,
    TermGenerator, value range processors, FieldProcessor and metadata.
    Relies on the module-level helpers expect/expect_query/
    expect_exception/context and the TestFail exception.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")
    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print("Unhandled constants: ", res)
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    #
    # Python 3.5 generates a different exception message here to earlier
    # versions, so we need a check which matches both.
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'cvar'") != -1,
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    # Document data must survive embedded zero bytes intact.
    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")

    # Query construction from term lists and tuples.
    terms = ["smoke", "test", "terms"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR,
                     [t.encode('utf-8') for t in terms]),
        "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE,
                          (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2,
        "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR,
                     [s.encode('utf-8') for s in subqs]),
        "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected (or not wrapped
    # in the first cases):
    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb"))
    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb", xapian.DB_OPEN))
    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                xapian.DB_BACKEND_STUB))
    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb",
            xapian.DB_OPEN | xapian.DB_BACKEND_STUB))
    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                xapian.DB_BACKEND_GLASS))
    expect_exception(
        xapian.DatabaseCreateError, None,
        lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb",
            xapian.DB_CREATE | xapian.DB_BACKEND_GLASS))
    expect_exception(
        xapian.FeatureUnavailableError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                xapian.DB_BACKEND_CHERT))
    expect_exception(
        xapian.FeatureUnavailableError, None,
        lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb",
            xapian.DB_CREATE | xapian.DB_BACKEND_CHERT))
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")
    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError,
                     "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        # Accept only documents whose value slot 0 is b"yes".
        def __call__(self, doc):
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        # Reject candidate expand terms that start with 'a'.
        def __call__(self, term):
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR o) OR Zo@2))")
    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR outside) OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')
    expect_query(
        qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [(b'bar', 1, [2]), (b'baz', 1, [3]), (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    # FIXME: This doesn't currently work:
    # expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)
    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")
    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.set_metadata, b'', b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query(b'foo'), 5),
        "5 * foo")
def _enquire(self, request, query, order_by, group_by):
    """Build a xapian.Enquire for a search request.

    :param request: dict of property-name -> value(s) filters; a value
        string starting with '!' is OR-negated within its field, one
        starting with '-' is AND_NOT'ed against the whole query.
    :param query: free-text query string (may be None/empty).
    :param order_by: property name, optionally prefixed with '+'/'-'
        to select ascending/descending sort.
    :param group_by: property name to collapse results on.
    :returns: a configured xapian.Enquire.
    :raises: via enforce() when order_by/group_by name a property
        without a value slot.
    """
    enquire = xapian.Enquire(self._db)
    queries = []
    and_not_queries = []
    boolean_queries = []

    if query:
        # Pull exact-match terms out of the free-text query first.
        query = self._extract_exact_search_terms(query, request)

    if query:
        # Configure a parser with every prefixed property plus numeric
        # range processors for slotted int/float/bool properties.
        parser = xapian.QueryParser()
        parser.set_database(self._db)
        for name, prop in self._props.items():
            if not prop.prefix:
                continue
            if prop.boolean:
                parser.add_boolean_prefix(name, prop.prefix)
            else:
                parser.add_prefix(name, prop.prefix)
            parser.add_prefix('', prop.prefix)
            if prop.slot is not None and \
                    prop.typecast in [int, float, bool]:
                value_range = xapian.NumberValueRangeProcessor(
                    prop.slot, name + ':')
                parser.add_valuerangeprocessor(value_range)
        parser.add_prefix('', '')
        query = parser.parse_query(
            query,
            xapian.QueryParser.FLAG_PHRASE |
            xapian.QueryParser.FLAG_BOOLEAN |
            xapian.QueryParser.FLAG_LOVEHATE |
            xapian.QueryParser.FLAG_PARTIAL |
            xapian.QueryParser.FLAG_WILDCARD |
            xapian.QueryParser.FLAG_PURE_NOT,
            '')
        queries.append(query)

    # Turn each request filter into term queries, sorting needles into
    # positive, field-local-negated ('!') and globally-negated ('-') sets.
    for name, value in request.items():
        prop = self._props.get(name)
        if prop is None or not prop.prefix:
            continue
        sub_queries = []
        not_queries = []
        for needle in value if type(value) in (tuple, list) else [value]:
            if needle is None:
                continue
            needle = prop.to_string(needle)[0]
            if needle.startswith('!'):
                term = _term(prop.prefix, needle[1:])
                not_queries.append(xapian.Query(term))
            elif needle.startswith('-'):
                term = _term(prop.prefix, needle[1:])
                and_not_queries.append(xapian.Query(term))
            else:
                term = _term(prop.prefix, needle)
                sub_queries.append(xapian.Query(term))
        if not_queries:
            # "all documents AND_NOT (a OR b ...)" expresses the
            # field-local negation.
            not_query = xapian.Query(xapian.Query.OP_AND_NOT, [
                xapian.Query(''),
                xapian.Query(xapian.Query.OP_OR, not_queries)
            ])
            sub_queries.append(not_query)
        if sub_queries:
            if len(sub_queries) == 1:
                query = sub_queries[0]
            else:
                query = xapian.Query(xapian.Query.OP_OR, sub_queries)
            if prop.boolean:
                boolean_queries.append(query)
            else:
                queries.append(query)

    # Combine: AND the probabilistic queries, FILTER in the boolean
    # ones, then chain the global AND_NOT negations on top.
    final = None
    if queries:
        final = xapian.Query(xapian.Query.OP_AND, queries)
    if boolean_queries:
        query = xapian.Query(xapian.Query.OP_AND, boolean_queries)
        if final is None:
            final = query
        else:
            final = xapian.Query(xapian.Query.OP_FILTER, [final, query])
    if final is None:
        final = xapian.Query('')
    for i in and_not_queries:
        final = xapian.Query(xapian.Query.OP_AND_NOT, [final, i])
    enquire.set_query(final)

    if hasattr(xapian, 'MultiValueKeyMaker'):
        sorter = xapian.MultiValueKeyMaker()
        if order_by:
            if order_by.startswith('+'):
                reverse = False
                order_by = order_by[1:]
            elif order_by.startswith('-'):
                reverse = True
                order_by = order_by[1:]
            else:
                reverse = False
            prop = self._props.get(order_by)
            enforce(prop is not None and prop.slot is not None,
                    'Cannot sort using %r property of %r',
                    order_by, self.metadata.name)
            sorter.add_value(prop.slot, reverse)
        # Sort by ascending GUID to make order predictable all time
        sorter.add_value(0, False)
        enquire.set_sort_by_key(sorter, reverse=False)
    else:
        _logger.warning('In order to support sorting, '
                        'Xapian should be at least 1.2.0')

    if group_by:
        prop = self._props.get(group_by)
        enforce(prop is not None and prop.slot is not None,
                'Cannot group by %r property of %r',
                group_by, self.metadata.name)
        enquire.set_collapse_key(prop.slot)

    return enquire
def query(self, querystring=None, qtype=None, begin=None, end=None,
          keywords=[], hashtags=[], synonymslist=[], emotiononly=False):
    """Dispatch one of three timestamp-ranged queries against the index.

    qtype selects the behaviour:
      'hy' -- parse a begin..end timestamp range (optionally restricted
              to emotion-only docs) and return
              (emotions_list, keywords_list), or None when
              lowkeywords_proc rejects the match set.
      'yq' -- same range query, then map-reduce the per-document keyword
              hashes (merging synonyms) and return
              (hashtags, keywords_hash).
      'lh' -- AND the time range with an OR over keywords/hashtags and
              return a list of {'location', 'repost_location',
              'timestamp'} dicts sorted by timestamp; returns None when
              no keywords/hashtags were given.

    NOTE(review): keywords/hashtags/synonymslist use mutable default
    arguments; they are rebound before being mutated here, but this is
    fragile -- consider None sentinels.
    NOTE(review): each call adds another value range processor to the
    shared self.qp -- confirm accumulation across calls is intended.
    """
    if qtype == 'hy':
        self.qp.add_valuerangeprocessor(
            xapian.NumberValueRangeProcessor(self.timestampvi, ''))
        # The range replaces any caller-supplied querystring.
        querystring = begin + '..' + end
        if emotiononly:
            self.qp.add_valuerangeprocessor(
                xapian.NumberValueRangeProcessor(self.emotiononlyvi, 'f',
                                                 False))
            querystring += ' 1.0..1.0f'
        query = self.qp.parse_query(querystring)
        print "Parsed query is: %s" % [str(query)]
        self.enquire.set_query(query)
        #matches = self.enquire.get_mset(0, self.maxitems)
        matches = self.enquire.get_mset(0, 10000)
        # Display the results.
        print "%i results found." % matches.size()
        if not self.lowkeywords_proc(matches):
            return
        emotions_list, keywords_list = self.keywords_and_emotions_list_proc(
            matches)
        return emotions_list, keywords_list
    if qtype == 'yq':
        self.qp.add_valuerangeprocessor(
            xapian.NumberValueRangeProcessor(self.timestampvi, ''))
        querystring = begin + '..' + end
        query = self.qp.parse_query(querystring)
        print "Parsed query is: %s" % [str(query)]
        self.enquire.set_query(query)
        #matches = self.enquire.get_mset(0,10)
        matches = self.enquire.get_mset(0, self.maxitems)
        # Display the results.
        print "%i results found." % matches.size()
        keywords_arr = []
        for m in matches:
            #hashtag
            # NOTE(review): hashtags is overwritten on every iteration;
            # only the last match's hashtags are returned -- confirm.
            hashtags = json.loads(m.document.get_value(self.hashtagsvi))
            #keywords
            keywords_hash = json.loads(
                m.document.get_value(self.keywordsvi))
            keywords_arr.append(keywords_hash)
            #keywords_counter += Counter(json.loads(m.document.get_value(self.keywordsvi)))
        print 'mapreduce begin: ', str(
            time.strftime("%H:%M:%S", time.gmtime()))
        # Map-reduce the per-document keyword hashes into one count dict.
        mapper = SimpleMapReduce(hasharr_to_list, count_words)
        word_counts = mapper(keywords_arr)
        keywords_hash = {}
        for word, count in word_counts:
            keywords_hash[word] = count
        # Fold counts of synonyms into the first word of each synonym set.
        for synonyms in synonymslist:
            if len(synonyms) >= 2 and synonyms[0] in keywords_hash:
                for word in synonyms[1:]:
                    if word in keywords_hash:
                        keywords_hash[synonyms[0]] += keywords_hash[word]
                        del keywords_hash[word]
        print 'mapreduce end: ', str(
            time.strftime("%H:%M:%S", time.gmtime()))
        #print keywords_counter
        return hashtags, keywords_hash
    if qtype == 'lh':
        self.qp.add_valuerangeprocessor(
            xapian.NumberValueRangeProcessor(self.timestampvi, ''))
        timequerystr = begin + '..' + end
        timequery = self.qp.parse_query(timequerystr)
        # Hashtags are indexed as lowercased terms with an 'H' prefix.
        hashtags = ['H' + hashtag.lower() for hashtag in hashtags]
        keywords = [keyword.lower() for keyword in keywords]
        keywords.extend(hashtags)
        if len(keywords) > 0:
            wordsquery = xapian.Query(xapian.Query.OP_OR, keywords)
        else:
            return None
        query = xapian.Query(xapian.Query.OP_AND, [timequery, wordsquery])
        print "Parsed query is: %s" % [str(query)]
        self.enquire.set_query(query)
        self.enquire.set_sort_by_value(self.timestampvi, False)
        #matches = self.enquire.get_mset(0,10)
        matches = self.enquire.get_mset(0, self.maxitems)
        # Display the results.
        print "%i results found." % matches.size()
        results = []
        for m in matches:
            result = {}
            result['location'] = m.document.get_value(self.loctvi)
            result['repost_location'] = m.document.get_value(
                self.reploctvi)
            result['timestamp'] = xapian.sortable_unserialise(
                m.document.get_value(self.timestampvi))
            results.append(result)
        return results
def __init__(self, root, writable=False, create=False, force=False):
    """Open (optionally creating) the xapers database rooted at *root*."""
    # xapers root and the db directory inside it
    self.root = os.path.abspath(os.path.expanduser(root))
    xapers_path = os.path.join(self.root, '.xapers')

    # xapers directory initialization
    if not os.path.exists(xapers_path):
        if not create:
            # Nothing to open: either the root lacks a database, or the
            # root itself is missing.
            if os.path.exists(self.root):
                raise DatabaseInitializationError(
                    "Xapers directory '%s' does not contain a database." % (self.root))
            raise DatabaseUninitializedError(
                "Xapers directory '%s' not found." % (self.root))
        # Refuse to initialize over a non-empty, unrecognized root
        # unless forced.
        if os.path.exists(self.root) and os.listdir(self.root) and not force:
            raise DatabaseInitializationError(
                'Uninitialized Xapers root directory exists but is not empty.')
        os.makedirs(xapers_path)

    # the Xapian db
    xapian_path = os.path.join(xapers_path, 'xapian')
    if not writable:
        self.xapian = xapian.Database(xapian_path)
    else:
        try:
            self.xapian = xapian.WritableDatabase(
                xapian_path, xapian.DB_CREATE_OR_OPEN)
        except xapian.DatabaseLockError:
            raise DatabaseLockError("Xapers database locked.")

    stemmer = xapian.Stem("english")

    # The Xapian TermGenerator
    # http://trac.xapian.org/wiki/FAQ/TermGenerator
    self.term_gen = xapian.TermGenerator()
    self.term_gen.set_stemmer(stemmer)

    # The Xapian QueryParser
    parser = self.query_parser = xapian.QueryParser()
    parser.set_database(self.xapian)
    parser.set_stemmer(stemmer)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    parser.set_default_op(xapian.Query.OP_AND)

    # add boolean internal prefixes
    for name, prefix in self.BOOLEAN_PREFIX.items():
        parser.add_boolean_prefix(name, prefix)

    # for prefixes that can be applied multiply to the same
    # document (like tags) set the filter grouping to use AND:
    # https://xapian.org/docs/apidoc/html/classXapian_1_1QueryParser.html#a67d25f9297bb98c2101a03ff3d60cf30
    for name, prefix in self.BOOLEAN_PREFIX_MULTI.items():
        parser.add_boolean_prefix(name, prefix, False)

    # add probabalistic prefixes
    for name, prefix in self.PROBABILISTIC_PREFIX.items():
        parser.add_prefix(name, prefix)

    # add value facets
    for name, facet in self.NUMBER_VALUE_FACET.items():
        parser.add_valuerangeprocessor(
            xapian.NumberValueRangeProcessor(facet, name + ':'))

    # register known source prefixes
    # FIXME: can we do this by just finding all XSOURCE terms in
    # db? Would elliminate dependence on source modules at
    # search time.
    for source in Sources():
        parser.add_boolean_prefix(
            source.name, self._make_source_prefix(source.name))
def test_all():
    """Smoketest for the xapian Python bindings.

    Exercises version reporting, Document/Query construction, in-memory
    database iteration, MatchDecider/ExpandDecider subclassing, QueryParser
    options, unicode handling, stoppers, TermGenerator, value range
    processors, and database metadata.  Failures are reported through
    expect()/expect_query()/expect_exception()/TestFail (defined elsewhere
    in this file).
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    def access_cvar():
        return xapian.cvar

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    # Embedded NUL must round-trip through set_data/get_data.
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.inmemory_open()
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        # Every remaining term must sort at or after 'n'.
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        # Accept only documents whose value slot 0 is "yes".
        def __call__(self, doc):
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        # Reject expand terms starting with 'a'.
        def __call__(self, term):
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [term[xapian.ESET_TNAME] for term in eset.items]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect(eset.items[-1][xapian.ESET_WT] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect(eset.items[-1][xapian.ESET_WT] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query("NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test:(pos=1))")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(
        qp.parse_query("foo o", qp.FLAG_PARTIAL),
        "(Zfoo:(pos=1) AND ((out:(pos=2) SYNONYM outsid:(pos=2)) OR Zo:(pos=2)))"
    )
    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo:(pos=1) AND Zoutsid:(pos=2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')
    expect_query(
        qp.parse_query(u"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\xc3\xa9st:(pos=1))")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    # Data and terms come back UTF-8 encoded (Python 2 bytes).
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2) AND Za:(pos=3))")
    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2))")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2) AND Za:(pos=3))")
    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2))")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [('bar', 1, [2]), ('baz', 1, [3]), ('foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Xapian::Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, '$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b)
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)
    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata, '',
                     'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query('foo'), 5),
        "5 * foo")
def search(dbpath, querystring, offset=0, pagesize=10): # offset - defines starting point within result set # pagesize - defines number of records to retrieve # Open the database we're going to search. db = xapian.Database(dbpath) # Set up a QueryParser with a stemmer and suitable prefixes queryparser = xapian.QueryParser() queryparser.set_stemmer(xapian.Stem("en")) queryparser.set_stem_strategy(queryparser.STEM_SOME) queryparser.add_prefix("title", "S") queryparser.add_prefix("description", "XD") # and add in value range processors # Start of custom VRP code class PopulationValueRangeProcessor(xapian.ValueRangeProcessor): def __init__(self, value, low, high): super(PopulationValueRangeProcessor, self).__init__() self.nvrp = xapian.NumberValueRangeProcessor(value) self.low = low self.high = high def __call__(self, begin, end): if begin != u"": try: _begin = int(begin) if _begin < self.low or _begin > self.high: raise ValueError() except: return (xapian.BAD_VALUENO, begin, end) if end != u"": try: _end = int(end) if _end < self.low or _end > self.high: raise ValueError() except: return (xapian.BAD_VALUENO, begin, end) return self.nvrp(begin, end) queryparser.add_valuerangeprocessor( PopulationValueRangeProcessor(3, 500000, 50000000)) # End of custom VRP code # Start of date example code queryparser.add_valuerangeprocessor( xapian.DateValueRangeProcessor(2, True, 1860)) queryparser.add_valuerangeprocessor(xapian.NumberValueRangeProcessor( 1, '')) # End of date example code # And parse the query query = queryparser.parse_query(querystring) # Use an Enquire object on the database to run the query enquire = xapian.Enquire(db) enquire.set_query(query) # And print out something about each match matches = [] for match in enquire.get_mset(offset, pagesize): fields = json.loads(match.document.get_data()) print u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % { 'rank': match.rank + 1, 'docid': match.docid, 'name': fields.get('name', u''), 'date': 
fields.get('admitted', u''), 'pop': fields.get('population', u''), 'lat': fields.get('latitude', u''), 'lon': fields.get('longitude', u''), } matches.append(match.docid) # Finally, make sure we log the query and displayed results support.log_matches(querystring, offset, pagesize, matches)
def __init__(self, value, low, high): super(PopulationValueRangeProcessor, self).__init__() self.nvrp = xapian.NumberValueRangeProcessor(value) self.low = low self.high = high