def test_director_exception():
    """Test handling of an exception raised in a director.

    An exception thrown inside a Python-implemented director class
    (ExpandDecider / MatchDecider) must propagate back through the C++
    layer to the Python caller.
    """
    db = setup_database()
    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query('it'))

    class TestException(Exception):
        def __init__(self, a, b):
            Exception.__init__(self, a + b)

    rset = xapian.RSet()
    rset.add_document(1)

    class EDecider(xapian.ExpandDecider):
        def __call__(self, term):
            raise TestException("foo", "bar")

    edecider = EDecider()
    # The exception must surface both from a direct call and when the
    # decider is invoked from inside get_eset().
    expect_exception(TestException, "foobar", edecider, "foo")
    expect_exception(TestException, "foobar", enq.get_eset, 10, rset, edecider)

    class MDecider(xapian.MatchDecider):
        def __call__(self, doc):
            raise TestException("foo", "bar")

    mdecider = MDecider()
    # Same check for a MatchDecider, directly and via get_mset().
    expect_exception(TestException, "foobar", mdecider, xapian.Document())
    expect_exception(TestException, "foobar", enq.get_mset, 0, 10, None, mdecider)
def test_eset_iter():
    """Test iterators over ESets.

    Compares the expansion terms produced without a query against those
    produced with a query, both for terms and weights.
    """
    db = setup_database()
    rset = xapian.RSet()
    rset.add_document(3)

    context("getting eset items without a query")
    enquire = xapian.Enquire(db)
    eset = enquire.get_eset(10, rset)
    no_query_items = list(eset)
    expect(len(no_query_items), 3)
    expect(len(no_query_items), len(eset))

    context("getting eset items with a query")
    enquire = xapian.Enquire(db)
    enquire.set_query(xapian.Query(xapian.Query.OP_OR, "was", "it"))
    eset = enquire.get_eset(10, rset)
    query_items = list(eset)
    expect(len(query_items), 2)
    expect(len(query_items), len(eset))

    # With the query set, its own terms are omitted from the expansion,
    # so the remaining entries must line up with specific entries of the
    # query-less expansion.
    context("comparing eset items with a query to those without")
    expect(query_items[0].term, no_query_items[0].term)
    expect(query_items[1].term, no_query_items[2].term)

    context("comparing eset weights with a query to those without")
    expect(query_items[0].weight, no_query_items[0].weight)
    expect(query_items[1].weight, no_query_items[2].weight)
def suggest(self, search, offset=0, limit=0, moffset=0, mlimit=0, klimit=1.0,
            kmlimit=1.0, prefix=None, decider=None, score=False,
            format_term=True, collapse_stems=True, include_query_terms=True,
            order=None, reverse=False):
    """
    Suggest terms that would possibly yield more relevant results
    for the given query.

    Yields (term, weight) tuples, with the term decoded to unicode.
    """
    self.backend.reopen()
    # A zero mlimit means "scale with the database size".
    if mlimit == 0:
        mlimit = int(self.backend.get_doccount() * kmlimit)
    enq = xapian.Enquire(self.backend)
    enq.set_query(search.query)
    mset = self._build_mset(enq, offset=moffset, limit=mlimit,
                            order=order, reverse=reverse)
    # The matches become the relevance set that seeds the expansion.
    rset = xapian.RSet()
    for match in mset:
        rset.add_document(match.docid)
    # An explicit prefix takes precedence over any supplied decider;
    # with neither given, fall back to the language-based decider.
    if prefix is not None:
        decider = PrefixDecider(prefix)
    elif decider is None:
        decider = LanguageDecider()
    if limit == 0:
        limit = int(self.backend.get_doccount() * klimit)
    flags = enq.INCLUDE_QUERY_TERMS if include_query_terms else 0
    eset = enq.get_eset(limit, rset, flags, 1.0, decider, -3)
    for item in eset.items:
        yield (item[0].decode('utf8'), item[1])
def get_suggestions(self, count=10, filter=None):
    """
    Compute suggestions for more terms

    Return a Xapian ESet
    """
    # Seed the expansion with the first 30 matches, taken to be the
    # most relevant documents for the current query.
    rset = xapian.RSet()
    for match in self.enquire.get_mset(0, 30):
        rset.add_document(match.docid)
    # Fall back to the basic filter when none was supplied.
    if filter is None:
        filter = self.BasicFilter()
    return self.enquire.get_eset(count, rset, filter)
def xapian_search(self, k=100, showscore=True): print self.database.xapian dbpath_doc = self.database.xapian db_doc = xapian.Database(dbpath_doc) doc_qp = xapian.QueryParser() doc_qp.set_stemmer(xapian.Stem("en")) doc_qp.set_stopper(self.stopper) doc_qp.set_database(db_doc) #doc_qp.set_default_op( xapian.Query.OP_ELITE_SET) doc_qp.set_default_op(xapian.Query.OP_AND) doc_qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) doc_query = doc_qp.parse_query(self.queryString) offset, limit = 0, min(k, db_doc.get_doccount()) print limit enquire = xapian.Enquire(db_doc) enquire.set_query(doc_query) doc_matches = enquire.get_mset(offset, limit) rset = xapian.RSet() ids = [] scores = [] match_terms = {} for match in doc_matches: rset.add_document(match.docid) document = match.document scores.append(match.weight) ids.append(document.get_data()) m_terms = enquire.matching_terms(match) match_terms[match.docid] = [term for term in m_terms] self.ranked_ids = ids self.match_terms = match_terms alternatives = enquire.get_eset(100, rset, 0) # print alternatives self.alternatives = {} for a in alternatives.items: self.alternatives[a[1]] = a[0] if showscore: return ids, scores
def eset_profile(self, items_repository, size, content_filter):
    """ Return most relevant tags for a list of packages. """
    # Seed a relevance set with the documents of the profile packages.
    enquire = xapian.Enquire(items_repository)
    rset_packages = xapian.RSet()
    for doc in data.axi_search_pkgs(items_repository, self.pkg_profile):
        rset_packages.add_document(doc.docid)
    # Ask for twice as many expansion terms as needed, since stemmed
    # duplicates are dropped afterwards.
    eset_tags = enquire.get_eset(size * 2, rset_packages,
                                 xapian.Enquire.INCLUDE_QUERY_TERMS,
                                 1, content_filter)
    terms = [entry.term for entry in eset_tags]
    return self._eliminate_duplicated(terms, size)
def test_eset(self):
    """ test finding "similar" items than the ones found before """
    self.enquire.set_query(xapian.Query("foo"))
    # this yields very few results
    matches = self.enquire.get_mset(0, 100)
    # create a relevance set from the query
    rset = xapian.RSet()
    #print "original finds: "
    for match in matches:
        #print match.document.get_data()
        rset.add_document(match.docid)
    # and use that to get a extended set
    eset = self.enquire.get_eset(20, rset)
    #print eset
    # build a query from the eset
    expanded_query = xapian.Query(xapian.Query.OP_OR,
                                  [entry.term for entry in eset])
    self.enquire.set_query(expanded_query)
    # ensure we have more results now than before
    eset_matches = self.enquire.get_mset(0, 100)
    self.assertTrue(len(matches) < len(eset_matches))
def run(self, rec, user, recommendation_size):
    """ Perform recommendation strategy. """
    # Index the user profile as a throw-away document so that it can
    # act as the relevance set seeding the expansion.
    temp_index = xapian.WritableDatabase("/tmp/Database",
                                         xapian.DB_CREATE_OR_OVERWRITE)
    profile = self.get_user_profile(user, rec)
    doc = xapian.Document()
    for pkg in profile:
        doc.add_term(pkg)
    doc.add_term("TO_BE_DELETED")
    docid = temp_index.add_document(doc)
    # Search over the users repository combined with the profile doc.
    temp_index.add_database(rec.users_repository)
    rset = xapian.RSet()
    rset.add_document(docid)
    # rset = self.get_rset_from_profile(profile)
    enquire = xapian.Enquire(temp_index)
    enquire.set_weighting_scheme(rec.weight)
    eset = enquire.get_eset(recommendation_size, rset,
                            PkgExpandDecider(user.items()))
    return self.get_result_from_eset(eset)
def test_all():
    """Smoke-test the Python 3 xapian bindings' core API surface.

    Exercises version reporting, Document/term handling, query
    construction, database factories, iterators, deciders, the
    QueryParser, stoppers, the TermGenerator, value range processors,
    field processors and metadata.  Relies on the test harness helpers
    expect / expect_query / expect_exception / context / TestFail.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")
    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print("Unhandled constants: ", res)
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    #
    # Python 3.5 generates a different exception message here to earlier
    # versions, so we need a check which matches both.
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'cvar'") != -1,
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    # Embedded NUL bytes must round-trip through set_data/get_data.
    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR,
                     [t.encode('utf-8') for t in terms]),
        "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE,
                          (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2,
        "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR,
                     [s.encode('utf-8') for s in subqs]),
        "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected (or not wrapped
    # in the first cases):
    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb"))
    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb", xapian.DB_OPEN))
    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                xapian.DB_BACKEND_STUB))
    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb",
            xapian.DB_OPEN | xapian.DB_BACKEND_STUB))
    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                xapian.DB_BACKEND_GLASS))
    expect_exception(
        xapian.DatabaseCreateError, None,
        lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb",
            xapian.DB_CREATE | xapian.DB_BACKEND_GLASS))
    # Chert support may simply not be compiled in.
    expect_exception(
        xapian.FeatureUnavailableError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb",
                                xapian.DB_BACKEND_CHERT))
    expect_exception(
        xapian.FeatureUnavailableError, None,
        lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb",
            xapian.DB_CREATE | xapian.DB_BACKEND_CHERT))
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            # Accept only documents tagged "yes" in value slot 0.
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            # Reject any expansion term starting with 'a'.
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            1.0, testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            1.0, None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR o) OR Zo@2))")
    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR outside) OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')

    expect_query(
        qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [(b'bar', 1, [2]), (b'baz', 1, [3]), (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    # FIXME: This doesn't currently work:
    # expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)
    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.set_metadata, b'', b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query(b'foo'), 5),
        "5 * foo")
def more_like_this(self, model_instance, additional_query=None,
                   start_offset=0, end_offset=None,
                   limit_to_registered_models=True, result_class=None,
                   **kwargs):
    """
    Given a model instance, returns a result set of similar documents.

    Required arguments:
        `model_instance` -- The model instance to use as a basis for
                            retrieving similar documents.

    Optional arguments:
        `additional_query` -- An additional query to narrow results
        `start_offset` -- The starting offset (default=0)
        `end_offset` -- The ending offset (default=None), if None, then all documents
        `limit_to_registered_models` -- Limit returned results to models registered in
        the current `SearchSite` (default = True)

    Returns:
        A dictionary with the following keys:
            `results` -- A list of `SearchResult`
            `hits` -- The total available results

    Opens a database connection, then builds a simple query using the
    `model_instance` to build the unique identifier.

    For each document retrieved(should always be one), adds an entry into
    an RSet (relevance set) with the document id, then, uses the RSet
    to query for an ESet (A set of terms that can be used to suggest
    expansions to the original query), omitting any document that was in
    the original query.

    Finally, processes the resulting matches and returns.
    """
    database = self._database()

    if result_class is None:
        result_class = SearchResult

    # Unique-identifier query locating the seed document itself.
    query = xapian.Query(DOCUMENT_ID_TERM_PREFIX
                         + get_identifier(model_instance))

    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    rset = xapian.RSet()

    if not end_offset:
        end_offset = database.get_doccount()

    # NOTE(review): `match` is reused after this loop, so this relies on
    # the identifier query matching at least one document (per the
    # docstring, exactly one) — confirm against the indexing scheme.
    for match in self._get_enquire_mset(database, enquire, 0, end_offset):
        rset.add_document(match.docid)

    # Build an OP_ELITE_SET query from the expansion terms suggested by
    # the relevance set.
    query = xapian.Query(
        xapian.Query.OP_ELITE_SET,
        [expand.term for expand in enquire.get_eset(
            match.document.termlist_count(), rset, XHExpandDecider())],
        match.document.termlist_count())
    # Exclude the seed document from the results.
    query = xapian.Query(
        xapian.Query.OP_AND_NOT,
        [query, DOCUMENT_ID_TERM_PREFIX + get_identifier(model_instance)])

    if limit_to_registered_models:
        registered_models = self.build_models_list()

        if len(registered_models) > 0:
            # Restrict matches to documents of registered content types.
            query = xapian.Query(
                xapian.Query.OP_AND, query,
                xapian.Query(xapian.Query.OP_OR, [
                    xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model))
                    for model in registered_models
                ]))

    if additional_query:
        query = xapian.Query(xapian.Query.OP_AND, query, additional_query)

    enquire.set_query(query)

    results = []
    matches = self._get_enquire_mset(database, enquire,
                                     start_offset, end_offset)

    for match in matches:
        # Each document stores a pickled (app_label, module_name, pk,
        # model_data) tuple.
        app_label, module_name, pk, model_data = pickle.loads(
            self._get_document_data(database, match.document))
        results.append(
            result_class(app_label, module_name, pk, match.percent,
                         **model_data))

    return {
        'results': results,
        'hits': self._get_hit_count(database, enquire),
        'facets': {
            'fields': {},
            'dates': {},
            'queries': {},
        },
        'spelling_suggestion': None,
    }
def run(self, search_options, progressbar=None):
    """Run the search and return (docs, tags).

    docs: [percent, data, value-slot-2] for each match;
    tags: [term, normalized score] pairs from the expansion set,
    sorted alphabetically.  Pumps the GTK main loop between phases so
    the progressbar stays responsive.

    NOTE(review): `progressbar` is dereferenced unconditionally despite
    its None default — callers appear to always pass one; confirm.
    """
    # Matching set
    logging.debug('Getting MSet')
    progressbar.set_text('0%')
    while gtk.events_pending():
        gtk.main_iteration()
    mset = search_options['enquire'].get_mset(
        0, search_options['n_mset'], 0, None,
        #MMMatchDeciderAlwaysTrue(progressbar, 1/float(n_mset + n_eset)))
        #MMMatchDeciderAlwaysTrue())
        None)
    # Results set
    logging.debug('Getting RSet')
    progressbar.set_fraction(0.33)
    progressbar.set_text('33%')
    while gtk.events_pending():
        gtk.main_iteration()
    docs = []
    rset = xapian.RSet()
    for y, d in enumerate(mset):
        if y < search_options['n_mset']:
            rset.add_document(d.docid)
            docs.append([
                d.percent,
                d.document.get_data(),
                d.document.get_value(2)
            ])
        else:
            logging.warning(
                'More docs in mset than expected, something is wrong')
    # Obtain the "Expansion set" for the search: the n most relevant terms that
    # match the filter
    logging.debug('Getting ESet')
    progressbar.set_fraction(0.66)
    progressbar.set_text('66%')
    while gtk.events_pending():
        gtk.main_iteration()
    eset = search_options['enquire'].get_eset(
        search_options['n_eset'], rset, search_options[
            'eset_showqueryterms'],  # 0 = exclude query terms in eset; 1 = include query terms in eset
        1,
        #MMRsetFilter(stopwords[lang], [], progressbar, 1/float(n_mset + n_eset)))
        MMEsetFilter(stopwords[search_options['selected_language']],
                     search_options['eset_white_list']))
    # Read the "Expansion set" and scan tags and their score
    tagscores = dict()
    for item in eset:
        tag = item.term
        tagscores[tag] = item.weight
    tags = []
    if tagscores != dict():
        # Rescale weights into a display range.
        # NOTE(review): divides by (maxscore - minscore) — zero when all
        # weights are equal; verify that case cannot occur here.
        maxscore = max(tagscores.itervalues())
        minscore = min(tagscores.itervalues())
        for k in tagscores.iterkeys():
            tags.append([
                k,
                (tagscores[k] - minscore) * 100 / (maxscore - minscore) * 3
                + 75
            ])
    # sort by tag alphabetically
    tags.sort()
    return docs, tags
def run(self, search_options, progressbar=None):
    """Compute pairwise proximity scores between expansion-set terms.

    Returns a list of [term_a, term_b, distance_score, wdf_a, wdf_b]
    entries for term pairs (ordered so term_a < term_b) that co-occur
    in the matched documents.  Pumps the GTK main loop between phases
    to keep the UI responsive.

    NOTE(review): `progressbar` is dereferenced before the
    `is not None` guards below — callers appear to always pass one.
    """
    logging.debug('Getting MSet')
    progressbar.set_text('0%')
    while gtk.events_pending():
        gtk.main_iteration()
    mset = search_options['enquire'].get_mset(
        0, search_options['n_mset'], 0, None,
        #MMMatchDeciderAlwaysTrue(progressbar, 1/float(self.n_mset + self.n_eset + self.n_eset*self.n_eset)))
        #MMMatchDeciderAlwaysTrue())
        None)
    logging.debug('Getting RSet')
    progressbar.set_fraction(0.25)
    progressbar.set_text('25%')
    while gtk.events_pending():
        gtk.main_iteration()
    rset = xapian.RSet()
    for y, m in enumerate(mset):
        rset.add_document(m.docid)
    logging.debug('Getting ESet')
    progressbar.set_fraction(0.5)
    progressbar.set_text('50%')
    while gtk.events_pending():
        gtk.main_iteration()
    eset = search_options['enquire'].get_eset(
        search_options['n_eset'], rset, search_options[
            'eset_showqueryterms'],  # 0 = exclude query terms in eset; 1 = include query terms in eset
        1,
        MMEsetFilter(stopwords[search_options['selected_language']],
                     search_options['eset_white_list']))
    progressbar.set_fraction(0.75)
    progressbar.set_text('75%')
    while gtk.events_pending():
        gtk.main_iteration()
    logging.debug('Calculating distances on %i terms' % len(eset))
    # positions_matrix[ki][docid]: set of positions of eset term ki in
    # document docid; wdf_dict[ki]: its average wdf over the mset.
    positions_matrix = {}
    wdf_dict = {}
    for ki, keyword in enumerate(eset):
        positions_arrays = {}
        freq = 0
        for m in mset:
            docid = m.docid
            try:
                positions_array = set(search_options['db'].positionlist(
                    docid, keyword.term))
            except xapian.RangeError:
                # Term has no positions in this document.
                positions_array = []
            positions_arrays[docid] = positions_array
            tl = search_options['db'].get_document(docid).termlist()
            try:
                wdf = tl.skip_to(keyword.term).wdf
            except:
                continue
            else:
                if wdf_dict.has_key(ki):
                    wdf_dict[ki] += wdf
                else:
                    wdf_dict[ki] = wdf
        positions_matrix[ki] = positions_arrays
        wdf_dict[ki] /= float(len(mset))
        #print "weight (%s): %f" % (keyword.term, wdf_dict[ki])
        if progressbar is not None:
            fraction = 0.75 + 0.125 / float(search_options['n_eset']) * ki
            progressbar.set_fraction(fraction)
            progressbar.set_text('%.0f%%' % (fraction * 100))
            while gtk.events_pending():
                gtk.main_iteration()
    full_distances_list = []
    for ki, keyword in enumerate(eset):
        for oi, other in enumerate(eset):
            # Only consider each unordered pair once.
            if keyword.term < other.term:
                distance = 0
                for m in mset:
                    doc_distances = []
                    docid = m.docid
                    for i in positions_matrix[ki][docid]:
                        for j in positions_matrix[oi][docid]:
                            doc_distances.append(abs(i - j))
                    # doc_distances holds the distances of every possible
                    # pair of occurrences of i and j in the document.
                    # We keep only the max(wdf_i, wdf_j) pairs with the
                    # smallest distance.
                    tl = search_options['db'].get_document(
                        docid).termlist()
                    try:
                        keyword_wdf = tl.skip_to(keyword.term).wdf
                        other_wdf = tl.skip_to(other.term).wdf
                    except:
                        pass
                    num_kept_distances = max(keyword_wdf, other_wdf)
                    if doc_distances != []:
                        doc_distances.sort()
                        # Sum of inverse distances of the kept pairs.
                        distance += sum([
                            1 / float(i)
                            for i in doc_distances[:num_kept_distances]
                        ])
                    #print "%s(%d), %s(%d): dist=%s, kept=%i, kept_dist=%s, doc=%d(%d), dist=%f" % (keyword.term, keyword_wdf, other.term, other_wdf, doc_distances, num_kept_distances, doc_distances[:num_kept_distances], docid, len(mset), distance)
                if distance != 0:
                    # Normalise by kept pairs and by mset size.
                    f = lambda x: x / float(num_kept_distances) / float(
                        len(mset))
                    #print "%s, %s: %f" % (keyword.term, other.term, f(distance))
                    full_distances_list.append([
                        keyword.term, other.term, f(distance),
                        wdf_dict[ki], wdf_dict[oi]
                    ])
        if progressbar is not None:
            fraction = 0.875 + 0.125 / float(
                search_options['n_eset']) * ki
            progressbar.set_fraction(fraction)
            progressbar.set_text('%.0f%%' % (fraction * 100))
            while gtk.events_pending():
                gtk.main_iteration()
    #print full_distances_list
    return full_distances_list
def run(self, search_options, progressbar=None):
    """Compute pairwise minimum-distance scores between expansion terms.

    Returns a list of [term_a, term_b, score, weight_a, weight_b]
    entries for eset term pairs that co-occur in the matched documents,
    where the score is the inverse of the mean per-document minimum
    distance.  Pumps the GTK main loop between phases to keep the UI
    responsive.

    NOTE(review): `progressbar` is dereferenced before the
    `is not None` guards below — callers appear to always pass one.
    """
    logging.debug('Getting MSet')
    progressbar.set_text('0%')
    while gtk.events_pending():
        gtk.main_iteration()
    mset = search_options['enquire'].get_mset(
        0, search_options['n_mset'], 0, None,
        #MMMatchDeciderAlwaysTrue(progressbar, 1/float(self.n_mset + self.n_eset + self.n_eset*self.n_eset)))
        #MMMatchDeciderAlwaysTrue())
        None)
    logging.debug('Getting RSet')
    progressbar.set_fraction(0.25)
    progressbar.set_text('25%')
    while gtk.events_pending():
        gtk.main_iteration()
    rset = xapian.RSet()
    for y, m in enumerate(mset):
        rset.add_document(m.docid)
    logging.debug('Getting ESet')
    progressbar.set_fraction(0.5)
    progressbar.set_text('50%')
    while gtk.events_pending():
        gtk.main_iteration()
    eset = search_options['enquire'].get_eset(
        search_options['n_eset'] + 1, rset, search_options[
            'eset_showqueryterms'],  # 0 = exclude query terms in eset; 1 = include query terms in eset
        1,
        MMEsetFilter(stopwords[search_options['selected_language']],
                     search_options['eset_white_list']))
    logging.debug('Calculating distances on %i terms' % len(eset))
    progressbar.set_fraction(0.75)
    progressbar.set_text('75%')
    while gtk.events_pending():
        gtk.main_iteration()
    # positions_matrix[ki][docid]: set of positions of eset term ki in
    # document docid (empty list when the term has no positions there).
    positions_matrix = {}
    for ki, keyword in enumerate(eset):
        positions_arrays = {}
        for m in mset:
            docid = m.docid
            try:
                positions_array = set(search_options['db'].positionlist(
                    docid, keyword.term))
            except xapian.RangeError:
                positions_array = []
            positions_arrays[docid] = positions_array
        positions_matrix[ki] = positions_arrays
        if progressbar is not None:
            fraction = progressbar.get_fraction() + 0.125 / float(
                search_options['n_eset'])
            progressbar.set_fraction(fraction)
            progressbar.set_text('%.0f%%' % (fraction * 100))
            while gtk.events_pending():
                gtk.main_iteration()
    distances_list = []
    for ki, keyword in enumerate(eset):
        for oi, other in enumerate(eset):
            # Only consider each unordered pair once.
            if ki < oi:
                distances = []
                for m in mset:
                    docid = m.docid
                    count = []
                    for i in positions_matrix[ki][docid]:
                        for j in positions_matrix[oi][docid]:
                            count.append(abs(i - j))
                    if count != []:
                        # Keep the closest co-occurrence per document.
                        distances.append(min(count))
                if distances != []:
                    #print ",".join([keyword, other, "%f" % (sum(distances)/float(len(distances)))])
                    # Inverse of the mean minimum distance over n_mset.
                    f = lambda x: 1 / float(
                        sum(x) / float(search_options['n_mset']))
                    distances_list.append([
                        keyword.term, other.term, f(distances),
                        keyword.weight, other.weight
                    ])
                    #distances_list.append([other.term,
                    #                       keyword.term,
                    #                       f(distances),
                    #                       other.weight,
                    #                       keyword.weight])
                if progressbar is not None:
                    fraction = progressbar.get_fraction() + 0.125 / float(
                        search_options['n_eset'] * search_options['n_eset'])
                    progressbar.set_fraction(fraction)
                    progressbar.set_text('%.0f%%' % (fraction * 100))
                    while gtk.events_pending():
                        gtk.main_iteration()
    return distances_list
def test_all():
    """Broad smoke-test of the xapian Python bindings (1.4-era API).

    Exercises version reporting, Document/WritableDatabase basics, Query
    construction, Enquire/MSet/ESet iteration, MatchDecider/ExpandDecider,
    QueryParser features (stoppers, partial, FieldProcessor, range
    processors) and database metadata.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print "Unhandled constants: ", res
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError,
                     "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    # Embedded NUL bytes must round-trip through set_data/get_data.
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(query2,
                 "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected:
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.Database, "nosuchdir/nosuchdb",
                     xapian.DB_BACKEND_STUB)
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.WritableDatabase, "nosuchdir/nosuchdb",
                     xapian.DB_OPEN|xapian.DB_BACKEND_STUB)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)

    # Check Xapian::BAD_VALUENO is wrapped suitably.
    enq.set_collapse_key(xapian.BAD_VALUENO)

    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError,
                     "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            # Accept only documents flagged "yes" in value slot 0.
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            # Reject expand terms starting with 'a'.
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(qp.parse_query("NOT test",
                                qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(qp.parse_query("foo ox", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM ox OR Zox@2))")
    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM outside OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')
    expect_query(qp.parse_query(u"NOT t\xe9st",
                                qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT Zt\xc3\xa9st@1)")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")
    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test SimpleStopper initialised from a file.
    try:
        srcdir = os.environ['srcdir']
    except KeyError:
        srcdir = '.'
    stop = xapian.SimpleStopper(srcdir + '/../shortstop.list')
    expect(stop('a'), True)
    expect(stop('am'), False)
    expect(stop('an'), True)
    expect(stop('the'), True)
    expect_exception(xapian.InvalidArgumentError, None,
                     xapian.SimpleStopper, 'nosuchfile')

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [('bar', 1, [2]), ('baz', 1, [3]), ('foo', 2, [1, 4])])

    # Check DateRangeProcessor works
    context("checking that DateRangeProcessor works")
    qp = xapian.QueryParser()
    rpdate = xapian.DateRangeProcessor(1, xapian.RP_DATE_PREFER_MDY, 1960)
    qp.add_rangeprocessor(rpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam2', testfieldprocessor(), False)  # Old-style
    qp.add_boolean_prefix('boolspam3', testfieldprocessor(), '')
    qp.add_boolean_prefix('boolspam4', testfieldprocessor(), 'group')
    qp.add_boolean_prefix('boolspam5', testfieldprocessor(), None)
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')
    expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug fixed in 1.4.4:
    # https://bugs.debian.org/849722
    oqparser.add_boolean_prefix('tag', 'K', '')
    # Make sure other cases also work:
    oqparser.add_boolean_prefix('zag', 'XR', False)  # Old-style
    oqparser.add_boolean_prefix('rag', 'XR', None)
    oqparser.add_boolean_prefix('nag', 'XB', '')
    oqparser.add_boolean_prefix('bag', 'XB', 'blergh')
    oqparser.add_boolean_prefix('gag', 'XB', u'blergh')
    oqparser.add_boolean_prefix('jag', 'XB', b'blergh')

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.set_metadata, '', 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid",
                     db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(xapian.Query(xapian.Query.OP_SCALE_WEIGHT,
                              xapian.Query('foo'), 5),
                 "5 * foo")
def test_all():
    """Broad smoke-test of the xapian Python bindings (1.0/1.2-era API).

    Older variant of the smoke test: uses ``inmemory_open``, tuple-style
    ``ESet.items`` access (``ESET_TNAME``/``ESET_WT``) and the
    ``*ValueRangeProcessor`` classes.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    def access_cvar():
        return xapian.cvar

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError,
                     "'module' object has no attribute 'cvar'", access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    # Embedded NUL bytes must round-trip through set_data/get_data.
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.inmemory_open()
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2,
        "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")
    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            # Accept only documents flagged "yes" in value slot 0.
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            # Reject expand terms starting with 'a'.
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [term[xapian.ESET_TNAME] for term in eset.items]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect(eset.items[-1][xapian.ESET_WT] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect(eset.items[-1][xapian.ESET_WT] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>",
                     qp.parse_query, "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query("NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test:(pos=1))")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(
        qp.parse_query("foo o", qp.FLAG_PARTIAL),
        "(Zfoo:(pos=1) AND ((out:(pos=2) SYNONYM outsid:(pos=2)) OR Zo:(pos=2)))"
    )
    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo:(pos=1) AND Zoutsid:(pos=2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')
    expect_query(
        qp.parse_query(u"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\xc3\xa9st:(pos=1))")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2) AND Za:(pos=3))")
    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2))")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2) AND Za:(pos=3))")
    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2))")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()],
           [('bar', 1, [2]), ('baz', 1, [3]), ('foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Xapian::Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, '$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b)
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata,
                     '', 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query('foo'), 5),
        "5 * foo")
# Combine command line arguments up to "--" with spaces between # them, so that simple queries don't have to be quoted at the shell # level. query_string = sys.argv[2] index = 3 while index < len(sys.argv): arg = sys.argv[index] index += 1 if arg == '--': # Passed marker, move to parsing relevant docids. break query_string += ' ' query_string += arg # Create an RSet with the listed docids in. reldocs = xapian.RSet() for index in range(index, len(sys.argv)): reldocs.add_document(int(sys.argv[index])) # Parse the query string to produce a Xapian::Query object. qp = xapian.QueryParser() stemmer = xapian.Stem("english") qp.set_stemmer(stemmer) qp.set_database(database) qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) query = qp.parse_query(query_string) if not query.empty(): print("Parsed query is: %s" % str(query)) # Find the top 10 results for the query.
enquire.set_query(query) # Now, instead of showing the results of the query, we ask Xapian what are the # terms in the index that are most relevant to this search. # Normally, you would use the results to suggest the user possible ways for # refining the search. I instead abuse this feature to see what are the tags # that are most related to the search results. # Use an adaptive cutoff to avoid to pick bad results as references matches = enquire.get_mset(0, 1) topWeight = matches[0].weight enquire.set_cutoff(0, topWeight * 0.7) # Select the first 10 documents as the key ones to use to compute relevant # terms rset = xapian.RSet() for m in enquire.get_mset(0, 10): rset.add_document(m.docid) # Xapian supports providing a filter object, to say that we are only interested # in some terms. # This one filters out all the keywords that are not tags, or that were in the # list of query terms. class Filter(xapian.ExpandDecider): def __call__(self, term): """ Return true if we want the term, else false """ return term[:2] == "XT"
def get_neighborhood_rset(self, user, rec): mset = self.get_neighborhood(user, rec) rset = xapian.RSet() for m in mset: rset.add_document(m.document.get_docid()) return rset