def doc_scores(self, searcher, weighting=None, exclude_docs=None):
    """Yield ``(docnum, score)`` pairs from the first subquery, restricted
    to documents that the second subquery also matches.

    ``self.subqueries`` is unpacked into a scored query and a filter query.

    :param searcher: the searcher both subqueries are run against.
    :param weighting: forwarded to the scored subquery's ``doc_scores``.
    :param exclude_docs: forwarded to the filter subquery's ``docs`` only;
        it is not passed to the scored subquery.
    """
    scored_query, filter_query = self.subqueries

    # Record every document the filter query accepts.
    allowed = BitVector(searcher.doc_count_all())
    for accepted in filter_query.docs(searcher, exclude_docs=exclude_docs):
        allowed.set(accepted)

    # Pass through only the scored documents that the filter accepted.
    for docnum, score in scored_query.doc_scores(searcher,
                                                 weighting=weighting):
        if docnum in allowed:
            yield docnum, score
def docs(self, searcher, exclude_docs=None):
    """Return an iterator over the documents containing ``self.text`` in
    any of the fields named in ``self.fieldnames``.

    Matches from all fields are collected in one BitVector, so a document
    is recorded once even if several fields contain the text.

    :param searcher: the searcher to look the term up in.
    :param exclude_docs: forwarded to ``searcher.postings``.
    """
    matched = BitVector(searcher.doc_count_all())
    term_text = self.text
    for name in self.fieldnames:
        fieldnum = searcher.fieldname_to_num(name)
        # Skip fields in which the term does not occur at all.
        if (fieldnum, term_text) not in searcher:
            continue
        for docnum, _ in searcher.postings(fieldnum, term_text,
                                           exclude_docs=exclude_docs):
            matched.set(docnum)
    return iter(matched)
def docs(self, searcher, exclude_docs=None):
    """Collect the documents in which ``self.text`` occurs in at least one
    of ``self.fieldnames`` and return an iterator over the result.

    :param searcher: the searcher whose postings are consulted.
    :param exclude_docs: handed through to ``searcher.postings``.
    """
    found = BitVector(searcher.doc_count_all())
    wanted = self.text
    # Resolve each field name to its number and keep only the fields that
    # actually contain the term.
    for fieldnum in (searcher.fieldname_to_num(fn) for fn in self.fieldnames):
        if (fieldnum, wanted) in searcher:
            postings = searcher.postings(fieldnum, wanted,
                                         exclude_docs=exclude_docs)
            for docnum, _ in postings:
                found.set(docnum)
    return iter(found)
def _not_vector(searcher, notqueries, sourcevector):
    """Build the vector of documents that are banned from the results.

    Positions in the returned BitVector are document numbers; a set bit
    means that document is excluded.

    :param searcher: the searcher the NOT queries are run against.
    :param notqueries: queries whose matching documents must be excluded.
    :param sourcevector: the incoming ``exclude_docs`` vector, or None.
        It is copied, never modified in place.
    """
    if sourcevector is not None:
        banned = sourcevector.copy()
    else:
        banned = BitVector(searcher.reader().doc_count_all())

    # Add everything matched by each NOT query to the banned set.
    for excluded_query in notqueries:
        banned.set_from(excluded_query.docs(searcher))
    return banned
def search(self, query, limit=5000, sortedby=None, reverse=False):
    """Runs the query represented by the query object and returns a
    Results object.

    See the help for :meth:`~Searcher.find` for information on the
    parameters.

    :param query: a :class:`whoosh.query.Query` object.
    :param limit: the capacity given to the TopDocs collector when scoring,
        or the maximum length of the result list when sorting.
    :param sortedby: None to order by score; otherwise a field name, a
        list/tuple of field names, or a Sorter instance.
    :param reverse: passed to the sorter's ``order`` method; only used
        when ``sortedby`` is not None.
    :raises ValueError: if ``sortedby`` is not None, a string, a
        list/tuple, or a Sorter.
    :rtype: :class:`Results`
    """
    ixreader = self.ixreader

    t = now()
    if sortedby is not None:
        # Turn the sortedby argument into a Sorter object.
        if isinstance(sortedby, basestring):
            sorter = scoring.FieldSorter(sortedby)
        elif isinstance(sortedby, (list, tuple)):
            sorter = scoring.MultiFieldSorter(
                [FieldSorter(fn) for fn in sortedby])
        elif isinstance(sortedby, Sorter):
            sorter = sortedby
        else:
            raise ValueError(
                "sortedby argument must be a string, list, or Sorter (%r)"
                % sortedby)

        scored_list = sorter.order(self, query.docs(self), reverse=reverse)
        scores = None
        # The doc vector is built from the full ordering, before truncation.
        docvector = BitVector(ixreader.doc_count_all(), source=scored_list)
        if len(scored_list) > limit:
            scored_list = list(scored_list)[:limit]
    else:
        # Sort by scores
        topdocs = TopDocs(limit, ixreader.doc_count_all())
        final = self.weighting.final
        topdocs.add_all((docnum, final(self, docnum, score))
                        for docnum, score in query.doc_scores(self))

        best = topdocs.best()
        if best:
            # best is a list like
            # [(docnum, score), (docnum, score), ... ]
            # This unpacks that into two sequences: docnums and scores.
            # Reuse the list computed above rather than calling
            # topdocs.best() a second time.
            scored_list, scores = zip(*best)
        else:
            scored_list = []
            scores = []

        docvector = topdocs.docs
    t = now() - t

    return Results(self, query, scored_list, docvector, runtime=t,
                   scores=scores)
def docs(self, searcher, exclude_docs=None):
    """Yield each document number matched by any of the positive
    subqueries, once, skipping documents matched by the NOT subqueries.

    Yields nothing when ``self.subqueries`` is empty.

    :param searcher: the searcher the subqueries are run against.
    :param exclude_docs: optional vector of documents to exclude. When
        there are NOT subqueries it is replaced by the vector returned
        from ``_not_vector``.
    """
    if not self.subqueries:
        return

    seen = BitVector(searcher.doc_count_all())

    self._split_queries()
    if self._notqueries:
        # Pass by keyword so the arguments land on the right parameters of
        # _not_vector(searcher, notqueries, sourcevector); the previous
        # positional call had searcher and notqueries swapped.
        exclude_docs = _not_vector(searcher=searcher,
                                   notqueries=self._notqueries,
                                   sourcevector=exclude_docs)

    is_seen = seen.__getitem__
    mark_seen = seen.set
    for subquery in self._subqueries:
        for docnum in subquery.docs(searcher, exclude_docs=exclude_docs):
            # A document may match several subqueries; emit it only once.
            if not is_seen(docnum):
                yield docnum
                mark_seen(docnum)
def __init__(self, capacity, max_doc, docvector=None):
    """Set up an empty collector.

    :param capacity: stored as ``self.capacity``.
    :param max_doc: size of the BitVector created when no usable
        ``docvector`` is supplied.
    :param docvector: optional existing vector to use as ``self.docs``.
        A missing or falsy value is replaced by a new
        ``BitVector(max_doc)``.
    """
    if not docvector:
        docvector = BitVector(max_doc)

    self.capacity = capacity
    self.docs = docvector
    self.heap = []
    self._total = 0
def search(self, query, limit=5000, sortedby=None, reverse=False,
           minscore=0.0001):
    """Runs the query represented by the ``query`` object and returns a
    Results object.

    :param query: a :class:`whoosh.query.Query` object.
    :param limit: the maximum number of documents to score. If you're only
        interested in the top N documents, you can set limit=N to limit
        the scoring for a faster search.
    :param sortedby: if this parameter is not None, the results are sorted
        instead of scored. If this value is a string, the results are
        sorted by the field named in the string. If this value is a list
        or tuple, it is assumed to be a sequence of strings and the
        results are sorted by the fieldnames in the sequence. Otherwise
        'sortedby' should be a scoring.Sorter object.

        The fields you want to sort by must be indexed.

        For example, to sort the results by the 'path' field::

            searcher.find(q, sortedby = "path")

        To sort the results by the 'path' field and then the 'category'
        field::

            searcher.find(q, sortedby = ("path", "category"))

        To use a sorting object::

            searcher.find(q, sortedby = scoring.FieldSorter("path", key=mykeyfn))

        Using a string or tuple simply instantiates a
        :class:`whoosh.scoring.FieldSorter` or
        :class:`whoosh.scoring.MultiFieldSorter` object for you. To get a
        custom sort order, instantiate your own ``FieldSorter`` with a
        ``key`` argument, or write a custom :class:`whoosh.scoring.Sorter`
        class.

        FieldSorter and MultiFieldSorter cache the document order, using 4
        bytes times the number of documents in the index, and taking time
        to cache. To increase performance, instantiate your own sorter and
        re-use it (but remember you need to recreate it if the index
        changes).

    :param reverse: if ``sortedby`` is not None, this reverses the
        direction of the sort.
    :param minscore: the minimum score to include in the results. It is
        only used when scoring, i.e. when ``sortedby`` is None.
    :raises ValueError: if ``sortedby`` is not None, a string, a
        list/tuple, or a Sorter.
    :rtype: :class:`Results`
    """
    ixreader = self.ixreader

    t = now()
    if sortedby is not None:
        # Turn the sortedby argument into a Sorter object.
        if isinstance(sortedby, basestring):
            sorter = scoring.FieldSorter(sortedby)
        elif isinstance(sortedby, (list, tuple)):
            sorter = scoring.MultiFieldSorter(
                [FieldSorter(fn) for fn in sortedby])
        elif isinstance(sortedby, Sorter):
            sorter = sortedby
        else:
            raise ValueError(
                "sortedby argument must be a string, list, or Sorter (%r)"
                % sortedby)

        scored_list = sorter.order(self, query.docs(self), reverse=reverse)
        scores = None
        # The doc vector is built from the full ordering, before truncation.
        docvector = BitVector(ixreader.doc_count_all(), source=scored_list)
        if len(scored_list) > limit:
            scored_list = list(scored_list)[:limit]
    else:
        # Sort by scores
        topdocs = TopDocs(limit, ixreader.doc_count_all())
        final = self.weighting.final
        topdocs.add_all(((docnum, final(self, docnum, score))
                         for docnum, score in query.doc_scores(self)),
                        minscore)

        best = topdocs.best()
        if best:
            # best is a list like
            # [(docnum, score), (docnum, score), ... ]
            # This unpacks that into two sequences: docnums and scores.
            # Reuse the list computed above rather than calling
            # topdocs.best() a second time.
            scored_list, scores = zip(*best)
        else:
            scored_list = []
            scores = []

        docvector = topdocs.docs
    t = now() - t

    return Results(self, query, scored_list, docvector, runtime=t,
                   scores=scores)
def search(self, query, limit=5000, weighting=None, sortedby=None,
           reverse=False):
    """Run ``query`` against this searcher and return a Results object.

    :param query: a query.Query object representing the search query. A
        query string can be turned into a query object with e.g.
        qparser.QueryParser.
    :param limit: the maximum number of documents to score. If only the
        top N documents are of interest, pass limit=N for a faster search.
    :param weighting: if not None, this weighting object is used to score
        the results instead of the searcher's default.
    :param sortedby: if not None, the results are sorted instead of
        scored. A string sorts by the field of that name. A list or tuple
        is taken as a sequence of field names to sort by, in order.
        Otherwise this should be a scoring.Sorter object; a callable is
        called with no arguments to obtain the sorter.

        The fields you want to sort by must be indexed.

        Sort by the 'path' field::

            searcher.search(q, sortedby = "path")

        Sort by the 'path' field, then the 'category' field::

            searcher.search(q, sortedby = ("path", "category"))

        Use a sorting object::

            searcher.search(q, sortedby = scoring.NullSorter)

    :param reverse: if 'sortedby' is not None, this reverses the direction
        of the sort.
    """
    doc_reader = self.doc_reader

    started = time.time()
    if sortedby is None:
        # No sort order requested: rank by score.
        topdocs = TopDocs(limit, doc_reader.doc_count_all())
        topdocs.add_all(
            query.doc_scores(self, weighting=weighting or self.weighting))
        scored_list = topdocs.best()
        docvector = topdocs.docs
    else:
        # Resolve the sortedby argument to an object with an order() method.
        if isinstance(sortedby, basestring):
            sorter = scoring.FieldSorter(sortedby)
        elif isinstance(sortedby, (list, tuple)):
            sorter = scoring.MultiFieldSorter(sortedby)
        elif callable(sortedby):
            sorter = sortedby()
        else:
            sorter = sortedby

        scored_list = sorter.order(self, query.docs(self), reverse=reverse)
        # The doc vector covers the full ordering, before truncation.
        docvector = BitVector(doc_reader.doc_count_all(), source=scored_list)
        if len(scored_list) > limit:
            scored_list = list(scored_list)[:limit]
    elapsed = time.time() - started

    return Results(self, query, scored_list, docvector, runtime=elapsed)