def index(self, lib): print "Indexing with %s..." % lib options = self.options chunk = int(options.chunk) skip = int(options.skip) upto = int(options.upto) count = 0 skipc = skip starttime = chunkstarttime = now() lib.indexer() for d in self.spec.documents(): skipc -= 1 if not skipc: lib.index_document(d) count += 1 skipc = skip if chunk and not count % chunk: t = now() sofar = t - starttime print "Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s" % (count, t - chunkstarttime, chunk, sofar, count/sofar) chunkstarttime = t if count > upto: break spooltime = now() print "Spool time:", spooltime - starttime lib.finish() committime = now() print "Commit time:", committime - spooltime print "Total time to index", count, "documents:", committime - starttime
def index(self, lib): print "Indexing with %s..." % lib options = self.options chunk = int(options.chunk) skip = int(options.skip) upto = int(options.upto) count = 0 skipc = skip starttime = chunkstarttime = now() lib.indexer() for d in self.spec.documents(): skipc -= 1 if not skipc: lib.index_document(d) count += 1 skipc = skip if chunk and not count % chunk: t = now() sofar = t - starttime print "Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s" % ( count, t - chunkstarttime, chunk, sofar, count / sofar) chunkstarttime = t if count > upto: break spooltime = now() print "Spool time:", spooltime - starttime lib.finish() committime = now() print "Commit time:", committime - spooltime print "Total time to index", count, "documents:", committime - starttime
def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
               glob="*.mrc"):
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))

    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           author=fields.TEXT(phrase=False),
                           subject=fields.TEXT(analyzer=ana, phrase=False),
                           file=fields.STORED, pos=fields.STORED,
                           )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor"
          % (procs, limitmb))
    c = 0
    t = now()
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        filenames = [filename for filename in os.listdir(basedir)
                     if fnmatch.fnmatch(filename, glob)]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            for x, pos in read_file(f, mfields):
                w.add_document(title=uni(title(x)), author=uni(author(x)),
                               subject=uni(subjects(x)),
                               file=filename, pos=pos)
                c += 1
            f.close()
        print("Committing...")
    print("Indexed %d records in %0.02f minutes"
          % (c, (now() - t) / 60.0))
def search_file(self, lib):
    f = open(self.options.termfile, "rb")
    terms = [line.strip() for line in f]
    f.close()

    print("Searching %d terms with %s" % (len(terms), lib))
    lib.searcher()
    starttime = now()
    for r in lib.findterms(terms):
        pass
    searchtime = now() - starttime
    print("Search time:", searchtime, "searches/s:",
          float(len(terms)) / searchtime)
def search(self, lib):
    lib.searcher()

    t = now()
    q = lib.query()
    print "Query:", q
    r = lib.find(q)
    print "Search time:", now() - t

    t = now()
    self.spec.print_results(lib.results(r))
    print "Print time:", now() - t
def finish(self, doccount, lengthfile, termtable, postingwriter):
    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    pqueue = self.postingqueue
    rqueue = self.resultsqueue

    for _ in xrange(self.procs):
        pqueue.put((-1, doccount))

    #print "Joining..."
    t = now()
    for task in self.tasks:
        task.join()
    #print "Join:", now() - t

    #print "Getting results..."
    t = now()
    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length
    #print "Results:", now() - t

    #print "Writing lengths..."
    t = now()
    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()
    #print "Lengths:", now() - t

    t = now()
    iterator = imerge([read_run(runname, count) for runname, count in runs])
    total = sum(count for runname, count in runs)
    write_postings(self.schema, termtable, lengths, postingwriter, iterator)
    for runname, count in runs:
        os.remove(runname)
    #print "Merge:", now() - t

    self.cleanup()
def cache_messages(self, archive, cache):
    print("Caching messages in %s..." % cache)

    if not os.path.exists(archive):
        raise Exception("Archive file %r does not exist" % archive)

    t = now()
    f = open(cache, "wb")
    c = 0
    for d in self.get_messages(archive):
        c += 1
        dump(d, f)
        if not c % 1000:
            print(c)
    f.close()
    print("Cached messages in ", now() - t, "seconds")
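# A hypothetical counterpart to cache_messages() above: iterating the pickled
# message dicts back out of the cache file. `load` is assumed to be
# pickle.load, matching the `dump` call used above.
from pickle import load

def read_cached_messages(cache):
    with open(cache, "rb") as f:
        while True:
            try:
                yield load(f)
            except EOFError:
                return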
def prepare(self, top_searcher, q, context):
    """This method is called before a search.

    Subclasses can override this to perform set-up work, but they should
    still call the superclass's method because it sets several necessary
    attributes on the collector object:

    self.top_searcher
        The top-level searcher.
    self.q
        The query object.
    self.context
        ``context.needs_current`` controls whether a wrapping collector
        requires that this collector's matcher be in a valid state at every
        call to ``collect()``. If this is ``False``, the collector is free
        to use faster methods that don't necessarily keep the matcher
        updated, such as ``matcher.all_ids()``.

    :param top_searcher: the top-level :class:`whoosh.searching.Searcher`
        object.
    :param q: the :class:`whoosh.query.Query` object being searched for.
    :param context: a :class:`whoosh.searching.SearchContext` object
        containing information about the search.
    """

    self.top_searcher = top_searcher
    self.q = q
    self.context = context

    self.starttime = now()
    self.runtime = None
    self.docset = set()
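# A minimal sketch (not part of the original source) of a collector subclass
# following the contract described in prepare() above. It assumes the
# whoosh.collectors.Collector base class and simply counts collected
# documents, calling the superclass method so the required attributes are
# still set up.
from whoosh import collectors

class CountingCollector(collectors.Collector):
    def prepare(self, top_searcher, q, context):
        # Let the base class set top_searcher, q, context, starttime, docset
        collectors.Collector.prepare(self, top_searcher, q, context)
        self.counter = 0

    def collect(self, sub_docnum):
        # Called once per matched document in the current subsearcher
        self.counter += 1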
def _complex_sort_query(self, q, limit=None, reverse=False, filter=None):
    t = now()
    if self.arrays is None:
        self._complex_cache()
    comb = self.searcher._filter_to_comb(filter)
    docnums = [docnum for docnum in self.searcher.docs_for_query(q)
               if (not comb) or docnum in comb]
    docnums.sort(key=self._complex_key_fn, reverse=reverse)
    docset = set(docnums)

    # I artificially enforce the limit here, even though the current
    # implementation can't use it, so that the results don't change based
    # on single- vs- multi-segment.
    if limit:
        docnums = docnums[:limit]
    runtime = now() - t
    return self._results(q, docnums, docset, runtime)
def index(self, lib):
    print("Indexing with %s..." % lib)

    options = self.options
    every = None if options.every is None else int(options.every)
    merge = options.merge
    chunk = int(options.chunk)
    skip = int(options.skip)
    upto = int(options.upto)
    count = 0
    skipc = skip

    starttime = chunkstarttime = now()

    lib.indexer()

    for d in self.spec.documents():
        skipc -= 1
        if not skipc:
            lib.index_document(d)
            count += 1
            skipc = skip

            if chunk and not count % chunk:
                t = now()
                sofar = t - starttime
                print("Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s"
                      % (count, t - chunkstarttime, chunk, sofar, count / sofar))
                chunkstarttime = t

            if count > upto:
                break

            if every and not count % every:
                print("----Commit")
                lib.finish(merge=merge)
                lib.indexer(create=False)

    spooltime = now()
    print("Spool time:", spooltime - starttime)
    lib.finish(merge=merge)
    committime = now()
    print("Commit time:", committime - spooltime)
    totaltime = committime - starttime
    print("Total time to index %d documents: %0.3f secs (%0.3f minutes)"
          % (count, totaltime, totaltime / 60.0))
    print("Indexed %0.3f docs/s" % (count / totaltime))
def sort_query(self, query, sortedby, reverse=False):
    if isinstance(sortedby, basestring):
        sorter = self._field_sorter(sortedby)
    elif isinstance(sortedby, (list, tuple)):
        sorter = scoring.MultiFieldSorter([self._field_sorter(fname)
                                           for fname in sortedby])
    elif isinstance(sortedby, Sorter):
        sorter = sortedby
    else:
        raise ValueError("sortedby argument (%r) must be a string, list,"
                         " or Sorter" % sortedby)

    t = now()
    sorted_docs = list(sorter.order(self, query.docs(self), reverse=reverse))
    runtime = now() - t

    return Results(self, query, sorted_docs, None, runtime)
def test_20000_single():
    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000single") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        for i in xrange(20000):
            w = ix.writer()
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
            w.commit()
        print("Write single:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize single:", now() - t)
def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
    ix = index.open_dir(ixdir)
    qp = qparser.QueryParser("title", ix.schema)
    q = qp.parse(qstring)

    with ix.searcher(weighting=scoring.PL2()) as s:
        if scores:
            r = s.search(q, limit=limit, optimize=optimize)
            for hit in r:
                print_record(hit.rank, basedir, hit["file"], hit["pos"])
            print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
        else:
            t = now()
            for i, docnum in enumerate(s.docs_for_query(q)):
                if not limit or i < limit:
                    fields = s.stored_fields(docnum)
                    print_record(i, basedir, fields["file"], fields["pos"])
            print("Found %d records in %0.06f seconds" % (i, now() - t))
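# A hypothetical driver for make_index() and search() above; the directory
# names and the query string are placeholders rather than values taken from
# the original script.
if __name__ == "__main__":
    make_index("marc_data", "marc_index", procs=4, limitmb=128)
    search("whaling voyages", "marc_index", "marc_data", limit=10)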
def finish(self):
    """This method is called after a search.

    Subclasses can override this to perform clean-up work, but they should
    still call the superclass's method because it sets several necessary
    attributes on the collector object:

    self.runtime
        The time (in seconds) the search took.
    """

    self.runtime = now() - self.starttime
def test_20000_buffered():
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)
def index_document(self, d):
    try:
        self.archive.indexDictionary(str(self.count), d)
    except ValueError:
        print("d=", d)
        raise
    self.count += 1
    if not self.count % int(self.options.batch):
        t = now()
        self.archive.store(lazy=True)
        self.indexer(create=False)
def dump_run(self):
    if self.size > 0:
        #print "Dumping run..."
        t = now()
        filename = self.unique_name(".run")
        runfile = open(filename, "w+b")
        self.postings.sort()
        for p in self.postings:
            dump(p, runfile)
        runfile.close()

        self.runs.append((filename, self.count))
        self.postings = []
        self.size = 0
        self.count = 0
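# A self-contained sketch of the external-sort pattern behind dump_run():
# each run file holds pickled postings in sorted order, and the runs can be
# merged back into one sorted stream lazily. read_pickled_run and merge_runs
# are illustrative stand-ins, not the read_run()/imerge() helpers used
# elsewhere in this code.
import heapq
from pickle import load

def read_pickled_run(filename, count):
    with open(filename, "rb") as f:
        for _ in range(count):
            yield load(f)

def merge_runs(runs):
    # runs is a list of (filename, count) pairs, as built by dump_run()
    return heapq.merge(*(read_pickled_run(name, count)
                         for name, count in runs))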
from contextlib import contextmanager

@contextmanager
def timing(name=None):
    t = now()
    yield
    t = now() - t
    print("%s: %0.06f s" % (name or '', t))
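# Example use of the timing() context manager above; the timed body is an
# arbitrary placeholder workload.
import time

with timing("sleep 0.1s"):
    time.sleep(0.1)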
    content = fields.TEXT(spelling=True, analyzer=ana)
    chapter = fields.ID(sortable=True)
    size = fields.NUMERIC(sortable=True)
    lastopened = fields.TEXT(sortable=True)
    lastchanged = fields.TEXT(sortable=True)
    created = fields.TEXT(sortable=True)


ix = index.create_in(indexdir, PydocSchema)
with ix.writer(limitmb=2048) as w:
    t = now()
    for dirpath, dirnames, filenames in os.walk(sourcedir):
        chapter = unicode(os.path.basename(dirpath))
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            size = os.path.getsize(filepath)
            path = dirpath
            fileName, fileExt = os.path.splitext(filename)
            fileName = unicode(fileName, errors='ignore')
            fileExt = unicode(fileExt, errors='ignore')
            data = None
            lasto = time.ctime(os.stat(filepath).st_atime)
            lasto = unicode(lasto[4:])
schema = fields.Schema(tags=fields.KEYWORD(stored=True,
                                           vector=formats.Existence()))

if not os.path.exists(dirname):
    os.mkdir(dirname)

reindex = False
if reindex or not index.exists_in(dirname):
    tags = []
    for _ in xrange(tagcount):
        tag = u"".join(random.choice(string.ascii_lowercase)
                       for _ in xrange(5))
        tags.append(tag)

    ix = index.create_in(dirname, schema)
    t = now()
    with ix.writer() as w:
        for i in xrange(doccount):
            doc = u" ".join(random.sample(tags, random.randint(10, 20)))
            w.add_document(tags=doc)
            if not i % 10000:
                print i
    print now() - t

ix = index.open_dir(dirname)
with ix.searcher() as s:
    tags = list(s.lexicon("tags"))
    facet = sorting.FieldFacet("tags", allow_overlap=True)
    qtag = random.choice(tags)
    print "tag=", qtag
    q = query.Term("tags", qtag)
dirname = "testindex" schema = fields.Schema(tags=fields.KEYWORD(stored=True, vector=formats.Existence())) if not os.path.exists(dirname): os.mkdir(dirname) reindex = False if reindex or not index.exists_in(dirname): tags = [] for _ in xrange(tagcount): tag = u"".join(random.choice(string.ascii_lowercase) for _ in xrange(5)) tags.append(tag) ix = index.create_in(dirname, schema) t = now() with ix.writer() as w: for i in xrange(doccount): doc = u" ".join(random.sample(tags, random.randint(10, 20))) w.add_document(tags=doc) if not i % 10000: print i print now() - t ix = index.open_dir(dirname) with ix.searcher() as s: tags = list(s.lexicon("tags")) facet = sorting.FieldFacet("tags", allow_overlap=True) qtag = random.choice(tags) print "tag=", qtag
def __exit__(self, exc_type, exc_val, exc_tb):
    if not exc_type:
        print "%0.8f" % (now() - self.t)
def _simple_sort_query(self, q, limit=None, reverse=False, filter=None):
    # If the direction of all sort fields is the same, we can use field
    # caches to do the sorting
    t = now()
    docset = set()
    sortedby = [c[0] for c in self.criteria]
    reverse = self.criteria[0][1] ^ reverse
    comb = self.searcher._filter_to_comb(filter)

    if self.searcher.subsearchers:
        heap = []

        # I wish I could actually do a heap thing here, but the Python heap
        # queue only works with greater-than, and I haven't thought of a
        # smart way to get around that yet, so I'm being dumb and using
        # nlargest/nsmallest on the heap + each subreader list :(
        op = nlargest if reverse else nsmallest

        for s, offset in self.searcher.subsearchers:
            # This searcher is wrapping a MultiReader, so push the sorting
            # down to the leaf readers and then combine the results.
            docnums = [docnum for docnum in q.docs(s)
                       if (not comb) or docnum + offset in comb]

            # Add the docnums to the docset
            docset.update(docnums)

            # Ask the reader to return a list of (key, docnum) pairs to
            # sort by. If limit=None, the returned list is not sorted. If
            # limit=True, it is sorted.
            r = s.reader()
            srt = r.key_docs_by(sortedby, docnums, limit, reverse=reverse,
                                offset=offset)
            if limit:
                # Pick the "limit" smallest/largest items from the current
                # and new list
                heap = op(limit, heap + srt)
            else:
                # If limit=None, we'll just add everything to the "heap"
                # and sort it at the end.
                heap.extend(srt)

        # Sort the heap and take the docnums
        docnums = [docnum for _, docnum in sorted(heap, reverse=reverse)]

    else:
        # This searcher is wrapping an atomic reader, so we don't need to
        # get tricky combining the results of multiple readers, just ask
        # the reader to sort the results.
        r = self.searcher.reader()
        docnums = [docnum for docnum in q.docs(self.searcher)
                   if (not comb) or docnum in comb]
        docnums = r.sort_docs_by(sortedby, docnums, reverse=reverse)
        docset = set(docnums)

        # I artificially enforce the limit here, even though the current
        # implementation can't use it, so that the results don't change
        # based on single- vs- multi-segment.
        docnums = docnums[:limit]

    runtime = now() - t
    return self._results(q, docnums, docset, runtime)
def download_archive(self, archive):
    print("Downloading Enron email archive to %r..." % archive)
    t = now()
    urlretrieve(self.enron_archive_url, archive)
    print("Downloaded in ", now() - t, "seconds")
def search(self, query, limit=10, sortedby=None, reverse=False,
           optimize=True):
    """Runs the query represented by the ``query`` object and returns a
    Results object.

    :param query: a :class:`whoosh.query.Query` object.
    :param limit: the maximum number of documents to score. If you're only
        interested in the top N documents, you can set limit=N to limit the
        scoring for a faster search.
    :param sortedby: if this parameter is not None, the results are sorted
        instead of scored. If this value is a string, the results are
        sorted by the field named in the string. If this value is a list or
        tuple, it is assumed to be a sequence of strings and the results
        are sorted by the fieldnames in the sequence. Otherwise 'sortedby'
        should be a scoring.Sorter object.

        The fields you want to sort by must be indexed.

        For example, to sort the results by the 'path' field::

            searcher.find(q, sortedby = "path")

        To sort the results by the 'path' field and then the 'category'
        field::

            searcher.find(q, sortedby = ("path", "category"))

        To use a sorting object::

            searcher.find(q, sortedby = scoring.FieldSorter("path", key=mykeyfn))

        Using a string or tuple simply instantiates a
        :class:`whoosh.scoring.FieldSorter` or
        :class:`whoosh.scoring.MultiFieldSorter` object for you. To get a
        custom sort order, instantiate your own ``FieldSorter`` with a
        ``key`` argument, or write a custom :class:`whoosh.scoring.Sorter`
        class.

        FieldSorter and MultiFieldSorter cache the document order, using 4
        bytes times the number of documents in the index, and taking time
        to cache. To increase performance, instantiate your own sorter and
        re-use it (but remember you need to recreate it if the index
        changes).

    :param reverse: if ``sortedby`` is not None, this reverses the
        direction of the sort.
    :param optimize: use optimizations to get faster results when possible.
    :rtype: :class:`Results`
    """

    if limit is not None and limit < 1:
        raise ValueError("limit must be >= 1")

    if sortedby is not None:
        return self.sort_query(query, sortedby, reverse=reverse)

    t = now()
    matcher = query.matcher(self)
    if isinstance(matcher, NullMatcher):
        scores = []
        docnums = []
        bitset = None
    else:
        scores, docnums, bitset = collect(self, matcher, limit,
                                          usequality=optimize)
    runtime = now() - t

    return Results(self, query, docnums, scores, runtime, docs=bitset)
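# A hypothetical call pattern for the search() method documented above,
# assuming an open index `ix` whose schema has indexed "content" and "path"
# fields; the field names and query term are placeholders.
with ix.searcher() as searcher:
    q = query.Term("content", u"whoosh")
    scored = searcher.search(q, limit=10)                    # scored, top 10
    by_path = searcher.search(q, limit=10, sortedby="path")  # sorted, not scored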
def test_bigsort():
    times = 30000
    dirname = "testindex"

    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)

    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)

    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)

    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")

    t = now()
    x = list(df.sortable_terms(s.reader(), "date"))
    print(now() - t, len(x))

    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)
class PydocSchema(fields.SchemaClass):
    path = fields.STORED
    title = fields.TEXT(stored=True, sortable=True, spelling=True,
                        analyzer=ana)
    tgrams = fields.NGRAMWORDS
    ext = fields.TEXT(stored=True, sortable=True)
    content = fields.TEXT(spelling=True, analyzer=ana)
    chapter = fields.ID(sortable=True)
    size = fields.NUMERIC(sortable=True)
    lastopened = fields.TEXT(sortable=True)
    lastchanged = fields.TEXT(sortable=True)
    created = fields.TEXT(sortable=True)


ix = index.create_in(indexdir, PydocSchema)
with ix.writer(limitmb=2048) as w:
    t = now()
    for dirpath, dirnames, filenames in os.walk(sourcedir):
        try:
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                size = os.path.getsize(filepath)
                path = dirpath
                chapter = unicode(os.path.basename(dirpath))
                fileName, fileExt = os.path.splitext(filename)
                fileName = unicode(fileName)
                fileExt = unicode(fileExt)
                data = None
                print dirpath, filename
                try:
def __enter__(self):
    self.t = now()
def test_now():
    from whoosh.util import now

    t1 = now()
    t2 = now()
    assert t1 <= t2
def results(self, qstring, cat_order, category=None, shortcuts=None,
            limit=None, cat_limit=5):
    from whoosh.util import now

    t = now()
    s = self.searcher
    limit = limit or self.limit
    showall = False
    if shortcuts:
        qstring, showall = self.expand_shortcuts(qstring, shortcuts)

    if category:
        filter = query.Term("category", category)
    else:
        filter = None

    all_q = self.make_query(qstring, "content")
    show_best = (not category
                 and all(isinstance(lq, query.Term) and lq.field() == "content"
                         for lq in all_q.leaves()))
    if show_best:
        best_q = self.make_query(qstring, "bestbet")
        best_r = s.search(best_q, limit=10)
    else:
        best_r = None

    grams_groups = None
    grams_q = self.make_query(qstring, "grams")
    if any(fn == "grams" for fn, _ in grams_q.iter_all_terms()):
        try:
            grams_r = s.search(grams_q, limit=limit, groupedby="category",
                               filter=filter)
        except query.QueryError:
            pass
        else:
            grams_groups = grams_r.groups()

    all_r = s.search(all_q, limit=limit, groupedby="category", filter=filter)
    all_groups = all_r.groups()

    # OK, this is complicated... we want to present the categories in the
    # order defined in cat_order, BUT we want categories that have grams
    # matches to come before categories that only have content matches
    final_order = []
    if grams_groups:
        # Add categories in grams_groups in the order defined by cat_order
        for cat in cat_order:
            if cat in grams_groups:
                final_order.append(cat)
        # Add any categories in grams_groups that aren't in cat_order
        final_order.extend(cat for cat in sorted(grams_groups)
                           if cat not in cat_order)
    seen = set(final_order)

    # Add categories in all_groups in the order defined by cat_order, IF
    # they weren't already added in the previous step
    for cat in cat_order:
        if cat in all_groups and cat not in seen:
            final_order.append(cat)
    # Add any categories in all_groups that weren't added in the previous
    # steps
    final_order.extend(cat for cat in sorted(all_groups)
                       if cat not in cat_order and cat not in seen)

    # If there's only one category, there's no point in cutting it off,
    # just show all hits
    showall = showall or len(final_order) == 1

    # For each category, pull out the docnums and get their stored fields
    length = 0
    categories = []
    for cat in final_order:
        # Combine the docnums for this category from grams and all
        docnums = []
        seen = set()
        if grams_groups:
            for docnum in grams_groups.get(cat, ()):
                docnums.append(docnum)
                seen.add(docnum)
        for docnum in all_groups.get(cat, ()):
            if docnum not in seen:
                docnums.append(docnum)
                seen.add(docnum)

        # If the number of hits is exactly the limit + 1, then there's no
        # point showing a "show more" line instead of that one extra hit,
        # so just increase the limit in that case
        if len(docnums) == cat_limit + 1:
            cutoff = len(docnums)
        else:
            cutoff = cat_limit
        if not showall and len(docnums) > cutoff:
            docnums = docnums[:cutoff]

        length += len(seen)
        docs = [s.stored_fields(docnum) for docnum in docnums]
        categories.append((cat, docs, len(seen)))

    sent = now()
    runtime_ms = (sent - t) * 1000
    return {
        "qstring": qstring,
        "best": best_r,
        "category": category,
        "categories": categories,
        "length": length,
        "limit": limit,
        "hits": all_r,
        "sent": sent,
        "runtime": runtime_ms,
    }
def test_bigsort():
    times = 30000
    dirname = "testindex"

    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)

    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)

    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)

    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")

    t = now()
    x = list(df.sortable_values(s.reader(), "date"))
    print(now() - t, len(x))

    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)
ana = analysis.StemmingAnalyzer(stoplist=stoplists["en"], maxsize=40)


class EmailSchema(fields.SchemaClass):
    subject = fields.TEXT(stored=True, sortable=True, analyzer=ana)
    sgrams = fields.NGRAMWORDS
    body = fields.TEXT(stored=True, spelling=True, analyzer=ana)
    sender = fields.TEXT(phrase=False, stored=True)
    sent = fields.DATETIME(sortable=True)
    filename = fields.STORED
    key = fields.STORED


ix = index.create_in(indexdir, EmailSchema)
with ix.writer(limitmb=1024) as w:
    t = now()
    parser = email.parser.Parser()
    for filename in os.listdir(sourcedir):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(sourcedir, filename)
        print(filepath)

        mbox = mailbox.mbox(filepath)
        for key, msg in mbox.iteritems():
            sent = None
            if msg["date"]:
                sent_tuple = email.utils.parsedate_tz(msg["date"])
                if sent_tuple and sent_tuple[0] >= 2000:
                    tm = email.utils.mktime_tz(sent_tuple)
                    sent = datetime.datetime.fromtimestamp(tm)