from typing import Dict

from whoosh.writing import BufferedWriter

# get_entities_corpus, generate_eid, Entity, Language and storage are
# project-level helpers assumed to be in scope.


def ingest_entities(tags_corpus: Dict, writer: BufferedWriter):
    print('INFO: evaluate entities')
    entities_corpus = get_entities_corpus()
    print('INFO: start entities ingestion')
    for veid in entities_corpus:
        if veid not in tags_corpus:
            continue
        tagsets = [{'locale': locale, 'values': tags}
                   for locale, tags in tags_corpus[veid].items()]
        eid = generate_eid()

        # Collect display names and persist the per-locale URI mapping.
        names = set()
        for locale, uri in entities_corpus[veid].items():
            names.add(Entity.name(uri))
            storage.set(f'{eid}:{locale}', uri)

        # One boolean flag per supported locale: True when the entity has
        # at least one tag in that locale.
        supported = Language.locales()
        locales = [tags['locale'] for tags in tagsets
                   if tags['locale'] in supported and len(tags['values']) > 0]
        onehot = {locale: locale in locales for locale in supported}

        for name in names:
            writer.add_document(ngram=name, name=name, eid=eid, **onehot)

        # Persist the tag lists per locale.
        for tags in tagsets:
            locale = tags['locale']
            key = f'{eid}:{locale}:tags'
            for tag in tags['values']:
                storage.lpush(key, tag)
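
# --- Illustrative sketch (not from the original project) ---
# ingest_entities() appears to assume a whoosh index whose schema has
# `ngram`, `name` and `eid` fields plus one boolean field per supported
# locale (the **onehot expansion above). The locale list and field types
# below are assumptions inferred from the call site, not the real schema.
from whoosh import fields, index
from whoosh.writing import BufferedWriter

SUPPORTED = ['en', 'de', 'fr']  # hypothetical locale set

schema_fields = {
    'ngram': fields.NGRAMWORDS(stored=False),  # substring matching on names
    'name': fields.TEXT(stored=True),
    'eid': fields.ID(stored=True),
}
schema_fields.update({loc: fields.BOOLEAN(stored=True) for loc in SUPPORTED})
schema = fields.Schema(**schema_fields)

ix = index.create_in('entities_index', schema)  # directory must already exist
writer = BufferedWriter(ix, period=30, limit=500)
try:
    # Mirrors the writer.add_document(...) call inside ingest_entities().
    writer.add_document(ngram='Ada Lovelace', name='Ada Lovelace', eid='E1',
                        **{loc: loc == 'en' for loc in SUPPORTED})
finally:
    writer.close()  # commits buffered documents and releases the lock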
import random

from whoosh import fields
from whoosh.compat import text_type, u, xrange
from whoosh.support.testing import TempIndex
from whoosh.util import now


def test_20000_buffered():
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        # Buffered writes: commit automatically every 100 documents, with
        # the periodic-commit timer disabled (period=None).
        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)
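
# --- Illustrative sketch (reuses `ix` and the compat helpers above) ---
# A BufferedWriter also makes documents that are still sitting in its
# memory buffer searchable, before any commit, via its searcher() method.
def demo_buffered_search(ix):
    from whoosh.qparser import QueryParser
    from whoosh.writing import BufferedWriter

    w = BufferedWriter(ix, limit=100, period=None)
    try:
        w.add_document(id=text_type(0), text=u("alfa bravo charlie"))
        # Not committed yet, but the buffered writer's searcher sees it.
        with w.searcher() as s:
            q = QueryParser("text", ix.schema).parse(u("bravo"))
            print(len(s.search(q)))  # -> 1
    finally:
        w.close()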
import os
import pickle
import threading
from time import sleep

from whoosh.index import create_in, open_dir
from whoosh.qparser import (QueryParser, FieldsPlugin, RangePlugin,
                            GtLtPlugin, WildcardPlugin)
from whoosh.writing import BufferedWriter

# Python 2 code (note `unicode`). Index, SCHEMA and DEFAULT_INDEX_PATH are
# project-level definitions assumed to be in scope.


class IndexWhoosh(Index):
    """Implements the whoosh engine as indexer."""

    query_hash = QueryParser("hash", schema=SCHEMA)

    def __init__(self, path, size=None, rows=None, flush_time=10,
                 *args, **kwargs):
        self._opened = False
        self._index = None
        self._writer = None
        self.path = os.path.join(DEFAULT_INDEX_PATH, path)
        self.flush_time = flush_time
        self.flush_thread = threading.Thread(target=self._timer)
        # Daemonize so the endless flush loop does not block interpreter exit.
        self.flush_thread.daemon = True
        self.open()
        self.count = 0

    def create(self):
        self._indexer = create_in(self.path, SCHEMA)
        self._index = self._indexer

    def open(self, ro=False):
        if os.path.isdir(self.path):
            self._index = open_dir(self.path)
            self._indexer = self._index
        else:
            os.mkdir(self.path)
            self.create()
        self._searcher = self._index.searcher()
        self._opened = True

    def set_metadata(self, name, value):
        # Pickle data is binary; open in "wb", not "w".
        with open(os.path.join(self.path, "metadata-%s" % name), "wb") as f:
            pickle.dump(value, f)

    def get_metadata(self, name, default=None):
        try:
            with open(os.path.join(self.path, "metadata-%s" % name),
                      "rb") as f:
                return pickle.load(f)
        except (IOError, OSError, pickle.PickleError):
            return default

    def _timer(self):
        # Commit buffered documents every `flush_time` seconds.
        while True:
            self.flush()
            sleep(self.flush_time)

    def flush(self):
        if getattr(self, "callback_flush", None):
            self.callback_flush(self)
        if self._writer is not None:
            self._writer.commit()
        self.count = 0

    def is_indexed(self, hash):
        results = self._searcher.search(self.query_hash.parse(unicode(hash)))
        return results.estimated_length() > 0

    def __call__(self, pipeline):
        self._writer = BufferedWriter(self._indexer, period=10, limit=1000)
        try:
            self.flush_thread.start()
            for event in pipeline:
                self.count += 1
                self._writer.add_document(source=unicode(event["source"]),
                                          name=unicode(event["index"]),
                                          raw=unicode(event["_raw"]),
                                          time=int(event.time),
                                          hash=unicode(event.hash))
        finally:
            self.flush()

    def search(self, expr, limit=10000):
        with self._index.searcher() as searcher:
            parser = QueryParser("raw", self._index.schema)
            parser.add_plugin(FieldsPlugin())
            parser.add_plugin(RangePlugin())
            parser.add_plugin(GtLtPlugin())
            parser.add_plugin(WildcardPlugin())
            query = parser.parse(expr)
            for x in searcher.search(query, limit=limit):
                yield x

    def __iter__(self):
        for x in self.search(u"*", None):
            yield x["raw"]
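
# --- Illustrative driver (not from the original project) ---
# IndexWhoosh consumes an iterable of events; judging from __call__, each
# event must behave like a mapping with "source", "index" and "_raw" keys
# and expose `time` and `hash` attributes. The Event class below is
# invented for this sketch.
class Event(dict):
    def __init__(self, source, index_name, raw, time_, hash_):
        dict.__init__(self, source=source, index=index_name, _raw=raw)
        self.time = time_
        self.hash = hash_


idx = IndexWhoosh("events")  # created under DEFAULT_INDEX_PATH/events
idx([Event("syslog", "main", "kernel: boot ok", 1700000000, "abc123")])
for hit in idx.search(u"boot", limit=10):
    print(hit["raw"])
idx._writer.close()  # cancel BufferedWriter's internal timer before exit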