Example #1
File: ingest.py  Project: deezer/muzeeglot
from typing import Dict

from whoosh.writing import BufferedWriter

# get_entities_corpus, generate_eid, storage, Entity and Language are
# helpers defined elsewhere in the muzeeglot codebase.


def ingest_entities(tags_corpus: Dict, writer: BufferedWriter):
    print('INFO: evaluate entities')
    entities_corpus = get_entities_corpus()
    print('INFO: start entities ingestion')
    for veid in entities_corpus.keys():
        # Skip entities for which no tags are known.
        if veid not in tags_corpus:
            continue
        # One tag set per locale for this entity.
        tagsets = [{
            'locale': locale,
            'values': tags
        } for locale, tags in tags_corpus[veid].items()]
        eid = generate_eid()
        names = set()
        # Store the per-locale URIs and collect every known entity name.
        for locale, uri in entities_corpus[veid].items():
            names.add(Entity.name(uri))
            storage.set(f'{eid}:{locale}', uri)
        supported = Language.locales()
        # Locales that are both supported and actually carry tags.
        locales = [
            tags['locale'] for tags in tagsets
            if tags['locale'] in supported and len(tags['values']) > 0
        ]
        # One-hot flags: one boolean per supported locale.
        onehot = {locale: locale in locales for locale in supported}
        # Index each name once, carrying the locale flags as extra fields.
        for name in names:
            writer.add_document(ngram=name, name=name, eid=eid, **onehot)
        # Persist the raw tag values in per-locale lists.
        for tags in tagsets:
            locale = tags['locale']
            key = f'{eid}:{locale}:tags'
            for tag in tags['values']:
                storage.lpush(key, tag)
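For orientation, here is a minimal driver sketch for the function above. The schema, the index directory and the two locales are illustrative assumptions made up for this snippet; muzeeglot's real schema and corpus layout may differ.

import os

import whoosh.index
from whoosh.fields import BOOLEAN, ID, TEXT, Schema
from whoosh.writing import BufferedWriter

# Hypothetical schema mirroring the writer.add_document() call above:
# ngram/name/eid plus one boolean field per supported locale.
schema = Schema(ngram=TEXT(stored=True), name=TEXT(stored=True),
                eid=ID(stored=True), en=BOOLEAN, fr=BOOLEAN)

os.makedirs('index', exist_ok=True)
ix = whoosh.index.create_in('index', schema)

writer = BufferedWriter(ix, period=30, limit=500)
try:
    ingest_entities(tags_corpus, writer)  # tags_corpus built elsewhere
finally:
    writer.close()  # commits whatever is still buffered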
Example #2
def test_20000_buffered():
    import random

    from whoosh import fields
    from whoosh.compat import text_type, u, xrange
    from whoosh.util import now
    from whoosh.util.testing import TempIndex  # whoosh.support.testing in older releases
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        # Auto-commit every 100 buffered documents; period=None disables
        # the timed commits.
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)
Example #3
File: whoosh.py  Project: ajdiaz/mole
import os
import pickle
import threading
from time import sleep

from whoosh.index import create_in, open_dir
from whoosh.qparser import (FieldsPlugin, GtLtPlugin, QueryParser,
                            RangePlugin, WildcardPlugin)
from whoosh.writing import BufferedWriter

try:  # compatibility shim: this module was written for Python 2
    unicode
except NameError:
    unicode = str

# Index, SCHEMA and DEFAULT_INDEX_PATH are defined elsewhere in the
# mole codebase.


class IndexWhoosh(Index):
    """Implements the whoosh engine as indexer."""

    # Pre-built parser for exact-hash lookups in is_indexed().
    query_hash = QueryParser("hash", schema=SCHEMA)

    def create(self):
        self._indexer = create_in(self.path, SCHEMA)
        self._index = self._indexer

    def open(self, ro=False):
        if os.path.isdir(self.path):
            self._index = open_dir(self.path)
            self._indexer = self._index
        else:
            os.mkdir(self.path)
            self.create()

        self._searcher = self._index.searcher()
        self._opened = True

    def set_metadata(self, name, value):
        # Binary mode: pickle.dumps returns bytes on Python 3.
        with open(os.path.join(self.path, "metadata-%s" % name), "wb") as f:
            f.write(pickle.dumps(value))

    def get_metadata(self, name, default=None):
        try:
            with open(os.path.join(self.path, "metadata-%s" % name), "rb") as f:
                return pickle.loads(f.read())
        except (IOError, OSError, pickle.PickleError):
            return default

    def _timer(self):
        # Periodically commit whatever the buffered writer holds.
        while True:
            self.flush()
            sleep(self.flush_time)

    def __init__(self, path, size=None, rows=None, flush_time=10,
                 *args, **kwargs):
        self._opened = False
        self._index = None
        self._writer = None
        self.path = os.path.join(DEFAULT_INDEX_PATH, path)
        self.flush_time = flush_time
        # Daemon thread so the periodic flusher cannot block interpreter exit.
        self.flush_thread = threading.Thread(target=self._timer)
        self.flush_thread.daemon = True
        self.open()
        self.count = 0

    def flush(self):
        if getattr(self, "callback_flush", None):
            self.callback_flush(self)

        if self._writer is not None:
            self._writer.commit()
            self.count = 0

    def is_indexed(self, hash):
        return self._searcher.search(
            self.query_hash.parse(unicode(hash))).estimated_length() > 0

    def __call__(self, pipeline):
        # Buffer writes: commit every 10 seconds or every 1000 documents,
        # whichever comes first.
        self._writer = BufferedWriter(self._indexer, period=10, limit=1000)
        try:
            self.flush_thread.start()
            for event in pipeline:
                self.count += 1
                self._writer.add_document(source=unicode(event["source"]),
                                          name=unicode(event["index"]),
                                          raw=unicode(event["_raw"]),
                                          time=int(event.time),
                                          hash=unicode(event.hash))
        finally:
            self.flush()

    def search(self, expr, limit=10000):
        with self._index.searcher() as searcher:
            # Parse against the "raw" field; the extra plugins enable
            # field:, range, <./>, and wildcard syntax in expressions.
            query = QueryParser("raw", self._index.schema)
            query.add_plugin(FieldsPlugin())
            query.add_plugin(RangePlugin())
            query.add_plugin(GtLtPlugin())
            query.add_plugin(WildcardPlugin())
            query = query.parse(expr)
            for x in searcher.search(query, limit=limit):
                yield x

    def __iter__(self):
        for x in self.search(u"*", None):
            yield x["raw"]