Example #1
class LuceneKeyValueStore(object):
    def __init__(self, path):
        lazyImport()
        self._writer, self._reader, self._searcher = self._getLucene(path)
        self._latestModifications = {}
        self._doc = Document()
        self._keyField = StringField("key", "", Field.Store.NO)
        self._valueField = Field("value", "", UNINDEXED_TYPE)
        self._doc.add(self._keyField)
        self._doc.add(self._valueField)

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def __setitem__(self, key, value):
        key = str(key)
        value = str(value)
        self._maybeReopen()
        self._keyField.setStringValue(key)
        self._valueField.setStringValue(value)
        self._writer.updateDocument(Term("key", key), self._doc)
        self._latestModifications[key] = value

    def __getitem__(self, key):
        key = str(key)
        # pending writes (not yet visible to the reopened reader) double
        # as a read cache
        value = self._latestModifications.get(key)
        if value is DELETED_RECORD:
            raise KeyError(key)
        if value is not None:
            return value
        self._maybeReopen()
        topDocs = self._searcher.search(TermQuery(Term("key", key)), 1)
        if topDocs.totalHits.value == 0:
            raise KeyError(key)
        return self._searcher.doc(topDocs.scoreDocs[0].doc).get("value")

    def __delitem__(self, key):
        key = str(key)
        self._writer.deleteDocuments(Term("key", key))
        self._latestModifications[key] = DELETED_RECORD

    def __len__(self):
        raise NotImplementedError

    def __iter__(self):
        raise NotImplementedError

    def items(self):
        raise NotImplementedError

    def keys(self):
        raise NotImplementedError

    def values(self):
        raise NotImplementedError

    def _getLucene(self, path):
        directory = FSDirectory.open(Paths.get(path))
        config = IndexWriterConfig(None)  # no analyzer needed: fields are not tokenized
        config.setRAMBufferSizeMB(256.0)  # faster
        config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
        writer = IndexWriter(directory, config)
        reader = writer.getReader()  # near-real-time reader on the writer
        searcher = IndexSearcher(reader)
        return writer, reader, searcher

    def _maybeReopen(self):
        # refresh the near-real-time reader once enough pending
        # modifications have accumulated
        if len(self._latestModifications) > 10000:
            newReader = DirectoryReader.openIfChanged(self._reader,
                                                      self._writer, True)
            if newReader is not None:
                self._reader.close()
                self._reader = newReader
                self._searcher = IndexSearcher(self._reader)
                self._latestModifications.clear()

    def commit(self):
        self._writer.commit()

    def close(self):
        self._writer.close()
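
A minimal usage sketch for the store above. It assumes the module-level helpers the class references (lazyImport, UNINDEXED_TYPE, DELETED_RECORD) are defined elsewhere in the same module and that lazyImport does not start the JVM itself; the path is hypothetical.

import lucene
lucene.initVM()  # the JVM must be running before any Lucene class is used

kv = LuceneKeyValueStore("/tmp/kvstore")
kv["answer"] = 42  # keys and values are coerced to str
assert kv["answer"] == "42"
assert kv.get("missing", "fallback") == "fallback"
del kv["answer"]  # marks the record as deleted
kv.commit()  # flush pending changes to disk
kv.close()

Reads consult _latestModifications first, so a write is visible immediately even though the searcher only sees index changes after _maybeReopen refreshes the near-real-time reader.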
Example #2
    def testSimple(self):
        writer = self.getWriter(analyzer=SimpleAnalyzer())

        doc = Document()
        field = Field("foo", "", TextField.TYPE_NOT_STORED)
        doc.add(field)

        dvField = FloatDocValuesField("foo_boost", 0.0)
        doc.add(dvField)

        field2 = Field("bar", "", TextField.TYPE_NOT_STORED)
        doc.add(field2)

        field.setStringValue("quick brown fox")
        field2.setStringValue("quick brown fox")
        dvField.setFloatValue(2.0)  # boost x2
        writer.addDocument(doc)

        field.setStringValue("jumps over lazy brown dog")
        field2.setStringValue("jumps over lazy brown dog")
        dvField.setFloatValue(4.0)  # boost x4
        writer.addDocument(doc)

        reader = writer.getReader()
        writer.close()

        # no boosting
        searcher1 = self.getSearcher(reader=reader)
        base = searcher1.getSimilarity(True)

        # boosting
        searcher2 = self.getSearcher(reader=reader)

        class _similarity(PythonPerFieldSimilarityWrapper):

            def __init__(_self, base):
                super(_similarity, _self).__init__()
                _self.base = base
                _self.fooSim = BoostingSimilarity(base, "foo_boost")

            def get(_self, field):
                return _self.fooSim if "foo" == field else _self.base

        searcher2.setSimilarity(_similarity(base))

        # in this case we search on field "foo"; the first document should
        # have 2x the score.
        tq = TermQuery(Term("foo", "quick"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)

        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        # assertEqual's third positional argument is the failure message,
        # not a tolerance; use assertAlmostEqual with an explicit delta
        self.assertAlmostEqual(
            boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 2.0,
            delta=SCORE_EPSILON)

        # this query matches only the second document, which should have 4x
        # the score.
        tq = TermQuery(Term("foo", "jumps"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)
        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        self.assertAlmostEqual(
            boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 4.0,
            delta=SCORE_EPSILON)

        # search on field "bar" just for kicks; nothing should change, since
        # we set up our sim provider to only use foo_boost for field "foo".
        tq = TermQuery(Term("bar", "quick"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)
        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        self.assertAlmostEqual(
            boost.scoreDocs[0].score, noboost.scoreDocs[0].score,
            delta=SCORE_EPSILON)

        reader.close()
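
The test relies on a BoostingSimilarity helper, defined elsewhere in the test module, that multiplies the base similarity's score by the per-document foo_boost value. That boost can also be read back from the index directly; below is a hedged sketch, assuming the iterator-style doc-values API of Lucene 7+ (FloatDocValuesField stores the float's raw bits in a numeric doc-values field), with readBoost being a hypothetical helper name.

import struct
from org.apache.lucene.index import MultiDocValues

def readBoost(reader, docid, field="foo_boost"):
    ndv = MultiDocValues.getNumericValues(reader, field)
    if ndv is None or not ndv.advanceExact(docid):
        return None  # field missing or no value for this document
    bits = int(ndv.longValue()) & 0xffffffff
    # reinterpret the stored 32 bits as a float (inverse of floatToRawIntBits)
    return struct.unpack('<f', struct.pack('<I', bits))[0]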
Example #3
class Indexer(Retriever):
    def __init__(self,
                 lang,
                 dataset,
                 analyzer,
                 index_path=None,
                 data_path=None,
                 ram_size=2048):
        """ Returns scored documents in multiple languages.

        Parameters:
        dataset  (str): ['mlqa_dev', 'mlqa_test', 'wiki']
        lang     (str): ['en', 'es', 'de']
        anlyzer  (str): ['en', 'es', 'de', 'standard']
        ram_size (int): Size of memory used while indexing

        Returns:
        """
        super().__init__()

        idxdir = self.get_index(lang, dataset, index_path)
        self.mlqa = True
        if dataset == 'mlqa_dev':
            self.dataset = MLQADataset('dev', lang, lang, data_path)
        elif dataset == 'mlqa_test':
            self.dataset = MLQADataset('test', lang, lang, data_path)
        elif dataset == 'wiki':
            self.mlqa = False
            self.dataset = Wiki(lang, data_path)
        else:
            raise RuntimeError("No dataloader for {}".format(dataset))

        # stores index files on disk; poor concurrency, try NIOFSDirectory instead
        store = SimpleFSDirectory(Paths.get(idxdir))
        # optionally limit the max. number of tokens per document; the
        # analyzer would then not consume more tokens than that (disabled)
        #analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        # configuration for index writer
        config = IndexWriterConfig(analyzers[analyzer]())
        # creates or overwrites index
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # set BM25 similarity (defaults: k1=1.2, b=0.75)
        similarity = BM25Similarity(self.k1, self.b)
        config.setSimilarity(similarity)
        config.setRAMBufferSizeMB(float(ram_size))
        # create index writer
        self.writer = IndexWriter(store, config)

        self.ftdata = FieldType()
        self.ftmeta = FieldType()
        # IndexSearcher will return value of the field
        self.ftdata.setStored(True)
        self.ftmeta.setStored(True)
        # will be analyzed by Analyzer
        self.ftdata.setTokenized(True)
        self.ftmeta.setTokenized(False)
        # which postings information is stored (probably DOCS would be sufficient)
        # DOCS: Only documents are indexed: term frequencies and positions are omitted.
        #       Phrase and other positional queries on the field will throw an exception,
        #       and scoring will behave as if any term in the document appears only once.
        # DOCS_AND_FREQS: Only documents and term frequencies are indexed: positions are
        #       omitted. This enables normal scoring, except Phrase and other positional
        #       queries will throw an exception.
        # DOCS_AND_FREQS_AND_POSITIONS: Indexes documents, frequencies and positions.
        #       This is a typical default for full-text search: full scoring is enabled
        #       and positional queries are supported.
        self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.ftmeta.setIndexOptions(IndexOptions.DOCS)
        # instantiate some reusable objects
        # TODO: create the document and its fields once, then only change the
        # field values and re-add the document
        self.doc = Document()
        # the id field cannot be a single reusable object because a document
        # may carry multiple id values; a list of fields is kept and grown
        # when it is too short
        #self.fieldId = Field("id", "dummy", self.ftmeta)
        self.fieldTitle = Field("title", "dummy", self.ftdata)
        self.doc.add(self.fieldTitle)
        self.fieldContext = Field("context", "dummy", self.ftdata)
        self.doc.add(self.fieldContext)
        self.fieldIds = [Field("id", "dummy", self.ftmeta)]

    def addDoc(self, ids, title, context):
        # to save resources, field objects are not created each time a new
        # document is added; fieldIds keeps the already created objects
        for n, i in enumerate(ids):
            if n < len(self.fieldIds):
                self.fieldIds[n].setStringValue(i)
            else:
                self.fieldIds.append(Field("id", i, self.ftmeta))
            self.doc.add(self.fieldIds[n])

        self.fieldTitle.setStringValue(title)
        self.fieldContext.setStringValue(context)
        self.writer.addDocument(self.doc)
        # because the number of ids varies per document, the id fields have
        # to be removed; otherwise the document could retain values from the
        # previous iteration
        self.doc.removeFields("id")

    def createIndex(self):
        ids = []
        for i, doc in enumerate(self.dataset.get()):
            if self.mlqa:
                ids = doc['qid']
            self.addDoc(ids, doc['title'], doc['context'])
        self.commit()

    def commit(self):
        self.writer.commit()
        self.writer.close()
        if not self.mlqa:
            self.dataset.close()
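
A usage sketch for the indexer, assuming the surrounding module provides the Retriever base class (supplying the k1 and b attributes), the analyzers registry, and the MLQADataset/Wiki loaders referenced above; the paths are hypothetical.

import lucene
lucene.initVM()

indexer = Indexer(lang='en',
                  dataset='mlqa_dev',
                  analyzer='en',
                  index_path='/tmp/mlqa_index',
                  data_path='/tmp/mlqa_data')
indexer.createIndex()  # iterates the dataset, adds documents, then commits

Since commit() also closes the writer, each Indexer instance can build its index only once.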