class Indexer(Retriever):
    def __init__(self, lang, dataset, analyzer, index_path=None,
                 data_path=None, ram_size=2048):
        """
        Builds a Lucene index over the selected dataset so that scored
        documents can later be returned in multiple languages.

        Parameters:
            lang (str): ['en', 'es', 'de']
            dataset (str): ['mlqa_dev', 'mlqa_test', 'wiki']
            analyzer (str): ['en', 'es', 'de', 'standard']
            ram_size (int): size of the RAM buffer (in MB) used while indexing
        """
        super().__init__()
        idxdir = self.get_index(lang, dataset, index_path)
        self.mlqa = True
        if dataset == 'mlqa_dev':
            self.dataset = MLQADataset('dev', lang, lang, data_path)
        elif dataset == 'mlqa_test':
            self.dataset = MLQADataset('test', lang, lang, data_path)
        elif dataset == 'wiki':
            self.mlqa = False
            self.dataset = Wiki(lang, data_path)
        else:
            raise RuntimeError("No dataloader for {}".format(dataset))

        # stores index files on disk; concurrency is poor, try NIOFSDirectory instead
        store = SimpleFSDirectory(Paths.get(idxdir))
        # limit the max. number of tokens per document;
        # the analyzer will not consume more tokens than that
        #analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        # configuration for the index writer
        config = IndexWriterConfig(analyzers[analyzer]())
        # creates a new index or overwrites an existing one
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # set the similarity; the Lucene default is BM25Similarity(k1=1.2, b=0.75)
        similarity = BM25Similarity(self.k1, self.b)
        config.setSimilarity(similarity)
        config.setRAMBufferSizeMB(float(ram_size))
        # create the index writer
        self.writer = IndexWriter(store, config)

        self.ftdata = FieldType()
        self.ftmeta = FieldType()
        # IndexSearcher will return the value of the field
        self.ftdata.setStored(True)
        self.ftmeta.setStored(True)
        # only the data field will be analyzed by the Analyzer
        self.ftdata.setTokenized(True)
        self.ftmeta.setTokenized(False)
        # what information is stored (probably DOCS would be sufficient):
        # DOCS: only documents are indexed; term frequencies and positions are
        #   omitted. Phrase and other positional queries on the field will throw
        #   an exception, and scoring will behave as if any term in the document
        #   appears only once.
        # DOCS_AND_FREQS: only documents and term frequencies are indexed;
        #   positions are omitted. This enables normal scoring, except Phrase and
        #   other positional queries will throw an exception.
        # DOCS_AND_FREQS_AND_POSITIONS: indexes documents, frequencies and
        #   positions. This is a typical default for full-text search: full
        #   scoring is enabled and positional queries are supported.
        self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.ftmeta.setIndexOptions(IndexOptions.DOCS)

        # instantiate some reusable objects
        # TODO: create the document, add the fields, then only change field
        # values and re-add the document
        self.doc = Document()
        # a single id field cannot be reused because a document may carry
        # multiple values; a list of fields is kept and extended when needed
        #self.fieldId = Field("id", "dummy", self.ftmeta)
        self.fieldTitle = Field("title", "dummy", self.ftdata)
        self.doc.add(self.fieldTitle)
        self.fieldContext = Field("context", "dummy", self.ftdata)
        self.doc.add(self.fieldContext)
        self.fieldIds = [Field("id", "dummy", self.ftmeta)]

    def addDoc(self, ids, title, context):
        # To save resources, field objects are not created each time a new
        # document is added; fieldIds keeps the already created objects.
        for n, i in enumerate(ids):
            if n < len(self.fieldIds):
                self.fieldIds[n].setStringValue(i)
            else:
                self.fieldIds.append(Field("id", i, self.ftmeta))
            self.doc.add(self.fieldIds[n])
        self.fieldTitle.setStringValue(title)
        self.fieldContext.setStringValue(context)
        self.writer.addDocument(self.doc)
        # Because the number of ids varies per document, the id fields have to
        # be removed; otherwise they could carry values from the previous iteration.
        self.doc.removeFields("id")

    def createIndex(self):
        ids = []
        for doc in self.dataset.get():
            if self.mlqa:
                ids = doc['qid']
            self.addDoc(ids, doc['title'], doc['context'])
        self.commit()

    def commit(self):
        self.writer.commit()
        self.writer.close()
        if not self.mlqa:
            self.dataset.close()
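

# Usage sketch (an assumption, not part of the original module): build and
# commit an index over the English MLQA dev split. It presumes the PyLucene
# imports used above (SimpleFSDirectory, IndexWriter, ...) are in scope and
# that the Retriever base class supplies get_index(), self.k1 and self.b;
# the 'indexes/' and 'data/' paths are hypothetical.
if __name__ == '__main__':
    import lucene
    lucene.initVM()  # the JVM must be running before any Lucene call

    indexer = Indexer('en', 'mlqa_dev', analyzer='en',
                      index_path='indexes/', data_path='data/')
    indexer.createIndex()  # adds every document, then commits and closes the writer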