def docs(self, name, value, counts=False):
    """Generate doc ids which contain given term, optionally with frequency counts."""
    enum = index.MultiFields.getTermDocsEnum(self.indexReader, name, util.BytesRef(value))
    # a null enum means the term does not occur in this field
    if enum:
        ids = iter(enum.nextDoc, index.PostingsEnum.NO_MORE_DOCS)
    else:
        ids = ()
    if counts:
        return ((id, enum.freq()) for id in ids)
    return iter(ids)
def items(self, *values: str) -> Iterator[document.Field]:
    """Generate indexed component fields for each of the given values."""
    # fall back to self when no docvalue-less variant of this field exists
    field_type = getattr(self, 'docValueLess', self)
    emit_docvalues = self.docvalues
    for value in values:
        for field_name, text in zip(self.names, self.values(value)):
            yield document.Field(field_name, text, field_type)
            if emit_docvalues:
                yield self.docValueClass(field_name, util.BytesRef(text))
def test_indexes(tempdir):
    """Exercise Indexer lifecycle: construction errors, add/commit/delete, snapshots, and checking."""
    # constructor validation: no directory is a TypeError; read-only mode on an empty dir raises in Java
    with pytest.raises(TypeError):
        engine.IndexSearcher()
    with pytest.raises(lucene.JavaError):
        engine.Indexer(tempdir, 'r')
    indexer = engine.Indexer()
    indexer.set('name', engine.Field.String, stored=True)
    indexer.set('text', engine.Field.Text)
    # an empty document is permitted...
    with engine.Indexer(tempdir) as temp:
        temp.add()
    # ...but an unregistered field name raises KeyError on context exit
    with pytest.raises(KeyError), engine.Indexer(tempdir) as temp:
        temp.add()
        temp.add(missing='')
    # merging in another indexer, its directory, or a path all add the same docs
    for other in (temp, temp.directory, tempdir):
        indexer += other
    assert len(indexer) == 3
    analyzer = engine.Analyzer.whitespace()
    # field values may be pre-analyzed tokens or raw BytesRefs
    indexer.add(text=analyzer.tokens('?'), name=util.BytesRef('{}'))
    indexer.commit()
    assert indexer[next(indexer.docs('text', '?'))]['name'] == '{}'
    indexer.delete('text', '?')
    # merge=True expunges deletes; merge=1 forces a single segment
    indexer.commit(merge=True)
    assert not indexer.hasDeletions()
    indexer.commit(merge=1)
    assert len(list(indexer.readers)) == 1
    # deleting the wrapped attributes makes subsequent access raise AttributeError
    reader = engine.indexers.IndexReader(indexer.indexReader)
    del reader.indexReader
    with pytest.raises(AttributeError):
        reader.maxDoc
    del indexer.indexSearcher
    with pytest.raises(AttributeError):
        indexer.search
    indexer = engine.Indexer(tempdir)
    indexer.add()
    indexer.commit()
    files = set(os.listdir(tempdir))
    path = os.path.join(tempdir, 'temp')
    # a snapshot pins a commit point even while new commits advance the generation
    with indexer.snapshot() as commit:
        indexer.commit(merge=1)
        assert indexer.indexCommit.generation > commit.generation
        engine.indexers.copy(commit, path)
        assert set(os.listdir(path)) == set(commit.fileNames) < files < set(os.listdir(tempdir))
        filepath = os.path.join(path, commit.segmentsFileName)
        os.remove(filepath)
        open(filepath, 'w').close()
        # copying over a corrupted (zero-length) segments file must fail
        with pytest.raises(OSError):
            engine.indexers.copy(commit, path)
        # checking an index while a writer holds its lock raises in Java
        with pytest.raises(lucene.JavaError):
            indexer.check(tempdir)
    del indexer
    # after releasing the snapshot, the pinned segments file is cleaned up
    assert engine.Indexer(tempdir)
    assert not os.path.exists(os.path.join(tempdir, commit.segmentsFileName))
    assert engine.IndexWriter.check(tempdir).clean
    assert not engine.IndexWriter.check(tempdir, fix=True).numBadSegments
def positions(self, name, value, payloads=False, offsets=False):
    """Generate doc ids and positions which contain given term, optionally with offsets, or only ones with payloads."""
    enum = index.MultiFields.getTermPositionsEnum(self.indexReader, name, util.BytesRef(value))
    # a null enum means the term does not occur in this field
    if not enum:
        return
    for doc in iter(enum.nextDoc, index.PostingsEnum.NO_MORE_DOCS):
        # freq() bounds how many positions may be read for the current doc
        items = (enum.nextPosition() for _ in range(enum.freq()))
        if payloads:
            # only positions which actually carry a payload are emitted
            items = ((pos, enum.payload.utf8ToString()) for pos in items if enum.payload)
        elif offsets:
            items = ((enum.startOffset(), enum.endOffset()) for pos in items)
        yield doc, list(items)
def terms(self, name, value='', stop='', counts=False, distance=0, prefix=0):
    """Generate a slice of term values, optionally with frequency counts.

    :param name: field name
    :param value: term prefix, start value (given stop), or fuzzy value (given distance)
    :param stop: optional upper bound for range terms
    :param counts: include frequency counts
    :param distance: maximum edit distance for fuzzy terms
    :param prefix: prefix length for fuzzy terms
    """
    field_terms = index.MultiFields.getTerms(self.indexReader, name)
    if not field_terms:
        return iter([])
    if distance:
        # fuzzy enumeration; lucene 6 expects a float edit distance, earlier an int
        edit = (float if lucene6 else int)(distance)
        enum = search.FuzzyTermsEnum(field_terms, util.AttributeSource(), index.Term(name, value), edit, prefix, False)
        refs = enum
    else:
        enum = field_terms.iterator()
        # position the enum at the first term >= value, which must be re-emitted
        enum.seekCeil(util.BytesRef(value))
        refs = itertools.chain([enum.term()], util.BytesRefIterator.cast_(enum))
    strings = (ref.utf8ToString() for ref in refs)
    if not distance:
        # range slice stops before `stop`; prefix slice stops once the prefix no longer matches
        if stop:
            predicate = partial(operator.gt, stop)
        else:
            predicate = operator.methodcaller('startswith', value)
        strings = itertools.takewhile(predicate, strings)
    if counts:
        return ((term, enum.docFreq()) for term in strings)
    return strings
def payload(self, data):
    """Set the payload attribute from the given bytes/str data."""
    ref = util.BytesRef(data)
    self.Payload.payload = ref
def range(cls, name: str, start, stop, lower=True, upper=False) -> 'Query':
    """Return lucene RangeQuery, by default with a half-open interval."""
    # None marks an open bound and is passed through untouched
    def as_ref(bound):
        return None if bound is None else util.BytesRef(bound)
    return cls(search.TermRangeQuery, name, as_ref(start), as_ref(stop), lower, upper)