def index_scan():
    print("Scanning the index")
    # pdb.set_trace()
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    fields = MultiFields.getFields(reader)
    for field in fields:
        term = MultiFields.getTerms(reader, field)
        print(field, "->", term)
def evaluate_index(index_dir, context, analyzer):
    # eval time of indexing (overall)
    # we should also measure the elapsed time of
    # each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    # print str(vocab_size)                        # size of vocabulary
    # print str(vocabulary.getDocCount())          # #docs that have at least one term for title field
    # print str(vocabulary.getSumTotalTermFreq())  # #tokens
    # print str(vocabulary.getSumDocFreq())        # #postings

    reader.close()
    return duration, vocab_size
def get_coll_termvector(self, field):
    """Returns the collection term vector for the given field."""
    self.open_reader()
    fields = MultiFields.getFields(self.reader)
    if fields is not None:
        terms = fields.terms(field)
        if terms:
            termenum = terms.iterator(None)
            for bytesref in BytesRefIterator.cast_(termenum):
                yield bytesref.utf8ToString(), termenum
def fieldnames(self):
    indexAndTaxonomy = self._indexAndTaxonomy
    fieldnames = []
    fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader())
    if fields is None:
        return fieldnames
    iterator = fields.iterator()
    while iterator.hasNext():
        fieldnames.append(iterator.next())
    return fieldnames
def getTFForField(self, field):
    tfs = {}
    fields = MultiFields.getFields(self.reader)
    terms = fields.terms(field)
    enum = BytesRefIterator.cast_(terms.iterator(None))
    try:
        while enum.next():
            termval = TermsEnum.cast_(enum)
            termString = termval.term().utf8ToString()
            freq = self.reader.totalTermFreq(Term(field, termString))
            tfs[termString] = freq
    except:
        pass
    return tfs
def get_terms(indexReader, field='text'):
    """
    Gets all terms in an index.

    :param indexReader: IndexReader object of your index
    :param field: document field from which terms should be counted
    :return: list of terms (strings)
    """
    terms = []
    multiterms = MultiFields.getTerms(indexReader, field)
    termit = multiterms.iterator()
    # Inheritance apparently doesn't work in PyLucene, so cast explicitly...
    it = BytesRefIterator.cast_(termit)
    term = it.next()
    while term:
        terms.append(term.utf8ToString())
        term = it.next()
    return terms
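# A minimal usage sketch for get_terms() above, not part of the original
# snippet: the index path "index/" and the field name 'text' are assumed for
# illustration, and FSDirectory.open() takes a java.nio.file.Path in recent
# Lucene versions (older versions expect a java.io.File instead).
import lucene
from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory

lucene.initVM()
reader = DirectoryReader.open(FSDirectory.open(Paths.get("index/")))
try:
    for term_string in get_terms(reader, field='text'):
        print(term_string)
finally:
    reader.close()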
def get_doc_vectors(self, reader, tweets, num_docs):
    """Plot each document in a vector space."""
    terms_dict, idf_dict = self.get_dicts(reader, tweets, num_docs)
    doc_vectors = []
    liveDocs = MultiFields.getLiveDocs(reader)
    for i in range(0, reader.maxDoc()):
        if liveDocs is not None and not liveDocs.get(i):
            continue
        doc_vectors.append(DocVector(terms_dict, idf_dict))
        tfvs = reader.getTermVector(i, "contents")
        terms_enum = tfvs.iterator(None)
        for bytes_ref in util.BytesRefIterator.cast_(terms_enum):
            # fill the vector just appended; indexing by i would drift once
            # deleted documents have been skipped
            doc_vectors[-1].set_entry(bytes_ref.utf8ToString(),
                                      terms_enum.totalTermFreq())
    return doc_vectors
def test_FieldEnumeration(self):
    self.test_indexDocument()

    store = self.openStore()
    writer = None
    try:
        analyzer = self.getAnalyzer()
        writer = self.getWriter(store, analyzer, False)

        doc = Document()
        doc.add(Field("title", "value of testing", TextField.TYPE_STORED))
        doc.add(Field("docid", str(2), StringField.TYPE_NOT_STORED))
        doc.add(Field("owner", "unittester", StringField.TYPE_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("owner", "unittester", StringField.TYPE_NOT_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)
    finally:
        self.closeStore(store, writer)

    store = self.openStore()
    reader = None
    try:
        reader = DirectoryReader.open(store)
        term_enum = MultiFields.getTerms(reader, "docid").iterator()
        docids = [term.utf8ToString()
                  for term in BytesRefIterator.cast_(term_enum)]
        self.assertEqual(len(docids), 2)
    finally:
        self.closeStore(store, reader)
def termsForField(self, field, prefix=None, limit=10, **kwargs):
    convert = lambda term: term.utf8ToString()
    terms = []
    termsEnum = MultiFields.getTerms(
        self._indexAndTaxonomy.searcher.getIndexReader(), field)
    if termsEnum is None:
        return terms
    iterator = termsEnum.iterator(None)
    if prefix:
        iterator.seekCeil(BytesRef(prefix))
        terms.append((iterator.docFreq(), convert(iterator.term())))
    bytesIterator = BytesRefIterator.cast_(iterator)
    try:
        while len(terms) < limit:
            term = convert(bytesIterator.next())
            if prefix and not term.startswith(prefix):
                break
            terms.append((iterator.docFreq(), term))
    except StopIteration:
        pass
    return terms
def test_getFieldInfos(self):
    self.test_indexDocument()

    store = self.openStore()
    reader = None
    try:
        reader = DirectoryReader.open(store)
        fieldInfos = MultiFields.getMergedFieldInfos(reader)
        for fieldInfo in fieldInfos.iterator():
            self.assert_(fieldInfo.name in ['owner', 'search_name',
                                            'meta_words', 'docid', 'title'])
            if fieldInfo.isIndexed():
                self.assert_(fieldInfo.name in ['owner', 'meta_words',
                                                'docid', 'title'])
            if fieldInfo.isIndexed() and not fieldInfo.hasVectors():
                self.assert_(fieldInfo.name in ['owner', 'meta_words',
                                                'docid', 'title'])
    finally:
        store = self.closeStore(store, reader)
def get_dicts(self, reader, tweets, num_docs):
    """
    Investigate the index by constructing a term dict (term -> id)
    and an idf dict (term -> idf value).
    """
    terms_dict = {}
    idf_dict = {}
    terms_ctr = 0
    # iterate over each term in the index
    term_enum = MultiFields.getTerms(reader, "contents").iterator(None)
    for bytes_ref in util.BytesRefIterator.cast_(term_enum):
        s = bytes_ref.utf8ToString()
        terms_dict[s] = terms_ctr
        terms_ctr += 1
        # count occurrences of this term in the index and calculate idf
        doc_presence_ctr = 0
        for tweet in tweets:
            if s in tweet[1].text.lower():
                doc_presence_ctr += 1
        idf_dict[s] = log(float(num_docs) / doc_presence_ctr, 10)
    return terms_dict, idf_dict
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different
    analyzer configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    # sometimes .size() doesn't return the correct size; in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)
    reader.close()
    return duration, vocab_size
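# A hypothetical driver for evaluate_index() above, comparing two analyzer
# configurations. The directory names and analyzer choices are assumptions
# for illustration; older PyLucene builds may also require a Version argument
# when constructing analyzers.
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.en import EnglishAnalyzer

for analyzer in (StandardAnalyzer(), EnglishAnalyzer()):
    duration, vocab_size = evaluate_index('data/', 'index/', analyzer)
    print(type(analyzer).__name__, duration, vocab_size)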
from org.apache.lucene.index import MultiFields
from org.apache.lucene.util import BytesRef, BytesRefIterator
import timeit

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    start_time = timeit.default_timer()
    try:
        index = IndexFiles(os.path.dirname(sys.argv[0]), StandardAnalyzer())
        index_reader = DirectoryReader.open(index.store)

        # get vocab size
        terms = MultiFields.getTerms(index_reader, 'contents')
        termEnum = terms.iterator()
        vocabCounter = 0
        for term in BytesRefIterator.cast_(termEnum):
            vocabCounter += 1
        print("Number of docs:", index_reader.numDocs())
        print("Vocab size:", vocabCounter)
        # print min, max, mean

        querystr = 'بازار بزرگ تهران'
        print("Query: ", querystr)
        q = QueryParser("contents", index.analyzer).parse(querystr)
        hitsPerPage = 20
        searcher = IndexSearcher(index_reader)
        docs = searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs
def stats_tooltip(word, doc_id, reader):
    # content statistics
    term = Term('content', tokenize(word))
    term_text = unicode(term).replace('content:', '')
    doc_count = reader.docFreq(term)  # in how many docs the term appears
    total_term_count = reader.totalTermFreq(term)  # how many times the term appears in any doc
    n_docs = reader.getDocCount('content')  # total number of docs

    postings = MultiFields.getTermDocsEnum(reader, 'content', BytesRef(term_text))
    while postings.docID() != doc_id:  # this is bad
        postings.nextDoc()
    term_count = postings.freq()  # how many times the term appears in this doc

    similarity = ClassicSimilarity()
    tf = similarity.tf(float(term_count))  # sqrt(term_freq)
    # whether the term is common or rare among all the docs
    idf = similarity.idf(long(doc_count), long(n_docs))  # log((n_docs+1)/(doc_count+1)) + 1

    # abstract statistics
    abstract_term = Term('abstract', tokenize(word))
    abstract_doc_count = reader.docFreq(abstract_term)
    abstract_total_term_count = reader.totalTermFreq(abstract_term)
    a_idf = similarity.idf(long(abstract_doc_count), long(n_docs))

    abstract_postings = MultiFields.getTermDocsEnum(reader, 'abstract', BytesRef(term_text))
    if not abstract_postings:
        # the term appears in no document's abstract
        abstract_term_count = 0
        a_tf = 1
    else:
        while abstract_postings.docID() != doc_id:  # this is bad
            if abstract_postings.nextDoc() == abstract_postings.NO_MORE_DOCS:
                abstract_term_count = 0  # it does not appear in this document's abstract
                a_tf = 1
                break
        else:  # no break, it does appear in this document's abstract
            abstract_term_count = abstract_postings.freq()
            a_tf = similarity.tf(float(abstract_term_count))

    content_score = tf * idf ** 2 * CONTENT_BOOST
    abstract_score = a_tf * a_idf ** 2 * ABSTRACT_BOOST

    # mixing concerns like nobody's business
    return '''
    <div class="popup">
      <div class="term">{}</div>
      <table>
        <tr> <th> </th> <th>abstr</th> <th>body</th> <th>total</th> </tr>
        <tr><td>this doc</td> <td>{}</td> <td>{}</td> <td>{}</td> </tr>
        <tr><td>TF</td> <td>{:.2g}</td> <td>{:.2g}</td> <td>{:.2g}</td> </tr>
        <tr><td>nr docs</td> <td>{}</td> <td>{}</td> <td>{}</td> </tr>
        <tr><td>IDF</td> <td>{:.2g}</td> <td>{:.2g}</td> <td>{:.2g}</td> </tr>
        <tr><td>score</td> <td>{:.2g}</td> <td>{:.2g}</td> <td><b>{:.2g}</b></td> </tr>
        <tr><td>all docs</td> <td>{}</td> <td>{}</td> <td>{}</td> </tr>
      </table>
      <div class="total-docs">{}</div>
    </div>
    '''.format(
        term_text,
        abstract_term_count, term_count - abstract_term_count, term_count,
        a_tf, tf, a_tf * tf,
        abstract_doc_count, doc_count, doc_count,
        a_idf, idf, a_idf * idf,
        abstract_score, content_score, abstract_score * content_score,
        abstract_total_term_count,
        total_term_count - abstract_total_term_count, total_term_count,
        n_docs)
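# A small sanity-check sketch of the ClassicSimilarity formulas noted in the
# comments above (tf = sqrt(term_freq), idf = log((n_docs+1)/(doc_count+1)) + 1),
# computed with plain Python math over made-up counts rather than the Lucene API.
import math

term_count, doc_count, n_docs = 3, 5, 100  # hypothetical statistics
tf = math.sqrt(term_count)
idf = math.log((n_docs + 1.0) / (doc_count + 1)) + 1
print("tf=%.3f idf=%.3f tf*idf^2=%.3f" % (tf, idf, tf * idf ** 2))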
def testPayloadsPos0(self):
    writer = self.getWriter(analyzer=TestPayloadAnalyzer())

    doc = Document()
    doc.add(Field("content", "a a b c d e a f g h i j a b k k",
                  TextField.TYPE_STORED))
    writer.addDocument(doc)
    reader = writer.getReader()
    writer.close()

    tp = MultiFields.getTermPositionsEnum(reader,
                                          MultiFields.getLiveDocs(reader),
                                          "content", BytesRef("a"))
    count = 0
    self.assert_(tp.nextDoc() != tp.NO_MORE_DOCS)

    # "a" occurs 4 times
    self.assertEqual(4, tp.freq())

    expected = 0
    self.assertEqual(expected, tp.nextPosition())
    self.assertEqual(1, tp.nextPosition())
    self.assertEqual(3, tp.nextPosition())
    self.assertEqual(6, tp.nextPosition())

    # only one doc has "a"
    self.assert_(tp.nextDoc() == tp.NO_MORE_DOCS)

    searcher = self.getSearcher(reader=reader)

    stq1 = SpanTermQuery(Term("content", "a"))
    stq2 = SpanTermQuery(Term("content", "k"))
    sqs = [stq1, stq2]
    snq = SpanNearQuery(sqs, 30, False)

    count = 0
    sawZero = False
    pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
    while pspans.next():
        payloads = pspans.getPayload()
        sawZero |= pspans.start() == 0
        it = payloads.iterator()
        while it.hasNext():
            count += 1
            it.next()

    self.assertEqual(5, count)
    self.assert_(sawZero)

    spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
    count = 0
    sawZero = False
    while spans.next():
        count += 1
        sawZero |= spans.start() == 0

    self.assertEqual(4, count)
    self.assert_(sawZero)

    sawZero = False
    psu = PayloadSpanUtil(searcher.getTopReaderContext())
    pls = psu.getPayloadsForQuery(snq)
    count = pls.size()
    it = pls.iterator()
    while it.hasNext():
        bytes = JArray('byte').cast_(it.next())
        s = bytes.string_
        sawZero |= s == "pos: 0"

    self.assertEqual(5, count)
    self.assert_(sawZero)
def main():
    # constants
    FIELD_CONTENTS = "text"
    DOC_NAME = "identifier"
    STORE_DIR = "./full_index1"

    # take search term as command line argument
    if len(sys.argv) != 4:
        print('Format should be: python search_docs.py, '
              '[term to search for], redo? y/n, window_size')
        exit(0)

    # parse user input
    TERM = sys.argv[1]
    remake_df = True if sys.argv[2] == 'y' else False
    window_size = int(sys.argv[3])

    # other options
    stem_flag = True
    spell_check_flag = False

    # get dataframe
    doc_data = get_doc_df(remake_df)

    # get dictionary
    SA_dict = get_dict(stem_flag)

    print('Searching for: "' + TERM + '"')

    sa_term = []
    date_range = (1791, 1800)
    method = 'linear'  # vs 1/x
    example_flag = False

    # full_dict = pickle.load(open('./spellcheck/full_word_list.pkl'), 'rb')
    full_dict, modern_dict, map_chars, charlist = sp_ch.load_clean_word_list()

    ### replacement table
    rep_data = pickle.load(open('./spellcheck/rep_table.pkl', 'rb'))
    print(rep_data)
    rep_table = rep_data['rep_table']
    charlist = rep_data['charlist']
    try:
        map_chars = rep_data['charmap']
    except:
        map_chars = rep_data['map_chars']
    ###

    top_n = 4
    top_replacements = {}
    for cf, from_letter in enumerate(rep_table):
        sort_idx = np.argsort(from_letter)[::-1]
        # print(from_letter)
        top_rep = [sort_idx[i] for i in range(top_n)]
        # print(top_rep)
        top_replacements[charlist[cf]] = [charlist[char] for char in top_rep]

    # if not 'sentiment_vals_w_'+TERM in list(doc_data):
    if 1:  # not glob.glob('./pickles/%s_df.pkl'%TERM):
        lucene.initVM()
        searcher, reader, query = define_search_params(STORE_DIR,
                                                       FIELD_CONTENTS, TERM)

        fieldInfos = MultiFields.getMergedFieldInfos(reader)
        print(fieldInfos)
        for fieldInfo in fieldInfos.iterator():
            print(fieldInfo.name)

        # Run the query and get documents that contain the term
        docs_containing_term = searcher.search(query, reader.numDocs())

        print('Found ' + str(len(docs_containing_term.scoreDocs)) +
              ' documents with the term "' + TERM + '".')
        print('Calculating sentiment scores...')
        term_words = []
        # hits = searcher.search(query, 1)
        for hit in tqdm(docs_containing_term.scoreDocs):
            doc = searcher.doc(hit.doc)
            # get the text from each document
            doc_text = doc.get("text")  # .encode("utf-8")
            # single_doc returns the score data for a single document, and a
            # list of words that appear in the term windows for that document
            score_data, doc_words = sa.single_doc(
                TERM, doc_text, SA_dict, full_dict, top_replacements,
                window_size, spell_check_flag, example_flag, stem_flag, method)
            # print(score_data)
            term_words.append((doc.get(DOC_NAME).split('/')[-1], doc_words))
            sa_doc_score = [doc.get(DOC_NAME)] + score_data
            sa_term.append(sa_doc_score)

        sa_df = a_sa.make_sa_df(doc_data, sa_term, TERM)
        pickle.dump(sa_df, open('./pickles/%s_df.pkl' % TERM, 'wb'))
        pickle.dump(term_words, open('./pickles/%s_words.pkl' % TERM, 'wb'))
    else:
        sa_df = doc_data
    print(sa_df)

    # process dataframe for various properties
    # (split this into specific functions later)
    use_weighted = True
    total_doc = False
# titles = index.get_documents(ids, ["id", "title"])
print "\n".join(map(str, docs))
sys.exit()

fields = [DocField("id", stored=True, indexed=True),
          DocField("text", stored=True, indexed=True)]
index = Index(fields=fields)

texts = ["just writing ", "what ever dude", "el dudino", "your dude",
         "the Dude"]
for i, text in enumerate(texts):
    index.add(id='doc_%d' % (i + 1), text=text)
index.commit()

ids, scores = index.search("dude+ever", ["text"], limit=10)
print index.get_documents(ids, "id")

# Try out some filters
filter = TermsFilter([Term("id", "doc_2")])
ids, scores = index.search("dude+ever", ["text"], filter, limit=10)
print index.get_documents(ids, "id")

fields = MultiFields.getMergedFieldInfos(index.reader).iterator()
for f in fields:
    print f.attributes()
# print filter.getDocIdSet(index.reader)
def testSetPosition(self):

    class _tokenizer(PythonTokenizer):

        def __init__(_self):
            super(_tokenizer, _self).__init__()
            _self.TOKENS = ["1", "2", "3", "4", "5"]
            _self.INCREMENTS = [1, 2, 1, 0, 1]
            _self.i = 0
            _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
            _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
            _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

        def incrementToken(_self):
            if _self.i == len(_self.TOKENS):
                return False
            _self.clearAttributes()
            _self.termAtt.append(_self.TOKENS[_self.i])
            _self.offsetAtt.setOffset(_self.i, _self.i)
            _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
            _self.i += 1
            return True

        def reset(_self):
            super(_tokenizer, _self).reset()
            _self.i = 0

    class _analyzer(PythonAnalyzer):

        def createComponents(_self, fieldName):
            return Analyzer.TokenStreamComponents(_tokenizer())

        def initReader(_self, fieldName, reader):
            return reader

    writer = self.getWriter(analyzer=_analyzer())
    d = Document()
    d.add(Field("field", "bogus", TextField.TYPE_STORED))
    writer.addDocument(d)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    reader = searcher.getIndexReader()

    pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
    pos.nextDoc()
    # first token should be at position 0
    self.assertEqual(0, pos.nextPosition())

    pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
    pos.nextDoc()
    # second token should be at position 2
    self.assertEqual(2, pos.nextPosition())

    b = PhraseQuery.Builder()
    b.add(Term("field", "1"))
    b.add(Term("field", "2"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # same as previous, just specify positions explicitly.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"), 0)
    b.add(Term("field", "2"), 1)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # specifying correct positions should find the phrase.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"), 0)
    b.add(Term("field", "2"), 2)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "3"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "3"))
    b.add(Term("field", "4"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # phrase query would find it when correct positions are specified.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"), 0)
    b.add(Term("field", "4"), 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # phrase query should fail for a non-existing searched term
    # even if other searched terms exist in the same searched position.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"), 0)
    b.add(Term("field", "9"), 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # multi-phrase query should succeed for a non-existing searched term
    # because other searched terms exist in the same searched position.
    b = MultiPhraseQuery.Builder()
    b.add([Term("field", "3"), Term("field", "9")], 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "4"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "3"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "4"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))