def test_intersection():
    """Check And-query matchers over documents added in two separate commits.

    Five documents are written in two writer sessions (three, then two), and
    two conjunctive term queries are checked against the expected keys.

    NOTE(review): ``_keys`` is defined elsewhere; it presumably maps the
    matched document ids back to their stored ``key`` values -- confirm.
    """
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    # One inner list per writer session / commit, in insertion order.
    batches = [
        [
            ("a", "alpha bravo charlie delta"),
            ("b", "echo foxtrot alpha bravo"),
            ("c", "charlie delta golf hotel"),
        ],
        [
            ("d", "india alpha bravo charlie"),
            ("e", "delta bravo india bravo"),
        ],
    ]
    for batch in batches:
        writer = ix.writer()
        for key, value in batch:
            writer.add_document(key=u(key), value=u(value))
        writer.commit()

    # (terms that must all occur, keys expected from the matcher)
    cases = [
        (("bravo", "delta"), ["a", "e"]),
        (("bravo", "alpha"), ["a", "b", "d"]),
    ]
    with ix.searcher() as s:
        for terms, expected_keys in cases:
            query = And([Term("value", u(term)) for term in terms])
            matcher = query.matcher(s)
            assert _keys(s, matcher.all_ids()) == expected_keys
def test_regular_and():
    """Check that each span of an And matcher starts on one of the query terms.

    Walks the matcher document by document and, for every span it reports,
    looks up the stored "text" value at the span's start position.

    NOTE(review): assumes the stored "text" value produced by ``get_index()``
    is indexable by token position (e.g. a list of words) -- confirm against
    ``get_index()``.
    """
    query_words = ("bravo", "alfa")
    index = get_index()
    with index.searcher() as searcher:
        query = And([Term("text", "bravo"), Term("text", "alfa")])
        matcher = query.matcher(searcher)
        while matcher.is_active():
            stored_text = searcher.stored_fields(matcher.id())["text"]
            for span in matcher.spans():
                assert stored_text[span.start] in query_words
            matcher.next()
def test_random_intersections():
    """Compare And-query matchers with a brute-force intersection on random docs.

    Writes ``segments`` commits of ``docsperseg`` documents each, every
    document being a random string of words from a fixed vocabulary. It then
    runs ``testcount`` random conjunctive queries and checks that:

    * ``all_ids()`` and an ``id()``/``next()`` walk yield the same ids, and
    * those ids map (via ``_keys``) to the document numbers computed by hand.

    NOTE(review): a later function in this file has the same name; that later
    definition rebinds the name at import time, so this version would not be
    the one a test runner collects -- confirm which one should stay.
    """
    vocabulary = [
        u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"),
        u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
        u("kilo"), u("lima"), u("mike"),
    ]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []  # (docnum, text) pairs kept for the manual cross-check

    schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    # One writer/commit per batch; each document is indexed and also
    # remembered in ``documents``.
    for batch in xrange(segments):
        writer = ix.writer()
        first_docnum = batch * docsperseg
        for docnum in xrange(first_docnum, first_docnum + docsperseg):
            # Pick the length first, then the words (keeps the random call
            # order stable).
            length = randint(*fieldlimits)
            text = u(" ").join(choice(vocabulary) for _ in xrange(length))
            writer.add_document(key=docnum, value=text)
            documents.append((docnum, text))
        writer.commit()

    # The segment count must not be exactly one.
    assert_not_equal(len(ix._segments()), 1)

    testcount = 20
    testlimits = (2, 5)

    with ix.searcher() as s:
        # Every document must expose a stored "key".
        for docnum in xrange(s.doc_count_all()):
            assert_not_equal(s.stored_fields(docnum).get("key"), None)

        for _ in xrange(testcount):
            # Random query words, plus the hand-computed list of documents
            # whose text contains all of them.
            word_count = randint(*testlimits)
            words = sample(vocabulary, word_count)
            target = sorted(
                docnum for docnum, text in documents
                if all(word in text for word in words)
            )

            # Two independent matchers from the same query.
            query = And([Term("value", word) for word in words])
            bulk_matcher = query.matcher(s)
            step_matcher = query.matcher(s)

            # Ids via all_ids()
            bulk_ids = list(bulk_matcher.all_ids())

            # Ids via id()/next()
            stepped_ids = []
            while step_matcher.is_active():
                stepped_ids.append(step_matcher.id())
                step_matcher.next()

            # Both traversal styles must agree with each other...
            assert_equal(bulk_ids, stepped_ids)
            # ...and with the manually calculated result.
            assert_equal(_keys(s, bulk_ids), target)
def test_random_intersections():
    """Compare And-query matchers with a brute-force intersection on random docs.

    Writes ``segments`` commits of ``docsperseg`` documents each, every
    document being a random string of words from a fixed vocabulary. It then
    runs ``testcount`` random conjunctive queries and checks that:

    * ``all_ids()`` and an ``id()``/``next()`` walk yield the same ids, and
    * those ids map (via ``_keys``) to the document numbers computed by hand.

    NOTE(review): an earlier function in this file has the same name (it uses
    the assert_equal/assert_not_equal helpers); this definition replaces it
    at import time -- confirm the duplicate is intended.
    """
    vocabulary = [
        u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"),
        u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
        u("kilo"), u("lima"), u("mike"),
    ]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []  # (docnum, text) pairs kept for the manual cross-check

    schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    # One writer/commit per batch; each document is indexed and also
    # remembered in ``documents``.
    for batch in xrange(segments):
        writer = ix.writer()
        first_docnum = batch * docsperseg
        for docnum in xrange(first_docnum, first_docnum + docsperseg):
            # Pick the length first, then the words (keeps the random call
            # order stable).
            length = randint(*fieldlimits)
            text = u(" ").join(choice(vocabulary) for _ in xrange(length))
            writer.add_document(key=docnum, value=text)
            documents.append((docnum, text))
        writer.commit()

    # The segment count must not be exactly one.
    segment_count = len(ix._segments())
    assert segment_count != 1

    testcount = 20
    testlimits = (2, 5)

    with ix.searcher() as s:
        # Every document must expose a stored "key".
        for docnum in xrange(s.doc_count_all()):
            stored_key = s.stored_fields(docnum).get("key")
            assert stored_key is not None

        for _ in xrange(testcount):
            # Random query words, plus the hand-computed list of documents
            # whose text contains all of them.
            word_count = randint(*testlimits)
            words = sample(vocabulary, word_count)
            target = sorted(
                docnum for docnum, text in documents
                if all(word in text for word in words)
            )

            # Two independent matchers from the same query.
            query = And([Term("value", word) for word in words])
            bulk_matcher = query.matcher(s)
            step_matcher = query.matcher(s)

            # Ids via all_ids()
            bulk_ids = list(bulk_matcher.all_ids())

            # Ids via id()/next()
            stepped_ids = []
            while step_matcher.is_active():
                stepped_ids.append(step_matcher.id())
                step_matcher.next()

            # Both traversal styles must agree with each other...
            assert bulk_ids == stepped_ids
            # ...and with the manually calculated result.
            assert _keys(s, bulk_ids) == target