def test_merged_lengths(self):
    """Field lengths must be preserved across a merged commit followed by
    two NO_MERGE commits, i.e. across segment boundaries."""
    schema = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                           f2=fields.KEYWORD(stored=True, scorable=True))
    storage = store.RamStorage()
    ix = index.Index(storage, schema, create=True)

    # First segment: normal (merging) commit.
    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A B C", f2=u"X")
    writer.add_document(f1=u"B C D E", f2=u"Y Z")
    writer.commit()

    # Second segment: committed without merging.
    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A", f2=u"B C D E X Y")
    writer.add_document(f1=u"B C", f2=u"X")
    writer.commit(writing.NO_MERGE)

    # Third segment: also committed without merging.
    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A B X Y Z", f2=u"B C")
    writer.add_document(f1=u"Y X", f2=u"A B")
    writer.commit(writing.NO_MERGE)

    reader = ix.doc_reader()
    self.assertEqual(reader[0]["f1"], u"A B C")
    # Check lengths for documents living in different segments.
    self.assertEqual(reader.doc_field_length(0, "f1"), 3)
    self.assertEqual(reader.doc_field_length(2, "f2"), 6)
    self.assertEqual(reader.doc_field_length(4, "f1"), 5)
def test_deletion(self):
    """Deleting by term should mark one document deleted; optimizing should
    drop its postings from the lexicon."""
    schema = fields.Schema(key=fields.ID, name=fields.TEXT, value=fields.TEXT)
    storage = store.RamStorage()
    ix = index.Index(storage, schema, create=True)

    writer = writing.IndexWriter(ix)
    writer.add_document(key=u"A", name=u"Yellow brown",
                        value=u"Blue red green purple?")
    writer.add_document(key=u"B", name=u"Alpha beta",
                        value=u"Gamma delta epsilon omega.")
    writer.add_document(key=u"C", name=u"One two",
                        value=u"Three four five.")
    writer.commit()

    deleted = ix.delete_by_term("key", u"B")
    self.assertEqual(deleted, 1)
    ix.commit()

    # Before optimizing, the deleted doc still counts toward doc_count_all.
    self.assertEqual(ix.doc_count_all(), 3)
    self.assertEqual(ix.doc_count(), 2)

    ix.optimize()
    self.assertEqual(ix.doc_count(), 2)

    # The deleted document's terms must be gone from the lexicon.
    reader = ix.term_reader()
    self.assertEqual(list(reader.lexicon("name")),
                     ["brown", "one", "two", "yellow"])
def test_creation(self):
    """Exercise both ways of adding documents: add_document with keyword
    arguments, and the start_document/add_field/end_document protocol."""
    schema = fields.Schema()
    # Build the schema incrementally rather than via constructor kwargs.
    for fieldname, fieldtype in [("content", fields.TEXT(phrase=True)),
                                 ("title", fields.TEXT(stored=True)),
                                 ("path", fields.ID(stored=True)),
                                 ("tags", fields.KEYWORD(stored=True)),
                                 ("quick", fields.NGRAM),
                                 ("note", fields.STORED)]:
        schema.add(fieldname, fieldtype)

    storage = store.RamStorage()
    ix = index.Index(storage, schema, create=True)
    writer = writing.IndexWriter(ix)

    # First document: one-shot API.
    writer.add_document(title=u"First", content=u"This is the first document",
                        path=u"/a", tags=u"first second third",
                        quick=u"First document",
                        note=u"This is the first document")

    # Second document: incremental field-by-field API.
    writer.start_document()
    for fieldname, text in [("content", u"Let's try this again"),
                            ("title", u"Second"),
                            ("path", u"/b"),
                            ("tags", u"Uno Dos Tres"),
                            ("quick", u"Second document"),
                            ("note", u"This is the second document")]:
        writer.add_field(fieldname, text)
    writer.end_document()

    writer.commit()
def test_frequency(self):
    """doc_frequency and frequency must report per-term document counts and
    total occurrence counts, including zero for an absent term."""
    schema = fields.Schema(content=fields.KEYWORD)
    storage = store.RamStorage()
    ix = index.Index(storage, schema, create=True)

    writer = writing.IndexWriter(ix)
    writer.add_document(content=u"A B C D E")
    writer.add_document(content=u"B B B B C D D")
    writer.add_document(content=u"D E F")
    writer.commit()

    reader = ix.term_reader()
    # (term, expected doc frequency, expected total frequency)
    for term, docfreq, termfreq in [(u"B", 2, 5), (u"E", 2, 2),
                                    (u"A", 1, 1), (u"D", 3, 4),
                                    (u"F", 1, 1), (u"Z", 0, 0)]:
        self.assertEqual(reader.doc_frequency("content", term), docfreq)
        self.assertEqual(reader.frequency("content", term), termfreq)

    # Iterating the reader yields (fieldnum, text, docfreq, termfreq).
    self.assertEqual(list(reader),
                     [(0, u"A", 1, 1), (0, u"B", 2, 5), (0, u"C", 2, 2),
                      (0, u"D", 3, 4), (0, u"E", 2, 2), (0, u"F", 1, 1)])
def _multi_segment_index(self):
    """Return a test index with three segments (the last two committed with
    NO_MERGE so they stay separate)."""
    ix = self._create_index()

    # Segment 1: merged commit.
    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A B C", f2=u"1 2 3", f3=u"X Y Z")
    writer.add_document(f1=u"D E F", f2=u"4 5 6", f3=u"Q R S")
    writer.commit()

    # Segment 2: kept separate.
    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A E C", f2=u"1 4 6", f3=u"X Q S")
    writer.add_document(f1=u"A A A", f2=u"2 3 5", f3=u"Y R Z")
    writer.commit(writing.NO_MERGE)

    # Segment 3: kept separate.
    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A B", f2=u"1 2", f3=u"X Y")
    writer.commit(writing.NO_MERGE)

    return ix
def test_integrity(self):
    """Documents added across two separate commits must all be visible and
    their terms merged into a single lexicon."""
    schema = fields.Schema(name=fields.TEXT, value=fields.TEXT)
    storage = store.RamStorage()
    ix = index.Index(storage, schema, create=True)

    # First batch.
    writer = writing.IndexWriter(ix)
    writer.add_document(name=u"Yellow brown",
                        value=u"Blue red green purple?")
    writer.add_document(name=u"Alpha beta",
                        value=u"Gamma delta epsilon omega.")
    writer.commit()

    # Second batch, written by a new writer.
    writer = writing.IndexWriter(ix)
    writer.add_document(name=u"One two", value=u"Three four five.")
    writer.commit()

    reader = ix.term_reader()
    self.assertEqual(ix.doc_count_all(), 3)
    self.assertEqual(list(reader.lexicon("name")),
                     ["alpha", "beta", "brown", "one", "two", "yellow"])
def optimize(self):
    """Optimizes this index's segments. This will fail if the index is
    already locked for writing.
    """
    # A single segment with no deletions is already optimal.
    needs_optimizing = (len(self.segments) >= 2
                        or self.segments.has_deletions())
    if not needs_optimizing:
        return

    # Imported locally to avoid a circular import at module load time.
    from whoosh import writing
    writer = writing.IndexWriter(self)
    writer.commit(writing.OPTIMIZE)
def add_scored_words(self, ws):
    """Adds a list of ("word", score) tuples to the backend dictionary.
    Associating words with a score lets you use the 'usescores' keyword
    argument of the suggest() method to order the suggestions using the
    scores.

    :ws: A sequence of ("word", score) tuples.
    """
    writer = writing.IndexWriter(self.index())
    for text, score in ws:
        # Only purely alphabetic words are indexed.
        if not text.isalpha():
            continue

        # 'doc' (not 'fields') to avoid shadowing the common module name.
        doc = {"word": text, "score": score}
        for size in xrange(self.mingram, self.maxgram + 1):
            analyzer = analysis.NgramAnalyzer(size)
            grams = [token.text for token in analyzer(text)]
            if grams:
                doc["start%s" % size] = grams[0]
                doc["end%s" % size] = grams[-1]
                doc["gram%s" % size] = " ".join(grams)

        writer.add_document(**doc)
    writer.commit()
def setUp(self):
    """Build a small five-document RAM index shared by the tests."""
    schema = fields.Schema(key=fields.ID(stored=True),
                           name=fields.TEXT, value=fields.TEXT)
    storage = store.RamStorage()
    ix = index.Index(storage, schema, create=True)

    writer = writing.IndexWriter(ix)
    for key, name, value in [
            (u"A", u"Yellow brown", u"Blue red green render purple?"),
            (u"B", u"Alpha beta", u"Gamma delta epsilon omega."),
            (u"C", u"One two", u"Three rendered four five."),
            (u"D", u"Quick went", u"Every red town."),
            (u"E", u"Yellow uptown", u"Interest rendering outer photo!")]:
        writer.add_document(key=key, name=name, value=value)
    writer.commit()

    self.ix = ix
def test_lengths_ram(self):
    """Per-document and whole-index field lengths in a RAM-backed index.

    Fix: the original computed `ls1` and `ls2` (the per-document length
    lists) and then never used them; they are now folded into assertions
    equivalent to the original individual doc_field_length checks.
    """
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    st = store.RamStorage()
    ix = index.Index(st, s, create=True)

    w = writing.IndexWriter(ix)
    w.add_document(f1=u"A B C D E", f2=u"X Y Z")
    w.add_document(f1=u"B B B B C D D Q", f2=u"Q R S T")
    w.add_document(f1=u"D E F", f2=u"U V A B C D E")
    w.commit()

    dr = ix.doc_reader()
    self.assertEqual(dr[0]["f1"], "A B C D E")

    # Per-document token counts for each scorable field.
    self.assertEqual([dr.doc_field_length(i, "f1") for i in xrange(0, 3)],
                     [5, 8, 3])
    self.assertEqual([dr.doc_field_length(i, "f2") for i in xrange(0, 3)],
                     [3, 4, 7])

    # Whole-index totals are the sums of the per-document lengths.
    self.assertEqual(ix.field_length("f1"), 16)
    self.assertEqual(ix.field_length("f2"), 14)