def test_lengths(self):
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    ix = self.make_index("testindex", s)
    try:
        w = ix.writer()
        tokens = u"ABCDEFG"
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u" ".join(islice(cycle(tokens), length)))
        w.commit()

        dr = ix.doc_reader()
        ls1 = [dr.doc_field_length(i, "f1")
               for i in xrange(0, len(lengths))]
        ls2 = [dr.doc_field_length(i, "f2")
               for i in xrange(0, len(lengths))]
        self.assertEqual(ls1, [0] * len(lengths))
        self.assertEqual(ls2, lengths)
        dr.close()
        ix.close()
    finally:
        self.destroy_index("testindex")

def make_whoosh_schema():
    """
    Creates and returns the whoosh schema being used.

    Note: typically you will want to retrieve the schema from the index
    itself (ix.schema). This function exists to create a schema object
    during the creation of the index.
    """
    from whoosh import fields
    schema = fields.Schema(name=fields.TEXT,
                           rules_text=fields.TEXT,
                           flavor_text=fields.TEXT,
                           sets=fields.KEYWORD(stored=True),
                           types=fields.KEYWORD(stored=True),
                           subtypes=fields.KEYWORD(stored=True),
                           power=fields.NUMERIC,
                           toughness=fields.NUMERIC,
                           cmc=fields.NUMERIC,
                           mana_cost=fields.KEYWORD,
                           white=fields.NUMERIC,
                           blue=fields.NUMERIC,
                           black=fields.NUMERIC,
                           red=fields.NUMERIC,
                           green=fields.NUMERIC,
                           legal_formats=fields.KEYWORD(stored=True),
                           data_obj=fields.STORED)
    return schema

def test_merged_lengths(self):
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    st = store.RamStorage()
    ix = index.Index(st, s, create=True)

    w = writing.IndexWriter(ix)
    w.add_document(f1=u"A B C", f2=u"X")
    w.add_document(f1=u"B C D E", f2=u"Y Z")
    w.commit()

    w = writing.IndexWriter(ix)
    w.add_document(f1=u"A", f2=u"B C D E X Y")
    w.add_document(f1=u"B C", f2=u"X")
    w.commit(writing.NO_MERGE)

    w = writing.IndexWriter(ix)
    w.add_document(f1=u"A B X Y Z", f2=u"B C")
    w.add_document(f1=u"Y X", f2=u"A B")
    w.commit(writing.NO_MERGE)

    dr = ix.doc_reader()
    self.assertEqual(dr[0]["f1"], u"A B C")
    self.assertEqual(dr.doc_field_length(0, "f1"), 3)
    self.assertEqual(dr.doc_field_length(2, "f2"), 6)
    self.assertEqual(dr.doc_field_length(4, "f1"), 5)

def test_merged_lengths(self):
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    st = RamStorage()
    ix = st.create_index(s)

    w = ix.writer()
    w.add_document(f1=u"A B C", f2=u"X")
    w.add_document(f1=u"B C D E", f2=u"Y Z")
    w.commit()

    w = ix.writer()
    w.add_document(f1=u"A", f2=u"B C D E X Y")
    w.add_document(f1=u"B C", f2=u"X")
    w.commit(NO_MERGE)

    w = ix.writer()
    w.add_document(f1=u"A B X Y Z", f2=u"B C")
    w.add_document(f1=u"Y X", f2=u"A B")
    w.commit(NO_MERGE)

    dr = ix.reader()
    self.assertEqual(dr.stored_fields(0)["f1"], u"A B C")
    self.assertEqual(dr.doc_field_length(0, "f1"), 3)
    self.assertEqual(dr.doc_field_length(2, "f2"), 6)
    self.assertEqual(dr.doc_field_length(4, "f1"), 5)
    dr.close()

class OffersSchema(fields.SchemaClass):
    pk = fields.ID(unique=True, stored=True)
    slug = fields.ID(stored=True)
    name = fields.TEXT(stored=True)
    wishes = fields.KEYWORD(commas=True)
    target_group = fields.KEYWORD(commas=True)
    organization = fields.TEXT(stored=True, sortable=True)

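# Aside (not from the original snippet): the wishes/target_group fields above
# use KEYWORD(commas=True), which swaps the default space-separated tokenizer
# for a comma-separated one, so multi-word tags survive as single terms. A
# minimal sketch of that behavior, using made-up tag strings and Whoosh's
# FieldType.process_text helper:
from whoosh import fields

kw = fields.KEYWORD(commas=True)
# Tokens are split on commas rather than whitespace, so "free shipping"
# stays a single indexed term; expected output (approximately):
# ['gift cards', 'free shipping', 'b2b']
print(list(kw.process_text(u"gift cards, free shipping, b2b")))
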
def create_index(self):
    if not os.path.exists("twitter_index"):
        os.mkdir("twitter_index")
    schema = fields.Schema(tweet_id=fields.TEXT(stored=True),
                           batch=fields.NUMERIC(stored=True),
                           content=fields.TEXT(stored=True),
                           posted=fields.DATETIME(stored=True),
                           owner_sn=fields.TEXT(stored=True),
                           owner_id=fields.TEXT(stored=True),
                           owner_name=fields.TEXT(stored=True),
                           isRT=fields.BOOLEAN(stored=True),
                           timesRT=fields.NUMERIC(stored=True),
                           timesFav=fields.NUMERIC(stored=True),
                           orig_timesRT=fields.NUMERIC(stored=True),
                           orig_timesFav=fields.NUMERIC(stored=True),
                           hashtags=fields.KEYWORD(stored=True),
                           orgnlTweet=fields.TEXT(stored=True),
                           mentions=fields.KEYWORD(stored=True),
                           media=fields.TEXT(stored=True),
                           url=fields.TEXT(stored=True),
                           liwc=fields.TEXT(stored=True))
    self.INDEX = index.create_in("twitter_index", schema, indexname="TWTTR")
    print("New searching index successfully created")
    return self.INDEX

class WorkspaceSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    owner = fields.TEXT(stored=True, spelling=True)
    name = fields.TEXT(stored=True, spelling=True)
    description = fields.NGRAM(stored=True, minsize=1, phrase=True)
    lastmodified = fields.DATETIME(stored=True)
    longdescription = fields.NGRAM(stored=True, minsize=1, phrase=True)
    public = fields.BOOLEAN(stored=True)
    users = fields.KEYWORD(commas=True)
    groups = fields.KEYWORD(commas=True)
    shared = fields.BOOLEAN(stored=True)

def project_schema(self):
    return fields.Schema(
        path=fields.ID(stored=True, unique=True),
        name=fields.ID(stored=True),
        user=fields.ID(stored=True),
        index=fields.ID(stored=True),
        classifiers=fields.KEYWORD(commas=True, scorable=True),
        keywords=fields.KEYWORD(stored=True, commas=False, scorable=True),
        version=fields.STORED(),
        doc_version=fields.STORED(),
        type=fields.ID(stored=True),
        text_path=fields.STORED(),
        text_title=fields.STORED(),
        text=fields.TEXT(analyzer=NgramWordAnalyzer(), stored=False,
                         phrase=False))

def test_or_nots2():
    # Issue #286
    schema = fields.Schema(a=fields.KEYWORD(stored=True),
                           b=fields.KEYWORD(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(b=u("bravo"))

    with ix.searcher() as s:
        q = query.Or([query.Term("a", "alfa"),
                      query.Not(query.Term("b", "alfa"))])
        r = s.search(q)
        assert len(r) == 1

def _create_index():
    s = fields.Schema(f1=fields.KEYWORD(stored=True),
                      f2=fields.KEYWORD,
                      f3=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    return ix

def test_scoring():
    schema = fields.Schema(kind=fields.ID,
                           name=fields.KEYWORD(scorable=True, stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(kind=u("class"), name=u("Index"))
            w.add_document(kind=u("method"), name=u("add document"))
            w.add_document(kind=u("method"), name=u("add reader"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Accumulator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("get result"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Calculator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("add all"))
            w.add_document(kind=u("method"), name=u("add some"))
            w.add_document(kind=u("method"), name=u("multiply"))
            w.add_document(kind=u("method"), name=u("close"))

    with ix.searcher() as s:
        q = query.NestedParent(query.Term("kind", "class"),
                               query.Term("name", "add"))
        r = s.search(q)
        assert [hit["name"] for hit in r] == ["Calculator", "Index",
                                              "Accumulator"]

def create_whoosh_schema(self):
    schema_classname = "WhooshSchema"
    schema_classname = str(schema_classname)
    attrs = OrderedDict()
    for c in self.columns:
        if c in self.ngram_columns:
            field = fields.NGRAM(
                minsize=self.ngram_minsize,
                maxsize=self.ngram_maxsize,
                stored=True,
            )
        elif c in self.phrase_columns:
            field = fields.TEXT(stored=True)
        elif c in self.keyword_columns:
            field = fields.KEYWORD(
                lowercase=self.keyword_lowercase,
                commas=self.keyword_commas,
                stored=True,
            )
        else:
            field = fields.STORED()
        attrs[c] = field
    SchemaClass = type(schema_classname, (fields.SchemaClass,), attrs)
    schema = SchemaClass()
    return schema

def _create_index(self):
    s = fields.Schema(f1=fields.KEYWORD(stored=True),
                      f2=fields.KEYWORD,
                      f3=fields.KEYWORD)
    st = store.RamStorage()
    ix = index.Index(st, s, create=True)
    return ix

def create_whoosh_schema(self):
    """
    Dynamically create a whoosh.fields.SchemaClass schema object.
    It defines how you index your dataset.

    :rtype: SchemaClass
    """
    schema_classname = "WhooshSchema"
    schema_classname = str(schema_classname)
    attrs = OrderedDict()
    for c_setting in self.columns:
        if c_setting.type_is_ngram:
            field = fields.NGRAM(
                minsize=c_setting.ngram_minsize,
                maxsize=c_setting.ngram_maxsize,
                stored=True,
            )
        elif c_setting.type_is_phrase:
            field = fields.TEXT(stored=True)
        elif c_setting.type_is_keyword:
            field = fields.KEYWORD(
                lowercase=c_setting.keyword_lowercase,
                commas=c_setting.keyword_commas,
                stored=True,
            )
        else:
            field = fields.STORED()
        attrs[c_setting.name] = field
    SchemaClass = type(schema_classname, (fields.SchemaClass,), attrs)
    schema = SchemaClass()  # type: SchemaClass
    return schema

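# Aside (not from the original snippet): a sketch of the column settings
# create_whoosh_schema expects. The ColumnSetting container below is
# hypothetical; it only carries the attributes the method actually reads.
from collections import namedtuple

ColumnSetting = namedtuple("ColumnSetting", [
    "name", "type_is_ngram", "type_is_phrase", "type_is_keyword",
    "ngram_minsize", "ngram_maxsize", "keyword_lowercase", "keyword_commas",
])

# With self.columns set to a list like this, create_whoosh_schema() would
# produce an NGRAM "title", a TEXT "body", a KEYWORD "tags", and a STORED
# field for everything else.
columns = [
    ColumnSetting("title", True, False, False, 2, 10, None, None),
    ColumnSetting("body", False, True, False, None, None, None, None),
    ColumnSetting("tags", False, False, True, None, None, True, True),
    ColumnSetting("raw", False, False, False, None, None, None, None),
]
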
def test_batchsize_eq_doccount():
    check_multi()
    schema = fields.Schema(a=fields.KEYWORD(stored=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=4, batchsize=10) as w:
            for i in xrange(10):
                w.add_document(a=u(str(i)))

class Fields(object):
    INDEX_DIR = os.path.join(Globals.BASE_DIR, 'fields')
    INDEX = None
    SCHEMA = fields.Schema(name=fields.TEXT(analyzer=analysis.FancyAnalyzer(),
                                            stored=True, chars=True),
                           tags=fields.KEYWORD(scorable=True))

    @classmethod
    def get_index(cls):
        if cls.INDEX is None:
            if not os.path.exists(cls.INDEX_DIR):
                os.mkdir(cls.INDEX_DIR)
            if index.exists_in(cls.INDEX_DIR):
                cls.INDEX = index.open_dir(cls.INDEX_DIR)
            else:
                cls.INDEX = index.create_in(cls.INDEX_DIR, cls.SCHEMA)
                writer = cls.INDEX.writer()
                for att in Definitions.all_atts():
                    writer.add_document(name=unicode(att['ov']))
                writer.add_document(name=u'ov')
                writer.add_document(name=u'nov')
                writer.add_document(name=u'id')
                writer.commit()
        return cls.INDEX

    @classmethod
    def search(cls, query_string):
        qp = qparser.MultifieldParser(cls.SCHEMA.names(), schema=cls.SCHEMA)
        q = qp.parse(query_string)
        s = cls.get_index().searcher()
        results = s.search(q, limit=None)
        return [r['name'] for r in results]

def test_query_facet2():
    domain = u("abcdefghi")
    schema = fields.Schema(v=fields.KEYWORD(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i, ltr in enumerate(domain):
            v = "%s %s" % (ltr, domain[0 - i])
            w.add_document(v=v)

    with ix.searcher() as s:
        q1 = query.TermRange("v", "a", "c")
        q2 = query.TermRange("v", "d", "f")
        q3 = query.TermRange("v", "g", "i")

        facets = sorting.Facets()
        facets.add_query("myfacet", {"a-c": q1, "d-f": q2, "g-i": q3},
                         allow_overlap=True)
        r = s.search(query.Every(), groupedby=facets)
        assert_equal(r.groups("myfacet"), {'a-c': [0, 1, 2, 7, 8],
                                           'd-f': [4, 5],
                                           'g-i': [3, 6]})

def test_creation(self):
    s = fields.Schema()
    s.add("content", fields.TEXT(phrase=True))
    s.add("title", fields.TEXT(stored=True))
    s.add("path", fields.ID(stored=True))
    s.add("tags", fields.KEYWORD(stored=True))
    s.add("quick", fields.NGRAM)
    s.add("note", fields.STORED)
    st = store.RamStorage()
    ix = index.Index(st, s, create=True)

    w = writing.IndexWriter(ix)
    w.add_document(title=u"First", content=u"This is the first document",
                   path=u"/a", tags=u"first second third",
                   quick=u"First document",
                   note=u"This is the first document")
    w.start_document()
    w.add_field("content", u"Let's try this again")
    w.add_field("title", u"Second")
    w.add_field("path", u"/b")
    w.add_field("tags", u"Uno Dos Tres")
    w.add_field("quick", u"Second document")
    w.add_field("note", u"This is the second document")
    w.end_document()
    w.commit()

def test_index_decimals():
    from decimal import Decimal

    schema = fields.Schema(name=fields.KEYWORD(stored=True),
                           num=fields.NUMERIC(int))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with pytest.raises(TypeError):
            w.add_document(name=u("hello"), num=Decimal("3.2"))

    schema = fields.Schema(name=fields.KEYWORD(stored=True),
                           num=fields.NUMERIC(Decimal, decimal_places=5))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(name=u("hello"), num=Decimal("3.2"))

def test_removefield():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "removefield") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()

        with ix.searcher() as s:
            assert s.document(id=u("c")) == {"id": "c", "city": "cairo"}

        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        w.commit()

        ixschema = ix._current_schema()
        assert ixschema.names() == ["id"]
        assert ixschema.stored_names() == ["id"]

        with ix.searcher() as s:
            assert ("content", b("charlie")) not in s.reader()
            assert s.document(id=u("c")) == {"id": u("c")}

def test_closed_searcher():
    from whoosh.reading import ReaderClosed

    schema = fields.Schema(key=fields.KEYWORD(stored=True, sortable=True))
    with TempStorage() as st:
        ix = st.create_index(schema)
        with ix.writer() as w:
            w.add_document(key=u"alfa")
            w.add_document(key=u"bravo")
            w.add_document(key=u"charlie")
            w.add_document(key=u"delta")
            w.add_document(key=u"echo")

        s = ix.searcher()
        r = s.search(query.TermRange("key", "b", "d"))
        s.close()
        assert s.is_closed
        with pytest.raises(ReaderClosed):
            assert r[0]["key"] == "bravo"
        with pytest.raises(ReaderClosed):
            s.reader().column_reader("key")
        with pytest.raises(ReaderClosed):
            s.suggest("key", "brovo")

        s = ix.searcher()
        r = s.search(query.TermRange("key", "b", "d"))
        assert r[0]
        assert r[0]["key"] == "bravo"
        c = s.reader().column_reader("key")
        assert c[1] == "bravo"
        assert s.suggest("key", "brovo") == ["bravo"]

def test_overlapping_vector():
    schema = fields.Schema(id=fields.STORED,
                           tags=fields.KEYWORD(vector=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, tags=u("alfa bravo charlie"))
        w.add_document(id=1, tags=u("bravo charlie delta"))
        w.add_document(id=2, tags=u("charlie delta echo"))
        w.add_document(id=3, tags=u("delta echo alfa"))
        w.add_document(id=4, tags=u("echo alfa bravo"))

    with ix.searcher() as s:
        of = sorting.FieldFacet("tags", allow_overlap=True)
        cat = of.categorizer(s)
        assert cat._use_vectors

        r = s.search(query.Every(), groupedby={"tags": of})
        assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4],
                                    'charlie': [0, 1, 2], 'delta': [1, 2, 3],
                                    'echo': [2, 3, 4]}

        fcts = sorting.Facets()
        fcts.add_field("tags", allow_overlap=True)
        r = s.search(query.Every(), groupedby=fcts)
        assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4],
                                    'charlie': [0, 1, 2], 'delta': [1, 2, 3],
                                    'echo': [2, 3, 4]}

def test_not_order():
    schema = fields.Schema(id=fields.STORED,
                           count=fields.KEYWORD(lowercase=True),
                           cats=fields.KEYWORD(lowercase=True))
    qp = default.QueryParser("count", schema)

    q1 = qp.parse(u("(NOT (count:0) AND cats:1)"))
    assert q1.__class__ == query.And
    assert q1[0].__class__ == query.Not
    assert q1[1].__class__ == query.Term
    assert q1.__unicode__() == '(NOT count:0 AND cats:1)'

    q2 = qp.parse(u("(cats:1 AND NOT (count:0))"))
    assert q2.__class__ == query.And
    assert q2[0].__class__ == query.Term
    assert q2[1].__class__ == query.Not
    assert q2.__unicode__() == '(cats:1 AND NOT count:0)'

def test_batchsize_eq_doccount():
    from whoosh.filedb.multiproc import MpWriter

    schema = fields.Schema(a=fields.KEYWORD(stored=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=4, batchsize=10) as w:
            for i in xrange(10):
                w.add_document(a=u(str(i)))

def test_creation2():
    s = fields.Schema(a=fields.ID(stored=True),
                      b=fields.ID,
                      c=fields.KEYWORD(scorable=True))

    assert s.names() == ["a", "b", "c"]
    assert "a" in s
    assert "b" in s
    assert "c" in s

class CatalogueResourceSchema(fields.SchemaClass):
    pk = fields.ID(stored=True, unique=True)
    vendor_name = fields.ID
    name = fields.TEXT(stored=True)
    vendor = fields.TEXT(stored=True, spelling=True)
    version = fields.TEXT(stored=True)
    template_uri = fields.STORED
    type = fields.TEXT(stored=True)
    creation_date = fields.DATETIME
    title = fields.TEXT(stored=True, spelling=True)
    image = fields.STORED
    smartphoneimage = fields.STORED
    description = fields.TEXT(stored=True, spelling=True)
    wiring = fields.TEXT(spelling=True)
    public = fields.BOOLEAN
    users = fields.KEYWORD(commas=True)
    groups = fields.KEYWORD(commas=True)
    content = fields.NGRAMWORDS()

def test_nested_delete():
    schema = fields.Schema(kind=fields.ID,
                           name=fields.KEYWORD(scorable=True, stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(kind=u("class"), name=u("Index"))
            w.add_document(kind=u("method"), name=u("add document"))
            w.add_document(kind=u("method"), name=u("add reader"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Accumulator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("get result"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Calculator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("add all"))
            w.add_document(kind=u("method"), name=u("add some"))
            w.add_document(kind=u("method"), name=u("multiply"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Deleter"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("delete"))

    # Delete "Accumulator" class
    with ix.writer() as w:
        q = query.NestedParent(query.Term("kind", "class"),
                               query.Term("name", "Accumulator"))
        w.delete_by_query(q)

    # Check that Accumulator AND ITS METHODS are deleted
    with ix.searcher() as s:
        r = s.search(query.Term("kind", "class"))
        assert sorted(hit["name"] for hit in r) == ["Calculator", "Deleter",
                                                    "Index"]

        names = [fs["name"] for _, fs in s.iter_docs()]
        assert names == ["Index", "add document", "add reader", "close",
                         "Calculator", "add", "add all", "add some",
                         "multiply", "close", "Deleter", "add", "delete"]

    # Delete any class with a close method
    with ix.writer() as w:
        q = query.NestedParent(query.Term("kind", "class"),
                               query.Term("name", "close"))
        w.delete_by_query(q)

    # Check the CLASSES AND METHODS are gone
    with ix.searcher() as s:
        names = [fs["name"] for _, fs in s.iter_docs()]
        assert names == ["Deleter", "add", "delete"]

class TweetSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    url = fields.ID(stored=True, unique=True)
    text = fields.TEXT(stored=True)
    source = fields.TEXT(stored=True)
    reply = fields.BOOLEAN(stored=True)
    in_reply_to_id = fields.TEXT(stored=True)
    in_reply_to_name = fields.TEXT(stored=True)
    user_mentions = fields.KEYWORD(stored=True)
    hashtags = fields.KEYWORD(stored=True)
    urls = fields.KEYWORD(stored=True)
    geo = fields.BOOLEAN(stored=True)
    latitude = fields.NUMERIC(stored=True)
    longitude = fields.NUMERIC(stored=True)
    date = fields.DATETIME(stored=True)

def test_lengths():
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        tokens = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u(" ").join(islice(cycle(tokens), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1")
                   for i in xrange(0, len(lengths))]
            assert_equal(ls1, [0] * len(lengths))
            ls2 = [dr.doc_field_length(i, "f2")
                   for i in xrange(0, len(lengths))]
            assert_equal(ls2, [byte_to_length(length_to_byte(l))
                               for l in lengths])

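# Aside (not from the original snippet): the last assertion above compares
# against round-tripped values because Whoosh compresses each per-document
# field length into a single byte, so stored lengths are approximate. A
# small sketch of that quantization (assuming a Whoosh version where these
# helpers live in whoosh.util.numeric):
from whoosh.util.numeric import byte_to_length, length_to_byte

for n in [10, 20, 2, 102, 45, 3, 420, 2]:
    # Small lengths survive exactly; larger ones snap to the nearest
    # value representable in the one-byte encoding.
    print(n, byte_to_length(length_to_byte(n)))
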
def test_creation1():
    s = fields.Schema()
    s.add("content", fields.TEXT(phrase=True))
    s.add("title", fields.TEXT(stored=True))
    s.add("path", fields.ID(stored=True))
    s.add("tags", fields.KEYWORD(stored=True))
    s.add("quick", fields.NGRAM)
    s.add("note", fields.STORED)

    assert s.names() == ["content", "note", "path", "quick", "tags", "title"]
    assert "content" in s
    assert "buzz" not in s
    assert isinstance(s["tags"], fields.KEYWORD)