def whooshFunction(dirdocs):
    crearTxt(dirdocs)
    schema = Schema(categoria=TEXT(stored=True),
                    titulo=TEXT(stored=True),
                    enlace=ID(stored=True),
                    descripcion=TEXT(analyzer=StemmingAnalyzer()),
                    fecha=fields.DATETIME(stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)
    writer = ix.writer()
    for docname in os.listdir(dirdocs):
        # Build the path once with os.path.join; the original mixed plain
        # concatenation ("dirdocs + docname") with a hard-coded Windows
        # separator, which breaks when dirdocs lacks a trailing separator.
        docpath = os.path.join(dirdocs, docname)
        if not os.path.isdir(docpath):
            fileobj = open(docpath, "r")
            cat = fileobj.readline().strip()
            tit = fileobj.readline().strip()
            enlc = fileobj.readline().strip()
            f = fileobj.readline().strip()
            fech = datetime.strptime(f, '%Y-%m-%d %H:%M:%S')
            descrp = fileobj.readline().strip()
            fileobj.close()
            writer.add_document(categoria=cat, titulo=tit, enlace=enlc,
                                descripcion=descrp, fecha=fech)
    writer.commit()

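# For reference: the five-line per-document layout whooshFunction expects,
# reconstructed from the readline() calls above. The sample values here are
# hypothetical.
#
#   Sports                       <- categoria
#   Some news headline           <- titulo
#   http://example.com/item      <- enlace
#   2017-09-22 14:30:00          <- fecha, parsed with '%Y-%m-%d %H:%M:%S'
#   Body text of the article...  <- descripcion
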
def test_pickle_schema():
    from whoosh import analysis
    from whoosh.support.charset import accent_map
    from whoosh.compat import dumps

    freetext_analyzer = (analysis.StemmingAnalyzer()
                         | analysis.CharsetFilter(accent_map))
    schema = fields.Schema(path=fields.ID(stored=True, unique=True),
                           file_mtime=fields.DATETIME(stored=True),
                           name=fields.TEXT(stored=False, field_boost=2.0),
                           description=fields.TEXT(stored=False,
                                                   field_boost=1.5,
                                                   analyzer=freetext_analyzer),
                           content=fields.TEXT(analyzer=freetext_analyzer))

    # Try to make some sentences that will require stemming
    docs = [
        u"The rain in spain falls mainly in the plain",
        u"Plainly sitting on the plain",
        u"Imagine a greatly improved sentence here"
    ]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for doc in docs:
                w.add_document(description=doc, content=doc)

        assert dumps(schema, 2)

        with ix.reader() as r:
            assert dumps(r.schema, 2)

def test_highlight_daterange():
    from datetime import datetime

    schema = fields.Schema(id=fields.ID(unique=True, stored=True),
                           title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True),
                           released=fields.DATETIME(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.update_document(
        id=u('1'),
        title=u('Life Aquatic'),
        content=u('A nautic film crew sets out to kill a gigantic shark.'),
        released=datetime(2004, 12, 25)
    )
    w.update_document(
        id=u('2'),
        title=u('Darjeeling Limited'),
        content=u('Three brothers meet in India for a life changing train '
                  + 'journey.'),
        released=datetime(2007, 10, 27)
    )
    w.commit()

    s = ix.searcher()
    r = s.search(Term('content', u('train')), terms=True)
    assert len(r) == 1
    assert r[0]["id"] == "2"
    assert (r[0].highlights("content")
            == 'for a life changing <b class="match term0">train</b> journey')

    r = s.search(DateRange('released', datetime(2007, 1, 1), None))
    assert len(r) == 1
    assert r[0].highlights("content") == ''

def borraCreaIndex():
    if os.path.exists("index"):
        shutil.rmtree("index")
    schema = Schema(remitente=KEYWORD(stored=True),
                    destinatarios=KEYWORD(stored=True),
                    fecha=fields.DATETIME(stored=True),
                    asunto=KEYWORD(stored=True),
                    cuerpo=TEXT(stored=True),
                    nombreFichero=KEYWORD(stored=True))
    if not os.path.exists("index"):
        os.mkdir("index")
        ix = create_in("index", schema)
    else:
        ix = open_dir("index")

    quedanFicheros = True
    numeroFichero = 1
    writer = ix.writer()
    while quedanFicheros:
        try:
            fichero = open("Correos/" + str(numeroFichero) + ".txt")
            texto = fichero.read()
            # Each mail file holds, one per line: sender, recipients, date,
            # subject, then the body as the remainder.
            textoPorPartes = texto.split("\n", 4)
            fechaFormat = datetime.datetime.strptime(textoPorPartes[2].strip(),
                                                     "%Y%m%d")
            writer.add_document(remitente=unicode(textoPorPartes[0]),
                                destinatarios=unicode(textoPorPartes[1]),
                                fecha=fechaFormat,
                                asunto=unicode(textoPorPartes[3]),
                                cuerpo=unicode(textoPorPartes[4]),
                                nombreFichero=unicode(str(numeroFichero) + ".txt"))
            numeroFichero = numeroFichero + 1
        except Exception:
            # Hitting a missing file number means every file has been indexed.
            print("No more files")
            writer.commit()
            quedanFicheros = False

def create_index(self):
    if not os.path.exists("twitter_index"):
        os.mkdir("twitter_index")
    schema = fields.Schema(tweet_id=fields.TEXT(stored=True),
                           batch=fields.NUMERIC(stored=True),
                           content=fields.TEXT(stored=True),
                           posted=fields.DATETIME(stored=True),
                           owner_sn=fields.TEXT(stored=True),
                           owner_id=fields.TEXT(stored=True),
                           owner_name=fields.TEXT(stored=True),
                           isRT=fields.BOOLEAN(stored=True),
                           timesRT=fields.NUMERIC(stored=True),
                           timesFav=fields.NUMERIC(stored=True),
                           orig_timesRT=fields.NUMERIC(stored=True),
                           orig_timesFav=fields.NUMERIC(stored=True),
                           hashtags=fields.KEYWORD(stored=True),
                           orgnlTweet=fields.TEXT(stored=True),
                           mentions=fields.KEYWORD(stored=True),
                           media=fields.TEXT(stored=True),
                           url=fields.TEXT(stored=True),
                           liwc=fields.TEXT(stored=True))
    self.INDEX = index.create_in("twitter_index", schema, indexname="TWTTR")
    print("New searching index successfully created")
    return self.INDEX

def test_datetime():
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)

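# Side note (a sketch, not part of the test above): DATETIME values are stored
# internally as long integers, and whoosh.util.times converts both ways, which
# is why the parsed range query compares against times.datetime_to_long(...).
from whoosh.util import times
n = times.datetime_to_long(datetime(2010, 5, 1))
assert times.long_to_datetime(n) == datetime(2010, 5, 1)
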
def whooshFunction(dirdocs):
    crearTxt(dirdocs)
    schema = Schema(titulo=TEXT(stored=True),
                    fecha=fields.DATETIME(stored=True),
                    enlace=TEXT(stored=True),
                    resumen=TEXT(stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)
    writer = ix.writer()
    for docname in os.listdir(dirdocs):
        # As in the earlier example, build the path once with os.path.join
        # instead of mixing concatenation with a hard-coded Windows separator.
        docpath = os.path.join(dirdocs, docname)
        if not os.path.isdir(docpath):
            fileobj = open(docpath, "r")
            tit = fileobj.readline().strip()
            f = fileobj.readline().strip()
            fech = datetime.strptime(f, '%d/%m/%Y - %H:%M')
            enl = fileobj.readline().strip()
            res = fileobj.readline().strip()
            fileobj.close()
            writer.add_document(titulo=tit, fecha=fech, enlace=enl, resumen=res)
    writer.commit()

def get_schema(model, analyzer):
    schema = {}
    primary = None
    searchable = set(getattr(model, '__searchable__', []))
    for field in model.__table__.columns:
        # primary key id
        if field.primary_key:
            schema[field.name] = whoosh_fields.ID(stored=True, unique=True,
                                                  sortable=True)
            primary = field.name

        if field.name not in searchable:
            continue

        # text types
        if isinstance(field.type, TEXT_TYPES):
            schema[field.name] = whoosh_fields.TEXT(analyzer=analyzer)
        elif isinstance(field.type, DATE_TYPES):
            is_unique = getattr(field, 'unique', False)
            schema[field.name] = whoosh_fields.DATETIME(unique=is_unique)
        elif isinstance(field.type, sql_types.Boolean):
            schema[field.name] = whoosh_fields.BOOLEAN()
        else:
            raise WhooshAlchemyError('cannot index column of type %s'
                                     % field.type)
    return whoosh_fields.Schema(**schema), primary

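# Hypothetical usage sketch for get_schema (not from the original source): a
# minimal SQLAlchemy model whose columns exercise each branch above. It assumes
# TEXT_TYPES covers sqlalchemy String and DATE_TYPES covers sqlalchemy
# DateTime, as the names suggest, and SQLAlchemy 1.4+ for the import location
# of declarative_base.
from sqlalchemy import Column, Integer, String, DateTime, Boolean
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Post(Base):
    __tablename__ = 'posts'
    __searchable__ = ['title', 'published', 'draft']

    id = Column(Integer, primary_key=True)  # -> ID(stored, unique, sortable)
    title = Column(String)                  # -> TEXT(analyzer=analyzer)
    published = Column(DateTime)            # -> DATETIME
    draft = Column(Boolean)                 # -> BOOLEAN

# schema, primary = get_schema(Post, StemmingAnalyzer())  # primary == 'id'
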
def test_bigsort():
    times = 30000
    dirname = "testindex"

    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)

    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)

    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)

    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")

    t = now()
    x = list(df.sortable_values(s.reader(), "date"))
    print(now() - t, len(x))

    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    # Repeat the same search to measure the effect of caching.
    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)

def _init_schema():
    schema = fields.Schema()
    schema.add("id", fields.ID(unique=True, stored=True))
    schema.add("short_id", fields.ID(stored=True))
    schema.add("status", fields.ID(stored=True))
    schema.add("started", fields.DATETIME(stored=True))
    schema.add("stopped", fields.DATETIME(stored=True))
    schema.add("pkg_type", fields.ID(stored=True))
    schema.add("pkg_name", fields.ID(stored=True))
    schema.add("pkg_version", fields.ID(stored=True))
    schema.add("model_name", fields.ID(stored=True))
    schema.add("op_name", fields.ID(stored=True))
    schema.add("label", fields.TEXT(stored=True))
    schema.add("scalar_*", fields.NUMERIC(float, stored=True), glob=True)
    schema.add("flagi_*", fields.NUMERIC(int, stored=True), glob=True)
    schema.add("flagf_*", fields.NUMERIC(int, stored=True), glob=True)
    schema.add("flagb_*", fields.BOOLEAN(stored=True), glob=True)
    schema.add("flags_*", fields.ID(stored=True), glob=True)
    schema.add("priv_*", fields.STORED, glob=True)
    return schema

class WorkspaceSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    owner = fields.TEXT(stored=True, spelling=True)
    name = fields.TEXT(stored=True, spelling=True)
    description = fields.NGRAM(stored=True, minsize=1, phrase=True)
    lastmodified = fields.DATETIME(stored=True)
    longdescription = fields.NGRAM(stored=True, minsize=1, phrase=True)
    public = fields.BOOLEAN(stored=True)
    users = fields.KEYWORD(commas=True)
    groups = fields.KEYWORD(commas=True)
    shared = fields.BOOLEAN(stored=True)

def test_query_schema_is_setup_correctly(self):
    # Given
    p = Project(name='test', path=self.root)

    # When
    p.scan()

    # Then
    schema = p._query_parser.schema
    items = schema.items()
    from whoosh import fields
    self.assertIn(('path', fields.TEXT()), items)
    self.assertIn(('ctime', fields.DATETIME()), items)
    self.assertIn(('completed', fields.BOOLEAN()), items)
    self.assertIn(('size', INT), items)

def get_index(self):
    stem_ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(
        id=fields.ID(unique=True),
        datetime=fields.DATETIME(sortable=True),
        reply=fields.BOOLEAN,
        retweet=fields.BOOLEAN,
        text=fields.TEXT(analyzer=stem_ana, stored=True)
    )
    index_dir = os.path.join(self.dir, "index")
    if os.path.exists(index_dir):
        self.index = index.open_dir(index_dir)
    else:
        os.mkdir(index_dir)
        self.index = index.create_in(index_dir, schema)

def test_nontext_update():
    schema = fields.Schema(id=fields.STORED,
                           num=fields.NUMERIC(unique=True),
                           date=fields.DATETIME(unique=True))
    ix = RamStorage().create_index(schema)

    dt = datetime.now()
    w = ix.writer()
    for i in xrange(10):
        w.add_document(id=i, num=i, date=dt + timedelta(days=i))
    w.commit()

    w = ix.writer()
    w.update_document(num=8, id="a")
    w.update_document(num=2, id="b")
    w.update_document(num=4, id="c")
    w.update_document(date=dt + timedelta(days=5), id="d")
    w.update_document(date=dt + timedelta(days=1), id="e")
    w.update_document(date=dt + timedelta(days=7), id="f")
    w.commit()

class TweetSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    url = fields.ID(stored=True, unique=True)
    text = fields.TEXT(stored=True)
    source = fields.TEXT(stored=True)
    reply = fields.BOOLEAN(stored=True)
    in_reply_to_id = fields.TEXT(stored=True)
    in_reply_to_name = fields.TEXT(stored=True)
    user_mentions = fields.KEYWORD(stored=True)
    hashtags = fields.KEYWORD(stored=True)
    urls = fields.KEYWORD(stored=True)
    geo = fields.BOOLEAN(stored=True)
    latitude = fields.NUMERIC(stored=True)
    longitude = fields.NUMERIC(stored=True)
    date = fields.DATETIME(stored=True)

def _setup_index(self):
    schema = fields.Schema(path=fields.ID(stored=True),
                           content=fields.TEXT(stored=True),
                           date=fields.DATETIME(stored=True, sortable=True))
    indexpath = os.path.join(fs.adirs.user_cache_dir, "index", self.channel)
    if not os.path.exists(indexpath):
        os.makedirs(indexpath)
    ix = create_in(indexpath, schema)
    writer = ix.writer(procs=self.indexer_procs)
    for name in os.listdir(self.log_dir):
        if name.startswith(self.channel + ".") and name.endswith(".yaml"):
            c, date = self._fields_from_yaml(name)
            writer.add_document(path=name, content=c, date=date)
    writer.commit()
    self.last_index_update = time.time()
    self.ix = ix
    lc = LoopingCall(self.update_index)
    reactor.callFromThread(lc.start, 30, now=False)

def test_open_date_ranges():
    basedate = datetime(2011, 1, 24, 6, 25, 0, 0)
    domain = [basedate + timedelta(days=n) for n in xrange(-20, 20)]

    schema = fields.Schema(date=fields.DATETIME(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for d in domain:
        w.add_document(date=d)
    w.commit()

    with ix.searcher() as s:
        # Without date parser
        qp = qparser.QueryParser("date", schema)
        q = qp.parse("[2011-01-10 to]")
        r = [hit["date"] for hit in s.search(q, limit=None)]
        assert len(r) > 0
        target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)]
        assert_equal(r, target)

        q = qp.parse("[to 2011-01-30]")
        r = [hit["date"] for hit in s.search(q, limit=None)]
        assert len(r) > 0
        target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)]
        assert_equal(r, target)

        # With date parser
        from whoosh.qparser.dateparse import DateParserPlugin
        qp.add_plugin(DateParserPlugin(basedate))

        q = qp.parse("[10 jan 2011 to]")
        r = [hit["date"] for hit in s.search(q, limit=None)]
        assert len(r) > 0
        target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)]
        assert_equal(r, target)

        q = qp.parse("[to 30 jan 2011]")
        r = [hit["date"] for hit in s.search(q, limit=None)]
        assert len(r) > 0
        target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)]
        assert_equal(r, target)

class PydocSchema(fields.SchemaClass):
    path = fields.STORED
    title = fields.TEXT(stored=True, sortable=True, spelling=True,
                        analyzer=ana)
    tgrams = fields.NGRAMWORDS
    content = fields.TEXT(spelling=True, analyzer=ana)

    chapter = fields.ID(sortable=True)
    size = fields.NUMERIC(sortable=True)
    rev = fields.NUMERIC(sortable=True)
    revised = fields.DATETIME(sortable=True)

    modref = fields.TEXT(analyzer=tech_ana, phrase=False)
    clsref = fields.TEXT(analyzer=tech_ana, phrase=False)
    funcref = fields.TEXT(analyzer=tech_ana, phrase=False)
    pep = fields.TEXT(analyzer=tech_ana, phrase=False)

    cls = fields.TEXT(analyzer=cls_ana)
    mod = fields.TEXT(analyzer=tech_ana, phrase=False)

def get_schema():
    return fields.Schema(titulo=fields.TEXT(stored=True),
                         start=fields.DATETIME(stored=True),
                         end=fields.DATETIME(stored=True),
                         descripcion=fields.TEXT(stored=True),
                         categoria=fields.TEXT(stored=True))

import hashlib

import whoosh.fields as F

# This schema defines the structure of a single knowhow snippet.
SCHEMA = F.Schema(
    # unique identifier
    id=F.ID(unique=True, stored=True),
    # a multi-valued analyzed field
    tag=F.KEYWORD(stored=True, field_boost=2.0),
    # the text content of the snippet
    content=F.TEXT(stored=True),
    # all searchable fields, for use as a default field
    text=F.TEXT(stored=False),
    # when the snippet was last modified
    updated=F.DATETIME(stored=True),
)

# Function to create a hasher object for generating the id of a snippet.
IdGenerator = hashlib.sha256

# The number of hexadecimal characters in an id
ID_LENGTH = IdGenerator().digest_size * 2


def identifier(doc):
    """
    Generate a unique identifier based solely on the content of the document.

    This doesn't take tags or anything else into account, because the content
    is what really matters. This means that adding the same content with

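# The snippet above is cut off mid-docstring. A minimal sketch (an assumption,
# not the original implementation) of how IdGenerator and ID_LENGTH fit
# together, assuming doc is a dict with a 'content' key:
def _example_identifier(doc):
    hasher = IdGenerator()
    hasher.update(doc['content'].encode('utf-8'))
    return hasher.hexdigest()  # sha256 -> 64 hex chars, i.e. ID_LENGTH
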
def open_index(self, index_folder, create_new=False):
    """
    Create a schema, and create/open a search index
    that lives on disk.
    """
    self.index_folder = index_folder
    if create_new:
        if os.path.exists(index_folder):
            shutil.rmtree(index_folder)
            print("deleted index folder: " + index_folder)

    if not os.path.exists(index_folder):
        os.mkdir(index_folder)

    exists = index.exists_in(index_folder)

    #stemming_analyzer = StemmingAnalyzer()
    stemming_analyzer = StemmingAnalyzer() | LowercaseFilter()
    #stemming_analyzer = StemmingAnalyzer() | LowercaseFilter() | StopFilter()

    # ------------------------------
    # This is where the search index's document schema
    # is defined.
    schema = Schema(
        id=fields.ID(stored=True, unique=True),
        kind=fields.ID(stored=True),
        created_time=fields.DATETIME(stored=True),
        modified_time=fields.DATETIME(stored=True),
        indexed_time=fields.DATETIME(stored=True),
        title=fields.TEXT(stored=True, field_boost=100.0),
        url=fields.ID(stored=True),
        mimetype=fields.TEXT(stored=True),
        owner_email=fields.ID(stored=True),
        owner_name=fields.TEXT(stored=True),
        # mainly for email threads, groups.io, hypothesis
        group=fields.ID(stored=True),
        repo_name=fields.TEXT(stored=True),
        repo_url=fields.ID(stored=True),
        github_user=fields.TEXT(stored=True),
        tags=fields.KEYWORD(commas=True, stored=True, lowercase=True),
        # comments only
        issue_title=fields.TEXT(stored=True, field_boost=100.0),
        issue_url=fields.ID(stored=True),
        content=fields.TEXT(stored=True, analyzer=stemming_analyzer)
    )

    # Now that we have a schema, make an index!
    if not exists:
        self.ix = index.create_in(index_folder, schema)
    else:
        self.ix = index.open_dir(index_folder)

from whoosh import fields, index
from datetime import datetime

'''class whooshSCHEMA(fields.SchemaClass):
    title = fields.TEXT(stored=True, sortable=True)
    content = fields.TEXT(spelling=True)
    date = fields.DATETIME(stored=True)
    summary = fields.STORED
    url = fields.ID(stored=True, unique=True)'''

WHOOSH_SCHEMA = fields.Schema(title=fields.TEXT(stored=True, sortable=True),
                              content=fields.TEXT(spelling=True),
                              date=fields.DATETIME(stored=True),
                              summary=fields.STORED,
                              url=fields.ID(stored=True, unique=True))

# To create an index, you basically need a writer object.
ix = index.create_in("index", schema=WHOOSH_SCHEMA)
writer = ix.writer()
writer.add_document(title="pycones 2017", content="python conference",
                    date=datetime(2017, 9, 22),
                    summary="discovering python search engine",
                    url="http://pycones.es")
writer.add_document(title="python 2017", content="pycones2017",
                    date=datetime(2017, 9, 22),
                    summary="discovering python search engine",
                    url="http://pycones.es")

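# Follow-on sketch (not in the original snippet): the writer must commit before
# the documents become searchable; after that, the DATETIME field can be
# queried with whoosh.query.DateRange.
from whoosh.query import DateRange

writer.commit()

with ix.searcher() as searcher:
    results = searcher.search(DateRange("date",
                                        datetime(2017, 9, 1),
                                        datetime(2017, 9, 30)))
    for hit in results:
        print(hit["title"])  # both sample documents fall in September 2017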