def test_build_attrs(self):
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    self.assertEquals(adapter.indexable, False)
    self.assertEquals(adapter.doc_attrs, {})

    adapter = SAAdapter(Entity, schema)
    self.assertEquals(adapter.indexable, False)

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    self.assertEquals(adapter.indexable, True)
    self.assertEquals(
        set(adapter.doc_attrs),
        set(('object_key', 'id', 'name', 'object_type', 'text',
             'created_at', 'updated_at', 'name_prefix', 'owner',
             'owner_name', 'creator_name', 'creator')))
    self.assert_(all(callable(f) for f in adapter.doc_attrs.itervalues()))

    self.assertEquals(
        set(schema.names()),
        set(('object_key', 'id', 'object_type', 'name', 'text',
             'created_at', 'updated_at', 'name_prefix', 'owner',
             'owner_name', 'creator_name', 'creator')))

    schema = Schema(
        id=NUMERIC(numtype=int, bits=64, signed=False, stored=True,
                   unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    self.assertEquals(adapter.indexable, True)
    self.assertEquals(set(adapter.doc_attrs),
                      set(('id', 'text', 'num', 'name')))
    self.assert_(all(callable(f) for f in adapter.doc_attrs.itervalues()))
    self.assertEquals(set(schema.names()),
                      set(('id', 'text', 'num', 'name')))
    self.assertTrue(isinstance(schema['text'], TEXT))
    self.assertTrue(isinstance(schema['num'], NUMERIC))
def test_build_attrs_3():
    schema = Schema()
    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "allowed_roles_and_users",
        "created_at",
        "creator",
        "creator_name",
        "id",
        "name",
        "name_prefix",
        "object_key",
        "object_type",
        "owner",
        "owner_name",
        "slug",
        "tag_ids",
        "tag_text",
        "text",
        "updated_at",
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {
        "allowed_roles_and_users",
        "created_at",
        "creator",
        "creator_name",
        "id",
        "name",
        "name_prefix",
        "object_key",
        "object_type",
        "owner",
        "owner_name",
        "slug",
        "tag_ids",
        "tag_text",
        "text",
        "updated_at",
    }
def __init__(self):
    chfilter = CharsetFilter(accent_map)
    stoplist = stoplists["en"].union(stoplists["fr"])
    analyzer = RegexTokenizer() | LowercaseFilter() | \
        StopFilter(stoplist=stoplist) | chfilter

    # Defines the schema
    # See http://pythonhosted.org/Whoosh/schema.html for reference
    keywordType = KEYWORD(lowercase=True, scorable=True)
    self.schema = Schema(content=TEXT(analyzer=analyzer),
                         docType=TEXT,
                         docId=ID(stored=True, unique=True),
                         tags=keywordType)

    # Adds dynamic fields so each document can index its own fields in the
    # same Whoosh index
    self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
    self.schema.add('*_date', DATETIME, glob=True)
    self.schema.add('*_number', NUMERIC, glob=True)
    self.schema.add('*_boolean', BOOLEAN, glob=True)

    # Creates the index folder and the Whoosh index files if they don't
    # exist yet, and loads the index in any case
    if not os.path.exists("indexes"):
        os.mkdir("indexes")
        self.index = index.create_in("indexes", self.schema)
    else:
        self.index = index.open_dir("indexes")

    # Creates the doctypes folder if it doesn't exist
    if not os.path.exists("doctypes"):
        os.mkdir("doctypes")

    # Creates the default doctypes schema file if it doesn't exist
    if not os.path.exists('doctypes/doctypes_schema.json'):
        with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
            defaultFile.write("{}")

    '''
    Loads the doctypes schema if it's valid, otherwise recreates it.
    The doctypes schema is a dictionary of doctypes with their fields,
    created and updated when a document is indexed. That way, we can tell
    Whoosh which fields to search by default, because there is apparently
    no way to say "search in all fields".
    '''
    with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
        try:
            self.doctypesSchema = json.load(rawJSON)
        except ValueError:
            rawJSON.write("{}")
            self.doctypesSchema = {}
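# A hedged sketch of how the doctypes schema above could be used at query time
# (the `search` helper, its signature, and the assumption that doctypes_schema.json
# stores a list of field names per doctype are illustrative, not part of the
# source): since Whoosh has no built-in "search every field" option, the fields
# recorded for a doctype feed a MultifieldParser.
from whoosh.qparser import MultifieldParser

def search(self, doc_type, query_string, limit=10):
    # Fields this doctype has indexed, as recorded in doctypes_schema.json;
    # fall back to the static "content" field if the doctype is unknown.
    fields = self.doctypesSchema.get(doc_type) or ["content"]
    parser = MultifieldParser(fields, schema=self.index.schema)
    query = parser.parse(query_string)
    with self.index.searcher() as searcher:
        return [hit["docId"] for hit in searcher.search(query, limit=limit)]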
def test_build_attrs(self):
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}

    adapter = SAAdapter(Entity, schema)
    assert not adapter.indexable

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        'object_key', 'id', 'name', 'slug', 'object_type', 'text',
        'created_at', 'updated_at', 'name_prefix', 'owner', 'owner_name',
        'creator_name', 'creator', 'allowed_roles_and_users', 'tag_ids',
        'tag_text',
    }
    assert all(callable(f) for f in adapter.doc_attrs.itervalues())

    assert set(schema.names()) == {
        'object_key', 'id', 'object_type', 'name', 'slug', 'text',
        'created_at', 'updated_at', 'name_prefix', 'owner', 'owner_name',
        'creator_name', 'creator', 'allowed_roles_and_users', 'tag_ids',
        'tag_text',
    }

    schema = Schema(
        id=NUMERIC(numtype=int, bits=64, signed=False, stored=True,
                   unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
    assert all(callable(f) for f in adapter.doc_attrs.itervalues())
    assert set(schema.names()) == {'id', 'text', 'num', 'name'}
    assert isinstance(schema['text'], TEXT)
    assert isinstance(schema['num'], NUMERIC)
def test_build_attrs_4():
    schema = Schema(id=NUMERIC(bits=64, signed=False, stored=True, unique=True))
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "id",
        "text",
        "num",
        "name",
        "object_type",
        "object_key",
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())
    assert set(schema.names()) == {
        "id",
        "text",
        "num",
        "name",
        "object_type",
        "object_key",
    }
    assert isinstance(schema["text"], TEXT)
    assert isinstance(schema["num"], NUMERIC)
def __init__(self): self.schema = Schema(scopes=KEYWORD(), descr=TEXT(), service_name=TEXT(), service_descr=TEXT(), keywords=KEYWORD()) self.schema.add("object_id", ID(stored=True, unique=True)) self.schema.add("entity_id", ID(stored=True, unique=True)) for a in list(ATTRS.keys()): self.schema.add(a, KEYWORD()) self._collections = set() from whoosh.filedb.filestore import RamStorage, FileStorage self.storage = RamStorage() self.storage.create() self.index = self.storage.create_index(self.schema) self.objects = dict() self.infos = dict()
from galaxy.eggs import require
from galaxy.web.framework.helpers import to_unicode

# Whoosh is compatible with Python 2.5+. Try to import Whoosh and set a flag
# to indicate whether tool search is enabled.
try:
    require("Whoosh")

    from whoosh.filedb.filestore import RamStorage
    from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
    from whoosh.index import Index
    from whoosh.scoring import BM25F
    from whoosh.qparser import MultifieldParser
    tool_search_enabled = True
    schema = Schema(id=STORED, title=TEXT, description=TEXT, help=TEXT)
except ImportError, e:
    tool_search_enabled = False
    schema = None


class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses the
    "whoosh" search library.
    """

    def __init__(self, toolbox):
        """Create a searcher for `toolbox`."""
        self.toolbox = toolbox
        self.enabled = tool_search_enabled
        if tool_search_enabled:
            self.build_index()
# CUSTOM ANALYZER wordsplit + lowercase filter, for pathname-like text # # This is useful to: # - avoid removing "stop words" from text # - search case-insensitively # PATHANALYZER = RegexTokenizer() | LowercaseFilter() # INDEX SCHEMA DEFINITION SCHEMA = Schema( fileid=ID(unique=True), owner=TEXT(analyzer=EMAILADDRANALYZER), # this field preserves case of repository name for exact matching repository_rawname=TEXT(analyzer=IDANALYZER), repository=TEXT(stored=True, analyzer=ICASEIDANALYZER), path=TEXT(stored=True, analyzer=PATHANALYZER), content=FieldType(format=Characters(), analyzer=ANALYZER, scorable=True, stored=True), modtime=STORED(), extension=TEXT(stored=True, analyzer=PATHANALYZER)) IDX_NAME = 'HG_INDEX' FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') FRAGMENTER = ContextFragmenter(200) CHGSETS_SCHEMA = Schema( raw_id=ID(unique=True, stored=True), date=NUMERIC(stored=True), last=BOOLEAN(),
def make_index():
    schema = Schema(url=ID(stored=True), tags=KEYWORD)
    if not os.path.exists("index"):
        os.mkdir("index")
    ix = create_in("index", schema)
    return ix
def get_schema():
    return Schema(path=ID(unique=True, stored=True), content=TEXT)
from whoosh.qparser import MultifieldParser from whoosh.query import And, Every, Term from galaxy import exceptions from galaxy.exceptions import ObjectNotFound from galaxy.util.search import parse_filters log = logging.getLogger(__name__) schema = Schema( id=NUMERIC(stored=True), name=TEXT(field_boost=1.7, stored=True), description=TEXT(field_boost=1.5, stored=True), long_description=TEXT(stored=True), homepage_url=TEXT(stored=True), remote_repository_url=TEXT(stored=True), repo_owner_username=TEXT(stored=True), categories=KEYWORD(stored=True, commas=True, scorable=True), times_downloaded=STORED, approved=STORED, last_updated=STORED, repo_lineage=STORED, full_last_updated=STORED) class RepoWeighting(scoring.BM25F): """ Affect the BM25G scoring model through the final method. source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ """ use_final = True
def searchTags(self, tags=[]):
    """
    Search tags in merged pdf file

    :param tags: List of search tags e.g. ['Introduction', 'Experiment']
    :type tags: list
    """
    # Update tags
    if not tags:
        tags = self.props['tags']

    # Create custom FuzzyTerm for fuzzy tag search
    class CustomFuzzyTerm(FuzzyTerm):
        def __init__(self, fieldname, text, boost=1.0,
                     maxdist=self.props['maxdist'],
                     prefixlength=self.props['prefixlength'],
                     constantscore=True):
            super(CustomFuzzyTerm, self).__init__(fieldname, text, boost,
                                                  maxdist, prefixlength,
                                                  constantscore)

    # Create temporary directory tmpdir
    if not os.path.exists("tmpdir"):
        os.mkdir("tmpdir")

    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT(stored=True))
    ix = index.create_in("tmpdir", schema)
    writer = ix.writer()
    for i, ltObj in enumerate(self.ltObjList):
        #writer.add_document(title=str(i), content=str(ltObj['OBJ'].get_text()), path=u"/a")
        #st=str(ltObj['OBJ'].get_text())
        st = ltObj['OBJ'].get_text().encode('utf-8')
        st = st.decode('utf-8')
        sti = str(i)
        writer.add_document(title=sti, content=st)
    writer.commit()

    results = []
    id = 0
    for tag in tags:
        print('Searching tag: ', tag)
        with ix.searcher() as searcher:
            #query = QueryParser("content", ix.schema).parse(tag)
            qp = QueryParser("content", schema=ix.schema,
                             termclass=CustomFuzzyTerm)
            query = qp.parse(tag + '~4/4')
            res = searcher.search(query, limit=None)
            for hit in res:
                results.append(
                    dict({
                        'ID': id,
                        'DOCNAME': self.ltObjList[hit.docnum]['DOCNAME'],
                        'TAG': tag,
                        'PAGE': self.ltObjList[hit.docnum]['PAGE'],
                        'HITNUM': hit.docnum,
                        'BBOX': self.ltObjList[hit.docnum]['OBJ'].bbox,
                        'TEXT': self.ltObjList[hit.docnum]['OBJ'].get_text()
                    }))
                id += 1

    self.results = results
    return results
import unicodecsv as csv from whoosh import index, sorting from whoosh.analysis import StandardAnalyzer from whoosh.fields import Schema, STORED, NGRAMWORDS, NUMERIC from whoosh.qparser import MultifieldParser _schema = Schema( ror=STORED(), grid=STORED(), name=NGRAMWORDS(stored=False), aliases=NGRAMWORDS(stored=False), num_students=NUMERIC(int, sortable=True, stored=False), citation_score=NUMERIC(int, sortable=True, stored=False), ) _index_path = 'data/ror-whoosh-index' def _read_ror_csv_rows(): rows = [] with open('data/ror-metrics.csv') as ror_csv: reader = csv.DictReader(ror_csv) for row in reader: row['aliases'] = row['aliases'].split( u'###') if row['aliases'] else [] row['num_students'] = int( row['num_students']) if row['num_students'] else None row['citation_score'] = float( row['citation_score']) if row['citation_score'] else None rows.append(row)
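# A hedged query sketch for the index defined above (it assumes an index has
# already been built at _index_path from _schema; the helper name _search_ror
# and the ranking choice are illustrative, not part of the source): search both
# the institution name and its aliases, ranked by citation_score.
def _search_ror(query_text, limit=10):
    ix = index.open_dir(_index_path)
    # Match against both the NGRAMWORDS fields.
    parser = MultifieldParser(["name", "aliases"], schema=ix.schema)
    query = parser.parse(query_text)
    # citation_score is declared sortable=True, so it can drive a facet sort.
    by_citations = sorting.FieldFacet("citation_score", reverse=True)
    with ix.searcher() as searcher:
        hits = searcher.search(query, limit=limit, sortedby=by_citations)
        return [hit["ror"] for hit in hits]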
def _build_doc_attrs(self, model_class: Type[Model], schema: Schema) -> None: mapper = sa.inspect(model_class) args = self.doc_attrs # Any field not in schema will be stored here. # After all field have been discovered, we add the missing ones. field_definitions = {} def setup_field( attr_name: str, field_name: Union[Tuple[str, Union[type, ID]], str]) -> None: field_def = False if not isinstance(field_name, str): field_name, field_def = field_name if field_name not in schema: if (field_name not in field_definitions or field_definitions[field_name] is False): field_definitions[field_name] = field_def # attrgetter offers dotted name support. Useful for attributes on # related objects. args.setdefault(field_name, {})[attr_name] = attrgetter(attr_name) # model level definitions for name, field_names in self.index_to: if isinstance(field_names, str): field_names = (field_names, ) for field_name in field_names: setup_field(name, field_name) # per column definitions for col in mapper.columns: name = col.name info = col.info if not info.get("searchable"): continue index_to = info.get("index_to", (name, )) if isinstance(index_to, str): index_to = (index_to, ) for field_name in index_to: setup_field(name, field_name) # add missing fields to schema for field_name, field_def in field_definitions.items(): if field_name in schema: continue if field_def is False: field_def = TEXT(stored=True, analyzer=accent_folder) logger.debug( "Adding field to schema:\n" " Model: %s\n" ' Field: "%s" %s', model_class._object_type(), field_name, field_def, ) schema.add(field_name, field_def)
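# Hedged illustration of the conventions _build_doc_attrs reads (the model and
# its columns are hypothetical, inferred from the method above rather than taken
# from the source). Columns opt in to indexing through their SQLAlchemy `info`
# dict; `index_to` adds model-level definitions, and a target may carry its own
# Whoosh field definition as a (field_name, field_def) tuple.
class Contact(Model):
    __tablename__ = "contact"

    # Indexed under its own name; if "email" is not already in the schema, a
    # default TEXT(stored=True, analyzer=accent_folder) field is added for it.
    email = sa.Column(sa.UnicodeText(), info={"searchable": True})

    # Indexed into two schema fields instead of the column name.
    last_name = sa.Column(
        sa.UnicodeText(),
        info={"searchable": True, "index_to": ("name", "name_prefix")},
    )

    # Model-level definitions: (attribute name, target field(s)). Dotted
    # attribute names work because values are fetched with attrgetter().
    index_to = (
        ("owner.name", "owner_name"),
        ("owner.email", (("owner_email", TEXT(stored=True)),)),
    )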
from whoosh.fields import Schema, TEXT, ID import index_helper coaches_index_dir = "coaches_index" coaches_schema = Schema( Name=TEXT, CoachAPIID=ID(stored=True), TeamID=TEXT, WinLoss=TEXT, DOB=TEXT, Recognitions=TEXT, PastTeams=TEXT, PlayersCoached=TEXT, ) coaches_attributes = [ "Name", "CoachAPIID", "TeamID", "WinLoss", "DOB", "Recognitions", "PastTeams", "PlayersCoached" ] coaches_id_name = "CoachAPIID" def search_coach_index(query): return index_helper.search_index(coaches_index_dir, coaches_schema, coaches_attributes, coaches_id_name, query) if __name__ == '__main__':
from whoosh import index, qparser import json from paths import here_path, merged_dir_path, top_dir from movies import movies, WhichMovie, name_dict, movie_dict from load_files import yarn_file_paths, parsed_scripts_file_paths, fandom_links_file_path character_links = json.load(fandom_links_file_path.open('r', encoding='UTF-8')) index_dir_path = here_path / "indexdir" if not index_dir_path.exists(): index_dir_path.mkdir() schema = Schema( movie=NUMERIC(stored=True), character=NUMERIC(stored=True), quote=KEYWORD(stored=True), ) ix = index.create_in(index_dir_path, schema) writer = ix.writer() print("Building index") for script_file in parsed_scripts_file_paths: print(f'Building index for file "{script_file.relative_to(top_dir)}"') movie: WhichMovie = name_dict[script_file.stem] script_data = json.load(script_file.open('r', encoding="UTF-8")) print(f"Indexing ({movie}): ", end="") len_all_quotes = len(script_data['quotes'])
import lib.DatabaseLayer as db argParser = argparse.ArgumentParser(description='Fulltext indexer for the MongoDB CVE collection') argParser.add_argument('-v', action='store_true', default=False, help='Verbose logging') argParser.add_argument('-l', default=5, help='Number of last entries to index (Default: 5) - 0 to index all documents') argParser.add_argument('-n', action='store_true', default=False, help='lookup complete cpe (Common Platform Enumeration) name for vulnerable configuration to add in the index') args = argParser.parse_args() c = cves.last(namelookup=args.n) indexpath = Configuration.getIndexdir() from whoosh.index import create_in, exists_in, open_dir from whoosh.fields import Schema, TEXT, ID schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT) if not os.path.exists(indexpath): os.mkdir(indexpath) if not exists_in(indexpath): ix = create_in(indexpath, schema) else: ix = open_dir(indexpath) def dumpallcveid(entry=None): return db.getCVEID if not entry else db.getCVEIDs(int(entry)) def getcve(cveid=None): if cveid is None: return False
def build_index(self): """Build a `Whoosh <https://whoosh.readthedocs.io/en/latest/index.html>`_ index for product types searches. .. versionadded:: 1.0 """ index_dir = os.path.join(self.conf_dir, ".index") # use eodag_version to help keeping index up-to-date eodag_version = self.get_version() create_index = not exists_in(index_dir) # check index version if not create_index: if self._product_types_index is None: logger.debug("Opening product types index in %s", index_dir) self._product_types_index = open_dir(index_dir) try: self.guess_product_type(eodagVersion=eodag_version) except NoMatchingProductType: create_index = True finally: if create_index: shutil.rmtree(index_dir) logger.debug( "Out-of-date product types index removed from %s", index_dir) if create_index: logger.debug("Creating product types index in %s", index_dir) makedirs(index_dir) product_types_schema = Schema( ID=fields.STORED, abstract=fields.TEXT, instrument=fields.IDLIST, platform=fields.ID, platformSerialIdentifier=fields.IDLIST, processingLevel=fields.ID, sensorType=fields.ID, eodagVersion=fields.ID, license=fields.ID, title=fields.ID, missionStartDate=fields.ID, missionEndDate=fields.ID, ) non_indexable_fields = ["bands"] self._product_types_index = create_in(index_dir, product_types_schema) ix_writer = self._product_types_index.writer() for product_type in self.list_product_types(): versioned_product_type = dict( product_type, **{"eodagVersion": eodag_version}) # add to index ix_writer.add_document( **{ k: v for k, v in versioned_product_type.items() if k not in non_indexable_fields }) ix_writer.commit() else: if self._product_types_index is None: logger.debug("Opening product types index in %s", index_dir) self._product_types_index = open_dir(index_dir)
#!/usr/bin/python3
# coding: utf-8
# whoosh workflow: 1. create a schema  2. build the index  3. query the index
# https://my.oschina.net/u/2351685/blog/603079
# https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py
from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh import qparser

# Make sure the ./tmp/ directory exists before running, otherwise an error is raised
##################################################################
## 0. A concise version first; the detailed walkthrough follows
from whoosh.index import create_in
from whoosh.fields import TEXT, ID, Schema  # importing just these two is enough

schema = Schema(title=TEXT(stored=True), content=TEXT)
ix = create_in('./tmp', schema)  # stores the schema under ./tmp/; ** run this only once, otherwise a LockError is raised **
writer = ix.writer()  # add the documents to be indexed, following the schema definition
writer.add_document(title='hello', content='hello world')
writer.add_document(title='world', content='world hello')
writer.commit()  # searcher() has to come after commit()

searcher = ix.searcher()  # create a searcher; `with ix.searcher() as searcher:` is better, this is just for convenience
## First way to search:
print(searcher.find('content', 'hello world').fields(0))    # {'title': 'hello'}; TEXT stores positional info, so phrase search works
print(searcher.find('content', 'hello world')[1].fields())  # {'title': 'world'}
## Second way to search: construct query objects directly
from whoosh.query import *
myquery = And([Term("content", "hello"), Term("content", "world")])
print(searcher.search(myquery).fields(1))  # {'title': 'world'}; same result as above
## Third way to search: parse a query string; usually the preferred approach
from whoosh.qparser import QueryParser
def build_schema(self, fields): schema_fields = { ID: WHOOSH_ID(stored=True, unique=True), DJANGO_CT: WHOOSH_ID(stored=True), DJANGO_ID: WHOOSH_ID(stored=True), } # Grab the number of keys that are hard-coded into Haystack. # We'll use this to (possibly) fail slightly more gracefully later. initial_key_count = len(schema_fields) content_field_name = '' for field_name, field_class in fields.items(): if field_class.is_multivalued: if field_class.indexed is False: schema_fields[field_class.index_fieldname] = IDLIST( stored=True, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = KEYWORD( stored=True, commas=True, scorable=True, field_boost=field_class.boost) elif field_class.field_type in ['date', 'datetime']: schema_fields[field_class.index_fieldname] = DATETIME( stored=field_class.stored, sortable=True) elif field_class.field_type == 'integer': schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=int, field_boost=field_class.boost) elif field_class.field_type == 'float': schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=float, field_boost=field_class.boost) elif field_class.field_type == 'boolean': # Field boost isn't supported on BOOLEAN as of 1.8.2. schema_fields[field_class.index_fieldname] = BOOLEAN( stored=field_class.stored) elif field_class.field_type == 'ngram': schema_fields[field_class.index_fieldname] = NGRAM( minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) elif field_class.field_type == 'edge_ngram': schema_fields[field_class.index_fieldname] = NGRAMWORDS( minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = TEXT( stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True) if field_class.document is True: content_field_name = field_class.index_fieldname schema_fields[field_class.index_fieldname].spelling = True # Fail more gracefully than relying on the backend to die if no fields # are found. if len(schema_fields) <= initial_key_count: raise SearchBackendError( "No fields were found in any search_indexes. Please correct this before attempting to search." ) return (content_field_name, Schema(**schema_fields))
def store_revision(self, meta, data, overwrite=False, trusted=False, # True for loading a serialized representation or other trusted sources name=None, # TODO name we decoded from URL path action=u'SAVE', remote_addr=None, userid=None, wikiname=None, contenttype_current=None, contenttype_guessed=None, acl_parent=None, ): """ Store a revision into the backend, write metadata and data to it. Usually this will be a new revision, either of an existing item or a new item. With overwrite mode, we can also store over existing revisions. :type meta: dict :type data: open file (file must be closed by caller) :param overwrite: if True, allow overwriting of existing revs. :returns: a Revision instance of the just created revision """ if remote_addr is None: try: # if we get here outside a request, this won't work: remote_addr = unicode(request.remote_addr) except: pass if userid is None: try: # if we get here outside a request, this won't work: userid = flaskg.user.valid and flaskg.user.itemid or None except: pass if wikiname is None: wikiname = app.cfg.interwikiname state = {'trusted': trusted, keys.NAME: name, keys.ACTION: action, keys.ADDRESS: remote_addr, keys.USERID: userid, keys.WIKINAME: wikiname, keys.ITEMID: self.itemid, # real itemid or None 'contenttype_current': contenttype_current, 'contenttype_guessed': contenttype_guessed, 'acl_parent': acl_parent, } ct = meta.get(keys.CONTENTTYPE) if ct == CONTENTTYPE_USER: Schema = UserMetaSchema else: Schema = ContentMetaSchema m = Schema(meta) valid = m.validate(state) # TODO: currently we just log validation results. in the end we should # reject invalid stuff in some comfortable way. if not valid: logging.warning("metadata validation failed, see below") for e in m.children: logging.warning("{0}, {1}".format(e.valid, e)) # we do not have anything in m that is not defined in the schema, # e.g. userdefined meta keys or stuff we do not validate. thus, we # just update the meta dict with the validated stuff: meta.update(dict(m.value.items())) # we do not want None / empty values: meta = dict([(k, v) for k, v in meta.items() if v not in [None, []]]) if self.itemid is None: self.itemid = meta[ITEMID] backend = self.backend if not overwrite: revid = meta.get(REVID) if revid is not None and revid in backend: raise ValueError('need overwrite=True to overwrite existing revisions') meta, data, content = self.preprocess(meta, data) data.seek(0) # rewind file revid = backend.store(meta, data) meta[REVID] = revid self.indexer.index_revision(meta, content) if not overwrite: self._current = self.indexer._document(revid=revid) return Revision(self, revid)
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, open_dir
from whoosh.query import *
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

# A first learning example with whoosh + jieba
cnAnalyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True, analyzer=cnAnalyzer),
                content=TEXT(stored=False, analyzer=cnAnalyzer),
                path=ID(stored=True))

import os.path
if not os.path.exists('sample_index'):
    os.mkdir('sample_index')
ix = create_in('sample_index', schema)
ix = open_dir('sample_index')

writer = ix.writer()
writer.add_document(title=u'爱吃大板的博客',
                    content=u'大家好!这里是爱吃大板的博客,欢迎光临!大板是一种雪糕。')
writer.add_document(title=u'蝴蝶定理吃雪糕',
                    content=u'好阿婆雪糕蝴蝶定理最爱吃了!必须买下来。It\'s tasty!')
writer.commit()

writer = ix.writer()
writer.add_document(title=u"My document", content=u"This is my document!")
writer.add_document(title=u"Second try",
                    content=u"This is the second example.",
                    path='http://sdu.edu.cn/')
writer.add_document(title=u"Third time's the charm",
                    content=u"Examples are many.",
__author__ = 'rich' import datetime import os import pandas as pd from whoosh.fields import Schema from whoosh.fields import TEXT, ID, DATETIME, KEYWORD from whoosh.index import open_dir from whoosh.index import create_in from whoosh.query import Term, And, Or from whoosh.qparser import QueryParser my_schema = Schema(id=ID(unique=True, stored=True), lang=TEXT(), screenname=TEXT(), tweettext=TEXT(), hashtags=TEXT(), datetime=DATETIME()) if not os.path.exists("tweets_index"): os.mkdir("tweets_index") index = create_in("tweets_index", my_schema) index = open_dir("tweets_index") writer = index.writer() df = pd.read_csv('tweets/tweets.csv', header=None, names=[ 'id', 'language', 'screenname', 'tweettext', 'hashtags', 'timestamp' ])
import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT, NGRAM

# Directory where the index data is stored
INDEX_DIR = "indexdir"

# Schema for the index
schema = Schema(
    # Use the post URL as the unique ID of each indexed unit
    post_url=ID(unique=True, stored=True),
    # Index the body as N-grams
    body=NGRAM(stored=True),
)


def get_or_create_index():
    # Create the index directory and index files if they don't exist yet
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
        ix = index.create_in(INDEX_DIR, schema)
        return ix
    # The index directory already exists:
    # open and reuse the existing index files
    ix = index.open_dir(INDEX_DIR)
    return ix
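# Hedged usage sketch (the example URL and body text are made up): add a post
# to the index returned by get_or_create_index() and commit the write.
if __name__ == "__main__":
    ix = get_or_create_index()
    writer = ix.writer()
    writer.add_document(
        post_url="https://example.com/posts/1",
        body="N-gram indexing lets substring queries match this body text.",
    )
    writer.commit()
    # Searching the "body" field then works with a standard QueryParser.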
""" import ConfigParser import os import sys sys.path.insert( 1, os.path.abspath( os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, 'lib'))) # Whoosh is compatible with Python 2.5+ Try to import Whoosh and set flag to indicate whether search is enabled. try: from whoosh.filedb.filestore import FileStorage from whoosh.fields import Schema, STORED, TEXT whoosh_search_enabled = True schema = Schema(id=STORED, name=TEXT, info=TEXT, dbkey=TEXT, message=TEXT) import galaxy.model.mapping from galaxy import config, model except ImportError, e: whoosh_search_enabled = False schema = None def build_index(sa_session, whoosh_index_dir): storage = FileStorage(whoosh_index_dir) index = storage.create_index(schema) writer = index.writer() def to_unicode(a_basestr): if type(a_basestr) is str: return unicode(a_basestr, 'utf-8')
import os
import codecs

import whoosh
from whoosh.fields import Schema, ID, KEYWORD, TEXT
from whoosh.index import create_in

from mappers import inputdatastream

FILEPATH = "/Users/denisvrdoljak/Berkeley/W205/Asn4_Work/WC2015-2testing.csv"

# schema setup
my_schema = Schema(id=ID(unique=True, stored=True),
                   path=ID(stored=True),
                   tagsearch=ID(stored=True),
                   tags=TEXT(stored=True),
                   date=TEXT(stored=True),
                   hour=TEXT(stored=True),
                   tweet=TEXT(stored=True))

# enter data: create the index directory and index before opening a writer
if not os.path.exists("wwc-index1"):
    os.mkdir("wwc-index1")
index = create_in("wwc-index1", my_schema)
writer = index.writer()

for i, line in enumerate(inputdatastream(FILEPATH)):
    print ".",
    writer.add_document(
        path=FILEPATH.encode("utf-8"),
        tagsearch=line.split(",")[4].encode("utf-8"),
        tags=[word for word in line.split(",")[0] if '#' in word],
from whoosh.support.charset import accent_map from whoosh.analysis import StemmingAnalyzer, StandardAnalyzer, CharsetFilter from flask_sqlalchemy import SQLAlchemy from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker from geoalchemy2 import Geometry from shapely import wkb from json import loads from shutil import rmtree from ambiente import geocode_db, whoosh_base db = SQLAlchemy() analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) schema = Schema(id=NUMERIC, nome=TEXT(analyzer=analyzer, stored=True), geom=STORED) class Poligono(db.Model): __tablename__ = 'openLS_localizacaopoligono' __searchable__ = ['nome'] id = db.Column(db.Integer, primary_key=True) nome = db.Column(db.Unicode) geom = db.Column(Geometry('POLYGON')) class Linha(db.Model): __tablename__ = 'openLS_localizacaolinha' __searchable__ = ['nome'] id = db.Column(db.Integer, primary_key=True)
import re, os, codecs import progressbar def visible(element): if element.parent.name in [ 'style', 'script', '[document]', 'head', 'title' ]: return False elif re.match('<!--.*-->', element.encode('utf-8')): return False return True dir = os.listdir('dataset') schema = Schema(content=TEXT(stored=True)) ix = create_in("database", schema) with progressbar.ProgressBar(maxval=21890, widgets=[ ' [', progressbar.Timer(), '][', progressbar.ETA(), '][', progressbar.Percentage(), ']', progressbar.Bar('=', '[', '] '), progressbar.Counter() ]) as bar: for i, l in enumerate(dir): bar.update(i + 1) # print l
import os.path from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT from whoosh.index import create_in from jieba.analyse import ChineseAnalyzer if __name__ == "__main__": schema_doc = Schema(title=TEXT(stored=True, sortable=True), content=TEXT(stored=True, sortable=True, analyzer=ChineseAnalyzer()), url=ID(stored=True)) if not os.path.exists("index_doc"): os.mkdir("index_doc") create_in("index_doc", schema_doc) schema_img = Schema(title=TEXT(stored=True, sortable=True), content=TEXT(stored=True, sortable=True, analyzer=ChineseAnalyzer()), src=ID(stored=True), source=ID(stored=True)) if not os.path.exists("index_img"): os.mkdir("index_img") create_in("index_img", schema_img)
# coding:utf-8
# A previous project used whoosh, but back then I didn't take the time to dig
# into it, so this is a more thorough write-up.
# whoosh is a fast search library (think blog search): it lets you quickly
# retrieve the content you are looking for.
# First install whoosh: pip2 install whoosh

# quick_start
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in("indexdir", schema)  # create an index object backed by the indexdir directory
writer = ix.writer()  # a writer object
writer.add_document(
    title=u'First document',
    path=u"/a",
    content=u"This is the first document we've added!")  # add a document: content, path and title
writer.add_document(
    title=u'Second document',
    path=u"/b",
    content=u"The second one is even more interesting! ")  # add another document
writer.commit()  # commit the pending additions

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse('first')  # matches content containing "first", similar to re.compile
    results = searcher.search(query)  # this is where the query is actually executed
    print(results[0])
    # result: {"title": u"First document", "path": u"/a"}

# The snippet above will not run as-is (it raises an error when executed), but
# it gives a quick look at the basic usage. It was written from prior
# experience; for real usage, consult the documentation.
#untuk fetching from bs4 import BeautifulSoup import urllib.request #untuk create schema import os, os.path import whoosh.index as index from whoosh.fields import Schema, TEXT, ID #untuk search from whoosh.qparser import QueryParser #creating schema and indexing schema = Schema(title=ID(stored=True), content=TEXT(stored=True)) if not os.path.exists("index"): os.mkdir("index") ix = index.create_in("index", schema) ix = index.open_dir("index") writer = ix.writer() lis_link = list() lis_link.append("http://pythonforbeginners.com") lis_link.append("http://www.python.org") lis_link.append("https://docs.microsoft.com/en-us/dotnet/csharp/") lis_link.append("https://www.tutorialspoint.com/cplusplus/index.htm") for i in range(len(lis_link)): html_page = urllib.request.urlopen(lis_link[i]) soup = BeautifulSoup(html_page, 'html.parser') writer.add_document(title=lis_link[i], content=soup.prettify()) print("success add document " + str(i + 1))
class WhooshEngine(BaseEngine): # whoosh schema = Schema(title=TEXT(stored=True), path=TEXT(stored=True), href=ID(stored=True), cfiBase=TEXT(stored=True), spinePos=TEXT(stored=True), content=TEXT) def open(self): try: self.ix = index.open_dir(self.database_path) except Exception as e: logger.error("openning database {} failed".format( self.database_name)) def create(self): if not os.path.exists(self.database_path): os.mkdir(self.database_path) try: logger.debug("openning database {} to create".format( self.database_name)) self.ix = index.create_in(self.database_path, self.schema) except Exception as e: logger.error(e) self.writer = self.ix.writer() def add(self, path='', href='', title='', cfiBase='', spinePos=''): text = self.__get_text(path) self.writer.add_document(title=str(title), path=str(path), href=str(href), cfiBase=str(cfiBase), spinePos=str(spinePos), content=str(text)) logger.debug("Indexed: " + title + ' | ' + path + ' | ' + href + ' | ' + str(spinePos)) def finished(self): self.writer.commit() def query(self, q, limit=None): logger.debug('Q {}'.format(q)) with self.ix.searcher() as searcher: results = [] parsed_query = QueryParser("content", schema=self.ix.schema).parse(q) hits = searcher.search(parsed_query, limit=limit) logger.debug("Hits {}".format(hits)) for hit in hits: item = {} item['title'] = hit["title"].encode("utf-8") item['href'] = hit["href"].encode("utf-8") item['path'] = hit["path"].encode("utf-8") item['cfiBase'] = hit["cfiBase"].encode("utf-8") item['spinePos'] = hit["spinePos"].encode("utf-8") results.append(item) return results def __get_text(self, filename): # html = urllib.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html').read() html = open(filename, "r") soup = BeautifulSoup(html, "lxml") texts = soup.findAll(text=True) def visible(element): if element.parent.name in [ 'style', 'script', '[document]', 'head', 'title' ]: return False elif re.match('<!--.*-->', str(element.encode('utf-8'))): return False return True visible_texts = filter(visible, texts) contents = ' '.join([s for s in visible_texts]) return contents.strip() #.encode('utf-8')
def init_extensions(app):
    # Initialize PyMongo by calling its init_app method
    mongo.init_app(app)

    # Register the app by calling init_app.
    # This mainly assigns login_manager itself to the app.login_manager
    # attribute so the app can use its login/logout features.
    login_manager.init_app(app)

    # A cache speeds up the web app: caching trades space for time
    if app.config.get('USE_CACHE', False):
        cache.init_app(app, {})

    # Load the upload configuration and store it on the app
    configure_uploads(app, upload_photos)

    # Load the mail configuration
    mail.init_app(app)

    # Load the search service configuration
    whoosh_searcher.init_app(app)

    # Use jieba for Chinese word segmentation
    chinese_analyzer = ChineseAnalyzer()

    # Build the index schema object
    post_schema = Schema(obj_id=ID(unique=True, stored=True),
                         title=TEXT(stored=True, analyzer=chinese_analyzer),
                         content=TEXT(stored=True, analyzer=chinese_analyzer),
                         create_at=DATETIME(stored=True),
                         topic_id=ID(stored=True),
                         user_id=ID(stored=True))
    whoosh_searcher.add_index('posts', post_schema)

    # Set up the admin interface
    admin.init_app(app)
    with app.app_context():
        admin.add_view(admin_view.OptionsModelView(mongo.db['options'], '系统设置'))
        admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理'))
        admin.add_view(
            admin_view.TopicsModelView(mongo.db['topics'], '话题管理', category='内容管理'))
        admin.add_view(
            admin_view.PostsModelView(mongo.db['posts'], '问答管理', category='内容管理'))
        # admin.add_view(admin_view.IndexModelView(mongo.db['index_article'],
        #                                          '主页文章管理', category='内容管理'))
        admin.add_view(
            admin_view.PassagewaysModelView(mongo.db['passageways'], '温馨通道', category='推广管理'))
        admin.add_view(
            admin_view.FriendLinksModelView(mongo.db['friend_links'], '友链管理', category='推广管理'))
        admin.add_view(
            admin_view.PagesModelView(mongo.db['pages'], '页面管理', category='推广管理'))
        admin.add_view(
            admin_view.FooterLinksModelView(mongo.db['footer_links'], '底部链接', category='推广管理'))
        admin.add_view(
            admin_view.AdsModelView(mongo.db['ads'], '广告管理', category='推广管理'))
import jieba import sys reload(sys) sys.setdefaultencoding('utf-8') from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT from whoosh.index import create_in from whoosh.index import open_dir from jieba.analyse import ChineseAnalyzer # from pyltp import Segmentor analyzer = ChineseAnalyzer() import os.path # import pyltp filename_list = [] ID_list = [] schema = Schema(title=TEXT, content=TEXT) schema = Schema( title=TEXT(stored=True, analyzer=analyzer), content=TEXT(stored=True, analyzer=analyzer), ID=ID(stored=True), tags=KEYWORD, icon=STORED, ) path_generator = os.walk("./blog_engine") for path, d, filelist in path_generator: for filename in d: ix = open_dir("/home/iiip/桌面/blog_engine/" + filename)
def get_schema():
    return Schema(title=TEXT(stored=True),
                  date_start=DATETIME(stored=True),
                  date_end=DATETIME(stored=True),
                  description=TEXT(stored=True),
                  categoria=TEXT(stored=True))
# -*- coding: utf-8 -*-
# http://blog.csdn.net/twsxtd/article/details/8308893
# I've recently wanted to build a search engine, so of course I had to look at
# the famous Lucene. It really is excellent, but its Python binding, pylucene,
# is disappointing: it is not a pure Python implementation but a wrapper that
# still runs Java underneath, so it depends on a JDK and the installation steps
# are extremely tedious. And although plenty of Chinese word-segmentation
# lexicons work with Lucene, many of them cannot be used through that glue
# layer, so I finally gave up on it -- to be fair, the pure Java implementation
# is very good. There is also sphinx, and coreseek built on top of it for
# Chinese, but those appear to be SQL-oriented, which does not fit what I am
# doing; and the C++ xapian framework, which gets good reviews for both speed
# and accuracy. In the end I chose Whoosh, a pure Python implementation: it is
# very simple to use from Python -- just a module, an easy_install away -- but
# there is very little material about it in Chinese, so I will translate its
# documentation, starting today.
#
# Quick Start
# Whoosh is a library for indexing and searching text. It provides text search
# for your application: for example, if you are building blogging software, you
# can use Whoosh to add a search feature so users can search blog entries.
# Here is a short example:

from whoosh.index import create_in
from whoosh.fields import *

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in("/home/gswewf/百科/indexer", schema)  # ("indexer" here is actually a directory, so following these steps literally will fail; you have to create the directory first -- translator's note)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"this is the first document we've add!")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"The second one is even more interesting!")
writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    results[0]
    # {"title": u"First document", "path": u"/a"}

# Index and Schema objects
def __init__(self, index_dir, backend, wiki_name=None, acl_rights_contents=[], **kw): """ Store params, create schemas. """ self.index_dir = index_dir self.index_dir_tmp = index_dir + '.temp' self.backend = backend self.wikiname = wiki_name self.ix = {} # open indexes self.schemas = {} # existing schemas common_fields = { # wikiname so we can have a shared index in a wiki farm, always check this! WIKINAME: ID(stored=True), # tokenized NAME from metadata - use this for manual searching from UI NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0), # unmodified NAME from metadata - use this for precise lookup by the code. # also needed for wildcard search, so the original string as well as the query # (with the wildcard) is not cut into pieces. NAME_EXACT: ID(field_boost=3.0), # revision id (aka meta id) REVID: ID(unique=True, stored=True), # parent revision id PARENTID: ID(stored=True), # MTIME from revision metadata (converted to UTC datetime) MTIME: DATETIME(stored=True), # tokenized CONTENTTYPE from metadata CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()), # unmodified list of TAGS from metadata TAGS: ID(stored=True), LANGUAGE: ID(stored=True), # USERID from metadata USERID: ID(stored=True), # ADDRESS from metadata ADDRESS: ID(stored=True), # HOSTNAME from metadata HOSTNAME: ID(stored=True), # SIZE from metadata SIZE: NUMERIC(stored=True), # ACTION from metadata ACTION: ID(stored=True), # tokenized COMMENT from metadata COMMENT: TEXT(stored=True), # SUMMARY from metadata SUMMARY: TEXT(stored=True), # data (content), converted to text/plain and tokenized CONTENT: TEXT(stored=True), } latest_revs_fields = { # ITEMID from metadata - as there is only latest rev of same item here, it is unique ITEMID: ID(unique=True, stored=True), # unmodified list of ITEMLINKS from metadata ITEMLINKS: ID(stored=True), # unmodified list of ITEMTRANSCLUSIONS from metadata ITEMTRANSCLUSIONS: ID(stored=True), # tokenized ACL from metadata ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True), } latest_revs_fields.update(**common_fields) userprofile_fields = { # Note: email / openid (if given) should be unique, but we might # have lots of empty values if it is not given and thus it is NOT # unique overall! Wrongly declaring it unique would lead to whoosh # killing other users from index when update_document() is called! EMAIL: ID(stored=True), OPENID: ID(stored=True), } latest_revs_fields.update(**userprofile_fields) all_revs_fields = { ITEMID: ID(stored=True), } all_revs_fields.update(**common_fields) latest_revisions_schema = Schema(**latest_revs_fields) all_revisions_schema = Schema(**all_revs_fields) # Define dynamic fields dynamic_fields = [("*_id", ID(stored=True)), ("*_text", TEXT(stored=True)), ("*_keyword", KEYWORD(stored=True)), ("*_numeric", NUMERIC(stored=True)), ("*_datetime", DATETIME(stored=True)), ("*_boolean", BOOLEAN(stored=True)), ] # Adding dynamic fields to schemas for glob, field_type in dynamic_fields: latest_revisions_schema.add(glob, field_type, glob=True) all_revisions_schema.add(glob, field_type, glob=True) # schemas are needed by query parser and for index creation self.schemas[ALL_REVS] = all_revisions_schema self.schemas[LATEST_REVS] = latest_revisions_schema # what fields could whoosh result documents have (no matter whether all revs index # or latest revs index): self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
def store_revision(self, meta, data, overwrite=False, trusted=False, # True for loading a serialized representation or other trusted sources name=None, # TODO name we decoded from URL path action=ACTION_SAVE, remote_addr=None, userid=None, wikiname=None, contenttype_current=None, contenttype_guessed=None, acl_parent=None, return_rev=False, fqname=None, ): """ Store a revision into the backend, write metadata and data to it. Usually this will be a new revision, either of an existing item or a new item. With overwrite mode, we can also store over existing revisions. :type meta: dict :type data: open file (file must be closed by caller) :param overwrite: if True, allow overwriting of existing revs. :param return_rev: if True, return a Revision instance of the just created revision :returns: a Revision instance or None """ if remote_addr is None: try: # if we get here outside a request, this won't work: remote_addr = unicode(request.remote_addr) except: pass if userid is None: try: # if we get here outside a request, this won't work: userid = flaskg.user.valid and flaskg.user.itemid or None except: pass if wikiname is None: wikiname = app.cfg.interwikiname state = {'trusted': trusted, NAME: [name], ACTION: action, ADDRESS: remote_addr, USERID: userid, WIKINAME: wikiname, NAMESPACE: None, ITEMID: self.itemid, # real itemid or None 'contenttype_current': contenttype_current, 'contenttype_guessed': contenttype_guessed, 'acl_parent': acl_parent, FQNAME: fqname, } ct = meta.get(CONTENTTYPE) if ct == CONTENTTYPE_USER: Schema = UserMetaSchema else: Schema = ContentMetaSchema m = Schema(meta) valid = m.validate(state) if not valid: logging.warning("metadata validation failed, see below") for e in m.children: logging.warning("{0}, {1}".format(e.valid, e)) logging.warning("data validation skipped as we have no valid metadata") if VALIDATION_HANDLING == VALIDATION_HANDLING_STRICT: raise ValueError('metadata validation failed and strict handling requested, see the log for details') # we do not have anything in m that is not defined in the schema, # e.g. userdefined meta keys or stuff we do not validate. thus, we # just update the meta dict with the validated stuff: meta.update(dict(m.value.items())) # we do not want None / empty values: # XXX do not kick out empty lists before fixing NAME processing: meta = dict([(k, v) for k, v in meta.items() if v not in [None, ]]) if valid and not validate_data(meta, data): # need valid metadata to validate data logging.warning("data validation failed") if VALIDATION_HANDLING == VALIDATION_HANDLING_STRICT: raise ValueError('data validation failed and strict handling requested, see the log for details') if self.itemid is None: self.itemid = meta[ITEMID] backend = self.backend if not overwrite: revid = meta.get(REVID) if revid is not None and revid in backend: raise ValueError('need overwrite=True to overwrite existing revisions') meta, data, content = self.preprocess(meta, data) data.seek(0) # rewind file backend_name, revid = backend.store(meta, data) meta[REVID] = revid self.indexer.index_revision(meta, content, backend_name) if not overwrite: self._current = self.indexer._document(revid=revid) if return_rev: return Revision(self, revid)
def get_schema():
    return Schema(remitente=TEXT(stored=True),
                  destinatarios=KEYWORD(stored=True),
                  asunto=TEXT(stored=True),
                  contenido=TEXT(stored=True))
class WhooshStore(SAMLStoreBase): def __init__(self): self.schema = Schema(scopes=KEYWORD(), descr=TEXT(), service_name=TEXT(), service_descr=TEXT(), keywords=KEYWORD()) self.schema.add("object_id", ID(stored=True, unique=True)) self.schema.add("entity_id", ID(stored=True, unique=True)) for a in list(ATTRS.keys()): self.schema.add(a, KEYWORD()) self._collections = set() from whoosh.filedb.filestore import RamStorage, FileStorage self.storage = RamStorage() self.storage.create() self.index = self.storage.create_index(self.schema) self.objects = dict() self.infos = dict() def dump(self): ix = self.storage.open_index() print(ix.schema) from whoosh.query import Every with ix.searcher() as searcher: for result in ix.searcher().search(Every('object_id')): print(result) def _index_prep(self, info): if 'entity_attributes' in info: for a, v in list(info.pop('entity_attributes').items()): info[a] = v for a, v in list(info.items()): if type(v) is not list and type(v) is not tuple: info[a] = [info.pop(a)] if a in ATTRS_INV: info[ATTRS_INV[a]] = info.pop(a) for a in list(info.keys()): if a not in self.schema.names(): del info[a] for a, v in list(info.items()): info[a] = [six.text_type(vv) for vv in v] def _index(self, e, tid=None): info = entity_info(e) if tid is not None: info['collection_id'] = tid self._index_prep(info) id = six.text_type(object_id(e)) # mix in tid here self.infos[id] = info self.objects[id] = e ix = self.storage.open_index() with ix.writer() as writer: writer.add_document(object_id=id, **info) writer.mergetype = writing.CLEAR def update(self, t, tid=None, ts=None, merge_strategy=None): relt = root(t) assert (relt is not None) ne = 0 if relt.tag == "{%s}EntityDescriptor" % NS['md']: self._index(relt) ne += 1 elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']: if tid is None: tid = relt.get('Name') self._collections.add(tid) for e in iter_entities(t): self._index(e, tid=tid) ne += 1 return ne def collections(self): return b2u(self._collections) def reset(self): self.__init__() def size(self, a=None, v=None): if a is None: return len(list(self.objects.keys())) elif a is not None and v is None: return len(self.attribute(a)) else: return len(self.lookup("{!s}={!s}".format(a, v))) def _attributes(self): ix = self.storage.open_index() with ix.reader() as reader: for n in reader.indexed_field_names(): if n in ATTRS: yield b2u(ATTRS[n]) def attributes(self): return b2u(list(self._attributes())) def attribute(self, a): if a in ATTRS_INV: n = ATTRS_INV[a] ix = self.storage.open_index() with ix.searcher() as searcher: return b2u(list(searcher.lexicon(n))) else: return [] def lookup(self, key, raw=True, field="entity_id"): if key == 'entities' or key is None: if raw: return b2u(list(self.objects.values())) else: return b2u(list(self.infos.values())) from whoosh.qparser import QueryParser # import pdb; pdb.set_trace() key = key.strip('+') key = key.replace('+', ' AND ') for uri, a in list(ATTRS_INV.items()): key = key.replace(uri, a) key = " {!s} ".format(key) key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key) key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key) key = key.strip() qp = QueryParser("object_id", schema=self.schema) q = qp.parse(key) lst = set() with self.index.searcher() as searcher: results = searcher.search(q, limit=None) for result in results: if raw: lst.add(self.objects[result['object_id']]) else: lst.add(self.infos[result['object_id']]) return b2u(list(lst))
def init_indexes_and_parsers(): path = app.config['SEARCH_INDEX_PATH'] # Initialize the documentations index name = 'doc' if exists_in(path, indexname=name): indexes['doc'] = open_dir(path, indexname=name) else: try: os.makedirs(path) except OSError: pass schema = Schema( id=ID(stored=True, unique=True), ) schema.add( 'title_*', TEXT(field_boost=2.0, analyzer=domotego_analyzer), glob=True ) schema.add('text_*', TEXT(analyzer=domotego_analyzer), glob=True) indexes['doc'] = create_in(path, schema, indexname=name) index_docs(Page.objects(pagetype='doc')) # Initialize the categories index name = 'category' if exists_in(path, indexname=name): indexes['category'] = open_dir(path, indexname=name) else: try: os.makedirs(path) except OSError: pass schema = Schema( id=ID(stored=True, unique=True), ) schema.add( 'name_*', TEXT(field_boost=2.0, analyzer=domotego_analyzer), glob=True ) schema.add( 'description_*', TEXT(analyzer=domotego_analyzer), glob=True ) indexes['category'] = create_in(path, schema, indexname=name) index_categories(Category.objects) # Initialize the products index name = 'product' if exists_in(path, indexname=name): indexes['product'] = open_dir(path, indexname=name) else: try: os.makedirs(path) except OSError: pass schema = Schema( id=ID(stored=True, unique=True), reference=KEYWORD, keywords=KEYWORD(lowercase=True, field_boost=1.5) ) schema.add( 'name_*', TEXT(field_boost=2.0, analyzer=domotego_analyzer), glob=True ) schema.add( 'description_*', TEXT(analyzer=domotego_analyzer), glob=True ) indexes['product'] = create_in(path, schema, indexname=name) index_products(BaseProduct.objects) # Initialize the parsers docparserfields = [] categoryparserfields = [] productparserfields = ['reference', 'keywords'] for lg in app.config['LANGS']: docparserfields.append('title_'+lg) docparserfields.append('text_'+lg) categoryparserfields.append('name_'+lg) categoryparserfields.append('description_'+lg) productparserfields.append('name_'+lg) productparserfields.append('description_'+lg) parsers['doc'] = qparser.MultifieldParser( docparserfields, schema=indexes['doc'].schema, termclass=FuzzierTerm ) parsers['category'] = qparser.MultifieldParser( categoryparserfields, schema=indexes['category'].schema, termclass=FuzzierTerm ) parsers['product'] = qparser.MultifieldParser( productparserfields, schema=indexes['product'].schema, termclass=FuzzierTerm )
def index_corpus(self): """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5, or PHI7. TLG takes almost 13 min; PHI5 1.5 min. To setup index parameters >>> # cltk_index = CLTKIndex('latin', 'phi5') # 1.5 min, 363 docs >>> # cltk_index = CLTKIndex('latin', 'phi5', chunk='work') # 2 min, 837 docs >>> # cltk_index = CLTKIndex('greek', 'tlg') # 13 min, 1823 docs >>> # cltk_index = CLTKIndex('greek', 'tlg', chunk='work') #15.5 min, 6625 docs # And to start indexing: >>> # cltk_index.index_corpus() TODO: Prevent overwriting. Ask user to rm old dir before re-indexing. TODO: Add option for lemmatizing. TODO: Add for figure out lower() options. TODO: Process TLG through forthcoming normalize(). TODO: Add name to each index. TODO: Turn off any language-specific mods (eg, stemming, case) that Whoosh might be doing by default. """ # Setup index dir schema = Schema(path=ID(stored=True), author=TEXT(stored=True), content=TEXT) try: _index = create_in(self.index_path, schema) except FileNotFoundError: os.makedirs(self.index_path) _index = create_in(self.index_path, schema) writer = _index.writer() # Setup corpus to be indexed if self.lang == 'greek' and self.corpus == 'tlg': corpus_path = os.path.normpath(get_cltk_data_dir() + '/greek/text/tlg/plaintext/') if self.chunk == 'work': corpus_path = os.path.normpath( get_cltk_data_dir() + '/greek/text/tlg/individual_works/') elif self.lang == 'latin' and self.corpus == 'phi5': corpus_path = os.path.normpath(get_cltk_data_dir() + '/latin/text/phi5/plaintext/') if self.chunk == 'work': corpus_path = os.path.normpath( get_cltk_data_dir() + '/latin/text/phi5/individual_works/') assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path # pylint: disable=line-too-long files = os.listdir(corpus_path) if self.lang == 'greek' and self.corpus == 'tlg': files = [f[:-4] for f in files if f.startswith('TLG')] corpus_index = TLG_AUTHOR_MAP elif self.lang == 'latin' and self.corpus == 'phi5': files = [f[:-4] for f in files if f.startswith('LAT')] corpus_index = PHI5_AUTHOR_MAP time_0 = time.time() logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus)) # pylint: disable=line-too-long logger.info('Index will be written to: "%s".' % self.index_path) if self.chunk == 'author': for count, file in enumerate(files, 1): try: if self.lang == 'greek' and self.corpus == 'tlg': file = file[3:] author = corpus_index[file] path = os.path.join(corpus_path, 'TLG' + file + '.TXT') if self.lang == 'latin' and self.corpus == 'phi5': author = corpus_index[file] path = os.path.join(corpus_path, file + '.TXT') except KeyError as key_error: if file.startswith('LAT9999'): continue logger.error(key_error) raise with open(path) as file_open: content = file_open.read() writer.add_document(path=path, author=author, content=content) if count % 100 == 0: logger.info('Indexed doc %s.' 
% count) if self.chunk == 'work': for count, file in enumerate(files, 1): try: if self.lang == 'greek' and self.corpus == 'tlg': path = os.path.join(corpus_path, file + '.TXT') author = corpus_index[file[3:-8]] if self.lang == 'latin' and self.corpus == 'phi5': path = os.path.join(corpus_path, file + '.TXT') author = corpus_index[file[:-8]] except KeyError as key_error: if file.startswith('LAT9999'): continue logger.error(key_error) raise with open(path) as file_open: content = file_open.read() writer.add_document(path=path, author=author, content=content) if count % 100 == 0: logger.info('Indexed doc %s.' % count) logger.info('Commencing to commit changes.') writer.commit() time_1 = time.time() elapsed = time_1 - time_0 logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed))) # pylint: disable=line-too-long
def __init__(self, index_storage, backend, wiki_name=None, acl_rights_contents=[], **kw): """ Store params, create schemas. """ self.index_storage = index_storage self.backend = backend self.wikiname = wiki_name self.ix = {} # open indexes self.schemas = {} # existing schemas common_fields = { # wikiname so we can have a shared index in a wiki farm, always check this! WIKINAME: ID(stored=True), # namespace, so we can have different namespaces within a wiki, always check this! NAMESPACE: ID(stored=True), # tokenized NAME from metadata - use this for manual searching from UI NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0), # unmodified NAME from metadata - use this for precise lookup by the code. # also needed for wildcard search, so the original string as well as the query # (with the wildcard) is not cut into pieces. NAME_EXACT: ID(field_boost=3.0), # revision id (aka meta id) REVID: ID(unique=True, stored=True), # parent revision id PARENTID: ID(stored=True), # backend name (which backend is this rev stored in?) BACKENDNAME: ID(stored=True), # MTIME from revision metadata (converted to UTC datetime) MTIME: DATETIME(stored=True), # publish time from metadata (converted to UTC datetime) PTIME: DATETIME(stored=True), # ITEMTYPE from metadata, always matched exactly hence ID ITEMTYPE: ID(stored=True), # tokenized CONTENTTYPE from metadata CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()), # unmodified list of TAGS from metadata TAGS: ID(stored=True), LANGUAGE: ID(stored=True), # USERID from metadata USERID: ID(stored=True), # ADDRESS from metadata ADDRESS: ID(stored=True), # HOSTNAME from metadata HOSTNAME: ID(stored=True), # SIZE from metadata SIZE: NUMERIC(stored=True), # ACTION from metadata ACTION: ID(stored=True), # tokenized COMMENT from metadata COMMENT: TEXT(stored=True), # SUMMARY from metadata SUMMARY: TEXT(stored=True), # DATAID from metadata DATAID: ID(stored=True), # TRASH from metadata TRASH: BOOLEAN(stored=True), # data (content), converted to text/plain and tokenized CONTENT: TEXT(stored=True, spelling=True), } latest_revs_fields = { # ITEMID from metadata - as there is only latest rev of same item here, it is unique ITEMID: ID(unique=True, stored=True), # unmodified list of ITEMLINKS from metadata ITEMLINKS: ID(stored=True), # unmodified list of ITEMTRANSCLUSIONS from metadata ITEMTRANSCLUSIONS: ID(stored=True), # tokenized ACL from metadata ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True), # ngram words, index ngrams of words from main content CONTENTNGRAM: NGRAMWORDS(minsize=3, maxsize=6), } latest_revs_fields.update(**common_fields) userprofile_fields = { # Note: email / openid (if given) should be unique, but we might # have lots of empty values if it is not given and thus it is NOT # unique overall! Wrongly declaring it unique would lead to whoosh # killing other users from index when update_document() is called! EMAIL: ID(stored=True), OPENID: ID(stored=True), DISABLED: BOOLEAN(stored=True), LOCALE: ID(stored=True), SUBSCRIPTION_IDS: ID(), SUBSCRIPTION_PATTERNS: ID(), } latest_revs_fields.update(**userprofile_fields) # XXX This is a highly adhoc way to support indexing of ticket items. 
ticket_fields = { EFFORT: NUMERIC(stored=True), DIFFICULTY: NUMERIC(stored=True), SEVERITY: NUMERIC(stored=True), PRIORITY: NUMERIC(stored=True), ASSIGNED_TO: ID(stored=True), SUPERSEDED_BY: ID(stored=True), DEPENDS_ON: ID(stored=True), CLOSED: BOOLEAN(stored=True), } latest_revs_fields.update(**ticket_fields) blog_entry_fields = { } latest_revs_fields.update(**blog_entry_fields) all_revs_fields = { ITEMID: ID(stored=True), } all_revs_fields.update(**common_fields) latest_revisions_schema = Schema(**latest_revs_fields) all_revisions_schema = Schema(**all_revs_fields) # Define dynamic fields dynamic_fields = [("*_id", ID(stored=True)), ("*_text", TEXT(stored=True)), ("*_keyword", KEYWORD(stored=True)), ("*_numeric", NUMERIC(stored=True)), ("*_datetime", DATETIME(stored=True)), ("*_boolean", BOOLEAN(stored=True)), ] # Adding dynamic fields to schemas for glob, field_type in dynamic_fields: latest_revisions_schema.add(glob, field_type, glob=True) all_revisions_schema.add(glob, field_type, glob=True) # schemas are needed by query parser and for index creation self.schemas[ALL_REVS] = all_revisions_schema self.schemas[LATEST_REVS] = latest_revisions_schema # what fields could whoosh result documents have (no matter whether all revs index # or latest revs index): self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
def __init__(self, tfidf_path, strict=True):
    schema = Schema(docid=ID(stored=True), content=TEXT(stored=True))
    self.ix = open_dir(tfidf_path)
    self.searcher = self.ix.searcher()