def test_resetsearchindexes_command_existing_dir_other_indexes(self, getdefaultlocale_mock):
    """Resetting must keep an extra index that already lives in the directory.

    An index named ``other_index`` is created up front.  After running the
    command with ``interactive`` disabled, nothing may have been written to
    stdout or stderr, and ``other_index`` must still exist next to every
    index reported by ``get_available_search_engines()``.
    """
    target_dir = self.new_index_dir
    self.options["interactive"] = False

    # Seed the directory with an additional, unrelated index.
    os.mkdir(target_dir)
    extra_schema = fields.Schema(content=fields.TEXT)
    index.create_in(target_dir, extra_schema, 'other_index')
    self.assertTrue(os.path.exists(target_dir))

    with self.settings(WIRECLOUD_INDEX_DIR=target_dir):
        try:
            call_command('resetsearchindexes', **self.options)
        except SystemExit:
            raise CommandError('')

    # Both captured streams are expected to be empty.
    for stream_name in ('stdout', 'stderr'):
        stream = self.options[stream_name]
        stream.seek(0)
        self.assertEqual(stream.read(), '')

    self.assertTrue(os.path.exists(target_dir))
    self.assertTrue(index.exists_in(target_dir, indexname='other_index'))
    for engine in get_available_search_engines():
        self.assertTrue(index.exists_in(target_dir, indexname=engine.indexname))
def __init__(self, indexname=IDX_NAME, index_location=None,
             repo_location=None, sa=None, repo_list=None,
             repo_update_list=None):
    """Collect the repositories to index and decide between full and incremental build.

    :param indexname: name stored on ``self.indexname``.
    :param index_location: directory holding the indexes (required).
    :param repo_location: root passed to ``ScmModel(sa).repo_scan`` (required).
    :param sa: handed to ``ScmModel``.
    :param repo_list: optional names; only matching repositories are kept.
    :param repo_update_list: optional names; when given, ``self.repo_paths``
        is narrowed further to these repositories.
    :raises Exception: if ``index_location`` or ``repo_location`` is empty.

    ``self.initial`` ends up ``False`` only when the index directory exists
    and contains both the ``IDX_NAME`` and ``CHGSET_IDX_NAME`` indexes.
    """
    self.indexname = indexname

    self.index_location = index_location
    if not index_location:
        raise Exception('You have to provide index location')

    self.repo_location = repo_location
    if not repo_location:
        raise Exception('You have to provide repositories location')

    self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

    # filter repo list
    if repo_list:
        # Fix non-ascii repo names to unicode.  The names are materialised
        # into a set: a bare ``map`` object is a one-shot iterator on
        # Python 3 and would be exhausted by the first membership test.
        wanted_names = set(map(safe_unicode, repo_list))
        self.filtered_repo_paths = dict(
            (repo_name, repo)
            for repo_name, repo in self.repo_paths.items()
            if repo_name in wanted_names
        )
        self.repo_paths = self.filtered_repo_paths

    # filter update repo list
    self.filtered_repo_update_paths = {}
    if repo_update_list:
        self.filtered_repo_update_paths = dict(
            (repo_name, repo)
            for repo_name, repo in self.repo_paths.items()
            if repo_name in repo_update_list
        )
        self.repo_paths = self.filtered_repo_update_paths

    # Assume a full build unless every precondition for an incremental
    # run is met.
    self.initial = True
    if not os.path.isdir(self.index_location):
        os.makedirs(self.index_location)
        log.info('Cannot run incremental index since it does not '
                 'yet exist running full build')
    elif not exists_in(self.index_location, IDX_NAME):
        log.info('Running full index build as the file content '
                 'index does not exist')
    elif not exists_in(self.index_location, CHGSET_IDX_NAME):
        log.info('Running full index build as the changeset '
                 'index does not exist')
    else:
        self.initial = False
def test_resetsearchindexes_command_individual_index(self, getdefaultlocale_mock):
    """Resetting only the ``user`` index must not create any other index."""
    index_dir = self.new_index_dir
    self.options['indexes'] = 'user'

    with self.settings(WIRECLOUD_INDEX_DIR=index_dir):
        call_command('resetsearchindexes', **self.options)

    # The captured streams are rewound but their content is not inspected.
    for stream_name in ('stdout', 'stderr'):
        self.options[stream_name].seek(0)

    for engine in get_available_search_engines():
        if engine.indexname == 'user':
            continue
        self.assertFalse(index.exists_in(index_dir, indexname=engine.indexname))

    self.assertTrue(index.exists_in(index_dir, indexname='user'))
def get_indices():
    """Return the index stored in ``INDEX_DIR``.

    The directory is created when missing.  If it holds no index yet, the
    result of ``full_index()`` is returned instead.
    """
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)

    if not index.exists_in(INDEX_DIR):
        # Nothing usable on disk: delegate to the full build.
        return full_index()
    return index.open_dir(INDEX_DIR)
def search_files(index_dir, content):
    """Search the ``content`` field of the index stored in ``index_dir``.

    :param index_dir: directory containing the Whoosh index.
    :param content: query text; converted with ``unicode`` before parsing.
    :returns: ``False`` when the index does not exist or nothing matches,
        otherwise the ``whoosh.searching.Results`` object.

    NOTE(review): the results are returned after the ``with`` block has
    closed the searcher; confirm that callers only rely on information
    that remains available once the searcher is closed.
    """
    if not index.exists_in(index_dir):
        print("index not exist")
        return False

    ix = index.open_dir(index_dir)
    content = unicode(content)
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        query = parser.parse(content)

        # whoosh.searching.Results
        results = searcher.search(query)
        print(type(results))
        hit_count = len(results)
        print(hit_count)

        for hit in results:
            # whoosh.searching.Hit
            print(type(hit))
            print(hit)

        # Honour the documented contract: only hand back real hits.
        if hit_count:
            return results
    return False
def run(self):
    """Open (or create) the index and serve messages from ``self.index_p``.

    Each received message is a ``(kind, data)`` pair dispatched to
    ``_processSearch`` (``QUERY``), ``_processLog`` (``LOG``) or
    ``_processRename`` (``RENAME``).  The loop ends when the pipe reports
    ``EOFError`` or a ``KeyboardInterrupt`` arrives; afterwards the buffer
    is dumped and the searcher and index are closed.
    """
    # open index
    self.buffer = deque(maxlen=BUFFERLINES)
    if not exists(self.indexdir):
        makedirs(self.indexdir)
        self.ix = create_in(self.indexdir, SCHEMA)
    elif exists_in(self.indexdir):
        self.ix = open_dir(self.indexdir)
    else:
        self.ix = create_in(self.indexdir, SCHEMA)
    self.qp = QueryParser("content", self.ix.schema)
    self.searcher = self.ix.searcher()

    index_p = self.index_p
    while True:
        try:
            # check index_p
            try:
                kind, data = index_p.recv()
            except EOFError:
                break

            try:
                if kind == QUERY:
                    self._processSearch(data)
                elif kind == LOG:
                    self._processLog(data)
                elif kind == RENAME:
                    self._processRename(data)
                else:
                    prnt("Unexpected data in logindexsearch.")
            except Exception:
                # Only ordinary errors are logged and survived here.  A
                # bare ``except`` would also swallow KeyboardInterrupt and
                # SystemExit, so the handler below could never fire while
                # a message is being processed.
                print_exc()
                prnt("EXCEPTION in logindexsearch process.")
        except KeyboardInterrupt:
            break

    self._dumpBuffer(self.buffer)
    self.searcher.close()
    self.ix.close()
def get_index(index_dir, schema=doc_schema):
    """Return the index in ``index_dir``, creating it with ``schema`` if absent.

    ``lib.ensure_dir`` is called on the directory first.
    """
    lib.ensure_dir(index_dir)
    if not index.exists_in(index_dir):
        return index.create_in(index_dir, schema)
    return index.open_dir(index_dir)
def get_whoosh_index(force_create=False):
    """Return the Whoosh index located at ``settings.WHOOSH_ROOT``.

    The directory is created when missing.  A new index is created when
    none exists there or when ``force_create`` is true; otherwise the
    existing one is opened.
    """
    from whoosh.index import create_in, exists_in, open_dir
    from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
    from whoosh.analysis import CharsetFilter, StemmingAnalyzer, NgramWordAnalyzer
    from whoosh.support.charset import accent_map

    # Stemming analyzer piped through the accent_map charset filter.
    analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
    # Built as in the original code, but not referenced by the schema below.
    ngramAnalyzer = NgramWordAnalyzer(minsize=2, maxsize=4)

    field_types = {
        'title': TEXT(analyzer=analyzer, spelling=True, stored=True, field_boost=3.0),
        'abstract': TEXT(analyzer=analyzer, stored=True, field_boost=2.0),
        'path': ID(unique=True, stored=True),
        'authors': TEXT(analyzer=analyzer, sortable=True, field_boost=1.5),
        'content': TEXT(analyzer=analyzer, stored=True),
        'tags': KEYWORD(sortable=True, commas=True, field_boost=1.5, lowercase=True),
        'status': KEYWORD,
        'classname': KEYWORD,
        'typeahead': TEXT(spelling=True, stored=True, phrase=False),
    }
    schema = Schema(**field_types)

    if not os.path.exists(settings.WHOOSH_ROOT):
        os.mkdir(settings.WHOOSH_ROOT)

    if exists_in(settings.WHOOSH_ROOT) and not force_create:
        return open_dir(settings.WHOOSH_ROOT)
    return create_in(settings.WHOOSH_ROOT, schema)
def app():
    """Open the first index found in the hard-coded directory and search it.

    Index names are derived from the ``*.seg`` files listed by the storage:
    the part of the file name before the first underscore is used as the
    index name.  Scanning stops as soon as one index has been opened.
    """
    indexdir = r'D:\files\whoosh_code_data\whoosh_base\index_files'
    storage = FileStorage(indexdir)

    opened = []
    for file_name in storage.list():
        if not file_name.endswith(".seg"):
            continue
        print(file_name)
        index_name = file_name.split('_')[0]
        if exists_in(indexdir, indexname=index_name):
            opened.append(open_dir(indexdir, index_name))
            # one index is enough
            break

    search(opened)
def __init__(self):
    """Initialise the indexer from the ``Indexer`` configuration section.

    Reads the index base path, the register file path, the indexer type and
    the maximum index size from ``self.process.config``.  When the type is
    ``"whoosh"`` it prepares the schema, makes sure a register file and an
    index directory exist, and opens (or creates) the index named by the
    last entry of the sorted register file.
    """
    super(Indexer, self).__init__()

    # Indexer configuration - index dir and schema setup
    # Both paths are resolved relative to the AIL_HOME environment variable.
    self.baseindexpath = join(os.environ['AIL_HOME'],
                              self.process.config.get("Indexer", "path"))
    self.indexRegister_path = join(
        os.environ['AIL_HOME'],
        self.process.config.get("Indexer", "register"))
    self.indexertype = self.process.config.get("Indexer", "type")
    self.INDEX_SIZE_THRESHOLD = self.process.config.getint(
        "Indexer", "index_max_size")

    # Stay None unless the whoosh branch below fills them in.
    self.indexname = None
    self.schema = None
    self.ix = None

    if self.indexertype == "whoosh":
        self.schema = Schema(title=TEXT(stored=True),
                             path=ID(stored=True, unique=True),
                             content=TEXT)
        if not os.path.exists(self.baseindexpath):
            os.mkdir(self.baseindexpath)

        # create the index register if not present
        time_now = int(time.time())
        if not os.path.isfile(
                self.indexRegister_path):  # indexes are not organised
            self.redis_logger.debug("Indexes are not organized")
            self.redis_logger.debug(
                "moving all files in folder 'old_index' ")
            # move all files to old_index folder
            self.move_index_into_old_index_folder()
            self.redis_logger.debug("Creating new index")
            # create the register file, holding the current timestamp
            with open(self.indexRegister_path, 'w') as f:
                f.write(str(time_now))
            # create the directory named after that timestamp
            os.mkdir(join(self.baseindexpath, str(time_now)))

        with open(self.indexRegister_path, "r") as f:
            allIndex = f.read()
            allIndex = allIndex.split()  # format [time1\ntime2]
            # NOTE(review): entries are strings, so this sort is
            # lexicographic, not numeric - confirm that is intended.
            allIndex.sort()

            try:
                # the last entry after sorting names the index to use
                self.indexname = allIndex[-1].strip('\n\r')
            except IndexError as e:
                # empty register: fall back to the current timestamp
                self.indexname = time_now

        self.indexpath = join(self.baseindexpath, str(self.indexname))
        if not exists_in(self.indexpath):
            self.ix = create_in(self.indexpath, self.schema)
        else:
            self.ix = open_dir(self.indexpath)

        self.last_refresh = time_now
def open_index(self, index_folder, create_new=False):
    """Open the index in ``index_folder`` and store it on ``self.ix``.

    With ``create_new`` an existing folder is deleted first.  The folder is
    created when missing, and a fresh index with the schema below is
    created if the folder does not contain one.
    """
    self.index_folder = index_folder

    if create_new and os.path.exists(index_folder):
        shutil.rmtree(index_folder)
        print("deleted index folder: " + index_folder)

    if not os.path.exists(index_folder):
        os.mkdir(index_folder)

    already_indexed = index.exists_in(index_folder)

    schema = Schema(
        path=ID(stored=True, unique=True),
        filename=TEXT(stored=True, field_boost=100.0),
        tags=KEYWORD(stored=True, scorable=True, field_boost=80.0),
        headlines=KEYWORD(stored=True, scorable=True, field_boost=60.0),
        doubleemphasiswords=KEYWORD(stored=True, scorable=True, field_boost=40.0),
        emphasiswords=KEYWORD(stored=True, scorable=True, field_boost=20.0),
        content=TEXT(stored=True, analyzer=StemmingAnalyzer()),
        time=STORED,
    )

    if already_indexed:
        self.ix = index.open_dir(index_folder)
    else:
        self.ix = index.create_in(index_folder, schema)
def main(index_base_path, vendor_code, index_type, data_file_type, data_file_path, data_file_list):
    """Create the vendor index and load every listed data file into it.

    :param index_base_path: root directory of all indexes.
    :param vendor_code: sub-directory of ``index_base_path``.
    :param index_type: index directory below the vendor directory; also
        passed to ``cidx.create_whoosh_idx``.
    :param data_file_type: forwarded to ``dfi.DataFileIterator``.
    :param data_file_path: forwarded to ``dfi.DataFileIterator``.
    :param data_file_list: file names to index, one writer/commit per file.

    The function always finishes by calling ``quit()``, including when
    index creation or opening fails.
    """
    vendor_path = os.path.join(index_base_path, vendor_code)
    index_path = os.path.join(vendor_path, index_type)

    # (document field, record key) pairs that are decoded as UTF-8 text.
    text_fields = (
        ('isin', 'ID_ISIN'),
        ('sedol', 'ID_SEDOL1'),
        ('cusip', 'ID_CUSIP'),
        ('country_issue_iso', 'CNTRY_ISSUE_ISO'),
        ('corp_ticker', 'EQY_PRIM_SECURITY_TICKER'),
        ('exch_code', 'EXCH_CODE'),
        ('currency', 'CRNCY'),
    )

    if cidx.create_whoosh_idx(vendor_path, index_type):
        print("%s %s" % ("sucess index creation at -->: ", index_path))
    else:
        print("%s %s" % ("failed index creation at -->: ", index_path))
        quit()

    if index.exists_in(index_path):
        ix = index.open_dir(index_path)
        # Iterate the parameter; the original looped over an undefined
        # name ``file_list``.
        for file_name in data_file_list:
            print("%s %s" % ("indexing file : ", file_name))
            idx_writer = ix.writer()
            data_reader = dfi.DataFileIterator(data_file_type, data_file_path, file_name)
            for iRecord in data_reader:
                decoded = dict(
                    (field, unicode(iRecord.get(key, None), "utf-8"))
                    for field, key in text_fields
                )
                idx_writer.add_document(raw_data=iRecord, **decoded)
            idx_writer.commit()
    else:
        print("%s %s" % ("failed to open index at -->: ", index_path))
        quit()
    quit()
def __init__(self, index_dir: Path, from_scratch: bool = False):
    """Open or create the index named ``'index'`` inside ``index_dir``.

    The directory is created when missing.  With ``from_scratch`` the
    directory is wiped and a new index is created before opening.  The
    schema found on disk is compared with ``IndexMsg.schema`` through an
    assertion.
    """
    index_name = 'index'

    target = Path(index_dir)
    if not target.exists():
        target.mkdir()

    def _clear():
        # Drop the whole directory and start again with an empty index.
        import shutil
        shutil.rmtree(index_dir)
        index_dir.mkdir()
        self.ix = index.create_in(index_dir, IndexMsg.schema, index_name)

    if from_scratch:
        _clear()

    if index.exists_in(index_dir, index_name):
        self.ix = index.open_dir(index_dir, index_name)
    else:
        self.ix = index.create_in(index_dir, IndexMsg.schema, index_name)

    assert repr(self.ix.schema.names) == repr(IndexMsg.schema.names), \
        f"Incompatible schema in your index '{index_dir}'\n" \
        f"\tExpected: {IndexMsg.schema}\n" \
        f"\tOn disk:  {self.ix.schema}"

    # Kept as a closure so that no extra members are needed on the instance.
    self._clear = _clear
    self.query_parser = QueryParser('content', IndexMsg.schema)
    self.highlighter = highlight.Highlighter()
def _init_index(self, reset=False):
    """Prepare ``self.ix`` and ``self.query_parser``.

    The index lives in the ``index`` folder below ``jupyter_data_dir()``.
    With ``reset`` that folder is removed first (errors ignored).  A new
    index is created when the folder holds none.
    """
    index_path = os.path.join(jupyter_data_dir(), 'index')

    if reset:
        # discard whatever was indexed before
        shutil.rmtree(index_path, ignore_errors=True)

    if not os.path.exists(index_path):
        os.makedirs(index_path)

    if exists_in(index_path):
        self.ix = open_dir(index_path)
    else:
        # no index yet: build one with the current schema
        self.ix = create_in(
            index_path,
            Schema(
                basename=TEXT(stored=True, field_boost=5.0),
                dirname=ID(stored=True),
                path=ID(stored=True, unique=True),
                content=TEXT(stored=False),
                time=STORED,
            ),
        )

    # the parser always follows the schema of the index actually in use
    searched_fields = ["content", "basename", "dirname"]
    self.query_parser = MultifieldParser(searched_fields, self.ix.schema)
def main():
    """Interactive entry point: build, rebuild or update the index.

    When an index already exists in ``index_dir`` the user picks between a
    new index and incremental indexing (optionally followed by an
    optimisation); any other answer exits.  Without an existing index a
    new one is built.
    """
    try:
        os.mkdir(index_dir)
    except OSError:
        print('%s is already exists' % index_dir)

    if not exists_in(index_dir):
        print('No previous index found. Creating new....')
        index_my_docs(index_dir, True)
    else:
        choise = raw_input(
            'Previous Index Found\nOptions:\n1.Create new Index\n2.Incremental Indexing\nEnter your option:'
        )
        if choise == '1':
            index_my_docs(index_dir, True)
        elif choise == '2':
            index_my_docs(index_dir)
            answer = raw_input('Do you want to optimize the index?(y/n):')
            if answer == 'y':
                print('Optimizing.Please wait...')
                optimize_index()
                print('Optimizing Completed')
        else:
            print('Wrong Option.Exiting....')
            sys.exit(0)

    print('Indexing Completed!')
def indexloc(self):
    """Index every row of the ``geoname`` table into the ``adms`` index.

    The index lives in the ``indexer`` directory and is created on first
    use.  Each row becomes one document with ``gid``, ``country_code`` and
    a ``names`` field combining ``name``, ``asciiname`` and
    ``name_alternate``.
    """
    from sidr.orm import db
    from whoosh.index import create_in, open_dir, exists_in
    from whoosh import fields

    schema = fields.Schema(gid=fields.TEXT(stored=True),
                           country_code=fields.ID(stored=True),
                           names=fields.NGRAMWORDS(stored=True, minsize=3, maxsize=15))

    # Use the freshly created index directly instead of reopening it.
    if exists_in("indexer", indexname="adms"):
        ix = open_dir("indexer", indexname="adms")
    else:
        ix = create_in("indexer", schema, indexname="adms")

    writer = ix.writer()
    rows = db.engine.execute('SELECT * FROM geoname')
    for row in rows:
        writer.add_document(
            gid=str(row['id']),
            country_code=row['country_code'],
            names="%s , %s , %s" % (row['name'], row['asciiname'], row['name_alternate']))
    writer.commit()
def get_or_create_index(path, schema, src):
    """Return the index at ``path`` after syncing it with ``src``.

    The index is opened when present and created with ``schema`` otherwise;
    ``update_index`` then receives a writer, the already indexed titles and
    the documents generated from ``src``.
    """
    if exists_in(path):
        ix = open_dir(path)
    else:
        ix = create_in(path, schema)

    known_titles = {entry['title'] for entry in gen_indexed_fields(ix)}
    source_documents = set(gen_documents(src))
    update_index(ix.writer(), known_titles, source_documents)
    return ix
def __init__(self, modref):
    '''Set up plugin state and open (or reset) the search index.'''
    self.modref = modref
    super().__init__(modref.message_handler, self)

    self.providers = set()
    self.movies = {}
    self.lock = Lock()
    self.runFlag = True

    # init the search engine
    schema_fields = {
        'source': KEYWORD(stored=True),
        'provider': KEYWORD(stored=True),
        'title': TEXT(stored=True),
        'category': TEXT(stored=True),
        'uri': ID(stored=True, unique=True),
        'url': STORED,
        'mime': STORED,
        'duration': STORED,
        'source_type': STORED,
        'description': STORED,
        'timestamp': DATETIME(stored=True),
    }
    self.whoosh_schema = Schema(**schema_fields)

    self.index_dir = DirectoryMapper.abspath(
        self.get_plugin_id(), 'runtime', 'indexdir', True)
    if not os.path.exists(self.index_dir):
        os.mkdir(self.index_dir)

    if not index.exists_in(self.index_dir):
        # creates a new index
        self.reset_index()
    else:
        self.whoosh_ix = index.open_dir(self.index_dir)
def get(self, name='__indexdir', dump=config.DUMP_FOLDER, destructive=False):
    """Open the index below ``config.ROOT``, building it first when needed.

    :param name: directory name of the index, relative to ``config.ROOT``.
    :param dump: directory handed to ``self.build`` when a new index is
        populated.
    :param destructive: remove an existing index directory and rebuild.
    :returns: ``self``, with ``self.index`` and ``self.reader`` set.

    The process exits through ``sys.exit(0)`` when the directory cannot be
    created or a ``FileNotFoundError`` occurs while building.
    """
    index_path = config.ROOT.joinpath(name)
    path = str(index_path)

    if destructive and index_path.exists():
        shutil.rmtree(path)
        # wait until the directory is really gone
        while index_path.exists():
            pass

    if destructive or (not index_path.exists() or not index.exists_in(path)):
        try:
            # exist_ok: a directory that exists but holds no index must be
            # populated rather than treated as a fatal error.
            index_path.mkdir(exist_ok=True)
            self.index = index.create_in(path, WikiSchema())
            logging.info('Index newly created, adding documents')
            self.build(directory=dump)
        except (FileExistsError, FileNotFoundError):
            logger.error('Index already exist or parent not found')
            sys.exit(0)

    self.index = index.open_dir(path)
    print(' * Bootstrap index reader')
    self.reader = self.index.reader()
    return self
def index_search(group, sheet_name, wiki_key):
    """Make sure a search index exists for the given permission set.

    The index directory is derived from ``permissions_sha(sheet_name,
    wiki_key, group)``.  An existing index is opened and registered in
    ``indices``; otherwise a new one is built from the objects returned by
    ``cull_invalid_objects`` and registered.

    :returns: ``None`` when an existing index was reused, ``""`` after a
        rebuild (both kept for backward compatibility).
    """
    sha = permissions_sha(sheet_name, wiki_key, group)
    index_dir = os.path.join(app.config['SPREADSHEET_FOLDER'], sheet_name, "indices", sha)

    if index.exists_in(index_dir):
        print("Index already exists for " + sheet_name + " / " + wiki_key + " / " + group + " (or comparable)")
        indices[sha] = index.open_dir(index_dir)
        return

    # makedirs also creates a missing "indices" parent directory, which a
    # plain mkdir would fail on.
    os.makedirs(index_dir, exist_ok=True)

    print("Reindexing for " + sheet_name + " / " + group)
    schema = Schema(key=ID(stored=True, unique=True), content=TEXT)
    ix = create_in(index_dir, schema)
    writer = ix.writer()

    key_column = sheet_config[sheet_name]["key_column"]
    allowed_columns = permissions[sheet_name][wiki_key][group]["columns"]
    for obj in cull_invalid_objects(group, sheet_name, wiki_key):
        visible_values = cull_invalid_columns(obj, allowed_columns).values()
        writer.add_document(
            key=obj[key_column],
            content=" ".join([str(value) for value in visible_values])
        )
    writer.commit()

    indices[sha] = ix
    return ""
def incremental_index(doc_dir):
    """Bring the index up to date using the documents' modification times.

    Without an existing index the work is delegated to ``clean_index``.
    Otherwise entries whose file vanished are deleted, entries whose file
    is newer than the stored ``time`` are replaced, and files not yet in
    the index are added.  The commit optimises the index.
    """
    if not index.exists_in(get_index_dir()):
        clean_index(doc_dir)
        return

    ix = index.open_dir(get_index_dir())
    known_paths = set()   # every path currently stored in the index
    stale_paths = set()   # paths whose file changed after indexing

    writer = ix.writer()
    with ix.searcher() as searcher:
        for stored in searcher.all_stored_fields():
            doc_path = stored['path']
            known_paths.add(doc_path)

            if not os.path.exists(doc_path):
                # file deleted since it was indexed -> drop its entry
                writer.delete_by_term('path', doc_path)
                continue

            indexed_time = stored['time']
            if os.path.getmtime(doc_path) > indexed_time:
                # file changed -> drop the old entry and re-add it below
                writer.delete_by_term('path', doc_path)
                stale_paths.add(doc_path)

    for filename in get_document_names(doc_dir):
        candidate = os.path.join(doc_dir, filename)
        if candidate in stale_paths or candidate not in known_paths:
            add_doc(writer, filename)

    writer.commit(optimize=True)
def _get_index(index_path, schema):
    """Open the index at ``index_path`` or create it with ``schema``.

    The directory is created when it does not exist yet.
    """
    if index.exists_in(index_path):
        return index.open_dir(index_path)

    if not os.path.exists(index_path):
        os.mkdir(index_path)
    return index.create_in(index_path, schema)
def createIndex(self, file, directory):
    """Create a fresh index in ``directory`` and write ``file`` into it.

    A missing directory is created.  A directory without an index is
    deleted and recreated.  An existing index is overwritten.
    """
    if not os.path.exists(directory):
        os.mkdir(directory)
    elif not exists_in(directory):
        print('Directory already exists and does not contain any index, deleting and creating new index...\n')
        shutil.rmtree(directory)
        os.mkdir(directory)

    if exists_in(directory):
        print('overwriting current index...\n')

    self.directory = directory
    self.ix = create_in(directory, self.schema)
    self.writer = self.ix.writer()
    self.writeToIndex(file)
def get_index(config):
    """Open the index in ``wsearch.indexdir``, creating it when absent.

    A relative index directory is resolved against ``root_dir`` from the
    config.  When no index exists the directory is created (an ``OSError``
    from that is ignored) and a new index is built from ``wsearch.schema``.
    """
    index_dir = config.get('wsearch.indexdir',
                           SEARCH_DEFAULTS['wsearch.indexdir'])
    if not os.path.isabs(index_dir):
        index_dir = os.path.join(config.get('root_dir', ''), index_dir)

    if exists_in(index_dir):
        # Exceptions are deliberately not trapped here: it is unknown
        # which ones can occur, so they should surface.
        return open_dir(index_dir)

    try:
        os.mkdir(index_dir)
    except OSError:
        pass
    field_spec = config.get('wsearch.schema',
                            SEARCH_DEFAULTS['wsearch.schema'])
    return create_in(index_dir, Schema(**field_spec))
def create_or_open_index(self):
    """Set ``self.ix`` to the index in ``self.index_dir``, creating it if needed."""
    if not index.exists_in(self.index_dir):
        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)
        self.ix = create_in(self.index_dir, self.schema)
        return
    self.ix = index.open_dir(self.index_dir)
def ix(self, name):
    """Return the index called ``name`` from ``self.index_path``.

    The schema is read from the attribute ``<name>_schema``.  A missing
    index is created with it; an existing one is opened and passed through
    ``update_schema`` first.
    """
    schema = getattr(self, '%s_schema' % name)

    if exists_in(self.index_path, indexname=name):
        existing = open_dir(self.index_path, indexname=name)
        update_schema(existing, schema)
        return existing
    return create_in(self.index_path, schema, indexname=name)
def createIndex():
    """Add the not-yet-indexed entries of both feeds to the index.

    The index in ``index_dir`` is opened or created.  For every entry of
    the DN feed and then the JN feed that ``checkIfDocExists`` reports as
    missing, the title is appended to the feed's text file and a document
    is added.  A single commit happens at the end.
    """
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    if index.exists_in(index_dir):
        ix = index.open_dir(index_dir)
    else:
        schema = Schema(title=TEXT(stored=True), body=TEXT(stored=True), link=TEXT(stored=True))
        ix = create_in(index_dir, schema)

    writer = ix.writer()

    # (feed source, tag for checkIfDocExists, file collecting the titles)
    feed_jobs = (
        (source_dn_all, 'DN', 'dn_news.txt'),
        (source_jn_all, 'JN', 'jn_news.txt'),
    )
    for source, tag, titles_file in feed_jobs:
        for feed in source.entries:
            description = feed.summary.split("<img")
            title = feed.title.encode('utf-8')
            if checkIfDocExists(title, tag) is False:
                with open(titles_file, 'a') as news_file:
                    news_file.write(feed.title.encode('utf-8')+' \n')
                writer.add_document(title=feed.title, body=description[0],
                                    link=feed['feedburner_origlink'])

    writer.commit()
def get_index(index, indexname="ARTIFACTS", schema=None):
    """Open or create a nested whoosh index.

    Parameters
    ----------
    index : str
        Directory backing the index; created when it does not exist.
    indexname : str
        Name of the index inside that directory.
    schema : whoosh.fields.Schema
        The schema to use for the index.

    Returns
    -------
    libcflib.index.NestedIndex
        The existing index when one with ``indexname`` is found, otherwise
        a newly created one.
    """
    storage = FileStorage(index)
    if not os.path.exists(index):
        os.mkdir(index)

    if not exists_in(index, indexname):
        return NestedIndex.create(storage, schema, indexname)
    return NestedIndex(storage, schema=schema, indexname=indexname)
def open_index(self, index_folder, create_new=False):
    """Attach ``self.ix`` to the index stored in ``index_folder``.

    ``create_new`` removes an existing folder beforehand.  The folder is
    created when absent and a new index is built when none is found.
    """
    self.index_folder = index_folder

    if create_new:
        folder_present = os.path.exists(index_folder)
        if folder_present:
            shutil.rmtree(index_folder)
            print("deleted index folder: " + index_folder)

    if not os.path.exists(index_folder):
        os.mkdir(index_folder)

    exists = index.exists_in(index_folder)

    stemming_analyzer = StemmingAnalyzer()
    field_types = {
        'path': ID(stored=True, unique=True),
        'filename': TEXT(stored=True, field_boost=100.0),
        'tags': KEYWORD(stored=True, scorable=True, field_boost=80.0),
        'headlines': KEYWORD(stored=True, scorable=True, field_boost=60.0),
        'doubleemphasiswords': KEYWORD(stored=True, scorable=True, field_boost=40.0),
        'emphasiswords': KEYWORD(stored=True, scorable=True, field_boost=20.0),
        'content': TEXT(stored=True, analyzer=stemming_analyzer),
        'time': STORED,
    }
    schema = Schema(**field_types)

    if exists:
        self.ix = index.open_dir(index_folder)
    else:
        self.ix = index.create_in(index_folder, schema)
def addTermNarrower(self, tagSubjectList, termNarrower):
    """Store one document per tag subject, each pointing at ``termNarrower``.

    A new index is created first when none exists yet; the narrower term is
    finally passed to ``addToWordList``.
    """
    index_present = index.exists_in(utils.indexerDir(), utils.indexName)
    if not index_present:
        self.createNewIndex()

    for subject in tagSubjectList:
        self.__writer.add_document(
            tagSubject=unicode(subject),
            termNarrower=unicode(termNarrower),
        )

    self.addToWordList(termNarrower)
def search_documents(filter):
    """Search the document index and return the collapsed, sorted results.

    :param filter: query text.  A leading ``tags:`` restricts the search to
        the ``tags`` field; otherwise ``path`` and ``content`` are searched.
    :returns: ``None`` when no index exists below ``DATA_DIR``, otherwise
        the search results (collapsed on path and content, keeping the
        highest revision, sorted by path and descending date).
    """
    # Check for existing index
    dir_path = os.path.join(DATA_DIR, 'index')
    if not os.path.exists(dir_path) or not Index.exists_in(dir_path):
        return None

    index = Index.open_dir(dir_path)

    if filter.startswith('tags:'):
        fields = ['tags']
        filter = filter[5:]
    else:
        fields = ['path', 'content']

    parser = MultifieldParser(fields, schema=index.schema)
    search_query = parser.parse(unicode(filter))

    # Created outside the ``try`` so that a failure here is reported as
    # such instead of surfacing as an unbound ``searcher`` in ``finally``.
    searcher = index.searcher(closereader=False)
    try:
        return searcher.search(
            search_query,
            collapse=[sorting.FieldFacet('path'), sorting.FieldFacet('content')],
            collapse_order=sorting.FieldFacet('revision', reverse=True),
            sortedby=[sorting.FieldFacet('path'), sorting.FieldFacet('date', reverse=True)]
        )
    finally:
        searcher.close()
def createIndex():
    """Index the new entries of the DN feed and then the JN feed.

    Opens or creates the index in ``index_dir``.  Entries for which
    ``checkIfDocExists`` returns ``False`` get their title appended to the
    feed's text file and are added as documents; everything is committed
    once at the end.
    """
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    if not index.exists_in(index_dir):
        ix = create_in(
            index_dir,
            Schema(title=TEXT(stored=True), body=TEXT(stored=True), link=TEXT(stored=True)),
        )
    else:
        ix = index.open_dir(index_dir)

    writer = ix.writer()

    def _ingest(entries, tag, titles_file):
        # Add every entry of one feed that is not indexed yet.
        for feed in entries:
            description = feed.summary.split("<img")
            title = feed.title.encode('utf-8')
            if checkIfDocExists(title, tag) is not False:
                continue
            with open(titles_file, 'a') as news_file:
                news_file.write(feed.title.encode('utf-8') + ' \n')
            writer.add_document(title=feed.title, body=description[0],
                                link=feed['feedburner_origlink'])

    _ingest(source_dn_all.entries, 'DN', 'dn_news.txt')
    _ingest(source_jn_all.entries, 'JN', 'jn_news.txt')

    writer.commit()
def _init_index(self, reset=False):
    """Open or create the index and build the matching query parser.

    The index is kept in the ``index`` folder below ``jupyter_data_dir()``.
    ``reset`` removes that folder beforehand (errors ignored).  A newly
    created index uses ``ChineseAnalyzer`` for its analysed fields.
    """
    index_path = os.path.join(jupyter_data_dir(), "index")

    # clear out old index if requested
    if reset:
        shutil.rmtree(index_path, True)

    # make sure there's a path to store the index data
    if not os.path.exists(index_path):
        os.makedirs(index_path)

    if exists_in(index_path):
        # open the existing index
        self.ix = open_dir(index_path)
    else:
        # create an index with the current schema
        analyzer = ChineseAnalyzer()
        field_types = {
            'basename': TEXT(stored=True, field_boost=5.0, analyzer=analyzer),
            'dirname': ID(stored=True, analyzer=analyzer),
            'path': ID(stored=True, unique=True, analyzer=analyzer),
            'content': TEXT(stored=False, analyzer=analyzer),
            'time': STORED,
        }
        self.ix = create_in(index_path, Schema(**field_types))

    # build a query parser based on the current schema
    self.query_parser = MultifieldParser(
        ["content", "basename", "dirname"], self.ix.schema)
def __init__(self):
    """Open or create the ``.index`` index below ``edocuments.root_folder``.

    After opening, the schema is migrated in place: a ``path`` field is
    removed, and ``directory`` and ``md5`` fields are added when missing.
    The field lengths of ``path_id``, ``content`` and ``md5`` are printed.
    """
    self.directory = os.path.join(edocuments.root_folder, '.index')
    self.dirty = False

    schema = Schema(**{
        PATH: ID(stored=True, unique=True),
        CONTENT: TEXT(stored=True),
        DATE: STORED,
        DIRECTORY: STORED,
        MD5: TEXT(stored=True),
    })
    self.parser_path = QueryParser("path_id", schema)
    self.parser_content = QueryParser("content", schema)

    if not exists_in(self.directory):
        # The directory may exist without holding an index; calling
        # makedirs unconditionally would raise in that case.
        if not os.path.isdir(self.directory):
            os.makedirs(self.directory)
        self.index = create_in(self.directory, schema)
    else:
        self.index = open_dir(self.directory)

    # Schema migrations, each in its own writer.
    if 'path' in self.index.schema.names():
        with self.index.writer() as writer:
            writer.remove_field('path')
    if 'directory' not in self.index.schema.names():
        with self.index.writer() as writer:
            writer.add_field('directory', STORED)
    if 'md5' not in self.index.schema.names():
        with self.index.writer() as writer:
            writer.add_field('md5', TEXT(stored=True))

    print(
        'Field length:\npath: %i\ncontent: %i\nmd5: %i' % (
            self.index.field_length("path_id"),
            self.index.field_length("content"),
            self.index.field_length("md5"),
        )
    )
def __init__(self, db_path):
    """Open the index in ``db_path`` (created via ``self.schema`` if absent).

    ``ensuredir`` is called on the path first; a query parser for the
    ``text`` field is stored on ``self.qparser``.
    """
    ensuredir(db_path)

    if not index.exists_in(db_path):
        self.index = index.create_in(db_path, schema=self.schema)
    else:
        self.index = index.open_dir(db_path)

    self.qparser = QueryParser('text', self.schema)
def init(self):
    """Return the index at ``<self.path>/<self.name>``, creating it if needed."""
    ix_path = os.path.join(self.path, self.name)

    if not whoosh_index.exists_in(ix_path):
        # make sure the directory is there before creating the index
        if not os.path.exists(ix_path):
            os.makedirs(ix_path)
        return whoosh_index.create_in(ix_path, self.schema)

    return whoosh_index.open_dir(ix_path)
def __init__(self, pickle_path='index', index_name='telegram_searcher', from_scratch=False):
    """Open or create the index ``index_name`` inside ``pickle_path``.

    The directory is created when missing.  With ``from_scratch`` every
    file whose name matches the index name (optionally prefixed by ``_``)
    is removed and an empty index is created before opening.
    """
    schema = Schema(
        content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
        url=ID(stored=True, unique=True),
        chat_id=STORED(),
        post_time=DATETIME(stored=True),
    )

    storage_dir = Path(pickle_path)
    if not storage_dir.exists():
        storage_dir.mkdir()

    def _clear():
        # Delete the files belonging to this index, then recreate it empty.
        pattern = re.compile(f'^_?{index_name}.*')
        for entry in Path(pickle_path).iterdir():
            if pattern.match(entry.name):
                os.remove(str(entry))
        self.ix = create_in(pickle_path, schema, index_name)

    if from_scratch:
        _clear()

    if exists_in(pickle_path, index_name):
        self.ix = open_dir(pickle_path, index_name)
    else:
        self.ix = create_in(pickle_path, schema, index_name)

    # Kept as a closure so that no extra members are needed on the instance.
    self._clear = _clear
    self.query_parser = QueryParser('content', schema)
    self.highlighter = highlight.Highlighter()
def index_exists(dirname=INDEXDIR, indexname=INDEXNAME):
    """
    index_exists([dirname="index", indexname="MAIN"])

    Check whether the index :attr:`indexname` exists in the directory
    :attr:`dirname`.  The index name is upper-cased before the lookup.

    .. code-block:: python

        from storyline.engine.index import index_exists

        # Example where the directory "index" exists and holds the index MAIN.
        >>> index_exists()
        True
        >>> index_exists("index")
        True
        >>> index_exists("index", "indexname")
        False

    :param dirname: Name of the index directory.
    :type dirname: str
    :param indexname: Name of the index.
    :type indexname: str
    :returns: True or False.
    """
    normalized_name = indexname.upper()
    return index.exists_in(dirname, normalized_name)
def index(self, locales, init=False, **options):
    """Create index records for all dimensions in the cube.

    :param locales: locales to index; each dimension is indexed once per
        locale, tagged with the positions of the locale and the dimension.
    :param init: call ``self.initialize()`` before indexing.
    :param options: forwarded to ``self.index_dimension``.
    :raises Exception: if no index exists at ``self.path``.
    """
    # FIXME: this works only for one locale - specified in browser
    if init:
        self.initialize()

    if not index.exists_in(self.path):
        raise Exception("Index is not initialized in '%s'" % self.path)

    self.writer = index.open_dir(self.path).writer()

    cube = self.browser.cube
    for locale_position, locale in enumerate(locales):
        for dim_position, dimension in enumerate(cube.dimensions):
            self.index_dimension(dimension, dim_position,
                                 locale=locale,
                                 locale_tag=locale_position,
                                 **options)

    self.writer.commit()
def __init__(self, in_folder='testing_index', bool_only=False, from_file=None, with_dict=None, with_index=None, custom_schema=None):
    ''' Open or create the key/value index stored in ``in_folder``.

    :param in_folder: index directory, created when missing.
    :param bool_only: when true the key and value fields are not stored.
        The mode is persisted in the marker file ``ixIsTruthy`` and must
        match on every later open.
    :param with_dict: optional mapping whose items are added as documents.
    :param from_file: currently unused.
    :param with_index: currently unused.
    :param custom_schema: currently unused.
    :raises Exception: if the directory was created with a different
        ``bool_only`` mode.
    '''
    self.indexdir = in_folder
    self.bool_only = bool_only
    self._schema = Schema(keystring=TEXT(stored=not self.bool_only),
                          valuestring=TEXT(stored=not self.bool_only))

    if not index.exists_in(self.indexdir):
        if not os.path.exists(self.indexdir):
            os.mkdir(self.indexdir)
        self.ix = index.create_in(self.indexdir, self._schema)
    else:
        self.ix = index.open_dir(self.indexdir)

    marker_path = self.indexdir + '/ixIsTruthy'
    if os.path.exists(marker_path):
        # ``with`` closes the marker file even when the check below raises.
        with open(marker_path, 'rb') as ixinfofile:
            ixinfo = ixinfofile.readline()
        if ixinfo == '1':
            ixinfo = True
        elif ixinfo == '0':
            ixinfo = False
        if ixinfo != self.bool_only:
            raise Exception(
                'cannot open existing index in a different bool_only mode. change it or add to argument and set as true'
            )
    else:
        with open(marker_path, 'wb') as ixinfofile:
            if self.bool_only:
                ixinfofile.write('1')
            else:
                ixinfofile.write('0')

    self.writer = self.ix.writer()
    if with_dict:
        for key in with_dict:
            # Must go through self.writer; a bare ``writer`` is undefined.
            self.writer.add_document(keystring=unicode(key),
                                     valuestring=unicode(with_dict[key]))
    # NOTE(review): the commit is kept unconditional - confirm no caller
    # expects self.writer to stay open after construction.
    self.writer.commit()
def get_location_index():
    """Return the ``location_index`` index, creating and filling it if absent."""
    index_name = "location_index"

    if index.exists_in(whoosh_index_path, indexname=index_name):
        return index.open_dir(whoosh_index_path, indexname=index_name)

    # first use: build the index and populate it
    loc_ix = index.create_in(whoosh_index_path, schema=LocationSchema(),
                             indexname=index_name)
    fill_location_index(loc_ix)
    return loc_ix
def _get_index(self):
    """Return the Whoosh index dedicated to this model class.

    The index directory is ``<WHOOSH_BASE>/<class name>``, with
    ``WHOOSH_BASE`` read from the application config.  The directory, and
    any missing parent, is created on demand; a missing index is created
    with the schema returned by ``self._get_schema()``.
    """
    index_directory = "%s/%s" % (current_app.config.get("WHOOSH_BASE"),
                                 self.__class__.__name__)

    # parents/exist_ok: tolerate a missing WHOOSH_BASE directory and a
    # directory that appears between the check and the creation.
    Path(index_directory).mkdir(parents=True, exist_ok=True)

    if not index.exists_in(index_directory):
        return create_in(index_directory, self._get_schema())
    return index.open_dir(index_directory)
def get_or_create_index(path, schema, src):
    """Return the index at ``path``, updated from the corpus built on ``src``.

    An existing index is opened, otherwise one is created with ``schema``.
    ``update_index`` is then given a writer, the titles already indexed and
    the documents produced by ``Corpus(src).gen_documents()``.
    """
    index = open_dir(path) if exists_in(path) else create_in(path, schema)

    indexed_titles = set()
    for stored in gen_indexed_fields(index):
        indexed_titles.add(stored['title'])

    documents = set(Corpus(src).gen_documents())

    writer = index.writer()
    update_index(writer, indexed_titles, documents)
    return index
def get_category_index():
    """Return the ``category_index`` index, creating and filling it if absent."""
    already_there = index.exists_in(whoosh_index_path, indexname="category_index")
    if not already_there:
        # first use: build the index and populate it
        cat_ix = index.create_in(whoosh_index_path, schema=CategorySchema(),
                                 indexname="category_index")
        fill_category_index(cat_ix)
        return cat_ix
    return index.open_dir(whoosh_index_path, indexname="category_index")
def get_restaurant_index():
    """Return the ``restaurant_index`` index, creating and filling it if absent."""
    lookup = {'indexname': "restaurant_index"}

    if index.exists_in(whoosh_index_path, **lookup):
        rest_ix = index.open_dir(whoosh_index_path, **lookup)
    else:
        # first use: build the index and populate it
        rest_ix = index.create_in(whoosh_index_path,
                                  schema=RestaurantSchema(), **lookup)
        fill_restaurant_index(rest_ix)
    return rest_ix
def get_index(self):
    """Open the index at ``self.indexpath``, creating directory and index if needed.

    A path that does not start with ``/`` is resolved against
    ``self.env.path``.
    """
    index_dir = self.indexpath
    if not index_dir.startswith('/'):
        index_dir = path.join(self.env.path, index_dir)

    if not path.exists(index_dir):
        os.mkdir(index_dir)

    index_present = index.exists_in(index_dir)
    if not index_present:
        index.create_in(index_dir, self.SCHEMA)

    return index.open_dir(index_dir)
def __init__(self, repos_path, index_path):
    """Open the repository at ``repos_path`` and the index at ``index_path``.

    The repository's own index is kept in ``self.git_index``. The search
    index is opened when one exists in ``index_path``. Otherwise it is
    created with a schema of a unique stored ``path`` ID, a stored
    ``itime`` and a ``content`` text field.
    """
    self.repo = Repo(repos_path)
    self.index_path = index_path
    self.git_index = self.repo.open_index()

    if exists_in(self.index_path):
        self.ix = open_dir(self.index_path)
        return

    fresh_schema = Schema(path=ID(unique=True, stored=True),
                          itime=STORED,
                          content=TEXT)
    self.ix = create_in(self.index_path, fresh_schema)
def __init__(self, path="./urllist"):
    """Open the index stored in ``path``, creating it when none exists.

    A new index uses a schema with stored ``title`` (TEXT), ``aya``
    (KEYWORD), ``url`` (ID) and ``cache`` (ID) fields. The resulting index
    object is kept in ``self.index``.

    Parameters:
        path: directory that holds (or will hold) the index.
    """
    # Local import: only ``makedirs`` is known to be in scope in this file.
    import os

    if index.exists_in(path):
        ix = open_dir(path)
    else:
        schema = Schema(title=TEXT(stored=True),
                        aya=KEYWORD(stored=True),
                        url=ID(stored=True),
                        cache=ID(stored=True))
        # The directory may already exist without an index in it; calling
        # makedirs unconditionally would raise OSError in that case.
        if not os.path.isdir(path):
            makedirs(path)
        ix = create_in(path, schema)
    self.index = ix
def incremental_index(index_dir, root_dir):
    """Re-index only the documents that changed since the last run.

    Stored documents whose file no longer exists are deleted from the
    index. Documents whose file modification time is newer than the stored
    ``time`` field are deleted and re-added. Files under ``root_dir`` that
    are not in the index yet are added.

    Parameters:
        index_dir: directory where the index is stored (created if missing).
        root_dir: directory containing all files to be indexed.
    """
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    if not index.exists_in(index_dir):
        print ("index not exist, create it")
        index.create_in(index_dir, schema=get_schema())
    ix = index.open_dir(index_dir)

    # All paths currently in the index.
    indexed_paths = set()
    # All paths that need to be re-indexed.
    to_reindex_paths = set()

    # Using the writer as a context manager commits on success and cancels
    # on error, so a failure mid-run does not leave the write lock held.
    with ix.searcher() as searcher, ix.writer() as writer:
        # Loop over the stored fields in the index.
        for fields in searcher.all_stored_fields():
            indexed_path = fields["path"]
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # The file was deleted after it was indexed, so drop it
                # from the index.
                writer.delete_by_term("path", indexed_path)
            else:
                # Check whether the file changed after it was indexed.
                indexed_time = fields["time"]
                mtime = os.path.getmtime(indexed_path)
                if mtime > indexed_time:
                    # Stale entry: delete it now and queue the file for
                    # re-indexing below.
                    writer.delete_by_term("path", indexed_path)
                    to_reindex_paths.add(indexed_path)

        # Loop over the files in the filesystem.
        for filepath in list_all_files(root_dir):
            if filepath not in indexed_paths:
                # New file: index it.
                add_file_to_index(writer, filepath)
                print ("{0} is a new file".format(filepath))
            elif filepath in to_reindex_paths:
                # Changed file: index it again.
                add_file_to_index(writer, filepath)
                print ("{0} is a changed file".format(filepath))
            else:
                # Unchanged since it was indexed.
                print ("{0} not changed".format(filepath))
def setup(self):
    """Bind ``self.ix`` to the index at ``self.location``.

    The directory is created when missing. An index already present in an
    existing directory is opened with ``self.schema``. In every other case
    a new index is created from ``self.schema``.
    """
    import os

    directory_present = os.path.exists(self.location)
    if not directory_present:
        os.mkdir(self.location)

    # A directory that was just created cannot contain an index, so the
    # existence check only runs for a directory that was already there.
    if directory_present and index.exists_in(self.location):
        self.ix = index.open_dir(self.location, schema=self.schema)
    else:
        self.ix = index.create_in(self.location, self.schema)
def index(self):
    """Return the index for ``self.location``, opening or creating it once.

    The result is cached in ``self._index``, so later calls return the
    cached object without touching the filesystem.
    """
    if self._index is not None:
        return self._index

    if not os.path.isdir(self.location):
        os.makedirs(self.location)

    self._index = (
        open_dir(self.location)
        if exists_in(self.location)
        else create_in(self.location, self._get_schema())
    )
    return self._index
def init_app(self, app):
    """Attach ``app`` and make sure its Whoosh index exists.

    Raises ``exc.InitializationError`` when the application configuration
    has no ``WHOOSH_INDEX_PATH`` entry. When no index exists at that path,
    ``self.setup_index()`` is called.
    """
    self.app = app
    config = self.app.config
    if 'WHOOSH_INDEX_PATH' not in config:
        raise exc.InitializationError("You must set the WHOOSH_INDEX_PATH option in the configuration")

    self.index_dir = config["WHOOSH_INDEX_PATH"]
    if exists_in(self.index_dir):
        return
    self.setup_index()
def get_index(cls):
    """Return the index stored in ``cls.get_index_dir()``.

    An existing index is opened. Otherwise the directory is created when
    missing and a new index is built from ``cls.schema``.
    """
    index_dir = cls.get_index_dir()
    if index.exists_in(index_dir):
        return index.open_dir(index_dir)

    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    return index.create_in(index_dir, cls.schema)
def get_index_writer(self, clear=False):
    """Return a writer for the index in ``self._index_dir``.

    With ``clear`` set, or when no index exists yet, the index comes from
    ``self.create_index()``. Otherwise the existing index is opened.
    """
    # Short-circuit keeps the existence check from running when clear is set.
    if clear or not index.exists_in(self._index_dir):
        target = self.create_index()
    else:
        target = index.open_dir(self._index_dir)
    return target.writer()
def __init__(self):
    """Resolve the index directory and open the index if one exists.

    ``self.index_dir_setting`` is used as the directory. A relative value
    is joined onto the ``path`` of ``get_global_env(self.env)``.
    ``self.index`` is set to the opened index, or to ``None`` when the
    directory holds no index.
    """
    self.index_dir = self.index_dir_setting
    relative = not os.path.isabs(self.index_dir)
    if relative:
        base_path = get_global_env(self.env).path
        self.index_dir = os.path.join(base_path, self.index_dir)

    self.index = (
        index.open_dir(self.index_dir)
        if index.exists_in(self.index_dir)
        else None
    )
def init():
    """Prepare the e-mail and address-book indexes, then run a search prompt.

    Two named indexes ("index_emails" and "index_book") are kept under
    ``index_path``, which is created when missing. An index that does not
    exist yet is created and populated through ``index_emails`` (with the
    files returned by ``read_dir()``) or ``index_book``. An existing index
    is only opened. The function then loops forever: it reads a term with
    ``raw_input``, searches the ``subject`` and ``body`` fields of the
    e-mail index and prints the sender and subject of each hit.

    This is Python 2 code (``print`` statements, ``raw_input``,
    ``unicode``).

    NOTE(review): no ``writer.commit()`` is visible here; presumably
    ``index_emails`` / ``index_book`` commit the writer they receive.
    Confirm.
    NOTE(review): the nesting below was reconstructed from a flattened
    source line; confirm it against the original file.
    """
    # Setting my schema ...
    schema_email = Schema(
        path=TEXT(stored=True),
        sender_email=TEXT(stored=True),
        recipient_emails=TEXT,
        date=DATETIME,
        subject=TEXT(stored=True),
        body=TEXT,
    )
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    # Maps each index name to the schema used when that index is created.
    schemas = {"index_emails": schema_email, "index_book": schema_book}
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    # Opened index objects, keyed by index name.
    indexes = {}
    for ixname, schema in schemas.items():
        # English translation of the note below: "This part could be
        # improved, since it only indexes when the index does not exist. It
        # does not take into account whether the indexed files were modified
        # or deleted, as explained at the URL given."
        """ Esta parte es mejorable, ya que sólo indexa si no existe indice. No tiene en cuenta si los archivos indexados se han modificado o si se han eliminado como se explica aquí: @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing """
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)
            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            # Each index has its own population routine.
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            # Index already present: open it without re-indexing anything.
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix
    # Main routine
    # The loop has no exit condition; it runs until the process is interrupted.
    while True:
        ix = indexes.get("index_emails")
        with ix.searcher() as searcher:
            input_user = str(raw_input("Introduzca una palabra del asunto o cuerpo (p.e. contrato): "))
            # Search the subject and body fields for the entered term.
            mparser = MultifieldParser(["subject", "body"], schema=ix.schema)
            myquery = mparser.parse(unicode(input_user))
            results = searcher.search(myquery)
            print "=================================================="
            for result in results:
                # read_file(result.get("path"))
                # findNameBySender is given all indexes and the hit's sender e-mail.
                print ("Remitente: " + findNameBySender(indexes, result.get("sender_email")))
                print ("Asunto: " + result.get("subject"))
                print "=================================================="