def get_schema_init_fields(self) -> Dict[str, "FieldType"]:
    """Returns the arguments to be passed to the whoosh schema object
    instantiation found in the method `get_schema`.
    """
    from whoosh.fields import TEXT, ID, KEYWORD, STORED  # noqa: F401

    # This part is non-negotiable
    fields = {Database.get_id_key(): ID(stored=True, unique=True)}

    # TODO: this is a security risk, find a way to fix it
    user_prototype = eval(
        papis.config.getstring('whoosh-schema-prototype'))  # KeysView[str]
    fields.update(user_prototype)

    fields_list = papis.config.getlist('whoosh-schema-fields')
    for field in fields_list:
        fields.update({field: TEXT(stored=True)})

    return fields
def createSearchableData(root):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT(stored=True), textdata=TEXT(stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    filepath = [os.path.join(root, i) for i in os.listdir(root)]
    for path in filepath:
        print(path)
        fp = open(path, 'r', encoding="UTF-8")
        text = fp.read()
        writer.add_document(title=path.split("\\")[1], path=path,
                            content=text, textdata=text)
        fp.close()
    print("Almost done")
    writer.commit()
    ix.close()
    print("Done")
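# A minimal search sketch for the index built above — a hedged companion,
# assuming "indexdir" and the schema from createSearchableData; the function
# name and the query string are placeholders, not part of the original code.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def searchIndexedData(query_string):
    ix = open_dir("indexdir")
    with ix.searcher() as searcher:
        # Parse against the "content" field declared in the schema above
        query = QueryParser("content", ix.schema).parse(query_string)
        for hit in searcher.search(query, limit=10):
            # Only stored fields (title, path) can be read back from a hit
            print(hit["title"], hit["path"])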
def create_update_whoosh_index(video_id):
    container_name = "corpus-container"
    video_id_no_txt_extension = os.path.splitext(video_id)[0]
    block_blob_service = BlockBlobService(storage_account, storage_key)
    video_content = block_blob_service.get_blob_to_text(
        container_name, video_id).content
    if not os.path.exists(corpus_index_dir):
        os.mkdir(corpus_index_dir)
        schema = Schema(title=ID(stored=True, unique=True),
                        content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
        index = create_in(corpus_index_dir, schema)
    else:
        index = open_dir(corpus_index_dir)
    index_writer = index.writer()
    index_writer.add_document(title=video_id_no_txt_extension,
                              content=video_content)
    index_writer.commit()
    extract_and_update_video_keywords(video_id_no_txt_extension, video_content)
def create_en_index(cursor):
    counter = 0
    tk = TinySegmenterTokenizer(tinysegmenter.TinySegmenter())
    schema = Schema(uuid=ID(stored=True),
                    content=TEXT(analyzer=tk, stored=True))
    if not os.path.exists("jp_index"):
        os.mkdir("jp_index")
    # Creating an index writer to add documents as per schema
    ix = create_in("jp_index", schema)
    writer = ix.writer()
    cursor.execute("SELECT * FROM jp_wiki")
    result = cursor.fetchall()
    for row in result:
        counter += 1
        if counter % 100 == 0:
            print("Processing data in row %d" % counter)
        writer.add_document(uuid=row[0], content=row[1])
    writer.commit()
def __init__(self, toolbox, index_dir=None, index_help=True):
    self.schema = Schema(id=ID(stored=True, unique=True),
                         old_id=ID,
                         stub=KEYWORD,
                         name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                         description=TEXT,
                         section=TEXT,
                         help=TEXT,
                         labels=KEYWORD)
    self.rex = analysis.RegexTokenizer()
    self.index_dir = index_dir
    self.toolbox = toolbox
    self.index = self._index_setup()
    # We keep track of how many times the tool index has been rebuilt.
    # We start at -1, so that after the first index the count is at 0,
    # which is the same as the toolbox reload count. This way we can skip
    # reindexing if the index count is equal to the toolbox reload count.
    self.index_count = -1
def __init__(self, config):
    from whoosh.fields import TEXT, ID, Schema
    from whoosh.index import create_in, open_dir

    self.editor = config.get('brain', 'editor')
    self.result_limit = int(config.get('brain', 'result-limit'))
    self.delimiter = config.get('brain', 'delimiter')
    self.storage_path = os.path.expanduser(config.get('brain', 'storage'))
    self.index_path = os.path.expanduser(config.get('brain', 'index'))
    if not os.path.exists(self.storage_path):
        os.makedirs(self.storage_path)
    if not os.path.exists(self.index_path):
        os.makedirs(self.index_path)
        schema = Schema(entry=ID(stored=True, unique=True),
                        content=TEXT(stored=True))
        self.index = create_in(self.index_path, schema)
        self.index_all()
    else:
        self.index = open_dir(self.index_path)
def index(self):
    schema = Schema(
        Name=TEXT(stored=True),
        Description=TEXT(stored=True),
        Yearofrelease=TEXT(stored=True),
        Rating=TEXT(stored=True),
        Genre=TEXT(stored=True),
        ImdbUrl=ID(stored=True, unique=True),
        Votes=TEXT(stored=True)
    )
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    indexer = create_in("indexdir", schema)
    writer = indexer.writer()
    # Use a context manager so the CSV file is closed when indexing ends
    with open('original.csv', 'rt', encoding='utf-8') as csvfile:
        rows = csv.reader(csvfile, delimiter=',')
        for row in rows:
            # Pick out fields by column position (img is read but not indexed)
            cell_count = 1
            for cell in row:
                if cell_count == 1:
                    Name = cell
                elif cell_count == 2:
                    Description = cell
                elif cell_count == 3:
                    Yearofrelease = cell
                elif cell_count == 4:
                    Rating = cell
                elif cell_count == 5:
                    Genre = cell
                elif cell_count == 6:
                    ImdbUrl = cell
                elif cell_count == 7:
                    Votes = cell
                elif cell_count == 8:
                    img = cell
                cell_count += 1
            writer.add_document(Name=Name, Description=Description,
                                Yearofrelease=Yearofrelease, Rating=Rating,
                                Genre=Genre, ImdbUrl=ImdbUrl, Votes=Votes)
    writer.commit()
    self.indexer = indexer
def createIndex(files_to_index):
    '''
    Takes a directory of files and indexes them so that they may be
    searched through.

    Args:
        files_to_index (string) = name of directory to create index in
    '''
    # Analyzer object for pre-processing search terms
    stem_analyzer = StemmingAnalyzer()
    # Create search schema
    schema = Schema(title=TEXT(analyzer=stem_analyzer, stored=True),
                    path=ID(stored=True),
                    content=TEXT(analyzer=stem_analyzer, stored=True),
                    textdata=TEXT(stored=True))
    # If the path to the index does not exist, create it
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    # Creating an index writer to add documents as per schema
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    # Parse every file in the FAQs folder and extract question and answer text
    filepaths = [
        os.path.join(files_to_index, i) for i in os.listdir(files_to_index)
    ]
    for path in filepaths:
        html = BeautifulSoup(open(path, 'r'), "lxml")
        # Parse question text
        question = html.find("div", {"id": "question"}).getText()
        # Parse answer text
        answer = html.find("div", {
            "id": "answer"
        }).getText().replace(u'Â\xa0', u' ')
        writer.add_document(title=question,
                            path=path,
                            content=answer,
                            textdata=answer)
    writer.commit()
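# A hedged sketch of querying the FAQ index above across both the question
# title and the answer content; MultifieldParser and OrGroup are standard
# whoosh.qparser classes, while the function name and limit are assumptions.
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser, OrGroup

def searchFAQ(user_question, limit=5):
    ix = open_dir("indexdir")
    # OrGroup ranks documents matching any term instead of requiring all terms
    parser = MultifieldParser(["title", "content"], schema=ix.schema,
                              group=OrGroup)
    with ix.searcher() as searcher:
        results = searcher.search(parser.parse(user_question), limit=limit)
        return [(hit["title"], hit["path"]) for hit in results]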
def instantiate(result_path, mapping_complet, corpus_type, reinit_db, name):
    print('whoosh being instantiated')
    if reinit_db:
        if corpus_type == 'isi':
            schema = MySchema_isi()
        else:
            print('here\n' + corpus_type)
            fields = {}
            print(mapping_complet.keys())
            for i, key in enumerate(mapping_complet.keys()):
                if key == 'ISIpubdate':
                    fields[str(key)] = NUMERIC(stored=True)
                elif key == 'accessionNo':
                    fields[str(key)] = ID(stored=True, unique=True)
                else:
                    fields[str(key)] = TEXT(stored=True)
            fields['CO'] = TEXT()
            print('whoosh fields', fields.keys())
            schema = Schema(**fields)
    import shutil
    if reinit_db:
        try:
            shutil.rmtree(os.path.join(result_path, 'indexdir' + '_' + name))
            print('indexdir directory deleted')
        except OSError:
            pass
        try:
            os.mkdir(os.path.join(result_path, 'indexdir' + '_' + name))
            print('created', os.path.join(result_path, 'indexdir' + '_' + name))
        except OSError:
            pass
        ix = create_in(os.path.join(result_path, 'indexdir' + '_' + name),
                       schema)
    else:
        from whoosh.index import open_dir
        try:
            ix = open_dir(os.path.join(result_path, 'indexdir' + '_' + name))
        except Exception:
            reinit_db = True
            ix = instantiate(result_path, mapping_complet, corpus_type,
                             reinit_db, name)
    return ix
def createSearchableData(self):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT)
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema, indexname=self.index)
    writer = ix.writer()
    filepaths = [os.path.join(self.root, i) for i in os.listdir(self.root)]
    for path in filepaths:
        text = self.getPDFText(path)
        text2 = self.getPDFText2(path)
        for i in range(len(text)):
            writer.add_document(title=path.split("\\")[1] + '_Page_' + str(i),
                                path=path, content=text[i])
            writer.add_document(title=path.split("\\")[1] + '_Page_' + str(i) + '_2',
                                path=path, content=text2[i])
        print(path.split("\\")[1] + ' has been indexed')
    writer.commit()
def index(self):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT, textdata=TEXT(stored=True))
    if not os.path.exists(self.idx_path):
        os.mkdir(self.idx_path)
    # Creating an index writer to add documents as per schema
    ix = create_in(self.idx_path, schema)
    writer = ix.writer()
    filepaths = [os.path.join(self.root, i) for i in os.listdir(self.root)]
    for path in filepaths:
        with codecs.open(path, "r", "utf-8") as f:
            content = f.read()
        # Python 3 strings are already unicode, so no conversion is needed
        writer.add_document(title=path.split("/")[1],
                            path=path.split("/")[0],
                            content=content, textdata=content)
    writer.commit()
    return "true"
def createSearchableData(root):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT, URL=TEXT(stored=True),
                    textdata=TEXT(stored=True))
    if not os.path.exists(indexoppath):
        os.mkdir(indexoppath)
    ix = create_in(indexoppath, schema)
    writer = ix.writer()
    filepaths = [os.path.join(root, i) for i in os.listdir(root)]
    for path in filepaths:
        fp = open(path, 'r')
        text = fp.read()
        doc_title = path.split("/")[-1]
        if doc_title in URLMap:
            url = str(URLMap[doc_title]['URL'])
        else:
            print(doc_title + ' not in url map')
            url = 'default'
        writer.add_document(title=doc_title, path=path, content=text,
                            URL=url, textdata=text)
        fp.close()
    writer.commit()
def createSearchableData(data_file):
    '''
    Schema definition: video_id, video_title, description
    '''
    stem_analyzer = StemmingAnalyzer()
    # The analyzer instance is passed alongside stored=True, not called with it
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True),
                    description=TEXT(analyzer=stem_analyzer, stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    with open(data_file) as f:
        youtube_array = json.load(f)
    for item in youtube_array:
        writer.add_document(id=item['id'], title=item['title'],
                            description=item['description'])
    writer.commit()
def get_whoosh_index():
    from django.conf import settings
    from whoosh.index import create_in, exists_in, open_dir
    from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED

    schema = Schema(title=TEXT(stored=True),
                    path=ID(unique=True, stored=True),
                    content=TEXT(stored=True),
                    tags=KEYWORD, classname=KEYWORD)
    if not os.path.exists(settings.WHOOSH_ROOT):
        os.mkdir(settings.WHOOSH_ROOT)
    if not exists_in(settings.WHOOSH_ROOT):
        index = create_in(settings.WHOOSH_ROOT, schema)
    else:
        index = open_dir(settings.WHOOSH_ROOT)
    return index
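# Because the schema above marks "path" as unique, re-indexing a page can use
# update_document, which deletes any existing document with the same path
# before adding the new one. A minimal sketch; the helper name and field
# values are assumptions, not part of the original code.
def update_whoosh_document(title, path, content):
    index = get_whoosh_index()
    writer = index.writer()
    # Replaces the document whose unique "path" matches, or adds a new one
    writer.update_document(title=title, path=path, content=content)
    writer.commit()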
def __init__(self, *args, **kwargs):
    """
    Instantiate the whoosh schema and writer and create/open the index.
    """
    self.schema = kwargs.get(
        'schema',
        Schema(
            content_id=ID(stored=True, unique=True),
            content=TEXT(),
        ))
    self.log = kwargs.get('log', Logging())
    # get the absolute path and create the dir if required
    self.index_path = kwargs.get('index_path', INDEX_PATH)
    if self.create(self.index_path):
        self.log.info("SearchIndex", "__init__", "New index created.")
    # create an index obj and buffered writer
    self.index_obj = open_dir(self.index_path)
def createSearchableData(web_docs):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT, textdata=TEXT(stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    for doc in web_docs:
        text = doc['text']
        title = doc['title']
        writer.add_document(title=title, path=doc['url'],
                            content=text, textdata=text)
    writer.commit()
def __init__(self, path, index_name):
    '''
    Create a MyWhoosh index object: if an index already exists under the
    given path it is opened directly, otherwise a new one is created.

    :param path: directory in which the index files are stored
    :param index_name: name of the index (one directory can hold several indexes)
    '''
    self.index_name = index_name
    analyzer = ChineseAnalyzer()  # Chinese word-segmentation analyzer
    schema = fields.Schema(title=TEXT(stored=True),
                           path=ID(stored=True),
                           content=TEXT(stored=True, analyzer=analyzer))
    try:
        self.ix = index.open_dir(path, index_name)
        print("Index files already exist, using the old index")
    except Exception:
        # path is where the index is created, indexname is the index name
        self.ix = create_in(path, schema=schema, indexname=index_name)
        print("No old index files detected, creating a new index")
class HiveJobListing(SchemaClass):
    '''Class to store the details associated with each Hive job'''
    job_url = ID(stored=True)
    title = TEXT(stored=True, analyzer=QUERY_ANALYZER)
    owner = KEYWORD(stored=True)
    completion_time = DATETIME(stored=True)
    query = TEXT(stored=True, analyzer=QUERY_ANALYZER)

    def __init__(self):
        self.job_url = None
        self.title = None
        self.owner = None
        self.completion_time = None
        self.query = None

    def __str__(self):
        return 'Url: %s, Title: %s, Owner: %s, Time: %s, Query: %s...' % (
            self.job_url, self.title, self.owner, self.completion_time,
            self.query[0:10])
def build_index(self):
    jieba.setLogLevel(logging.INFO)
    analyzer = ChineseAnalyzer()
    # Create the index schema
    schema = Schema(id=NUMERIC(stored=True),
                    title=TEXT(stored=True, analyzer=analyzer),
                    url=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    # Create the index files under [index/]
    path = os.path.dirname(os.path.abspath(__file__)) + "/index"
    if not os.path.exists(path):
        os.mkdir(path)
        index = create_in(path, schema)
    else:
        index = open_dir(path)
    # Build the index, adding the content that needs indexing
    writer = index.writer()
    total_row = self.post.count_documents({})
    false_row = self.post.count_documents({'indexed': False})
    indexed_row = total_row - false_row
    while True:
        row = self.post.find_one({'indexed': False})
        if row is None:
            writer.commit()
            print('\n\tindexed successfully.')
            break
        else:
            writer.add_document(id=(total_row + 1) if (row['id'] == -1) else row['id'],
                                title=row['title'],
                                url=row['url'],
                                content=row['content'])
            self.post.update_one({'title': row['title']},
                                 {'$set': {'indexed': True}})
            writer.commit()
            writer = index.writer()
            indexed_row += 1
            print(f"\rIndexing: {'▉' * (int(indexed_row/total_row*100//2))} "
                  f"{indexed_row/total_row*100:.2f}%", end='')
def create_index(docs):
    # We create the index schema.
    schema = Schema(id=ID(stored=True), title=TEXT, content=TEXT)
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    ix = index.create_in(index_dir, schema)
    # The writer() method of the Index object returns an IndexWriter object
    # that lets us add documents to the index.
    writer = ix.writer()

    def as_text(value, default):
        # Indexed text fields must be unicode; in Python 3 str already is,
        # so only raw bytes need decoding, with a default on failure.
        if isinstance(value, bytes):
            try:
                return value.decode()
            except UnicodeDecodeError:
                return default
        return value

    i = 1
    # Add documents to the index.
    # Note: fields can be left empty, i.e., we don't have to fill in a
    # value for every field.
    for doc in docs:
        print("Indexing document: " + str(i))
        i += 1
        docID = as_text(doc['id'], "empty")
        name = as_text(doc['title'], "")
        text = as_text(doc['content'], "")
        writer.add_document(id=docID, title=name, content=text)
    # Calling commit() on the IndexWriter saves the added documents to the index.
    writer.commit()
    ix.close()
def create_indexer(doc_directory, index_directory):
    my_analyzer = RegexTokenizer() | LowercaseFilter()
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True, analyzer=my_analyzer),
                    summary=TEXT,
                    article=TEXT(analyzer=my_analyzer),
                    keywords=KEYWORD(stored=True, analyzer=my_analyzer),
                    date=DATETIME(stored=True),
                    path=TEXT(stored=True))
    if not os.path.exists(index_directory):
        os.mkdir(index_directory)
    ix = create_in(index_directory, schema)
    writer = ix.writer()
    nt = 0
    print("==============================")
    # time.clock() was removed in Python 3.8; use perf_counter() instead
    t1 = time.perf_counter()
    for dirname, subdirs, files in os.walk(doc_directory):
        if files:
            n = 0
            for filename in files:
                filename = os.path.join(dirname, filename)
                obj = load_json(filename)
                writer.add_document(id=obj['id'], title=obj['title'],
                                    summary=obj['summary'],
                                    article=obj['article'],
                                    keywords=obj['keywords'],
                                    date=obj['date'], path=filename)
                n += 1
            print("{}: {}".format(dirname, n))
            nt += n
    t2 = time.perf_counter()
    print("==============================")
    print("Docs: {}, Time: {:.2f}s".format(nt, (t2 - t1)))
    print("Writing index...")
    writer.commit()
    t3 = time.perf_counter()
    print("Total time: {:.2f}s".format(t3 - t1))
    print("==============================")
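# A sketch of a date-restricted query against the DATETIME field above,
# assuming the index built by create_indexer; DateRange is a standard
# whoosh.query class, while the function name and bounds are placeholders.
from datetime import datetime
from whoosh.index import open_dir
from whoosh.query import DateRange

def search_by_date(index_directory, start, end, limit=20):
    ix = open_dir(index_directory)
    with ix.searcher() as searcher:
        # DateRange matches documents whose "date" falls within [start, end]
        results = searcher.search(DateRange("date", start, end), limit=limit)
        return [hit["title"] for hit in results]

# e.g. search_by_date("indexdir", datetime(2020, 1, 1), datetime(2020, 12, 31))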
def open_index(indexdir, incremental=False):
    """
    Opens the index with the given name. If the directory or the index
    do not yet exist, they are created.

    @type indexdir: str
    @param indexdir: The name of the index directory.
    @type incremental: bool
    @param incremental: Whether to preserve existing index content.
    @rtype: whoosh.Index
    @return: An object representing the index.
    """
    if not os.path.exists(indexdir):
        os.makedirs(indexdir)
    if incremental and index.exists_in(indexdir):
        return index.open_dir(indexdir)
    schema = Schema(number=NUMERIC(stored=True),
                    filename=ID(stored=True),
                    line=TEXT(analyzer=SimpleAnalyzer(), stored=True))
    return index.create_in(indexdir, schema)
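# Hedged usage sketch for open_index: on later runs, incremental=True
# preserves what was indexed before and only new lines are appended. The
# helper name and the one-line-per-document layout are assumptions.
def index_file(indexdir, filename):
    ix = open_index(indexdir, incremental=True)
    writer = ix.writer()
    with open(filename, encoding="utf-8") as f:
        # One document per line, matching the number/filename/line schema
        for number, line in enumerate(f, start=1):
            writer.add_document(number=number, filename=filename,
                                line=line.rstrip("\n"))
    writer.commit()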
def __init__(self, destination=None):
    super(Search, self).__init__()
    self.destination = destination
    self.writer = None
    self.ix = None
    self.schema = Schema(title=TEXT(stored=True),
                         path=TEXT(stored=True),
                         page=TEXT(stored=True),
                         content=TEXT(stored=True),
                         unique=ID(stored=True))
    if not os.path.exists(self.destination):
        os.mkdir(self.destination)
    # __init__ must not return a value, so just dispatch to create/previous
    if not self.exists(self.destination):
        self.create(self.destination)
    else:
        self.previous(self.destination)
def __init__(self, questions_file):
    """Build a Whoosh index over the questions file, or reuse a cached one."""
    self.questions_file = questions_file
    if os.path.exists(self._CACHED_INDEX):
        self.indx = index.open_dir(self._CACHED_INDEX)
    else:
        os.makedirs(self._CACHED_INDEX)
        schema = Schema(key=ID(stored=True, unique=True),
                        text=TEXT(stored=False))
        ix = index.create_in(self._CACHED_INDEX, schema)
        writer = ix.writer()
        with open(self.questions_file) as f:
            for i, row in tqdm(enumerate(f)):
                writer.add_document(key=str(i), text=row.strip())
        writer.commit()
        self.indx = ix
    self.query_parser = QueryParser('text', self.indx.schema)
    self.searcher = self.indx.searcher()
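# A possible companion lookup for the class above, using the persistent
# query_parser and searcher it creates; the method name and limit are
# assumptions, not part of the original code.
def most_similar(self, question, limit=5):
    query = self.query_parser.parse(question)
    results = self.searcher.search(query, limit=limit)
    # hit.score is the BM25F relevance score whoosh computes by default;
    # "key" is the stored line number from the schema above
    return [(hit["key"], hit.score) for hit in results]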
def create(self, in_memory=False):
    analyzer = StandardAnalyzer(minsize=1, stoplist=None) | CleanUpFilter()
    schema = Schema(source=TEXT(stored=True, analyzer=analyzer),
                    target=TEXT(stored=True, analyzer=analyzer),
                    comment=STORED,
                    context=STORED,
                    softcatala=BOOLEAN,
                    project=ID(stored=True))
    if in_memory:
        st = RamStorage()
        ix = st.create_index(schema)
    else:
        if not os.path.exists(self.dir_name):
            os.mkdir(self.dir_name)
        ix = create_in(self.dir_name, schema)
    self.writer = ix.writer()
    return ix
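# The in_memory branch above avoids touching disk, which is handy in tests.
# A self-contained sketch of the same idea with a throwaway schema;
# RamStorage is whoosh's standard in-memory storage, the rest is illustrative.
from whoosh.fields import Schema, TEXT, ID
from whoosh.filedb.filestore import RamStorage

def make_ram_index():
    schema = Schema(source=TEXT(stored=True), project=ID(stored=True))
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    writer.add_document(source="hello world", project="demo")
    writer.commit()
    return ix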
def _create_index_dir(self):
    # Create index directory
    os.makedirs(self._path)
    # Create schema
    schema = Schema(id=ID(stored=True, unique=True),
                    tags=KEYWORD(stored=True),
                    named_tags=KEYWORD(stored=True))
    # Create index (Python 3 strings are already unicode)
    index = create_in(self._path, schema)
    index_writer = index.writer()
    index_writer.add_document(id='11111',
                              tags='test1 test2',
                              named_tags='test1 test2')
    index_writer.add_document(id='22222',
                              tags='test1 test3 test4',
                              named_tags='test1 test3 test4')
    index_writer.add_document(id='33333',
                              tags='test2 test5 test6',
                              named_tags='test2 test5 test6')
    index_writer.commit()
def createSearchableData(root):
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT, textdata=TEXT(stored=True))
    in_path = "./indexdir"  # Set this to where you want to save the index
    if not os.path.exists(in_path):
        os.mkdir(in_path)
    # Creating an index writer to add documents as per schema
    ix = create_in(in_path, schema)
    writer = ix.writer()
    filepaths = [os.path.join(root, i) for i in os.listdir(root)]
    for path in filepaths:
        fp = open(path)
        print(path)
        text = fp.read()
        writer.add_document(title=path.split("/")[6], path=path,
                            content=text, textdata=text)
        fp.close()
    writer.commit()
def build_whoosh_database():
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    type=TEXT(stored=True),
                    link=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(whoosh_database, schema)
    writer = ix.writer()
    mpost = MPost()
    recs = mpost.query_all()
    for rec in recs:
        text2 = html2text.html2text(tornado.escape.xhtml_unescape(
            rec.cnt_html))
        print(text2)
        writer.add_document(title=rec.title,
                            type='<span style="color:blue;">[文档]</span>',
                            link='/post/{0}.html'.format(rec.uid),
                            content=text2)
    writer.commit()
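# Because "content" is stored above, result snippets are possible. A hedged
# sketch using whoosh's built-in highlighter on that index; the function
# name and query string are placeholders, and whoosh_database is assumed to
# be the same module-level index path used in build_whoosh_database.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def search_posts(query_string):
    ix = open_dir(whoosh_database)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(query_string)
        for hit in searcher.search(query, limit=10):
            print(hit["title"], hit["link"])
            # highlights() returns an HTML snippet with matched terms marked
            print(hit.highlights("content"))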
def index(path):
    """Index all of the artifacts in a specified directory.

    Parameters
    ----------
    path : str
        The path to the directory containing the artifacts to be indexed.
    """
    ind = os.path.abspath(os.path.join(path, os.pardir, "whoosh"))
    schema = create_whoosh_schema(SCHEMAS["artifact"]["schema"])
    schema.add("pkg", TEXT(stored=True))
    schema.add("channel", TEXT(stored=True))
    schema.add("arch", TEXT(stored=True))
    schema.add("filename", TEXT(stored=True))
    schema.add("path", ID(stored=True, unique=True))
    ix = get_index(ind, schema=schema)
    unindexed = _unindexed_artifacts(path, ix)
    print(f"TOTAL UNINDEXED ARTIFACTS: {len(unindexed)}")
    unindexed = list(unindexed)[:5000]
    progress = tqdm.tqdm(total=len(unindexed))
    writer = ix.writer()
    with ThreadPoolExecutor(max_workers=1) as pool:
        futures = [
            pool.submit(get_artifact, path, artifact,
                        progress_callback=progress.update)
            for artifact in unindexed
        ]
        for f in as_completed(futures):
            try:
                data = f.result()
            except Exception as e:
                print(e)
            else:
                writer.add_document(**data)
    writer.commit()
def index_graph_description(self, index_name='graphs'):
    from whoosh.fields import TEXT, ID, NGRAM, NUMERIC, KEYWORD
    from whoosh.analysis import StemmingAnalyzer, SimpleAnalyzer, IDAnalyzer
    from whoosh.analysis.filters import LowercaseFilter

    print('Building %s index...' % index_name)

    # build a single schema from the fields exposed by the different search
    # types
    print('\tSchema:')
    fields = {
        'gid': ID(stored=True),
        'description': KEYWORD(lowercase=True, scorable=True)
    }
    # fields = {'gid': ID(stored=True),
    #           'description': TEXT(analyzer=SimpleAnalyzer(r'[.\s]', True))}
    from whoosh.fields import Schema
    schema = Schema(**fields)

    # Create the index schema
    index = self.recreate_index(index_name, schema)

    # Add documents to the index
    print('\tWrite indexes:')
    writer = index.writer()
    c = 0
    from digipal.models import Graph
    for graph in Graph.objects.filter(
            graph_components__isnull=False).prefetch_related(
            'graph_components', 'graph_components__component',
            'graph_components__features').distinct():
        c += 1
        doc = {
            'gid': str(graph.id),
            'description': graph.get_serialised_description()
        }
        writer.add_document(**doc)
    print('\t\tIndexed %d graphs' % c)
    writer.commit()