def delete_documents(self, doc_set, paths):
    """Delete documents from the index."""
    index = open_dir(self.index_path)
    writer = AsyncWriter(index)
    query = And([
        Term('set', doc_set),
        Or([Term('path', path) for path in paths])
    ])
    writer.delete_by_query(query)
    writer.commit()
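# A minimal, self-contained sketch of the delete-by-query pattern used above.
# The schema, directory name, and field values here are illustrative assumptions,
# not taken from the original project.
import os
from whoosh import index
from whoosh.fields import Schema, ID
from whoosh.query import And, Or, Term
from whoosh.writing import AsyncWriter

schema = Schema(set=ID(stored=True), path=ID(stored=True, unique=True))
os.makedirs("indexdir", exist_ok=True)
ix = index.create_in("indexdir", schema)

writer = AsyncWriter(ix)
writer.add_document(set="docs", path="guide.md")
writer.add_document(set="docs", path="intro.md")
writer.commit()

# Delete every document whose 'set' matches and whose 'path' is in the list.
writer = AsyncWriter(ix)
query = And([Term("set", "docs"), Or([Term("path", p) for p in ("guide.md",)])])
writer.delete_by_query(query)
writer.commit()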
def whoosh_index(self):
    it = QTreeWidgetItemIterator(
        self.notesTree, QTreeWidgetItemIterator.All)
    print("Starting complete indexing.")
    # writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    while it.value():
        treeItem = it.value()
        name = self.notesTree.itemToPage(treeItem)
        path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
        print(path)
        fileobj = open(path, 'r', encoding='utf-8')
        content = fileobj.read()
        fileobj.close()
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=name, title=parseTitle(content, name),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
        else:
            writer.add_document(path=name, title=parseTitle(content, name),
                                content=content, tags='')
        it += 1
    writer.commit()
    print("Finished completely reindexing.")
def whoosh_index(self):
    it = QtWidgets.QTreeWidgetItemIterator(
        self.notesTree, QtWidgets.QTreeWidgetItemIterator.All)
    print("Starting complete indexing.")
    # writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    while it.value():
        treeItem = it.value()
        name = self.notesTree.itemToPage(treeItem)
        path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
        print(path)
        fileobj = open(path, 'r', encoding='utf-8')
        content = fileobj.read()
        fileobj.close()
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=name, title=parseTitle(content, name),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
        else:
            writer.add_document(path=name, title=parseTitle(content, name),
                                content=content, tags='')
        it += 1
    writer.commit()
    print("Finished completely reindexing.")
def incremental_index(indexdir, indexname, rowData):
    """
    Note: every incremental add creates a new segment file, which takes up
    disk space, so keep that in mind here.
    :param rowData: one row of data
    :param indexdir:
    :param indexname:
    :return:
    """
    # print(indexdir)
    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)
    writer = AsyncWriter(ix)
    # Build the writer.add_document(...) call as a string, one keyword argument
    # per column of the row, then exec() it.
    docline = """writer.add_document("""
    for key in rowData:
        val = rowData[key]
        if not val:
            val = ""
        elif isinstance(val, (Decimal, )):
            val = str(val)
        else:
            val = pymysql.escape_string(str(val))
        docline += key + '="' + val + '", '
    docline = docline.rstrip(", ")
    docline += """)"""
    exec(docline)
    # print(docline)
    # writer.add_document(content="撸啊撸啊德玛西亚", ID="abc")
    # writer.add_document(content="人在塔在", ID="hik")
    writer.commit()
def search_spam(post, ix):
    """
    Search the spam index for posts similar to this one.
    Returns a list of normalized results similar to this post's content.
    """
    writer = AsyncWriter(ix)
    add_post_to_index(post=post, writer=writer, is_spam=post.is_spam)
    writer.commit()

    # Search for this post in the spam index
    fields = ['uid']
    results = search.preform_whoosh_search(ix=ix, query=post.uid, fields=fields)

    # Perform more_like_this on this post's content
    similar_content = results[0].more_like_this('content', top=5)

    # Remove this post from the spam index after results are collected.
    writer = AsyncWriter(ix)
    writer.delete_by_term('uid', text=post.uid)
    writer.commit()

    # Get the results into a list and close the searcher object.
    similar_content = list(map(search.normalize_result, similar_content))
    results.searcher.close()

    return similar_content
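# A minimal, self-contained sketch of the Hit.more_like_this() pattern relied on
# above. The schema, directory, and document text are illustrative assumptions.
import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter

os.makedirs("spamdir", exist_ok=True)
schema = Schema(uid=ID(stored=True, unique=True), content=TEXT(stored=True))
ix = index.create_in("spamdir", schema)

writer = AsyncWriter(ix)
writer.add_document(uid="1", content="cheap watches buy now")
writer.add_document(uid="2", content="buy cheap replica watches")
writer.commit()

with ix.searcher() as searcher:
    results = searcher.search(QueryParser("uid", ix.schema).parse("1"))
    # more_like_this() finds documents with similar text in the given field;
    # it uses the stored field value since 'content' is stored.
    for hit in results[0].more_like_this("content", top=5):
        print(hit["uid"])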
def delPage(self, item):
    index = item.childCount()
    while index > 0:
        index = index - 1
        self.dirname = item.child(index).text(0)
        self.delPage(item.child(index))

    # remove attachment folder
    attDir = self.itemToAttachmentDir(item)
    for info in QDir(attDir).entryInfoList():
        QDir().remove(info.absoluteFilePath())
    QDir().rmdir(attDir)

    pagePath = self.itemToPage(item)
    self.ix = open_dir(self.settings.indexdir)
    query = QueryParser('path', self.ix.schema).parse(pagePath)
    # writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    n = writer.delete_by_query(query)
    # n = writer.delete_by_term('path', pagePath)
    writer.commit()
    # self.ix.close()
    b = QDir(self.notePath).remove(self.pageToFile(pagePath))
    parent = item.parent()
    parentPage = self.itemToPage(parent)
    if parent is not None:
        index = parent.indexOfChild(item)
        parent.takeChild(index)
        if parent.childCount() == 0:  # if no child, dir not needed
            QDir(self.notePath).rmdir(parentPage)
    else:
        index = self.indexOfTopLevelItem(item)
        self.takeTopLevelItem(index)
    QDir(self.notePath).rmdir(pagePath)
def saveToWhoosh(self, df, dataset_id, overwrite=False):
    # use whoosh search engine to enable full text search
    if not os.path.exists(self.whoosh_root):
        os.mkdir(self.whoosh_root)

    ws_path = os.path.join(self.whoosh_root, dataset_id)
    if not os.path.exists(ws_path):
        os.mkdir(ws_path)
        logMsg(
            str(os.path.abspath(ws_path)) +
            ' does not exist, create it to store whoosh index')
        overwrite = True
    elif overwrite:
        shutil.rmtree(ws_path)
        os.mkdir(ws_path)

    schema = Schema(DOC_ID=NUMERIC(stored=True), TEXT=TEXT)
    if overwrite:
        ix = create_in(ws_path, schema)
    else:
        ix = open_dir(ws_path)

    writer = AsyncWriter(ix)
    with self.workflow.dao.create_session() as session:
        doc_iter = session.query(Document).filter(
            Document.DATASET_ID == dataset_id)
        for doc in doc_iter:
            writer.add_document(DOC_ID=doc.DOC_ID, TEXT=doc.TEXT)
        writer.commit()
def updateIndex(self):
    ''' Update the whoosh index, which costs a lot of computing resources '''
    page = self.parent.notesTree.currentPage()
    content = self.toPlainText()
    try:
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=page, title=parseTitle(content, page),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
            writer.commit()
        else:
            writer.update_document(path=page,
                                   title=parseTitle(content, page),
                                   content=content, tags='')
            writer.commit()
    except:
        print("Whoosh commit failed.")
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        writer.update_document(**doc)

    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()

        # If spelling support is desired, add to the dictionary.
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            sp = SpellChecker(self.storage)
            sp.add_field(self.index, self.content_field_name)
def update(self, x, who=None):
    # implement search here
    x = str(x)
    aindex = AsyncWriter(self.index, delay=0.2)
    aindex.add_document(content=x)
    aindex.commit()
    return self._emit(x)
def creating_searching_ranking(selected_analyzer, name_of_file, scoring_function, path):
    # creating Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))
    directory_containing_the_index = path
    # writing index based on schema in the directory where the 'path' is
    ix = create_in(directory_containing_the_index, schema)
    directory_containing_the_index = path
    # opening the index file
    ix = index.open_dir(directory_containing_the_index)
    # writer will be used to add content to the fields
    writer = AsyncWriter(ix)
    # path to the file
    ALL_DOCUMENTS_file_name = name_of_file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    # reading the file
    csv_reader = csv.reader(in_file, delimiter=',')
    # to skip the header: first line contains the name of each field.
    csv_reader.__next__()
    # for each row in the 'csv_test' file
    for record in csv_reader:
        id = record[1]       # read id
        title = record[2]    # read title
        content = record[3]  # read body
        writer.add_document(id=id, content=title + ' ' + content)
    writer.commit()
    in_file.close()  # finish writing in the index file
def incremental_index(t, l, c, dirname):
    id = (Searcher().getcount() + 1)
    ix = index.open_dir(dirname)

    # The set of all paths in the index
    # with ix.searcher() as searcher:
    indexed_feeds = set()

    with ix.searcher() as searcher:
        writer = AsyncWriter(ix)

        # Loop over the stored fields in the index
        for fields in searcher.all_stored_fields():
            indexed_feed = fields['title']
            indexed_feeds.add(indexed_feed)

        # Loop over the files in the filesystem
        # Assume we have a function that gathers the filenames of the
        # documents to be indexed
        if t not in indexed_feeds:
            # This is either a file that's changed, or a new file
            # that wasn't indexed before. So index it!
            wooshDocuments(id, writer, t, l, c)

        writer.commit()
    return id
def index_posts(posts, ix=None, overwrite=False, add_func=add_index):
    """
    Create or update a search index of posts.
    """
    ix = ix or init_index()
    # The writer is asynchronous by default
    writer = AsyncWriter(ix)

    elapsed, progress = timer_func()
    total = posts.count()
    stream = islice(zip(count(1), posts), None)

    # Loop through posts and add to index
    for step, post in stream:
        progress(step, total=total, msg="posts indexed")
        add_func(post=post, writer=writer)

    # Commit to index
    if overwrite:
        logger.info("Overwriting the old index")
        writer.commit(mergetype=writing.CLEAR)
    else:
        logger.debug("Committing to index")
        writer.commit()

    elapsed(f"Committed {total} posts to index.")
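# A minimal, self-contained sketch of the commit(mergetype=writing.CLEAR) call
# used above: committing with CLEAR discards every previously committed segment,
# so only the documents added in this batch survive. Schema and directory name
# are illustrative assumptions.
import os
from whoosh import index, writing
from whoosh.fields import Schema, ID
from whoosh.writing import AsyncWriter

os.makedirs("postsdir", exist_ok=True)
ix = index.create_in("postsdir", Schema(id=ID(stored=True, unique=True)))

writer = AsyncWriter(ix)
writer.add_document(id="old")
writer.commit()

writer = AsyncWriter(ix)
writer.add_document(id="new")
writer.commit(mergetype=writing.CLEAR)

with ix.searcher() as searcher:
    print([d["id"] for d in searcher.documents()])  # ['new']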
def delPage(self, item):
    index = item.childCount()
    while index > 0:
        index = index - 1
        self.dirname = item.child(index).text(0)
        self.delPage(item.child(index))

    # remove attachment folder
    attDir = self.itemToAttachmentDir(item)
    for info in QtCore.QDir(attDir).entryInfoList():
        QtCore.QDir().remove(info.absoluteFilePath())
    QtCore.QDir().rmdir(attDir)

    pagePath = self.itemToPage(item)
    self.ix = open_dir(self.settings.indexdir)
    query = QueryParser("path", self.ix.schema).parse(pagePath)
    # writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    n = writer.delete_by_query(query)
    # n = writer.delete_by_term('path', pagePath)
    writer.commit()
    # self.ix.close()
    b = QtCore.QDir(self.notePath).remove(self.pageToFile(pagePath))
    parent = item.parent()
    parentPage = self.itemToPage(parent)
    if parent is not None:
        index = parent.indexOfChild(item)
        parent.takeChild(index)
        if parent.childCount() == 0:  # if no child, dir not needed
            QtCore.QDir(self.notePath).rmdir(parentPage)
    else:
        index = self.indexOfTopLevelItem(item)
        self.takeTopLevelItem(index)
    QtCore.QDir(self.notePath).rmdir(pagePath)
def createSearchableData(root):
    ana = analysis.StemmingAnalyzer()
    # define the schema of the index
    schema = Schema(title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    genre=KEYWORD(stored=True),
                    link=ID(stored=True),
                    path=ID(stored=True),
                    price=ID(stored=True),
                    content=TEXT(stored=True),
                    contentData=TEXT)

    # create the indexdir directory
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    cwd = os.getcwd()
    print(cwd)

    # create an index writer that adds documents according to the schema
    ix = create_in("indexdir", schema)
    writer = AsyncWriter(ix)

    # find the files in the directory and save their paths
    filepaths = [os.path.join(root, i) for i in os.listdir(root)]
    num = 1
    # for each path found...
    for path in filepaths:
        # print(num)
        num += 1
        fp = open(path, 'r', encoding="utf-8")
        # print(path)
        # The first line holds the title, the second the author, the third the
        # genre, the fourth the link, and the fifth the price
        fileTitle = fp.readline()
        fileAuthor = fp.readline()
        fileGenre = fp.readline()
        fileLink = fp.readline()
        filePrice = fp.readline()
        # The rest of the file holds the plot
        filePlot = fp.read()
        # the contentData field holds the preprocessed plots
        fileData = tokenize(filePlot)
        # add a document to the index, with all the required fields
        writer.add_document(title=fileTitle,
                            path=path,
                            author=fileAuthor,
                            genre=fileGenre,
                            link=fileLink,
                            price=filePrice,
                            content=filePlot,
                            contentData=fileData)
        fp.close()
    writer.commit()
def store_page(user, url):
    writer = AsyncWriter(idx)
    resp = requests.get(url)
    content = parse(resp.content)
    now = datetime.now()
    writer.add_document(ts=now, user=unicode(user),
                        url=unicode(url), content=content)
    writer.commit()
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(filename)s %(levelname)s: %(asctime)s: %(message)s')
    logger = logging.getLogger('main')
    logger.info('Executing indexing module')
    logger.info('Reading file')
    du = doc_utilities()
    du.read_data_set(file='data/wikipedia_text_files.csv')
    logger.info('Task1 - Number of documents = {}'.format(
        du.get_number_documents()))
    du.process_documents_for_indexing()
    collection = du.get_collection_json()[0:1000000]
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", MySchema)
    # writer = ix.writer()
    writer = AsyncWriter(ix)
    with tqdm(total=len(collection),
              desc="Indexing documents",
              bar_format="{l_bar}{bar} [ time left: {remaining} ]") as pbar:
        for d in collection:
            text = str(d['text'])
            idt = str(d['id'])
            title = str(d['title'])
            writer.add_document(id=idt, title=title, text=text)
            pbar.update(1)
    writer.commit()
def add():
    d = request.get_json(force=True)
    url = d.get("url")
    content = d.get("content")
    if not url or not content:
        return jsonify({"status": "missing parameters"})
    if urlparse.urlparse(url).netloc.startswith("localhost"):
        return jsonify({"status": "ignored"})

    ix = get_index()
    writer = AsyncWriter(ix)

    soup = BeautifulSoup(content)

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    writer.update_document(title=d.get("title", "Untitled"),
                           url=url,
                           content=text,
                           modified=datetime.datetime.now())
    writer.commit()
    return jsonify({"status": "ok"})
def build_index_writer(self, ix):
    try:
        writer = AsyncWriter(ix)
        self.build_src_index(writer, "java")
        writer.commit()
    except IndexingError as ie:
        print ie.message + "index Error!!!"
def add_item(self, item):
    model = self.model
    doc = self._create_document(model, item)
    index = self.model_index
    writer = AsyncWriter(index, writerargs=self._writer_args())
    writer.update_document(**doc)
    writer.commit()
    self._close_model_index()
def index_app(path):
    print("started indexing file " + path)
    ix = get_index()
    writer = AsyncWriter(ix)
    process_appzip(writer, path)
    writer.commit()
    ix.close()
    print("indexing completed " + path)
def delete_documents(self, doc_set, paths):
    """Delete documents from the index."""
    index = open_dir(self.index_path)
    writer = AsyncWriter(index)
    query = And(
        [Term('set', doc_set),
         Or([Term('path', path) for path in paths])])
    writer.delete_by_query(query)
    writer.commit()
def update(self, index, document, **options):
    index = base._resolve_index(index)
    ix = self._storage.open_index(indexname=index.get_name())
    writer = AsyncWriter(ix)
    adapted_document = index.adapt_document(document)
    writer.update_document(**adapted_document)
    writer.commit()
def add_spam(post):
    ix = init_spam_index()
    writer = AsyncWriter(ix)
    add_post_to_index(post=post, writer=writer)
    writer.commit()
    logger.info("Added spam to index.")
    return
def add_items(self, item_model, items):
    model = self.model
    index = self.model_index
    writer = AsyncWriter(index)
    for item in items:
        doc = self._create_document(model, item)
        writer.update_document(**doc)
    writer.commit()
    self._close_model_index()
def recreate_data(sender=None, **kwargs):
    """ Re-adds all the objects to the index. If they already exist, they will be duplicated. """
    ix = get_or_create_index()
    writer = AsyncWriter(ix)
    for obj in Post.objects.all():
        writer.add_document(**obj.index())
    writer.commit()
def rebuildGroupsIndex():
    createGroupSchema()
    gix = open_dir(groupsindex_dir)
    writer = AsyncWriter(gix)
    groups = rs.zrange("group_ids", 0, -1)
    for gid in groups:
        g = json.loads(rs.hget("group:%s" % gid, 'data'))
        storeGroupInIndex(g, writer)
    writer.commit()
def storeUserInIndex(u, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(uix)
        commit = True
    writer.add_document(nickname=u.get('nickname'),
                        account_id=u.get('account_id'),
                        user_id=u.get('id'))
    if commit:
        writer.commit()
def storeGroupInIndex(group, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(gix)
        commit = True
    writer.add_document(name=group['name'],
                        descr=group['descr'],
                        id=group['id'])
    if commit:
        writer.commit()
def storeBattleInIndex(b, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(bix)
        commit = True
    writer.add_document(id=b.get('id'),
                        descr=b.get('descr'),
                        battle_date=b.get('battle_date'))
    if commit:
        writer.commit()
def open_index_writer(optimize=False):
    writer = AsyncWriter(open_index())
    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        writer.cancel()
    finally:
        writer.commit(optimize=optimize)
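# The generator above is presumably wrapped with contextlib.contextmanager where
# it is defined. A minimal, self-contained sketch of the same pattern; the index
# location, schema, and open_index() helper here are illustrative assumptions.
import logging
import os
from contextlib import contextmanager

from whoosh import index
from whoosh.fields import Schema, ID, TEXT
from whoosh.writing import AsyncWriter

logger = logging.getLogger(__name__)


def open_index():
    # Illustrative helper: create the index on first use, open it afterwards.
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
        return index.create_in("indexdir", Schema(id=ID(unique=True, stored=True), body=TEXT))
    return index.open_dir("indexdir")


@contextmanager
def open_index_writer(optimize=False):
    writer = AsyncWriter(open_index())
    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        writer.cancel()
    finally:
        writer.commit(optimize=optimize)


# Usage: documents added inside the block are committed when the block exits.
with open_index_writer() as writer:
    writer.update_document(id="1", body="hello whoosh")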
def index_post(post):
    """Add or update a post's search entry"""
    writer = AsyncWriter(ix)
    writer.update_document(id=str(post.id),
                           title=post.title,
                           body=post.body,
                           desc=post.desc,
                           subtitle=post.subtitle,
                           tags=post.tags,
                           authors=' '.join([a.name for a in post.authors]))
    writer.commit()
def update_bulk(self, index, documents):
    index = base._resolve_index(index)
    ix = self._storage.open_index(indexname=index.get_name())
    writer = AsyncWriter(ix)
    adapted_documents = (index.adapt_document(doc) for doc in documents)
    for doc in adapted_documents:
        writer.update_document(**doc)
    writer.commit()
def build_index(self):
    """Build index for all parsed documents"""
    ix = self.create_index()
    writer = AsyncWriter(ix)
    for i, document in enumerate(self.documents):
        if document:
            writer.add_document(**document)
        update_progress_bar("Building Index", i, len(self.documents))
    writer.commit(optimize=True)
def rebuildBattleIndex():
    createBattleSchema()
    bix = open_dir(battlesindex_dir)
    writer = AsyncWriter(bix)
    # battles = rs.keys("battle:*")
    # battles = rs.mget(battles)
    # for u in users:
    #     u = json.loads(u)
    #     storeUserInIndex(u, writer)
    writer.commit()
def rebuildUsersIndex():
    createUsersSchema()
    uix = open_dir(usersindex_dir)
    writer = AsyncWriter(uix)
    users = rs.keys("users:*")
    users = rs.mget(users)
    for u in users:
        if u is not None:
            u = json.loads(u)
            storeUserInIndex(u, writer)
    writer.commit()
def addLink(self, url, title, summary, txt):
    titleb = title + " "
    title10 = titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb
    sumario = summary + " "
    sumario2 = sumario + sumario
    text = title10 + sumario2 + " " + txt

    ix = open_dir(self.indexDir, indexname='MAIN', readonly=False)
    writer = AsyncWriter(ix)
    writer.add_document(id=url, content=unicode(text))
    writer.commit()
    ix.close()
def whoosh_task(ids, pool_number, ix, model_class):
    session = sqla['session']

    writer = AsyncWriter(ix)
    for id_ in ids:
        obj = session.query(model_class).filter_by(id=id_).one()
        if obj.title is None or obj.summary is None:
            continue

        writer.add_document(
            title=obj.title,
            summary=obj.summary
        )
    writer.commit()
def index_documents(self, documents):
    """Add or update documents in the index."""
    index = open_dir(self.index_path)
    writer = AsyncWriter(index)
    needs_commit = False
    for document in documents:
        needs_commit = True
        writer.update_document(
            uid=':'.join((document['set'], document['path'])),
            path=document['path'],
            set=document['set'],
            hash=document['hash'],
            title=document['title'],
            content=document['content'],
            kind=document['kind'],
        )
    if needs_commit:
        writer.commit()
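# update_document() only replaces an existing entry when the schema marks a field
# as unique, which is what the composite 'uid' above relies on. A minimal sketch
# of that behaviour; the schema and field values here are illustrative assumptions.
import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT
from whoosh.writing import AsyncWriter

schema = Schema(uid=ID(stored=True, unique=True),
                path=ID(stored=True),
                content=TEXT(stored=True))
os.makedirs("docsindex", exist_ok=True)
ix = index.create_in("docsindex", schema)

writer = AsyncWriter(ix)
writer.update_document(uid="docs:guide.md", path="guide.md", content="first version")
writer.commit()

writer = AsyncWriter(ix)
writer.update_document(uid="docs:guide.md", path="guide.md", content="second version")
writer.commit()

with ix.searcher() as searcher:
    print(searcher.doc_count())  # 1: the second write replaced the first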
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        # Document boosts aren't supported in Whoosh 2.5.0+.
        if 'boost' in doc:
            del doc['boost']

        try:
            writer.update_document(**doc)
        except Exception as e:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                           exc_info=True,
                           extra={"data": {"index": index,
                                           "object": get_identifier(obj)}})

            # reset the writer so there is no 'start_doc' error from the
            # previous failed update attempt
            writer = AsyncWriter(self.index)

    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()
def updateIndex(self):
    ''' Update the whoosh index, which costs a lot of computing resources '''
    page = self.parent.notesTree.currentPage()
    content = self.toPlainText()
    try:
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=page, title=parseTitle(content, page),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
            writer.commit()
        else:
            writer.update_document(
                path=page, title=parseTitle(content, page),
                content=content, tags='')
            writer.commit()
    except:
        print("Whoosh commit failed.")
def clear(self):
    """Remove all content from indexes, and unregister all classes.

    After clear() the service is stopped. It must be started again to
    create new indexes and register classes.
    """
    logger.info("Resetting indexes")
    state = self.app_state

    for _name, idx in state.indexes.items():
        writer = AsyncWriter(idx)
        writer.commit(merge=True, optimize=True, mergetype=CLEAR)

    state.indexes.clear()
    state.indexed_classes.clear()
    state.indexed_fqcn.clear()
    self.clear_update_queue()

    if self.running:
        self.stop()
def createIndex(self):
    print " Whoosh Loading from SQL "
    created = self.createIndexDirIfNotExist()
    if not created:
        # already exists
        return

    conn = sqlite3.connect(self.dbName)
    c = conn.cursor()
    c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
    feeds = c.fetchall()
    conn.close()

    linkN = 1
    schema = Schema(id=TEXT(stored=True), content=TEXT)
    ix = create_in(self.indexDir, schema, indexname='MAIN')
    writer = AsyncWriter(ix)
    for feed in feeds:
        # Discard links without a title
        if isinstance(feed[3], type(None)):
            # print "is Null"
            continue
        index = feed[0]
        # print " Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3]
        linkN += 1
        titolo = feed[3] + " "
        titolo10 = titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo
        sumario = feed[4] + " "
        sumario2 = sumario + sumario
        text = titolo10 + sumario2 + " " + feed[5]
        writer.add_document(id=index, content=unicode(text))
    writer.commit()
    ix.close()
    print " Done Loading from SQL"
def newPageCore(self, item, newPageName):
    pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, '/')
    if not newPageName:
        dialog = LineEditDialog(pagePath, self)
        if dialog.exec_():
            newPageName = dialog.editor.text()
    if newPageName:
        if hasattr(item, 'text'):
            pagePath = os.path.join(self.notePath,
                                    pagePath + '/').replace(os.sep, '/')
        if not QDir(pagePath).exists():
            QDir(self.notePath).mkdir(pagePath)
        fileName = pagePath + newPageName + self.settings.fileExt
        fh = QFile(fileName)
        fh.open(QIODevice.WriteOnly)
        savestream = QTextStream(fh)
        savestream << '# ' + newPageName + '\n'
        savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
        fh.close()
        QTreeWidgetItem(item, [newPageName])
        newItem = self.pageToItem(pagePath + newPageName)
        self.sortItems(0, Qt.AscendingOrder)
        self.setCurrentItem(newItem)
        if hasattr(item, 'text'):
            self.expandItem(item)

        # create attachment folder if not exist
        attDir = self.itemToAttachmentDir(newItem)
        if not QDir(attDir).exists():
            QDir().mkpath(attDir)

        # TODO improvement needed, can be reused somehow
        fileobj = open(fileName, 'r')
        content = fileobj.read()
        fileobj.close()
        self.ix = open_dir(self.settings.indexdir)
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        writer.add_document(path=pagePath + newPageName, content=content)
        writer.commit()
def optimize(self):
    writer = AsyncWriter(self.index)
    writer.commit(optimize=True)
def add_to_index(self, item_id, text):
    from whoosh.writing import AsyncWriter
    writer = AsyncWriter(self.ix)
    writer.update_document(id=item_id, text=text.lower())
    writer.commit()
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted
    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually a
            # delete + add operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add the same document twice in the same transaction. The
                # writer will not delete previous records, ending in duplicate
                # records for the same document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird
                    # bug that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
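# The delete-then-add idiom in the task above mirrors what Whoosh's
# update_document() does internally. A minimal, self-contained sketch of that
# idiom; the schema, directory, and keys here are illustrative assumptions.
import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT
from whoosh.writing import AsyncWriter

schema = Schema(object_key=ID(stored=True, unique=True), body=TEXT(stored=True))
os.makedirs("appindex", exist_ok=True)
ix = index.create_in("appindex", schema)

writer = AsyncWriter(ix)
writer.add_document(object_key="Post:1", body="first draft")
writer.commit()

# "Update" = delete the old version by its key, then add the new one.
writer = AsyncWriter(ix)
writer.delete_by_term("object_key", "Post:1")
writer.add_document(object_key="Post:1", body="final text")
writer.commit()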
def add_to_fts(cls, content, title=None, id=None, source_hash=None, tags=None):
    ix = open_dir(LOCAL_FTS_INDEX)
    writer = AsyncWriter(ix)
    writer.add_document(content=content,
                        title=title,
                        id=id,
                        source_hash=source_hash,
                        tags=tags)
    writer.commit()
# Create index and AsyncWriter object
index = create_in("tweetindex", my_schema)
writer = AsyncWriter(index)

if __name__ == '__main__':
    # Load raw data
    with open("WC2015_headers.csv", 'rb') as to_load:
        data = csv.DictReader(to_load)
        for row in data:
            # Extract required information from date to create python datetime object
            date = row['created_at'][:19] + ' ' + row['created_at'][-4:]

            # Clean text and parse into keywords
            text = row['text'].replace('\\', '')
            keywords = [word for word in word_tokenize(text) if word not in stops]

            # Check for Retweets
            rt = False
            if 'RT ' in text:
                rt = True

            # Add completed document to index
            writer.add_document(id=unicode(row['id']),
                                screen_name=unicode(row['screen_name']),
                                text=unicode(text),
                                contains_retweet=rt,
                                keyword=unicode(" ".join(keywords)),
                                created=datetime.datetime.strptime(date, "%a %b %d %H:%M:%S %Y"))
    writer.commit()
def remove(self, item_id):
    from whoosh.writing import AsyncWriter
    writer = AsyncWriter(self.ix)
    writer.delete_by_term('id', item_id)
    writer.commit()
def newPageCore(self, item, newPageName, useTemplate=False, templateTitle=None, templateBody=None):
    pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, "/")
    if not newPageName:
        if useTemplate:
            dialog = mikitemplate.PickTemplateDialog(pagePath, self.settings, parent=self)
            if dialog.exec_():
                curTitleIdx = dialog.titleTemplates.currentIndex()
                curBodyIdx = dialog.bodyTemplates.currentIndex()
                dtnow = datetime.datetime.now()
                if curTitleIdx > -1:
                    titleItem = dialog.titleTemplates.model().item(curTitleIdx)
                    titleItemContent = titleItem.data(TTPL_COL_DATA)
                    titleItemType = titleItem.data(TTPL_COL_EXTRA_DATA)
                    titleParameter = dialog.titleTemplateParameter.text()
                    newPageName = mikitemplate.makeTemplateTitle(
                        titleItemType, titleItemContent,
                        dtnow=dtnow, userinput=titleParameter)
                if curBodyIdx > -1:
                    bodyItemIdx = dialog.bodyTemplates.rootModelIndex().child(curBodyIdx, 0)
                    bodyFPath = dialog.bodyTemplates.model().filePath(bodyItemIdx)
                else:
                    bodyFPath = None
        else:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
    prevparitem = None
    if newPageName:
        if hasattr(item, "text"):
            pagePath = os.path.join(self.notePath, pagePath + "/").replace(os.sep, "/")
        if not QtCore.QDir(pagePath).exists():
            QtCore.QDir(self.notePath).mkdir(pagePath)

        if not QtCore.QDir(os.path.dirname(newPageName)).exists():
            curdirname = os.path.dirname(newPageName)
            needed_parents = []
            while curdirname != "":
                needed_parents.append(curdirname)
                curdirname = os.path.dirname(curdirname)

            # create the needed hierarchy in reverse order
            for i, needed_parent in enumerate(needed_parents[::-1]):
                paritem = self.pageToItem(needed_parent)
                if paritem is None:
                    if i == 0:
                        self.newPageCore(item, os.path.basename(needed_parent))
                    else:
                        self.newPageCore(prevparitem, os.path.basename(needed_parent))
                    QtCore.QDir(pagePath).mkdir(needed_parent)
                elif not QtCore.QDir(os.path.join(self.notePath, needed_parent).replace(os.sep, "/")).exists():
                    QtCore.QDir(pagePath).mkdir(needed_parent)
                if paritem is not None:
                    prevparitem = paritem
                else:
                    prevparitem = self.pageToItem(needed_parent)

        fileName = pagePath + newPageName + self.settings.fileExt
        fh = QtCore.QFile(fileName)
        fh.open(QtCore.QIODevice.WriteOnly)
        savestream = QtCore.QTextStream(fh)
        if useTemplate and bodyFPath is not None:
            with open(bodyFPath, "r", encoding="utf-8") as templatef:
                savestream << mikitemplate.makeTemplateBody(
                    os.path.basename(newPageName), dtnow=dtnow,
                    dt_in_body_txt=self.tr("Created {}"),
                    body=templatef.read())
        else:
            savestream << mikitemplate.makeDefaultBody(os.path.basename(newPageName), self.tr("Created {}"))
        fh.close()
        if prevparitem is not None:
            QtWidgets.QTreeWidgetItem(prevparitem, [os.path.basename(newPageName)])
        else:
            QtWidgets.QTreeWidgetItem(item, [os.path.basename(newPageName)])
        newItem = self.pageToItem(pagePath + newPageName)
        self.sortItems(0, Qt.AscendingOrder)
        self.setCurrentItem(newItem)
        if hasattr(item, "text"):
            self.expandItem(item)

        # create attachment folder if not exist
        attDir = self.itemToAttachmentDir(newItem)
        if not QtCore.QDir(attDir).exists():
            QtCore.QDir().mkpath(attDir)

        # TODO improvement needed, can be reused somehow
        with open(fileName, "r") as fileobj:
            content = fileobj.read()

        self.ix = open_dir(self.settings.indexdir)
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        writer.add_document(path=pagePath + newPageName, content=content)
        writer.commit()
class Index(object):
    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")

        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)

        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementally update the documents for the given server.

        server_id -- Id of the server to update.
        current_files -- a list of (path, size, mtime) tuples for each file
                         currently on the server.

        Delete all the outdated files from the index and return a list of
        files needing to be reindexed.
        """

        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]

                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(self, server_id, name, path, size, mtime,
                     audio_album=None, audio_artist=None,
                     audio_title=None, audio_year=None):
        """Add a document with the specified fields in the index.

        Changes need to be committed.
        """
        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose
        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")
        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }

        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album
        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist
        if audio_title is not None:
            kwargs["audio_title"] = audio_title
        if audio_year is not None:
            kwargs["audio_year"] = audio_year

        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """Commit the changes in the index and optimize it"""
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index committed")
        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        """Close the index"""
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        self._idx.close()