def update(self, index, iterable, commit=True):
    """Index (add or refresh) every object in ``iterable``.

    Args:
        index: haystack SearchIndex used to prepare each object.
        iterable: objects to (re)index.
        commit: accepted for backend-API compatibility; the writer
            always commits at the end (see note below).
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        try:
            writer.update_document(**doc)
        except Exception as e:  # was Python-2-only `except Exception, e`
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(u"%s while preparing object for update" % e,
                           exc_info=True,
                           extra={"data": {"index": index,
                                           "object": get_identifier(obj)}})

    # An AsyncWriter only persists (and releases its lock) on commit;
    # without this the whole batch was silently dropped.
    writer.commit()
def add():
    """Index a page submitted as JSON ({url, content, title?})."""
    payload = request.get_json(force=True)
    url = payload.get("url")
    content = payload.get("content")

    if not url or not content:
        return jsonify({"status": "missing parameters"})
    # Never index pages served from localhost.
    if urlparse.urlparse(url).netloc.startswith("localhost"):
        return jsonify({"status": "ignored"})

    ix = get_index()
    writer = AsyncWriter(ix)

    # Strip markup: drop <script>/<style> subtrees, then flatten to text.
    soup = BeautifulSoup(content)
    for tag in soup(["script", "style"]):
        tag.extract()  # rip it out

    raw_text = soup.get_text()
    # Normalise whitespace: trim each line, split lines on single spaces,
    # and drop empty fragments.
    stripped_lines = (ln.strip() for ln in raw_text.splitlines())
    fragments = (piece.strip() for ln in stripped_lines for piece in ln.split(" "))
    text = '\n'.join(piece for piece in fragments if piece)

    writer.update_document(title=payload.get("title", "Untitled"),
                           url=url,
                           content=text,
                           modified=datetime.datetime.now())
    writer.commit()
    return jsonify({"status": "ok"})
def updateIndex(self):
    '''
        Update whoosh index, which cost much computing resource
    '''
    page = self.parent.notesTree.currentPage()
    content = self.toPlainText()

    try:
        #writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            # Strip the metadata header before indexing the body, but run
            # the markdown converter on the full text to populate Meta.
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=page, title=parseTitle(content, page),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
            writer.commit()
        else:
            writer.update_document(path=page,
                                   title=parseTitle(content, page),
                                   content=content, tags='')
            writer.commit()
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # SystemExit/KeyboardInterrupt.
        print("Whoosh commit failed.")
def update(self, index, iterable, commit=True):
    """Add/refresh every object of ``iterable`` in the Whoosh index.

    ``commit`` is accepted for backend-API compatibility but ignored:
    the writer always commits when anything was processed (see note
    below).
    """
    if not self.setup_complete:
        self.setup()

    # Refresh to pick up index changes made by other writers/processes.
    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        writer.update_document(**doc)

    # NOTE(review): `len(iterable)` assumes a sized collection, not a
    # generator — confirm callers never pass a lazy iterable.
    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()

        # If spelling support is desired, add to the dictionary.
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            sp = SpellChecker(self.storage)
            sp.add_field(self.index, self.content_field_name)
def whoosh_index(self):
    """Completely rebuild the Whoosh index from every page in the notes tree."""
    it = QTreeWidgetItemIterator(
        self.notesTree, QTreeWidgetItemIterator.All)
    print("Starting complete indexing.")
    #writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    while it.value():
        treeItem = it.value()
        name = self.notesTree.itemToPage(treeItem)
        path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
        print(path)
        # `with` guarantees the file handle is closed even if read() raises.
        with open(path, 'r', encoding='utf-8') as fileobj:
            content = fileobj.read()
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            # Index the body without its metadata header; convert the full
            # text so the markdown Meta dict (tags) gets populated.
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=name, title=parseTitle(content, name),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
        else:
            writer.add_document(path=name, title=parseTitle(content, name),
                                content=content, tags='')
        it += 1
    writer.commit()
    print("Finished completely reindexing.")
def whoosh_index(self):
    """Completely rebuild the Whoosh index from every page in the notes tree."""
    it = QtWidgets.QTreeWidgetItemIterator(
        self.notesTree, QtWidgets.QTreeWidgetItemIterator.All)
    print("Starting complete indexing.")
    #writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    while it.value():
        treeItem = it.value()
        name = self.notesTree.itemToPage(treeItem)
        path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
        print(path)
        # `with` guarantees the file handle is closed even if read() raises.
        with open(path, 'r', encoding='utf-8') as fileobj:
            content = fileobj.read()
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            # Index the body without its metadata header; convert the full
            # text so the markdown Meta dict (tags) gets populated.
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=name, title=parseTitle(content, name),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
        else:
            writer.add_document(path=name, title=parseTitle(content, name),
                                content=content, tags='')
        it += 1
    writer.commit()
    print("Finished completely reindexing.")
def add():
    """Add or refresh one crawled page ({url, content, title?}) in the index."""
    body = request.get_json(force=True)
    url = body.get("url")
    content = body.get("content")

    # Reject incomplete submissions and anything served from localhost.
    if not url or not content:
        return jsonify({"status": "missing parameters"})
    if urlparse.urlparse(url).netloc.startswith("localhost"):
        return jsonify({"status": "ignored"})

    writer = AsyncWriter(get_index())

    soup = BeautifulSoup(content)
    # kill all script and style elements
    for unwanted in soup(["script", "style"]):
        unwanted.extract()  # rip it out

    # get text, then collapse whitespace: strip each line, break on single
    # spaces, and keep only non-empty chunks.
    plain = soup.get_text()
    lines = (line.strip() for line in plain.splitlines())
    chunks = (part.strip() for line in lines for part in line.split(" "))
    cleaned = '\n'.join(c for c in chunks if c)

    writer.update_document(title=body.get("title", "Untitled"),
                           url=url,
                           content=cleaned,
                           modified=datetime.datetime.now())
    writer.commit()
    return jsonify({"status": "ok"})
def update(self, index, iterable, commit=True):
    """Index (add or refresh) every object in ``iterable``.

    Failures are logged (or re-raised when ``silently_fail`` is off).
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        try:
            writer.update_document(**doc)
        except Exception as e:  # was Python-2-only `except Exception, e`
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message.
            # Fix: exceptions have no `__name__`; the class does.
            self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                           exc_info=True,
                           extra={"data": {"index": index,
                                           "object": get_identifier(obj)}})
def add_item(self, item):
    """Add or refresh a single item's document in the model index."""
    document = self._create_document(self.model, item)
    writer = AsyncWriter(self.model_index, writerargs=self._writer_args())
    writer.update_document(**document)
    writer.commit()
    self._close_model_index()
def add_items(self, item_model, items):
    """Add or refresh a batch of items in the model index with one commit."""
    writer = AsyncWriter(self.model_index)
    for entry in items:
        writer.update_document(**self._create_document(self.model, entry))
    writer.commit()
    self._close_model_index()
def update(self, index, document, **options):
    """Add or replace a single document in the resolved index."""
    resolved = base._resolve_index(index)
    ix = self._storage.open_index(indexname=resolved.get_name())
    writer = AsyncWriter(ix)
    writer.update_document(**resolved.adapt_document(document))
    writer.commit()
def update(self, index, document, **options):
    """Add or replace one document in the index named by ``index``."""
    target = base._resolve_index(index)
    storage_index = self._storage.open_index(indexname=target.get_name())
    writer = AsyncWriter(storage_index)
    prepared = target.adapt_document(document)
    writer.update_document(**prepared)
    writer.commit()
def index_post(post):
    """Add or update a post's search entry"""
    writer = AsyncWriter(ix)
    author_names = ' '.join(author.name for author in post.authors)
    writer.update_document(
        id=str(post.id),
        title=post.title,
        body=post.body,
        desc=post.desc,
        subtitle=post.subtitle,
        tags=post.tags,
        authors=author_names,
    )
    writer.commit()
def update_bulk(self, index, documents):
    """Add or replace many documents in the resolved index in one commit."""
    resolved = base._resolve_index(index)
    ix = self._storage.open_index(indexname=resolved.get_name())
    writer = AsyncWriter(ix)
    for document in documents:
        writer.update_document(**resolved.adapt_document(document))
    writer.commit()
def update_bulk(self, index, documents):
    """Bulk add/replace: adapt each document lazily and commit once."""
    target = base._resolve_index(index)
    storage_index = self._storage.open_index(indexname=target.get_name())
    writer = AsyncWriter(storage_index)
    adapted = (target.adapt_document(doc) for doc in documents)
    for fields in adapted:
        writer.update_document(**fields)
    writer.commit()
def update_index(sender, **kwargs):
    """
    Adds/updates an entry in the index. It's connected with
    the post_save signal of the Object objects so will automatically
    index every new or modified Object
    """
    writer = AsyncWriter(get_or_create_index())
    obj = kwargs['instance']
    # Freshly created objects are added; existing ones are replaced in place.
    if kwargs.get('created'):
        writer.add_document(**obj.index())
    else:
        writer.update_document(**obj.index())
    writer.commit()
def update(self, index, iterable, commit=True):
    """Index every object in ``iterable``.

    Objects whose preparation raises ``SkipDocument`` are skipped with a
    debug log; other failures are logged (or re-raised when
    ``silently_fail`` is off). ``commit`` is accepted for API
    compatibility but the writer always commits when anything was
    iterated (see note below).
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        try:
            doc = index.full_prepare(obj)
        except SkipDocument:
            self.log.debug("Indexing for object `%s` skipped", obj)
        else:
            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if "boost" in doc:
                del doc["boost"]

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    "%s while preparing object for update" % e.__class__.__name__,
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    },
                )

    # NOTE(review): `len(iterable)` assumes a sized collection — confirm
    # callers never pass a generator.
    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()

    # Wait for the AsyncWriter's background thread (if one was started)
    # so the commit has finished before we return.
    if writer.ident is not None:
        writer.join()
def index_documents(self, documents):
    """Add or update documents in the index."""
    writer = AsyncWriter(open_dir(self.index_path))
    wrote_any = False
    for entry in documents:
        wrote_any = True
        writer.update_document(
            uid=':'.join((entry['set'], entry['path'])),
            path=entry['path'],
            set=entry['set'],
            hash=entry['hash'],
            title=entry['title'],
            content=entry['content'],
            kind=entry['kind'],
        )
    # Commit only when at least one document was written.
    if wrote_any:
        writer.commit()
def __update_index(self, metadata):
    """Used internally to refresh the index with ``metadata``.

    Any metadata key missing from the schema is first registered as a
    stored TEXT field so the document can be indexed without errors.
    """
    writer = AsyncWriter(self.index)

    existing = self.index.schema.names()
    for field_name in metadata.keys():
        if field_name not in existing:
            writer.add_field(field_name, TEXT(stored=True))

    writer.update_document(**metadata)
    writer.commit()
class IndexPipeline(object):
    """Scrapy pipeline that mirrors crawled items into a Whoosh index."""

    def __init__(self, index):
        self.index = index

    @classmethod
    def from_crawler(cls, crawler):
        # Index location comes from the WHOOSH_INDEX setting ('indexes' default).
        return cls(index=crawler.settings.get('WHOOSH_INDEX', 'indexes'))

    def process_item(self, item, spider):
        self.writer = AsyncWriter(get_index(self.index, zufang_schema))
        parsed_time = datetime.datetime.strptime(
            item['create_time'], "%Y-%m-%d %H:%M:%S")
        self.writer.update_document(
            url=item['url'].decode('utf-8'),
            title=item['title'],
            description=item['description'],
            create_time=parsed_time,
        )
        self.writer.commit()
        return item
def index_documents(self, documents):
    """Add or update documents in the index, committing once at the end."""
    ix = open_dir(self.index_path)
    writer = AsyncWriter(ix)
    count = 0
    for record in documents:
        count += 1
        writer.update_document(
            uid=':'.join((record['set'], record['path'])),
            path=record['path'],
            set=record['set'],
            hash=record['hash'],
            title=record['title'],
            content=record['content'],
            kind=record['kind'],
        )
    # Skip the commit entirely when nothing was indexed.
    if count:
        writer.commit()
class SearchPipeline(object):
    """Scrapy pipeline that keeps a per-domain Whoosh index in sync.

    One AsyncWriter is held open for the whole crawl; documents are
    written in ``process_item`` and everything is committed once in
    ``close_spider``.
    """
    cleanup = False

    def open_spider(self, spider):
        """ When opening spider, open or create index. """
        index_dir = os.path.expanduser('~/.sitesearcher/index')
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)
        # One named index per crawled domain.
        self.indexname = spider.allowed_domains[0]
        if index.exists_in(index_dir, indexname=self.indexname):
            self.index = index.open_dir(index_dir, indexname=self.indexname)
        else:
            self.index = index.create_in(
                index_dir,
                indexname=self.indexname,
                schema=schema,
            )
        self.writer = AsyncWriter(self.index)

    def process_item(self, item, spider):
        """ Add crawled item to index.

        Add items using ``update_document`` to delete any previously
        indexed versions and avoid duplicates
        """
        self.writer.update_document(
            url=item.get('url'),
            content=item.get('content'))

    def close_spider(self, spider):
        """ Close index writer on closing of spider and clean up.

        On closing, delete any previously indexed items that have not
        been updated in this crawl, as these are obviously no longer
        reachable sites.
        """
        # NOTE(review): assumes spider.state['update_list'] holds every URL
        # seen this crawl — confirm against the spider implementation.
        with self.index.searcher() as searcher:
            for page in searcher.all_stored_fields():
                if page['url'] not in spider.state['update_list']:
                    self.writer.delete_by_term('url', page['url'])
        self.writer.commit()
def update(self, index, iterable, commit=True):
    """Index every object in ``iterable``.

    Failures are logged — and the writer is re-created so the next
    object can still be written — unless ``silently_fail`` is off, in
    which case they propagate. ``commit`` is accepted for API
    compatibility but the writer always commits when anything was
    iterated.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        # Document boosts aren't supported in Whoosh 2.5.0+.
        if 'boost' in doc:
            del doc['boost']

        try:
            writer.update_document(**doc)
        except Exception as e:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                           exc_info=True,
                           extra={
                               "data": {
                                   "index": index,
                                   "object": get_identifier(obj)
                               }
                           })

            # reset the writer so there is no 'start_doc' error from the
            # previous failed update attempt
            writer = AsyncWriter(self.index)

    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()
def update_whoosh_index_doc_num(index, item_iter, item_num, index_name, merge=False): result = False try: if index != None and index != False: n = 0 # writer = index.writer() writer = AsyncWriter(index) try: for item in item_iter: n += 1 if index_name == "call": writer.update_document(doc_id=unicode(str(item.id)), name=item.name) else: LOG.error( "index_name error: in the update_whoosh_index_doc_num!" ) if n % 100 == 0: LOG.debug("Update index[%s] doc_id[%s]", index_name, item.id) if n == item_num: writer.commit(merge=merge) LOG.info("Commit index[%s] success.", index_name) # writer = index.writer() writer = AsyncWriter(index) n = 0 if n % item_num != 0: s = time.time() writer.commit(merge=merge) ss = time.time() LOG.debug("Commit use %ss", ss - s) LOG.info("Commit index[%s] success.", index_name) result = True except Exception, e: LOG.exception(e) writer.cancel() result = False else:
def updateIndex(self):
    '''
        Update whoosh index, which cost much computing resource
    '''
    page = self.parent.notesTree.currentPage()
    content = self.toPlainText()

    try:
        #writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            # Index the body without its metadata header; converting the
            # full text populates the markdown Meta dict (tags).
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=page, title=parseTitle(content, page),
                content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
            writer.commit()
        else:
            writer.update_document(
                path=page, title=parseTitle(content, page),
                content=content, tags='')
            writer.commit()
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # SystemExit/KeyboardInterrupt.
        print("Whoosh commit failed.")
def update(self, index, iterable, commit=True):
    """Index (add or refresh) every object in ``iterable``.

    Failures are logged unless ``silently_fail`` is off, in which case
    they propagate. The writer always commits at the end.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        try:
            writer.update_document(**doc)
        except Exception as e:  # was Python-2-only `except Exception, e`
            if not self.silently_fail:
                raise

            self.log.error("Failed to add documents to Whoosh: %s", e)

    # An AsyncWriter only persists (and releases its lock) on commit;
    # without this the whole batch was silently dropped.
    writer.commit()
def update(self, index, iterable, commit=True):
    """Index (add or refresh) every object in ``iterable``.

    Failures are logged unless ``silently_fail`` is off, in which case
    they propagate. The writer always commits at the end.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        try:
            writer.update_document(**doc)
        except Exception as e:  # was Python-2-only `except Exception, e`
            if not self.silently_fail:
                raise

            self.log.error("Failed to add documents to Whoosh: %s", e)

    # An AsyncWriter only persists (and releases its lock) on commit;
    # without this the whole batch was silently dropped.
    writer.commit()
def update(self, index, iterable, commit=True):
    """Prepare and (re)index every object, honouring SkipDocument and the
    ``silently_fail`` flag."""
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        try:
            doc = index.full_prepare(obj)
        except SkipDocument:
            self.log.debug(u"Indexing for object '%s' skipped", obj)
            continue

        # Coerce every prepared value to what Whoosh accepts.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        # Document boosts were dropped in Whoosh 2.5+.
        doc.pop('boost', None)

        try:
            writer.update_document(**doc)
        except Exception as e:
            if not self.silently_fail:
                raise
            self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                           exc_info=True,
                           extra={'data': {'index': index,
                                           'object': get_identifier(obj)}})

    if len(iterable) > 0:
        writer.commit()
def add_items(self, model, objs):
    """(Re)index ``objs`` for ``model`` after refreshing their prepared bodies."""
    for obj in objs:
        obj._body_ = self.prepare_body(obj)

    # Remove stale parent-model rows before writing the fresh documents.
    self._delete_parent_model_data(model, objs)

    writer = AsyncWriter(self.backend.index.refresh())
    for obj in objs:
        document = {
            ID: get_identifier(obj),
            DJANGO_CT: get_model_ct(obj),
            DJANGO_ID: force_text(obj.pk),
            'text': force_text(obj._body_),
        }
        try:
            writer.update_document(**document)
        except Exception as e:
            raise e

    if len(objs) > 0:
        writer.commit()
latest_backends_revids = self._find_latest_backends_revids(self.ix[ALL_REVS], Term(ITEMID, itemid)) if latest_backends_revids: # we have a latest revision, just update the document in the index: assert len(latest_backends_revids) == 1 # this item must have only one latest revision latest_backend_revid = latest_backends_revids[0] # we must fetch from backend because schema for LATEST_REVS is different than for ALL_REVS # (and we can't be sure we have all fields stored, too) meta, _ = self.backend.retrieve(*latest_backend_revid) # we only use meta (not data), because we do not want to transform data->content again (this # is potentially expensive) as we already have the transformed content stored in ALL_REVS index: with self.ix[ALL_REVS].searcher() as searcher: doc = searcher.document(revid=latest_backend_revid[1]) content = doc[CONTENT] doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname, backend_name=latest_backend_revid[0]) writer.update_document(**doc) else: # this is no revision left in this item that could be the new "latest rev", just kill the rev writer.delete_document(docnum_remove) def _modify_index(self, index, schema, wikiname, revids, mode='add', procs=1, limitmb=256): """ modify index contents - add, update, delete the indexed documents for all given revids Note: mode == 'add' is faster but you need to make sure to not create duplicate documents in the index. """ with index.writer(procs=procs, limitmb=limitmb) as writer: for backend_name, revid in revids: if mode in ['add', 'update', ]: meta, data = self.backend.retrieve(backend_name, revid)
# workaround for bug #200 AttributeError: 'FieldCache' object has no attribute 'code' latest_revids = [] if latest_revids: # we have a latest revision, just update the document in the index: assert len(latest_revids) == 1 # this item must have only one latest revision latest_revid = latest_revids[0] # we must fetch from backend because schema for LATEST_REVS is different than for ALL_REVS # (and we can't be sure we have all fields stored, too) meta, _ = self.backend.retrieve(latest_revid) # we only use meta (not data), because we do not want to transform data->content again (this # is potentially expensive) as we already have the transformed content stored in ALL_REVS index: with self.ix[ALL_REVS].searcher() as searcher: doc = searcher.document(revid=latest_revid) content = doc[CONTENT] doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname) writer.update_document(**doc) else: # this is no revision left in this item that could be the new "latest rev", just kill the rev writer.delete_document(docnum_remove) def _modify_index(self, index, schema, wikiname, revids, mode='add', procs=1, limitmb=256): """ modify index contents - add, update, delete the indexed documents for all given revids Note: mode == 'add' is faster but you need to make sure to not create duplicate documents in the index. """ if procs == 1: # MultiSegmentWriter sometimes has issues and is pointless for procs == 1, # so use the simple writer when --procs 1 is given: writer = index.writer()
def add_to_index(self, item_id, text):
    """Insert or refresh a lower-cased text entry keyed by ``item_id``."""
    from whoosh.writing import AsyncWriter

    index_writer = AsyncWriter(self.ix)
    index_writer.update_document(id=item_id, text=text.lower())
    index_writer.commit()
def add_to_index(self, item_id, text):
    """Add ``text`` (normalised to lower case) to the index under ``item_id``."""
    from whoosh.writing import AsyncWriter

    w = AsyncWriter(self.ix)
    w.update_document(id=item_id, text=text.lower())
    w.commit()