예제 #1
0
    def update(self, index, iterable, commit=True):
        """Queue an index update for every object in ``iterable``.

        Each object is turned into a document with ``index.full_prepare``,
        every field value is passed through ``self._from_python`` and the
        result is handed to an ``AsyncWriter`` via ``update_document``.

        :param index: object providing ``full_prepare(obj)``.
        :param iterable: the objects to index.
        :param commit: accepted but not consulted by the code shown here.
        :raises Exception: whatever ``update_document`` raises, unless
            ``self.silently_fail`` is true; then the error is logged and the
            loop continues with the next object.

        NOTE(review): no ``writer.commit()`` is visible in this excerpt --
        confirm the writer is committed elsewhere.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:  # "as" form works on Python 2.6+ and 3
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })
예제 #2
0
def add():
    """Index the page described by the JSON body of the current request.

    The body must carry ``url`` and ``content``; ``title`` is optional and
    defaults to "Untitled".  URLs whose host starts with "localhost" are
    ignored.  The HTML in ``content`` is reduced to plain text (script and
    style elements removed, blank lines dropped) before being written to the
    index.  Returns a JSON response with a ``status`` field.
    """
    payload = request.get_json(force=True)
    url = payload.get("url")
    content = payload.get("content")
    if not (url and content):
        return jsonify({"status": "missing parameters"})
    if urlparse.urlparse(url).netloc.startswith("localhost"):
        return jsonify({"status": "ignored"})

    ix = get_index()
    writer = AsyncWriter(ix)
    soup = BeautifulSoup(content)

    # Script and style elements carry no readable text; remove them.
    for tag in soup(["script", "style"]):
        tag.extract()

    # Strip every line, split it on double spaces, and keep only the
    # non-empty pieces.
    pieces = []
    for raw_line in soup.get_text().splitlines():
        for phrase in raw_line.strip().split("  "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)
    text = '\n'.join(pieces)

    writer.update_document(
        title=payload.get("title", "Untitled"),
        url=url,
        content=text,
        modified=datetime.datetime.now(),
    )
    writer.commit()
    return jsonify({"status": "ok"})
예제 #3
0
 def updateIndex(self):
     ''' Update whoosh index, which cost much computing resource

     Re-indexes the current page of the parent's notes tree using the
     editor's plain-text content.  When the content starts with a metadata
     block and the 'meta' extension is enabled, that block is stripped from
     the indexed content and the 'tags' metadata is stored as a
     comma-joined string; otherwise the content is indexed as-is with
     empty tags.

     Any Exception raised along the way is reported with a printed message
     and is not propagated.
     '''
     page = self.parent.notesTree.currentPage()
     content = self.toPlainText()
     try:
         has_meta = (METADATA_CHECKER.match(content)
                     and 'meta' in self.settings.extensions)
         if has_meta:
             body = METADATA_CHECKER.sub("", content, count=1).lstrip()
             # Converting populates self.settings.md.Meta, read below.
             self.settings.md.reset().convert(content)
         else:
             body = content
         title = parseTitle(content, page)
         if has_meta:
             tags = ','.join(self.settings.md.Meta.get('tags', [])).strip()
         else:
             tags = ''

         # Open the writer only once every field is ready, so a failure
         # while preparing the document cannot leave a writer open.
         writer = AsyncWriter(self.ix)
         writer.update_document(path=page, title=title, content=body,
                                tags=tags)
         writer.commit()
     except Exception:
         # Indexing is best-effort, but a bare "except:" would also swallow
         # KeyboardInterrupt / SystemExit.
         print("Whoosh commit failed.")
예제 #4
0
 def update(self, index, iterable, commit=True):
     """Write a document for every object in ``iterable`` and commit.

     Runs ``self.setup()`` first if setup has not completed, refreshes
     ``self.index``, then prepares each object with ``index.full_prepare``
     and passes every field through ``self._from_python`` before handing it
     to the writer.  When ``iterable`` is non-empty the writer is committed
     regardless of ``commit``, and the content field is added to the
     spelling dictionary if ``settings.HAYSTACK_INCLUDE_SPELLING`` is True.
     """
     if not self.setup_complete:
         self.setup()

     self.index = self.index.refresh()
     whoosh_writer = AsyncWriter(self.index)

     for obj in iterable:
         prepared = index.full_prepare(obj)

         # Whoosh only accepts unicode values, so convert each field in place.
         for field in prepared:
             prepared[field] = self._from_python(prepared[field])

         whoosh_writer.update_document(**prepared)

     if len(iterable) > 0:
         # Always commit for now; skipping it runs into locking issues.
         whoosh_writer.commit()

         # Feed the spelling dictionary only when explicitly enabled.
         spelling_enabled = getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True
         if spelling_enabled:
             checker = SpellChecker(self.storage)
             checker.add_field(self.index, self.content_field_name)
예제 #5
0
 def whoosh_index(self):
     """Index every page of the notes tree in a single writer session.

     Walks all items of ``self.notesTree``, reads each page's file as UTF-8
     and writes one document per page.  Pages that start with a metadata
     block (when the 'meta' extension is enabled) are indexed without that
     block and with their 'tags' metadata; all other pages are indexed
     as-is with empty tags.

     If anything fails while reading or queuing a page, the writer is
     cancelled and the exception is re-raised.
     """
     it = QTreeWidgetItemIterator(
         self.notesTree, QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     writer = AsyncWriter(self.ix)
     try:
         while it.value():
             treeItem = it.value()
             name = self.notesTree.itemToPage(treeItem)
             path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
             print(path)
             # The context manager closes the file even if read() raises.
             with open(path, 'r', encoding='utf-8') as fileobj:
                 content = fileobj.read()
             if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
                 no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
                 # Converting populates self.settings.md.Meta, read below.
                 self.settings.md.reset().convert(content)
                 writer.update_document(
                     path=name, title=parseTitle(content, name), content=no_metadata_content,
                     tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
             else:
                 # NOTE(review): this branch uses add_document while the
                 # metadata branch uses update_document -- confirm the
                 # asymmetry is intended.
                 writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')

             it += 1
     except Exception:
         # Do not leave the writer open when indexing is aborted.
         writer.cancel()
         raise
     writer.commit()
     print("Finished completely reindexing.")
예제 #6
0
 def whoosh_index(self):
     """Index every page of the notes tree in a single writer session.

     Walks all items of ``self.notesTree``, reads each page's file as UTF-8
     and writes one document per page.  Pages that start with a metadata
     block (when the 'meta' extension is enabled) are indexed without that
     block and with their 'tags' metadata; all other pages are indexed
     as-is with empty tags.

     If anything fails while reading or queuing a page, the writer is
     cancelled and the exception is re-raised.
     """
     it = QtWidgets.QTreeWidgetItemIterator(
         self.notesTree, QtWidgets.QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     writer = AsyncWriter(self.ix)
     try:
         while it.value():
             treeItem = it.value()
             name = self.notesTree.itemToPage(treeItem)
             path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
             print(path)
             # The context manager closes the file even if read() raises.
             with open(path, 'r', encoding='utf-8') as fileobj:
                 content = fileobj.read()
             if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
                 no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
                 # Converting populates self.settings.md.Meta, read below.
                 self.settings.md.reset().convert(content)
                 writer.update_document(
                     path=name, title=parseTitle(content, name), content=no_metadata_content,
                     tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
             else:
                 # NOTE(review): this branch uses add_document while the
                 # metadata branch uses update_document -- confirm the
                 # asymmetry is intended.
                 writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')

             it += 1
     except Exception:
         # Do not leave the writer open when indexing is aborted.
         writer.cancel()
         raise
     writer.commit()
     print("Finished completely reindexing.")
예제 #7
0
def add():
    """Add or refresh the index entry for a page posted as JSON.

    Expects ``url`` and ``content`` in the request body (``title`` is
    optional, default "Untitled").  Requests for hosts starting with
    "localhost" are answered with status "ignored".  ``content`` is parsed
    as HTML and flattened to text before it is stored together with the
    current timestamp.
    """
    body = request.get_json(force=True)
    url = body.get("url")
    content = body.get("content")
    if not (url and content):
        return jsonify({"status": "missing parameters"})

    host = urlparse.urlparse(url).netloc
    if host.startswith("localhost"):
        return jsonify({"status": "ignored"})

    ix = get_index()
    writer = AsyncWriter(ix)
    soup = BeautifulSoup(content)

    # Drop elements whose text is code or styling rather than content.
    for element in soup(["script", "style"]):
        element.extract()

    # Trim each line, break it on double spaces, trim the fragments and
    # discard the empty ones.
    stripped_lines = [line.strip() for line in soup.get_text().splitlines()]
    fragments = [
        fragment.strip()
        for line in stripped_lines
        for fragment in line.split("  ")
    ]
    text = '\n'.join(fragment for fragment in fragments if fragment)

    fields = {
        "title": body.get("title", "Untitled"),
        "url": url,
        "content": text,
        "modified": datetime.datetime.now(),
    }
    writer.update_document(**fields)
    writer.commit()
    return jsonify({"status": "ok"})
예제 #8
0
    def update(self, index, iterable, commit=True):
        """Queue an index update for every object in ``iterable``.

        Each object is turned into a document with ``index.full_prepare``,
        every field value is passed through ``self._from_python`` and the
        result is handed to an ``AsyncWriter`` via ``update_document``.

        :param index: object providing ``full_prepare(obj)``.
        :param iterable: the objects to index.
        :param commit: accepted but not consulted by the code shown here.
        :raises Exception: whatever ``update_document`` raises, unless
            ``self.silently_fail`` is true; then the exception's class name
            is logged and the loop continues with the next object.

        NOTE(review): no ``writer.commit()`` is visible in this excerpt --
        confirm the writer is committed elsewhere.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:  # "as" form works on Python 2.6+ and 3
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message.
                # Exception *instances* have no __name__; the class does.
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })
예제 #9
0
 def add_item(self, item):
     """Write the document built from ``item`` to the model index.

     The document comes from ``self._create_document`` and is committed
     through an ``AsyncWriter`` configured with ``self._writer_args()``;
     the model index is closed afterwards.
     """
     document = self._create_document(self.model, item)
     index_writer = AsyncWriter(self.model_index,
                                writerargs=self._writer_args())
     index_writer.update_document(**document)
     index_writer.commit()
     self._close_model_index()
예제 #10
0
 def add_items(self, item_model, items):
     """Write one document per entry of ``items`` in a single commit.

     ``item_model`` is not used by this method; documents are built with
     ``self.model``.  The model index is closed once the commit is issued.
     """
     model = self.model
     index_writer = AsyncWriter(self.model_index)
     for entry in items:
         index_writer.update_document(**self._create_document(model, entry))
     index_writer.commit()
     self._close_model_index()
예제 #11
0
    def update(self, index, document, **options):
        """Add or replace ``document`` in the stored index for ``index``.

        ``index`` is resolved through ``base._resolve_index``; the resolved
        object supplies both the index name and the document adaptation.
        ``options`` is accepted but not used here.
        """
        resolved = base._resolve_index(index)
        whoosh_index = self._storage.open_index(indexname=resolved.get_name())

        writer = AsyncWriter(whoosh_index)
        writer.update_document(**resolved.adapt_document(document))
        writer.commit()
예제 #12
0
파일: base.py 프로젝트: niwinz/needlestack
    def update(self, index, document, **options):
        """Store a single document, replacing any previous version of it.

        The index argument goes through ``base._resolve_index`` first; the
        result names the stored index to open and adapts ``document`` into
        the keyword arguments given to the writer.  ``options`` is unused.
        """
        target = base._resolve_index(index)
        storage_index = self._storage.open_index(indexname=target.get_name())
        async_writer = AsyncWriter(storage_index)

        fields = target.adapt_document(document)
        async_writer.update_document(**fields)
        async_writer.commit()
예제 #13
0
def index_post(post):
    """Add or update a post's search entry"""
    writer = AsyncWriter(ix)
    # Author names are stored as one space-separated string.
    entry = {
        'id': str(post.id),
        'title': post.title,
        'body': post.body,
        'desc': post.desc,
        'subtitle': post.subtitle,
        'tags': post.tags,
        'authors': ' '.join(author.name for author in post.authors),
    }
    writer.update_document(**entry)
    writer.commit()
예제 #14
0
    def update_bulk(self, index, documents):
        """Add or replace every document of ``documents`` in one commit.

        ``index`` is resolved through ``base._resolve_index``; each document
        is adapted by the resolved index just before it is queued.
        """
        resolved = base._resolve_index(index)
        whoosh_index = self._storage.open_index(indexname=resolved.get_name())
        writer = AsyncWriter(whoosh_index)

        for raw_document in documents:
            writer.update_document(**resolved.adapt_document(raw_document))

        writer.commit()
예제 #15
0
파일: base.py 프로젝트: niwinz/needlestack
    def update_bulk(self, index, documents):
        """Queue all ``documents`` on a single writer, then commit once.

        The resolved index (``base._resolve_index``) provides the stored
        index name and converts each document into writer keyword arguments.
        """
        target = base._resolve_index(index)
        storage_index = self._storage.open_index(indexname=target.get_name())
        async_writer = AsyncWriter(storage_index)

        # Adapt lazily so each document is converted right before queuing.
        for fields in map(target.adapt_document, documents):
            async_writer.update_document(**fields)

        async_writer.commit()
예제 #16
0
def update_index(sender, **kwargs):
    """ Adds/updates an entry in the index. It's connected with
        the post_save signal of the Object objects so will automatically
        index every new or modified Object
    """
    index_writer = AsyncWriter(get_or_create_index())
    instance = kwargs['instance']
    # A truthy "created" flag means a brand-new entry; anything else is
    # treated as an update of an existing one.
    if kwargs.get('created'):
        store = index_writer.add_document
    else:
        store = index_writer.update_document
    store(**instance.index())
    index_writer.commit()
예제 #17
0
    def update(self, index, iterable, commit=True):
        """Index every object in ``iterable`` and commit the result.

        Objects for which ``index.full_prepare`` raises ``SkipDocument`` are
        logged at debug level and skipped.  For the rest, each field is
        passed through ``self._from_python`` and any "boost" entry is
        removed before the document is queued.  Errors from
        ``update_document`` propagate unless ``self.silently_fail`` is set,
        in which case they are logged and the loop continues.  ``commit`` is
        not consulted: a non-empty ``iterable`` is always committed.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        async_writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                prepared = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
                continue

            # Whoosh insists on unicode values, so coerce every field.
            for field in prepared:
                prepared[field] = self._from_python(prepared[field])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if "boost" in prepared:
                del prepared["boost"]

            try:
                async_writer.update_document(**prepared)
            except Exception as exc:
                if not self.silently_fail:
                    raise

                # Log only the identifier, never the object itself, so that
                # formatting the log record cannot hit encoding errors.
                message = "%s while preparing object for update" % exc.__class__.__name__
                log_extra = {
                    "data": {"index": index, "object": get_identifier(obj)},
                }
                self.log.error(message, exc_info=True, extra=log_extra)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            async_writer.commit()
            # Wait for the writer only if it has an ident (assumes
            # AsyncWriter is a thread whose ident is set once started --
            # TODO confirm).
            if async_writer.ident is not None:
                async_writer.join()
예제 #18
0
 def index_documents(self, documents):
     """Add or update documents in the index.

     ``documents`` is an iterable of mappings with the keys 'set', 'path',
     'hash', 'title', 'content' and 'kind'.  Each document's ``uid`` is
     built as "<set>:<path>".  The writer is committed only if at least one
     document was queued; otherwise it is cancelled so it is not left open.
     """
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     # A flag rather than len(): ``documents`` may be a one-shot iterable.
     needs_commit = False
     for document in documents:
         needs_commit = True
         writer.update_document(
             uid=':'.join((document['set'], document['path'])),
             path=document['path'],
             set=document['set'],
             hash=document['hash'],
             title=document['title'],
             content=document['content'],
             kind=document['kind'],
         )
     if needs_commit:
         writer.commit()
     else:
         # Nothing to write: release the writer instead of abandoning it.
         writer.cancel()
예제 #19
0
    def __update_index(self, metadata):
        """
        This method is used internally to update the index when needed.

        Every key of ``metadata`` that the index schema does not know yet is
        added as a stored TEXT field; the metadata is then written as one
        document and committed.
        """
        index_writer = AsyncWriter(self.index)

        # Extend the schema with any metadata keys it has not seen before.
        schema_fields = self.index.schema.names()
        unknown_fields = [name for name in metadata if name not in schema_fields]
        for name in unknown_fields:
            index_writer.add_field(name, TEXT(stored=True))

        # Write the document itself, then commit and close.
        index_writer.update_document(**metadata)
        index_writer.commit()
예제 #20
0
class IndexPipeline(object):
    """Item pipeline that writes every processed item to a Whoosh index."""

    def __init__(self, index):
        # Passed to get_index() for every processed item.
        self.index = index

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's WHOOSH_INDEX setting.

        Falls back to 'indexes' when the setting is absent.
        """
        index_setting = crawler.settings.get('WHOOSH_INDEX', 'indexes')
        return cls(index=index_setting)

    def process_item(self, item, spider):
        """Write ``item`` to the index and return it unchanged.

        ``item['create_time']`` is parsed with the format
        "%Y-%m-%d %H:%M:%S"; ``item['url']`` is decoded as UTF-8.
        """
        self.writer = AsyncWriter(get_index(self.index, zufang_schema))
        created_at = datetime.datetime.strptime(item['create_time'],
                                                "%Y-%m-%d %H:%M:%S")
        document = {
            'url': item['url'].decode('utf-8'),
            'title': item['title'],
            'description': item['description'],
            'create_time': created_at,
        }
        self.writer.update_document(**document)
        self.writer.commit()
        return item
예제 #21
0
 def index_documents(self, documents):
     """Add or update documents in the index.

     ``documents`` is an iterable of mappings with the keys 'set', 'path',
     'hash', 'title', 'content' and 'kind'.  Each document's ``uid`` is
     built as "<set>:<path>".  The writer is committed only if at least one
     document was queued; otherwise it is cancelled so it is not left open.
     """
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     # A flag rather than len(): ``documents`` may be a one-shot iterable.
     needs_commit = False
     for document in documents:
         needs_commit = True
         writer.update_document(
             uid=':'.join((document['set'], document['path'])),
             path=document['path'],
             set=document['set'],
             hash=document['hash'],
             title=document['title'],
             content=document['content'],
             kind=document['kind'],
         )
     if needs_commit:
         writer.commit()
     else:
         # Nothing to write: release the writer instead of abandoning it.
         writer.cancel()
예제 #22
0
class SearchPipeline(object):
    """Item pipeline that stores crawled pages in a per-domain Whoosh index.

    Implements the ``open_spider`` / ``process_item`` / ``close_spider``
    hooks of the Scrapy item-pipeline interface.
    """

    cleanup = False

    def open_spider(self, spider):
        """ When opening spider, open or create index. """

        index_dir = os.path.expanduser('~/.sitesearcher/index')
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)

        # One index per crawled domain, named after the first allowed domain.
        self.indexname = spider.allowed_domains[0]
        if index.exists_in(index_dir, indexname=self.indexname):
            self.index = index.open_dir(index_dir, indexname=self.indexname)
        else:
            self.index = index.create_in(
                index_dir,
                indexname=self.indexname,
                schema=schema,
            )
        self.writer = AsyncWriter(self.index)

    def process_item(self, item, spider):
        """ Add crawled item to index.

        Add items using ``update_document`` to delete any previously indexed
        versions and avoid duplicates.

        Returns the item so that later pipeline stages still receive it.
        """

        self.writer.update_document(
            url=item.get('url'), content=item.get('content'))
        return item

    def close_spider(self, spider):
        """ Close index writer on closing of spider and clean up.

        On closing, delete any previously indexed items that have not been
        updated in this crawl, as these are obviously no longer reachable sites.
        """

        with self.index.searcher() as searcher:
            for page in searcher.all_stored_fields():
                if page['url'] not in spider.state['update_list']:
                    self.writer.delete_by_term('url', page['url'])
        self.writer.commit()
예제 #23
0
    def update(self, index, iterable, commit=True):
        """Index every object in ``iterable`` and commit the result.

        Each object is prepared with ``index.full_prepare``, every field is
        passed through ``self._from_python`` and any 'boost' entry is removed
        before the document is queued on an ``AsyncWriter``.  Errors from
        ``update_document`` propagate unless ``self.silently_fail`` is set;
        in that case they are logged and a fresh writer replaces the current
        one.  ``commit`` is not consulted: a non-empty ``iterable`` is always
        committed.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })
                
                # reset the writer so there is no 'start_doc' error from the
                # previous failed update attempt
                # NOTE(review): the writer being replaced is neither committed
                # nor cancelled here, so documents queued on it before the
                # failure appear to be dropped -- confirm this is intended.
                writer = AsyncWriter(self.index)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
예제 #24
0
def update_whoosh_index_doc_num(index,
                                item_iter,
                                item_num,
                                index_name,
                                merge=False):
    result = False
    try:
        if index != None and index != False:
            n = 0
            # writer = index.writer()
            writer = AsyncWriter(index)
            try:
                for item in item_iter:
                    n += 1
                    if index_name == "call":
                        writer.update_document(doc_id=unicode(str(item.id)),
                                               name=item.name)
                    else:
                        LOG.error(
                            "index_name error: in the update_whoosh_index_doc_num!"
                        )
                    if n % 100 == 0:
                        LOG.debug("Update index[%s] doc_id[%s]", index_name,
                                  item.id)
                    if n == item_num:
                        writer.commit(merge=merge)
                        LOG.info("Commit index[%s] success.", index_name)
                        # writer = index.writer()
                        writer = AsyncWriter(index)
                        n = 0
                if n % item_num != 0:
                    s = time.time()
                    writer.commit(merge=merge)
                    ss = time.time()
                    LOG.debug("Commit use %ss", ss - s)
                    LOG.info("Commit index[%s] success.", index_name)
                result = True
            except Exception, e:
                LOG.exception(e)
                writer.cancel()
                result = False
        else:
예제 #25
0
파일: mikiedit.py 프로젝트: albfan/mikidown
 def updateIndex(self):
     ''' Update whoosh index, which cost much computing resource

     Re-indexes the current page of the parent's notes tree using the
     editor's plain-text content.  When the content starts with a metadata
     block and the 'meta' extension is enabled, that block is stripped from
     the indexed content and the 'tags' metadata is stored as a
     comma-joined string; otherwise the content is indexed as-is with
     empty tags.

     Any Exception raised along the way is reported with a printed message
     and is not propagated.
     '''
     page = self.parent.notesTree.currentPage()
     content = self.toPlainText()
     try:
         has_meta = (METADATA_CHECKER.match(content)
                     and 'meta' in self.settings.extensions)
         if has_meta:
             body = METADATA_CHECKER.sub("", content, count=1).lstrip()
             # Converting populates self.settings.md.Meta, read below.
             self.settings.md.reset().convert(content)
         else:
             body = content
         title = parseTitle(content, page)
         if has_meta:
             tags = ','.join(self.settings.md.Meta.get('tags', [])).strip()
         else:
             tags = ''

         # Open the writer only once every field is ready, so a failure
         # while preparing the document cannot leave a writer open.
         writer = AsyncWriter(self.ix)
         writer.update_document(path=page, title=title, content=body,
                                tags=tags)
         writer.commit()
     except Exception:
         # Indexing is best-effort, but a bare "except:" would also swallow
         # KeyboardInterrupt / SystemExit.
         print("Whoosh commit failed.")
예제 #26
0
    def update(self, index, iterable, commit=True):
        """Queue an index update for every object in ``iterable``.

        Each object is turned into a document with ``index.full_prepare``,
        every field value is passed through ``self._from_python`` and the
        result is handed to an ``AsyncWriter`` via ``update_document``.

        :param index: object providing ``full_prepare(obj)``.
        :param iterable: the objects to index.
        :param commit: accepted but not consulted by the code shown here.
        :raises Exception: whatever ``update_document`` raises, unless
            ``self.silently_fail`` is true; then the error is logged and the
            loop continues with the next object.

        NOTE(review): no ``writer.commit()`` is visible in this excerpt --
        confirm the writer is committed elsewhere.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:  # "as" form works on Python 2.6+ and 3
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Whoosh: %s", e)
예제 #27
0
 def update(self, index, iterable, commit=True):
     """Queue an index update for every object in ``iterable``.

     Each object is turned into a document with ``index.full_prepare``,
     every field value is passed through ``self._from_python`` and the
     result is handed to an ``AsyncWriter`` via ``update_document``.

     :param index: object providing ``full_prepare(obj)``.
     :param iterable: the objects to index.
     :param commit: accepted but not consulted by the code shown here.
     :raises Exception: whatever ``update_document`` raises, unless
         ``self.silently_fail`` is true; then the error is logged and the
         loop continues with the next object.

     NOTE(review): no ``writer.commit()`` is visible in this excerpt --
     confirm the writer is committed elsewhere.
     """
     if not self.setup_complete:
         self.setup()

     self.index = self.index.refresh()
     writer = AsyncWriter(self.index)

     for obj in iterable:
         doc = index.full_prepare(obj)

         # Really make sure it's unicode, because Whoosh won't have it any
         # other way.
         for key in doc:
             doc[key] = self._from_python(doc[key])

         try:
             writer.update_document(**doc)
         except Exception as e:  # "as" form works on Python 2.6+ and 3
             if not self.silently_fail:
                 raise

             self.log.error("Failed to add documents to Whoosh: %s", e)
예제 #28
0
    def update(self, index, iterable, commit=True):
        """Index every object in ``iterable`` and commit the result.

        Objects for which ``index.full_prepare`` raises ``SkipDocument`` are
        logged at debug level and skipped.  The remaining documents have each
        field passed through ``self._from_python`` and their 'boost' entry
        removed before being queued.  Errors from ``update_document``
        propagate unless ``self.silently_fail`` is set, in which case they
        are logged.  ``commit`` is not consulted: a non-empty ``iterable``
        is always committed.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                prepared = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object '%s' skipped", obj)
                continue

            # Convert every field value in place before handing it to Whoosh.
            for field in prepared:
                prepared[field] = self._from_python(prepared[field])

            if 'boost' in prepared:
                del prepared['boost']

            try:
                writer.update_document(**prepared)
            except Exception as exc:
                if not self.silently_fail:
                    raise

                # Log the identifier only, not the object itself.
                message = u"%s while preparing object for update" % exc.__class__.__name__
                log_extra = {
                    'data': {'index': index, 'object': get_identifier(obj)},
                }
                self.log.error(message, exc_info=True, extra=log_extra)

        if len(iterable) > 0:
            writer.commit()
예제 #29
0
    def add_items(self, model, objs):
        """Index ``objs`` and commit if there was anything to write.

        Every object first gets its ``_body_`` attribute set from
        ``self.prepare_body``; ``self._delete_parent_model_data`` is then
        called for the batch before the documents are queued.  Each document
        holds the object's identifier, its model content type, its primary
        key as text and the prepared body under 'text'.

        Exceptions raised by ``update_document`` propagate unchanged.
        """
        for obj in objs:
            obj._body_ = self.prepare_body(obj)

        self._delete_parent_model_data(model, objs)

        index = self.backend.index.refresh()
        writer = AsyncWriter(index)

        for obj in objs:
            doc = {
                ID: get_identifier(obj),
                DJANGO_CT: get_model_ct(obj),
                DJANGO_ID: force_text(obj.pk),
                'text': force_text(obj._body_),
            }

            # No try/except here: catching an exception only to "raise e"
            # adds nothing and, on Python 2, discards the original traceback.
            writer.update_document(**doc)

        if len(objs) > 0:
            writer.commit()
예제 #30
0
                latest_backends_revids = self._find_latest_backends_revids(self.ix[ALL_REVS], Term(ITEMID, itemid))
                if latest_backends_revids:
                    # we have a latest revision, just update the document in the index:
                    assert len(latest_backends_revids) == 1  # this item must have only one latest revision
                    latest_backend_revid = latest_backends_revids[0]
                    # we must fetch from backend because schema for LATEST_REVS is different than for ALL_REVS
                    # (and we can't be sure we have all fields stored, too)
                    meta, _ = self.backend.retrieve(*latest_backend_revid)
                    # we only use meta (not data), because we do not want to transform data->content again (this
                    # is potentially expensive) as we already have the transformed content stored in ALL_REVS index:
                    with self.ix[ALL_REVS].searcher() as searcher:
                        doc = searcher.document(revid=latest_backend_revid[1])
                        content = doc[CONTENT]
                    doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname,
                                           backend_name=latest_backend_revid[0])
                    writer.update_document(**doc)
                else:
                    # this is no revision left in this item that could be the new "latest rev", just kill the rev
                    writer.delete_document(docnum_remove)

    def _modify_index(self, index, schema, wikiname, revids, mode='add', procs=1, limitmb=256):
        """
        modify index contents - add, update, delete the indexed documents for all given revids

        Note: mode == 'add' is faster but you need to make sure to not create duplicate
              documents in the index.
        """
        with index.writer(procs=procs, limitmb=limitmb) as writer:
            for backend_name, revid in revids:
                if mode in ['add', 'update', ]:
                    meta, data = self.backend.retrieve(backend_name, revid)
예제 #31
0
                    # workaround for bug #200 AttributeError: 'FieldCache' object has no attribute 'code'
                    latest_revids = []
                if latest_revids:
                    # we have a latest revision, just update the document in the index:
                    assert len(latest_revids) == 1 # this item must have only one latest revision
                    latest_revid = latest_revids[0]
                    # we must fetch from backend because schema for LATEST_REVS is different than for ALL_REVS
                    # (and we can't be sure we have all fields stored, too)
                    meta, _ = self.backend.retrieve(latest_revid)
                    # we only use meta (not data), because we do not want to transform data->content again (this
                    # is potentially expensive) as we already have the transformed content stored in ALL_REVS index:
                    with self.ix[ALL_REVS].searcher() as searcher:
                        doc = searcher.document(revid=latest_revid)
                        content = doc[CONTENT]
                    doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname)
                    writer.update_document(**doc)
                else:
                    # this is no revision left in this item that could be the new "latest rev", just kill the rev
                    writer.delete_document(docnum_remove)

    def _modify_index(self, index, schema, wikiname, revids, mode='add', procs=1, limitmb=256):
        """
        modify index contents - add, update, delete the indexed documents for all given revids

        Note: mode == 'add' is faster but you need to make sure to not create duplicate
              documents in the index.
        """
        if procs == 1:
            # MultiSegmentWriter sometimes has issues and is pointless for procs == 1,
            # so use the simple writer when --procs 1 is given:
            writer = index.writer()
예제 #32
0
 def add_to_index(self, item_id, text):
     """Store the lower-cased ``text`` under ``item_id`` and commit."""
     from whoosh.writing import AsyncWriter
     index_writer = AsyncWriter(self.ix)
     lowered = text.lower()
     index_writer.update_document(id=item_id, text=lowered)
     index_writer.commit()
예제 #33
0
 def add_to_index(self, item_id, text):
     """Add or replace the index entry for ``item_id``.

     The text is lower-cased before it is written.
     """
     from whoosh.writing import AsyncWriter
     async_writer = AsyncWriter(self.ix)
     fields = {'id': item_id, 'text': text.lower()}
     async_writer.update_document(**fields)
     async_writer.commit()