Example #1
 def delete_documents(self, doc_set, paths):
     """Delete documents from the index."""
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     query = And([
         Term('set', doc_set),
         Or([Term('path', path) for path in paths])
     ])
     writer.delete_by_query(query)
     writer.commit()
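
Most of the snippets on this page assume the same few Whoosh imports. For orientation, here is a self-contained sketch of the pattern in Example #1; index_path and the field names 'set'/'path' are illustrative:

from whoosh.index import open_dir
from whoosh.query import And, Or, Term
from whoosh.writing import AsyncWriter

def delete_documents(index_path, doc_set, paths):
    """Delete the documents in doc_set whose path is listed in paths."""
    index = open_dir(index_path)
    # AsyncWriter writes on a background thread when the index is locked
    writer = AsyncWriter(index)
    query = And([
        Term('set', doc_set),
        Or([Term('path', path) for path in paths]),
    ])
    writer.delete_by_query(query)
    writer.commit()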
Example #2
 def whoosh_index(self):
     it = QTreeWidgetItemIterator(
         self.notesTree, QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     #writer = self.ix.writer()
     writer = AsyncWriter(self.ix)
     while it.value():
         treeItem = it.value()
         name = self.notesTree.itemToPage(treeItem)
         path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
         print(path)
         fileobj = open(path, 'r', encoding='utf-8')
         content = fileobj.read()
         fileobj.close()
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=name, title=parseTitle(content, name), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
         else:
             writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')
        
         it += 1
     writer.commit()
     print("Finished completely reindexing.")
Example #3
 def whoosh_index(self):
     it = QtWidgets.QTreeWidgetItemIterator(
         self.notesTree, QtWidgets.QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     #writer = self.ix.writer()
     writer = AsyncWriter(self.ix)
     while it.value():
         treeItem = it.value()
         name = self.notesTree.itemToPage(treeItem)
         path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
         print(path)
         fileobj = open(path, 'r', encoding='utf-8')
         content = fileobj.read()
         fileobj.close()
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=name, title=parseTitle(content, name), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
         else:
             writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')
        
         it += 1
     writer.commit()
     print("Finished completely reindexing.")
Example #4
def incremental_index(indexdir, indexname, rowData):
    """
    Note: every incremental add produces a new segment file, which takes up
    disk space, so be careful here.
    :param rowData: the data for one row
    :param indexdir:
    :param indexname:
    :return:
    """
    # print(indexdir)

    storage = FileStorage(indexdir)
    ix = FileIndex(storage, indexname=indexname)

    writer = AsyncWriter(ix)
    # Build the document as a keyword dict instead of assembling and
    # exec()-ing a source string, which would break on quotes in values.
    doc = {}
    for key in rowData:
        val = rowData[key]

        if not val:
            val = ""
        else:
            # str() covers Decimal as well; no escaping is needed since we
            # no longer generate source code from the values
            val = str(val)
        doc[key] = val
    writer.add_document(**doc)
    # e.g. writer.add_document(content="撸啊撸啊德玛西亚", ID="abc")
    # writer.add_document(content="人在塔在", ID="hik")

    writer.commit()
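
The docstring's warning is about segment accumulation: every commit writes a new segment file. A minimal sketch of one way to reclaim the space, assuming ix is an already-open index, is an occasional optimizing commit:

from whoosh.writing import AsyncWriter

def compact_index(ix):
    # commit(optimize=True) merges all segments into one, reclaiming
    # the space accumulated by incremental writes
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)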
Example #5
def search_spam(
    post,
    ix,
):
    """
    Search spam index for posts similar to this one.
    Returns
    """
    writer = AsyncWriter(ix)
    add_post_to_index(post=post, writer=writer, is_spam=post.is_spam)
    writer.commit()

    # Search for this post in the spam index
    fields = ['uid']

    results = search.preform_whoosh_search(ix=ix,
                                           query=post.uid,
                                           fields=fields)

    # Perform more_like_this on this post's content
    similar_content = results[0].more_like_this('content', top=5)

    # Remove this post from the spam index after results are collected.
    writer = AsyncWriter(ix)
    writer.delete_by_term('uid', text=post.uid)
    writer.commit()

    # Get the results into a list and close the searcher object.
    similar_content = list(map(search.normalize_result, similar_content))

    results.searcher.close()

    return similar_content
Example #6
    def delPage(self, item):

        index = item.childCount()
        while index > 0:
            index = index - 1
            self.dirname = item.child(index).text(0)
            self.delPage(item.child(index))

        # remove attachment folder
        attDir = self.itemToAttachmentDir(item)
        for info in QDir(attDir).entryInfoList():
            QDir().remove(info.absoluteFilePath())
        QDir().rmdir(attDir)

        pagePath = self.itemToPage(item)
        self.ix = open_dir(self.settings.indexdir)
        query = QueryParser('path', self.ix.schema).parse(pagePath)
        #writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        n = writer.delete_by_query(query)
        # n = writer.delete_by_term('path', pagePath)
        writer.commit()
        #self.ix.close()
        b = QDir(self.notePath).remove(self.pageToFile(pagePath))
        parent = item.parent()
        if parent is not None:
            parentPage = self.itemToPage(parent)
            index = parent.indexOfChild(item)
            parent.takeChild(index)
            if parent.childCount() == 0:  # if no child, dir not needed
                QDir(self.notePath).rmdir(parentPage)
        else:
            index = self.indexOfTopLevelItem(item)
            self.takeTopLevelItem(index)
        QDir(self.notePath).rmdir(pagePath)
Example #7
    def saveToWhoosh(self, df, dataset_id, overwrite=False):
        # use whoosh search engine to enable full text search
        if not os.path.exists(self.whoosh_root):
            os.mkdir(self.whoosh_root)
        ws_path = os.path.join(self.whoosh_root, dataset_id)
        if not os.path.exists(ws_path):
            os.mkdir(ws_path)
            logMsg(
                str(os.path.abspath(ws_path)) +
                ' does not exist, creating it to store the whoosh index')
            overwrite = True
        elif overwrite:
            shutil.rmtree(ws_path)
            os.mkdir(ws_path)
        schema = Schema(DOC_ID=NUMERIC(stored=True), TEXT=TEXT)
        if overwrite:
            ix = create_in(ws_path, schema)
        else:
            ix = open_dir(ws_path)
        writer = AsyncWriter(ix)

        with self.workflow.dao.create_session() as session:
            doc_iter = session.query(Document).filter(
                Document.DATASET_ID == dataset_id)
            for doc in doc_iter:
                writer.add_document(DOC_ID=doc.DOC_ID, TEXT=doc.TEXT)
            writer.commit()
Example #8
 def updateIndex(self):
     ''' Update whoosh index, which cost much computing resource '''
     page = self.parent.notesTree.currentPage()
     content = self.toPlainText()
     try:
         #writer = self.ix.writer()
         writer = AsyncWriter(self.ix)
         if METADATA_CHECKER.match(
                 content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("",
                                                        content,
                                                        count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=page,
                 title=parseTitle(content, page),
                 content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags',
                                                         [])).strip())
             writer.commit()
         else:
             writer.update_document(path=page,
                                    title=parseTitle(content, page),
                                    content=content,
                                    tags='')
             writer.commit()
     except Exception as e:
         print("Whoosh commit failed: %s" % e)
Example #9
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            writer.update_document(**doc)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
Example #10
 def update(self, x, who=None):
     # implement search here
     x = str(x)
     aindex = AsyncWriter(self.index, delay=0.2)
     aindex.add_document(content=x)
     aindex.commit()
     return self._emit(x)
Example #11
def creating_searching_ranking(selected_analyzer, name_of_file,
                               scoring_function, path):

    # creating a Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))
    directory_containing_the_index = path
    # write an index based on the schema into the directory at 'path'
    ix = create_in(directory_containing_the_index, schema)
    # reopen the index from the same directory
    ix = index.open_dir(directory_containing_the_index)
    writer = AsyncWriter(ix)  # the writer is used to add content to the fields

    ALL_DOCUMENTS_file_name = name_of_file  # path to the file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    csv_reader = csv.reader(in_file, delimiter=',')  # read the file
    next(csv_reader)  # skip the header: the first line holds the field names

    for record in csv_reader:  #for each row in the 'csv_test' file
        id = record[1]  #read id
        title = record[2]  #read title
        content = record[3]  #read body
        writer.add_document(id=id, content=title + ' ' + content)

    writer.commit()
    in_file.close()  #finish writing in the index file
Example #12
def incremental_index(t, l, c, dirname):
    id = (Searcher().getcount() + 1)
    ix = index.open_dir(dirname)
    # The set of all paths in the index
    #with ix.searcher() as searcher:

    indexed_feeds = set()

    with ix.searcher() as searcher:
        writer = AsyncWriter(ix)

        # Loop over the stored fields in the index
        for fields in searcher.all_stored_fields():
            indexed_feed = fields['title']
            indexed_feeds.add(indexed_feed)

        # Loop over the files in the filesystem
        # Assume we have a function that gathers the filenames of the
        # documents to be indexed
        if t not in indexed_feeds:
            # This is either a file that's changed, or a new file
            # that wasn't indexed before. So index it!
            wooshDocuments(id, writer, t, l, c)

        writer.commit()
        return id
Example #13
 def update(self, index, iterable, commit=True):
     if not self.setup_complete:
         self.setup()
     
     self.index = self.index.refresh()
     writer = AsyncWriter(self.index)
     
     for obj in iterable:
         doc = index.full_prepare(obj)
         
         # Really make sure it's unicode, because Whoosh won't have it any
         # other way.
         for key in doc:
             doc[key] = self._from_python(doc[key])
         
         writer.update_document(**doc)
     
     if len(iterable) > 0:
         # For now, commit no matter what, as we run into locking issues otherwise.
         writer.commit()
         
         # If spelling support is desired, add to the dictionary.
         if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
             sp = SpellChecker(self.storage)
             sp.add_field(self.index, self.content_field_name)
Example #14
def index_posts(posts, ix=None, overwrite=False, add_func=add_index):
    """
    Create or update a search index of posts.
    """

    ix = ix or init_index()
    # AsyncWriter falls back to a background thread if the index is locked
    writer = AsyncWriter(ix)

    elapsed, progress = timer_func()
    total = posts.count()
    stream = islice(zip(count(1), posts), None)

    # Loop through posts and add to index
    for step, post in stream:
        progress(step, total=total, msg="posts indexed")
        add_func(post=post, writer=writer)

    # Commit to index
    if overwrite:
        logger.info("Overwriting the old index")
        writer.commit(mergetype=writing.CLEAR)
    else:
        logger.debug("Committing to index")
        writer.commit()

    elapsed(f"Committed {total} posts to index.")
Example #15
    def delPage(self, item):

        index = item.childCount()
        while index > 0:
            index = index - 1
            self.dirname = item.child(index).text(0)
            self.delPage(item.child(index))

        # remove attachment folder
        attDir = self.itemToAttachmentDir(item)
        for info in QtCore.QDir(attDir).entryInfoList():
            QtCore.QDir().remove(info.absoluteFilePath())
        QtCore.QDir().rmdir(attDir)

        pagePath = self.itemToPage(item)
        self.ix = open_dir(self.settings.indexdir)
        query = QueryParser("path", self.ix.schema).parse(pagePath)
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        n = writer.delete_by_query(query)
        # n = writer.delete_by_term('path', pagePath)
        writer.commit()
        # self.ix.close()
        b = QtCore.QDir(self.notePath).remove(self.pageToFile(pagePath))
        parent = item.parent()
        if parent is not None:
            parentPage = self.itemToPage(parent)
            index = parent.indexOfChild(item)
            parent.takeChild(index)
            if parent.childCount() == 0:  # if no child, dir not needed
                QtCore.QDir(self.notePath).rmdir(parentPage)
        else:
            index = self.indexOfTopLevelItem(item)
            self.takeTopLevelItem(index)
        QtCore.QDir(self.notePath).rmdir(pagePath)
Example #16
def createSearchableData(root):

    ana = analysis.StemmingAnalyzer()
    ## define the schema of the index
    schema = Schema(title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    genre=KEYWORD(stored=True),
                    link=ID(stored=True),
                    path=ID(stored=True),
                    price=ID(stored=True),
                    content=TEXT(stored=True),
                    contentData=TEXT)

    ## create the indexdir directory
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    cwd = os.getcwd()
    print(cwd)

    ## create an index writer that adds documents according to the schema
    ix = create_in("indexdir", schema)
    writer = AsyncWriter(ix)

    ## find the files in the directory and save their paths
    filepaths = [os.path.join(root, i) for i in os.listdir(root)]

    num = 1
    # for each path found...
    for path in filepaths:
        #print(num)
        num += 1

        fp = open(path, 'r', encoding="utf-8")
        #print(path)

        # The first line holds the title, the second the author, the third
        # the genre, the fourth the link, the fifth the price
        fileTitle = fp.readline()
        fileAuthor = fp.readline()
        fileGenre = fp.readline()
        fileLink = fp.readline()
        filePrice = fp.readline()

        # The rest of the file holds the plot
        filePlot = fp.read()

        # the contentData field holds the preprocessed plot
        fileData = tokenize(filePlot)

        ## Add a document to the index, with all the required fields
        writer.add_document(title=fileTitle,
                            path=path,
                            author=fileAuthor,
                            genre=fileGenre,
                            link=fileLink,
                            price=filePrice,
                            content=filePlot,
                            contentData=fileData)
        fp.close()
    writer.commit()
Example #17
def store_page(user, url):
    writer = AsyncWriter(idx)
    resp = requests.get(url)
    content = parse(resp.content)
    now = datetime.now()
    writer.add_document(ts=now, user=unicode(user), url=unicode(url), content=content)
    writer.commit()
Example #18
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(filename)s %(levelname)s: %(asctime)s: %(message)s')
    logger = logging.getLogger('main')
    logger.info('Executing indexing module')
    logger.info('Reading file')
    du = doc_utilities()
    du.read_data_set(file='data/wikipedia_text_files.csv')
    logger.info('Task1 - Number of documents = {}'.format(
        du.get_number_documents()))
    du.process_documents_for_indexing()
    collection = du.get_collection_json()[0:1000000]

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", MySchema)
    #writer = ix.writer()
    writer = AsyncWriter(ix)
    with tqdm(total=len(collection),
              desc="Indexing documents",
              bar_format="{l_bar}{bar} [ time left: {remaining} ]") as pbar:
        for d in collection:
            text = str(d['text'])
            idt = str(d['id'])
            title = str(d['title'])
            writer.add_document(id=idt, title=title, text=text)
            pbar.update(1)
    writer.commit()
Example #19
def add():
    d = request.get_json(force=True)
    url = d.get("url")
    content = d.get("content")
    if not url or not content:
        return jsonify({"status": "missing parameters"})
    if urlparse.urlparse(url).netloc.startswith("localhost"):
        return jsonify({"status": "ignored"})
    ix = get_index()
    writer = AsyncWriter(ix)
    soup = BeautifulSoup(content)
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    writer.update_document(title=d.get("title", "Untitled"),
        url=url,
        content=text,
        modified=datetime.datetime.now())
    writer.commit()
    return jsonify({"status": "ok"})
Example #20
 def build_index_writer(self, ix):
     try:
         writer = AsyncWriter(ix)
         self.build_src_index(writer, "java")
         writer.commit()
     except IndexingError as ie:
         print("Indexing error: " + str(ie))
Example #21
 def add_item(self, item):
     model = self.model
     doc = self._create_document(model, item)
     index = self.model_index
     writer = AsyncWriter(index, writerargs=self._writer_args())
     writer.update_document(**doc)
     writer.commit()
     self._close_model_index()
Example #22
def index_app(path):
    print("started indexing file " + path)
    ix = get_index()
    writer = AsyncWriter(ix)
    process_appzip(writer, path)
    writer.commit()
    ix.close()
    print("indexing completed " + path)
Example #23
 def delete_documents(self, doc_set, paths):
     """Delete documents from the index."""
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     query = And(
         [Term('set', doc_set),
          Or([Term('path', path) for path in paths])])
     writer.delete_by_query(query)
     writer.commit()
Example #24
    def update(self, index, document, **options):
        index = base._resolve_index(index)

        ix = self._storage.open_index(indexname=index.get_name())
        writer = AsyncWriter(ix)

        adapted_document = index.adapt_document(document)
        writer.update_document(**adapted_document)
        writer.commit()
Example #25
def add_spam(post):

    ix = init_spam_index()
    writer = AsyncWriter(ix)
    add_post_to_index(post=post, writer=writer)
    writer.commit()
    logger.info("Added spam to index.")

    return
Example #26
 def add_items(self, item_model, items):
     model = self.model
     index = self.model_index
     writer = AsyncWriter(index)
     for item in items:
         doc = self._create_document(model, item)
         writer.update_document(**doc)
     writer.commit()
     self._close_model_index()
Example #27
def recreate_data(sender=None, **kwargs):
    """ Readds all the Object in the index. If they already exists
        will be duplicated
    """
    ix = get_or_create_index()
    writer = AsyncWriter(ix)
    for obj in Post.objects.all():
        writer.add_document(**obj.index())
    writer.commit()
Example #28
    def update(self, index, document, **options):
        index = base._resolve_index(index)

        ix = self._storage.open_index(indexname=index.get_name())
        writer = AsyncWriter(ix)

        adapted_document = index.adapt_document(document)
        writer.update_document(**adapted_document)
        writer.commit()
Example #29
def rebuildGroupsIndex():
    createGroupSchema()
    gix = open_dir(groupsindex_dir)
    writer = AsyncWriter(gix)

    groups = rs.zrange("group_ids", 0, -1)
    for gid in groups:
        g = json.loads(rs.hget("group:%s" % gid, 'data'))
        storeGroupInIndex(g, writer)
    writer.commit()
Example #30
def storeUserInIndex(u, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(uix)
        commit = True
    writer.add_document(nickname=u.get('nickname'),
                        account_id=u.get('account_id'),
                        user_id=u.get('id'))
    if commit:
        writer.commit()
Example #31
def storeGroupInIndex(group, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(gix)
        commit = True
    writer.add_document(name=group['name'],
                        descr=group['descr'],
                        id=group['id'])
    if commit:
        writer.commit()
Example #32
def storeBattleInIndex(b, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(bix)
        commit = True
    writer.add_document(id=b.get('id'),
                        descr=b.get('descr'),
                        battle_date=b.get('battle_date'))
    if commit:
        writer.commit()
Example #33
# assuming the contextlib.contextmanager decorator, implied by the
# yield/cancel/commit pattern below
@contextmanager
def open_index_writer(optimize=False):
    writer = AsyncWriter(open_index())

    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        writer.cancel()
    finally:
        writer.commit(optimize=optimize)
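
A usage sketch for the context manager above, assuming open_index() returns an open Whoosh index; the field names are hypothetical:

# the writer is committed on exit, or cancelled if the block raises
with open_index_writer(optimize=False) as writer:
    writer.update_document(id='42', title='Hello', content='...')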
Example #34
def index_post(post):
    """Add or update a post's search entry"""
    writer = AsyncWriter(ix)
    writer.update_document(id=str(post.id),
                           title=post.title,
                           body=post.body,
                           desc=post.desc,
                           subtitle=post.subtitle,
                           tags=post.tags,
                           authors=' '.join([a.name for a in post.authors]))
    writer.commit()
Example #35
    def update_bulk(self, index, documents):
        index = base._resolve_index(index)

        ix = self._storage.open_index(indexname=index.get_name())
        writer = AsyncWriter(ix)

        adapted_documents = (index.adapt_document(doc) for doc in documents)
        for doc in adapted_documents:
            writer.update_document(**doc)

        writer.commit()
Example #36
    def build_index(self):
        """Build index for all parsed documents"""
        ix = self.create_index()
        writer = AsyncWriter(ix)

        for i, document in enumerate(self.documents):
            if document:
                writer.add_document(**document)
            update_progress_bar("Building Index", i, len(self.documents))

        writer.commit(optimize=True)
Example #37
def rebuildBattleIndex():
    createBattleSchema()
    bix = open_dir(battlesindex_dir)
    writer = AsyncWriter(bix)

    #battles = rs.keys("battle:*")
    #battles = rs.mget(battles)
    #for u in users:
    #u = json.loads(u)
    #storeUserInIndex(u, writer)
    writer.commit()
Example #38
    def update_bulk(self, index, documents):
        index = base._resolve_index(index)

        ix = self._storage.open_index(indexname=index.get_name())
        writer = AsyncWriter(ix)

        adapted_documents = (index.adapt_document(doc)
                                for doc in documents)
        for doc in adapted_documents:
            writer.update_document(**doc)

        writer.commit()
Example #39
def rebuildUsersIndex():
    createUsersSchema()
    uix = open_dir(usersindex_dir)
    writer = AsyncWriter(uix)

    users = rs.keys("users:*")
    users = rs.mget(users)
    for u in users:
        if u is not None:
            u = json.loads(u)
            storeUserInIndex(u, writer)
    writer.commit()
Example #40
 def addLink(self, url, title, summary, txt):

     # weight the title 10x and the summary 2x by repeating them
     title10 = (title + " ") * 10
     sumario2 = (summary + " ") * 2
     text = title10 + sumario2 + " " + txt

     ix = open_dir(self.indexDir, indexname='MAIN', readonly=False)
     writer = AsyncWriter(ix)
     writer.add_document(id=url, content=unicode(text))
     writer.commit()
     ix.close()
Example #41
def whoosh_task(ids, pool_number, ix, model_class):
    session = sqla['session']

    writer = AsyncWriter(ix)
    for id_ in ids:
        obj = session.query(model_class).filter_by(id=id_).one()
        if obj.title is None or obj.summary is None:
            continue

        writer.add_document(
            title=obj.title,
            summary=obj.summary
        )

    writer.commit()
Example #42
 def index_documents(self, documents):
     """Add or update documents in the index."""
     index = open_dir(self.index_path)
     writer = AsyncWriter(index)
     needs_commit = False
     for document in documents:
         needs_commit = True
         writer.update_document(
             uid=':'.join((document['set'], document['path'])),
             path=document['path'],
             set=document['set'],
             hash=document['hash'],
             title=document['title'],
             content=document['content'],
             kind=document['kind'],
         )
     if needs_commit:
         writer.commit()
Example #43
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })
                
                # reset the writer so there is no 'start_doc' error from the
                # previous failed update attempt
                writer = AsyncWriter(self.index)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
Example #44
 def updateIndex(self):
     ''' Update whoosh index, which cost much computing resource '''
     page = self.parent.notesTree.currentPage()
     content = self.toPlainText()        
     try:
         #writer = self.ix.writer()
         writer = AsyncWriter(self.ix)
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=page, title=parseTitle(content, page), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
             writer.commit()
         else:
             writer.update_document(
                 path=page, title=parseTitle(content, page), content=content, tags='')
             writer.commit()
     except Exception as e:
         print("Whoosh commit failed: %s" % e)
Example #45
    def clear(self):
        """Remove all content from indexes, and unregister all classes.

        After clear() the service is stopped. It must be started again
        to create new indexes and register classes.
        """
        logger.info("Resetting indexes")
        state = self.app_state

        for _name, idx in state.indexes.items():
            writer = AsyncWriter(idx)
            writer.commit(merge=True, optimize=True, mergetype=CLEAR)

        state.indexes.clear()
        state.indexed_classes.clear()
        state.indexed_fqcn.clear()
        self.clear_update_queue()

        if self.running:
            self.stop()
Example #46
    def createIndex(self):
        print "    Whoosh Loading from SQL"
        created = self.createIndexDirIfNotExist()
        if not created:
            # already exists
            return

        conn = sqlite3.connect(self.dbName)
        c = conn.cursor()
        c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
        feeds = c.fetchall()
        conn.close()

        linkN = 1
        schema = Schema(id=TEXT(stored=True), content=TEXT)
        ix = create_in(self.indexDir, schema, indexname='MAIN')
        writer = AsyncWriter(ix)

        for feed in feeds:
            # discard links without a title
            if feed[3] is None:
                #print "is Null"
                continue

            index = feed[0]
            # print "    Whoosh Loaded Title " + str(linkN) + ":" + feed[3]
            linkN += 1

            # weight the title 10x and the summary 2x by repeating them
            titolo10 = (feed[3] + " ") * 10
            sumario2 = (feed[4] + " ") * 2
            text = titolo10 + sumario2 + " " + feed[5]

            writer.add_document(id=index, content=unicode(text))

        writer.commit()
        ix.close()
        print "    Done Loading from SQL"
Example #47
    def newPageCore(self, item, newPageName):
        pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, '/')
        if not newPageName:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
        if newPageName:
            if hasattr(item, 'text'):
                pagePath = os.path.join(self.notePath,
                                        pagePath + '/').replace(os.sep, '/')
            if not QDir(pagePath).exists():
                QDir(self.notePath).mkdir(pagePath)
            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QFile(fileName)
            fh.open(QIODevice.WriteOnly)
            savestream = QTextStream(fh)
            savestream << '# ' + newPageName + '\n'
            savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
            fh.close()
            QTreeWidgetItem(item, [newPageName])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, 'text'):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QDir(attDir).exists():
                QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            fileobj = open(fileName, 'r')
            content = fileobj.read()
            fileobj.close()
            self.ix = open_dir(self.settings.indexdir)
            #writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath+newPageName, content=content)
            writer.commit()
Example #48
 def optimize(self):
     writer = AsyncWriter(self.index)
     writer.commit(optimize=True)
Example #49
 def add_to_index(self, item_id, text):
     from whoosh.writing import AsyncWriter
     writer = AsyncWriter(self.ix)
     writer.update_document(id=item_id, text=text.lower())
     writer.commit()
Example #50
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add the same document twice in the same transaction:
                # the writer would not delete the previous record, leaving
                # duplicate records for the same document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
Example #51
 def add_to_fts(cls, content, title=None, id=None, source_hash=None, tags=None):
     ix = open_dir(LOCAL_FTS_INDEX)
     writer = AsyncWriter(ix)
     writer.add_document(content=content, title=title, id=id, source_hash=source_hash, tags=tags)
     writer.commit()
Example #52
#Create index and AsyncWriter object
index = create_in("tweetindex", my_schema)
writer = AsyncWriter(index)

if __name__=='__main__':
    #Load raw data
    with open("WC2015_headers.csv",'rb') as to_load:
        data=csv.DictReader(to_load)
        for row in data:
            #Extract required information from date to create python datetime object
            date=row['created_at'][:19]+' '+row['created_at'][-4:]
            
            #Clean text and parse into keywords
            text=row['text'].replace('\\','')
            keywords=[word for word in word_tokenize(text) if word not in stops]
            
            #Check for Retweets
            rt=False
            if 'RT ' in text:
                rt=True
            
            #Add completed document to index
            writer.add_document(id = unicode(row['id']), 
                                screen_name = unicode(row['screen_name']),
                                text = unicode(text),
                                contains_retweet=rt,
                                keyword = unicode(" ".join(keywords)),
                                created = datetime.datetime.strptime(date, "%a %b %d %H:%M:%S %Y")
                                )
        writer.commit()
Example #53
 def remove(self, item_id):
     from whoosh.writing import AsyncWriter
     writer = AsyncWriter(self.ix)
     writer.delete_by_term('id', item_id)
     writer.commit()
Example #54
    def newPageCore(self, item, newPageName, useTemplate=False, templateTitle=None, templateBody=None):
        pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, "/")
        if not newPageName:
            if useTemplate:
                dialog = mikitemplate.PickTemplateDialog(pagePath, self.settings, parent=self)
                if dialog.exec_():
                    curTitleIdx = dialog.titleTemplates.currentIndex()
                    curBodyIdx = dialog.bodyTemplates.currentIndex()
                    dtnow = datetime.datetime.now()
                    if curTitleIdx > -1:
                        titleItem = dialog.titleTemplates.model().item(curTitleIdx)
                        titleItemContent = titleItem.data(TTPL_COL_DATA)
                        titleItemType = titleItem.data(TTPL_COL_EXTRA_DATA)
                        titleParameter = dialog.titleTemplateParameter.text()
                        newPageName = mikitemplate.makeTemplateTitle(
                            titleItemType, titleItemContent, dtnow=dtnow, userinput=titleParameter
                        )
                    if curBodyIdx > -1:
                        bodyItemIdx = dialog.bodyTemplates.rootModelIndex().child(curBodyIdx, 0)
                        bodyFPath = dialog.bodyTemplates.model().filePath(bodyItemIdx)
                    else:
                        bodyFPath = None
            else:
                dialog = LineEditDialog(pagePath, self)
                if dialog.exec_():
                    newPageName = dialog.editor.text()

        prevparitem = None

        if newPageName:
            if hasattr(item, "text"):
                pagePath = os.path.join(self.notePath, pagePath + "/").replace(os.sep, "/")
            if not QtCore.QDir(pagePath).exists():
                QtCore.QDir(self.notePath).mkdir(pagePath)

            if not QtCore.QDir(os.path.dirname(newPageName)).exists():
                curdirname = os.path.dirname(newPageName)
                needed_parents = []
                while curdirname != "":
                    needed_parents.append(curdirname)
                    curdirname = os.path.dirname(curdirname)

                # create the needed hierarchy in reverse order
                for i, needed_parent in enumerate(needed_parents[::-1]):
                    paritem = self.pageToItem(needed_parent)
                    if paritem is None:
                        if i == 0:
                            self.newPageCore(item, os.path.basename(needed_parent))
                        else:
                            self.newPageCore(prevparitem, os.path.basename(needed_parent))
                        QtCore.QDir(pagePath).mkdir(needed_parent)
                    elif not QtCore.QDir(os.path.join(self.notePath, needed_parent).replace(os.sep, "/")).exists():
                        QtCore.QDir(pagePath).mkdir(needed_parent)
                    if paritem is not None:
                        prevparitem = paritem
                    else:
                        prevparitem = self.pageToItem(needed_parent)

            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QtCore.QFile(fileName)
            fh.open(QtCore.QIODevice.WriteOnly)

            savestream = QtCore.QTextStream(fh)
            if useTemplate and bodyFPath is not None:
                with open(bodyFPath, "r", encoding="utf-8") as templatef:
                    savestream << mikitemplate.makeTemplateBody(
                        os.path.basename(newPageName),
                        dtnow=dtnow,
                        dt_in_body_txt=self.tr("Created {}"),
                        body=templatef.read(),
                    )
            else:
                savestream << mikitemplate.makeDefaultBody(os.path.basename(newPageName), self.tr("Created {}"))
            fh.close()
            if prevparitem is not None:
                QtWidgets.QTreeWidgetItem(prevparitem, [os.path.basename(newPageName)])
            else:
                QtWidgets.QTreeWidgetItem(item, [os.path.basename(newPageName)])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, "text"):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QtCore.QDir(attDir).exists():
                QtCore.QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            with open(fileName, "r") as fileobj:
                content = fileobj.read()

            self.ix = open_dir(self.settings.indexdir)
            # writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath + newPageName, content=content)
            writer.commit()
Example #55
class Index(object):
    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")

        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)

        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementaly update the documents for the given server.

        server_id      -- Id of the server to update.
        current_files  -- a list of (path, size, mtime) tuples for each files
                          currently on the server.

        Delete all the outdated files from the index and returns a list
        of files needing to be reindexed.
        """

        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]

                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(
        self, server_id, name, path, size, mtime, audio_album=None, audio_artist=None, audio_title=None, audio_year=None
    ):
        """Add a document with the specified fields in the index.

        Changes need to be commited.

        """

        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose

        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")

        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }

        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album

        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist

        if audio_title is not None:
            kwargs["audio_title"] = audio_title

        if audio_year is not None:
            kwargs["audio_year"] = audio_year

        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """ Commit the changes in the index and optimize it """
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index commited")

        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        """ Close the index """
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        self._idx.close()
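
For reference, a hedged sketch of the update flow this class appears designed for; server and scan_server are hypothetical stand-ins for the FTP-crawling side:

import os

# hypothetical driver: delete stale documents, then reindex what changed
idx = Index('/var/lib/ftpvista/index', persist=None)
server_id = server.get_server_id()
current_files = scan_server(server)  # hypothetical: [(path, size, mtime), ...]
for path, size, mtime in idx.incremental_server_update(server_id, current_files):
    idx.add_document(server_id, os.path.basename(path), path, str(size), str(mtime))
idx.commit(optimize=False)
idx.close()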