Example No. 1
 def update(self, x, who=None):
     # implement search here
     x = str(x)
     aindex = AsyncWriter(self.index, delay=0.2)
     aindex.add_document(content=x)
     aindex.commit()
     return self._emit(x)
Example No. 2
def creating_searching_ranking(selected_analyzer, name_of_file,
                               scoring_function, path):

    #creating Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))
    directory_containing_the_index = path
    ix = create_in(
        directory_containing_the_index, schema
    )  #creating the index based on the schema in the directory given by 'path'
    directory_containing_the_index = path
    ix = index.open_dir(
        directory_containing_the_index)  #opening the index file
    writer = AsyncWriter(ix)  #writer will be used to add content to the fields

    ALL_DOCUMENTS_file_name = name_of_file  #path to the file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    csv_reader = csv.reader(in_file, delimiter=',')  #reading the file
    next(csv_reader)  # skip the header: the first line contains the name of each field.

    for record in csv_reader:  #for each row in the 'csv_test' file
        id = record[1]  #read id
        title = record[2]  #read title
        content = record[3]  #read body
        writer.add_document(id=id, content=title + ' ' + content)

    writer.commit()
    in_file.close()  #finish writing in the index file
Example No. 3
def createSearchableData(root):

    ana = analysis.StemmingAnalyzer()
    ## define the schema of my index
    schema = Schema( title=TEXT(stored=True),\
                     author=TEXT(stored=True),\
                     genre=KEYWORD(stored=True), \
                     link=ID(stored=True), \
                     path=ID(stored=True), \
                     price=ID(stored=True), \
                     content=TEXT(stored=True),\
                     contentData=TEXT)

    ## create the indexdir directory
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    cwd = os.getcwd()
    print(cwd)

    ## Create an index writer that adds documents according to the schema
    ix = create_in("indexdir", schema)
    writer = AsyncWriter(ix)

    ## Find the files in the directory and save their paths
    filepaths = [os.path.join(root, i) for i in os.listdir(root)]

    num = 1
    # for each path found...
    for path in filepaths:
        #print(num)
        num += 1

        fp = open(path, 'r', encoding="utf-8")
        #print(path)

        # The first line holds the title, the second the author, the third the genre, the fourth the link, the fifth the price
        fileTitle = fp.readline()
        fileAuthor = fp.readline()
        fileGenre = fp.readline()
        fileLink = fp.readline()
        filePrice = fp.readline()

        # The rest of the file contains the plot
        filePlot = fp.read()

        # the contentData field holds the preprocessed plot
        fileData = tokenize(filePlot)

        ## Add a document to the index with all the required fields
        writer.add_document( title = fileTitle,\
                             path = path,\
                             author = fileAuthor,\
                             genre = fileGenre,\
                             link = fileLink,\
                             price = filePrice, \
                             content = filePlot,\
                             contentData = fileData)
        fp.close()
    writer.commit()
Example No. 4
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(filename)s %(levelname)s: %(asctime)s: %(message)s')
    logger = logging.getLogger('main')
    logger.info('Executing indexing module')
    logger.info('Reading file')
    du = doc_utilities()
    du.read_data_set(file='data/wikipedia_text_files.csv')
    logger.info('Task1 - Number of documents = {}'.format(
        du.get_number_documents()))
    du.process_documents_for_indexing()
    collection = du.get_collection_json()[0:1000000]

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", MySchema)
    #writer = ix.writer()
    writer = AsyncWriter(ix)
    with tqdm(total=len(collection),
              desc="Indexing documents",
              bar_format="{l_bar}{bar} [ time left: {remaining} ]") as pbar:
        for d in collection:
            text = str(d['text'])
            idt = str(d['id'])
            title = str(d['title'])
            writer.add_document(id=idt, title=title, text=text)
            pbar.update(1)
    writer.commit()
Example No. 5
 def whoosh_index(self):
     it = QTreeWidgetItemIterator(
         self.notesTree, QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     #writer = self.ix.writer()
     writer = AsyncWriter(self.ix)
     while it.value():
         treeItem = it.value()
         name = self.notesTree.itemToPage(treeItem)
         path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
         print(path)
         fileobj = open(path, 'r', encoding='utf-8')
         content = fileobj.read()
         fileobj.close()
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=name, title=parseTitle(content, name), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
         else:
             writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')
        
         it += 1
     writer.commit()
     print("Finished completely reindexing.")
Example No. 6
    def saveToWhoosh(self, df, dataset_id, overwrite=False):
        # use whoosh search engine to enable full text search
        if not os.path.exists(self.whoosh_root):
            os.mkdir(self.whoosh_root)
        ws_path = os.path.join(self.whoosh_root, dataset_id)
        if not os.path.exists(ws_path):
            os.mkdir(ws_path)
            logMsg(
                str(os.path.abspath(ws_path)) +
                ' does not exist, create it to store whoosh index')
            overwrite = True
        elif overwrite:
            shutil.rmtree(ws_path)
            os.mkdir(ws_path)
        schema = Schema(DOC_ID=NUMERIC(stored=True), TEXT=TEXT)
        if overwrite:
            ix = create_in(ws_path, schema)
        else:
            ix = open_dir(ws_path)
        writer = AsyncWriter(ix)

        with self.workflow.dao.create_session() as session:
            doc_iter = session.query(Document).filter(
                Document.DATASET_ID == dataset_id)
            for doc in doc_iter:
                writer.add_document(DOC_ID=doc.DOC_ID, TEXT=doc.TEXT)
            writer.commit()
        pass
Example No. 7
 def whoosh_index(self):
     it = QtWidgets.QTreeWidgetItemIterator(
         self.notesTree, QtWidgets.QTreeWidgetItemIterator.All)
     print("Starting complete indexing.")
     #writer = self.ix.writer()
     writer = AsyncWriter(self.ix)
     while it.value():
         treeItem = it.value()
         name = self.notesTree.itemToPage(treeItem)
         path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
         print(path)
         fileobj = open(path, 'r', encoding='utf-8')
         content = fileobj.read()
         fileobj.close()
         if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
             no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
             self.settings.md.reset().convert(content)
             writer.update_document(
                 path=name, title=parseTitle(content, name), content=no_metadata_content,
                 tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
         else:
             writer.add_document(path=name, title=parseTitle(content, name), content=content, tags='')
        
         it += 1
     writer.commit()
     print("Finished completely reindexing.")
Example No. 8
def store_page(user, url):
    writer = AsyncWriter(idx)
    resp = requests.get(url)
    content = parse(resp.content)
    now = datetime.now()
    writer.add_document(ts=now, user=unicode(user), url=unicode(url), content=content)
    writer.commit()
Example No. 9
def recreate_data(sender=None, **kwargs):
    """ Readds all the Object in the index. If they already exists
        will be duplicated
    """
    ix = get_or_create_index()
    writer = AsyncWriter(ix)
    for obj in Post.objects.all():
        writer.add_document(**obj.index())
    writer.commit()
Example No. 10
def storeBattleInIndex(b, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(bix)
        commit = True
    writer.add_document(id=b.get('id'),
                        descr=b.get('descr'),
                        battle_date=b.get('battle_date'))
    if commit:
        writer.commit()
Example No. 11
def storeGroupInIndex(group, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(gix)
        commit = True
    writer.add_document(name=group['name'],
                        descr=group['descr'],
                        id=group['id'])
    if commit:
        writer.commit()
Example No. 12
def storeUserInIndex(u, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(uix)
        commit = True
    writer.add_document(nickname=u.get('nickname'),
                        account_id=u.get('account_id'),
                        user_id=u.get('id'))
    if commit:
        writer.commit()
Example No. 13
    def build_index(self):
        """Build index for all parsed documents"""
        ix = self.create_index()
        writer = AsyncWriter(ix)

        for i, document in enumerate(self.documents):
            if document:
                writer.add_document(**document)
            update_progress_bar("Building Index", i, len(self.documents))

        writer.commit(optimize=True)
Example No. 14
def whoosh_task(ids, pool_number, ix, model_class):
    session = sqla['session']

    writer = AsyncWriter(ix)
    for id_ in ids:
        obj = session.query(model_class).filter_by(id=id_).one()
        if obj.title is None or obj.summary is None:
            continue

        writer.add_document(title=obj.title, summary=obj.summary)

    writer.commit()
Example No. 15
def add_document(video_id, title, description, text):
    """
    Adds a single document to the index
    """
    #TODO: check
    index = open_index()
    writer = AsyncWriter(index)
    writer.add_document(text=text,
                        title=title,
                        id=video_id,
                        description=description)
    writer.commit()
Example No. 16
def update_index(sender, **kwargs):
    """ Adds/updates an entry in the index. It's connected with
        the post_save signal of the Object objects so will automatically
        index every new or modified Object
    """
    ix = get_or_create_index()
    writer = AsyncWriter(ix)
    obj = kwargs['instance']
    if "created" in kwargs and kwargs['created']:
        writer.add_document(**obj.index())
    else:
        writer.update_document(**obj.index())
    writer.commit()
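The docstring above says this handler is connected to the post_save signal of the indexed model; a minimal sketch of that wiring, assuming a Django project where Post (as in the recreate_data example) is the model being indexed:

# Sketch: connect the handler so every saved Post is (re)indexed automatically
from django.db.models.signals import post_save

post_save.connect(update_index, sender=Post)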
Example No. 17
 def addLink(self, url, title, summary, txt):
     
     titleb = title + " "
     title10 = titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb
     sumario = summary + " "
     sumario2 = sumario + sumario
     text = title10 + sumario2 + " " + txt
     
     ix = open_dir(self.indexDir, indexname='MAIN', readonly=False)
     writer = AsyncWriter(ix)
     writer.add_document(id=url, content=unicode(text)) 
     writer.commit()
     ix.close()
Example No. 18
    def addLink(self, url, title, summary, txt):

        titleb = title + " "
        title10 = titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb
        sumario = summary + " "
        sumario2 = sumario + sumario
        text = title10 + sumario2 + " " + txt

        ix = open_dir(self.indexDir, indexname='MAIN', readonly=False)
        writer = AsyncWriter(ix)
        writer.add_document(id=url, content=unicode(text))
        writer.commit()
        ix.close()
Example No. 19
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi,
                **kwargs):
    """
    Build two search indexes simultaneously:
    one for repositories and the other for tools.

    Returns a tuple with the number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path,
                            dburi,
                            engine_options={},
                            create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir,
                              **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get(
                        'full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            #  Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
Example No. 20
    def update_index(self, document):
        """Update search index for a document

        Args:
            self (object): FullTextSearch instance
            document (_dict): A dictionary with title, path and content
        """
        ix = self.get_index()

        with ix.searcher():
            writer = AsyncWriter(ix)
            writer.delete_by_term(self.id, document[self.id])
            writer.add_document(**document)
            writer.commit(optimize=True)
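A hypothetical call to the method above, assuming the instance's id field (self.id) is 'path' and using the field names listed in the docstring; the object name fts and all values are placeholders:

# Hypothetical usage sketch for FullTextSearch.update_index
fts.update_index({"title": "Getting Started",
                  "path": "docs/getting_started",
                  "content": "How to install and configure the app."})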
Example No. 21
def whoosh_task(ids, pool_number, ix, model_class):
    session = sqla['session']

    writer = AsyncWriter(ix)
    for id_ in ids:
        obj = session.query(model_class).filter_by(id=id_).one()
        if obj.title is None or obj.summary is None:
            continue

        writer.add_document(
            title=obj.title,
            summary=obj.summary
        )

    writer.commit()
Example No. 22
def creating_searching_ranking(selected_analyzer, name_of_file,
                               scoring_function, path):
    """
    Method that creates the schema and stores the index file based on the retrieved 'csv_test.csv' file.
    input:
        selected_analyzer - selected text analyzer from the whoosh library
        name_of_file - name of .csv file stored from dataframe variable 'files_text'
        scoring_function - selected scoring function from the whoosh library
        path - path where index files are stored
    """
    #creating Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))
    directory_containing_the_index = path
    ix = create_in(
        directory_containing_the_index, schema
    )  #creating the index based on the schema in the directory given by 'path'
    directory_containing_the_index = path
    ix = index.open_dir(
        directory_containing_the_index)  #opening the index file
    writer = AsyncWriter(ix)  #writer will be used to add content to the fields

    #num_added_records_so_far=0
    ALL_DOCUMENTS_file_name = name_of_file  #path to the file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    csv_reader = csv.reader(in_file, delimiter=',')  #reading the file
    next(csv_reader)  # skip the header: the first line contains the name of each field.
    #num_added_records_so_far = 0
    for record in csv_reader:  #for each row in the 'csv_test' file
        id = record[1]  #read id
        title = record[2]  #read title
        content = record[3]  #read body
        writer.add_document(id=id, content=title + ' ' + content)

        #num_added_records_so_far += 1
        #if num_added_records_so_far % 1000 == 0:
        #    print(" num_added_records_so_far= " + str(num_added_records_so_far))

    writer.commit()
    in_file.close()  #finish writing in the index file
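The function above only builds the index; a minimal sketch of the searching/ranking step implied by the scoring_function parameter might look like the following (the query string, directory name, and choice of BM25F are assumptions, not taken from the original code):

# Sketch: query the index built by creating_searching_ranking and rank the hits
from whoosh import index, scoring
from whoosh.qparser import QueryParser

path = "indexdir"  # assumed: the same directory passed to creating_searching_ranking
ix = index.open_dir(path)
parser = QueryParser("content", ix.schema)          # 'content' holds title + body, as indexed above
query = parser.parse("information retrieval")       # example query string
with ix.searcher(weighting=scoring.BM25F()) as s:   # scoring_function, e.g. BM25F
    for hit in s.search(query, limit=10):
        print(hit["id"], hit.score)                 # 'id' is the only stored field in the schema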
Example No. 23
    def newPageCore(self, item, newPageName):
        pagePath = os.path.join(self.notePath,
                                self.itemToPage(item)).replace(os.sep, '/')
        if not newPageName:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
        if newPageName:
            if hasattr(item, 'text'):
                pagePath = os.path.join(self.notePath,
                                        pagePath + '/').replace(os.sep, '/')
            if not QDir(pagePath).exists():
                QDir(self.notePath).mkdir(pagePath)
            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QFile(fileName)
            fh.open(QIODevice.WriteOnly)
            savestream = QTextStream(fh)
            savestream << '# ' + newPageName + '\n'
            savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
            fh.close()
            QTreeWidgetItem(item, [newPageName])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, 'text'):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QDir(attDir).exists():
                QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            fileobj = open(fileName, 'r')
            content = fileobj.read()
            fileobj.close()
            self.ix = open_dir(self.settings.indexdir)
            #writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath + newPageName, content=content)
            writer.commit()
Example No. 24
    def createIndex(self):
        print "    Whoosh Loading from SQL "      
        created = self.createIndexDirIfNotExist()
        if not created:
            #already exists
            return
        
        conn = sqlite3.connect(self.dbName)
        c = conn.cursor()
        c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
        feeds = c.fetchall()
        conn.close()
        
        linkN = 1
        schema = Schema(id = TEXT(stored = True), content=TEXT)
        ix = create_in(self.indexDir, schema, indexname='MAIN')
        writer = AsyncWriter(ix)

        for feed in feeds:
            
            # Discard links without a title
            if( isinstance(feed[3], type(None))):
                #print "is Null"
                continue
            
            index = feed[0]
            # print "    Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3]
            linkN += 1
            
            titolo = feed[3] + " "
            titolo10 = titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo
            sumario = feed[4] + " "
            sumario2 = sumario + sumario
            text = titolo10 + sumario2 + " " +feed[5]
            
            writer.add_document(id=index, content=unicode(text))
            
            
        writer.commit()
        ix.close()   
        print "    Done Loading from SQL"
Example No. 25
    def newPageCore(self, item, newPageName):
        pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, '/')
        if not newPageName:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
        if newPageName:
            if hasattr(item, 'text'):
                pagePath = os.path.join(self.notePath,
                                        pagePath + '/').replace(os.sep, '/')
            if not QDir(pagePath).exists():
                QDir(self.notePath).mkdir(pagePath)
            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QFile(fileName)
            fh.open(QIODevice.WriteOnly)
            savestream = QTextStream(fh)
            savestream << '# ' + newPageName + '\n'
            savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
            fh.close()
            QTreeWidgetItem(item, [newPageName])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, 'text'):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QDir(attDir).exists():
                QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            fileobj = open(fileName, 'r')
            content = fileobj.read()
            fileobj.close()
            self.ix = open_dir(self.settings.indexdir)
            #writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath+newPageName, content=content)
            writer.commit()
Example No. 26
    def createIndex(self):
        print "    Whoosh Loading from SQL "
        created = self.createIndexDirIfNotExist()
        if not created:
            #already exists
            return

        conn = sqlite3.connect(self.dbName)
        c = conn.cursor()
        c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''')
        feeds = c.fetchall()
        conn.close()

        linkN = 1
        schema = Schema(id=TEXT(stored=True), content=TEXT)
        ix = create_in(self.indexDir, schema, indexname='MAIN')
        writer = AsyncWriter(ix)

        for feed in feeds:

            # Discard links without a title
            if (isinstance(feed[3], type(None))):
                #print "is Null"
                continue

            index = feed[0]
            # print "    Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3]
            linkN += 1

            titolo = feed[3] + " "
            titolo10 = titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo
            sumario = feed[4] + " "
            sumario2 = sumario + sumario
            text = titolo10 + sumario2 + " " + feed[5]

            writer.add_document(id=index, content=unicode(text))

        writer.commit()
        ix.close()
        print "    Done Loading from SQL"
Example No. 27
 def _to_index(self, x):
     # implement search here
     x = str(x) >> log
     aindex = AsyncWriter(self.index, delay=0.2)
     aindex.add_document(content=x)
     aindex.commit()
Example No. 28
    def load_all_dset_metadata(self, dsetname, create_index=False):
        """
            Loads into memory the metadata of a dataset. The metadata is read from a CSV file, which should
            have at least two columns:
             - filename: Paths to the images in the dataset, relative to the image data folder. For backward
                         compatibility '#filename' is also accepted
             - file_attributes: JSON string containing information about the file. The most important file
                                attributes are 'caption' and 'keywords'. The 'caption' field should be a short
                                string which will be used as the caption of the image in result lists. The
                                'keywords' field must contain a comma-separated list of keywords. Each keyword
                                can be used as the source for a search.
            If create_index is True, it builds a search index with the 'keywords' in the file_attributes.
            Arguments:
                dsetname: String corresponding to the dataset within the list of supported
                          datasets.
                create_index: Boolean indicating whether or not to build a search index
                              with the metadata
        """
        metaindex = None
        t = time.time()
        try:
            for afile in os.listdir(os.path.join(self.metadata_dir, dsetname)):
                if afile.endswith(".csv"):
                    metadata_file = os.path.join(self.metadata_dir, dsetname,
                                                 afile)
                    print('Found metadata file at', metadata_file)
                    if create_index:
                        metaindex = open_dir(self.index_dir)
                    with open(metadata_file, 'r') as fin:
                        reader = csv.DictReader(fin)
                        for row in reader:
                            id_field = None
                            if 'filename' in row.keys():
                                id_field = 'filename'
                            elif '#filename' in row.keys():
                                id_field = '#filename'
                            if id_field and 'file_attributes' in row.keys():
                                filename = row[id_field]
                                try:
                                    self.fname2meta[dsetname][
                                        filename] = json.loads(
                                            row['file_attributes'])
                                except:
                                    self.fname2meta[dsetname][filename] = None
                                metadata = self.fname2meta[dsetname][filename]
                                keyword_list = None
                                if metadata and 'keywords' in metadata.keys():
                                    keyword_list = metadata['keywords']
                                if keyword_list and create_index:
                                    keyword_list_splitted = keyword_list.split(
                                        ',')
                                    writer = AsyncWriter(metaindex)
                                    for key in keyword_list_splitted:
                                        key = key.strip()
                                        # delete previous entry if found
                                        query = QueryParser(
                                            'key', metaindex.schema).parse(key)
                                        writer.delete_by_query(
                                            query, metaindex.searcher())
                                        # add document
                                        writer.add_document(
                                            key=str(key),
                                            dataset=str(dsetname))
                                    writer.commit()
                                if keyword_list:  # we would like to do this, even if the index is not created
                                    # register link keyword-file
                                    keyword_list_splitted = keyword_list.split(
                                        ',')
                                    for key in keyword_list_splitted:
                                        key = key.strip()
                                        if key in self.keyword2fname[
                                                dsetname].keys():
                                            self.keyword2fname[dsetname][
                                                key].append(filename)
                                        else:
                                            self.keyword2fname[dsetname][
                                                key] = [filename]
                            else:
                                raise Exception(
                                    '"filename" and/or "file_attributes" columns not found in '
                                    + afile +
                                    ' (are you missing the column names?). Metadata will not be available!.'
                                )

                        print('Finished loading metadata for %s in %s' %
                              (dsetname, str(time.time() - t)))
                        self.is_all_metadata_loaded = True
                    break
        except Exception as e:
            print("load_all_dset_metadata Exception:" + str(e) + '\n')
Example No. 29
        ix = open_dir(settings.index)
        writer = AsyncWriter(ix)

        for entry in entries:
            try:
                item = Item.get(guid=entry['guid'])
            except Item.DoesNotExist:
                item = Item.create(**entry)
            records += 1

            if len(entry['html']):
                soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
                plaintext = ''.join(soup.find_all(text=True))
                writer.add_document(id=item.id,
                                    guid=unicode(item.guid),
                                    title=entry['title'],
                                    text=plaintext,
                                    when=datetime.datetime.utcfromtimestamp(
                                        item.when))

                hrefs = get_link_references(soup)
            else:
                hrefs = []
            hrefs.append(entry['url'])

            if not settings.fetcher.post_processing.expand_links:
                return

            lnow = time.time()
            links = expand_links(set(hrefs))
            log.debug("%s - %d links in %fs" %
                      (netloc, len(hrefs), time.time() - lnow))
Example No. 30
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
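A hypothetical invocation of index_update following the docstring's (operation, full class name, primary key, data) tuple format; the index name, class path, and primary keys below are made up:

# Hypothetical usage sketch for index_update
index_update("default", [
    ("new", "myapp.models.Document", 42, {}),
    ("deleted", "myapp.models.Document", 17, {}),
])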
Example No. 31
    ts_start = current_time_msec()
    writer = AsyncWriter(
        ix)  # used to override the LockError for multiprocessing

    # Directory containing HTML files
    dir_html = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\HW_1\part_1\Cranfield_DATASET\DOCUMENTS'

    for i in tqdm(range(1, len(os.listdir(dir_html)) + 1)):
        file_name = os.path.join(dir_html, "______{}.html".format(i))
        with open(file_name, encoding="utf8") as html_file:

            soup = BeautifulSoup(html_file)
            t = title(soup)
            b = body(soup)
            identifier = i
            writer.add_document(id=str(identifier), title=t, content=b)

    writer.commit()
    html_file.close()
    #
    ts_end = current_time_msec()
    print("TimeStamp: ", time.asctime(time.localtime(time.time())))
    total_time_msec = (ts_end - ts_start)
    print("total_time= " + str(total_time_msec) + "msec")
    print()

# *Time index*
for schema_type in dir_specific:

    schema_type = '\\' + schema_type
    if schema_type == '\\Field Booster':
Example No. 32
 def add(self, note):
     writer = AsyncWriter(self.index)
     writer.add_document(note_id=note.id, notebook_id=note.notebook_id, title=note.title, snippet=note.snippet)
     writer.commit()
Example No. 33
class Index(object):
    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")

        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)

        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementaly update the documents for the given server.

        server_id      -- Id of the server to update.
        current_files  -- a list of (path, size, mtime) tuples for each file
                          currently on the server.

        Deletes all the outdated files from the index and returns a list
        of files that need to be reindexed.
        """

        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]

                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(
        self, server_id, name, path, size, mtime, audio_album=None, audio_artist=None, audio_title=None, audio_year=None
    ):
        """Add a document with the specified fields in the index.

        Changes need to be commited.

        """

        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose

        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")

        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }

        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album

        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist

        if audio_title is not None:
            kwargs["audio_title"] = audio_title

        if audio_year is not None:
            kwargs["audio_year"] = audio_year

        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """ Commit the changes in the index and optimize it """
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index commited")

        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        """ Close the index """
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        self._idx.close()
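A minimal usage sketch for the Index class above; the directory and field values are placeholders, and mtime uses the same format the class later parses with strptime:

# Sketch: index one file and commit
idx = Index("ftpvista_index", persist=None)  # 'persist' is only stored by __init__ in the code shown
idx.add_document(server_id="1", name="song.mp3", path="/music/song.mp3",
                 size="4194304", mtime="2020-01-01 00:00:00",
                 audio_artist="Some Artist", audio_title="Some Song")
idx.commit(optimize=True)
idx.close()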
Example No. 34
def insert_docs(docs):
    ix = open_dir(whoosh_index)
    writer = AsyncWriter(ix)
    for doc in docs:
        writer.add_document(**doc)
    writer.commit()
Example No. 35
        writer = AsyncWriter(ix)

        for entry in entries:
            try:
                item = Item.get(guid = entry['guid'])
            except Item.DoesNotExist:
                item = Item.create(**entry)
            records += 1

            if len(entry['html']):
                soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
                plaintext = ''.join(soup.find_all(text=True))
                writer.add_document(
                    id = item.id,
                    guid = unicode(item.guid),
                    title = entry['title'],
                    text = plaintext,
                    when = datetime.datetime.utcfromtimestamp(item.when)
                )

                hrefs = get_link_references(soup)
            else:
                hrefs = []
            hrefs.append(entry['url'])
            
            if not settings.fetcher.post_processing.expand_links:
                return

            lnow = time.time()
            links = expand_links(set(hrefs))
            log.debug("%s - %d links in %fs" % (netloc, len(hrefs),time.time()-lnow))
Example No. 36
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = "{}:{}".format(cls_name, pk)
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except BaseException:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
Example No. 37
        #for post in collection.find():
        statement = table.select()
        for post in engine.execute(statement).fetchall():
            num += 1
            left = total - num
            if left > 5000 and left % 5000 == 0:
                print 'Year', year, 'Left', str(left), 'Count', str(num)
                print 'Consuming time:', str(time() - t)
                print
                t = time()

            #title=post['_id']
            title = post['id']
            sent = post['sent']
            year = post['year']
            writer.add_document(title=title, content=sent, year=year)

    #client.close()
    engine.close()
    print 'Done!'
    print 'Consuming time', str(time() - t2) + 's'
    print

    print 'Restarting database...'
    #os.system('mongod -f /etc/mongodb.conf --shutdown')
    #os.system('mongod -f /etc/mongodb.conf &')
    print 'Done!'
    print

    print 'Waiting...'
    #sleep(15)
Example No. 38
    def newPageCore(self, item, newPageName, useTemplate=False, templateTitle=None, templateBody=None):
        pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, "/")
        if not newPageName:
            if useTemplate:
                dialog = mikitemplate.PickTemplateDialog(pagePath, self.settings, parent=self)
                if dialog.exec_():
                    curTitleIdx = dialog.titleTemplates.currentIndex()
                    curBodyIdx = dialog.bodyTemplates.currentIndex()
                    dtnow = datetime.datetime.now()
                    if curTitleIdx > -1:
                        titleItem = dialog.titleTemplates.model().item(curTitleIdx)
                        titleItemContent = titleItem.data(TTPL_COL_DATA)
                        titleItemType = titleItem.data(TTPL_COL_EXTRA_DATA)
                        titleParameter = dialog.titleTemplateParameter.text()
                        newPageName = mikitemplate.makeTemplateTitle(
                            titleItemType, titleItemContent, dtnow=dtnow, userinput=titleParameter
                        )
                    if curBodyIdx > -1:
                        bodyItemIdx = dialog.bodyTemplates.rootModelIndex().child(curBodyIdx, 0)
                        bodyFPath = dialog.bodyTemplates.model().filePath(bodyItemIdx)
                    else:
                        bodyFPath = None
            else:
                dialog = LineEditDialog(pagePath, self)
                if dialog.exec_():
                    newPageName = dialog.editor.text()

        prevparitem = None

        if newPageName:
            if hasattr(item, "text"):
                pagePath = os.path.join(self.notePath, pagePath + "/").replace(os.sep, "/")
            if not QtCore.QDir(pagePath).exists():
                QtCore.QDir(self.notePath).mkdir(pagePath)

            if not QtCore.QDir(os.path.dirname(newPageName)).exists():
                curdirname = os.path.dirname(newPageName)
                needed_parents = []
                while curdirname != "":
                    needed_parents.append(curdirname)
                    curdirname = os.path.dirname(curdirname)

                # create the needed hierarchy in reverse order
                for i, needed_parent in enumerate(needed_parents[::-1]):
                    paritem = self.pageToItem(needed_parent)
                    if paritem is None:
                        if i == 0:
                            self.newPageCore(item, os.path.basename(needed_parent))
                        else:
                            self.newPageCore(prevparitem, os.path.basename(needed_parent))
                        QtCore.QDir(pagePath).mkdir(needed_parent)
                    elif not QtCore.QDir(os.path.join(self.notePath, needed_parent).replace(os.sep, "/")).exists():
                        QtCore.QDir(pagePath).mkdir(needed_parent)
                    if paritem is not None:
                        prevparitem = paritem
                    else:
                        prevparitem = self.pageToItem(needed_parent)

            fileName = pagePath + newPageName + self.settings.fileExt
            fh = QtCore.QFile(fileName)
            fh.open(QtCore.QIODevice.WriteOnly)

            savestream = QtCore.QTextStream(fh)
            if useTemplate and bodyFPath is not None:
                with open(bodyFPath, "r", encoding="utf-8") as templatef:
                    savestream << mikitemplate.makeTemplateBody(
                        os.path.basename(newPageName),
                        dtnow=dtnow,
                        dt_in_body_txt=self.tr("Created {}"),
                        body=templatef.read(),
                    )
            else:
                savestream << mikitemplate.makeDefaultBody(os.path.basename(newPageName), self.tr("Created {}"))
            fh.close()
            if prevparitem is not None:
                QtWidgets.QTreeWidgetItem(prevparitem, [os.path.basename(newPageName)])
            else:
                QtWidgets.QTreeWidgetItem(item, [os.path.basename(newPageName)])
            newItem = self.pageToItem(pagePath + newPageName)
            self.sortItems(0, Qt.AscendingOrder)
            self.setCurrentItem(newItem)
            if hasattr(item, "text"):
                self.expandItem(item)

            # create attachment folder if not exist
            attDir = self.itemToAttachmentDir(newItem)
            if not QtCore.QDir(attDir).exists():
                QtCore.QDir().mkpath(attDir)

            # TODO improvement needed, can be reused somehow
            with open(fileName, "r") as fileobj:
                content = fileobj.read()

            self.ix = open_dir(self.settings.indexdir)
            # writer = self.ix.writer()
            writer = AsyncWriter(self.ix)
            writer.add_document(path=pagePath + newPageName, content=content)
            writer.commit()
Example No. 39
 def add_to_fts(cls, content, title=None, id=None, source_hash=None, tags=None):
     ix = open_dir(LOCAL_FTS_INDEX)
     writer = AsyncWriter(ix)
     writer.add_document(content=content, title=title, id=id, source_hash=source_hash, tags=tags)
     writer.commit()
Example No. 40
#Create index and AsyncWriter object
index = create_in("tweetindex", my_schema)
writer = AsyncWriter(index)

if __name__=='__main__':
    #Load raw data
    with open("WC2015_headers.csv",'rb') as to_load:
        data=csv.DictReader(to_load)
        for row in data:
            #Extract required information from date to create python datetime object
            date=row['created_at'][:19]+' '+row['created_at'][-4:]
            
            #Clean text and parse into keywords
            text=row['text'].replace('\\','')
            keywords=[word for word in word_tokenize(text) if word not in stops]
            
            #Check for Retweets
            rt=False
            if 'RT ' in text:
                rt=True
            
            #Add completed document to index
            writer.add_document(id = unicode(row['id']), 
                                screen_name = unicode(row['screen_name']),
                                text = unicode(text),
                                contains_retweet=rt,
                                keyword = unicode(" ".join(keywords)),
                                created = datetime.datetime.strptime(date, "%a %b %d %H:%M:%S %Y")
                                )
        writer.commit()
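my_schema is not shown in this snippet; a plausible definition consistent with the fields passed to add_document above (an assumption, not the original schema) would be:

# Assumed schema matching the fields used above
from whoosh.fields import Schema, ID, TEXT, KEYWORD, BOOLEAN, DATETIME

my_schema = Schema(id=ID(stored=True, unique=True),
                   screen_name=ID(stored=True),
                   text=TEXT(stored=True),
                   contains_retweet=BOOLEAN(stored=True),
                   keyword=KEYWORD(stored=True),
                   created=DATETIME(stored=True))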
Example No. 41
 def insert(self, link, title, document):
     writer = AsyncWriter(self.ix)
     writer.add_document(link=link,title=title, document=document + title)
     writer.commit()
Example No. 42
def createSearchableData(docsDirectory):
    # definition of the index schema
    schema = Schema(docTitle=STORED,
                    procTitle=KEYWORD(lowercase=True),
                    topics=KEYWORD(stored=True, lowercase=True),
                    categories=KEYWORD(stored=True, lowercase=True),
                    pageUrl=ID(stored=True),
                    procContent=TEXT)

    cwd = os.getcwd()
    print(cwd)

    # create the indexdir directory
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    # Create the index writer to add documents according to the schema
    ix = create_in("indexdir", schema)
    writer = AsyncWriter(ix)

    # List of files to index
    filepaths = [
        os.path.join(docsDirectory, i) for i in os.listdir(docsDirectory)
        if i.split(".")[-1] == "json"
    ]

    num = 1
    # for each path found...
    for path in filepaths:
        print(f'{num}/{len(filepaths)}')
        num += 1

        fp = open(path, 'r', encoding="utf-8")
        entry = json.loads(fp.read())
        fp.close()

        docTitle = entry["title"]

        # Tokenized title, taking care to convert possible unicode characters to ASCII
        processedTitle = list(set(processText(docTitle)))
        #, filterStopwords=True, stemming=True, normalizeAccents=True, minLength=0

        pageUrl = entry["url"]

        # Markdown content of the page
        markdownContent = entry["content"]

        # regex to find the "key" phrases in the page, i.e. those used to start a section in the markdown
        topicSearch = re.compile(r"\n####.*\n")

        # preprocessing (stopword filtering and normalization) of the phrases used as the page topics
        topicSet = set()
        for match in topicSearch.findall(markdownContent):
            topic = str(match).strip(r'\n').strip('#')
            topicSet = topicSet.union(set(processText(topic)))
            #, filterStopwords=True, stemming=True, normalizeAccents=True
        topics = list(topicSet)

        # the categories are the parent pages after the homepage; after processing they are passed through a set
        # to remove duplicates.
        categoeries = list(
            set(processText(' '.join(str(pageUrl).split(r'/')[3:-2]))))
        #, filterStopwords=True, stemming=True, normalizeAccents=True

        # previously:
        # category = processText(category, filterStopwords=True, normalizeAccents=True)

        # the procContent field is the preprocessed content: stemming and normalization
        procContent = processText(markdownContent)
        #, filterStopwords=True, stemming=True, normalizeAccents=True

        # Add the entry to the index
        writer.add_document(docTitle=docTitle,
                            procTitle=processedTitle,
                            topics=topics,
                            categories=categoeries,
                            pageUrl=pageUrl,
                            procContent=procContent)
    writer.commit()