def update(self, x, who=None):
    # implement search here
    x = str(x)
    aindex = AsyncWriter(self.index, delay=0.2)
    aindex.add_document(content=x)
    aindex.commit()
    return self._emit(x)
def creating_searching_ranking(selected_analyzer, name_of_file, scoring_function, path):
    # creating Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))
    directory_containing_the_index = path
    ix = create_in(directory_containing_the_index, schema)  # writing index based on schema in the directory where the 'path' is
    directory_containing_the_index = path
    ix = index.open_dir(directory_containing_the_index)  # opening the index file
    writer = AsyncWriter(ix)  # writer will be used to add content to the fields

    ALL_DOCUMENTS_file_name = name_of_file  # path to the file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    csv_reader = csv.reader(in_file, delimiter=',')  # reading the file
    csv_reader.__next__()  # to skip the header: first line contains the name of each field.

    for record in csv_reader:  # for each row in the 'csv_test' file
        id = record[1]  # read id
        title = record[2]  # read title
        content = record[3]  # read body
        writer.add_document(id=id, content=title + ' ' + content)
    writer.commit()
    in_file.close()  # finish writing in the index file
def createSearchableData(root):
    ana = analysis.StemmingAnalyzer()
    # define the schema of the index
    schema = Schema(title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    genre=KEYWORD(stored=True),
                    link=ID(stored=True),
                    path=ID(stored=True),
                    price=ID(stored=True),
                    content=TEXT(stored=True),
                    contentData=TEXT)
    # create the indexdir directory
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    cwd = os.getcwd()
    print(cwd)
    # create an indexWriter that adds documents according to the schema
    ix = create_in("indexdir", schema)
    writer = AsyncWriter(ix)
    # find the files in the directory and save their paths
    filepaths = [os.path.join(root, i) for i in os.listdir(root)]
    num = 1
    # for each path found...
    for path in filepaths:
        #print(num)
        num += 1
        fp = open(path, 'r', encoding="utf-8")
        #print(path)
        # The first line holds the title, the second the author, the third the genre,
        # the fourth the link and the fifth the price
        fileTitle = fp.readline()
        fileAuthor = fp.readline()
        fileGenre = fp.readline()
        fileLink = fp.readline()
        filePrice = fp.readline()
        # The rest of the file holds the plot
        filePlot = fp.read()
        # the contentData field holds the preprocessed plots
        fileData = tokenize(filePlot)
        # add a document to the index, with all the required fields
        writer.add_document(title=fileTitle,
                            path=path,
                            author=fileAuthor,
                            genre=fileGenre,
                            link=fileLink,
                            price=filePrice,
                            content=filePlot,
                            contentData=fileData)
        fp.close()
    writer.commit()
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(filename)s %(levelname)s: %(asctime)s: %(message)s')
    logger = logging.getLogger('main')
    logger.info('Executing indexing module')
    logger.info('Reading file')
    du = doc_utilities()
    du.read_data_set(file='data/wikipedia_text_files.csv')
    logger.info('Task1 - Number of documents = {}'.format(du.get_number_documents()))
    du.process_documents_for_indexing()
    collection = du.get_collection_json()[0:1000000]
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", MySchema)
    #writer = ix.writer()
    writer = AsyncWriter(ix)
    with tqdm(total=len(collection),
              desc="Indexing documents",
              bar_format="{l_bar}{bar} [ time left: {remaining} ]") as pbar:
        for d in collection:
            text = str(d['text'])
            idt = str(d['id'])
            title = str(d['title'])
            writer.add_document(id=idt, title=title, text=text)
            pbar.update(1)
    writer.commit()
def whoosh_index(self):
    it = QTreeWidgetItemIterator(self.notesTree, QTreeWidgetItemIterator.All)
    print("Starting complete indexing.")
    #writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    while it.value():
        treeItem = it.value()
        name = self.notesTree.itemToPage(treeItem)
        path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
        print(path)
        fileobj = open(path, 'r', encoding='utf-8')
        content = fileobj.read()
        fileobj.close()
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=name, title=parseTitle(content, name), content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
        else:
            writer.add_document(path=name, title=parseTitle(content, name),
                                content=content, tags='')
        it += 1
    writer.commit()
    print("Finished completely reindexing.")
def saveToWhoosh(self, df, dataset_id, overwrite=False):
    # use whoosh search engine to enable full text search
    if not os.path.exists(self.whoosh_root):
        os.mkdir(self.whoosh_root)
    ws_path = os.path.join(self.whoosh_root, dataset_id)
    if not os.path.exists(ws_path):
        os.mkdir(ws_path)
        logMsg(str(os.path.abspath(ws_path)) + ' does not exist, create it to store whoosh index')
        overwrite = True
    elif overwrite:
        shutil.rmtree(ws_path)
        os.mkdir(ws_path)
    schema = Schema(DOC_ID=NUMERIC(stored=True), TEXT=TEXT)
    if overwrite:
        ix = create_in(ws_path, schema)
    else:
        ix = open_dir(ws_path)
    writer = AsyncWriter(ix)
    with self.workflow.dao.create_session() as session:
        doc_iter = session.query(Document).filter(Document.DATASET_ID == dataset_id)
        for doc in doc_iter:
            writer.add_document(DOC_ID=doc.DOC_ID, TEXT=doc.TEXT)
    writer.commit()
    pass
def whoosh_index(self):
    it = QtWidgets.QTreeWidgetItemIterator(self.notesTree, QtWidgets.QTreeWidgetItemIterator.All)
    print("Starting complete indexing.")
    #writer = self.ix.writer()
    writer = AsyncWriter(self.ix)
    while it.value():
        treeItem = it.value()
        name = self.notesTree.itemToPage(treeItem)
        path = os.path.join(self.notesTree.pageToFile(name)).replace(os.sep, '/')
        print(path)
        fileobj = open(path, 'r', encoding='utf-8')
        content = fileobj.read()
        fileobj.close()
        if METADATA_CHECKER.match(content) and 'meta' in self.settings.extensions:
            no_metadata_content = METADATA_CHECKER.sub("", content, count=1).lstrip()
            self.settings.md.reset().convert(content)
            writer.update_document(
                path=name, title=parseTitle(content, name), content=no_metadata_content,
                tags=','.join(self.settings.md.Meta.get('tags', [])).strip())
        else:
            writer.add_document(path=name, title=parseTitle(content, name),
                                content=content, tags='')
        it += 1
    writer.commit()
    print("Finished completely reindexing.")
def store_page(user, url):
    writer = AsyncWriter(idx)
    resp = requests.get(url)
    content = parse(resp.content)
    now = datetime.now()
    writer.add_document(ts=now, user=unicode(user), url=unicode(url), content=content)
    writer.commit()
def recreate_data(sender=None, **kwargs):
    """ Re-adds all the Objects to the index.
    If they already exist they will be duplicated.
    """
    ix = get_or_create_index()
    writer = AsyncWriter(ix)
    for obj in Post.objects.all():
        writer.add_document(**obj.index())
    writer.commit()
def storeBattleInIndex(b, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(bix)
        commit = True
    writer.add_document(id=b.get('id'), descr=b.get('descr'),
                        battle_date=b.get('battle_date'))
    if commit:
        writer.commit()
def storeGroupInIndex(group, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(gix)
        commit = True
    writer.add_document(name=group['name'], descr=group['descr'], id=group['id'])
    if commit:
        writer.commit()
def storeUserInIndex(u, writer=None):
    commit = False
    if writer is None:
        writer = AsyncWriter(uix)
        commit = True
    writer.add_document(nickname=u.get('nickname'), account_id=u.get('account_id'),
                        user_id=u.get('id'))
    if commit:
        writer.commit()
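The three store helpers above all accept an optional writer, which lets a caller batch many documents into one AsyncWriter transaction and commit only once. A minimal sketch of that calling pattern (the `users` iterable is hypothetical and not part of the original code):

# Hypothetical batch call: share one AsyncWriter across many helper calls,
# then commit a single time at the end instead of once per document.
writer = AsyncWriter(uix)
for u in users:  # illustrative iterable of user dicts
    storeUserInIndex(u, writer=writer)
writer.commit()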
def build_index(self):
    """Build index for all parsed documents"""
    ix = self.create_index()
    writer = AsyncWriter(ix)
    for i, document in enumerate(self.documents):
        if document:
            writer.add_document(**document)
        update_progress_bar("Building Index", i, len(self.documents))
    writer.commit(optimize=True)
def whoosh_task(ids, pool_number, ix, model_class):
    session = sqla['session']
    writer = AsyncWriter(ix)
    for id_ in ids:
        obj = session.query(model_class).filter_by(id=id_).one()
        if obj.title is None or obj.summary is None:
            continue
        writer.add_document(title=obj.title, summary=obj.summary)
    writer.commit()
def add_document(video_id, title, description, text):
    """ Adds single document to index """
    #TODO: check
    index = open_index()
    writer = AsyncWriter(index)
    writer.add_document(text=text, title=title, id=video_id, description=description)
    writer.commit()
def update_index(sender, **kwargs):
    """ Adds/updates an entry in the index.
    It's connected to the post_save signal of the Object objects,
    so it will automatically index every new or modified Object.
    """
    ix = get_or_create_index()
    writer = AsyncWriter(ix)
    obj = kwargs['instance']
    if "created" in kwargs and kwargs['created']:
        writer.add_document(**obj.index())
    else:
        writer.update_document(**obj.index())
    writer.commit()
def addLink(self, url, title, summary, txt):
    titleb = title + " "
    title10 = titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb + titleb
    sumario = summary + " "
    sumario2 = sumario + sumario
    text = title10 + sumario2 + " " + txt
    ix = open_dir(self.indexDir, indexname='MAIN', readonly=False)
    writer = AsyncWriter(ix)
    writer.add_document(id=url, content=unicode(text))
    writer.commit()
    ix.close()
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi, **kwargs):
    """
    Build two search indexes simultaneously: one for repositories and the other for tools.

    Returns a tuple with the number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path, dburi, engine_options={}, create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir, **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get('full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            # Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()
    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
def update_index(self, document):
    """Update search index for a document

    Args:
        self (object): FullTextSearch Instance
        document (_dict): A dictionary with title, path and content
    """
    ix = self.get_index()
    with ix.searcher():
        writer = AsyncWriter(ix)
        writer.delete_by_term(self.id, document[self.id])
        writer.add_document(**document)
        writer.commit(optimize=True)
def creating_searching_ranking(selected_analyzer, name_of_file, scoring_function, path):
    """
    Method that creates the schema and stores the index file based on the
    retrieved 'csv_test.csv' file.

    input:
        selected_analyzer - selected text analyzer from the whoosh library
        name_of_file - name of the .csv file stored from the dataframe variable 'files_text'
        scoring_function - selected scoring function from the whoosh library
        path - path where the index files are stored
    """
    # creating Schema with fields id, title and content
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=selected_analyzer),
                    content=TEXT(stored=False, analyzer=selected_analyzer))
    directory_containing_the_index = path
    ix = create_in(directory_containing_the_index, schema)  # creating index based on schema in the directory where the 'path' is
    directory_containing_the_index = path
    ix = index.open_dir(directory_containing_the_index)  # opening the index file
    writer = AsyncWriter(ix)  # writer will be used to add content to the fields

    #num_added_records_so_far = 0
    ALL_DOCUMENTS_file_name = name_of_file  # path to the file
    in_file = open(ALL_DOCUMENTS_file_name, "r", encoding='latin1')
    csv_reader = csv.reader(in_file, delimiter=',')  # reading the file
    csv_reader.__next__()  # to skip the header: first line contains the name of each field.

    #num_added_records_so_far = 0
    for record in csv_reader:  # for each row in the 'csv_test' file
        id = record[1]  # read id
        title = record[2]  # read title
        content = record[3]  # read body
        writer.add_document(id=id, content=title + ' ' + content)
        #num_added_records_so_far += 1
        #if (num_added_records_so_far % 1000 == 0):
        #    print(" num_added_records_so_far= " + str(num_added_records_so_far))
    writer.commit()
    in_file.close()  # finish writing in the index file
def newPageCore(self, item, newPageName):
    pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, '/')
    if not newPageName:
        dialog = LineEditDialog(pagePath, self)
        if dialog.exec_():
            newPageName = dialog.editor.text()
    if newPageName:
        if hasattr(item, 'text'):
            pagePath = os.path.join(self.notePath, pagePath + '/').replace(os.sep, '/')
        if not QDir(pagePath).exists():
            QDir(self.notePath).mkdir(pagePath)
        fileName = pagePath + newPageName + self.settings.fileExt
        fh = QFile(fileName)
        fh.open(QIODevice.WriteOnly)
        savestream = QTextStream(fh)
        savestream << '# ' + newPageName + '\n'
        savestream << 'Created ' + str(datetime.date.today()) + '\n\n'
        fh.close()
        QTreeWidgetItem(item, [newPageName])
        newItem = self.pageToItem(pagePath + newPageName)
        self.sortItems(0, Qt.AscendingOrder)
        self.setCurrentItem(newItem)
        if hasattr(item, 'text'):
            self.expandItem(item)

        # create attachment folder if not exist
        attDir = self.itemToAttachmentDir(newItem)
        if not QDir(attDir).exists():
            QDir().mkpath(attDir)

        # TODO improvement needed, can be reused somehow
        fileobj = open(fileName, 'r')
        content = fileobj.read()
        fileobj.close()
        self.ix = open_dir(self.settings.indexdir)
        #writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        writer.add_document(path=pagePath + newPageName, content=content)
        writer.commit()
def createIndex(self): print " Whoosh Loading from SQL " created = self.createIndexDirIfNotExist() if not created: #already exists return conn = sqlite3.connect(self.dbName) c = conn.cursor() c.execute('''SELECT * FROM newsStorage where ARTICLE <> "" ''') feeds = c.fetchall() conn.close() linkN = 1 schema = Schema(id = TEXT(stored = True), content=TEXT) ix = create_in(self.indexDir, schema, indexname='MAIN') writer = AsyncWriter(ix) for feed in feeds: # Descartar links sem Titulo if( isinstance(feed[3], type(None))): #print "is Null" continue index = feed[0] # print " Whoosh Loaded Titulo " + str(linkN) + ":" + feed[3] linkN += 1 titolo = feed[3] + " " titolo10 = titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo + titolo sumario = feed[4] + " " sumario2 = sumario + sumario text = titolo10 + sumario2 + " " +feed[5] writer.add_document(id=index, content=unicode(text)) writer.commit() ix.close() print " Done Loading from SQL"
def _to_index(self, x):
    # implement search here
    x = str(x)  # >> log
    aindex = AsyncWriter(self.index, delay=0.2)
    aindex.add_document(content=x)
    aindex.commit()
def load_all_dset_metadata(self, dsetname, create_index=False):
    """ Loads into memory the metadata of a dataset.
        The metadata is read from a CSV file, which should have at least two columns:
         - filename: Paths to the images in the dataset, relative to the image data folder.
           For backward compatibility '#filename' is also accepted.
         - file_attributes: JSON string containing information about the file. The most
           important file attributes are 'caption' and 'keywords'. The 'caption' field
           should be a short string which will be used as the caption of the image in
           result lists. The 'keywords' field must contain a comma-separated list of
           keywords. Each keyword can be used as the source for a search.
        If create_index is True, it builds a search index with the 'keywords' in the
        file_attributes.
        Arguments:
            dsetname: String corresponding to the dataset within the list of supported datasets.
            create_index: Boolean indicating whether or not to build a search index with the metadata
    """
    metaindex = None
    t = time.time()
    try:
        for afile in os.listdir(os.path.join(self.metadata_dir, dsetname)):
            if afile.endswith(".csv"):
                metadata_file = os.path.join(self.metadata_dir, dsetname, afile)
                print('Found metadata file at', metadata_file)
                if create_index:
                    metaindex = open_dir(self.index_dir)
                with open(metadata_file, 'r') as fin:
                    reader = csv.DictReader(fin)
                    for row in reader:
                        id_field = None
                        if 'filename' in row.keys():
                            id_field = 'filename'
                        elif '#filename' in row.keys():
                            id_field = '#filename'
                        if id_field and 'file_attributes' in row.keys():
                            filename = row[id_field]
                            try:
                                self.fname2meta[dsetname][filename] = json.loads(row['file_attributes'])
                            except:
                                self.fname2meta[dsetname][filename] = None
                            metadata = self.fname2meta[dsetname][filename]
                            keyword_list = None
                            if metadata and 'keywords' in metadata.keys():
                                keyword_list = metadata['keywords']
                            if keyword_list and create_index:
                                keyword_list_splitted = keyword_list.split(',')
                                writer = AsyncWriter(metaindex)
                                for key in keyword_list_splitted:
                                    key = key.strip()
                                    # delete previous entry if found
                                    query = QueryParser('key', metaindex.schema).parse(key)
                                    writer.delete_by_query(query, metaindex.searcher())
                                    # add document
                                    writer.add_document(key=str(key), dataset=str(dsetname))
                                writer.commit()
                            if keyword_list:
                                # we would like to do this, even if the index is not created
                                # register link keyword-file
                                keyword_list_splitted = keyword_list.split(',')
                                for key in keyword_list_splitted:
                                    key = key.strip()
                                    if key in self.keyword2fname[dsetname].keys():
                                        self.keyword2fname[dsetname][key].append(filename)
                                    else:
                                        self.keyword2fname[dsetname][key] = [filename]
                        else:
                            raise Exception('"filename" and/or "file_attributes" columns not found in ' + afile +
                                            ' (are you missing the column names?). Metadata will not be available!.')

                print('Finished loading metadata for %s in %s' % (dsetname, str(time.time() - t)))
                self.is_all_metadata_loaded = True
                break
    except Exception as e:
        print("load_all_dset_metadata Exception:" + str(e) + '\n')
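For reference, a metadata CSV row in the format described by the docstring above could look like this (the filename, caption and keywords are purely illustrative):

filename,file_attributes
images/dog_001.jpg,"{""caption"": ""A dog on the beach"", ""keywords"": ""dog,beach,summer""}"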
ix = open_dir(settings.index)
writer = AsyncWriter(ix)
for entry in entries:
    try:
        item = Item.get(guid=entry['guid'])
    except Item.DoesNotExist:
        item = Item.create(**entry)
        records += 1
    if len(entry['html']):
        soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
        plaintext = ''.join(soup.find_all(text=True))
        writer.add_document(id=item.id,
                            guid=unicode(item.guid),
                            title=entry['title'],
                            text=plaintext,
                            when=datetime.datetime.utcfromtimestamp(item.when))
        hrefs = get_link_references(soup)
    else:
        hrefs = []
    hrefs.append(entry['url'])

if not settings.fetcher.post_processing.expand_links:
    return

lnow = time.time()
links = expand_links(set(hrefs))
log.debug("%s - %d links in %fs" % (netloc, len(hrefs), time.time() - lnow))
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted
    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more infos in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproductible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
ts_start = current_time_msec()
writer = AsyncWriter(ix)  # used to override the LockError for multiprocessing

# Directory containing HTML files
dir_html = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\HW_1\part_1\Cranfield_DATASET\DOCUMENTS'
for i in tqdm(range(1, len(os.listdir(dir_html)) + 1)):
    file_name = os.path.join(dir_html, "______{}.html".format(i))
    with open(file_name, encoding="utf8") as html_file:
        soup = BeautifulSoup(html_file)
        t = title(soup)
        b = body(soup)
        identifier = i
        writer.add_document(id=str(identifier), title=t, content=b)
writer.commit()
html_file.close()

ts_end = current_time_msec()
print("TimeStamp: ", time.asctime(time.localtime(time.time())))
total_time_msec = (ts_end - ts_start)
print("total_time= " + str(total_time_msec) + "msec")
print()

# *Time index*
for schema_type in dir_specific:
    schema_type = '\\' + schema_type
    if schema_type == '\\Field Booster':
def add(self, note):
    writer = AsyncWriter(self.index)
    writer.add_document(note_id=note.id,
                        notebook_id=note.notebook_id,
                        title=note.title,
                        snippet=note.snippet)
    writer.commit()
class Index(object):
    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")

        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)

        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementally update the documents for the given server.

        server_id -- Id of the server to update.
        current_files -- a list of (path, size, mtime) tuples for each file
                         currently on the server.

        Deletes all the outdated files from the index and returns a list of
        files needing to be reindexed.
        """

        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]

                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(self, server_id, name, path, size, mtime,
                     audio_album=None, audio_artist=None, audio_title=None, audio_year=None):
        """Add a document with the specified fields in the index.
        Changes need to be committed.
        """
        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose
        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")
        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }

        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album
        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist
        if audio_title is not None:
            kwargs["audio_title"] = audio_title
        if audio_year is not None:
            kwargs["audio_year"] = audio_year

        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """Commit the changes in the index and optimize it."""
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index commited")
        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        """Close the index."""
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        self._idx.close()
def insert_docs(docs):
    ix = open_dir(whoosh_index)
    writer = AsyncWriter(ix)
    for doc in docs:
        writer.add_document(**doc)
    writer.commit()
writer = AsyncWriter(ix)
for entry in entries:
    try:
        item = Item.get(guid=entry['guid'])
    except Item.DoesNotExist:
        item = Item.create(**entry)
        records += 1
    if len(entry['html']):
        soup = BeautifulSoup(entry['html'], settings.fetcher.parser)
        plaintext = ''.join(soup.find_all(text=True))
        writer.add_document(
            id=item.id,
            guid=unicode(item.guid),
            title=entry['title'],
            text=plaintext,
            when=datetime.datetime.utcfromtimestamp(item.when)
        )
        hrefs = get_link_references(soup)
    else:
        hrefs = []
    hrefs.append(entry['url'])

if not settings.fetcher.post_processing.expand_links:
    return

lnow = time.time()
links = expand_links(set(hrefs))
log.debug("%s - %d links in %fs" % (netloc, len(hrefs), time.time() - lnow))
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted
    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = "{}:{}".format(cls_name, pk)
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more infos in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproductible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except BaseException:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
#for post in collection.find():
statement = table.select()
for post in engine.execute(statement).fetchall():
    num += 1
    left = total - num
    if left > 5000 and left % 5000 == 0:
        print 'Year', year, 'Left', str(left), 'Count', str(num)
        print 'Consuming time:', str(time() - t)
        print
        t = time()
    #title=post['_id']
    title = post['id']
    sent = post['sent']
    year = post['year']
    writer.add_document(title=title, content=sent, year=year)

#client.close()
engine.close()
print 'Done!'
print 'Consuming time', str(time() - t2) + 's'
print
print 'Restarting database...'
#os.system('mongod -f /etc/mongodb.conf --shutdown')
#os.system('mongod -f /etc/mongodb.conf &')
print 'Done!'
print
print 'Waiting...'
#sleep(15)
def newPageCore(self, item, newPageName, useTemplate=False, templateTitle=None, templateBody=None):
    pagePath = os.path.join(self.notePath, self.itemToPage(item)).replace(os.sep, "/")
    if not newPageName:
        if useTemplate:
            dialog = mikitemplate.PickTemplateDialog(pagePath, self.settings, parent=self)
            if dialog.exec_():
                curTitleIdx = dialog.titleTemplates.currentIndex()
                curBodyIdx = dialog.bodyTemplates.currentIndex()
                dtnow = datetime.datetime.now()
                if curTitleIdx > -1:
                    titleItem = dialog.titleTemplates.model().item(curTitleIdx)
                    titleItemContent = titleItem.data(TTPL_COL_DATA)
                    titleItemType = titleItem.data(TTPL_COL_EXTRA_DATA)
                    titleParameter = dialog.titleTemplateParameter.text()
                    newPageName = mikitemplate.makeTemplateTitle(
                        titleItemType, titleItemContent, dtnow=dtnow, userinput=titleParameter)
                if curBodyIdx > -1:
                    bodyItemIdx = dialog.bodyTemplates.rootModelIndex().child(curBodyIdx, 0)
                    bodyFPath = dialog.bodyTemplates.model().filePath(bodyItemIdx)
                else:
                    bodyFPath = None
        else:
            dialog = LineEditDialog(pagePath, self)
            if dialog.exec_():
                newPageName = dialog.editor.text()
    prevparitem = None
    if newPageName:
        if hasattr(item, "text"):
            pagePath = os.path.join(self.notePath, pagePath + "/").replace(os.sep, "/")
        if not QtCore.QDir(pagePath).exists():
            QtCore.QDir(self.notePath).mkdir(pagePath)
        if not QtCore.QDir(os.path.dirname(newPageName)).exists():
            curdirname = os.path.dirname(newPageName)
            needed_parents = []
            while curdirname != "":
                needed_parents.append(curdirname)
                curdirname = os.path.dirname(curdirname)
            # create the needed hierarchy in reverse order
            for i, needed_parent in enumerate(needed_parents[::-1]):
                paritem = self.pageToItem(needed_parent)
                if paritem is None:
                    if i == 0:
                        self.newPageCore(item, os.path.basename(needed_parent))
                    else:
                        self.newPageCore(prevparitem, os.path.basename(needed_parent))
                    QtCore.QDir(pagePath).mkdir(needed_parent)
                elif not QtCore.QDir(os.path.join(self.notePath, needed_parent).replace(os.sep, "/")).exists():
                    QtCore.QDir(pagePath).mkdir(needed_parent)
                if paritem is not None:
                    prevparitem = paritem
                else:
                    prevparitem = self.pageToItem(needed_parent)
        fileName = pagePath + newPageName + self.settings.fileExt
        fh = QtCore.QFile(fileName)
        fh.open(QtCore.QIODevice.WriteOnly)
        savestream = QtCore.QTextStream(fh)
        if useTemplate and bodyFPath is not None:
            with open(bodyFPath, "r", encoding="utf-8") as templatef:
                savestream << mikitemplate.makeTemplateBody(
                    os.path.basename(newPageName), dtnow=dtnow,
                    dt_in_body_txt=self.tr("Created {}"), body=templatef.read())
        else:
            savestream << mikitemplate.makeDefaultBody(os.path.basename(newPageName), self.tr("Created {}"))
        fh.close()
        if prevparitem is not None:
            QtWidgets.QTreeWidgetItem(prevparitem, [os.path.basename(newPageName)])
        else:
            QtWidgets.QTreeWidgetItem(item, [os.path.basename(newPageName)])
        newItem = self.pageToItem(pagePath + newPageName)
        self.sortItems(0, Qt.AscendingOrder)
        self.setCurrentItem(newItem)
        if hasattr(item, "text"):
            self.expandItem(item)

        # create attachment folder if not exist
        attDir = self.itemToAttachmentDir(newItem)
        if not QtCore.QDir(attDir).exists():
            QtCore.QDir().mkpath(attDir)

        # TODO improvement needed, can be reused somehow
        with open(fileName, "r") as fileobj:
            content = fileobj.read()

        self.ix = open_dir(self.settings.indexdir)
        # writer = self.ix.writer()
        writer = AsyncWriter(self.ix)
        writer.add_document(path=pagePath + newPageName, content=content)
        writer.commit()
def add_to_fts(cls, content, title=None, id=None, source_hash=None, tags=None):
    ix = open_dir(LOCAL_FTS_INDEX)
    writer = AsyncWriter(ix)
    writer.add_document(content=content, title=title, id=id, source_hash=source_hash, tags=tags)
    writer.commit()
#Create index and AsyncWriter object
index = create_in("tweetindex", my_schema)
writer = AsyncWriter(index)

if __name__ == '__main__':
    #Load raw data
    with open("WC2015_headers.csv", 'rb') as to_load:
        data = csv.DictReader(to_load)
        for row in data:
            #Extract required information from date to create python datetime object
            date = row['created_at'][:19] + ' ' + row['created_at'][-4:]
            #Clean text and parse into keywords
            text = row['text'].replace('\\', '')
            keywords = [word for word in word_tokenize(text) if word not in stops]
            #Check for Retweets
            rt = False
            if 'RT ' in text:
                rt = True
            #Add completed document to index
            writer.add_document(id=unicode(row['id']),
                                screen_name=unicode(row['screen_name']),
                                text=unicode(text),
                                contains_retweet=rt,
                                keyword=unicode(" ".join(keywords)),
                                created=datetime.datetime.strptime(date, "%a %b %d %H:%M:%S %Y"))
    writer.commit()
def insert(self, link, title, document):
    writer = AsyncWriter(self.ix)
    writer.add_document(link=link, title=title, document=document + title)
    writer.commit()
def createSearchableData(docsDirectory):
    # definition of the index schema
    schema = Schema(docTitle=STORED,
                    procTitle=KEYWORD(lowercase=True),
                    topics=KEYWORD(stored=True, lowercase=True),
                    categories=KEYWORD(stored=True, lowercase=True),
                    pageUrl=ID(stored=True),
                    procContent=TEXT)
    cwd = os.getcwd()
    print(cwd)
    # create the indexdir directory
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    # create an indexWriter to add documents according to the schema
    ix = create_in("indexdir", schema)
    writer = AsyncWriter(ix)
    # list of files to index
    filepaths = [os.path.join(docsDirectory, i) for i in os.listdir(docsDirectory)
                 if i.split(".")[-1] == "json"]
    num = 1
    # for each path found...
    for path in filepaths:
        print(f'{num}/{len(filepaths)}')
        num += 1
        fp = open(path, 'r', encoding="utf-8")
        entry = json.loads(fp.read())
        fp.close()
        docTitle = entry["title"]
        # Tokenized title, taking care of possible unicode characters to convert into ASCII characters
        processedTitle = list(set(processText(docTitle)))  #, filterStopwords=True, stemming=True, normalizeAccents=True, minLength=0
        pageUrl = entry["url"]
        # Markdown content of the page
        markdownContent = entry["content"]
        # regex to find the "key" sentences in the page, i.e. the ones used to start a section in the markdown
        topicSearch = re.compile(r"\n####.*\n")
        # preprocessing (stopword filtering and normalization) of the sentences used as topics of the page
        topicSet = set()
        for match in topicSearch.findall(markdownContent):
            topic = str(match).strip(r'\n').strip('#')
            topicSet = topicSet.union(set(processText(topic)))  #, filterStopwords=True, stemming=True, normalizeAccents=True
        topics = list(topicSet)
        # the categories are the parent pages after the homepage. After processing, they are passed
        # through a set to remove duplicates.
        categories = list(set(processText(' '.join(str(pageUrl).split(r'/')[3:-2]))))  #, filterStopwords=True, stemming=True, normalizeAccents=True
        # previously:
        # category = processText(category, filterStopwords=True, normalizeAccents=True)
        # the procContent field holds the preprocessed content: stemming and normalization
        procContent = processText(markdownContent)  #, filterStopwords=True, stemming=True, normalizeAccents=True
        # Add the entry to the index
        writer.add_document(docTitle=docTitle,
                            procTitle=processedTitle,
                            topics=topics,
                            categories=categories,
                            pageUrl=pageUrl,
                            procContent=procContent)
    writer.commit()
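All of the snippets above follow the same round trip: build or open an index, wrap it in an AsyncWriter, add documents, and commit. As a minimal self-contained sketch (the schema, field names and the "indexdir" directory are illustrative, not taken from any of the projects above):

import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter

# Illustrative schema and index directory; adapt the field names to your data.
schema = Schema(id=ID(stored=True, unique=True), content=TEXT(stored=True))
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = create_in("indexdir", schema)

# AsyncWriter tries to acquire the regular writer right away; if the index is
# locked by another process it queues the changes and commits them from a
# background thread instead of raising LockError.
writer = AsyncWriter(ix)
writer.add_document(id="1", content="hello whoosh")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("hello")
    for hit in searcher.search(query):
        print(hit["id"], hit["content"])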