class IndexSchema(SchemaClass):
    filename = TEXT(stored=True, analyzer=simple_ana)
    symbol = TEXT(stored=True, analyzer=custom_ana)
    module = TEXT(stored=True, analyzer=simple_ana)
    location = STORED()
    kind = STORED()
    sort = NUMERIC(sortable=True)

class BookmarkSchema(fields.SchemaClass):
    contentNGram = TEXT(stored=False, analyzer=_N_GRAM_ANALYZER, phrase=False)
    contentText = TEXT(stored=False, analyzer=_TEXT_ANALYZER, phrase=True)
    urlSize = NUMERIC(signed=False, sortable=True, default=999)
    name = STORED()
    path = STORED()
    profile = STORED()
    url = STORED()
    icon = STORED()

def __init__(self) -> None:
    self.schema = Schema(
        id=NUMERIC(unique=True, stored=True),
        canonical_name=STORED(),
        name=STORED(),
        name_tokenized=TEXT(stored=False, analyzer=WhooshConstants.tokenized_analyzer),
        name_stemmed=TEXT(stored=False, analyzer=WhooshConstants.stem_analyzer),
        name_normalized=TEXT(stored=False, analyzer=WhooshConstants.normalized_analyzer, field_boost=100.0))

def populateWhooshNoticias():
    schemNoticias = Schema(idNoticia=NUMERIC(stored=True),
                           nombreEquipo=TEXT(stored=True),
                           linkNoticia=TEXT(stored=True),
                           tituloNoticia=TEXT(stored=True),
                           descripcionNoticia=TEXT(stored=True),
                           imagenNoticia=STORED(),
                           tiempoPublicacion=TEXT(stored=True),
                           autor=TEXT(stored=True))
    if os.path.exists("IndexNoticias"):
        shutil.rmtree("IndexNoticias")
    os.mkdir("IndexNoticias")
    ixNoticia = create_in("IndexNoticias", schema=schemNoticias)
    writerNoticia = ixNoticia.writer()
    listaNoticias = extraerNoticias()
    n = 1
    for noticias in listaNoticias:
        for a in noticias:
            for noticia in a:
                writerNoticia.add_document(idNoticia=n,
                                           nombreEquipo=noticia[0],
                                           linkNoticia=noticia[1],
                                           tituloNoticia=noticia[2],
                                           descripcionNoticia=noticia[3],
                                           imagenNoticia=noticia[4],
                                           tiempoPublicacion=noticia[5],
                                           autor=noticia[6])
                n += 1
    writerNoticia.commit()
    return n

def __init__(self, search_term: str):
    self.schema = Schema(
        educational_requirements=TEXT(),
        employment_type=ID(),
        experience_requirements=TEXT(),
        industry=KEYWORD(),
        organization=ID(stored=True),
        title=TEXT(stored=True),
        url=STORED(),
        parent_identifier=NUMERIC(stored=True),
        # Paragraph Data Children
        type=ID(stored=True),
        parent=NUMERIC(),
        paragraph_number=NUMERIC(stored=True),
        paragraph_heading=TEXT(analyzer=Analyzing.ImprovedTokenizer(), stored=True),
        paragraph_content=TEXT(analyzer=Analyzing.ImprovedTokenizer(), stored=True))
    self.index_path: str = os.path.join(definitions.MAIN_PATH, "Storage", "Indexe", search_term)
    FileHandler.if_folder_not_existent_create(self.index_path)
    self.ix: Index = None
    self.writer: IndexWriter = None

def all_stops(api):
    """Generate a pickle of all stops."""
    log = logging.getLogger(__name__)
    logging.basicConfig(level=logging.DEBUG)
    rtdicts = api.routes()['route']  # All active routes on the realtime system.
    stopset = set()
    allstops = {}
    # Whoosh index
    schema = Schema(sid=TEXT(stored=True), name=TEXT(stored=True), location=STORED())
    indexname = "stop_index"
    if not os.path.exists(indexname):
        os.mkdir(indexname)
        ix = index.create_in(indexname, schema)
    else:
        ix = index.open_dir(indexname)
    writer = ix.writer()
    log.debug("Generating stop database.")
    # Loop through all the routes to get at stops (API has weird structure)
    for rtdict in rtdicts:
        if rtdict['rt'] not in allstops:
            rtobject = Route.fromapi(api, rtdict)
            # Add all stops on the route to the set
            for s in rtobject.inbound_stops + rtobject.outbound_stops:
                stop = (s.id, s.location, s.name)
                stopset.add(stop)
    nchanges = 0
    log.debug("Generating search index.")
    for stop in stopset:
        nchanges += 1
        allstops[stop[0]] = stop
    # Switch to display groupings
    allstops = group_stops(allstops)
    for stop in allstops.values():
        writer.update_document(sid=unicode(stop[0]), name=stop[2], location=stop[1])
    writer.commit()
    # And create pickle too
    log.debug("Pickling db...")
    export = dict(allstops)
    with open("paac.stops.pickle", "w") as f:
        pickle.dump(allstops, f)
    # And create app db
    log.debug("Creating app database...")
    # create_app_db(allstops, already_grouped=True)
    return nchanges

class TMSchema(SchemaClass):
    """Fulltext index schema for source and context strings."""
    source_language = ID(stored=True)
    target_language = ID(stored=True)
    source = TEXT(stored=True)
    target = STORED()
    origin = ID(stored=True)
    category = NUMERIC(stored=True)

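For context, here is a minimal, hedged sketch of how an index built on TMSchema could be queried for translation-memory lookups; the directory name "tm_index", the example language codes, and the query text are assumptions, not from the source. Because target is a STORED field, it is returned with each hit even though it is not itself searchable.

from whoosh import index
from whoosh.qparser import QueryParser
from whoosh.query import And, Term

# Hypothetical index location; the original storage path is not shown above.
ix = index.open_dir("tm_index")
with ix.searcher() as searcher:
    # Parse free text against the indexed source strings...
    query = QueryParser("source", ix.schema).parse("translation memory")
    # ...and restrict hits to one language pair via the ID fields.
    lang_pair = And([Term("source_language", "en"),
                     Term("target_language", "de")])
    for hit in searcher.search(query, filter=lang_pair, limit=10):
        print(hit["source"], "->", hit["target"])
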
def build_index(self, path, names=None):
    print("Building index..")
    if not os.path.exists(path):
        os.makedirs(path)
    schema = Schema(title=TEXT(analyzer=self._analyzer), pid=STORED())
    titles = shared.graph.playlist_titles if names is None else names
    normalized_titles = [normalize_title(title) for title in titles]
    ix = index.create_in(path, schema)
    writer = ix.writer()
    for i in trange(len(normalized_titles)):
        title = normalized_titles[i]
        writer.add_document(title=title, pid=i)
    print("Committing..")
    writer.commit()
    print("Done.")
    self._ix = ix

def create_index(index_dir):
    schema = Schema(book_abbr=STORED(),
                    book_name=STORED(),
                    book_tree=STORED(),
                    book_kindle=STORED(),
                    short=STORED(),
                    long=STORED(),
                    key_terms=STORED(),
                    key_terms_content=TEXT(stored=True, analyzer=CleanupStandardAnalyzer(analyzer_re, STOP_WORDS) | CharsetFilter(accent_map)),
                    book=ID(stored=True),
                    heading=TEXT(stored=True, analyzer=StemmingAnalyzer(minsize=1, stoplist=None) | CharsetFilter(accent_map)),
                    session=TEXT(stored=True, analyzer=StandardAnalyzer(minsize=1, stoplist=None)),
                    date=DATETIME(stored=True, sortable=True),
                    exact=TEXT(stored=True, analyzer=CleanupStandardAnalyzer(analyzer_re, stoplist=None) | CharsetFilter(accent_map)),
                    stemmed=TEXT(stored=True, analyzer=CleanupStemmingAnalyzer(analyzer_re) | CharsetFilter(accent_map)),
                    common=TEXT(stored=True, analyzer=CleanupStemmingAnalyzer(analyzer_re, stoplist=None) | CharsetFilter(accent_map)),
                    )
    ix = index.create_in(index_dir, schema)
    writer = ix.writer()

    for book in Books.indexed:
        with open("books/{}.txt".format(book['abbr']), encoding='utf-8') as f:
            text = pre_process_book(book, f.read())
        text = re.search(book['book_re'], text, flags=re.DOTALL).group(1)

        d = {
            'book_name': book['name'],
            'book_abbr': book['abbr'],
            'book_tree': book['tree'],
            'book_kindle': book['kindle'],
            'book': book['abbr'].lower(),
        }

        i = 0
        heading_tiers = [{'short': '', 'long': ''}] * 3
        carry_over_heading = None
        headings = list(filter(None, book['headings_re'].split(text)[1:]))
        for (__heading, _content) in zip(headings[::2], headings[1::2]):
            content = __heading + _content
            if carry_over_heading:
                content = carry_over_heading + content
                carry_over_heading = None

            heading = clean_heading(__heading)
            if 'heading_replacements' in book:
                for (pattern, repl) in book['heading_replacements']:
                    heading = pattern.sub(repl, heading, 1)
            update_heading_tiers(book, heading_tiers, heading)

            has_content = re.search(r'[a-z]', _content)
            if not has_content:
                carry_over_heading = content
                continue

            add_document(writer, d, heading_tiers, content)
            i += 1
        print(i)

    writer.commit()
    return ix

#! /usr/bin/env python
from whoosh.fields import Schema, ID, KEYWORD, TEXT, STORED
import os
import config
import sys
from whoosh import writing, index

if len(sys.argv) > 1 and sys.argv[1] == "reindex":
    writer = index.open_dir(config.INDEX_DIR).writer()
    writer.commit(mergetype=writing.CLEAR)
else:
    schema = Schema(id=ID(stored=True, unique=True),
                    title=TEXT(stored=True, sortable=True),
                    content=TEXT(stored=True),
                    language=STORED(),
                    tag=KEYWORD(stored=True, commas=True))
    if not os.path.exists(config.INDEX_DIR):
        os.mkdir(config.INDEX_DIR)
    index.create_in(config.INDEX_DIR, schema)

print("Index initialized")

        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(word)
        if chars:
            t.startchar = start_char + value.find(word)
            t.endchar = t.startchar + len(word)
        yield t


schema = Schema(
    path=ID(stored=True),
    family=ID(stored=True),
    name=ID(stored=True),
    description=TEXT(stored=True, analyzer=ChineseTokenizer()),
    keywords=KEYWORD(stored=True, commas=True),
    created_at=STORED(),
    updated_at=STORED(),
)


class WhooshSearch(object):
    def __init__(self, app=None):
        if app is not None:
            self.init_app(app)

    def init_app(self, app):
        app.config.setdefault('WHOOSH_DIR', 'data')
        self.app = app
        app.extensions = getattr(app, 'extensions', {})
        app.extensions['elasticsearch'] = self

log = logging.getLogger(__name__)

# CUSTOM ANALYZER wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# INDEX SCHEMA DEFINITION
SCHEMA = Schema(fileid=ID(unique=True),
                owner=TEXT(),
                repository=TEXT(stored=True),
                path=TEXT(stored=True),
                content=FieldType(format=Characters(), analyzer=ANALYZER,
                                  scorable=True, stored=True),
                modtime=STORED(),
                extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(format=Characters(), analyzer=ANALYZER,

from whoosh.fields import Schema, TEXT, KEYWORD, STORED

#
# Schema used to index the database. The original tuple (minus the zip codes)
# is stored in the data attribute.
#
SCHEMA = Schema(name=TEXT(stored=False), zips=KEYWORD(stored=False), data=STORED())

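As a hedged illustration of that comment, the sketch below builds a throwaway index from SCHEMA, adds one record, and searches by name; the directory name "city_index", the sample row, and the query are assumptions, not from the source. Only the STORED data tuple comes back with the hit, since name and zips are indexed but not stored.

import os
from whoosh import index
from whoosh.qparser import QueryParser

# Hypothetical index directory for this illustration only.
if not os.path.exists("city_index"):
    os.mkdir("city_index")
ix = index.create_in("city_index", SCHEMA)

writer = ix.writer()
writer.add_document(name=u"Springfield",
                    zips=u"62701 62702",
                    data=(u"Springfield", u"IL", u"Sangamon"))
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("name", ix.schema).parse(u"springfield")
    for hit in searcher.search(query):
        print(hit["data"])  # only the STORED field is retrievable
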
from whoosh.qparser import QueryParser INDEX_PATH = 'index' @dataclass class Verse: reference: str text: str def asdict(self): return {'reference': self.reference, 'text': self.text} _verse_schema = Schema( reference=STORED(), text=TEXT(analyzer=StemmingAnalyzer(), stored=True), ) if index.exists_in(INDEX_PATH): _index = index.open_dir(INDEX_PATH) else: os.mkdir(INDEX_PATH) _index = index.create_in(INDEX_PATH, _verse_schema) def add_verses(verses): writer = _index.writer() for verse in verses: writer.add_document(reference=verse.reference, text=verse.text)
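A hedged companion sketch for the read side (not part of the source): it reuses the QueryParser import above and assumes the writer opened in add_verses is committed before searching.

def search_verses(query_text, limit=10):
    # Parse against the stemmed 'text' field and rebuild Verse objects
    # from the stored fields of each hit.
    query = QueryParser('text', _index.schema).parse(query_text)
    with _index.searcher() as searcher:
        return [Verse(reference=hit['reference'], text=hit['text'])
                for hit in searcher.search(query, limit=limit)]
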
    key = f'{eid}:{locale}:tags'
    for tag in tags['values']:
        storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        index = create_in(configuration.INDEX, schema)
        writer = BufferedWriter(index, period=60, limit=200)
        ingest_languages(writer)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, writer)
        print('INFO: optimize and close index')
        writer.close()
        index.optimize()
        index.close()
        print('INFO: write ingestion lock')
        with open(configuration.INGESTION_LOCK, 'w') as stream:
            stream.write('ingested')

def populateWhooshPeliculas():
    schemPeliculas = Schema(idPelicula=NUMERIC(stored=True),
                            titulo=TEXT(stored=True),
                            portada=STORED(),
                            sinopsis=TEXT(stored=True),
                            linkPelicula=TEXT(stored=True),
                            duracion=TEXT(stored=True),
                            actores=KEYWORD(stored=True, commas=True),
                            personal=KEYWORD(stored=True, commas=True),
                            genero=KEYWORD(stored=True, commas=True))
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")
    ix = create_in("Index", schema=schemPeliculas)
    writer = ix.writer()
    listaPeliculas = extraer_datos_peliculas()
    numPeliculas = 1
    for pelicula in listaPeliculas:
        writer.update_document(idPelicula=numPeliculas,
                               titulo=pelicula[0],
                               portada=pelicula[1],
                               sinopsis=pelicula[2],
                               linkPelicula=pelicula[3],
                               duracion=pelicula[4],
                               actores=soloNombres(pelicula[5]),
                               personal=soloNombres(pelicula[6]),
                               genero=pelicula[7])
        numPeliculas += 1
    writer.commit()
    return numPeliculas - 1

ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# FILE INDEX SCHEMA DEFINITION
FILE_INDEX_NAME = 'FILE_INDEX'
FILE_SCHEMA = Schema(
    fileid=ID(unique=True),  # Path
    repository=ID(stored=True),
    repository_id=NUMERIC(unique=True, stored=True),  # Numeric id of repo
    repo_name=TEXT(stored=True),
    owner=TEXT(),
    path=TEXT(stored=True),
    content=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    modtime=STORED(),
    md5=STORED(),
    extension=ID(stored=True),
    commit_id=TEXT(stored=True),
    size=NUMERIC(stored=True),
    mimetype=TEXT(stored=True),
    lines=NUMERIC(stored=True),
)

# COMMIT INDEX SCHEMA
COMMIT_INDEX_NAME = 'COMMIT_INDEX'
COMMIT_SCHEMA = Schema(
    commit_id=ID(unique=True, stored=True),
    repository=ID(unique=True, stored=True),
    repository_id=NUMERIC(unique=True, stored=True),
    commit_idx=NUMERIC(stored=True, sortable=True),

import unicodecsv as csv
from whoosh import index, sorting
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, STORED, NGRAMWORDS, NUMERIC
from whoosh.qparser import MultifieldParser

_schema = Schema(
    ror=STORED(),
    grid=STORED(),
    name=NGRAMWORDS(stored=False),
    aliases=NGRAMWORDS(stored=False),
    num_students=NUMERIC(int, sortable=True, stored=False),
    citation_score=NUMERIC(int, sortable=True, stored=False),
)

_index_path = 'data/ror-whoosh-index'


def _read_ror_csv_rows():
    rows = []
    with open('data/ror-metrics.csv') as ror_csv:
        reader = csv.DictReader(ror_csv)
        for row in reader:
            row['aliases'] = row['aliases'].split(u'###') if row['aliases'] else []
            row['num_students'] = int(row['num_students']) if row['num_students'] else None
            row['citation_score'] = float(row['citation_score']) if row['citation_score'] else None
            rows.append(row)

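A hedged search sketch over _schema (illustrative only; it assumes an index has already been built at _index_path and reuses the index, sorting, and MultifieldParser imports above). The function name is hypothetical.

def _search_organizations(text, limit=10):
    ix = index.open_dir(_index_path)
    # Match the n-gram name and aliases fields, then order results by the
    # sortable citation_score column (highest first) instead of relevance.
    parser = MultifieldParser(["name", "aliases"], schema=ix.schema)
    facet = sorting.FieldFacet("citation_score", reverse=True)
    with ix.searcher() as searcher:
        return [(hit["ror"], hit["grid"])
                for hit in searcher.search(parser.parse(text),
                                           limit=limit, sortedby=facet)]
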
jieba.dt.tmp_dir = os.path.dirname(os.path.abspath(__file__))

# Use the jieba-based Chinese analyzer.
analyzer = ChineseAnalyzer()

# Create the schema. Fields defined here can be searched, except STORED fields,
# which cannot be searched and only appear in search results.
# stored=True means the field appears in search results. If content is large,
# set stored=False to avoid taking too much space, then use the stored ID to
# locate the record in the database and fetch the content from there.
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer, vector=True, phrase=True),
    path=ID(stored=True),
    # Searchable, but the content itself does not appear in search results.
    content=TEXT(stored=False, analyzer=analyzer),
    id=STORED()
)

# Store the schema information under the 'indexdir' directory.
indexdir = 'indexdir/'
if not os.path.exists(indexdir):
    os.mkdir(indexdir)
ix = create_in(indexdir, schema)

# Add the documents to be indexed according to the schema definition.
# Note: strings must be Unicode.
writer = ix.writer()
writer.add_document(title=u'第一篇文档', path=u'www.baidu.com', id=u'1',
                    content=u'这是我们增加的第一篇文脏,又名文档')
writer.add_document(title=u'第二篇文档', path=u'www.google.com', id=u'2',
                    content=u'这是我们增加的第二篇文档, very interesting')

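A hedged continuation (not shown in the source): after committing the writer, a query against content demonstrates the point made in the comments above, that only the stored fields (title, path, id) come back with each hit.

from whoosh.qparser import QueryParser

writer.commit()

with ix.searcher() as searcher:
    query = QueryParser('content', ix.schema).parse(u'文档')
    for hit in searcher.search(query):
        # 'content' is stored=False: it was searchable, but is absent here.
        print(hit['title'], hit['path'], hit['id'])
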