import os

import whoosh.index
from whoosh.fields import Schema, ID, TEXT, KEYWORD, STORED
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import MultifieldParser

# ID3Filter (in the project's `file` module) and ItunesParser come from the
# surrounding project and are not shown here.


class Index:
    def __init__(self, path='~/Music/iTunes/iTunes Music Library.xml',
                 folder='~/Library/Application Support/Share my tunes'):
        self.path = os.path.expanduser(path)
        self.schema = Schema(
            trackId=ID(stored=True),
            name=TEXT(stored=True),
            artist=TEXT(stored=True),
            album=TEXT(stored=True),
            genre=KEYWORD(stored=True),
            location=STORED,
            trackNumber=STORED,
            bitRate=ID(stored=True),
            artwork=KEYWORD(stored=True),
        )
        self.parser = MultifieldParser(["name", "album", "artist"], schema=self.schema)
        self.folder = "%s/index" % os.path.expanduser(folder)
        self.empty = not whoosh.index.exists_in(self.folder)
        self.ix = None

    def index(self):
        if self.empty:
            if not os.path.exists(self.folder):
                os.makedirs(self.folder)
            st = FileStorage(self.folder)
            ix = st.create_index(self.schema)
            w = ix.writer()
            pipe = file.ID3Filter()  # [TODO] using itunes info for artwork?
            cpt = 0
            for track in pipe(ItunesParser(self.path)):
                # Simple progress indicator: the running count every 20 tracks,
                # one '#' per track.
                if cpt % 20 == 0:
                    print("\n%i " % cpt, end="")
                print("#", end="")
                w.add_document(
                    trackId=track['trackId'],
                    name=track['name'],
                    artist=track['artist'],
                    album=track['album'],
                    genre=track['genre'],
                    location=track['location'],
                    # KEYWORD fields expect text, so store the flag as a string
                    artwork=str(bool(track['artwork'])),
                    trackNumber=track['trackNumber'],
                    bitRate=track['bitRate'],
                )
                cpt += 1
            print("\n\n%i tracks indexed" % cpt)
            w.commit()
            ix.optimize()
            ix.close()
        else:
            print("already indexed")

    def query(self, query):
        if self.ix is None:
            self.ix = FileStorage(self.folder).open_index()
        q = self.parser.parse(query)
        return self.ix.searcher().search(q, sortedby=("album", "name"), limit=None)
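# A minimal usage sketch for the class above, assuming the iTunes XML exists
# at the default path; the query string "radiohead" is just a placeholder.
if __name__ == "__main__":
    idx = Index()
    idx.index()  # builds the index on first run, prints "already indexed" after
    for hit in idx.query("radiohead"):
        print(hit['artist'], '-', hit['album'], '-', hit['name'])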
from django.shortcuts import render
from whoosh import qparser
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import QueryParser


def buscar_animesporsinopsis(request):
    formulario = BusquedaPorSinopsisForm(request.POST)
    lista_animes = []
    if formulario.is_valid():
        # dirindex is the module-level path of the Whoosh index directory.
        ix = FileStorage(dirindex).open_index()
        # AndGroup requires every term of the synopsis query to match.
        query = QueryParser("sinopsis", ix.schema, group=qparser.AndGroup) \
            .parse(formulario.cleaned_data['sinopsis'])
        with ix.searcher() as searcher:
            results = searcher.search(query)
            for r in results:
                lista_animes.append([
                    r['titulo'], r['imagen'], r['rango_web'], r['popularidad'],
                    r['fecha_inicio'], r['fecha_final'], r['episodios'],
                    r['sinopsis'], r['generos'],
                ])
    return render(request, 'animesbusquedaporsinopsis.html',
                  {'formulario': formulario, 'animes': lista_animes})
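# The view above assumes a Django form with a single 'sinopsis' field. A
# minimal sketch of what BusquedaPorSinopsisForm might look like; the label,
# max_length and widget are assumptions, not taken from the project:
from django import forms


class BusquedaPorSinopsisForm(forms.Form):
    sinopsis = forms.CharField(label='Sinopsis', max_length=500,
                               widget=forms.Textarea)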
import contextlib
import os

from django.utils.encoding import force_text
from translate.storage.tmx import tmxfile
from whoosh import qparser, query
from whoosh.filedb.filestore import FileStorage

# data_dir, Language, Comparer, CATEGORY_FILE, get_node_data and setup_index
# are helpers from the surrounding project (not shown here).


class TranslationMemory(object):
    def __init__(self):
        self.index = FileStorage(data_dir('memory')).open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    @contextlib.contextmanager
    def writer(self):
        writer = self.index.writer()
        try:
            yield writer
        finally:
            writer.commit()

    def get_language_code(self, code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    def import_tmx(self, fileobj, langmap=None):
        origin = force_text(os.path.basename(fileobj.name))
        storage = tmxfile.parsefile(fileobj)
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = self.get_language_code(source_language_code, langmap)

        languages = {}
        with self.writer() as writer:
            for unit in storage.units:
                # Parse translations (translate-toolkit does not care about
                # languages here, it just picks first and second XML elements)
                translations = {}
                for node in unit.getlanguageNodes():
                    lang, text = get_node_data(unit, node)
                    translations[lang] = text
                    if lang not in languages:
                        languages[lang] = self.get_language_code(lang, langmap)

                try:
                    source = translations.pop(source_language_code)
                except KeyError:
                    # Skip if source language is not present
                    continue

                for lang, text in translations.items():
                    writer.add_document(
                        source_language=source_language,
                        target_language=languages[lang],
                        source=source,
                        target=text,
                        origin=origin,
                        category=CATEGORY_FILE,
                    )

    def lookup(self, source_language, target_language, text):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )

        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'], match['target'], similarity, match['origin']
            )

    def delete(self, origin):
        """Delete entries by origin."""
        with self.writer() as writer:
            return writer.delete_by_term('origin', origin)

    def empty(self):
        """Recreate the translation memory."""
        self.index = setup_index()
        self.searcher = None

    def get_origins(self):
        self.open_searcher()
        return [
            force_text(x) for x in self.searcher.lexicon('origin')
        ]
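# A quick usage sketch for the class above; the TMX filename, language codes
# and lookup string are placeholders, not taken from the project:
memory = TranslationMemory()
with open('glossary.tmx', 'rb') as handle:
    memory.import_tmx(handle)
for source, target, similarity, origin in memory.lookup('en', 'cs', 'Hello'):
    print('%3d%% %s -> %s (%s)' % (similarity, source, target, origin))
memory.close()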
import argparse

import numpy as np
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import QueryParser

import util

# Excerpt from a larger script: `parser` is an argparse.ArgumentParser that
# already carries -model, -limit and -svm_file arguments, and
# get_query_features is defined elsewhere in the project.
parser.add_argument("-add_bm25", action="store_true")
args = parser.parse_args()

from singletons import PREPROCESS, SEARCHER

run_id = args.model
if args.model == "clusvm":
    clusvm = util.load("clusvm.pkl")
elif "sv" in args.model:
    svm = util.load(f"{args.svm_file}.pkl")
elif args.model == "adarank":
    alpha = np.load("ada.npy")
else:
    ix = FileStorage("data/msmarcoidx").open_index()
    if args.model == "bm25":
        SEARCHER = ix.searcher()
    qp = QueryParser("body", schema=ix.schema)


def predict(inp):
    qid, query = inp
    ret = []
    if args.model == "okapi" or args.model == "bm25":
        # One run row per hit: qid, docid, rank, score, run id.
        results = SEARCHER.search(qp.parse(query), limit=args.limit)
        for rank, hit in enumerate(results):
            ret.append([qid, hit["docid"], rank + 1, results.score(rank), run_id])
    elif args.model == "clusvm":
        query = [token.text for token in PREPROCESS(query)]
        queryfeats = get_query_features(query)
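# The script above only opens "data/msmarcoidx". A minimal sketch of how such
# an index could be built with Whoosh; the docid/body schema is an assumption
# inferred from the fields the script queries and reads back:
import os
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import FileStorage

schema = Schema(docid=ID(stored=True), body=TEXT)
os.makedirs("data/msmarcoidx", exist_ok=True)
ix = FileStorage("data/msmarcoidx").create_index(schema)
writer = ix.writer()
writer.add_document(docid="D0", body="example passage text")
writer.commit()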
import csv
import multiprocessing as mp

from tqdm import tqdm

# docix, queryix, docqp, qqp, querytext, passagetext and the okapi scoring
# module come from earlier in the script (not shown here).


def retrieve(inp):
    qid, pid, relevance = inp
    ret = []
    # Map each judged query/passage pair onto the closest indexed ones.
    closestqueries = qs.search(qqp.parse(querytext[qid]), limit=5)
    closestdocs = s.search(docqp.parse(passagetext[pid]), limit=5)
    if len(closestqueries) == 0 or len(closestdocs) == 0:
        return None
    for q, d in zip(closestqueries, closestdocs):
        ret.append([q["qid"], d["docid"], relevance])
    return ret


qrels = []
with open("data/thesis_dataset_graded_relevance.tsv", "rt") as grelfile, \
        docix.searcher(weighting=okapi.weighting) as s, \
        queryix.searcher(weighting=okapi.weighting) as qs, \
        open("data/graded-qrels.txt", "w") as out:
    with mp.Pool(processes=24) as pool:
        with tqdm(total=800) as pbar:
            found, sofar = 0, 0
            for passdoc in pool.imap_unordered(
                    retrieve, csv.reader(grelfile, delimiter="\t")):
                sofar += 1
                pbar.set_description(f"{found} / {sofar}")
                pbar.update()
                if passdoc is None:
                    continue
                found += 1
                pbar.set_description(f"{found} / {sofar}")
                for pd in passdoc:
                    qrels.append(pd)
                    # Assumed completion (the original snippet is truncated
                    # here): write one TREC-style qrels row per mapped pair,
                    # i.e. qid, iteration, docid, relevance.
                    out.write("%s 0 %s %s\n" % tuple(pd))
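# okapi.weighting above is a project-local scoring model. Whoosh's stock
# equivalent is scoring.BM25F, which could be swapped in as sketched here;
# the B and K1 values shown are simply Whoosh's defaults, and the query
# string is a placeholder:
from whoosh import scoring

with docix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
    hits = s.search(docqp.parse("example passage text"), limit=5)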