Example #1
class Index:
	def __init__(self, path='~/Music/iTunes/iTunes Music Library.xml', folder='~/Library/Application Support/Share my tunes'):
		self.path = os.path.expanduser(path)
		self.schema = Schema(
			trackId = ID(stored=True),
			name=TEXT(stored=True),
			artist=TEXT(stored=True),
			album=TEXT(stored=True),
			genre=KEYWORD(stored=True),
			location=STORED,
			trackNumber=STORED,
			bitRate=ID(stored=True),
			artwork=KEYWORD(stored=True)
			)
		self.parser = MultifieldParser(["name", "album", "artist"], schema=self.schema)
		self.folder = "%s/index" % os.path.expanduser(folder)
		self.empty = not whoosh.index.exists_in(self.folder)
		self.ix = None
	def index(self):
		if self.empty:
			if not os.path.exists(self.folder):
				os.makedirs(self.folder)
			st = FileStorage(self.folder)
			ix = st.create_index(self.schema)
			w = ix.writer()
			w.add_document(name=u"beuha")  # dummy test document
			pipe = file.ID3Filter()
			#[TODO] using itunes info for artwork?
			cpt = 0
			for track in pipe(ItunesParser(self.path)):
				if track['album'] is not None:
					album = track['album'].encode('ascii', 'ignore')
				else:
					album = ""
				#print track['artwork'], "[%s]" % album, track['name'].encode('ascii', 'ignore')
				if cpt % 20 == 0:
					print "\n%i " %cpt,
				print '#',
				#print track['album'], track['name']
				w.add_document(
					trackId=track['trackId'], name=track['name'],
					artist=track['artist'], album=track['album'],
					genre=track['genre'], location=track['location'],
					artwork=bool(track['artwork']),
					trackNumber=track['trackNumber'], bitRate=track['bitRate']
				)
				#if cpt % 100 == 1:
				#	w.commit()
				cpt += 1
			print "\n\n%i tracks indexed" % cpt
			w.commit()
			ix.optimize()
			ix.close()
		else:
			print "already indexed"
	def query(self, query):
		if self.ix is None:
			self.ix = FileStorage(self.folder).open_index()
		q = self.parser.parse(query)
		return self.ix.searcher().search(q, sortedby=("album", "name"), limit=None)
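The example above is Python 2-era code (note the print statements), but the FileStorage workflow it demonstrates is the same in current Whoosh: build an index from a Schema, add documents through a writer, then reopen the index and query it with a MultifieldParser. Below is a minimal, self-contained sketch of that round trip; the directory path and field values are placeholders rather than parts of the original project.

# Minimal sketch of the create/write/search cycle used by the Index class above.
import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import MultifieldParser

schema = Schema(trackId=ID(stored=True), name=TEXT(stored=True), artist=TEXT(stored=True))

index_dir = "/tmp/demo_index"          # placeholder location
os.makedirs(index_dir, exist_ok=True)  # FileStorage expects an existing directory

storage = FileStorage(index_dir)
ix = storage.create_index(schema)

writer = ix.writer()
writer.add_document(trackId=u"1", name=u"Example Track", artist=u"Example Artist")
writer.commit()

# Reopen later and search across several fields at once.
ix = FileStorage(index_dir).open_index()
parser = MultifieldParser(["name", "artist"], schema=ix.schema)
with ix.searcher() as searcher:
    for hit in searcher.search(parser.parse(u"example"), limit=None):
        print(hit["name"], hit["artist"])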
Example #2
def buscar_animesporsinopsis(request):
    formulario = BusquedaPorSinopsisForm(request.POST)
    lista_animes = []
    
    if formulario.is_valid():
        ix = FileStorage(dirindex).open_index()
        query = QueryParser("sinopsis", ix.schema, group=qparser.AndGroup).parse(formulario.cleaned_data['sinopsis'])
        with ix.searcher() as searcher:
            results = searcher.search(query)
            for r in results:
                lista_animes.append([
                    r['titulo'], r['imagen'], r['rango_web'], r['popularidad'],
                    r['fecha_inicio'], r['fecha_final'], r['episodios'],
                    r['sinopsis'], r['generos'],
                ])
                
    return render(request, 'animesbusquedaporsinopsis.html', {'formulario':formulario, 'animes':lista_animes})
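The view above opens an existing index, parses the user's text against the "sinopsis" field with an AndGroup parser (every term must match), and copies the stored fields of each hit into a list for the template. A condensed sketch of just the query path, assuming the index directory already exists:

# Minimal sketch of the AndGroup query used in buscar_animesporsinopsis above.
from whoosh import qparser
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import QueryParser

dirindex = "/path/to/anime_index"  # placeholder; dirindex is defined elsewhere in the original project
ix = FileStorage(dirindex).open_index()
query = QueryParser("sinopsis", ix.schema, group=qparser.AndGroup).parse(u"robots espacio")  # placeholder text

with ix.searcher() as searcher:
    for hit in searcher.search(query):
        print(hit["titulo"], hit["episodios"])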
Example #3
class TranslationMemory(object):
    def __init__(self):
        self.index = FileStorage(data_dir('memory')).open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    @contextlib.contextmanager
    def writer(self):
        writer = self.index.writer()
        try:
            yield writer
        finally:
            writer.commit()

    def get_language_code(self, code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    def import_tmx(self, fileobj, langmap=None):
        origin = force_text(os.path.basename(fileobj.name))
        storage = tmxfile.parsefile(fileobj)
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = self.get_language_code(source_language_code, langmap)

        languages = {}
        with self.writer() as writer:
            for unit in storage.units:
                # Parse translations (translate-toolkit does not care about
                # languages here, it just picks first and second XML elements)
                translations = {}
                for node in unit.getlanguageNodes():
                    lang, text = get_node_data(unit, node)
                    translations[lang] = text
                    if lang not in languages:
                        languages[lang] = self.get_language_code(lang, langmap)

                try:
                    source = translations.pop(source_language_code)
                except KeyError:
                    # Skip if source language is not present
                    continue

                for lang, text in translations.items():
                    writer.add_document(
                        source_language=source_language,
                        target_language=languages[lang],
                        source=source,
                        target=text,
                        origin=origin,
                        category=CATEGORY_FILE,
                    )

    def lookup(self, source_language, target_language, text):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )

        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'], match['target'], similarity, match['origin']
            )

    def delete(self, origin):
        """Delete entries by origin."""
        with self.writer() as writer:
            return writer.delete_by_term('origin', origin)

    def empty(self):
        """Recreates translation memory."""
        self.index = setup_index()
        self.searcher = None

    def get_origins(self):
        self.open_searcher()
        return [
            force_text(x) for x in self.searcher.lexicon('origin')
        ]
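The TranslationMemory parser above combines two Whoosh features worth noting: OrGroup.factory(0.9) ORs the query terms while boosting documents that match more of them, and termclass=query.FuzzyTerm matches terms within a small edit distance, so minor spelling differences still hit. A minimal sketch of that lookup against a hypothetical memory index (the path and language codes are placeholders):

# Minimal sketch of the fuzzy, OR-grouped lookup used by TranslationMemory above.
from whoosh import qparser, query
from whoosh.filedb.filestore import FileStorage

ix = FileStorage("/path/to/memory_index").open_index()  # placeholder path
parser = qparser.QueryParser(
    "source",
    schema=ix.schema,
    group=qparser.OrGroup.factory(0.9),  # OR the terms, favour hits that match more of them
    termclass=query.FuzzyTerm,           # tolerate small spelling differences
)

langfilter = query.And([
    query.Term("source_language", "en"),  # placeholder language codes
    query.Term("target_language", "cs"),
])

with ix.searcher() as searcher:
    for hit in searcher.search(parser.parse(u"translation memory"), filter=langfilter, limit=20):
        print(hit["source"], "->", hit["target"])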
Example #4
parser.add_argument("-add_bm25", action="store_true")
args = parser.parse_args()

from singletons import PREPROCESS, SEARCHER

run_id = args.model
if args.model == "clusvm":
    clusvm = util.load("clusvm.pkl")
elif "sv" in args.model:
    svm = util.load(f"{args.svm_file}.pkl")
elif args.model == "adarank":
    alpha = np.load("ada.npy")
else:
    ix = FileStorage("data/msmarcoidx").open_index()
    if args.model == "bm25":
        SEARCHER = ix.searcher()
    qp = QueryParser("body", schema=ix.schema)


def predict(inp):
    qid, query = inp
    ret = []

    if args.model == "okapi" or args.model == "bm25":
        results = SEARCHER.search(qp.parse(query), limit=args.limit)
        for rank, hit in enumerate(results):
            ret.append([qid, hit["docid"], rank + 1, results.score(rank), run_id])

    elif args.model == "clusvm":
        query = [token.text for token in PREPROCESS(query)]
        queryfeats = get_query_features(query)
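In the snippet above, the "bm25" branch lets the Whoosh searcher do the ranking itself (BM25F is Whoosh's default scoring), and predict() turns each hit into a (qid, docid, rank, score, run_id) row. A stripped-down sketch of that branch follows; the query text and ids are placeholders:

# Minimal sketch of the BM25 branch of predict() above.
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import QueryParser

ix = FileStorage("data/msmarcoidx").open_index()  # index path from the example
qp = QueryParser("body", schema=ix.schema)

with ix.searcher() as searcher:
    results = searcher.search(qp.parse(u"what is a translation memory"), limit=10)  # placeholder query
    for rank, hit in enumerate(results):
        print("Q1", hit["docid"], rank + 1, results.score(rank), "bm25")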
Example #5
def retrieve(inp):
    qid, pid, relevance = inp
    ret = []
    closestqueries = qs.search(qqp.parse(querytext[qid]), limit=5)
    closestdocs = s.search(docqp.parse(passagetext[pid]), limit=5)
    if len(closestqueries) == 0 or len(closestdocs) == 0:
        return None
    for q, d in zip(closestqueries, closestdocs):
        ret.append([q["qid"], d["docid"], relevance])
    return ret


qrels = []
with open("data/thesis_dataset_graded_relevance.tsv", "rt") as grelfile, \
        docix.searcher(weighting=okapi.weighting) as s, \
        queryix.searcher(weighting=okapi.weighting) as qs, \
        open("data/graded-qrels.txt", "w") as out:
    with mp.Pool(processes=24) as pool:
        with tqdm(total=800) as pbar:
            found, sofar = 0, 0
            for passdoc in pool.imap_unordered(
                    retrieve, csv.reader(grelfile, delimiter="\t")):
                sofar += 1
                pbar.set_description(f"{found} / {sofar}")
                pbar.update()
                if passdoc is None:
                    continue
                found += 1
                pbar.set_description(f"{found} / {sofar}")
                for pd in passdoc: