def suggestions_and_scores(self, text, weighting=None):
    """Return alternative spellings of *text* as (word, score, weight) triples.

    'word' is a suggested word, 'score' is the value assigned to it via
    :meth:`SpellChecker.add_field` / :meth:`SpellChecker.add_scored_words`,
    and 'weight' is the score the word earned in the ngram search for the
    original word. Words must have been added to the dictionary first.

    This is the low-level API for custom ranking; most callers want
    :meth:`~SpellChecker.suggest`, which returns only the top N words.

    :param text: the word to check.
    :param weighting: whoosh scoring object; defaults to TF_IDF().
    :rtype: list
    """
    if weighting is None:
        weighting = TF_IDF()

    # Collect the ngrams of every configured size for the checked word,
    # keyed by the index field that stores that size ("gram3", "gram4", ...).
    ngrams = defaultdict(list)
    for n in xrange(self.mingram, self.maxgram + 1):
        fieldname = "gram%s" % n
        analyzer = analysis.NgramAnalyzer(n)
        ngrams[fieldname].extend(token.text for token in analyzer(text))

    # Build one big OR query: boost the first/last gram of each size via the
    # dedicated start*/end* fields, plus a plain term per interior gram.
    # Sizes longer than the word itself produce no grams and are skipped.
    subqueries = []
    for n in xrange(self.mingram, min(self.maxgram + 1, len(text))):
        fieldname = "gram%s" % n
        gramlist = ngrams[fieldname]
        subqueries.append(
            query.Term("start%s" % n, gramlist[0], boost=self.booststart))
        subqueries.append(
            query.Term("end%s" % n, gramlist[-1], boost=self.boostend))
        subqueries.extend(query.Term(fieldname, gram) for gram in gramlist)

    searcher = self.index().searcher(weighting=weighting)
    try:
        hits = searcher.search(query.Or(subqueries))
        # Exclude the original word itself from the suggestions.
        return [(stored["word"], stored["score"], hits.score(pos))
                for pos, stored in enumerate(hits)
                if stored["word"] != text]
    finally:
        searcher.close()
def get_searcher(index=INDEX, score_by="BM25F"):
    """
    get_searcher([index=INDEX, score_by="BM25F"])

    Return a searcher over *index*.

    ``score_by`` selects the (query, document) scoring function. Supported
    values (case-insensitive): "TF-IDF" (also accepted as "TFIDF") and
    "BM25F". Any other value falls back to BM25F.

    .. code-block:: python

        >>> from searcher import get_searcher
        >>> from index import get_index
        >>>
        >>> idx = get_index()
        >>> searcher = get_searcher(idx, score_by="TF-IDF")
        >>>

    :param index: document index.
    :type index: FileIndex
    :param score_by: name of the scoring function for the (query, document)
        pair.
    :type score_by: str
    :returns: Searcher
    :raises ImportError: if whoosh's scoring module is unavailable.
    """
    # Let a failed import propagate: the original swallowed ImportError and
    # then crashed with a NameError when the scoring classes were used.
    from whoosh.scoring import BM25F, TF_IDF

    # Normalize so the comparison is case-insensitive.
    score_by = score_by.upper()

    # Choose the scoring function; BM25F is the default for unrecognized
    # names (the original left score_function unbound and raised
    # UnboundLocalError for anything but "TF-IDF"/"BM25F").
    if score_by in ("TF-IDF", "TFIDF"):
        score_function = TF_IDF()
    else:
        score_function = BM25F()

    return index.searcher(weighting=score_function)
def search_index(query, score_func_name, dirname):
    """Search the index in *dirname* for *query* and return the results.

    :param query: raw user query string, parsed against the "content" field.
    :param score_func_name: one of 'ok', 'bm25f', 'pln', 'tfidf', 'freq';
        any other value falls back to OkBM25.
    :param dirname: directory containing the whoosh index.
    :returns: whoosh Results (searcher is intentionally left open, since the
        caller iterates the results lazily).
    """
    schema = get_schema()
    ix = index.open_dir(dirname, schema=schema)

    # OR the query terms together, damping each extra term's contribution.
    parser = QueryParser("content", schema=schema, group=OrGroup.factory(0.9))
    q = parser.parse(query)

    # Dispatch table instead of an if/elif chain; the original also built a
    # throwaway OkBM25() that was usually discarded immediately.
    scoring_models = {
        'ok': OkBM25,
        'bm25f': BM25F,
        'pln': PLN,
        'tfidf': TF_IDF,
        'freq': Frequency,
    }
    score_func = scoring_models.get(score_func_name, OkBM25)()

    searcher = ix.searcher(weighting=score_func)
    results = searcher.search(q, limit=None)
    results.fragmenter.surround = 100
    return results
def scorer(self, searcher, fieldname, text, qf=1):
    """Build the combined scorer for one (field, term) pair.

    Delegates to fresh TF_IDF and BM25F scorers and wraps them in this
    weighting's composite Scorer.
    """
    tfidf_scorer = TF_IDF().scorer(searcher, fieldname, text, qf)
    bm25_scorer = BM25F().scorer(searcher, fieldname, text, qf)
    return self.Scorer(tfidf_scorer, bm25_scorer)
# NOTE(review): fragment — the "if" branch matching this "else" is outside
# this chunk; presumably it selects qparser.OrGroup. TODO confirm.
else:
    op_type = qparser.AndGroup
dirname = "indexdir"
ix = open_dir(dirname)
# Parse the user input across every indexed field with the chosen boolean
# grouping; the +/- plugin enables required/prohibited term syntax.
qp = qparser.MultifieldParser(
    ['content', 'path', 'title', 'head1', 'head2', 'head3', 'head4'],
    ix.schema,
    group=op_type)
qp.add_plugin(qparser.PlusMinusPlugin)
query = qp.parse(search_input)
# print(query)
# Pick the scoring model: anything other than "TFIDF" uses BM25F with the
# same B/K1 parameters as the explicit "BM25" case.
if search_type == "BM25":
    w = BM25F(B=0.75, K1=1.5)
elif search_type == "TFIDF":
    w = TF_IDF()
else:
    w = BM25F(
        B=0.75,
        K1=1.5,
    )
with ix.searcher(weighting=w) as searcher:
    results = searcher.search(query, terms=True)
    # Short context snippets around the matched terms for display.
    results.fragmenter = highlight.ContextFragmenter(
        maxchars=50,
        surround=50,
    )
    # print(list(searcher.lexicon("content")))
    found_doc_num = results.scored_length()
    run_time = results.runtime
# -------------------------------for html use---------------------------------
# NOTE(review): fragment — the enclosing function (and the loop that defines
# i, q, relevant, precision, x and sumMeanAveragePrecision) starts outside
# this chunk. TODO confirm how these first statements are nested.
precision[i] = precision[i-1]
precision.append(precision[9])
precision.reverse()
# Accumulate the interpolated precision at each recall level across queries.
for i in range(len(precision)):
    x[i] += precision[i]
# 11-point interpolated average precision for this query.
sumMeanAveragePrecision += sum(precision) / 11
# print(q, ":", precision, relevant)
# print([i/30 for i in x])
# Averages are over the 30 evaluated queries — presumably fixed by the
# evaluation data set; verify against the caller.
return (sumMeanAveragePrecision / 30), [i/30 for i in x]


if __name__ == '__main__':
    # Compare a field-weighted BM25F configuration against plain TF-IDF on
    # both NDCG and MAP, plotting each result.
    ix = index.open_dir("indexdir/index")
    wBM25 = scoring.BM25F(B=0.75, title_B=1.0, body_B=0.5, K1=1.7)
    wtf = TF_IDF()
    m, y = ndcg_evaluation(wBM25)
    print("NDCG EVALUATION con BM25:", m)
    NDCG_graphic(y)
    m, y = ndcg_evaluation(wtf)
    print("NDCG EVALUATION con TF_IDF:", m)
    NDCG_graphic(y, 'TF_IDF')
    print()
    # Standard 11-point recall levels for the precision/recall graphic.
    x = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    m, y = map_evaluation(wBM25)
    print("MAP EVALUATION con BM25", m)
    MAP_graphic(x, y)
    m, y = map_evaluation(wtf)
    print("MAP EVALUATION con TF_IDF", m)
    MAP_graphic(x, y, 'TF_IDF')
# NOTE(review): fragment — these first tokens close a writer.add_document(...)
# call whose opening (and the matching "try") lies outside this chunk, as does
# the loop defining writer, filename and f_string. TODO confirm.
                    title=filename[:-4], content=f_string)
# HACK: bare except silently retries with path=u'None' — narrow the exception.
except:
    writer.add_document(path=u'None', title=filename[:-4],
                        content=f_string)
writer.commit()

# Parse a hard-coded multi-field OR query over the freshly built index.
qp = qparser.MultifieldParser(['content', 'path', 'title'],
                              ix.schema,
                              group=qparser.OrGroup)
query = qp.parse("transgenic growth ")
# print(query)

# Candidate scoring models; only Frequency (f) is actually used below.
b = BM25F(B=0.75, K1=1.5)
t = TF_IDF()
f = Frequency()
with ix.searcher(weighting=f) as searcher:
    results = searcher.search(
        query,
        terms=True,
    )
    # Context snippets around the matched terms for display.
    results.fragmenter = highlight.ContextFragmenter(
        maxchars=50,
        surround=90,
    )
    if results:
        for hit in results:
            snip = hit.highlights('content')
            title = hit['title']