def suggestions_and_scores(self, text, weighting=None):
    """Return alternative spellings of *text* as (word, score, weight) triples.

    'word' is a suggested word, 'score' is the value assigned to it via
    :meth:`SpellChecker.add_field` / :meth:`SpellChecker.add_scored_words`,
    and 'weight' is the score the word earned in the ngram search for the
    original word. Words must have been added to the dictionary first.

    This is the low-level API for custom ranking; most callers want
    :meth:`~SpellChecker.suggest`, which returns only the top N words.

    :param text: the word to check.
    :param weighting: whoosh scoring object; defaults to TF_IDF().
    :rtype: list
    """
    if weighting is None:
        weighting = TF_IDF()

    # Collect the ngrams of every configured size for the checked word,
    # keyed by the index field that stores that size ("gram3", "gram4", ...).
    ngrams = defaultdict(list)
    for n in xrange(self.mingram, self.maxgram + 1):
        fieldname = "gram%s" % n
        analyzer = analysis.NgramAnalyzer(n)
        ngrams[fieldname].extend(token.text for token in analyzer(text))

    # Build one big OR query: boost the first/last gram of each size via the
    # dedicated start*/end* fields, plus a plain term per interior gram.
    # Sizes longer than the word itself produce no grams and are skipped.
    subqueries = []
    for n in xrange(self.mingram, min(self.maxgram + 1, len(text))):
        fieldname = "gram%s" % n
        gramlist = ngrams[fieldname]
        subqueries.append(
            query.Term("start%s" % n, gramlist[0], boost=self.booststart))
        subqueries.append(
            query.Term("end%s" % n, gramlist[-1], boost=self.boostend))
        subqueries.extend(query.Term(fieldname, gram) for gram in gramlist)

    searcher = self.index().searcher(weighting=weighting)
    try:
        hits = searcher.search(query.Or(subqueries))
        # Exclude the original word itself from the suggestions.
        return [(stored["word"], stored["score"], hits.score(pos))
                for pos, stored in enumerate(hits)
                if stored["word"] != text]
    finally:
        searcher.close()
def get_searcher(index=INDEX, score_by="BM25F"):
    """
    get_searcher([index=INDEX, score_by="BM25F"])

    Return a searcher over *index*.

    ``score_by`` selects the (query, document) scoring function. Supported
    values (case-insensitive): "TF-IDF" (also accepted as "TFIDF") and
    "BM25F". Any other value falls back to BM25F.

    .. code-block:: python

        >>> from searcher import get_searcher
        >>> from index import get_index
        >>>
        >>> idx = get_index()
        >>> searcher = get_searcher(idx, score_by="TF-IDF")
        >>>

    :param index: document index.
    :type index: FileIndex
    :param score_by: name of the scoring function for the (query, document)
        pair.
    :type score_by: str
    :returns: Searcher
    :raises ImportError: if whoosh's scoring module is unavailable.
    """
    # Let a failed import propagate: the original swallowed ImportError and
    # then crashed with a NameError when the scoring classes were used.
    from whoosh.scoring import BM25F, TF_IDF

    # Normalize so the comparison is case-insensitive.
    score_by = score_by.upper()

    # Choose the scoring function; BM25F is the default for unrecognized
    # names (the original left score_function unbound and raised
    # UnboundLocalError for anything but "TF-IDF"/"BM25F").
    if score_by in ("TF-IDF", "TFIDF"):
        score_function = TF_IDF()
    else:
        score_function = BM25F()

    return index.searcher(weighting=score_function)
def search_index(query, score_func_name, dirname):
    """Search the index in *dirname* for *query* and return the results.

    :param query: raw user query string, parsed against the "content" field.
    :param score_func_name: one of 'ok', 'bm25f', 'pln', 'tfidf', 'freq';
        any other value falls back to OkBM25.
    :param dirname: directory containing the whoosh index.
    :returns: whoosh Results (searcher is intentionally left open, since the
        caller iterates the results lazily).
    """
    schema = get_schema()
    ix = index.open_dir(dirname, schema=schema)

    # OR the query terms together, damping each extra term's contribution.
    parser = QueryParser("content", schema=schema, group=OrGroup.factory(0.9))
    q = parser.parse(query)

    # Dispatch table instead of an if/elif chain; the original also built a
    # throwaway OkBM25() that was usually discarded immediately.
    scoring_models = {
        'ok': OkBM25,
        'bm25f': BM25F,
        'pln': PLN,
        'tfidf': TF_IDF,
        'freq': Frequency,
    }
    score_func = scoring_models.get(score_func_name, OkBM25)()

    searcher = ix.searcher(weighting=score_func)
    results = searcher.search(q, limit=None)
    results.fragmenter.surround = 100
    return results
def scorer(self, searcher, fieldname, text, qf=1):
    """Build the combined scorer for one (field, term) pair.

    Delegates to fresh TF_IDF and BM25F scorers and wraps them in this
    weighting's composite Scorer.
    """
    tfidf_scorer = TF_IDF().scorer(searcher, fieldname, text, qf)
    bm25_scorer = BM25F().scorer(searcher, fieldname, text, qf)
    return self.Scorer(tfidf_scorer, bm25_scorer)
# NOTE(review): fragment — the "if" branch matching this "else" is outside
# this chunk; presumably it selects qparser.OrGroup. TODO confirm.
else:
    op_type = qparser.AndGroup
dirname = "indexdir"
ix = open_dir(dirname)
# Parse the user input across every indexed field with the chosen boolean
# grouping; the +/- plugin enables required/prohibited term syntax.
qp = qparser.MultifieldParser(
    ['content', 'path', 'title', 'head1', 'head2', 'head3', 'head4'],
    ix.schema,
    group=op_type)
qp.add_plugin(qparser.PlusMinusPlugin)
query = qp.parse(search_input)
# print(query)
# Pick the scoring model: anything other than "TFIDF" uses BM25F with the
# same B/K1 parameters as the explicit "BM25" case.
if search_type == "BM25":
    w = BM25F(B=0.75, K1=1.5)
elif search_type == "TFIDF":
    w = TF_IDF()
else:
    w = BM25F(
        B=0.75,
        K1=1.5,
    )
with ix.searcher(weighting=w) as searcher:
    results = searcher.search(query, terms=True)
    # Short context snippets around the matched terms for display.
    results.fragmenter = highlight.ContextFragmenter(
        maxchars=50,
        surround=50,
    )
    # print(list(searcher.lexicon("content")))
    found_doc_num = results.scored_length()
    run_time = results.runtime
# -------------------------------for html use---------------------------------
# NOTE(review): fragment — the enclosing function (and the loop that defines
# i, q, relevant, precision, x and sumMeanAveragePrecision) starts outside
# this chunk. TODO confirm how these first statements are nested.
precision[i] = precision[i-1]
precision.append(precision[9])
precision.reverse()
# Accumulate the interpolated precision at each recall level across queries.
for i in range(len(precision)):
    x[i] += precision[i]
# 11-point interpolated average precision for this query.
sumMeanAveragePrecision += sum(precision) / 11
# print(q, ":", precision, relevant)
# print([i/30 for i in x])
# Averages are over the 30 evaluated queries — presumably fixed by the
# evaluation data set; verify against the caller.
return (sumMeanAveragePrecision / 30), [i/30 for i in x]


if __name__ == '__main__':
    # Compare a field-weighted BM25F configuration against plain TF-IDF on
    # both NDCG and MAP, plotting each result.
    ix = index.open_dir("indexdir/index")
    wBM25 = scoring.BM25F(B=0.75, title_B=1.0, body_B=0.5, K1=1.7)
    wtf = TF_IDF()
    m, y = ndcg_evaluation(wBM25)
    print("NDCG EVALUATION con BM25:", m)
    NDCG_graphic(y)
    m, y = ndcg_evaluation(wtf)
    print("NDCG EVALUATION con TF_IDF:", m)
    NDCG_graphic(y, 'TF_IDF')
    print()
    # Standard 11-point recall levels for the precision/recall graphic.
    x = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    m, y = map_evaluation(wBM25)
    print("MAP EVALUATION con BM25", m)
    MAP_graphic(x, y)
    m, y = map_evaluation(wtf)
    print("MAP EVALUATION con TF_IDF", m)
    MAP_graphic(x, y, 'TF_IDF')
# NOTE(review): fragment — these first tokens close a writer.add_document(...)
# call whose opening (and the matching "try") lies outside this chunk, as does
# the loop defining writer, filename and f_string. TODO confirm.
                    title=filename[:-4], content=f_string)
# HACK: bare except silently retries with path=u'None' — narrow the exception.
except:
    writer.add_document(path=u'None', title=filename[:-4],
                        content=f_string)
writer.commit()

# Parse a hard-coded multi-field OR query over the freshly built index.
qp = qparser.MultifieldParser(['content', 'path', 'title'],
                              ix.schema,
                              group=qparser.OrGroup)
query = qp.parse("transgenic growth ")
# print(query)

# Candidate scoring models; only Frequency (f) is actually used below.
b = BM25F(B=0.75, K1=1.5)
t = TF_IDF()
f = Frequency()
with ix.searcher(weighting=f) as searcher:
    results = searcher.search(
        query,
        terms=True,
    )
    # Context snippets around the matched terms for display.
    results.fragmenter = highlight.ContextFragmenter(
        maxchars=50,
        surround=90,
    )
    if results:
        for hit in results:
            snip = hit.highlights('content')
            title = hit['title']