Example #1
    def suggestions_and_scores(self, text, weighting=None):
        if weighting is None:
            weighting = scoring.TF_IDF()

        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(
                query.Term("start%s" % size,
                           gramlist[0],
                           boost=self.booststart))
            queries.append(
                query.Term("end%s" % size, gramlist[-1], boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        try:
            result = s.search(q, limit=None)
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result) if fs["word"] != text]
        finally:
            s.close()
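The gram-building loop above hinges on what analysis.NgramAnalyzer emits for a single word. A minimal sketch of that behaviour (the printed output is what the analyzer is expected to produce for a fixed gram size; verify against your Whoosh version):

from whoosh import analysis

# NgramAnalyzer(3) splits the input into overlapping character 3-grams and
# lowercases them, which is why the loops above key grams by size.
nga = analysis.NgramAnalyzer(3)
print([t.text for t in nga(u"Whoosh")])
# Expected: ['who', 'hoo', 'oos', 'osh']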
Example #2
    def suggest(self, text, number=3, usescores=False):
        """Returns a list of suggested alternative spellings of 'text'. You must
        add words to the dictionary (using add_field, add_words, and/or add_scored_words)
        before you can use this.
        
        :param text: The word to check.
        :param number: The maximum number of suggestions to return.
        :param usescores: Use the per-word score to influence the suggestions.
        :rtype: list
        """

        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(
                query.Term("start%s" % size,
                           gramlist[0],
                           boost=self.booststart))
            queries.append(
                query.Term("end%s" % size, gramlist[-1], boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()

        s = searching.Searcher(ix)
        try:
            results = s.search(q)

            length = len(results)
            if len(results) > number * 2:
                length = len(results) // 2
            fieldlist = results[:length]

            suggestions = [(fs["word"], fs["score"]) for fs in fieldlist
                           if fs["word"] != text]

            if usescores:
                def keyfn(a):
                    return 0 - (1 / distance(text, a[0])) * a[1]
            else:
                def keyfn(a):
                    return distance(text, a[0])

            suggestions.sort(key=keyfn)
        finally:
            s.close()

        return [word for word, _ in suggestions[:number]]
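A minimal usage sketch for suggest(), assuming the old whoosh.spelling.SpellChecker API that these methods belong to; the index directory and the word list are placeholders:

from whoosh import index, spelling

# Open an existing index and reuse its storage for the spelling dictionary --
# the constructor argument is an assumption based on the old SpellChecker API.
ix = index.open_dir("indexdir")
speller = spelling.SpellChecker(ix.storage)

# Words must be added (add_words / add_field / add_scored_words) before
# suggest() can return anything useful.
speller.add_words(["whoosh", "search", "spelling"])

print(speller.suggest("whosh", number=3))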
Example #3
File: spelling.py  Project: ra2003/whoosh
    def suggestions_and_scores(self, text, weighting=None):
        """Returns a list of possible alternative spellings of 'text', as
        ('word', score, weight) triples, where 'word' is the suggested
        word, 'score' is the score that was assigned to the word using
        :meth:`SpellChecker.add_field` or :meth:`SpellChecker.add_scored_words`,
        and 'weight' is the score the word received in the search for the
        original word's ngrams.
        
        You must add words to the dictionary (using add_field, add_words,
        and/or add_scored_words) before you can use this.
        
        This is a lower-level method, in case an expert user needs access to
        the raw scores, for example to implement a custom suggestion ranking
        algorithm. Most people will want to call :meth:`~SpellChecker.suggest`
        instead, which simply returns the top N valued words.
        
        :param text: The word to check.
        :rtype: list
        """

        if weighting is None:
            weighting = TF_IDF()

        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(
                query.Term("start%s" % size,
                           gramlist[0],
                           boost=self.booststart))
            queries.append(
                query.Term("end%s" % size, gramlist[-1], boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        try:
            result = s.search(q)
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result) if fs["word"] != text]
        finally:
            s.close()
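Since the docstring points expert users at the raw (word, score, weight) triples for custom ranking, here is a hedged sketch of one such ranking; the distance() stand-in and the sort key are illustrative, not part of Whoosh:

from difflib import SequenceMatcher

def distance(a, b):
    # Stand-in for the edit-distance function used elsewhere in these
    # examples; smaller means more similar.
    return 1.0 - SequenceMatcher(None, a, b).ratio()

def ranked_suggestions(speller, text, limit=5):
    triples = speller.suggestions_and_scores(text)
    # Sort by edit distance first, then break ties with the weight the word
    # earned in the n-gram search (illustrative combination).
    triples.sort(key=lambda t: (distance(text, t[0]), -t[2]))
    return [word for word, score, weight in triples[:limit]]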
Example #4
    def add_scored_words(self, ws):
        writer = self.index().writer()
        for text, score in ws:
            fields = {"word": text, "score": score}
            for size in xrange(self.mingram, self.maxgram + 1):
                nga = analysis.NgramAnalyzer(size)
                gramlist = [t.text for t in nga(text)]
                if len(gramlist) > 0:
                    fields["start%s" % size] = gramlist[0]
                    fields["end%s" % size] = gramlist[-1]
                    fields["gram%s" % size] = " ".join(gramlist)
            writer.add_document(**fields)
        writer.commit()
Example #5
    def add_scored_words(self, ws):
        """Adds a list of ("word", score) tuples to the backend dictionary.
        Associating words with a score lets you use the 'usescores' keyword
        argument of the suggest() method to order the suggestions using the
        scores.
        
        :param ws: A sequence of ("word", score) tuples.
        """

        writer = self.index().writer()
        for text, score in ws:
            fields = {"word": text, "score": score}
            for size in xrange(self.mingram, self.maxgram + 1):
                nga = analysis.NgramAnalyzer(size)
                gramlist = [t.text for t in nga(text)]
                if len(gramlist) > 0:
                    fields["start%s" % size] = gramlist[0]
                    fields["end%s" % size] = gramlist[-1]
                    fields["gram%s" % size] = " ".join(gramlist)
            writer.add_document(**fields)
        writer.commit()
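A short usage sketch for add_scored_words() together with usescores, continuing the speller object from the earlier sketch; the (word, score) pairs are made up:

# Hypothetical (word, score) pairs, e.g. corpus frequencies.
speller.add_scored_words([("whoosh", 120), ("search", 300), ("spelling", 45)])

# With usescores=True, suggest() folds the stored score into its ranking
# (see keyfn in the suggest() example above).
print(speller.suggest("serch", number=3, usescores=True))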
Example #6
    def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
                 queryor=False, phrase=False, sortable=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of text,
            you should avoid storing it with the document unless you need to,
            for example to allow fast excerpts in the search results.
        :param queryor: if True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        :param phrase: store positions on the N-grams to allow exact phrase
            searching. The default is off.
        """

        formatclass = formats.Frequency
        if phrase:
            formatclass = formats.Positions

        self.analyzer = analysis.NgramAnalyzer(minsize, maxsize)
        self.format = formatclass(field_boost=field_boost)
        self.stored = stored
        self.queryor = queryor
        self.set_sortable(sortable)
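This constructor appears to be the n-gram field type in whoosh.fields (NGRAM). A minimal schema sketch using it; the field names and directory are illustrative:

import os
from whoosh.fields import Schema, ID, NGRAM
from whoosh.index import create_in

# queryor=True ORs the query's n-grams together instead of requiring all of
# them, trading precision for recall on partial matches.
schema = Schema(path=ID(stored=True),
                body=NGRAM(minsize=2, maxsize=4, queryor=True))

os.mkdir("ngram_index")              # create_in expects an existing directory
ix = create_in("ngram_index", schema)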
Example #7
                        content=TEXT(stored=False, analyzer=selected_analyzer))

        # Create an empty-Index
        os.mkdir(dir_index_cran +
                 schema_type)  # --> Create folder for Cran Index
        os.mkdir(dir_index_time +
                 schema_type)  # --> Create folder for Time Index

        temp_dir_cran = dir_index_cran + schema_type
        temp_dir_time = dir_index_time + schema_type
        create_in(temp_dir_cran, schema)
        create_in(temp_dir_time, schema)

    elif schema_type == '\\Ngram':

        selected_analyzer = analysis.NgramAnalyzer(3)

        # Create a Schema
        schema = Schema(id=ID(stored=True),
                        title=TEXT(stored=False, analyzer=selected_analyzer),
                        content=TEXT(stored=False, analyzer=selected_analyzer))

        # Create an empty-Index
        os.mkdir(dir_index_cran +
                 schema_type)  # --> Create folder for Cran Index
        os.mkdir(dir_index_time +
                 schema_type)  # --> Create folder for Time Index

        temp_dir_cran = dir_index_cran + schema_type
        temp_dir_time = dir_index_time + schema_type
        create_in(temp_dir_cran, schema)
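To round the snippet off, a hedged sketch of opening one of the indexes created above and querying it; temp_dir_cran is carried over from the snippet and the query text is a placeholder:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir(temp_dir_cran)   # whichever index directory was created above

with ix.searcher() as searcher:
    # The NgramAnalyzer(3) on 'content' means the query text is split into
    # the same 3-grams before matching.
    parser = QueryParser("content", schema=ix.schema)
    results = searcher.search(parser.parse("boundary layer"), limit=10)
    for hit in results:
        print(hit["id"])       # only 'id' is stored in these schemas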