def suggestions_and_scores(self, text, weighting=None):
    """Return possible alternative spellings of *text* as
    ('word', score, weight) triples.

    'score' is the value stored with the word when it was added to the
    dictionary; 'weight' is the search score the word received for the
    ngram query. Words equal to *text* itself are excluded.

    :param text: the word to check.
    :param weighting: scoring object for the ngram search; defaults to
        ``scoring.TF_IDF()``.
    :rtype: list
    """
    if weighting is None:
        weighting = scoring.TF_IDF()

    # Collect the ngrams of each configured size for the input word.
    ngrams = defaultdict(list)
    for size in xrange(self.mingram, self.maxgram + 1):
        fieldname = "gram%s" % size
        analyzer = analysis.NgramAnalyzer(size)
        ngrams[fieldname].extend(token.text for token in analyzer(text))

    # Build an OR query over all grams, boosting the first and last gram
    # of each size so shared prefixes/suffixes rank higher.
    subqueries = []
    for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
        fieldname = "gram%s" % size
        gramlist = ngrams[fieldname]
        subqueries.append(query.Term("start%s" % size, gramlist[0],
                                     boost=self.booststart))
        subqueries.append(query.Term("end%s" % size, gramlist[-1],
                                     boost=self.boostend))
        subqueries.extend(query.Term(fieldname, gram) for gram in gramlist)

    searcher = self.index().searcher(weighting=weighting)
    try:
        # limit=None: callers of this low-level method want every candidate.
        hits = searcher.search(query.Or(subqueries), limit=None)
        return [(fields["word"], fields["score"], hits.score(pos))
                for pos, fields in enumerate(hits)
                if fields["word"] != text]
    finally:
        searcher.close()
def suggest(self, text, number=3, usescores=False):
    """Returns a list of suggested alternative spellings of 'text'. You
    must add words to the dictionary (using add_field, add_words, and/or
    add_scored_words) before you can use this.

    :param text: The word to check.
    :param number: The maximum number of suggestions to return.
    :param usescores: Use the per-word score to influence the
        suggestions.
    :rtype: list
    """
    # Collect the ngrams of each configured size for the input word.
    grams = defaultdict(list)
    for size in xrange(self.mingram, self.maxgram + 1):
        key = "gram%s" % size
        nga = analysis.NgramAnalyzer(size)
        for t in nga(text):
            grams[key].append(t.text)

    # Build an OR query over all grams, boosting the first and last gram
    # of each size so shared prefixes/suffixes rank higher.
    queries = []
    for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
        key = "gram%s" % size
        gramlist = grams[key]
        queries.append(query.Term("start%s" % size, gramlist[0],
                                  boost=self.booststart))
        queries.append(query.Term("end%s" % size, gramlist[-1],
                                  boost=self.boostend))
        for gram in gramlist:
            queries.append(query.Term(key, gram))

    q = query.Or(queries)
    ix = self.index()
    # Open the searcher through the index, consistent with
    # suggestions_and_scores() (was: searching.Searcher(ix)).
    s = ix.searcher()
    try:
        results = s.search(q)

        # Heuristic: when there are many hits, only re-rank the better
        # half of them by edit distance.
        length = len(results)
        if len(results) > number * 2:
            length = len(results) // 2
        fieldlist = results[:length]

        suggestions = [(fs["word"], fs["score"])
                       for fs in fieldlist
                       if fs["word"] != text]

        if usescores:
            # NOTE: 1.0 (not 1) forces true division; under Python 2 the
            # original integer division made 1 / distance == 0 for any
            # distance >= 2, so the stored score was ignored for all but
            # distance-1 candidates. distance() is never 0 here because
            # words equal to `text` were filtered out above.
            def keyfn(a):
                return 0 - (1.0 / distance(text, a[0])) * a[1]
        else:
            def keyfn(a):
                return distance(text, a[0])

        suggestions.sort(key=keyfn)
    finally:
        s.close()

    return [word for word, _ in suggestions[:number]]
def suggestions_and_scores(self, text, weighting=None):
    """Returns a list of possible alternative spellings of 'text', as
    ('word', score, weight) triples, where 'word' is the suggested
    word, 'score' is the score that was assigned to the word using
    :meth:`SpellChecker.add_field` or
    :meth:`SpellChecker.add_scored_words`, and 'weight' is the score
    the word received in the search for the original word's ngrams.

    You must add words to the dictionary (using add_field, add_words,
    and/or add_scored_words) before you can use this.

    This is a lower-level method, in case an expert user needs
    access to the raw scores, for example to implement a custom
    suggestion ranking algorithm. Most people will want to call
    :meth:`~SpellChecker.suggest` instead, which simply returns the
    top N valued words.

    :param text: The word to check.
    :rtype: list
    """
    if weighting is None:
        weighting = TF_IDF()

    # Collect the ngrams of each configured size for the input word.
    grams = defaultdict(list)
    for size in xrange(self.mingram, self.maxgram + 1):
        key = "gram%s" % size
        nga = analysis.NgramAnalyzer(size)
        for t in nga(text):
            grams[key].append(t.text)

    # Build an OR query over all grams, boosting the first and last gram
    # of each size so shared prefixes/suffixes rank higher.
    queries = []
    for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
        key = "gram%s" % size
        gramlist = grams[key]
        queries.append(query.Term("start%s" % size, gramlist[0],
                                  boost=self.booststart))
        queries.append(query.Term("end%s" % size, gramlist[-1],
                                  boost=self.boostend))
        for gram in gramlist:
            queries.append(query.Term(key, gram))

    q = query.Or(queries)
    ix = self.index()
    s = ix.searcher(weighting=weighting)
    try:
        # limit=None: without it the searcher truncates to its default
        # result limit, silently dropping candidates. This method
        # promises the full candidate list.
        result = s.search(q, limit=None)
        return [(fs["word"], fs["score"], result.score(i))
                for i, fs in enumerate(result)
                if fs["word"] != text]
    finally:
        s.close()
def add_scored_words(self, ws):
    """Index a sequence of ("word", score) pairs into the backend
    dictionary, storing each word's ngrams alongside it.
    """
    writer = self.index().writer()
    for word, score in ws:
        doc = {"word": word, "score": score}
        # For each configured ngram size, store the grams plus the
        # first/last gram in dedicated start/end fields.
        for size in xrange(self.mingram, self.maxgram + 1):
            analyzer = analysis.NgramAnalyzer(size)
            grams = [token.text for token in analyzer(word)]
            if grams:
                doc["start%s" % size] = grams[0]
                doc["end%s" % size] = grams[-1]
                doc["gram%s" % size] = " ".join(grams)
        writer.add_document(**doc)
    writer.commit()
def add_scored_words(self, ws):
    """Adds a list of ("word", score) tuples to the backend dictionary.

    Associating words with a score lets you use the 'usescores'
    keyword argument of the suggest() method to order the suggestions
    using the scores.

    :param ws: A sequence of ("word", score) tuples.
    """
    writer = self.index().writer()
    for text, score in ws:
        fields = {"word": text, "score": score}
        for size in xrange(self.mingram, self.maxgram + 1):
            nga = analysis.NgramAnalyzer(size)
            gramlist = [t.text for t in nga(text)]
            # Words shorter than `size` produce no grams; skip those
            # fields entirely rather than storing empty values.
            if not gramlist:
                continue
            fields["start%s" % size] = gramlist[0]
            fields["end%s" % size] = gramlist[-1]
            fields["gram%s" % size] = " ".join(gramlist)
        writer.add_document(**fields)
    writer.commit()
def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
             queryor=False, phrase=False, sortable=False):
    """
    :param minsize: The minimum length of the N-grams.
    :param maxsize: The maximum length of the N-grams.
    :param stored: Whether to store the value of this field with the
        document. Since this field type generally contains a lot of text,
        you should avoid storing it with the document unless you need to,
        for example to allow fast excerpts in the search results.
    :param field_boost: A multiplier applied to this field's term scores.
    :param queryor: if True, combine the N-grams with an Or query. The
        default is to combine N-grams with an And query.
    :param phrase: store positions on the N-grams to allow exact phrase
        searching. The default is off.
    :param sortable: whether the field's value can be used for sorting
        search results (passed through to set_sortable()).
    """
    # Positions are only needed (and only paid for) when phrase
    # searching was requested; otherwise term frequency is enough.
    formatclass = formats.Frequency
    if phrase:
        formatclass = formats.Positions

    self.analyzer = analysis.NgramAnalyzer(minsize, maxsize)
    self.format = formatclass(field_boost=field_boost)
    self.stored = stored
    self.queryor = queryor
    self.set_sortable(sortable)
content=TEXT(stored=False, analyzer=selected_analyzer)) # Create an empty-Index os.mkdir(dir_index_cran + schema_type) # --> Create folder for Cran Index os.mkdir(dir_index_time + schema_type) # --> Create folder for Time Index temp_dir_cran = dir_index_cran + schema_type temp_dir_time = dir_index_time + schema_type create_in(temp_dir_cran, schema) create_in(temp_dir_time, schema) elif schema_type == '\\Ngram': selected_analyzer = analysis.NgramAnalyzer(3) # Create a Schema schema = Schema(id=ID(stored=True), title=TEXT(stored=False, analyzer=selected_analyzer), content=TEXT(stored=False, analyzer=selected_analyzer)) # Create an empty-Index os.mkdir(dir_index_cran + schema_type) # --> Create folder for Cran Index os.mkdir(dir_index_time + schema_type) # --> Create folder for Time Index temp_dir_cran = dir_index_cran + schema_type temp_dir_time = dir_index_time + schema_type create_in(temp_dir_cran, schema)