def _words(self, ixreader):
    """Yield the index terms that are close enough to ``self.text``.

    Candidate terms are the expansions, in ``self.fieldname``, of the
    first ``self.prefixlength`` characters of ``self.text``.  A candidate
    is yielded when it equals the text outright, or when
    ``relative(text, candidate)`` is strictly greater than
    ``self.minsimilarity``.
    """
    target = self.text
    threshold = self.minsimilarity
    candidates = ixreader.expand_prefix(
        self.fieldname, target[:self.prefixlength])
    for candidate in candidates:
        # Short-circuit: identical terms never reach the similarity call.
        if target == candidate or relative(target, candidate) > threshold:
            yield candidate
def lookup(self, input, valid_types=[], exact_only=False):
    """Attempts to find some sort of object, given a name.

    Returns a list of named (object, name, language, iso639, iso3166,
    exact) tuples.  `object` is a database object, `name` is the name under
    which the object was found, `language` and the two isos are the name
    and country codes of the language in which the name was found, and
    `exact` is True iff this was an exact match.

    This function currently ONLY does fuzzy matching if there are no exact
    matches.

    Extraneous whitespace is removed with extreme prejudice.

    Recognizes:
    - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
    - Foreign names: "Iibui", "Eivui"
    - Fuzzy names in whatever language: "Evee", "Ibui"
    - IDs: "133", "192", "250"
    Also:
    - Type restrictions.  "type:psychic" will only return the type.  This
      is how to make ID lookup useful.  Multiple type specs can be entered
      with commas, as "move,item:1".
    - Language restrictions.  "@fr:charge" will only return Tackle, which
      is called "Charge" in French.  These can be combined with type
      restrictions, e.g., "@fr,move:charge".
    - The literal name "random", which is delegated to
      `self.random_lookup` with the merged type restrictions.

    `input`
        Name of the thing to look for.

    `valid_types`
        A list of type or language restrictions, e.g., `['pokemon',
        '@ja']`.  If this is provided, only results in one of the given
        tables will be returned.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `input` doesn't match anything exactly,
        spelling correction will be attempted.  Wildcard and numeric
        queries always behave as if this were True.
    """
    name = self.normalize_name(input)
    exact = True

    # Pop off any type prefix and merge with valid_types
    name, merged_valid_types, type_term = \
        self._apply_valid_types(name, valid_types)

    # Random lookup
    if name == 'random':
        return self.random_lookup(valid_types=merged_valid_types)

    # Do different things depending what the query looks like
    # Note: Term objects do an exact match, so we don't have to worry about
    # a query parser tripping on weird characters in the input
    try:
        # Let Python try to convert to a number, so 0xff works
        name_as_number = int(name, base=0)
    except ValueError:
        # Oh well
        name_as_number = None

    if '*' in name or '?' in name:
        exact_only = True
        query = whoosh.query.Wildcard(u'name', name)
    elif name_as_number is not None:
        # Don't spell-check numbers!
        exact_only = True
        # NOTE(review): `unicode` is a Python 2 builtin; confirm the
        # target interpreter before porting this block.
        query = whoosh.query.Term(u'row_id', unicode(name_as_number))
    else:
        # Not an integer
        query = whoosh.query.Term(u'name', name)

    if type_term:
        query = query & type_term

    ### Actual searching
    # Limits; result limits are constants, and intermediate results (before
    # duplicate items are stripped out) are capped at the result limit
    # times another constant.
    # Fuzzy are capped at 10, beyond which something is probably very
    # wrong.  Exact matches -- that is, wildcards and ids -- are far less
    # constrained.
    if exact_only:
        max_results = self.MAX_EXACT_RESULTS
    else:
        max_results = self.MAX_FUZZY_RESULTS

    locale = self._get_current_locale()
    facet = whoosh.sorting.MultiFacet([
        LanguageFacet(locale.identifier),
        table_facet,
        "name",
    ])

    searcher = self.index.searcher()
    try:
        results = searcher.search(
            query,
            limit=int(max_results * self.INTERMEDIATE_FACTOR),
            sortedby=facet,
        )

        # Look for some fuzzy matches if necessary
        if not exact_only and not results:
            exact = False

            fuzzy_query_parts = []
            fuzzy_weights = {}
            corrector = searcher.corrector('name')
            for suggestion in corrector.suggest(name, limit=max_results):
                fuzzy_query_parts.append(
                    whoosh.query.Term('name', suggestion))
                # Each suggestion is weighted by its relative similarity
                # to what was actually typed.
                fuzzy_weights[suggestion] = levenshtein.relative(
                    name, suggestion)

            if not fuzzy_query_parts:
                # Nothing at all; don't try querying
                return []

            fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
            if type_term:
                fuzzy_query = fuzzy_query & type_term

            sorter = LanguageFacet(
                locale.identifier, extra_weights=fuzzy_weights)
            results = searcher.search(fuzzy_query, sortedby=sorter)

        ### Convert results to db objects
        # Done while the searcher is still open, since the hits are read
        # from it.
        objects = self._whoosh_records_to_results(results, exact=exact)

        # Truncate and return
        return objects[:max_results]
    finally:
        # Release the searcher on every exit path (normal return, the
        # early "no suggestions" return, or an exception).
        searcher.close()
def lookup(self, input, valid_types=[], exact_only=False):
    """Attempts to find some sort of object, given a name.

    Returns a list of named (object, name, language, iso639, iso3166,
    exact) tuples.  `object` is a database object, `name` is the name under
    which the object was found, `language` and the two isos are the name
    and country codes of the language in which the name was found, and
    `exact` is True iff this was an exact match.

    This function currently ONLY does fuzzy matching if there are no exact
    matches.

    Extraneous whitespace is removed with extreme prejudice.

    Recognizes:
    - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
    - Foreign names: "Iibui", "Eivui"
    - Fuzzy names in whatever language: "Evee", "Ibui"
    - IDs: "133", "192", "250"
    Also:
    - Type restrictions.  "type:psychic" will only return the type.  This
      is how to make ID lookup useful.  Multiple type specs can be entered
      with commas, as "move,item:1".
    - Language restrictions.  "@fr:charge" will only return Tackle, which
      is called "Charge" in French.  These can be combined with type
      restrictions, e.g., "@fr,move:charge".
    - The literal name "random", which is delegated to
      `self.random_lookup` with the merged type restrictions.

    `input`
        Name of the thing to look for.

    `valid_types`
        A list of type or language restrictions, e.g., `['pokemon',
        '@ja']`.  If this is provided, only results in one of the given
        tables will be returned.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `input` doesn't match anything exactly,
        spelling correction will be attempted.  Wildcard and numeric
        queries always behave as if this were True.
    """
    name = self.normalize_name(input)
    exact = True

    # Pop off any type prefix and merge with valid_types
    name, merged_valid_types, type_term = \
        self._apply_valid_types(name, valid_types)

    # Random lookup
    if name == 'random':
        return self.random_lookup(valid_types=merged_valid_types)

    # Do different things depending what the query looks like
    # Note: Term objects do an exact match, so we don't have to worry about
    # a query parser tripping on weird characters in the input
    try:
        # Let Python try to convert to a number, so 0xff works
        name_as_number = int(name, base=0)
    except ValueError:
        # Oh well
        name_as_number = None

    if '*' in name or '?' in name:
        exact_only = True
        query = whoosh.query.Wildcard(u'name', name)
    elif name_as_number is not None:
        # Don't spell-check numbers!
        exact_only = True
        query = whoosh.query.Term(u'row_id', text_type(name_as_number))
    else:
        # Not an integer
        query = whoosh.query.Term(u'name', name)

    if type_term:
        query = query & type_term

    ### Actual searching
    # Limits; result limits are constants, and intermediate results (before
    # duplicate items are stripped out) are capped at the result limit
    # times another constant.
    # Fuzzy are capped at 10, beyond which something is probably very
    # wrong.  Exact matches -- that is, wildcards and ids -- are far less
    # constrained.
    if exact_only:
        max_results = self.MAX_EXACT_RESULTS
    else:
        max_results = self.MAX_FUZZY_RESULTS

    locale = self._get_current_locale()
    facet = whoosh.sorting.MultiFacet([
        LanguageFacet(locale.identifier),
        table_facet,
        "name",
    ])

    searcher = self.index.searcher()
    try:
        results = searcher.search(
            query,
            limit=int(max_results * self.INTERMEDIATE_FACTOR),
            sortedby=facet,
        )

        # Look for some fuzzy matches if necessary
        if not exact_only and not results:
            exact = False

            fuzzy_query_parts = []
            fuzzy_weights = {}
            corrector = searcher.corrector('name')
            for suggestion in corrector.suggest(name, limit=max_results):
                fuzzy_query_parts.append(
                    whoosh.query.Term('name', suggestion))
                # Each suggestion is weighted by its relative similarity
                # to what was actually typed.
                fuzzy_weights[suggestion] = levenshtein.relative(
                    name, suggestion)

            if not fuzzy_query_parts:
                # Nothing at all; don't try querying
                return []

            fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
            if type_term:
                fuzzy_query = fuzzy_query & type_term

            sorter = LanguageFacet(
                locale.identifier, extra_weights=fuzzy_weights)
            results = searcher.search(fuzzy_query, sortedby=sorter)

        ### Convert results to db objects
        # Done while the searcher is still open, since the hits are read
        # from it.
        objects = self._whoosh_records_to_results(results, exact=exact)

        # Truncate and return
        return objects[:max_results]
    finally:
        # Release the searcher on every exit path (normal return, the
        # early "no suggestions" return, or an exception).
        searcher.close()