Exemplo n.º 1
0
 def _words(self, ixreader):
     """Yield the indexed terms that are similar enough to ``self.text``.

     Candidate terms are whatever ``ixreader.expand_prefix`` returns for
     ``self.fieldname`` and the first ``self.prefixlength`` characters of
     the text.  A candidate is yielded when it equals the text, or when
     ``relative(text, candidate)`` is strictly greater than
     ``self.minsimilarity``.
     """
     target = self.text
     threshold = self.minsimilarity
     field = self.fieldname
     prefix = target[:self.prefixlength]

     for candidate in ixreader.expand_prefix(field, prefix):
         # Equality is tested first, so an exact hit short-circuits and
         # relative() is never called for it.
         if target == candidate or relative(target, candidate) > threshold:
             yield candidate
Exemplo n.º 2
0
 def _words(self, ixreader):
     """Generate the terms from the index that match ``self.text`` fuzzily.

     The index reader is asked to expand the leading ``self.prefixlength``
     characters of the text within ``self.fieldname``; each expanded term
     is then kept if it is identical to the text or if
     ``relative(text, term)`` exceeds ``self.minsimilarity``.
     """
     wanted = self.text
     cutoff = self.minsimilarity
     expanded = ixreader.expand_prefix(self.fieldname,
                                       wanted[:self.prefixlength])

     for found in expanded:
         # `or` short-circuits: identical terms are accepted without
         # computing a similarity score.
         accepted = wanted == found or relative(wanted, found) > cutoff
         if accepted:
             yield found
Exemplo n.º 3
0
    def lookup(self, input, valid_types=None, exact_only=False):
        """Attempts to find some sort of object, given a name.

        Returns a list of named (object, name, language, iso639, iso3166,
        exact) tuples.  `object` is a database object, `name` is the name under
        which the object was found, `language` and the two isos are the name
        and country codes of the language in which the name was found, and
        `exact` is True iff this was an exact match.

        This function currently ONLY does fuzzy matching if there are no exact
        matches.

        Extraneous whitespace is removed with extreme prejudice.

        Recognizes:
        - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
        - Foreign names: "Iibui", "Eivui"
        - Fuzzy names in whatever language: "Evee", "Ibui"
        - IDs: "133", "192", "250"
        Also:
        - Type restrictions.  "type:psychic" will only return the type.  This
          is how to make ID lookup useful.  Multiple type specs can be entered
          with commas, as "move,item:1".
        - Language restrictions.  "@fr:charge" will only return Tackle, which
          is called "Charge" in French.  These can be combined with type
          restrictions, e.g., "@fr,move:charge".
        - The literal name "random", which is delegated to `random_lookup`
          with the merged type restrictions.

        `input`
            Name of the thing to look for.

        `valid_types`
            A list of type or language restrictions, e.g., `['pokemon',
            '@ja']`.  If this is provided, only results in one of the given
            tables will be returned.  Defaults to no restrictions; a fresh
            empty list is created on every call.

        `exact_only`
            If True, only exact matches are returned.  If set to False (the
            default), and the provided `input` doesn't match anything exactly,
            spelling correction will be attempted.  Wildcard queries (names
            containing `*` or `?`) and numeric queries are always treated as
            exact-only.
        """

        # Never share one list object between calls: the list is handed to
        # _apply_valid_types, and a mutable default would leak any changes
        # made there into every later call.
        if valid_types is None:
            valid_types = []

        name = self.normalize_name(input)
        exact = True

        # Pop off any type prefix and merge with valid_types
        name, merged_valid_types, type_term = \
            self._apply_valid_types(name, valid_types)

        # Random lookup
        if name == 'random':
            return self.random_lookup(valid_types=merged_valid_types)

        # Do different things depending what the query looks like
        # Note: Term objects do an exact match, so we don't have to worry about
        # a query parser tripping on weird characters in the input
        try:
            # Let Python try to convert to a number, so 0xff works
            name_as_number = int(name, base=0)
        except ValueError:
            # Oh well
            name_as_number = None

        if '*' in name or '?' in name:
            exact_only = True
            query = whoosh.query.Wildcard(u'name', name)
        elif name_as_number is not None:
            # Don't spell-check numbers!
            exact_only = True
            query = whoosh.query.Term(u'row_id', unicode(name_as_number))
        else:
            # Not an integer
            query = whoosh.query.Term(u'name', name)

        if type_term:
            query = query & type_term

        ### Actual searching
        # Limits; result limits are constants, and intermediate results (before
        # duplicate items are stripped out) are capped at the result limit
        # times another constant.
        # Fuzzy are capped at 10, beyond which something is probably very
        # wrong.  Exact matches -- that is, wildcards and ids -- are far less
        # constrained.
        if exact_only:
            max_results = self.MAX_EXACT_RESULTS
        else:
            max_results = self.MAX_FUZZY_RESULTS

        locale = self._get_current_locale()
        facet = whoosh.sorting.MultiFacet([
            LanguageFacet(locale.identifier),
            table_facet,
            "name",
        ])

        # Use the searcher as a context manager so it is closed on every exit
        # path, including the early return below.
        # NOTE(review): this relies on _whoosh_records_to_results reading
        # everything it needs from the hits before it returns -- confirm.
        with self.index.searcher() as searcher:
            results = searcher.search(
                query,
                limit=int(max_results * self.INTERMEDIATE_FACTOR),
                sortedby=facet,
            )

            # Look for some fuzzy matches if necessary
            if not exact_only and not results:
                exact = False

                fuzzy_query_parts = []
                fuzzy_weights = {}
                corrector = searcher.corrector('name')
                for suggestion in corrector.suggest(name, limit=max_results):
                    fuzzy_query_parts.append(
                        whoosh.query.Term('name', suggestion))
                    # Passed to LanguageFacet as an extra sort weight for
                    # this suggestion
                    distance = levenshtein.relative(name, suggestion)
                    fuzzy_weights[suggestion] = distance

                if not fuzzy_query_parts:
                    # Nothing at all; don't try querying
                    return []

                fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
                if type_term:
                    fuzzy_query = fuzzy_query & type_term

                sorter = LanguageFacet(locale.identifier,
                                       extra_weights=fuzzy_weights)
                results = searcher.search(fuzzy_query, sortedby=sorter)

            ### Convert results to db objects
            objects = self._whoosh_records_to_results(results, exact=exact)

        # Truncate and return
        return objects[:max_results]
Exemplo n.º 4
0
    def lookup(self, input, valid_types=None, exact_only=False):
        """Attempts to find some sort of object, given a name.

        Returns a list of named (object, name, language, iso639, iso3166,
        exact) tuples.  `object` is a database object, `name` is the name under
        which the object was found, `language` and the two isos are the name
        and country codes of the language in which the name was found, and
        `exact` is True iff this was an exact match.

        This function currently ONLY does fuzzy matching if there are no exact
        matches.

        Extraneous whitespace is removed with extreme prejudice.

        Recognizes:
        - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
        - Foreign names: "Iibui", "Eivui"
        - Fuzzy names in whatever language: "Evee", "Ibui"
        - IDs: "133", "192", "250"
        Also:
        - Type restrictions.  "type:psychic" will only return the type.  This
          is how to make ID lookup useful.  Multiple type specs can be entered
          with commas, as "move,item:1".
        - Language restrictions.  "@fr:charge" will only return Tackle, which
          is called "Charge" in French.  These can be combined with type
          restrictions, e.g., "@fr,move:charge".
        - The literal name "random", which is delegated to `random_lookup`
          with the merged type restrictions.

        `input`
            Name of the thing to look for.

        `valid_types`
            A list of type or language restrictions, e.g., `['pokemon',
            '@ja']`.  If this is provided, only results in one of the given
            tables will be returned.  Defaults to no restrictions; a fresh
            empty list is created on every call.

        `exact_only`
            If True, only exact matches are returned.  If set to False (the
            default), and the provided `input` doesn't match anything exactly,
            spelling correction will be attempted.  Wildcard queries (names
            containing `*` or `?`) and numeric queries are always treated as
            exact-only.
        """

        # Never share one list object between calls: the list is handed to
        # _apply_valid_types, and a mutable default would leak any changes
        # made there into every later call.
        if valid_types is None:
            valid_types = []

        name = self.normalize_name(input)
        exact = True

        # Pop off any type prefix and merge with valid_types
        name, merged_valid_types, type_term = \
            self._apply_valid_types(name, valid_types)

        # Random lookup
        if name == 'random':
            return self.random_lookup(valid_types=merged_valid_types)

        # Do different things depending what the query looks like
        # Note: Term objects do an exact match, so we don't have to worry about
        # a query parser tripping on weird characters in the input
        try:
            # Let Python try to convert to a number, so 0xff works
            name_as_number = int(name, base=0)
        except ValueError:
            # Oh well
            name_as_number = None

        if '*' in name or '?' in name:
            exact_only = True
            query = whoosh.query.Wildcard(u'name', name)
        elif name_as_number is not None:
            # Don't spell-check numbers!
            exact_only = True
            query = whoosh.query.Term(u'row_id', text_type(name_as_number))
        else:
            # Not an integer
            query = whoosh.query.Term(u'name', name)

        if type_term:
            query = query & type_term

        ### Actual searching
        # Limits; result limits are constants, and intermediate results (before
        # duplicate items are stripped out) are capped at the result limit
        # times another constant.
        # Fuzzy are capped at 10, beyond which something is probably very
        # wrong.  Exact matches -- that is, wildcards and ids -- are far less
        # constrained.
        if exact_only:
            max_results = self.MAX_EXACT_RESULTS
        else:
            max_results = self.MAX_FUZZY_RESULTS

        locale = self._get_current_locale()
        facet = whoosh.sorting.MultiFacet([
            LanguageFacet(locale.identifier),
            table_facet,
            "name",
        ])

        # Use the searcher as a context manager so it is closed on every exit
        # path, including the early return below.
        # NOTE(review): this relies on _whoosh_records_to_results reading
        # everything it needs from the hits before it returns -- confirm.
        with self.index.searcher() as searcher:
            results = searcher.search(
                query,
                limit=int(max_results * self.INTERMEDIATE_FACTOR),
                sortedby=facet,
            )

            # Look for some fuzzy matches if necessary
            if not exact_only and not results:
                exact = False

                fuzzy_query_parts = []
                fuzzy_weights = {}
                corrector = searcher.corrector('name')
                for suggestion in corrector.suggest(name, limit=max_results):
                    fuzzy_query_parts.append(
                        whoosh.query.Term('name', suggestion))
                    # Passed to LanguageFacet as an extra sort weight for
                    # this suggestion
                    distance = levenshtein.relative(name, suggestion)
                    fuzzy_weights[suggestion] = distance

                if not fuzzy_query_parts:
                    # Nothing at all; don't try querying
                    return []

                fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
                if type_term:
                    fuzzy_query = fuzzy_query & type_term

                sorter = LanguageFacet(
                    locale.identifier, extra_weights=fuzzy_weights)
                results = searcher.search(fuzzy_query, sortedby=sorter)

            ### Convert results to db objects
            objects = self._whoosh_records_to_results(results, exact=exact)

        # Truncate and return
        return objects[:max_results]