Example No. 1
def Tokenizer(obj, metadata=None, separator=None):
    """Splits a string into tokens ready to be inserted into the search index.

    Args:
        obj: The object to tokenize. Strings are lowercased and split on the
            separator; lists and tuples have their elements converted to
            strings and wrapped in a lunr `Token`; `None` yields an empty
            list.
        metadata (dict): Optional metadata passed to the tokenizer; it is
            cloned and attached as metadata to every token created from the
            object being tokenized.
        separator (callable or compiled regex): The tokenizer converts its
            argument to a string by calling `str` and then splits that string
            on characters for which `separator` returns a truthy value, or
            which the compiled regex matches.

    Returns:
        List of Token instances.
    """
    if obj is None:
        return []

    metadata = metadata or {}

    if isinstance(obj, (list, tuple)):
        return [
            Token(as_string(element).lower(), deepcopy(metadata))
            for element in obj
        ]

    if separator is None:
        is_separator = default_separator
    elif callable(separator):
        is_separator = separator
    else:  # assume a compiled regex and use its match method as the predicate
        is_separator = separator.match

    string = str(obj).lower()
    length = len(string)
    tokens = []
    slice_start = 0
    for slice_end in range(length + 1):
        char = string[slice_end] if slice_end != length else ""
        slice_length = slice_end - slice_start
        if is_separator(char) or slice_end == length:
            if slice_length > 0:
                token_metadata = {}
                token_metadata["position"] = [slice_start, slice_length]
                token_metadata["index"] = len(tokens)
                token_metadata.update(metadata)

                sl = slice(slice_start, slice_end)
                tokens.append(Token(string[sl], token_metadata))

            slice_start = slice_end + 1

    return tokens
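A minimal usage sketch, not part of the original snippet: tokenizing with a
custom separator callable and inspecting the metadata attached to each token
(it assumes `Token` and the other lunr helpers used above are importable).

# Hypothetical usage of the Tokenizer above with a callable separator.
tokens = Tokenizer("foo-bar baz", separator=lambda c: c in "- ")
for token in tokens:
    # each token carries "position" ([start, length]) and "index" metadata
    print(str(token), token.metadata["position"], token.metadata["index"])
# expected output under these assumptions:
#   foo [0, 3] 0
#   bar [4, 3] 1
#   baz [8, 3] 2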
Example No. 2
class TestClone:
    def setup_method(self, method):
        self.token = Token("foo", {"bar": True})

    def test_clones_value(self):
        assert str(self.token) == str(self.token.clone())

    def test_clones_metadata(self):
        assert self.token.metadata == self.token.clone().metadata

    def test_clone_and_modify(self):
        clone = self.token.clone(lambda s, m: s.upper())

        assert str(clone) == "FOO"
        assert self.token.metadata == clone.metadata
Example No. 3
def Tokenizer(obj, metadata=None, separator=SEPARATOR):
    """Splits a string into tokens ready to be inserted into the search index.

    The tokenizer converts its parameter to a string by calling `str` and then
    splits that string on characters matching `separator`. Lists have their
    elements converted to strings and wrapped in a lunr `Token`.

    Optional metadata can be passed to the tokenizer; it is cloned and added
    as metadata to every token created from the object being tokenized.
    """
    if obj is None:
        return []

    metadata = metadata or {}

    if isinstance(obj, (list, tuple)):
        return [
            Token(as_string(element).lower(), deepcopy(metadata)) for element in obj
        ]

    string = str(obj).lower()
    length = len(string)
    tokens = []
    slice_start = 0
    for slice_end in range(length + 1):
        char = string[slice_end] if slice_end != length else ""
        slice_length = slice_end - slice_start
        if separator.match(char) or slice_end == length:
            if slice_length > 0:
                token_metadata = {}
                token_metadata["position"] = [slice_start, slice_length]
                token_metadata["index"] = len(tokens)
                token_metadata.update(metadata)

                sl = slice(slice_start, slice_end)
                tokens.append(Token(string[sl], token_metadata))

            slice_start = slice_end + 1

    return tokens
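A short sketch of how user metadata propagates, assuming the module-level
SEPARATOR is a compiled regex matching whitespace and hyphens as in lunr; the
values below are illustrative, not taken from the library's test suite.

tokens = Tokenizer("green eggs", metadata={"field": "title"})
assert [str(t) for t in tokens] == ["green", "eggs"]
# the supplied metadata is merged with the generated position/index metadata
assert tokens[0].metadata["field"] == "title"
assert tokens[0].metadata["position"] == [0, 5]
assert tokens[1].metadata["index"] == 1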
Example No. 4
    def test_reduces_words_to_their_stem(self):
        path = os.path.join(os.path.dirname(__file__), "fixtures",
                            "stemming_vocab.json")
        with open(path) as f:
            data = json.loads(f.read())

        for word, expected in data.items():
            token = Token(word)
            result = str(stemmer(token))

            assert result == expected
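For reference, a single-word check in the same style; it assumes lunr's
stemmer implements the Porter algorithm (so "running" reduces to "run") and
is not taken from stemming_vocab.json.

# Hypothetical single-word assertion; assumes a Porter-style stemmer.
assert str(stemmer(Token("running"))) == "run"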
Example No. 5
    def test_punctuation(self, description, string, expected):
        token = Token(string)
        trimmed = str(trimmer(token))

        assert trimmed == expected
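The parametrized cases themselves are not shown above; a representative case,
assuming lunr's trimmer strips leading and trailing non-word characters, might
look like this (the concrete value is illustrative, not taken from the suite).

# Hypothetical case in the (description, string, expected) shape used above:
# ("full stop", "hello.", "hello")
token = Token("hello.")
assert str(trimmer(token)) == "hello"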
Example No. 6
    def test_latin_characters(self):
        token = Token("hello")
        assert str(trimmer(token)) == str(token)
Example No. 7
    def run_string(self, string, metadata=None):
        """Convenience method for passing a string through a pipeline and
        getting strings out. This method takes care of wrapping the passed
        string in a token and mapping the resulting tokens back to strings."""
        token = Token(string, metadata)
        return [str(tkn) for tkn in self.run([token])]
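A sketch of how run_string might be exercised, assuming it is a method on
lunr's Pipeline and that pipeline functions follow the usual
(token, index, tokens) contract; `upper_case` below is an illustrative
function, not part of the library.

pipeline = Pipeline()

def upper_case(token, i, tokens):
    # pipeline functions receive each token and return the processed token
    token.update(lambda s, m: s.upper())
    return token

pipeline.add(upper_case)
# run_string wraps the whole string in a single Token, so one string comes back
assert pipeline.run_string("hello world") == ["HELLO WORLD"]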
Example No. 8
def test_str_repr():
    token = Token("foo")
    assert str(token) == "foo"
    assert repr(token) == '<Token "foo">'
Example No. 9
    def setup_method(self, method):
        self.token = Token("foo", {"bar": True})
Example No. 10
    def test_can_update_token_value(self):
        token = Token("foo", {"length": 3})
        token.update(lambda s, m: s.upper())

        assert str(token) == "FOO"
Example No. 11
    def test_can_attach_arbitrary_metadata(self):
        token = Token("foo", {"length": 3})
        assert token.metadata["length"] == 3