Example #1
def test_ngram_tokenization():
    tokenizer = get_tokenizer('ngram', level=3)
    text = '你好啊!吃过了没有?'
    tokens = ['你好啊', '吃过了', '过了没', '了没有']
    assert tokenizer.tokenize(text) == tokens
    assert tokenizer.cut(text) == tokens
    assert tokenizer.lcut(text) == tokens
    assert tokenizer(text) == tokens
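Note: Examples #1 and #3 are two variants of the same test. Here tokenize, cut, lcut, and calling the tokenizer all return plain strings, while in Example #3 tokenize and the call form yield Token objects that carry character offsets.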
Example #2
def test_space_tokenization():
    tokenizer = get_tokenizer('space')
    text = 'hello world'
    tokens = [Token('hello', 0, 5), Token('world', 6, 11)]
    words = ['hello', 'world']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
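The assertions above compare Token objects by value. The actual Token type from zhtools isn't shown on this page; a minimal stand-in consistent with the (text, start, end) triples used in these tests might look like this:

from typing import NamedTuple

class Token(NamedTuple):
    text: str   # surface form of the token
    start: int  # start offset in the original string
    end: int    # end offset (exclusive)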
Example #3
def test_ngram_tokenization():
    tokenizer = get_tokenizer('ngram', level=3)
    text = '你好啊!吃过了没有?'
    tokens = [
        Token('你好啊', 0, 3),
        Token('吃过了', 4, 7),
        Token('过了没', 5, 8),
        Token('了没有', 6, 9)
    ]
    words = ['你好啊', '吃过了', '过了没', '了没有']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
Example #4
def test_jieba_tokenizer():
    tokenizer = get_tokenizer('jieba', lazy_load=False)
    text = '今天的天气真好'
    tokens = [
        Token('今天', 0, 2),
        Token('的', 2, 3),
        Token('天气', 3, 5),
        Token('真', 5, 6),
        Token('好', 6, 7)
    ]
    words = ['今天', '的', '天气', '真', '好']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
Example #5
def test_corenlp_tokenizer():
    tokenizer = get_tokenizer('corenlp', url="http://127.0.0.1:8000")
    text = '今天的天气真好'
    tokens = [
        Token('今天', 0, 2),
        Token('的', 2, 3),
        Token('天气', 3, 5),
        Token('真', 5, 6),
        Token('好', 6, 7)
    ]
    words = ['今天', '的', '天气', '真', '好']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
Example #6
def test_pku_tokenizer():
    tokenizer = get_tokenizer('pku')
    text = '今天的天气真好'
    tokens = [
        Token('今天', 0, 2),
        Token('的', 2, 3),
        Token('天气', 3, 5),
        Token('真', 5, 6),
        Token('好', 6, 7)
    ]
    words = ['今天', '的', '天气', '真', '好']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
Example #7
def compute_similarity(first,
                       second,
                       method='jaccard',
                       tokenizer=None,
                       partial=False,
                       ngram_range=None,
                       ngram_weights=None):
    assert isinstance(first, str) and isinstance(second, str)
    # Default to plain unigrams with a single unit weight.
    if not ngram_range:
        ngram_range = [1]
        ngram_weights = [1.0]

    # Dispatch to the requested similarity metric.
    metric_func = {
        'lcs': lcs,
        'jaccard': jaccard,
        'cosine': cosine,
        'dice': dice,
    }.get(method)
    if not metric_func:
        raise ValueError("unsupported method `{}`".format(method))

    # Fall back to a character-level (unigram) tokenizer.
    tokenizer = tokenizer or get_tokenizer("ngram", level=1)

    first_terms = tokenizer.lcut(first)
    second_terms = tokenizer.lcut(second)

    # Score every n-gram level in the range, then return the weighted average.
    similarity = 0.0
    ngram_levels = list(range(ngram_range[0], ngram_range[-1] + 1))
    if not ngram_weights:
        ngram_weights = [1 for _ in range(len(ngram_levels))]
    for ngram_level, weight in zip(ngram_levels, ngram_weights):
        first_ngrams = list(windowed(first_terms, ngram_level))
        second_ngrams = list(windowed(second_terms, ngram_level))
        similarity += weight * metric_func(
            first_ngrams, second_ngrams, partial=partial)

    return similarity / sum(ngram_weights)
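To make the weighting concrete, here is how the function combines the per-level scores; the expected values below come from the test cases in Example #9:

# Character unigrams, jaccard (the defaults):
# |{b, c, d}| / |{a, b, c, d, e}| = 3/5 = 0.6
assert compute_similarity('abcde', 'bcd') == 0.6

# ngram_range=[1, 2] with default weights [1, 1]: the unigram score 0.6
# and the bigram score 0.5 ({bc, cd} out of {ab, bc, cd, de}) average to 0.55.
assert compute_similarity('abcde', 'bcd', ngram_range=[1, 2]) == 0.55

# Explicit weights: (0.1 * 0.6 + 0.4 * 0.5) / (0.1 + 0.4) = 0.52
assert compute_similarity('abcde', 'bcd',
                          ngram_range=[1, 2], ngram_weights=[0.1, 0.4]) == 0.52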
Example #8
def __init__(self, schema):
    self.schema = IndexSchema(schema)
    self.fields = sorted(self.schema.index_fields)
    self.term_dict = dict()
    # One postings map (term -> document ids) per indexed field.
    self.index = [defaultdict(set) for _ in self.fields]
    self.tokenizer = get_tokenizer("ngram", level=2)
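The constructor builds one defaultdict(set) postings map per indexed field. The rest of the class isn't shown on this page; purely to illustrate that layout, a hypothetical indexing method (not zhtools' actual API) could look like:

# Hypothetical sketch: record doc_id in the postings of every token
# found in each indexed field of the document.
def add_document(self, doc_id, document):
    for field_pos, field in enumerate(self.fields):
        for word in self.tokenizer.lcut(document.get(field, '')):
            self.index[field_pos][word].add(doc_id)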
Example #9
import pytest

from zhtools.tokenize import get_tokenizer
from zhtools.similarity import compute_similarity


@pytest.mark.parametrize(
    'first, second, tokenizer, ngram_range, ngram_weights, sim', [
        ('abcde', 'bcd', None, None, None, 0.6),
        ('abcde', 'bcd', None, [1, 2], None, 0.55),
        ('abcde', 'bcd', None, [1, 2], [0.1, 0.4], 0.52),
        ('abcde', 'bcd', get_tokenizer('ngram', level=2), None, None, 0.5),
    ])
def test_similarity(first, second, tokenizer, ngram_range, ngram_weights, sim):
    # use jaccard default
    similarity = compute_similarity(first,
                                    second,
                                    tokenizer=tokenizer,
                                    ngram_range=ngram_range,
                                    ngram_weights=ngram_weights)
    assert similarity == sim


@pytest.mark.parametrize('first, second, method',
                         [('abcde', 'bcd', 'some_unknown_method')])
def test_similarity_error(first, second, method):
    with pytest.raises(ValueError):
        compute_similarity(first, second, method=method)
Example #10
def test_get_tokenizer_error():
    with pytest.raises(ValueError):
        get_tokenizer('some_unknown_tokenizer')
Example #11
def test_get_tokenizer():
    first = get_tokenizer('ngram', level=3)
    second = get_tokenizer('ngram', level=3)

    assert first is second
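The `first is second` assertion implies that get_tokenizer caches one instance per argument combination. How zhtools implements this isn't shown here; a minimal sketch of one common approach, assuming hashable keyword arguments, wraps the factory in functools.lru_cache:

import functools

class NgramTokenizer:
    def __init__(self, level):
        self.level = level

@functools.lru_cache(maxsize=None)
def make_tokenizer(name, **kwargs):
    # Hypothetical stand-in, not zhtools' code: identical (name, kwargs)
    # pairs hit the cache and return the very same instance.
    if name == 'ngram':
        return NgramTokenizer(**kwargs)
    raise ValueError("unknown tokenizer `{}`".format(name))

assert make_tokenizer('ngram', level=3) is make_tokenizer('ngram', level=3)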
Example #12
from contextlib import contextmanager
from os.path import join
import shutil
import tempfile

import pytest

from zhtools.utils.storage import MemoryDocumentStorage
from zhtools.utils.inverted_index import IndexSchema, InvertedIndex, FieldNotExistsError
from zhtools.similarity import compute_similarity
from zhtools.tokenize import get_tokenizer


TOKENIZER = get_tokenizer('ngram', level=2)


@contextmanager
def tempdir():
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        try:
            shutil.rmtree(path)
        except IOError:
            pass
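For reference, the standard library offers a near-equivalent helper, though unlike the version above it does not swallow cleanup errors:

import tempfile

with tempfile.TemporaryDirectory() as path:
    ...  # the directory and its contents are removed on exit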


class TestIndexSchema():

    def setup(self):