Python find_ngrams示例

编程语言: Python

命名空间/包名称: pyconll.util

方法/功能: find_ngrams

hotexamples.com的示例: 9

Python find_ngrams - 共找到 9 个示例。以下是从开源项目中提取的、评分最高的 pyconll.util.find_ngrams 真实 Python 代码示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： test_util.py 项目： bittlingmayer/pyconll

def test_ngram_first_word_match():
    """
    Matching only the first word of an ngram must not count as a hit.

    Searches the 'long.conll' fixture for the bigram ``un cabinet`` and
    expects the search iterator to be exhausted on the very first ``next``.
    """
    corpus = load_from_file(fixture_location('long.conll'))
    ngram = 'un cabinet'.split()
    matches = find_ngrams(corpus, ngram)

    # No result at all is expected, so the first advance must stop.
    with pytest.raises(StopIteration):
        next(matches)

示例#2

显示文件

文件： test_util.py 项目： bittlingmayer/pyconll

def test_ngram_standard():
    """
    Check find_ngrams on a plain, straightforward search.

    The first hit for ``un film sur la`` in the 'basic.conll' fixture must be
    reported in sentence 'fr-ud-dev_00001' with an index of 2.
    """
    corpus = load_from_file(fixture_location('basic.conll'))

    ngram = 'un film sur la'.split()
    matches = find_ngrams(corpus, ngram)
    sentence, index = next(matches)

    assert sentence.id == 'fr-ud-dev_00001'
    assert index == 2

示例#3

显示文件

文件： test_util.py 项目： bittlingmayer/pyconll

def test_ngram_none():
    """
    Check that nothing is reported when the ngram does not occur.

    Searches the 'long.conll' fixture for the single token ``cabinet`` and
    expects the search iterator to be empty.
    """
    corpus = load_from_file(fixture_location('long.conll'))
    ngram = 'cabinet'.split()
    matches = find_ngrams(corpus, ngram)

    # An empty search raises StopIteration on the first advance.
    with pytest.raises(StopIteration):
        next(matches)

示例#4

显示文件

文件： test_util.py 项目： bittlingmayer/pyconll

def test_ngram_multiword_split():
    """
    Check that an ngram search still succeeds across a multiword token.

    Exactly one hit is expected in the 'long.conll' fixture: sentence
    'fr-ud-test_00002' with an index of 8.
    """
    corpus = load_from_file(fixture_location('long.conll'))

    ngram = 'de " décentrement de le Sujet "'.split()
    matches = find_ngrams(corpus, ngram)
    sentence, index = next(matches)

    assert sentence.id == 'fr-ud-test_00002'
    assert index == 8

    # After the single expected hit the iterator must be exhausted.
    with pytest.raises(StopIteration):
        next(matches)

示例#5

显示文件

文件： test_util.py 项目： bittlingmayer/pyconll

def test_ngram_case_insensitive():
    """
    Check that searching with ``case_sensitive=False`` works.

    Collects every hit for ``Il`` in the 'long.conll' fixture and compares
    the sentence ids and the reported indices against the expected lists.
    """
    corpus = load_from_file(fixture_location('long.conll'))
    ngram = 'Il'.split()
    hits = list(find_ngrams(corpus, ngram, case_sensitive=False))

    # Each hit is indexable: position 0 is the sentence, position 1 the index.
    found_ids = [hit[0].id for hit in hits]
    found_indices = [hit[1] for hit in hits]

    wanted_ids = ['fr-ud-test_00003', 'fr-ud-test_00005', 'fr-ud-test_00008']
    wanted_indices = [1, 16, 0]

    assert found_ids == wanted_ids
    assert found_indices == wanted_indices

示例#6

显示文件

文件： test_util.py 项目： bittlingmayer/pyconll

def test_ngram_multiple_per_sentence():
    """
    Check that every occurrence is reported when one sentence has several.

    The trigram ``telle ou telle`` is expected twice in sentence
    'fr-ud-test_00008' of the 'long.conll' fixture, at indices 21 and 26.
    """
    corpus = load_from_file(fixture_location('long.conll'))
    ngram = 'telle ou telle'.split()
    hits = list(find_ngrams(corpus, ngram))

    # Each hit is indexable: position 0 is the sentence, position 1 the index.
    found_ids = [hit[0].id for hit in hits]
    found_indices = [hit[1] for hit in hits]

    wanted_ids = ['fr-ud-test_00008', 'fr-ud-test_00008']
    wanted_indices = [21, 26]

    assert found_ids == wanted_ids
    assert found_indices == wanted_indices

示例#7

显示文件

def test_ngram_case_insensitive_n_token():
    """
    Check case-insensitive matching when the case differs past the first token.

    The first hit in the 'long.conll' fixture must be in sentence
    'fr-ud-test_00004' at index 8 and must carry the tokens with ids
    '9' through '15'.
    """
    corpus = load_from_file(fixture_location('long.conll'))
    ngram = 'l\' orgaNisaTion pour La sécurité et la'.split()
    matches = find_ngrams(corpus, ngram, case_sensitive=False)
    sentence, index, matched_tokens = next(matches)

    found_token_ids = [tok.id for tok in matched_tokens]
    wanted_token_ids = ['9', '10', '11', '12', '13', '14', '15']

    assert sentence.id == 'fr-ud-test_00004'
    assert index == 8
    assert found_token_ids == wanted_token_ids

示例#8

显示文件

def test_ngram_multiword_split():
    """
    Check that an ngram search still succeeds across a multiword token.

    Exactly one hit is expected in the 'long.conll' fixture: sentence
    'fr-ud-test_00002' at index 8, carrying the tokens with ids '9'
    through '15'.
    """
    corpus = load_from_file(fixture_location('long.conll'))

    ngram = 'de " décentrement de le Sujet "'.split()
    matches = find_ngrams(corpus, ngram)
    sentence, index, matched_tokens = next(matches)

    found_token_ids = [tok.id for tok in matched_tokens]
    wanted_token_ids = ['9', '10', '11', '12', '13', '14', '15']

    assert sentence.id == 'fr-ud-test_00002'
    assert index == 8
    assert found_token_ids == wanted_token_ids

    # After the single expected hit the iterator must be exhausted.
    with pytest.raises(StopIteration):
        next(matches)

示例#9

显示文件

def test_ngram_multiple_multiword_splits():
    """
    Check that ngram searches work when more than one multiword token is involved.

    Exactly one hit is expected in the 'long.conll' fixture: sentence
    'fr-ud-test_00003' at index 9, carrying the tokens with ids '10'
    through '18'.
    """
    corpus = load_from_file(fixture_location('long.conll'))

    ngram = 'civile de le territoire non autonome de le Sahara'.split()
    matches = find_ngrams(corpus, ngram)
    sentence, index, matched_tokens = next(matches)

    found_token_ids = [tok.id for tok in matched_tokens]
    wanted_token_ids = ['10', '11', '12', '13', '14', '15', '16', '17', '18']

    assert sentence.id == 'fr-ud-test_00003'
    assert index == 9
    assert found_token_ids == wanted_token_ids

    # After the single expected hit the iterator must be exhausted.
    with pytest.raises(StopIteration):
        next(matches)