Пример #1
0
def test_ngrams_order_2():
    """``ngrams`` with order 2 over three tokens gives the uni- and bigrams.

    The expected sequence is grouped by start position: for each token,
    first the unigram, then the bigram starting there (if one exists).
    """
    tokens = ["a", "b", "c"]
    wanted = ["a", "a b", "b", "b c", "c"]
    produced = list(ngrams(tokens, 2))
    assert_equal(wanted, produced)
Пример #2
0
def test_ngrams_order_3():
    """``ngrams`` with order 3 over three tokens gives every contiguous span.

    Results are expected grouped by start position, shortest span first.
    """
    words = ["a", "b", "c"]
    wanted = [
        "a", "a b", "a b c",
        "b", "b c",
        "c",
    ]
    produced = list(ngrams(words, 3))
    assert_equal(wanted, produced)
Пример #3
0
def test_ngrams_order_none():
    """Passing ``None`` as the order yields every contiguous span.

    For this three-token input the expectation is identical to the one
    used for order 3.
    """
    words = ["a", "b", "c"]
    wanted = [
        "a", "a b", "a b c",
        "b", "b c",
        "c",
    ]
    produced = list(ngrams(words, None))
    assert_equal(wanted, produced)
Пример #4
0
def test_ngrams_order_4():
    """An order larger than the token count yields every contiguous span.

    With only three tokens, order 4 is expected to produce the same
    sequence as order 3.
    """
    tokens = ["a", "b", "c"]
    wanted = [
        "a", "a b", "a b c",
        "b", "b c",
        "c",
    ]
    produced = list(ngrams(tokens, 4))
    assert_equal(wanted, produced)
Пример #5
0
def test_ngrams():
    """Exercise ``ngrams`` and ``ngrams_with_pos`` on a four-token sentence.

    Checks that ``ngrams`` up to order 3 yields each expected n-gram exactly
    once, and that ``ngrams_with_pos`` up to order 2 includes the expected
    (start, end, text) triples for two of the bigrams.
    """
    tokens = ["Hello", ",", "world", "!"]

    # Every n-gram of length 1..3, listed from longest to shortest.
    trigrams = {"Hello , world", ", world !"}
    bigrams = {"Hello ,", ", world", "world !"}
    unigrams = {"Hello", ",", "world", "!"}
    expected = trigrams | bigrams | unigrams

    counts = Counter(ngrams(tokens, 3))
    assert_equal(set(counts), expected)
    # No n-gram repeats in this sentence, so every count must be one.
    assert_true(all(n == 1 for n in counts.values()))

    positioned = list(ngrams_with_pos(tokens, 2))
    assert_in((0, 2, 'Hello ,'), positioned)
    assert_in((1, 3, ', world'), positioned)
Пример #6
0
def page_statistics(page, N, sentence_splitter=None, tokenizer=None):
    """Gather statistics from a single WP page.

    See ``parse_dump`` for the parameters.

    Parameters
    ----------
    page
        The page content; it is handed to ``clean_text`` unchanged.
    N : int or None
        Maximum n-gram length. A falsy value (``None`` or ``0``) disables
        n-gram counting.
    sentence_splitter : callable, optional
        Called once per paragraph (the text is first split on runs of
        newlines) and must return an iterable of sentences. When omitted,
        the text is split on blank lines and on a period followed by
        whitespace.
    tokenizer : callable, optional
        Maps a sentence to a sequence of tokens. Defaults to extracting
        runs of word characters (``\\w+``).

    Returns
    -------
    stats : (Counter, Counter or None)
        The first maps the items yielded by ``extract_links`` (documented
        upstream as (target, anchor) pairs) to counts.
        The second maps n-grams (up to N) to counts, or is ``None`` when
        n-gram counting is disabled.

    Raises
    ------
    TypeError
        If ``N`` is neither ``None`` nor an integer.
    """
    if N is not None and not isinstance(N, int):
        raise TypeError("expected integer or None for N, got %r" % N)

    clean = clean_text(page)
    link_counts = Counter(extract_links(clean))

    if not N:
        # N-gram counting is disabled; only link statistics are produced.
        return link_counts, None

    no_links = remove_links(clean)

    if sentence_splitter is None:
        # The flags must be passed by keyword: the third positional
        # argument of re.split is maxsplit, not flags.
        sentences = re.split(r'(?:\n{2,}|\.\s+)', no_links,
                             flags=re.MULTILINE | re.UNICODE)
    else:
        # Apply the caller's splitter to each paragraph; iterating the
        # paragraph string directly would yield single characters.
        sentences = [
            sentence for paragraph in re.split('\n+', no_links)
            for sentence in sentence_splitter(paragraph)
        ]

    if tokenizer is None:
        tokenizer = re.compile(r'\w+', re.UNICODE).findall
    all_ngrams = chain.from_iterable(
        ngrams(tokenizer(sentence), N) for sentence in sentences)
    ngram_counts = Counter(all_ngrams)

    return link_counts, ngram_counts
Пример #7
0
def page_statistics(page, N, sentence_splitter=None, tokenizer=None):
    """Gather statistics from a single WP page.

    See ``parse_dump`` for the parameters.

    Parameters
    ----------
    page
        The page content; it is handed to ``clean_text`` unchanged.
    N : int or None
        Maximum n-gram length. A falsy value (``None`` or ``0``) disables
        n-gram counting.
    sentence_splitter : callable, optional
        Called once per paragraph (the text is first split on runs of
        newlines) and must return an iterable of sentences. When omitted,
        the text is split on blank lines and on a period followed by
        whitespace.
    tokenizer : callable, optional
        Maps a sentence to a sequence of tokens. Defaults to extracting
        runs of word characters (``\\w+``).

    Returns
    -------
    stats : (Counter, Counter or None)
        The first maps the items yielded by ``extract_links`` (documented
        upstream as (target, anchor) pairs) to counts.
        The second maps n-grams (up to N) to counts, or is ``None`` when
        n-gram counting is disabled.

    Raises
    ------
    TypeError
        If ``N`` is neither ``None`` nor an integer.
    """
    if N is not None and not isinstance(N, int):
        raise TypeError("expected integer or None for N, got %r" % N)

    clean = clean_text(page)
    link_counts = Counter(extract_links(clean))

    if N:
        no_links = remove_links(clean)

        if sentence_splitter is None:
            # re.split's third positional parameter is maxsplit, so the
            # regex flags have to be given by keyword to take effect.
            sentences = re.split(r'(?:\n{2,}|\.\s+)', no_links,
                                 flags=re.MULTILINE | re.UNICODE)
        else:
            # Run the user-supplied splitter on every paragraph; looping
            # over the paragraph itself would produce one "sentence" per
            # character.
            sentences = [sentence
                         for paragraph in re.split('\n+', no_links)
                         for sentence in sentence_splitter(paragraph)]

        if tokenizer is None:
            tokenizer = re.compile(r'\w+', re.UNICODE).findall
        all_ngrams = chain.from_iterable(ngrams(tokenizer(sentence), N)
                                         for sentence in sentences)
        ngram_counts = Counter(all_ngrams)

    else:
        # N-gram counting is disabled.
        ngram_counts = None

    return link_counts, ngram_counts
Пример #8
0
def test_ngrams_order_empty_list():
    """``ngrams`` over an empty token list with order ``None`` yields nothing."""
    produced = list(ngrams([], None))
    wanted = []
    assert_equal(wanted, produced)
Пример #9
0
def test_ngrams_order_1():
    """``ngrams`` with order 1 yields exactly the input tokens, in order."""
    words = ["a", "b", "c"]
    produced = list(ngrams(words, 1))
    # The token list itself serves as the expected value.
    assert_equal(words, produced)
Пример #10
0
def test_ngrams_order_1():
    """With order 1, ``ngrams`` must hand back the unigrams unchanged."""
    tokens = ["a", "b", "c"]
    unigrams = list(ngrams(tokens, 1))
    # Expected value is the input list; order 1 adds no longer spans.
    assert_equal(tokens, unigrams)