def test_ngrams_order_2():
    lst = "a b c".split()
    actual = list(ngrams(lst, 2))
    expected = ["a", "a b", "b", "b c", "c"]
    assert_equal(expected, actual)
def test_ngrams_order_3():
    tokens = "a b c".split()
    actual = list(ngrams(tokens, 3))
    expected = ["a", "a b", "a b c", "b", "b c", "c"]
    assert_equal(expected, actual)
def test_ngrams_order_none():
    # same result as test_ngrams_order_3...
    tokens = "a b c".split()
    actual = list(ngrams(tokens, None))
    expected = ["a", "a b", "a b c", "b", "b c", "c"]
    assert_equal(expected, actual)
def test_ngrams_order_4():
    # same result as test_ngrams_order_3...
    lst = "a b c".split()
    actual = list(ngrams(lst, 4))
    expected = ["a", "a b", "a b c", "b", "b c", "c"]
    assert_equal(expected, actual)
def test_ngrams():
    text = "Hello , world !".split()
    expected = {"Hello , world", ", world !", "Hello ,", ", world",
                "world !", "Hello", ",", "world", "!"}
    ng = Counter(ngrams(text, 3))
    assert_equal(set(ng), expected)
    assert_true(all(freq == 1 for freq in ng.values()))

    with_pos = list(ngrams_with_pos(text, 2))
    assert_in((0, 2, 'Hello ,'), with_pos)
    assert_in((1, 3, ', world'), with_pos)
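# A minimal reference sketch of the behaviour the tests above assume:
# ngrams(tokens, N) yields every n-gram of length 1..N (unbounded when N is
# None) as a space-joined string, starting at each token position, and
# ngrams_with_pos additionally yields (start, end) token offsets. The names
# below are hypothetical stand-ins, not the module's actual implementation.
def _ngrams_sketch(tokens, N=None):
    for start in range(len(tokens)):
        longest = len(tokens) - start if N is None else min(N, len(tokens) - start)
        for length in range(1, longest + 1):
            yield " ".join(tokens[start:start + length])


def _ngrams_with_pos_sketch(tokens, N=None):
    for start in range(len(tokens)):
        longest = len(tokens) - start if N is None else min(N, len(tokens) - start)
        for length in range(1, longest + 1):
            yield (start, start + length, " ".join(tokens[start:start + length]))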
def page_statistics(page, N, sentence_splitter=None, tokenizer=None):
    """Gather statistics from a single WP page.

    The sentence_splitter should be a callable that splits text into
    sentences. It defaults to an unspecified heuristic.

    See ``parse_dump`` for the parameters.

    Returns
    -------
    stats : (dict, dict)
        The first dict maps (target, anchor) pairs to counts.
        The second maps n-grams (up to N) to counts.
    """
    if N is not None and not isinstance(N, int):
        raise TypeError("expected integer or None for N, got %r" % N)

    clean = clean_text(page)
    link_counts = Counter(extract_links(clean))

    if N:
        no_links = remove_links(clean)
        if sentence_splitter is None:
            # Default heuristic: split on blank lines or sentence-final
            # periods followed by whitespace.
            sentences = re.split(r'(?:\n{2,}|\.\s+)', no_links,
                                 flags=re.MULTILINE | re.UNICODE)
        else:
            # Split into paragraphs first, then let the caller's splitter
            # break each paragraph into sentences.
            sentences = [sentence
                         for paragraph in re.split('\n+', no_links)
                         for sentence in sentence_splitter(paragraph)]

        if tokenizer is None:
            tokenizer = re.compile(r'\w+', re.UNICODE).findall

        all_ngrams = chain.from_iterable(ngrams(tokenizer(sentence), N)
                                         for sentence in sentences)
        ngram_counts = Counter(all_ngrams)
    else:
        ngram_counts = None

    return link_counts, ngram_counts
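# A minimal usage sketch for the hooks above, assuming `page_statistics` is
# importable and `page_text` holds raw wiki markup; the splitter and
# tokenizer names are hypothetical examples, not part of the module.
import re


def naive_sentence_splitter(paragraph):
    # Split a paragraph on sentence-final punctuation followed by whitespace.
    return re.split(r'(?<=[.!?])\s+', paragraph)


def naive_tokenizer(sentence):
    # Lower-cased word tokens; \w+ mirrors the default tokenizer above.
    return re.findall(r'\w+', sentence.lower(), re.UNICODE)

# links, grams = page_statistics(page_text, 2,
#                                sentence_splitter=naive_sentence_splitter,
#                                tokenizer=naive_tokenizer)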
def test_ngrams_order_empty_list():
    no_tokens = []
    actual = list(ngrams(no_tokens, None))
    expected = []
    assert_equal(expected, actual)
def test_ngrams_order_1():
    tokens = "a b c".split()
    actual = list(ngrams(tokens, 1))
    expected = tokens
    assert_equal(expected, actual)