def test_add_empty_to_nonempty():
    """Check that ``+=`` with an empty store keeps the word count unchanged."""
    expected = ['alpha', 'bravo', 'charlie']

    store = WordStore()
    for entry in expected:
        store.add(entry)

    # In-place add of a freshly created (empty) store.
    empty_store = WordStore()
    store += empty_store

    stored_words = list(store.iter_words())
    assert len(stored_words) == len(expected)
def test_add():
    """Check that every added word is reported by ``iter_words``.

    After each ``add`` call the store must report exactly as many words as
    have been added so far, and each of them must be found when iterating.
    """
    store = WordStore()
    added_so_far = []

    for bird in ('albatross', 'ganet'):
        store.add(bird)
        added_so_far.append(bird)

        assert len(list(store.iter_words())) == len(added_so_far)
        # Words added earlier must still be present, not just the newest one.
        for expected in added_so_far:
            assert expected in store.iter_words()
def test_add_duplicate():
    """Check that ``+=`` with a store sharing a word does not duplicate it.

    ``gold`` and ``more_gold`` both contain ``'center'``; after merging the
    second store into the first, the number of words yielded must equal the
    number of distinct words across both lists.
    """
    gold = ['left', 'center', 'right']
    more_gold = ['surround', 'center']

    words = WordStore()
    for word in gold:
        words.add(word)

    # Populate the *second* store; adding to ``words`` here would leave
    # ``more_words`` empty and the merge below would test nothing.
    more_words = WordStore()
    for word in more_gold:
        more_words.add(word)

    words += more_words

    assert len(list(words.iter_words())) == len(frozenset(gold + more_gold))
    for word in words.iter_words():
        assert word in gold or word in more_gold
def test_add_nonempty_to_nonempty():
    """Check that ``+=`` merges two populated stores with disjoint words.

    The two word lists share no entries, so after the merge the number of
    words yielded must be the sum of both list lengths, and every yielded
    word must come from one of the lists.
    """
    gold = ['left', 'center', 'right']
    more_gold = ['surround']

    words = WordStore()
    for word in gold:
        words.add(word)

    # Populate the *second* store; adding to ``words`` here would leave
    # ``more_words`` empty and turn this into an empty-merge test.
    more_words = WordStore()
    for word in more_gold:
        more_words.add(word)

    words += more_words

    assert len(list(words.iter_words())) == len(gold) + len(more_gold)
    for word in words.iter_words():
        assert word in gold or word in more_gold
예제 #5
0
def scrape_html(html: str) -> WordStore:
    """Scrape HTML of its text.

    Args:
        html: HTML markup to parse.

    Returns:
        The whitespace-separated words found in the strings of the HTML
        body; an empty store when the document has no body.
    """
    words = WordStore()
    soup = BeautifulSoup(html, 'html.parser')

    body = soup.body
    if body is None:
        # Test for the missing <body> explicitly instead of catching
        # AttributeError around the whole loop, which would also swallow
        # unrelated AttributeErrors raised while collecting words and
        # misreport them as a missing body.
        _logger.info('HTML has no body.')
        return words

    for s in body.strings:
        for word in s.split():
            words.add(word)

    return words
예제 #6
0
    def build_store(words):
        """Create a ``WordStore`` and add each item of ``words`` to it.

        Args:
            words: iterable of items to add.

        Returns:
            The newly created store.
        """
        result = WordStore()
        for item in words:
            result.add(item)
        return result