예제 #1
0
def build(src: str, **_) -> str:
    """Scrape one or more sources and serialise the collected words.

    Args:
        src: either a single URL, or the path of a file listing one URL per
            line.

    Returns:
        The accumulated word store, encoded as json.
    """
    # A value that resolves to an existing file is treated as a list of
    # sources; anything else is taken to be a single source.
    candidate_path = os.path.abspath(os.path.expanduser(src))
    if os.path.isfile(candidate_path):
        sources = iter(iter_sources(src))
    else:
        sources = iter([src])

    # Fold the store scraped from each source into one accumulator.
    collected = WordStore()
    for source in sources:
        collected += scrape(source)

    return collected.to_json()
예제 #2
0
def test_build_with_file(wordstore):
    """build() on a file of sources merges the store scraped per source."""
    source_lines = ['first', 'second', 'third']
    per_source = [
        wordstore(['squirrel', 'rabbit', 'capybara']),
        wordstore(['wolverine', 'otter', 'mink']),
        wordstore(['osprey', 'pigeon', 'wren']),
    ]
    expected = per_source[0] + per_source[1] + per_source[2]

    scrape_patch = mock.patch(
        'passphrase.build.scrape', side_effect=per_source)
    with scrape_patch as mock_scrape, tempfile.NamedTemporaryFile('w') as f:
        # Write one source per line.
        for line in source_lines:
            f.write(line)
            f.write('\n')

        # Rewind the handle before build() opens the same file by name.
        f.seek(0)

        rebuilt = WordStore.from_json(build(f.name))

        assert rebuilt.store == expected.store
        # scrape() is expected to be invoked once for every prepared store.
        assert mock_scrape.call_count == len(per_source)
예제 #3
0
def test_build_with_url(wordstore):
    """build() on a single URL serialises the store returned by scrape()."""
    expected = wordstore(['llama', 'rabbit', 'capybara'])

    scrape_patch = mock.patch(
        'passphrase.build.scrape', return_value=expected)
    with scrape_patch:
        serialised = build('http://nowhere.in.particular')

    rebuilt = WordStore.from_json(serialised)
    assert rebuilt.store == expected.store
예제 #4
0
def scrape_html(html: str) -> WordStore:
    """Scrape HTML of its text.

    Args:
        html: HTML markup to parse.

    Returns:
        Whitespace-separated words found in the strings of the document
        body; an empty store when the document has no body.
    """
    words = WordStore()
    soup = BeautifulSoup(html, 'html.parser')

    # Test for a missing <body> explicitly instead of wrapping the whole
    # loop in ``except AttributeError``: that would also swallow unrelated
    # attribute errors raised while collecting words and misreport them as
    # a missing body.
    body = soup.body
    if body is None:
        _logger.info('HTML has no body.')
        return words

    for text in body.strings:
        for word in text.split():
            words.add(word)

    return words
def test_add_duplicate():
    """In-place addition of a store sharing a word keeps words unique."""
    gold = ['left', 'center', 'right']
    more_gold = ['surround', 'center']

    words = WordStore()
    for word in gold:
        words.add(word)

    # Fill the right-hand operand itself. Adding to ``words`` here would
    # leave ``more_words`` empty, so ``+=`` below would merge nothing and the
    # test would pass without exercising in-place addition.
    more_words = WordStore()
    for word in more_gold:
        more_words.add(word)

    words += more_words

    # 'center' appears in both inputs and must be counted once.
    assert len(list(words.iter_words())) == len(frozenset(gold + more_gold))
    for word in words.iter_words():
        assert word in gold or word in more_gold
def test_add_nonempty_to_nonempty():
    """In-place addition of two disjoint stores keeps the words of both."""
    gold = ['left', 'center', 'right']
    more_gold = ['surround']

    words = WordStore()
    for word in gold:
        words.add(word)

    # Fill the right-hand operand itself. Adding to ``words`` here would
    # leave ``more_words`` empty, so ``+=`` below would merge nothing and the
    # test would pass without exercising in-place addition.
    more_words = WordStore()
    for word in more_gold:
        more_words.add(word)

    words += more_words

    assert len(list(words.iter_words())) == len(gold) + len(more_gold)
    for word in words.iter_words():
        assert word in gold or word in more_gold
def test_add_nonempty_to_empty():
    """In-place addition into an empty store yields one word per input."""
    source_words = ['alpha', 'bravo', 'charlie']
    populated = WordStore()
    for item in source_words:
        populated.add(item)

    target = WordStore()
    target += populated

    merged = list(target.iter_words())
    assert len(merged) == len(source_words)
def load_database(src: str) -> WordStore:
    """Read a serialised WordStore from disk.

    Args:
        src: path of the file to read; ``~`` is expanded and the path is
            made absolute before opening.

    Returns:
        the store decoded from the file's json content.
    """
    path = os.path.abspath(os.path.expanduser(src))
    with open(path, 'r') as f:
        payload = f.read()

    return WordStore.from_json(payload)
예제 #9
0
def scrape(url: str, *, timeout: float = 30.0) -> WordStore:
    """Scrape HTML resource of its text.

    Args:
        url: HTML resource to scrape.
        timeout: seconds to wait on the server before giving up. Without an
            explicit timeout a request to an unresponsive server can block
            indefinitely, stalling the whole build.

    Returns:
        words in resource; an empty store when the server answers with an
        HTTP error status or the request times out.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except (requests.exceptions.HTTPError, requests.exceptions.Timeout):
        # Best effort: a source that cannot be fetched contributes no words.
        _logger.warning('Could not scrape %s', url)
        return WordStore()

    return scrape_html(r.text)
def test_empty_plus_empty():
    """Adding two empty stores gives a store equal to a fresh empty one."""
    left, right = WordStore(), WordStore()
    total = left + right

    assert total.store == WordStore().store
예제 #11
0
def test_populated(wordstore):
    """A populated store survives a to_json/from_json round trip."""
    original = wordstore(['something', 'in', 'the', 'way', 'she', 'moves'])

    round_tripped = WordStore.from_json(original.to_json())

    assert round_tripped.store == original.store
def test_add_empty():
    """In-place addition of an empty store to an empty store adds nothing."""
    target = WordStore()
    target += WordStore()

    remaining = list(target.iter_words())
    assert len(remaining) == 0
예제 #13
0
def test_add():
    """Each add() of a new word grows the store by one, keeping old words."""
    store = WordStore()

    store.add('albatross')
    after_first = list(store.iter_words())
    assert len(after_first) == 1
    assert 'albatross' in after_first

    store.add('ganet')
    after_second = list(store.iter_words())
    assert len(after_second) == 2
    for expected in ('albatross', 'ganet'):
        assert expected in after_second
예제 #14
0
    def build_store(words):
        """Create a WordStore and add each item of ``words`` to it."""
        populated = WordStore()
        for item in words:
            populated.add(item)

        return populated
def test_empty_plus_nonempty(wordstore):
    """An empty store plus a populated one equals the populated store."""
    populated = wordstore(['a', 'b', 'c'])
    blank = WordStore()

    total = blank + populated

    assert total.store == populated.store
def test_nonempty_plus_empty(wordstore):
    """A populated store plus an empty one equals the populated store."""
    populated = wordstore(['alpha', 'bravo', 'charlie'])
    blank = WordStore()

    total = populated + blank

    assert total.store == populated.store
예제 #17
0
def test_empty():
    """An empty store survives a to_json/from_json round trip."""
    original = WordStore()

    round_tripped = WordStore.from_json(original.to_json())

    assert round_tripped.store == original.store