Example #1
import scraper  # project module under test


def test_can_return_link_to_new_article():
    # A content div with one relative wiki link.
    response = '<div id="content"><a href="/wiki/Web_scraping"></a></div>'
    soup = scraper.parse_html(response)

    # The relative href should come back as an absolute Wikipedia URL.
    link = scraper.get_new_article(soup)
    assert link == "https://en.wikipedia.org/wiki/Web_scraping"
Example #2
from time import sleep

# connect_to_base, parse_html, and write_to_file are helpers defined
# elsewhere in the scraping project.


def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        sleep(2)  # give the page a moment to finish loading
        html = browser.page_source  # rendered HTML (a Selenium WebDriver attribute)
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print('Error connecting to hackernews')
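A hedged usage sketch for the function above: the page_source attribute suggests browser is a Selenium WebDriver, so the driver setup, page number, and output filename below are illustrative assumptions rather than part of the original example.

from selenium import webdriver

# Illustrative only: the real project may configure the driver differently
# (e.g. headless mode) and pick its own output filename.
browser = webdriver.Chrome()
try:
    run_process(page_number=1, filename='output.csv', browser=browser)
finally:
    browser.quit()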
Example #3
import json

import pandas as pd

# pdf_parser and parse_html are helpers defined elsewhere in the project.


def main():
    # file = 'pdfs/01-kevin-rose.pdf'
    df = pd.read_csv('links5.csv', sep='|')
    episodes = list(df.loc[:, 'ep'])
    urls = list(df.loc[:, 'urls'])
    guests = list(df.loc[:, 'guests'])
    for i, url in enumerate(urls):
        print(episodes[i], guests[i].split(','))
        if url[-1] == 'f':
            # URL ends in 'f': treat it as a PDF (e.g. '.pdf').
            data = pdf_parser(episodes[i], url, guests[i].split(','))
        elif url[-1] == '/':
            # URL ends in '/': treat it as an HTML page.
            data = parse_html(episodes[i], url)
        else:
            # Anything else is assumed to be audio-only.
            print('ERROR ', episodes[i])
            data = {'file_type': 'audio'}

        with open(f'files/eps/{episodes[i]}.json', 'w') as fp:
            json.dump(data, fp, sort_keys=True)
Example #4
import pytest
import scraper
from scraper import OutOfLinksException  # assumed to live in the scraper module


def test_get_new_article_fails_when_no_link():
    response = '<div id="content"></div>'
    soup = scraper.parse_html(response)

    # A content div with no links should raise.
    with pytest.raises(OutOfLinksException):
        scraper.get_new_article(soup)
Example #5
import pytest
import scraper
from scraper import OutOfLinksException  # assumed to live in the scraper module


def test_does_not_follow_hashtag_links():
    # A link that only points to an anchor on the same article is ignored,
    # so no new article can be returned.
    response = '<div id="content"><a href="/wiki/Web_scraping#Something"></a></div>'
    soup = scraper.parse_html(response)

    with pytest.raises(OutOfLinksException):
        scraper.get_new_article(soup)
Example #6
import scraper


def test_can_return_title():
    # Integration-style test: fetches the live Wikipedia article.
    response = scraper.get_page_data(
        "https://en.wikipedia.org/wiki/Web_scraping")
    soup = scraper.parse_html(response.content)

    assert scraper.get_title(soup) == "Web scraping"
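The test examples that bind the result of parse_html to a variable named soup (Examples #1, #4, #5, and #6) suggest the project's helper is a thin wrapper around BeautifulSoup. A minimal sketch under that assumption (the parser choice is a guess; Example #2's parse_html, which returns a list, would need project-specific logic):

from bs4 import BeautifulSoup


def parse_html(html):
    # Wrap raw HTML in a BeautifulSoup object so callers can query tags,
    # links, and titles from it.
    return BeautifulSoup(html, 'html.parser')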