def test_can_return_link_to_new_article():
    response = '<div id="content"><a href="/wiki/Web_scraping"></a></div>'
    soup = scraper.parse_html(response)
    link = scraper.get_new_article(soup)
    assert link == "https://en.wikipedia.org/wiki/Web_scraping"
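The tests exercise a `scraper` module that is not shown here. Since `get_new_article` operates on the parsed result, `scraper.parse_html` is presumably a thin BeautifulSoup wrapper; a minimal sketch under that assumption (the parser backend is also an assumption):

# Hypothetical sketch of scraper.parse_html as assumed by the tests above.
from bs4 import BeautifulSoup

def parse_html(html):
    # 'html.parser' is an assumed backend; any BeautifulSoup parser works.
    return BeautifulSoup(html, "html.parser")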
def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        # Give the page a moment to finish rendering before grabbing it.
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print('Error connecting to hackernews')
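run_process assumes a connect_to_base helper that navigates to a given Hacker News page and returns True once it loads. A sketch of one plausible Selenium implementation, retrying a few times (the URL pattern, retry count, and element id are assumptions about the current Hacker News markup):

# Hypothetical sketch of connect_to_base as assumed by run_process above.
from time import sleep
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def connect_to_base(browser, page_number, attempts=3):
    # Hacker News paginates via the 'p' query parameter.
    base_url = f'https://news.ycombinator.com/news?p={page_number}'
    for _ in range(attempts):
        browser.get(base_url)
        try:
            # 'hnmain' is the id of the page's main content table.
            browser.find_element(By.ID, 'hnmain')
            return True
        except NoSuchElementException:
            sleep(1)
    return False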
def main():
    # file = 'pdfs/01-kevin-rose.pdf'
    df = pd.read_csv('links5.csv', sep='|')
    episodes = list(df.loc[:, 'ep'])
    urls = list(df.loc[:, 'urls'])
    guests = list(df.loc[:, 'guests'])
    for i, url in enumerate(urls):
        print(episodes[i], guests[i].split(','))
        # Dispatch on the URL's last character: 'f' marks a '.pdf'
        # transcript, a trailing '/' marks an HTML episode page.
        if url[-1] == 'f':
            data = pdf_parser(episodes[i], url, guests[i].split(','))
        elif url[-1] == '/':
            data = parse_html(episodes[i], url)
        else:
            print('ERROR ', episodes[i])
            data = {'file_type': 'audio'}
        # Persist each episode's parsed data as JSON.
        with open(f'files/eps/{episodes[i]}.json', 'w') as fp:
            json.dump(data, fp, sort_keys=True)
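The last-character check is a terse shortcut for a suffix test. A small hypothetical helper (classify_url is not in the original code; the example URLs are invented) makes the same dispatch explicit and self-checking:

# Hypothetical helper illustrating the dispatch main() performs per URL.
def classify_url(url: str) -> str:
    if url.endswith('.pdf'):   # main() shortcuts this to url[-1] == 'f'
        return 'pdf'
    if url.endswith('/'):      # trailing slash marks an HTML episode page
        return 'html'
    return 'audio'             # everything else falls back to audio-only

assert classify_url('https://example.com/transcripts/101.pdf') == 'pdf'
assert classify_url('https://example.com/episodes/102/') == 'html'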
def test_get_new_article_fails_when_no_link():
    response = '<div id="content"></div>'
    soup = scraper.parse_html(response)
    with pytest.raises(OutOfLinksException):
        scraper.get_new_article(soup)
def test_does_not_follow_hashtag_links():
    response = '<div id="content"><a href="/wiki/Web_scraping#Something"></a></div>'
    soup = scraper.parse_html(response)
    with pytest.raises(OutOfLinksException):
        scraper.get_new_article(soup)
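Taken together, these tests pin down get_new_article's contract: return an absolute URL for the first in-content /wiki/ link, skip fragment links, and raise OutOfLinksException when nothing qualifies. A sketch that would satisfy them (the exception definition and the exact filtering rules are assumptions, not the original implementation):

# Hypothetical implementation consistent with the three tests above.
class OutOfLinksException(Exception):
    pass

def get_new_article(soup):
    content = soup.find(id="content")
    for anchor in content.find_all("a", href=True):
        href = anchor["href"]
        # Skip section links like /wiki/Web_scraping#Something.
        if href.startswith("/wiki/") and "#" not in href:
            return f"https://en.wikipedia.org{href}"
    raise OutOfLinksException("No suitable article link found")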
def test_can_return_title():
    response = scraper.get_page_data(
        "https://en.wikipedia.org/wiki/Web_scraping")
    soup = scraper.parse_html(response.content)
    assert scraper.get_title(soup) == "Web scraping"
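This test hits the live page, so it additionally assumes get_page_data and get_title helpers. A plausible sketch using requests (the timeout and the h1 selector are assumptions based on Wikipedia's markup, where the article title sits in the first h1):

# Hypothetical helpers consistent with the test above.
import requests

def get_page_data(url):
    # Returns the full Response; the test reads .content from it.
    return requests.get(url, timeout=10)

def get_title(soup):
    # Wikipedia renders the article title in the page's first <h1>.
    return soup.find("h1").get_text(strip=True)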