def get_html(d_url, p_url, c_t):
    # d_url: driver URL, p_url: page URL, c_t: cache lifetime in seconds
    global cache_currentTime
    t = time()
    # serve the cached copy while it is still fresh
    if (p_url in cache_dict) and (t - cache_currentTime < c_t):
        print(f"Returned {p_url} from cache. Cached time: {t - cache_currentTime}")
        return cache_dict[p_url]
    browser = get_driver(d_url)
    if connect_to_site(browser, p_url):
        print(f"Get page {p_url}")
        html = browser.page_source
        cache_dict[p_url] = html
        cache_currentTime = t
        browser.quit()
        return html
    else:
        print(f"Error connecting to {p_url}")
        browser.quit()
        return None
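For context, here is a minimal sketch of the module-level cache state and the two helpers that get_html leans on, assuming Selenium with a remote driver endpoint; get_driver and connect_to_site are reconstructions inferred from the calls above, not the original implementations.

from time import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

cache_dict = {}          # page URL -> cached HTML
cache_currentTime = 0.0  # timestamp of the last cache write


def get_driver(d_url):
    # d_url is assumed to be a remote WebDriver endpoint, e.g. a Selenium Grid hub
    options = Options()
    options.add_argument("--headless")
    return webdriver.Remote(command_executor=d_url, options=options)


def connect_to_site(browser, p_url):
    # return True only if the page loads without raising
    try:
        browser.get(p_url)
        return True
    except Exception:
        return False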
def run_process(page_number, filename):
    browser = get_driver()
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
        browser.quit()
    else:
        print('Error connecting to hackernews')
        browser.quit()
def run_process(page_number, filename, headless):
    # init browser
    browser = get_driver(headless)
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
        # exit
        browser.quit()
    else:
        print("Error connecting to hackernews")
        browser.quit()
        output_list = parse_html(html)
        ########
        # here #
        ########
        write_to_file(output_list, filename)
    else:
        print('Error connecting to hackernews')


if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'output_{output_timestamp}.csv'
    browser = get_driver()
    # scrape and crawl
    while current_page <= 20:
        print(f'Scraping page #{current_page}...')
        run_process(current_page, output_filename, browser)
        ########
        # here #
        ########
        current_page = current_page + 1
    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
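For reference, a hedged sketch of two helpers the loop above depends on, assuming connect_to_base builds the paginated Hacker News URL and write_to_file appends dict rows with csv.DictWriter; the URL pattern and field names are guesses, not taken from the original module.

import csv


def connect_to_base(browser, page_number):
    # assumed pagination scheme: https://news.ycombinator.com/news?p=<n>
    base_url = f'https://news.ycombinator.com/news?p={page_number}'
    try:
        browser.get(base_url)
        return True
    except Exception:
        return False


def write_to_file(output_list, filename):
    # assuming parse_html returns a list of dicts, one per story;
    # the field names here are hypothetical
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['id', 'title'])
        for row in output_list:
            writer.writerow(row)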
import sys
from time import sleep

from scrapers.scraper import get_driver, connect_to_base, parse_html


def run_process(browser):
    if connect_to_base(browser):
        print('Scraping random Wikipedia page...')
        sleep(2)
        html = browser.page_source
        return parse_html(html)
    else:
        print('Error connecting to Wikipedia')
        return False


if __name__ == '__main__':
    browser = get_driver(sys.argv[1])
    data = run_process(browser)
    print(data)
    browser.quit()
    print('Finished!')
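A minimal sketch of what scrapers/scraper.py might expose for this script, assuming a local Chrome driver, BeautifulSoup parsing, and Wikipedia's Special:Random page; all three bodies are reconstructions under those assumptions, not the actual module.

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

BASE_URL = 'https://en.wikipedia.org/wiki/Special:Random'


def get_driver(headless):
    # the script passes sys.argv[1] straight through, so this assumes
    # the flag arrives as the string 'headless'
    options = Options()
    if headless == 'headless':
        options.add_argument('--headless')
    return webdriver.Chrome(options=options)


def connect_to_base(browser):
    try:
        browser.get(BASE_URL)
        return True
    except Exception:
        return False


def parse_html(html):
    # pull just the article title as an example payload
    soup = BeautifulSoup(html, 'html.parser')
    heading = soup.find('h1')
    return {'title': heading.text if heading else None}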
def test_browser(self):
    browser = get_driver(self.driver_url)
    self.assertIsNotNone(browser)
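A possible shell for the test above; the class name, the driver_url value, and the unittest scaffolding are assumptions added only so the method can run (test_browser would live on this class).

import unittest


class BrowserTestCase(unittest.TestCase):
    # test_browser above is assumed to be defined on this class
    def setUp(self):
        # hypothetical remote driver endpoint; point this at your own setup
        self.driver_url = 'http://localhost:4444/wd/hub'


if __name__ == '__main__':
    unittest.main()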
if __name__ == "__main__":
    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"
    # init browser
    browser = get_driver(headless=headless)
    # scrape and crawl
    while current_page <= 20:
        print(f"Scraping page #{current_page}...")
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1
    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")
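To round out this script, a hedged sketch of a parse_html for Hacker News, assuming BeautifulSoup and the current 'athing'/'titleline' markup; the selectors are guesses and may need adjusting if the site's HTML changes.

from bs4 import BeautifulSoup


def parse_html(html):
    # collect one dict per story row; selectors assume current HN markup
    soup = BeautifulSoup(html, 'html.parser')
    output_list = []
    for row in soup.find_all('tr', class_='athing'):
        title_span = row.find('span', class_='titleline')
        output_list.append({
            'id': row.get('id'),
            'title': title_span.a.text if title_span and title_span.a else None,
        })
    return output_list

Run the script as python script.py headless to exercise the flag parsed at the top of the block; without the argument it falls back to a visible browser window.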