def browse_website(url, num_links, webdriver, proxy_queue, browser_params):
    """
    Calls get_website before visiting <num_links> links present on the page.

    NOTE: top_url will NOT be properly labeled for requests to subpages;
    these will still have the top_url set to the url passed as a parameter
    to this function.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params)

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = filter(lambda x: x.is_displayed(), links)
        if len(links) == 0:
            break
        r = int(random.random() * len(links))  # index of a random visible link
        print "BROWSE: visiting link to %s" % links[r].get_attribute("href")
        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception:
            pass
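# get_intra_links is used by every version in this section but is not
# defined here. A minimal, hypothetical sketch of such a helper, assuming
# it returns the anchor elements whose href stays on the crawled page's
# hostname (the real helper may instead compare registered domains via a
# public-suffix list):
from urlparse import urlparse  # urllib.parse on Python 3


def get_intra_links(webdriver, url):
    # Collect anchors whose href points at the same hostname as `url`.
    hostname = urlparse(url).hostname
    links = []
    for elem in webdriver.find_elements_by_tag_name('a'):
        href = elem.get_attribute('href')
        if href and urlparse(href).hostname == hostname:
            links.append(elem)
    return links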
def browse_website(url, num_links, webdriver, proxy_queue, browser_params,
                   manager_params, extension_socket):
    """
    Calls get_website before visiting <num_links> links present on the page.

    NOTE: top_url will NOT be properly labeled for requests to subpages;
    these will still have the top_url set to the url passed as a parameter
    to this function.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = filter(lambda x: x.is_displayed(), links)
        if len(links) == 0:
            break
        r = int(random.random() * len(links))  # index of a random visible link
        logger.info("BROWSER %i: visiting internal link %s" %
                    (browser_params['crawl_id'],
                     links[r].get_attribute("href")))
        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception:
            pass
def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> links present on the page.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue,
                browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = filter(lambda x: x.is_displayed(), links)
        if len(links) == 0:
            break
        r = int(random.random() * len(links))  # index of a random visible link
        logger.info("BROWSER %i: visiting internal link %s" %
                    (browser_params['crawl_id'],
                     links[r].get_attribute("href")))
        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception:
            pass
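# bot_mitigation is called by every version above when
# browser_params['bot_mitigation'] is set, but is not defined in this
# section. A hypothetical sketch, assuming the goal is to look less like a
# headless bot by adding a few random mouse movements, a random scroll,
# and a short pause:
import random
import time

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains


def bot_mitigation(webdriver):
    # A few small, random relative mouse movements.
    for _ in range(random.randrange(1, 5)):
        try:
            ActionChains(webdriver).move_by_offset(
                random.randrange(0, 10), random.randrange(0, 10)).perform()
        except WebDriverException:
            pass  # movement may land out of bounds; ignore and continue
        time.sleep(random.random())
    # Scroll a random amount and pause briefly.
    webdriver.execute_script(
        'window.scrollBy(0, %d);' % random.randrange(100, 600))
    time.sleep(random.random() + 0.5)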
def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> links present on the page.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue,
                browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = filter(lambda x: x.is_displayed(), links)
        if len(links) == 0:
            break
        r = int(random.random() * len(links))  # index of a random visible link
        logger.info("BROWSER %i: visiting internal link %s" %
                    (browser_params['crawl_id'],
                     links[r].get_attribute("href")))
        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            wait_until_loaded(webdriver, 300)
        except Exception:
            pass
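# wait_until_loaded(webdriver, timeout) is used throughout but not defined
# here. A minimal sketch, assuming it simply polls document.readyState
# until the page reports 'complete' or the timeout (in seconds) expires:
import time


def wait_until_loaded(webdriver, timeout, period=0.25):
    end_time = time.time() + timeout
    while time.time() < end_time:
        state = webdriver.execute_script('return document.readyState;')
        if state == 'complete':
            return True
        time.sleep(period)
    return False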
def browse_and_dump_source(url, num_links, sleep, visit_id, webdriver,
                           proxy_queue, browser_params, manager_params,
                           extension_sockets):
    """Calls get_website before visiting <num_links> links present on the page.

    Each link visited will do a recursive page source dump.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue,
                browser_params, extension_sockets)
    recursive_dump_page_source(visit_id, webdriver, manager_params, suffix='0')

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    already_clicked = set()
    for i in range(num_links):
        all_links = get_intra_links(webdriver, url)
        disp_links = filter(lambda x: is_displayed(x), all_links)
        links = filter(lambda x: _filter_out_clicks(x, already_clicked),
                       disp_links)
        if len(links) == 0:
            break
        random.shuffle(links)
        clicked = False
        for link in links:
            try:
                href = link.get_attribute('href')
                already_clicked.add(href)
                logger.info("BROWSER %i: Trying to click %s out of "
                            "%i links" % (browser_params['crawl_id'],
                                          href, len(links)))
                link.click()
            except ElementNotVisibleException:
                continue
            except WebDriverException:
                continue
            except Exception, e:
                logger.error("BROWSER %i: Exception trying to visit %s, %s" %
                             (browser_params['crawl_id'],
                              link.get_attribute("href"), str(e)))
                continue
            logger.info("BROWSER %i: visiting internal link %s" %
                        (browser_params['crawl_id'], href))
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            recursive_dump_page_source(visit_id, webdriver, manager_params,
                                       suffix=str(i + 1))
            webdriver.back()
            time.sleep(max(1, sleep))
            wait_until_loaded(webdriver, 300)
            clicked = True
            break
        if not clicked:
            break
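# browse_and_dump_source relies on two small helpers that are not shown in
# this section. Hypothetical sketches, assuming is_displayed wraps the
# element method so stale elements count as hidden, and _filter_out_clicks
# drops links whose href was already clicked this session:
from selenium.common.exceptions import WebDriverException


def is_displayed(element):
    try:
        return element.is_displayed()
    except WebDriverException:
        # Stale or otherwise broken elements are treated as hidden.
        return False


def _filter_out_clicks(link, already_clicked):
    try:
        return link.get_attribute('href') not in already_clicked
    except WebDriverException:
        return False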