def browse_website(url, num_links, webdriver, proxy_queue, browser_params,
                   manager_params, extension_socket):
    """Load <url>, then click up to <num_links> random internal links.

    Calls get_website for the landing page. Afterwards, up to <num_links>
    times, one displayed intra-site link is picked uniformly at random,
    clicked, given time to load, and the browser navigates back.

    NOTE: top_url will NOT be properly labeled for requests to subpages;
    these will still have the top_url set to the url passed as a parameter
    to this function.

    Args:
        url: page to load first; also passed to get_intra_links.
        num_links: maximum number of internal links to click.
        webdriver: browser driver used for all navigation.
        proxy_queue: forwarded unchanged to get_website.
        browser_params: dict; 'crawl_id' is logged and 'bot_mitigation'
            toggles the bot_mitigation call on each subpage.
        manager_params: dict; 'logger_address' is unpacked into
            loggingclient.
        extension_socket: forwarded unchanged to get_website.

    Returns:
        None. Stops early when no displayed internal link is found.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for _ in range(num_links):
        # A list (not a lazy filter object) so len() and indexing work.
        links = [link for link in get_intra_links(webdriver, url)
                 if link.is_displayed()]
        if not links:
            break

        # random.random() lies in [0, 1), so this index is uniform over
        # 0..len(links)-1. Subtracting 1 here would practically never pick
        # the last link and could yield index -1.
        chosen = links[int(random.random() * len(links))]
        logger.info("BROWSER %i: visiting internal link %s" % (
            browser_params['crawl_id'], chosen.get_attribute("href")))
        try:
            chosen.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            # Let the original page settle before links are collected again.
            wait_until_loaded(webdriver, 300)
        except Exception:
            # Deliberate best effort: a link that cannot be clicked or
            # loaded must not abort the remaining visits.
            pass
def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> present on the page.

    After the landing page is loaded, up to <num_links> times one displayed
    intra-site link is picked uniformly at random, clicked, given time to
    load, and the browser navigates back.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links
    visited.

    Args:
        url: page to load first; also passed to get_intra_links.
        num_links: maximum number of internal links to click.
        sleep: forwarded to get_website; each subpage is kept open for
            max(1, sleep) seconds.
        visit_id: forwarded unchanged to get_website.
        webdriver: browser driver used for all navigation.
        proxy_queue: forwarded unchanged to get_website.
        browser_params: dict; 'crawl_id' is logged and 'bot_mitigation'
            toggles the bot_mitigation call on each subpage.
        manager_params: dict; 'logger_address' is unpacked into
            loggingclient.
        extension_socket: forwarded unchanged to get_website.

    Returns:
        None. Stops early when no displayed internal link is found.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params,
                extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for _ in range(num_links):
        # A list (not a lazy filter object) so len() and indexing work.
        links = [link for link in get_intra_links(webdriver, url)
                 if link.is_displayed()]
        if not links:
            break

        # random.random() lies in [0, 1), so this index is uniform over
        # 0..len(links)-1. Subtracting 1 here would practically never pick
        # the last link and could yield index -1.
        chosen = links[int(random.random() * len(links))]
        logger.info("BROWSER %i: visiting internal link %s" % (
            browser_params['crawl_id'], chosen.get_attribute("href")))
        try:
            chosen.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            # Let the original page settle before links are collected again.
            wait_until_loaded(webdriver, 300)
        except Exception:
            # Deliberate best effort: a link that cannot be clicked or
            # loaded must not abort the remaining visits.
            pass
def browse_website(url, num_links, webdriver, proxy_queue, browser_params):
    """Load <url>, then click up to <num_links> random internal links.

    Calls get_website for the landing page. Afterwards, up to <num_links>
    times, one displayed intra-site link is picked uniformly at random,
    clicked, given time to load, and the browser navigates back.

    NOTE: top_url will NOT be properly labeled for requests to subpages;
    these will still have the top_url set to the url passed as a parameter
    to this function.

    Args:
        url: page to load first; also passed to get_intra_links.
        num_links: maximum number of internal links to click.
        webdriver: browser driver used for all navigation.
        proxy_queue: forwarded unchanged to get_website.
        browser_params: dict; 'bot_mitigation' toggles the bot_mitigation
            call on each subpage.

    Returns:
        None. Stops early when no displayed internal link is found.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params)

    # Then visit a few subpages
    for _ in range(num_links):
        # A list (not a lazy filter object) so len() and indexing work.
        links = [link for link in get_intra_links(webdriver, url)
                 if link.is_displayed()]
        if not links:
            break

        # random.random() lies in [0, 1), so this index is uniform over
        # 0..len(links)-1. Subtracting 1 here would practically never pick
        # the last link and could yield index -1.
        chosen = links[int(random.random() * len(links))]
        # Parenthesised single argument: prints the same text whether print
        # is a statement or a function.
        print("BROWSE: visiting link to %s" % chosen.get_attribute("href"))
        try:
            chosen.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            # Let the original page settle before links are collected again.
            wait_until_loaded(webdriver, 300)
        except Exception:
            # Deliberate best effort: a link that cannot be clicked or
            # loaded must not abort the remaining visits.
            pass
def browse_website(url, num_links, webdriver, proxy_queue, browser_params):
    """Load <url>, then click up to <num_links> random internal links.

    Calls get_website for the landing page. Afterwards, up to <num_links>
    times, one displayed intra-site link is picked uniformly at random,
    clicked, given time to load, and the browser navigates back.

    NOTE: top_url will NOT be properly labeled for requests to subpages;
    these will still have the top_url set to the url passed as a parameter
    to this function.

    Args:
        url: page to load first; also passed to get_intra_links.
        num_links: maximum number of internal links to click.
        webdriver: browser driver used for all navigation.
        proxy_queue: forwarded unchanged to get_website.
        browser_params: dict; 'bot_mitigation' toggles the bot_mitigation
            call on each subpage.

    Returns:
        None. Stops early when no displayed internal link is found.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params)

    # Then visit a few subpages
    for _ in range(num_links):
        # A list (not a lazy filter object) so len() and indexing work.
        displayed = [link for link in get_intra_links(webdriver, url)
                     if link.is_displayed()]
        if not displayed:
            break

        # random.random() lies in [0, 1), so this index is uniform over
        # 0..len(displayed)-1. Subtracting 1 here would practically never
        # pick the last link and could yield index -1.
        target = displayed[int(random.random() * len(displayed))]
        # Parenthesised single argument: prints the same text whether print
        # is a statement or a function.
        print("BROWSE: visiting link to %s" % target.get_attribute("href"))
        try:
            target.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            # Let the original page settle before links are collected again.
            wait_until_loaded(webdriver, 300)
        except Exception:
            # Deliberate best effort: a link that cannot be clicked or
            # loaded must not abort the remaining visits.
            pass
def browse_website(url, num_links, webdriver, proxy_queue, browser_params,
                   manager_params, extension_socket):
    """Load <url>, then click up to <num_links> random internal links.

    Calls get_website for the landing page. Afterwards, up to <num_links>
    times, one displayed intra-site link is picked uniformly at random,
    clicked, given time to load, and the browser navigates back.

    NOTE: top_url will NOT be properly labeled for requests to subpages;
    these will still have the top_url set to the url passed as a parameter
    to this function.

    Args:
        url: page to load first; also passed to get_intra_links.
        num_links: maximum number of internal links to click.
        webdriver: browser driver used for all navigation.
        proxy_queue: forwarded unchanged to get_website.
        browser_params: dict; 'crawl_id' is logged and 'bot_mitigation'
            toggles the bot_mitigation call on each subpage.
        manager_params: dict; 'logger_address' is unpacked into
            loggingclient.
        extension_socket: forwarded unchanged to get_website.

    Returns:
        None. Stops early when no displayed internal link is found.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for _ in range(num_links):
        # A list (not a lazy filter object) so len() and indexing work.
        displayed = [link for link in get_intra_links(webdriver, url)
                     if link.is_displayed()]
        if not displayed:
            break

        # random.random() lies in [0, 1), so this index is uniform over
        # 0..len(displayed)-1. Subtracting 1 here would practically never
        # pick the last link and could yield index -1.
        target = displayed[int(random.random() * len(displayed))]
        logger.info(
            "BROWSER %i: visiting internal link %s" % (
                browser_params['crawl_id'], target.get_attribute("href")))
        try:
            target.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            # Let the original page settle before links are collected again.
            wait_until_loaded(webdriver, 300)
        except Exception:
            # Deliberate best effort: a link that cannot be clicked or
            # loaded must not abort the remaining visits.
            pass
def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
                   browser_params, manager_params, extension_socket):
    """Visit <url> via get_website, then click through random internal links.

    At most <num_links> subpages are visited. Each round collects the
    displayed intra-site links, picks one at random, clicks it, waits for
    the load plus max(1, sleep) seconds, optionally runs bot_mitigation,
    and navigates back. The loop ends early once no displayed link remains.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links
    visited.
    """
    # Landing page first.
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params,
                extension_socket)

    # Logger connection.
    logger = loggingclient(*manager_params['logger_address'])

    # Subpage visits.
    for _round in range(num_links):
        candidates = get_intra_links(webdriver, url)
        visible = [el for el in candidates if el.is_displayed() == True]
        if not visible:
            break

        pick = int(random.random() * len(visible))
        target = visible[pick]
        logger.info(
            "BROWSER %i: visiting internal link %s"
            % (browser_params['crawl_id'], target.get_attribute("href")))
        try:
            target.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            wait_until_loaded(webdriver, 300)
        except Exception:
            # Failures on a single subpage are ignored; carry on with the
            # next round.
            pass
def browse_and_dump_source(url, num_links, sleep, visit_id, webdriver,
                           proxy_queue, browser_params, manager_params,
                           extension_sockets):
    """Load <url>, dump its source, then do the same for clicked subpages.

    The landing page is fetched with get_website and dumped with suffix
    '0'. Then, for up to <num_links> rounds, the displayed intra-site links
    that pass _filter_out_clicks are shuffled and tried in order until one
    click succeeds; that subpage is dumped with suffix str(round + 1) and
    the browser navigates back. The function stops as soon as a round has
    no candidate links or none of them could be clicked.
    """
    # Landing page and its source dump.
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params,
                extension_sockets)
    recursive_dump_page_source(visit_id, webdriver, manager_params, suffix='0')

    # Logger connection.
    logger = loggingclient(*manager_params['logger_address'])

    # Subpage visits; hrefs are remembered so no link is tried twice.
    tried_hrefs = set()
    for round_no in range(num_links):
        candidates = get_intra_links(webdriver, url)
        # Two separate passes: all display checks run before any
        # already-clicked checks.
        visible = [el for el in candidates if is_displayed(el)]
        fresh = [el for el in visible if _filter_out_clicks(el, tried_hrefs)]
        if not fresh:
            break
        random.shuffle(fresh)

        for element in fresh:
            try:
                href = element.get_attribute('href')
                tried_hrefs.add(href)
                logger.info(
                    "BROWSER %i: Trying to click %s out of %i links"
                    % (browser_params['crawl_id'], href, len(fresh)))
                element.click()
            except (ElementNotVisibleException, WebDriverException):
                # Not clickable: silently move on to the next candidate.
                continue
            except Exception as err:
                logger.error(
                    "BROWSER %i: Exception trying to visit %s, %s"
                    % (browser_params['crawl_id'],
                       element.get_attribute("href"), str(err)))
                continue

            # The click went through: record the page and return.
            logger.info("BROWSER %i: visiting internal link %s"
                        % (browser_params['crawl_id'], href))
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            recursive_dump_page_source(visit_id, webdriver, manager_params,
                                       suffix=str(round_no + 1))
            webdriver.back()
            time.sleep(max(1, sleep))
            wait_until_loaded(webdriver, 300)
            break
        else:
            # Every candidate failed to click; give up on further rounds.
            break
def _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                        browser_params, manager_params, logger):
    """Finds and fills a form, and returns True if accomplished.

    Looks for a newsletter form on the current page and, failing that, in
    each of its iframes. When a form is found it is filled and submitted
    with data derived from email_producer, and one follow-up form (in a
    pop-up window or on the current page) is filled as well. Pop-up windows
    are closed before returning.

    Args:
        webdriver: browser driver positioned on the page to scan.
        email_producer: callable invoked as
            email_producer(current_url, current_site_title); its result is
            passed to _get_user_info and logged as the email.
        visit_id: stringified and used as the prefix of debug file names.
        debug: when truthy, page sources are dumped and screenshots saved
            around each submission (with extra 3 second sleeps).
        browser_params: forwarded to the dump/screenshot/submit helpers.
        manager_params: forwarded to the dump/screenshot/submit helpers.
        logger: receives an info record after the initial submission.

    Returns:
        False when no newsletter form is found on the page or in any
        iframe; True after a form has been submitted.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; spots where more than one nesting is
    syntactically possible are marked.
    """
    current_url = webdriver.current_url
    current_site_title = webdriver.title.encode('ascii', 'replace')
    main_handle = webdriver.current_window_handle
    in_iframe = False

    # debug: save before/after screenshots and page source
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    # try to find newsletter form on landing page
    newsletter_form = _find_newsletter_form(webdriver)
    if newsletter_form is None:
        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name('iframe')
        for iframe in iframes:
            # switch to the iframe
            webdriver.switch_to_frame(iframe)

            # is there a form?
            newsletter_form = _find_newsletter_form(webdriver)
            if newsletter_form is not None:
                if debug:
                    dump_page_source(debug_page_source_initial, webdriver,
                                     browser_params, manager_params)
                in_iframe = True
                break  # form found, stay on the iframe

            # switch back
            webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            return False
    # NOTE(review): this elif is paired with the outer "is None" test (form
    # found directly on the page); confirm against the original layout.
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver,
                         browser_params, manager_params)

    # fill and submit the form that was found
    email = email_producer(current_url, current_site_title)
    user_info = _get_user_info(email)
    _form_fill_and_submit(newsletter_form, user_info, webdriver, False,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    logger.info('submitted form on [%s] with email [%s]', current_url, email)
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)
    # NOTE(review): both statements are assumed to belong to "if debug".
    if debug:
        time.sleep(3)
        save_screenshot(debug_form_post_initial, webdriver, browser_params,
                        manager_params)

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        form_found_in_popup = False  # assigned but never read in this function
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # find newsletter form
                # NOTE(review): the fill/submit is nested under the lookup so
                # that only a newly found form is submitted; confirm.
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(webdriver)
                    if follow_up_form is not None:
                        if debug:
                            dump_page_source(debug_page_source_followup,
                                             webdriver, browser_params,
                                             manager_params)
                        _form_fill_and_submit(
                            follow_up_form, user_info, webdriver, True,
                            browser_params, manager_params,
                            debug_form_pre_followup if debug else None)
                        time.sleep(_FORM_SUBMIT_SLEEP)
                        _dismiss_alert(webdriver)
                        if debug:
                            time.sleep(3)
                            save_screenshot(debug_form_post_followup,
                                            webdriver, browser_params,
                                            manager_params)

                # every non-main window is closed after it has been checked
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        follow_up_form = _find_newsletter_form(webdriver)
        if follow_up_form is not None:
            if debug:
                time.sleep(3)
                dump_page_source(debug_page_source_followup, webdriver,
                                 browser_params, manager_params)
            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  debug_form_pre_followup if debug else None)
            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            if debug:
                time.sleep(3)
                save_screenshot(debug_form_post_followup, webdriver,
                                browser_params, manager_params)

    # switch back
    if in_iframe:
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True
def fill_forms(url, email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket):
    """Finds a newsletter form on the page. If not found, visits <num_links>
    internal links and scans those pages for a form. Submits the form if
    found.

    Args:
        url: page to load first.
        email_producer: forwarded to _find_and_fill_form.
        num_links: maximum number of internal links to follow.
        page_timeout: passed to webdriver.set_page_load_timeout.
        debug: forwarded to _find_and_fill_form.
        visit_id: forwarded to get_website and _find_and_fill_form.
        webdriver: browser driver used for all navigation.
        proxy_queue: forwarded to get_website.
        browser_params: dict; 'bot_mitigation' toggles bot_mitigation after
            each clicked link. Also forwarded to helpers.
        manager_params: dict; 'logger_address' is unpacked into
            loggingclient. Also forwarded to helpers.
        extension_socket: forwarded to get_website.

    Returns:
        None. Returns as soon as _find_and_fill_form reports success, or
        when the link budget or the candidate links are exhausted.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm the loop targets of the
    continue/break statements against the original layout.
    """
    # load the site
    webdriver.set_page_load_timeout(page_timeout)
    get_website(url, 0, visit_id, webdriver, proxy_queue, browser_params,
                extension_socket)

    # connect to the logger
    logger = loggingclient(*manager_params['logger_address'])

    # try to find a newsletter form on the landing page
    if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                           browser_params, manager_params, logger):
        return

    # otherwise, scan more pages
    main_handle = webdriver.current_window_handle
    visited_links = set()  # hrefs already chosen for clicking
    for i in xrange(num_links):
        # get all links on the page
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)

        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        # match_links collects (element, rank, lowercased text, href, flags)
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            try:
                if not link.is_displayed():
                    continue

                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue

                # check if this is an internal link
                if not _is_internal_link(href, current_url, current_ps1):
                    continue

                link_text = link.text.lower()

                # skip links with blacklisted text
                blacklisted = False
                for bl_text in _LINK_TEXT_BLACKLIST:
                    if bl_text in link_text:
                        blacklisted = True
                        break
                if blacklisted:
                    continue

                # should we click this link?
                # the first matching _LINK_TEXT_RANK entry decides the rank
                link_rank = 0
                for type, s, rank, flags in _LINK_TEXT_RANK:
                    if (type == _TYPE_TEXT and s in link_text) or (type == _TYPE_HREF and s in href):
                        if flags & _FLAG_IN_NEW_URL_ONLY:
                            # don't use this link if the current page URL already matches too
                            if type == _TYPE_HREF and s in current_url:
                                continue

                        # link matches!
                        link_rank = rank
                        match_links.append(
                            (link, rank, link_text, href, flags))
                        break
                if link_rank >= _LINK_RANK_SKIP:  # good enough, stop looking
                    break
            except:
                # any error on a single link is logged and the link skipped
                logger.error("error while looping through links...")

            # quit if too much time passed (for some reason, this is really slow...)
            # only applies once at least one candidate has been collected
            if match_links and timeit.default_timer(
            ) - start_time > _LINK_MATCH_TIMEOUT:
                break

        # find the best link to click
        if not match_links:
            break  # no more links to click
        # ascending sort by rank; the last entry has the highest rank
        match_links.sort(key=lambda l: l[1])
        next_link = match_links[-1]
        visited_links.add(next_link[3])

        # click the link
        try:
            # load the page
            logger.info("clicking on link '%s' - %s" % (next_link[2], next_link[3]))
            next_link[0].click()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                                   browser_params, manager_params, logger):
                return

            # should we stay on this page?
            # if so, the next round collects links from the new page
            if next_link[4] & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window != main_handle:
                        webdriver.switch_to_window(window)
                        wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                        # find newsletter form
                        if _find_and_fill_form(webdriver, email_producer,
                                               visit_id, debug,
                                               browser_params, manager_params,
                                               logger):
                            form_found_in_popup = True

                        webdriver.close()
                webdriver.switch_to_window(main_handle)
                time.sleep(1)

                # stop only after all pop-ups were checked and closed
                if form_found_in_popup:
                    return
        except:
            # best effort: errors while visiting a link are ignored and the
            # next round starts
            pass
def _find_and_fill_form(webdriver, user_data, visit_id, debug, browser_params,
                        manager_params, logger):
    """Finds and fills a form, and returns True if accomplished.

    Search order for the newsletter form: modal dialogs (dismissing those
    without a form), then the rest of the page, then each iframe/frame.
    When a form is found it is filled and submitted with user_data, and one
    follow-up form (in a pop-up window or on the current page) is filled as
    well. Pop-up windows are closed before returning.

    Args:
        webdriver: browser driver positioned on the page to scan.
        user_data: mapping passed to _form_fill_and_submit as the user
            info; its 'email' entry is read for logging.
        visit_id: stringified as the prefix of debug file names and
            included in the submission log records.
        debug: when truthy, progress is logged at debug level, page sources
            are dumped and screenshots saved.
        browser_params: forwarded to the dump/screenshot/submit helpers.
        manager_params: forwarded to the dump/screenshot/submit helpers.
        logger: receives debug/info/error records.

    Returns:
        False when no newsletter form is found anywhere; True after a form
        has been submitted.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; spots where more than one nesting is
    syntactically possible are marked.
    """
    current_url = webdriver.current_url
    # computed here but not read again in this function
    current_site_title = webdriver.title.encode('ascii', 'replace')
    main_handle = webdriver.current_window_handle
    in_iframe = False

    if debug:
        logger.debug('The current URL is %s' % current_url)

    # debug: save before/after screenshots and page source
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    newsletter_form = None

    # Search for a modal dialog, and for a form in the modal dialog
    # At most _MAX_POPUP_DISMISS rounds of dialog search are performed
    try:
        search_count = 0
        while (search_count < _MAX_POPUP_DISMISS):
            if debug:
                logger.debug('Round %d of modal dialog search...' % search_count)
            dialog_container = _get_dialog_container(webdriver)
            if dialog_container:
                if debug:
                    logger.debug(
                        'Modal dialog found, searching for newsletter form in dialog...'
                    )
                newsletter_form = _find_newsletter_form(
                    dialog_container, webdriver, debug, logger)
                if newsletter_form is None:
                    clicked = _dismiss_dialog(webdriver, dialog_container)
                    # NOTE(review): as reconstructed, the ESC fallback below
                    # only runs when debug is set; confirm whether the
                    # if/else was meant to sit outside this "if debug".
                    if debug:
                        if int(clicked) > 0:
                            if debug:
                                logger.debug(
                                    'No newsletter form in dialog, dismissed it'
                                )
                        else:
                            if debug:
                                logger.debug(
                                    'Made no clicks to dismiss the dialog')
                            webdriver.find_element_by_tag_name(
                                'html').send_keys(Keys.ESCAPE)
                            logger.debug(
                                'Pressed ESC to dismiss the dialog')
                else:
                    if debug:
                        logger.debug('Found a newsletter form in the dialog')
                    break
            else:
                if debug:
                    logger.debug('No dialog on the page')
                break
            search_count += 1
    except Exception as e:
        # dialog handling is best effort; the page search below still runs
        logger.error('Error while examining for modal dialogs: %s' % str(e))

    # try to find newsletter forms on landing page after dismissing the dialog
    if newsletter_form is None:
        if debug:
            logger.debug(
                'Searching the rest of the page for a newsletter form')
        newsletter_form = _find_newsletter_form(None, webdriver, debug, logger)

    # Search for newsletter forms in iframes
    if newsletter_form is None:
        if debug:
            logger.debug(
                'No newsletter form found on this page, searching for forms in iframes...'
            )

        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name(
            'iframe') + webdriver.find_elements_by_tag_name('frame')
        if debug:
            logger.debug('Searching in %d iframes' % len(iframes))

        for iframe in iframes:
            try:
                # switch to the iframe
                webdriver.switch_to_frame(iframe)

                # is there a form?
                newsletter_form = _find_newsletter_form(
                    None, webdriver, debug, logger)
                if newsletter_form is not None:
                    if debug:
                        dump_page_source(debug_page_source_initial, webdriver,
                                         browser_params, manager_params)
                        logger.debug(
                            'Found a newsletter in an iframe on this page')
                    in_iframe = True
                    break  # form found, stay on the iframe

                # switch back
                webdriver.switch_to_default_content()
            except Exception as e:
                if debug:
                    logger.error('Error while analyzing an iframe: %s' % str(e))
                # return to the top-level document before trying the next one
                webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            if debug:
                logger.debug('None of the iframes have newsletter forms')
            return False
    # NOTE(review): this elif is paired with the "is None" test that guards
    # the iframe search (form found without iframes); confirm.
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver,
                         browser_params, manager_params)

    # fill and submit the form that was found
    email = user_data['email']
    user_info = user_data
    _form_fill_and_submit(newsletter_form, user_info, webdriver, True,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    logger.info('Submitted form on [%s] with email [%s] on visit_id [%d]',
                current_url, email, visit_id)
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)
    if debug:
        save_screenshot(debug_form_post_initial, webdriver, browser_params,
                        manager_params)
        logger.debug('The current URL is %s' % webdriver.current_url)
        logger.debug('Filling any follow-up forms on this page...')

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        if debug:
            logger.debug('Found %d windows (e.g., popups)' % len(windows))
        form_found_in_popup = False  # assigned but never read in this function
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # find newsletter form
                # NOTE(review): the fill/submit is nested under the lookup so
                # that only a newly found form is submitted; confirm.
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(
                        None, webdriver, debug, logger)
                    if follow_up_form is not None:
                        if debug:
                            dump_page_source(debug_page_source_followup,
                                             webdriver, browser_params,
                                             manager_params)
                            logger.debug(
                                'Found a newsletter form in another window')
                        _form_fill_and_submit(
                            follow_up_form, user_info, webdriver, True,
                            browser_params, manager_params,
                            debug_form_pre_followup if debug else None)
                        logger.info(
                            'Submitted form on [%s] with email [%s] on visit_id [%d]',
                            webdriver.current_url, email, visit_id)
                        time.sleep(_FORM_SUBMIT_SLEEP)
                        _dismiss_alert(webdriver)
                        if debug:
                            save_screenshot(debug_form_post_followup,
                                            webdriver, browser_params,
                                            manager_params)

                # every non-main window is closed after it has been checked
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        if debug:
            logger.debug(
                'Found no follow-up forms in other windows, checking current page'
            )
        follow_up_form = _find_newsletter_form(None, webdriver, debug, logger)
        if follow_up_form is not None:
            if debug:
                dump_page_source(debug_page_source_followup, webdriver,
                                 browser_params, manager_params)
                logger.debug('Found a follow-up form in this page')
            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  debug_form_pre_followup if debug else None)
            logger.info(
                'Submitted form on [%s] with email [%s] on visit_id [%d]',
                webdriver.current_url, email, visit_id)
            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            if debug:
                save_screenshot(debug_form_post_followup, webdriver,
                                browser_params, manager_params)
        else:
            if debug:
                logger.debug('No follow-up forms on the current page')

    # switch back
    if in_iframe:
        if debug:
            logger.debug(
                'We were in an iframe, switching back to the main window')
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        if debug:
            logger.debug('Closing %d windows (e.g., popups)' % len(windows))
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True