def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
    """Create a new SelScraper thread instance.

    Args:
        config: The scraper configuration mapping.
        captcha_lock: To sync captcha solving (stdin).
        browser_num: A unique, semantic number for each thread.
    """
    self.search_input = None

    threading.Thread.__init__(self)
    SearchEngineScrape.__init__(self, config, *args, **kwargs)

    self.browser_num = browser_num
    self.captcha_lock = captcha_lock
    self.scrape_method = 'selenium'
    self.browser_type = self.config.get('sel_browser', 'chrome').lower()
    self.xvfb_display = self.config.get('xvfb_display', None)
    self.search_param_values = self._get_search_param_values()

    # Resolve the base search url for the configured search engine.
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, self.scrape_method)

    super().instance_creation_info(self.__class__.__name__)
def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):
    """Create a new SelScraper thread instance.

    Args:
        captcha_lock: To sync captcha solving (stdin).
        browser_num: A unique, semantic number for each thread.
    """
    self.search_input = None

    threading.Thread.__init__(self)
    SearchEngineScrape.__init__(self, *args, **kwargs)

    # Selenium-specific settings live in the global Config mapping.
    selenium_cfg = Config['SELENIUM']
    self.browser_num = browser_num
    self.captcha_lock = captcha_lock
    self.scrape_method = 'selenium'
    self.browser_type = selenium_cfg.get('sel_browser', 'chrome').lower()
    self.xvfb_display = selenium_cfg.get('xvfb_display', None)
    self.search_param_values = self._get_search_param_values()

    # Resolve the base search url for the configured search engine.
    self.base_search_url = get_base_search_url_by_search_engine(
        self.search_engine_name, self.scrape_method)

    super().instance_creation_info(self.__class__.__name__)
def __init__(self, config, *args, time_offset=0.0, **kwargs):
    """Initialize an HttpScrape object to scrape over blocking http.

    HttpScrape inherits from SearchEngineScrape and from threading.Timer.

    Args:
        config: The scraper configuration mapping.
        time_offset: Seconds the Timer waits before firing self.search.
    """
    threading.Timer.__init__(self, time_offset, self.search)
    SearchEngineScrape.__init__(self, config, *args, **kwargs)

    # Bind the requests module to this instance such that each
    # instance may have an own proxy. A plain import statement is
    # clearer than the previous __import__('requests') call and
    # binds the exact same module object.
    import requests
    self.requests = requests

    # initialize the GET parameters for the search request
    self.search_params = {}

    # initialize the HTTP headers of the search request
    # to some base values that mozilla uses with requests.
    # the Host and User-Agent field need to be set additionally.
    self.headers = headers

    # the mode
    self.scrape_method = 'http'

    # get the base search url based on the search engine.
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, self.scrape_method)

    super().instance_creation_info(self.__class__.__name__)

    if self.search_engine_name == 'blekko':
        logger.critical('blekko does not support http mode.')
        self.startable = False
def __init__(self, config, *args, time_offset=0.0, **kwargs):
    """Initialize an HttpScrape object to scrape over blocking http.

    HttpScrape inherits from SearchEngineScrape and from threading.Timer.

    Args:
        config: The scraper configuration mapping.
        time_offset: Seconds the Timer waits before firing self.search.
    """
    threading.Timer.__init__(self, time_offset, self.search)
    SearchEngineScrape.__init__(self, config, *args, **kwargs)

    # Bind the requests module to this instance such that each
    # instance may have an own proxy. A plain import statement is
    # clearer than the previous __import__('requests') call and
    # binds the exact same module object.
    import requests
    self.requests = requests

    # initialize the GET parameters for the search request
    self.search_params = {}

    # initialize the HTTP headers of the search request
    # to some base values that mozilla uses with requests.
    # the Host and User-Agent field need to be set additionally.
    self.headers = headers

    # the mode
    self.scrape_method = 'http'

    # get the base search url based on the search engine.
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, self.scrape_method)

    super().instance_creation_info(self.__class__.__name__)

    if self.search_engine_name == 'blekko':
        logger.critical('blekko does not support http mode.')
        self.startable = False
def __init__(self, config, query='', page_number=1, search_engine='google',
             scrape_method='http-async'):
    """Describe one search request for a single page of results."""
    self.config = config
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = scrape_method
    self.requested_at = None
    self.requested_by = 'localhost'
    self.status = 'successful'

    # Derived request machinery: parser, url, GET params and headers.
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(
        self.query, self.search_engine_name, search_type=self.search_type)
    self.headers = headers

    # 1-based index of the first result on this page, appended as a
    # 'first' GET parameter (presumably Bing-style paging — confirm).
    self.num_results_per_page = int(config['num_results_per_page'])
    self.startRecord = self.num_results_per_page * (self.page_number - 1) + 1
    self.stringStartRecord = f'&first={self.startRecord}'
def check_detection(config, search_engine_name):
    """Check whether the search engine specified by search_engine_name blocked us.

    Args:
        config: Configuration mapping; 'chromedriver_path' is read from it.
        search_engine_name: Engine to probe. Only 'google' is implemented.

    Returns:
        A (code, status) tuple where code is 'DETECTED' or 'UNDETECTED'
        and status is a human-readable trace of what happened.

    Raises:
        NotImplementedError: For any engine other than google.
    """
    status = ''
    chromedriver = config.get('chromedriver_path', '/usr/bin/chromedriver')

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1200x600')

    browser = webdriver.Chrome(chrome_options=options,
                               executable_path=chromedriver)

    # Fix: the original placed browser.quit() after unconditional
    # return/raise paths, so the Chrome process leaked on every call.
    # try/finally guarantees the browser is shut down.
    try:
        if search_engine_name != 'google':
            raise NotImplementedError(
                'Detection check only implemented for Google Right now.')

        url = get_base_search_url_by_search_engine(config, 'google', 'selenium')
        browser.get(url)

        def check(browser, status):
            # Inspect url and page source for the captcha needles.
            needles = SearchEngineScrape.malicious_request_needles['google']
            if needles['inurl'] in browser.current_url and needles[
                    'inhtml'] in browser.page_source:
                status += 'Google is asking for a captcha! '
                code = 'DETECTED'
            else:
                status += 'No captcha prompt detected. '
                code = 'UNDETECTED'
            return (code, status)

        try:
            search_input = WebDriverWait(browser, 5).until(
                EC.visibility_of_element_located((By.NAME, 'q')))
            status += 'Got a search input field. '
        except TimeoutException:
            status += 'No search input field located after 5 seconds. '
            return check(browser, status)

        try:
            # random query
            search_input.send_keys('President of Finland' + Keys.ENTER)
            status += 'Google Search successful! '
        except WebDriverException:
            status += 'Cannot make a google search! '
            return check(browser, status)

        return check(browser, status)
    finally:
        browser.quit()
def check_detection(config, search_engine_name):
    """Check whether the search engine specified by search_engine_name blocked us.

    Args:
        config: Configuration mapping; 'chromedriver_path' is read from it.
        search_engine_name: Engine to probe. Only 'google' is implemented.

    Returns:
        A (code, status) tuple where code is 'DETECTED' or 'UNDETECTED'
        and status is a human-readable trace of what happened.

    Raises:
        NotImplementedError: For any engine other than google.
    """
    status = ''
    chromedriver = config.get('chromedriver_path', '/usr/bin/chromedriver')

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1200x600')

    browser = webdriver.Chrome(chrome_options=options,
                               executable_path=chromedriver)

    # Fix: the original placed browser.quit() after unconditional
    # return/raise paths, so the Chrome process leaked on every call.
    # try/finally guarantees the browser is shut down.
    try:
        if search_engine_name != 'google':
            raise NotImplementedError(
                'Detection check only implemented for Google Right now.')

        url = get_base_search_url_by_search_engine(config, 'google', 'selenium')
        browser.get(url)

        def check(browser, status):
            # Inspect url and page source for the captcha needles.
            needles = SearchEngineScrape.malicious_request_needles['google']
            if needles['inurl'] in browser.current_url and needles['inhtml'] in browser.page_source:
                status += 'Google is asking for a captcha! '
                code = 'DETECTED'
            else:
                status += 'No captcha prompt detected. '
                code = 'UNDETECTED'
            return (code, status)

        try:
            search_input = WebDriverWait(browser, 5).until(
                EC.visibility_of_element_located((By.NAME, 'q')))
            status += 'Got a search input field. '
        except TimeoutException:
            status += 'No search input field located after 5 seconds. '
            return check(browser, status)

        try:
            # random query
            search_input.send_keys('President of Finland' + Keys.ENTER)
            status += 'Google Search successful! '
        except WebDriverException:
            status += 'Cannot make a google search! '
            return check(browser, status)

        return check(browser, status)
    finally:
        browser.quit()
def __init__(self, query='', page_number=1, search_engine='google', **kwargs):
    """Describe one async http search request for a single result page."""
    # Plain request attributes.
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = 'http-async'
    self.requested_at = None
    self.requested_by = ''

    # Derived request machinery: parser, url, GET params and headers.
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(
        self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(
        self.query, self.search_engine_name, search_type=self.search_type)
    self.headers = headers
def __init__(self, config, query='', page_number=1, search_engine='google',
             scrape_method='http-async'):
    """Describe one search request for a single page of results."""
    self.config = config
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = scrape_method
    self.requested_at = None
    self.requested_by = 'localhost'
    self.status = 'successful'

    # Derived request machinery: parser, url, GET params and headers.
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(
        self.query, self.search_engine_name, search_type=self.search_type)
    self.headers = headers
def __init__(self, query='', page_number=1, search_engine='google', **kwargs):
    """Describe one async http search request for a single result page.

    TODO: **kwargs appears unused — check whether any caller passes
    additional keyword args and remove it if not.
    """
    # Plain request attributes.
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = 'http-async'
    self.requested_at = None
    self.requested_by = 'localhost'
    self.status = 'successful'

    # Derived request machinery: parser, url, GET params and headers.
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(
        self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(
        self.query, self.search_engine_name, search_type=self.search_type)
    self.headers = headers
def __init__(self, config, query='', page_number=1, search_engine='google',
             scrape_method='http-async'):
    """Describe one search request for a single page of results."""
    # Plain request attributes.
    self.config = config
    self.query = query
    self.page_number = page_number
    self.search_engine_name = search_engine
    self.search_type = 'normal'
    self.scrape_method = scrape_method
    self.requested_at = None
    self.requested_by = 'localhost'
    self.status = 'successful'

    # Derived request machinery: parser, url, GET params and headers.
    self.parser = get_parser_by_search_engine(self.search_engine_name)
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, 'http')
    self.params = get_GET_params_for_search_engine(
        self.query, self.search_engine_name, search_type=self.search_type)
    self.headers = headers