def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):
        """Create a new SelScraper thread Instance.

        Args:
            captcha_lock: To sync captcha solving (stdin)
            proxy: Optional, if set, use the proxy to route all scrapign through it.
            browser_num: A unique, semantic number for each thread.
        """
        self.search_input = None

        threading.Thread.__init__(self)
        SearchEngineScrape.__init__(self, *args, **kwargs)

        self.browser_type = Config['SELENIUM'].get('sel_browser', 'chrome').lower()
        self.browser_num = browser_num
        self.captcha_lock = captcha_lock
        self.scrape_method = 'selenium'

        self.xvfb_display = Config['SELENIUM'].get('xvfb_display', None)

        self.search_param_values = self._get_search_param_values()

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, self.scrape_method)
        super().instance_creation_info(self.__class__.__name__)
示例#2
0
    def __init__(self, config, *args, time_offset=0.0, **kwargs):
        """Initialize an HttScrape object to scrape over blocking http.

        HttpScrape inherits from SearchEngineScrape
        and from threading.Timer.
        """
        threading.Timer.__init__(self, time_offset, self.search)
        SearchEngineScrape.__init__(self, config, *args, **kwargs)

        # Bind the requests module to this instance such that each
        # instance may have an own proxy
        self.requests = __import__('requests')

        # initialize the GET parameters for the search request
        self.search_params = {}

        # initialize the HTTP headers of the search request
        # to some base values that mozilla uses with requests.
        # the Host and User-Agent field need to be set additionally.
        self.headers = headers

        # the mode
        self.scrape_method = 'http'

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method)

        super().instance_creation_info(self.__class__.__name__)

        if self.search_engine_name == 'blekko':
            logger.critical('blekko does not support http mode.')
            self.startable = False
示例#3
0
    def __init__(self, config, *args, time_offset=0.0, **kwargs):
        """Initialize an HttScrape object to scrape over blocking http.

        HttpScrape inherits from SearchEngineScrape
        and from threading.Timer.
        """
        threading.Timer.__init__(self, time_offset, self.search)
        SearchEngineScrape.__init__(self, config, *args, **kwargs)

        # Bind the requests module to this instance such that each
        # instance may have an own proxy
        self.requests = __import__('requests')

        # initialize the GET parameters for the search request
        self.search_params = {}

        # initialize the HTTP headers of the search request
        # to some base values that mozilla uses with requests.
        # the Host and User-Agent field need to be set additionally.
        self.headers = headers

        # the mode
        self.scrape_method = 'http'

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(
            self.config, self.search_engine_name, self.scrape_method)

        super().instance_creation_info(self.__class__.__name__)

        if self.search_engine_name == 'blekko':
            logger.critical('blekko doesnt support http mode.')
            self.startable = False
    def run(self):
        super().before_search()

        if self.startable:
            args = []
            kwargs = {}
            kwargs['rand'] = False
            SearchEngineScrape.blocking_search(self, self.search, *args,
                                               **kwargs)