Example #1
        def request():
            url = self.base_search_url + urlencode(self.params)

            response = yield from aiohttp.request('GET', url, headers=self.headers)

            if response.status != 200:
                self.status = 'not successful: {}'.format(response.status)

            self.requested_at = datetime.datetime.utcnow()

            out('[+] {} requested keyword \'{}\' on {}. Response status: {}'.format(
                self.requested_by,
                self.query,
                self.search_engine_name,
                response.status
                ), lvl=2)

            out('[i] URL: {} HEADERS: {}'.format(
                url,
                self.headers
                ), lvl=3)

            if response.status == 200:
                body = yield from response.read_and_close(decode=False)
                self.parser = self.parser(body)
                return self

            return None
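The yield from coroutine style and response.read_and_close() above come from an early aiohttp API. For comparison, here is a hedged sketch of roughly the same request against the current aiohttp client API; the free-standing function name and its parameters are illustrative, not part of the project:

# Rough modern-aiohttp equivalent of the coroutine above (aiohttp >= 3.x).
import aiohttp
from urllib.parse import urlencode

async def fetch_serp(base_search_url, params, headers):
    url = base_search_url + urlencode(params)
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            if response.status != 200:
                # the caller decides how to handle a non-200 answer
                return None
            return await response.read()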
Example #2
    def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):
        """Create a new SelScraper Thread.

        Args:
            captcha_lock: To sync captcha solving (stdin)
            proxy: Optional, if set, use the proxy to route all scraping through it.
            browser_num: A unique, semantic number for each thread.
        """
        self.search_input = None

        threading.Thread.__init__(self)
        SearchEngineScrape.__init__(self, *args, **kwargs)

        self.browser_type = Config['SELENIUM'].get('sel_browser', 'chrome').lower()
        self.browser_num = browser_num
        self.captcha_lock = captcha_lock
        self.ip = '127.0.0.1'
        self.search_number = 0
        self.scrapemethod = 'sel'

        # How long to sleep (in seconds) after every n-th request
        self.sleeping_ranges = dict()
        for line in Config['SELENIUM'].get('sleeping_ranges').split('\n'):
            assert line.count(';') == 1
            key, value = line.split(';')
            self.sleeping_ranges[int(key)] = tuple([int(offset.strip()) for offset in value.split(',')])

        out('[+] SelScraper[{}] created using the search engine {}. Number of keywords to scrape={}, using proxy={}, number of pages={}, browser_num={}'.format(self.name, self.search_engine, len(self.keywords), self.proxy, self.num_pages_per_keyword, self.browser_num), lvl=2)
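The loop above expects every sleeping_ranges line to follow a "threshold; min, max" layout. A minimal standalone sketch of that parsing with an assumed config snippet (the concrete numbers are made up):

# Hypothetical config value; each line means: after every n-th request,
# sleep a random amount of seconds drawn from the given (min, max) range.
raw = """5; 1, 2
25; 30, 60
100; 300, 600"""

sleeping_ranges = {}
for line in raw.split('\n'):
    assert line.count(';') == 1
    key, value = line.split(';')
    sleeping_ranges[int(key)] = tuple(int(offset.strip()) for offset in value.split(','))

print(sleeping_ranges)  # {5: (1, 2), 25: (30, 60), 100: (300, 600)}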
Example #3
def fix_broken_cache_names(url, search_engine, scrapemode, page_number):
    """Fix broken cache names.

    Args:
        url: A list of strings to add to each cached_file_name() call.

    @todo: `url` is not used here -> check if scrape_method is passed to this function and remove it
    """
    files = _get_all_cache_files()
    logger.debug("{} cache files found in {}".format(len(files), Config["GLOBAL"].get("cachedir", ".scrapecache")))
    r = re.compile(r"<title>(?P<kw>.*?) - Google Search</title>")

    i = 0
    for path in files:
        fname = os.path.split(path)[1].strip()
        data = read_cached_file(path)
        infilekws = r.search(data).group("kw")
        realname = cached_file_name(infilekws, search_engine, scrapemode, page_number)
        if fname != realname:
            out(
                "The search query in the title element in file {} differ from that hash of its name. Fixing...".format(
                    path
                ),
                lvl=3,
            )
            src = os.path.abspath(path)
            dst = os.path.abspath(os.path.join(os.path.split(path)[0], realname))
            logger.debug("Renamed from {} => {}".format(src, dst))
            os.rename(src, dst)
        i += 1

    logger.debug("Renamed {} files.".format(i))
Example #4
    def _find_next_page_element(self):
        """Finds the element that locates the next page for any search engine.

        Returns:
            The element that needs to be clicked to get to the next page.
        """
        if self.search_type == 'normal':
            selector = self.next_page_selectors[self.search_engine_name]
            try:
                # wait until the next page link emerges
                WebDriverWait(self.webdriver, 5).until(
                    EC.visibility_of_element_located(
                        (By.CSS_SELECTOR, selector)))
                return self.webdriver.find_element_by_css_selector(selector)
            except TimeoutException as te:
                out('{}: Cannot locate next page element: {}'.format(
                    self.name, te),
                    lvl=4)
                return False
            except WebDriverException as e:
                out('{}: Cannot locate next page element: {}'.format(
                    self.name, e),
                    lvl=4)
                return False

        elif self.search_type == 'image':
            self.page_down()
            return True
Example #5
 def instance_creation_info(self, scraper_name):
     """Debug message whenever a scraping worker is created"""
     out('[+] {}[{}][search-type:{}][{}] using search engine "{}". Num keywords={}, num pages for keyword={}'
         .format(scraper_name, self.requested_by, self.search_type,
                 self.base_search_url, self.search_engine_name,
                 len(self.jobs), self.pages_per_keyword),
         lvl=1)
Example #6
def fix_broken_cache_names(url, search_engine, scrapemode):
    """Fix broken cache names.

    Args:
        url: A list of strings to add to each cached_file_name() call.
    """
    files = _get_all_cache_files()
    logger.debug('{} cache files found in {}'.format(
        len(files), Config['GLOBAL'].get('cachedir', '.scrapecache')))
    r = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    i = 0
    for i, path in enumerate(files, start=1):
        fname = os.path.split(path)[1].strip()
        data = read_cached_file(path)
        infilekws = r.search(data).group('kw')
        realname = cached_file_name(infilekws, search_engine, scrapemode)
        if fname != realname:
            out('The search query in the title element in file {} differs from the hash of its name. Fixing...'
                .format(path),
                lvl=3)
            src = os.path.abspath(path)
            dst = os.path.abspath(
                os.path.join(os.path.split(path)[0], realname))
            logger.debug('Renamed from {} => {}'.format(src, dst))
            os.rename(src, dst)
    logger.debug('Renamed {} files.'.format(i))
Example #7
 def instance_creation_info(self, scraper_name):
     """Debug message whenever a scraping worker is created"""
     out('[+] {}[{}][search-type:{}] created using the search engine {}. Number of keywords to scrape={}, using proxy={}, number of pages per keyword={}'
         .format(scraper_name, self.ip, self.search_type,
                 self.search_engine, len(self.keywords), self.proxy,
                 self.num_pages_per_keyword),
         lvl=2)
Example #8
    def run(self):
        """Run the SelScraper."""

        display = Display(visible=0, size=(800, 600))
        display.start()
        #self._set_xvfb_display()

        if not self._get_webdriver():
            raise_or_log('{}: Aborting due to no available selenium webdriver.'.format(self.name),
                         exception_obj=SeleniumMisconfigurationError)

        try:
            self.webdriver.set_window_size(400, 400)
            self.webdriver.set_window_position(400 * (self.browser_num % 4), 400 * (math.floor(self.browser_num // 4)))
        except WebDriverException as e:
            out('Cannot set window size: {}'.format(e), lvl=4)

        super().before_search()

        if self.startable:
            self.build_search()
            self.search()

        if self.webdriver:
            self.webdriver.close()
Example #9
    def search(self):
        """Search with webdriver.

        Fills out the search form of the search engine for each keyword.
        Clicks the next link while num_pages_per_keyword is not reached.
        """
        n = 0

        for self.current_keyword in self.keywords:

            super().next_keyword_info(n)

            self.search_input = self._wait_until_search_input_field_appears()

            if self.search_input is False:
                self.search_input = self.handle_request_denied()

            if self.search_input:
                self.search_input.clear()
                time.sleep(.25)
                self.search_input.send_keys(self.current_keyword + Keys.ENTER)
            else:
                raise GoogleSearchError('Cannot get handle to the input form!')

            for self.current_page in range(1, self.num_pages_per_keyword + 1):
                # Waiting until the keyword appears in the title may
                # not be enough. The content may still be from the old page.
                try:
                    WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.current_keyword))
                except TimeoutException as e:
                    logger.error(SeleniumSearchError('Keyword "{}" not found in title: {}'.format(self.current_keyword, self.webdriver.title)))
                    break

                # match the largest sleep range
                sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))
                time.sleep(sleep_time)

                html = self.webdriver.page_source

                self.parser.parse(html)
                self.store()
                out(str(self.parser), lvl=2)

                # Lock in case two threads write to the same file (not probable)
                with self.cache_lock:
                    cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

                self.search_number += 1

                # Click the next page link, except when leaving the loop
                if self.current_page < self.num_pages_per_keyword:
                    self.next_url = self._goto_next_page()

                    if not self.next_url:
                        break

            n += 1
Example #10
    def search(self):
        """Search with webdriver.

        Fills out the search form of the search engine for each keyword.
        Clicks the next link while pages_per_keyword is not reached.
        """
        for self.query, self.pages_per_keyword in self.jobs.items():

            self.search_input = self._wait_until_search_input_field_appears()

            if self.search_input is False and Config['PROXY_POLICY'].getboolean('stop_on_detection'):
                self.status = 'Malicious request detected'
                super().after_search()
                return

            if self.search_input is False:
                # @todo: pass status_code
                self.search_input = self.handle_request_denied()

            if self.search_input:
                try:
                    self.fill_search_input()
                except (StaleElementReferenceException, InvalidElementStateException, ElementNotVisibleException) as e1:
                    while True:
                        try:
                            self.build_search()
                            self.search_input = self._wait_until_search_input_field_appears()
                            self.fill_search_input()
                            break
                        except (StaleElementReferenceException, InvalidElementStateException, ElementNotVisibleException) as e2:
                            pass
            else:
                out('{}: Cannot get handle to the input form for keyword {}.'.format(self.name, self.query), lvl=4)
                continue

            super().detection_prevention_sleep()
            super().keyword_info()

            for self.page_number in self.pages_per_keyword:

                self.wait_until_serp_loaded()

                try:
                    self.html = self.webdriver.execute_script('return document.body.innerHTML;')
                except WebDriverException as e:
                    self.html = self.webdriver.page_source

                super().after_search()

                # Click the next page link, except when we are leaving the
                # loop in the next iteration.
                if self.page_number != self.pages_per_keyword[-1]:
                    next_url = self._goto_next_page()
                    self.requested_at = datetime.datetime.utcnow()

                    if not next_url:
                        break
Example #11
    def handle_request_denied(self):
        """Checks whether Google detected a potentially harmful request.

        Whenever such potential abuse is detected, Google shows an captcha.
        This method just blocks as long as someone entered the captcha in the browser window.
        When the window is not visible (For example when using PhantomJS), this method
        makes a png from the html code and shows it to the user, which should enter it in a command
        line.

        Returns:
            The search input field.

        Raises:
            MaliciousRequestDetected when there was not way to stp Google From denying our requests.
        """
        malicious_request_needles = {
            'google': {
                'inurl': '/sorry/',
                'inhtml': 'detected unusual traffic'
            }
        }

        needles = malicious_request_needles[self.search_engine]

        if needles['inurl'] in self.webdriver.current_url and needles[
                'inhtml'] in self.webdriver.page_source:

            if Config['SELENIUM'].getboolean('manual_captcha_solving', False):
                with self.captcha_lock:
                    import tempfile
                    tf = tempfile.NamedTemporaryFile('wb')
                    tf.write(self.webdriver.get_screenshot_as_png())
                    import webbrowser
                    webbrowser.open('file://{}'.format(tf.name))
                    solution = input('enter the captcha please...')
                    self.webdriver.find_element_by_name('submit').send_keys(
                        solution + Keys.ENTER)
                    try:
                        self.search_input = WebDriverWait(
                            self.webdriver,
                            5).until(self._get_search_input_field())
                    except TimeoutException as e:
                        raise MaliciousRequestDetected(
                            'Requesting with this ip is not possible at the moment.'
                        )
                    tf.close()

            else:
                # Just wait until the user solves the captcha in the browser window
                # 10 hours if needed :D
                out('Waiting for user to solve captcha', lvl=1)
                return self._wait_until_search_input_field_appears(10 * 60 *
                                                                   60)

        elif 'is not an HTTP Proxy' in self.webdriver.page_source:
            raise GoogleSearchError(
                'Invalid TOR usage. Specify the proxy protocol as socks5')
Example #12
 def wait_until_title_contains_keyword(self):
     try:
         WebDriverWait(self.webdriver,
                       5).until(EC.title_contains(self.query))
     except TimeoutException as e:
         out(SeleniumSearchError(
             '{}: Keyword "{}" not found in title: {}'.format(
                 self.name, self.query, self.webdriver.title)),
             lvl=4)
Example #13
 def wait_until_title_contains_keyword(self):
     try:
         WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.query))
     except TimeoutException:
         out(
             SeleniumSearchError(
                 '{}: Keyword "{}" not found in title: {}'.format(self.name, self.query, self.webdriver.title)
             ),
             lvl=4,
         )
Example #14
def parse_serp(html=None, search_engine=None,
                    scrapemethod=None, current_page=None, requested_at=None,
                    requested_by='127.0.0.1', current_keyword=None, parser=None, serp=None):
        """Store the parsed data in the sqlalchemy session.

        Args:
            TODO: A whole lot

        Returns:
            The parsed SERP object.
        """

        if not parser:
            parser = get_parser_by_search_engine(search_engine)
            parser = parser()
            parser.parse(html)

        out(parser, lvl=2)
        num_results = 0

        if not serp:
            serp = SearchEngineResultsPage(
                search_engine_name=search_engine,
                scrapemethod=scrapemethod,
                page_number=current_page,
                requested_at=requested_at,
                requested_by=requested_by,
                query=current_keyword,
                num_results_for_keyword=parser.search_results['num_results'],
            )

        for key, value in parser.search_results.items():
            if isinstance(value, list):
                rank = 1
                for link in value:
                    parsed = urlparse(link['link'])

                    # fill missing fields with None to prevent KeyErrors
                    for field in ('snippet', 'title', 'visible_link'):
                        link.setdefault(field, None)

                    l = Link(
                        link=link['link'],
                        snippet=link['snippet'],
                        title=link['title'],
                        visible_link=link['visible_link'],
                        domain=parsed.netloc,
                        rank=rank,
                        serp=serp
                    )
                    num_results += 1
                    rank += 1

        serp.num_results = num_results

        return serp
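parse_serp() only relies on two properties of parser.search_results: a 'num_results' entry and list-valued entries holding link dicts. A sketch of that assumed shape, with field names inferred from the loop above rather than from a parser specification:

# Assumed shape of parser.search_results as consumed by parse_serp():
# scalar entries are ignored, list entries are treated as ranked links.
search_results = {
    'num_results': 'About 1,230,000 results',
    'results': [
        {
            'link': 'https://example.org/page',
            'title': 'Example page',
            'snippet': 'A short teaser text...',
            'visible_link': 'example.org/page',
        },
        # entries may omit 'snippet', 'title' or 'visible_link';
        # parse_serp() fills those with None before building Link objects
    ],
}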
Example #15
def parse_serp(html=None, search_engine=None,
                    scrapemethod=None, current_page=None, requested_at=None,
                    requested_by='127.0.0.1', current_keyword=None, parser=None, serp=None):
        """Store the parsed data in the sqlalchemy session.

        Args:
            TODO: A whole lot

        Returns:
            The parsed SERP object.
        """

        if not parser:
            parser = get_parser_by_search_engine(search_engine)
            parser = parser()
            parser.parse(html)

        out(parser, lvl=2)
        num_results = 0

        if not serp:
            serp = SearchEngineResultsPage(
                search_engine_name=search_engine,
                scrapemethod=scrapemethod,
                page_number=current_page,
                requested_at=requested_at,
                requested_by=requested_by,
                query=current_keyword,
                num_results_for_keyword=parser.search_results['num_results'],
            )

        for key, value in parser.search_results.items():
            if isinstance(value, list):
                rank = 1
                for link in value:
                    parsed = urlparse(link['link'])

                    # fill missing fields with None to prevent KeyErrors
                    for field in ('snippet', 'title', 'visible_link'):
                        link.setdefault(field, None)

                    l = Link(
                        link=link['link'],
                        snippet=link['snippet'],
                        title=link['title'],
                        visible_link=link['visible_link'],
                        domain=parsed.netloc,
                        rank=rank,
                        serp=serp
                    )
                    num_results += 1
                    rank += 1

        serp.num_results = num_results

        return (serp, parser)
Example #16
 def next_keyword_info(self, pos_current_keyword):
     """Print a short summary where we are in the scrape and what's the next keyword."""
     out('[{thread_name}][{search_engine}] with ip {ip} next keyword: "{keyword}" with {num_pages} pages. {done}/{all} already scraped.'.format(
         thread_name=self.name,
         search_engine=self.search_engine,
         ip=self.ip,
         keyword=self.current_keyword,
         num_pages=self.num_pages_per_keyword,
         done=pos_current_keyword,
         all=self.num_keywords
     ), lvl=1)
Example #17
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Shutting down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

        output_format = Config['GLOBAL'].get('output_format', 'stdout')
        output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

        if output_format == 'stdout':
            out(self.parser, lvl=2)
        elif output_format == 'json':
            if not hasattr(self, 'json_outfile'):
                self.json_outfile = open(output_file + '.json', 'a')

            obj = self._get_serp_obj()
            obj['requested_at'] = obj['requested_at'].isoformat()
            json.dump(obj, self.json_outfile, indent=2, sort_keys=True)

        elif output_format == 'csv':
            if not hasattr(self, 'csv_outfile'):
                self.csv_outfile = csv.DictWriter(open(output_file + '.csv', 'a'),
                        fieldnames=('link', 'title', 'snippet', 'visible_link', 'num_results',
                                    'query', 'search_engine_name', 'requested_by',
                                    'scrapemethod', 'page_number', 'requested_at'))
                self.csv_outfile.writeheader()

            rows = []
            for result_type, value in self.parser.search_results.items():
                if isinstance(value, list):
                    for link in value:
                        rows.append(link)

            obj = self._get_serp_obj()
            obj['num_results'] = self.parser.search_results['num_results']
            for row in rows:
                row.update(obj)
                self.csv_outfile.writerow(row)
Example #18
 def keyword_info(self):
     """Print a short summary where we are in the scrape and what's the next keyword."""
     out('[{thread_name}][{ip}][{search_engine}] Keyword: "{keyword}" with {num_pages} pages, slept {delay} seconds before scraping. {done}/{all} already scraped.'.format(
         thread_name=self.name,
         search_engine=self.search_engine,
         ip=self.ip,
         keyword=self.current_keyword,
         num_pages=self.num_pages_per_keyword,
         delay=self.current_delay,
         done=self.search_number,
         all=self.num_keywords
     ), lvl=2)
Example #19
    def handle_request_denied(self):
        """Checks whether Google detected a potentially harmful request.

        Whenever such potential abuse is detected, Google shows an captcha.
        This method just blocks as long as someone entered the captcha in the browser window.
        When the window is not visible (For example when using PhantomJS), this method
        makes a png from the html code and shows it to the user, which should enter it in a command
        line.

        Returns:
            The search input field.

        Raises:
            MaliciousRequestDetected when there was not way to stp Google From denying our requests.
        """
        malicious_request_needles = {
            'google': {
                'inurl': '/sorry/',
                'inhtml': 'detected unusual traffic'
            },
            'bing': {},
            'yahoo': {},
            'baidu': {},
            'yandex': {},
        }

        needles = malicious_request_needles[self.search_engine]

        if needles and needles['inurl'] in self.webdriver.current_url and needles['inhtml'] in self.webdriver.page_source:

            if Config['SELENIUM'].getboolean('manual_captcha_solving', False):
                with self.captcha_lock:
                    import tempfile
                    tf = tempfile.NamedTemporaryFile('wb')
                    tf.write(self.webdriver.get_screenshot_as_png())
                    import webbrowser
                    webbrowser.open('file://{}'.format(tf.name))
                    solution = input('enter the captcha please...')
                    self.webdriver.find_element_by_name('submit').send_keys(solution + Keys.ENTER)
                    try:
                        self.search_input = WebDriverWait(self.webdriver, 5).until(self._get_search_input_field())
                    except TimeoutException as e:
                        raise MaliciousRequestDetected('Requesting with this ip is not possible at the moment.')
                    tf.close()

            else:
                # Just wait until the user solves the captcha in the browser window
                # 10 hours if needed :D
                out('Waiting for user to solve captcha', lvl=1)
                return self._wait_until_search_input_field_appears(10*60*60)

        elif 'is not an HTTP Proxy' in self.webdriver.page_source:
            raise GoogleSearchError('Invalid TOR usage. Specify the proxy protocol as socks5')
Example #20
    def search(self):
        """Search with webdriver.

        Fills out the search form of the search engine for each keyword.
        Clicks the next link while pages_per_keyword is not reached.
        """
        for self.query, self.pages_per_keyword in self.jobs.items():

            self.search_input = self._wait_until_search_input_field_appears()

            if self.search_input is False:
                self.search_input = self.handle_request_denied()

            if self.search_input:
                self.search_input.clear()
                time.sleep(.25)

                try:
                    self.search_input.send_keys(self.query + Keys.ENTER)
                except ElementNotVisibleException as e:
                    time.sleep(2)
                    self.search_input.send_keys(self.query + Keys.ENTER)

                self.requested_at = datetime.datetime.utcnow()
            else:
                out('{}: Cannot get handle to the input form for keyword {}.'.
                    format(self.name, self.query),
                    lvl=4)
                continue

            super().detection_prevention_sleep()
            super().keyword_info()

            for self.page_number in self.pages_per_keyword:

                self.wait_until_serp_loaded()

                try:
                    self.html = self.webdriver.execute_script(
                        'return document.body.innerHTML;')
                except WebDriverException as e:
                    self.html = self.webdriver.page_source

                super().after_search()

                # Click the next page link, except when we are leaving the
                # loop in the next iteration.
                if self.page_number != self.pages_per_keyword[-1]:
                    self.next_url = self._goto_next_page()
                    self.requested_at = datetime.datetime.utcnow()

                    if not self.next_url:
                        break
Example #21
 def keyword_info(self):
     """Print a short summary where we are in the scrape and what's the next keyword."""
     out('[{thread_name}][{ip}][{search_engine}] Keyword: "{keyword}" with {num_pages} pages, slept {delay} seconds before scraping. {done}/{all} already scraped.'
         .format(thread_name=self.name,
                 search_engine=self.search_engine,
                 ip=self.ip,
                 keyword=self.current_keyword,
                 num_pages=self.num_pages_per_keyword,
                 delay=self.current_delay,
                 done=self.search_number,
                 all=self.num_keywords),
         lvl=2)
Example #22
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine.

        When raising StopScrapingException, the scraper will stop.

        When return False, the scraper tries to continue with next keyword.
        """

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(user_agents)

        try:
            super().detection_prevention_sleep()
            super().keyword_info()

            request = self.requests.get(self.base_search_url +
                                        urlencode(self.search_params),
                                        headers=self.headers,
                                        timeout=5)

            self.requested_at = datetime.datetime.utcnow()
            self.html = request.text

            out('[HTTP - {url}] headers={headers}, params={params}'.format(
                url=request.url,
                headers=self.headers,
                params=self.search_params),
                lvl=3)

        except self.requests.ConnectionError as ce:
            reason = 'Network problem occurred {}'.format(ce)
            raise StopScrapingException(
                'Stopping scraping because {}'.format(reason))
        except self.requests.Timeout as te:
            reason = 'Connection timeout {}'.format(te)
            raise StopScrapingException(
                'Stopping scraping because {}'.format(reason))
        except self.requests.exceptions.RequestException as e:
            # In case of any http networking exception that wasn't caught
            # in the actual request, just end the worker.
            raise StopScrapingException(
                'Stopping scraping because {}'.format(e))

        if not request.ok:
            self.handle_request_denied(request.status_code)
            return False

        super().after_search()

        return True
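The docstring above describes a contract: raising StopScrapingException aborts the whole worker, while returning False only skips the current keyword. A hedged sketch of a driving loop that honours that contract; the loop and its names are illustrative and not the project's actual scheduler:

# Illustrative caller only; it shows how the two failure modes documented
# above would typically be handled.
def run_keywords(scraper, keywords):
    for keyword in keywords:
        scraper.query = keyword
        try:
            ok = scraper.search(rand=True)
        except StopScrapingException as err:
            # fatal: the network is gone or the engine blocked us for good
            print('aborting worker: {}'.format(err))
            break
        if not ok:
            # non-fatal: the request was denied, continue with the next keyword
            continue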
Example #23
    def search(self):
        """Search with webdriver.

        Called within the blocking_search search loop.

        """
        for self.current_keyword in self.keywords:

            for self.current_page in range(1, self.num_pages_per_keyword + 1):
                # match the largest sleep range
                sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))

                time.sleep(sleep_time)

                # Waiting until the keyword appears in the title may
                # not be enough. The content may still be from the old page.
                try:
                    WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.current_keyword))
                except TimeoutException as e:
                    logger.error(SeleniumSearchError('Keyword "{}" not found in title: {}'.format(self.current_keyword, self.webdriver.title)))


                html = self.webdriver.page_source

                self.parser.parse(html)
                self.store()
                out(str(self.parser), lvl=2)

                # Lock in case two threads write to the same file (not probable)
                with self.cache_lock:
                    cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

                self.search_number += 1

                if self.current_page > 1:
                    self.next_url = self._goto_next_page()

            try:
                self.search_input = WebDriverWait(self.webdriver, 5).until(
                    EC.presence_of_element_located(self._get_search_input_field()))
            except TimeoutException as e:
                logger.error(e)
                if not self.handle_request_denied():
                    with open('/tmp/out.png', 'wb') as screenshot_file:
                        screenshot_file.write(self.webdriver.get_screenshot_as_png())
                    raise GoogleSearchError('search input field cannot be found.')

            if self.search_input:
                self.search_input.clear()
                time.sleep(.25)
                self.search_input.send_keys(self.current_keyword + Keys.ENTER)
Example #24
    def handle_request_denied(self, status_code):
        """Checks whether Google detected a potentially harmful request.

        Whenever such potential abuse is detected, Google shows an captcha.
        This method just blocks as long as someone entered the captcha in the browser window.
        When the window is not visible (For example when using PhantomJS), this method
        makes a png from the html code and shows it to the user, which should enter it in a command
        line.

        Returns:
            The search input field.

        Raises:
            MaliciousRequestDetected when there was not way to stp Google From denying our requests.
        """
        # selenium webdriver objects have no status code :/
        super().handle_request_denied("400")

        needles = self.malicious_request_needles[self.search_engine_name]

        if (
            needles
            and needles["inurl"] in self.webdriver.current_url
            and needles["inhtml"] in self.webdriver.page_source
        ):

            if Config["SELENIUM"].getboolean("manual_captcha_solving", False):
                with self.captcha_lock:
                    import tempfile

                    tf = tempfile.NamedTemporaryFile("wb")
                    tf.write(self.webdriver.get_screenshot_as_png())
                    import webbrowser

                    webbrowser.open("file://{}".format(tf.name))
                    solution = input("enter the captcha please...")
                    self.webdriver.find_element_by_name("submit").send_keys(solution + Keys.ENTER)
                    try:
                        self.search_input = WebDriverWait(self.webdriver, 5).until(
                            EC.visibility_of_element_located(self._get_search_input_field())
                        )
                    except TimeoutException:
                        raise MaliciousRequestDetected("Requesting with this ip is not possible at the moment.")
                    tf.close()

            else:
                # Just wait until the user solves the captcha in the browser window
                # 10 hours if needed :D
                out("Waiting for user to solve captcha", lvl=1)
                return self._wait_until_search_input_field_appears(10 * 60 * 60)
Example #25
    def search(self, rand=False, timeout=15):
        """The actual search for the search engine.

        When raising StopScrapingException, the scraper will stop.

        When return False, the scraper tries to continue with next keyword.
        """

        success = True

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(user_agents)

        try:
            super().detection_prevention_sleep()
            super().keyword_info()

            request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                        headers=self.headers, timeout=timeout)

            self.requested_at = datetime.datetime.utcnow()
            self.html = request.text

            out('[HTTP - {url}] headers={headers}, params={params}'.format(
                url=request.url,
                headers=self.headers,
                params=self.search_params),
                lvl=3)

        except self.requests.ConnectionError as ce:
            self.status = 'Network problem occurred {}'.format(ce)
            success = False
        except self.requests.Timeout as te:
            self.status = 'Connection timeout {}'.format(te)
            success = False
        except self.requests.exceptions.RequestException as e:
            # In case of any http networking exception that wasn't caught
            # in the actual request, just end the worker.
            self.status = 'Stopping scraping because {}'.format(e)
        else:
            if not request.ok:
                self.handle_request_denied(request.status_code)
                success = False

        super().after_search()

        return success
Example #26
    def after_search(self):
        """Store the results and parse em.

        Notify the progress queue if necessary.
        """
        self.search_number += 1

        if not self.store():
            out(
                'No results to store for keyword: "{}" in search engine: {}'.format(self.query,
                                                                                    self.search_engine_name), lvl=4)

        if self.progress_queue:
            self.progress_queue.put(1)
        self.cache_results()
Example #27
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Shutting down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

        output_format = Config['GLOBAL'].get('output_format', 'stdout')
        output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

        def results():
            rows = []
            for result_type, value in self.parser.search_results.items():
                if isinstance(value, list):
                    for link in value:
                        rows.append(link)
            return rows

        if output_format == 'stdout':
            out(self.parser, lvl=2)
        elif output_format == 'json':
            obj = self._get_serp_obj()
            obj['results'] = results()
            json.dump(obj, self.json_outfile, indent=2, sort_keys=True)
            self.json_outfile.write(',')

        elif output_format == 'csv':
            obj = self._get_serp_obj()
            for row in results():
                row.update(obj)
                self.csv_outfile.writerow(row)
Example #28
def get_serp_from_database(session, query, search_engine, scrapemethod):
    try:
        serp = session.query(SearchEngineResultsPage).filter(
                SearchEngineResultsPage.query == query,
                SearchEngineResultsPage.search_engine_name == search_engine,
                SearchEngineResultsPage.scrapemethod == scrapemethod).first()
        out(serp.links, lvl=2)
        return serp
    except NoResultFound as e:
        # that shouldn't happen
        # we have a cache file that matches the above identifying information
        # but it was never stored to the database.
        return False
    except MultipleResultsFound as e:
        raise e

    return False
Example #29
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Shutting down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

        output_format = Config['GLOBAL'].get('output_format', 'stdout')
        output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

        def results():
            rows = []
            for result_type, value in self.parser.search_results.items():
                if isinstance(value, list):
                    for link in value:
                        rows.append(link)
            return rows

        if output_format == 'stdout':
            out(self.parser, lvl=2)
        elif output_format == 'json':
            obj = self._get_serp_obj()
            obj['results'] = results()
            json.dump(obj, self.json_outfile, indent=2, sort_keys=True)
            self.json_outfile.write(',')

        elif output_format == 'csv':
            obj = self._get_serp_obj()
            for row in results():
                row.update(obj)
                self.csv_outfile.writerow(row)
Example #30
def get_serp_from_database(session, query, search_engine, scrapemethod):
    try:
        serp = session.query(SearchEngineResultsPage).filter(
            SearchEngineResultsPage.query == query,
            SearchEngineResultsPage.search_engine_name == search_engine,
            SearchEngineResultsPage.scrapemethod == scrapemethod).first()
        out(serp.links, lvl=2)
        return serp
    except NoResultFound as e:
        # that shouldn't happen
        # we have a cache file that matches the above identifying information
        # but it was never stored to the database.
        return False
    except MultipleResultsFound as e:
        raise e

    return False
Example #31
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        try:
            out('[HTTP - {proxy}] Base_url: {base_url}, headers={headers}, params={params}'.format(
                proxy=self.proxy,
                base_url=self.base_search_url,
                headers=self.headers,
                params=self.search_params),
            lvl=3)

            super().next_keyword_info(self.n)

            request = self.requests.get(self.base_search_url, headers=self.headers,
                             params=self.search_params, timeout=3.0)

        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not request.ok:
            logger.error('HTTP Error: {}'.format(request.status_code))
            self.handle_request_denied(request.status_code)
            return False

        html = request.text

        # cache fresh results
        with self.cache_lock:
            cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

        self.parser.parse(html)
        self.store()
        out(str(self.parser), lvl=2)

        self.n += 1
Example #32
    def blocking_search(self, callback, *args, nextkw=None, **kwargs):
        """Similar transports have the same search loop layout.

        The SelScrape and HttpScrape classes have the same search loops. Only
        the transport mechanism differs (in the HttpScrape class, for example,
        we replace the browser's functionality with our own).

        Args:
            callback: A callable with the search functionality.
        """
        for self.current_keyword in self.keywords:

            out('Next Keyword="{kw}" requested by {scraper} and ip {ip}'.format(kw=self.current_keyword, scraper=self.__class__.__name__, ip=self.ip), lvl=2)

            self.current_page = self.start_page_pos

            for self.current_page in range(1, self.num_pages_per_keyword + 1):

                # set the actual search code in the derived class
                callback(*args, **kwargs)
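A minimal sketch of how a transport class would plug its own search code into this loop; the subclass and its _do_search method are assumptions for illustration only:

# Hypothetical subclass: blocking_search() drives the keyword/page loops
# and calls back into the transport-specific search code.
class MyHttpScrape(SearchEngineScrape):

    def _do_search(self):
        # issue one request for self.current_keyword / self.current_page
        print('fetching page {} for "{}"'.format(self.current_page, self.current_keyword))

    def run(self):
        self.blocking_search(self._do_search)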
Example #33
def fix_broken_cache_names(url, search_engine, scrapemode):
    """Fix broken cache names.

    Args:
        url: A list of strings to add to each cached_file_name() call.
    """
    files = _get_all_cache_files()
    logger.debug('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir', '.scrapecache')))
    r = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    i = 0
    for i, path in enumerate(files, start=1):
        fname = os.path.split(path)[1].strip()
        data = read_cached_file(path)
        infilekws = r.search(data).group('kw')
        realname = cached_file_name(infilekws, search_engine, scrapemode)
        if fname != realname:
            out('The search query in the title element in file {} differs from the hash of its name. Fixing...'.format(path), lvl=3)
            src = os.path.abspath(path)
            dst = os.path.abspath(os.path.join(os.path.split(path)[0], realname))
            logger.debug('Renamed from {} => {}'.format(src, dst))
            os.rename(src, dst)
    logger.debug('Renamed {} files.'.format(i))
Example #34
    def _find_next_page_element(self):
        """Finds the element that locates the next page for any search engine.

        Returns:
            The element that needs to be clicked to get to the next page.
        """
        if self.search_type == 'normal':
            selector = self.next_page_selectors[self.search_engine_name]
            try:
                # wait until the next page link emerges
                WebDriverWait(self.webdriver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, selector)))
                return self.webdriver.find_element_by_css_selector(selector)
            except TimeoutException as te:
                out('{}: Cannot locate next page element: {}'.format(self.name, te), lvl=4)
                return False
            except WebDriverException as e:
                out('{}: Cannot locate next page element: {}'.format(self.name, e), lvl=4)
                return False

        elif self.search_type == 'image':
            self.page_down()
            return True
Example #35
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        try:
            out('[HTTP - {proxy}] Base_url: {base_url}, headers={headers}, params={params}'
                .format(proxy=self.proxy,
                        base_url=self.base_search_url,
                        headers=self.headers,
                        params=self.search_params),
                lvl=3)

            super().detection_prevention_sleep()
            super().keyword_info()

            request = self.requests.get(self.base_search_url,
                                        headers=self.headers,
                                        params=self.search_params,
                                        timeout=5)

            self.current_request_time = datetime.datetime.utcnow()

        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not request.ok:
            logger.error('HTTP Error: {}'.format(request.status_code))
            self.handle_request_denied(request.status_code)
            return False

        super().after_search(request.text)
Example #36
    def __init__(self, *args, time_offset=0.0, **kwargs):
        """Initialize an HttScrape object to scrape over blocking http.

        HttpScrape inherits from SearchEngineScrape
        and from threading.Timer.
        """
        threading.Timer.__init__(self, time_offset, self.search)
        SearchEngineScrape.__init__(self, *args, **kwargs)
        
        # Bind the requests module to this instance such that each
        # instance may have its own proxy
        self.requests = __import__('requests')
        
        # initialize the GET parameters for the search request
        self.search_params = {}

        # initialize the HTTP headers of the search request
        # to some base values that mozilla uses with requests.
        # the Host and User-Agent field need to be set additionally.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        # the mode
        self.scrapemethod = 'http'

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(self.search_engine, self.scrapemethod)

        # check proxies first before anything
        if Config['SCRAPING'].getboolean('check_proxies'):
            self.proxy_check()

        out('[+] HttpScrape[{}] created using the search engine {}. Number of keywords to scrape={}, using proxy={}, number of pages={}'.format(
            self.ip, self.search_engine, len(self.keywords), self.proxy, self.num_pages_per_keyword), lvl=1)
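The header comment above notes that Host and User-Agent still have to be filled in per request. A small sketch of what that could look like; the user_agents list and the engine URL are assumptions, not project data:

# Illustrative only: complete the base headers before issuing a request.
import random
from urllib.parse import urlparse

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
]

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
base_search_url = 'https://www.google.com/search?'  # assumed value

headers['Host'] = urlparse(base_search_url).netloc
headers['User-Agent'] = random.choice(user_agents)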
Example #37
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        try:
            out('[HTTP - {proxy}] Base_url: {base_url}, headers={headers}, params={params}'.format(
                proxy=self.proxy,
                base_url=self.base_search_url,
                headers=self.headers,
                params=self.search_params),
            lvl=3)

            super().detection_prevention_sleep()
            super().keyword_info()

            request = self.requests.get(self.base_search_url, headers=self.headers,
                             params=self.search_params, timeout=5)

            self.current_request_time = datetime.datetime.utcnow()

        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not request.ok:
            logger.error('HTTP Error: {}'.format(request.status_code))
            self.handle_request_denied(request.status_code)
            return False

        super().after_search(request.text)
Example #38
def parse_all_cached_files(keywords,
                           session,
                           scraper_search,
                           try_harder=False):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        identifying_list: A list of lists with elements that identify the result. They consist of the keyword, search_engine and scrapemode.
        session: An sql alchemy session to add the entities
        try_harder: If there is a cache file that cannot be mapped to a keyword, read it again and try to
                    extract the search query from the html.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(
        r'<title>(?P<kw>.*?) - Google Search</title>')
    files = _get_all_cache_files()
    mapping = {}
    search_engine = Config['SCRAPING'].get('search_engine')
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    for kw in keywords:
        key = cached_file_name(kw, search_engine, scrapemethod)

        out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'
            .format(kw=kw, se=search_engine, sm=scrapemethod, hash=key),
            lvl=5)

        mapping[key] = kw

    for path in files:
        # strip off the extension of the path if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith('.' + ext):
                clean_filename = fname[:-(len(ext) + 1)]

        query = mapping.get(clean_filename, None)

        if query:
            # We found a file that contains the keyword, search engine name and
            # searchmode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            try:
                serp = session.query(SearchEngineResultsPage).filter(
                    SearchEngineResultsPage.query == query,
                    SearchEngineResultsPage.search_engine_name ==
                    search_engine, SearchEngineResultsPage.scrapemethod ==
                    scrapemethod).one()
            except NoResultFound as e:
                # that shouldn't happen
                # we have a cache file that matches the above identifying information
                # but it was never stored to the database.
                logger.error(
                    'No entry for file {} found in database. Will parse again.'
                    .format(clean_filename))
                serp = parse_serp(html=read_cached_file(get_path(fname)),
                                  search_engine=search_engine,
                                  scrapemethod=scrapemethod,
                                  current_page=0,
                                  current_keyword=query)
            except MultipleResultsFound as e:
                raise e
            finally:
                scraper_search.serps.append(serp)

            mapping.pop(clean_filename)

        # TODO: support query detection for all supported search engines
        # by parsing the keyword, search engine from the raw html

    out('{} cache files found in {}'.format(len(files),
                                            Config['GLOBAL'].get('cachedir')),
        lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'
        .format(len(keywords) - len(mapping), len(keywords), len(mapping)),
        lvl=1)

    session.add(scraper_search)
    session.commit()
    # return the remaining keywords to scrape
    return mapping.values()
Example #39
 def instance_creation_info(self, scraper_name):
     """Debug message whenever a scraping worker is created"""
     out('[+] {}[{}][search-type:{}][{}] using search engine "{}". Num keywords={}, num pages for keyword={}'.format(
         scraper_name, self.requested_by, self.search_type, self.base_search_url, self.search_engine_name,
         len(self.jobs),
         self.pages_per_keyword), lvl=1)
Example #40
    def search(self, rand=False, timeout=15):
        """The actual search for the search engine.

        When raising StopScrapingException, the scraper will stop.

        When return False, the scraper tries to continue with next keyword.
        """

        success = True

        self.build_search()

        if Config['SCRAPING'].get('user_agents'):
            user_agents_actual = Config['SCRAPING'].get('user_agents')
        else:
            user_agents_actual = user_agents

        if rand:
            self.headers['User-Agent'] = random.choice(user_agents_actual)

        try:
            super().detection_prevention_sleep()
            super().keyword_info()

            proxies = {}
            if self.proxy:
                if self.proxy.username:
                    proxy_base_url = "{username}:{password}@{host}:{port}".format(
                        username=self.proxy.username,
                        password=self.proxy.password,
                        host=self.proxy.host,
                        port=self.proxy.port)
                else:
                    proxy_base_url = "{host}:{port}".format(
                        host=self.proxy.host, port=self.proxy.port)

                proxies = {
                    "http":
                    "{proto}://{base_url}".format(proto='http',
                                                  base_url=proxy_base_url),
                    "https":
                    "{proto}://{base_url}".format(proto='https',
                                                  base_url=proxy_base_url)
                }

            request = self.requests.get(self.base_search_url +
                                        urlencode(self.search_params),
                                        headers=self.headers,
                                        timeout=timeout,
                                        proxies=proxies)

            self.requested_at = datetime.datetime.utcnow()
            self.html = request.text

            self.serp_log.add(self.query, self.html)

            out('[HTTP] {url}, headers={headers}, params={params}'.format(
                url=request.url,
                headers=self.headers,
                params=self.search_params),
                lvl=3)

        except self.requests.ConnectionError as ce:
            self.status = 'Network problem occurred {}'.format(ce)
            success = False
        except self.requests.Timeout as te:
            self.status = 'Connection timeout {}'.format(te)
            success = False
        except self.requests.exceptions.RequestException as e:
            # In case of any http networking exception that wasn't caught
            # in the actual request, just end the worker.
            self.status = 'Stopping scraping because {}'.format(e)
        else:
            if not request.ok:
                self.handle_request_denied(request.status_code)
                success = False

        super().after_search()

        return success
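The proxy handling in search() above boils down to building a proxies mapping that requests.get() accepts. A minimal, standalone sketch of the same construction (the Proxy namedtuple is a stand-in for whatever proxy object the scraper actually passes around):

from collections import namedtuple

# Hypothetical stand-in for the scraper's proxy object.
Proxy = namedtuple('Proxy', 'username password host port')

def build_requests_proxies(proxy):
    """Return a proxies dict usable as requests.get(..., proxies=...)."""
    if proxy is None:
        return {}
    if proxy.username:
        base_url = '{}:{}@{}:{}'.format(proxy.username, proxy.password,
                                        proxy.host, proxy.port)
    else:
        base_url = '{}:{}'.format(proxy.host, proxy.port)
    # One entry per scheme that should be routed through the proxy.
    return {'http': 'http://' + base_url, 'https': 'https://' + base_url}

# build_requests_proxies(Proxy('user', 'pw', '10.0.0.1', '8080'))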
Example #41
0
def parse_all_cached_files(scrape_jobs, session, scraper_search):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        session: An sql alchemy session to add the entities
        scraper_search: Abstract object representing the current search.

    Returns:
        The scrape jobs that couldn't be parsed from the cache directory.
    """
    files = _get_all_cache_files()
    num_cached = num_total = 0
    mapping = {}
    for job in scrape_jobs:
        cache_name = cached_file_name(job['query'], job['search_engine'],
                                      job['scrape_method'], job['page_number'])
        mapping[cache_name] = job
        num_total += 1

    for path in files:
        # strip off the compression extension of the file name if it has one
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith('.' + ext):
                clean_filename = fname[:-(len(ext) + 1)]

        job = mapping.get(clean_filename, None)

        if job:
            # We found a file that contains the keyword, search engine name and
            # search mode that fit our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            serp = get_serp_from_database(session, job['query'],
                                          job['search_engine'],
                                          job['scrape_method'],
                                          job['page_number'])

            if not serp:
                serp = parse_again(fname, job['search_engine'],
                                   job['scrape_method'], job['query'])

            serp.scraper_searches.append(scraper_search)
            session.add(serp)

            if num_cached % 200 == 0:
                session.commit()

            store_serp_result(serp)
            num_cached += 1
            scrape_jobs.remove(job)

    out('{} cache files found in {}'.format(len(files),
                                            Config['GLOBAL'].get('cachedir')),
        lvl=2)
    out('{}/{} objects have been read from the cache. {} remain to get scraped.'
        .format(num_cached, num_total, num_total - num_cached),
        lvl=2)

    session.add(scraper_search)
    session.commit()

    return scrape_jobs
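The cache lookup above hinges on cached_file_name() mapping a job to a stable file name. Its implementation isn't shown in these examples; a plausible sketch, assuming the identifying fields are simply hashed (the hash algorithm and field order are assumptions, and older variants below call it without a page number):

import hashlib

def cached_file_name(query, search_engine, scrape_method, page_number):
    # Assumed scheme: hash the identifying fields into a stable file name.
    key = '{}{}{}{}'.format(query, search_engine, scrape_method, page_number)
    return hashlib.sha256(key.encode()).hexdigest()

# Identical job parameters always map to the same cache file:
assert cached_file_name('foo', 'google', 'http', 1) == \
       cached_file_name('foo', 'google', 'http', 1)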
Example #42
0
def parse_all_cached_files(keywords, search_engines, session, scraper_search):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        session: An sql alchemy session to add the entities

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(
        r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    num_cached = 0
    # a keyword is requested once for each search engine
    num_total_keywords = len(keywords) * len(search_engines)

    for kw in keywords:
        for search_engine in search_engines:
            key = cached_file_name(kw, search_engine, scrapemethod)

            out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'
                .format(kw=kw, se=search_engine, sm=scrapemethod, hash=key),
                lvl=5)

            mapping[key] = (kw, search_engine)

    for path in files:
        # strip off the compression extension of the file name if it has one
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith('.' + ext):
                clean_filename = fname[:-(len(ext) + 1)]

        query = search_engine = None
        val = mapping.get(clean_filename, None)
        if val:
            query, search_engine = val

        if query and search_engine:
            # We found a file that contains the keyword, search engine name and
            # search mode that fit our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            serp = None  #get_serp_from_database(session, query, search_engine, scrapemethod)

            if not serp:
                serp = parse_again(fname, search_engine, scrapemethod, query)

            serp.scraper_searches.append(scraper_search)
            session.add(serp)
            session.commit()

            mapping.pop(clean_filename)
            num_cached += 1

    out('{} cache files found in {}'.format(len(files),
                                            Config['GLOBAL'].get('cachedir')),
        lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'
        .format(num_cached, num_total_keywords,
                num_total_keywords - num_cached),
        lvl=1)

    session.add(scraper_search)
    session.commit()
    # return the remaining keywords to scrape
    return [e[0] for e in mapping.values()]
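Both cache parsers above strip one of the ALLOWED_COMPRESSION_ALGORITHMS extensions and then call read_cached_file(). A minimal sketch of that reader, assuming the allowed algorithms are gzip and bz2 (the actual tuple and encoding handling may differ):

import bz2
import gzip

# Assumed values; the real constant lives in the caching module.
ALLOWED_COMPRESSION_ALGORITHMS = ('gz', 'bz2')

def read_cached_file(path):
    """Return the cached HTML as text, transparently decompressing it."""
    if path.endswith('.gz'):
        opener = gzip.open
    elif path.endswith('.bz2'):
        opener = bz2.open
    else:
        opener = open
    with opener(path, 'rt', encoding='utf-8', errors='ignore') as f:
        return f.read()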
Example #43
0
def main(return_results=False, parse_cmd_line=True):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it
    shouldn't be the main() function's job to check the validity of the provided
    configuration.

    Args:
        return_results: When GoogleScraper is used from within another program, don't print results to stdout,
                        store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.
    Returns:
        A database session to the results when return_results is True
    """
    if parse_cmd_line:
        parse_cmd_args()

    # If the configuration file to use is explicitly specified, update the current configuration
    # with it.
    if Config['GLOBAL'].get('config_file', None):
        update_config_with_file(Config['GLOBAL'].get('config_file', None))

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE

        print(open(CONFIG_FILE).read())
        return

    if Config['GLOBAL'].getboolean('version'):
        from GoogleScraper.version import __version__

        print(__version__)
        return

    if Config['GLOBAL'].getboolean('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(Config['GLOBAL'].get('cachedir')))
        except:
            pass
        return

    init_outfile(force_reload=True)

    kwfile = Config['SCRAPING'].get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = Config['SCRAPING'].get('keyword')
    keywords = {keyword for keyword in set(Config['SCRAPING'].get('keywords', []).split('\n')) if keyword}
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    se = Config['SCRAPING'].get('search_engines', 'google')
    if se.strip() == '*':
        se = Config['SCRAPING'].get('supported_search_engines', 'google')

    search_engines = list({search_engine.strip() for search_engine in se.split(',') if search_engine.strip()})
    assert search_engines, 'No search engine specified'
    num_search_engines = len(search_engines)
    num_workers = Config['SCRAPING'].getint('num_workers')
    scrape_method = Config['SCRAPING'].get('scrape_method')
    pages = Config['SCRAPING'].getint('num_pages_for_keyword', 1)
    method = Config['SCRAPING'].get('scrape_method', 'http')

    if Config['GLOBAL'].getboolean('shell', False):
        namespace = {}
        session_cls = get_session(scoped=False)
        namespace['session'] = session_cls()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = GoogleScraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        raise_or_log(
            'No keywords to scrape for. Please provide either a keyword file (option: --keyword-file) or specify a '
            'keyword with --keyword.')
        # Just print the help.
        get_command_line(True)
        return

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    keywords = [keyword, ] if keyword else keywords
    scrape_jobs = {}
    if kwfile:
        if not os.path.exists(kwfile):
            raise_or_log('The keyword file {} does not exist.'.format(kwfile),
                         exception_obj=InvalidConfigurationException)
        else:
            if kwfile.endswith('.py'):
                # we need to import the variable "scrape_jobs" from the module.
                sys.path.append(os.path.dirname(kwfile))
                try:
                    modname = os.path.splitext(os.path.split(kwfile)[-1])[0]
                    scrape_jobs = getattr(__import__(modname, fromlist=['scrape_jobs']), 'scrape_jobs')
                except ImportError as e:
                    logger.warning(e)
            else:
                # Clean the keywords of duplicates right in the beginning
                keywords = set([line.strip() for line in open(kwfile, 'r').read().split('\n') if line.strip()])

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if Config['GLOBAL'].getboolean('clean_cache_files', False):
        clean_cachefiles()
        return

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise_or_log('No more than 100 results per page are available for searches.',
                     exception_obj=InvalidConfigurationException)

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if Config['SCRAPING'].getboolean('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise InvalidConfigurationException(
            'No proxies available and using own IP is prohibited by configuration. Shutting down.')

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise_or_log('Invalid search type! Select one of {}'.format(repr(valid_search_types)),
                     exception_obj=InvalidConfigurationException)

    if Config['GLOBAL'].getboolean('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info('If GoogleScraper would have been run without the --simulate flag, it would have:')
        logger.info('Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'.format(
            len(keywords), Config['SCRAPING'].getint('num_results_per_page', 0),
            Config['SCRAPING'].getint('num_pages_for_keyword')))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([proxy.host + ':' + proxy.port for proxy in proxies if proxy])))

        logger.info('By using {} mode with {} worker instances'.format(Config['SCRAPING'].get('scrape_method'),
                                                                       Config['SCRAPING'].getint('num_workers')))
        return

    # get a scoped sqlalchemy session
    session_cls = get_session(scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and Config['GLOBAL'].getboolean('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(os.path.getmtime(last_search.keyword_file))

            # if the last modification is older than the start of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=os.path.abspath(kwfile),
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines)
        )

    # First of all, lets see how many requests remain to issue after searching the cache.
    if Config['GLOBAL'].getboolean('do_caching'):
        scrape_jobs = parse_all_cached_files(scrape_jobs, session, scraper_search)

    if scrape_jobs:

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        out('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'.format(
            num_keywords=len(list(scrape_jobs)),
            num_proxies=len(proxies),
            num_threads=num_search_engines
            ), lvl=1)

        progress_thread = None

        # Let the games begin
        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for worker in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(
                                mode=method,
                                proxy=proxy,
                                search_engine=search_engine,
                                session=session,
                                db_lock=db_lock,
                                cache_lock=cache_lock,
                                scraper_search=scraper_search,
                                captcha_lock=captcha_lock,
                                progress_queue=q,
                                browser_num=num_worker
                            )
                        )

            for job in scrape_jobs:

                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []

            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(scrape_jobs, session=session, scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise InvalidConfigurationException(
                'No such scrape_method {}'.format(Config['SCRAPING'].get('scrape_method')))

        if method in ('selenium', 'http'):
            # progress_thread can be None
            try:
                progress_thread.join()
            except AttributeError:
                pass

    # in the end, close the json file.
    from GoogleScraper.output_converter import outfile, output_format

    if output_format == 'json':
        outfile.end()

    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search
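When no job file is supplied, main() falls back to default_scrape_jobs_for_keywords(). A plausible sketch that yields job dicts with the same keys parse_all_cached_files() reads in Example #41 (the generator form and the 1-based page numbering are assumptions):

def default_scrape_jobs_for_keywords(keywords, search_engines, scrape_method, pages):
    # One job per (keyword, search engine, page) combination.
    for query in keywords:
        for search_engine in search_engines:
            for page_number in range(1, pages + 1):
                yield {
                    'query': query,
                    'search_engine': search_engine,
                    'scrape_method': scrape_method,
                    'page_number': page_number,
                }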
Example #44
0
def parse_all_cached_files(keywords, search_engines, session, scraper_search, try_harder=False):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        identifying_list: A list of lists whose elements identify a result. They consist of the keyword, search_engine and scrapemode.
        session: An sql alchemy session to add the entities
        try_harder: If there is a cache file that cannot be mapped to a keyword, read it and try to
                    extract the search query from the html.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    num_cached = 0

    for kw in keywords:
        for search_engine in search_engines:
            key = cached_file_name(kw, search_engine, scrapemethod)

            out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'.format(
                    kw=kw,
                    se=search_engine,
                    sm=scrapemethod,
                    hash=key
                ), lvl=5)

            mapping[key] = (kw, search_engine)

    for path in files:
        # strip off the compression extension of the file name if it has one
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith('.' + ext):
                clean_filename = fname[:-(len(ext) + 1)]

        query = search_engine = None
        val = mapping.get(clean_filename, None)
        if val:
            query, search_engine = val

        if query and search_engine:
            # We found a file that contains the keyword, search engine name and
            # search mode that fit our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            try:
                serp = session.query(SearchEngineResultsPage).filter(
                        SearchEngineResultsPage.query == query,
                        SearchEngineResultsPage.search_engine_name == search_engine,
                        SearchEngineResultsPage.scrapemethod == scrapemethod).one()
            except NoResultFound as e:
                # that shouldn't happen
                # we have a cache file that matches the above identifying information
                # but it was never stored to the database.
                logger.error('No entry for file {} found in database. Will parse again.'.format(clean_filename))
                html = read_cached_file(get_path(fname))
                serp = parse_serp(
                    html=html,
                    search_engine=search_engine,
                    scrapemethod=scrapemethod,
                    current_page=0,
                    current_keyword=query
                )
            except MultipleResultsFound as e:
                raise e

            if serp:
                scraper_search.serps.append(serp)

            mapping.pop(clean_filename)
            num_cached += 1

    out('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir')), lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'.format(
        num_cached, len(keywords), len(keywords) - num_cached), lvl=1)

    session.add(scraper_search)
    session.commit()
    # return the remaining keywords to scrape
    return [e[0] for e in mapping.values()]
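Every cache-parsing variant starts from _get_all_cache_files(). A minimal sketch under the assumption that it simply walks the configured cache directory and returns full paths:

import os

def _get_all_cache_files(cachedir='.scrapecache'):
    # Recursively collect every file below the cache directory.
    files = []
    for dirpath, _dirnames, filenames in os.walk(cachedir):
        for name in filenames:
            files.append(os.path.join(dirpath, name))
    return files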
Example #45
0
    def _parse(self, cleaner=None):
        """Internal parse the dom according to the provided css selectors.
        
        Raises: InvalidSearchTypeException if no css selectors for the searchtype could be found.
        """
        self._parse_lxml(cleaner)

        # try to parse the number of results.
        attr_name = self.searchtype + '_search_selectors'
        selector_dict = getattr(self, attr_name, None)

        # get the appropriate css selectors for the num_results for the keyword
        num_results_selector = getattr(self, 'num_results_search_selectors',
                                       None)

        self.num_results_for_query = self.first_match(num_results_selector,
                                                      self.dom)
        if not self.num_results_for_query:
            out('{}: Cannot parse num_results from serp page with selectors {}'
                .format(self.__class__.__name__, num_results_selector),
                lvl=4)

        # get the current page we are at. Sometimes search engines don't show this.
        try:
            self.page_number = int(
                self.first_match(self.page_number_selectors, self.dom))
        except ValueError:
            self.page_number = -1

        # check whether the query produced no results and the engine substituted a different one
        self.effective_query = self.first_match(self.effective_query_selector,
                                                self.dom)
        if self.effective_query:
            out('{}: There was no search hit for the search query. Search engine used {} instead.'
                .format(self.__class__.__name__, self.effective_query),
                lvl=4)

        # the element that notifies the user about no results.
        self.no_results_text = self.first_match(self.no_results_selector,
                                                self.dom)

        # get the stuff that is of interest in SERP pages.
        if not selector_dict and not isinstance(selector_dict, dict):
            raise InvalidSearchTypeException(
                'There is no such attribute: {}. No selectors found'.format(
                    attr_name))

        for result_type, selector_class in selector_dict.items():

            self.search_results[result_type] = []

            for selector_specific, selectors in selector_class.items():

                if 'result_container' in selectors and selectors[
                        'result_container']:
                    css = '{container} {result_container}'.format(**selectors)
                else:
                    css = selectors['container']

                results = self.dom.xpath(self.css_to_xpath(css))

                to_extract = set(
                    selectors.keys()) - {'container', 'result_container'}
                selectors_to_use = {
                    key: selectors[key]
                    for key in to_extract if key in selectors.keys()
                }

                for index, result in enumerate(results):
                    # Let's add primitive support for CSS3 pseudo selectors
                    # We just need two of them
                    # ::text
                    # ::attr(attribute)

                    # You say we should use xpath expressions instead?
                    # Maybe you're right, but they are complicated when it comes to classes,
                    # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
                    serp_result = {}
                    # keys are for example 'link', 'snippet', 'visible-url', ...
                    # selector is the selector to grab these items
                    for key, selector in selectors_to_use.items():
                        serp_result[key] = self.advanced_css(selector, result)

                    serp_result['rank'] = index + 1

                    # only add items whose links are not None.
                    # Avoid duplicates. Detect them by the link.
                    # If statement below: Lazy evaluation. The more probable case first.
                    if 'link' in serp_result and serp_result['link'] and \
                            not [e for e in self.search_results[result_type] if e['link'] == serp_result['link']]:
                        self.search_results[result_type].append(serp_result)
                        self.num_results += 1
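The extraction loop above hands each selector to advanced_css(), which is expected to understand the '::text' and '::attr(attribute)' pseudo-selectors mentioned in the comments. A minimal sketch of that idea on top of lxml's CSSSelector (returning only the first match is an assumption about the real method's behaviour):

from lxml.cssselect import CSSSelector

def advanced_css(selector, element):
    # Split off the supported pseudo-selectors before compiling the CSS part.
    attr = None
    if selector.endswith('::text'):
        selector, mode = selector[:-len('::text')], 'text'
    elif '::attr(' in selector:
        selector, _, rest = selector.partition('::attr(')
        attr, mode = rest.rstrip(')'), 'attr'
    else:
        mode = 'element'

    matches = CSSSelector(selector)(element)
    if not matches:
        return None
    first = matches[0]
    if mode == 'text':
        return first.text_content()
    if mode == 'attr':
        return first.get(attr)
    return first

# Example with lxml.html:
#   dom = lxml.html.fromstring('<div class="r"><a href="http://x">hit</a></div>')
#   advanced_css('div.r a::attr(href)', dom)  ->  'http://x'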
Example #46
0
    def instance_creation_info(self, scraper_name):
        """Debug message whenever a scraping worker is created"""
        out('[+] {}[{}][search-type:{}] created using the search engine {}. Number of keywords to scrape={}, using proxy={}, number of pages per keyword={}'.format(
            scraper_name, self.ip, self.search_type, self.search_engine, len(self.keywords), self.proxy, self.num_pages_per_keyword), lvl=2)
Example #47
0
def main(return_results=False, parse_cmd_line=True):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it
    shouldn't be the main() function's job to check the validity of the provided
    configuration.

    Args:
        return_results: When GoogleScraper is used from within another program, don't print results to stdout,
                        store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.
    Returns:
        A database session to the results when return_results is True
    """
    if parse_cmd_line:
        parse_cmd_args()

    # If the configuration file to use is explicitly specified, update the current configuration
    # with it.
    if Config['GLOBAL'].get('config_file', None):
        update_config_with_file(Config['GLOBAL'].get('config_file', None))

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE
        print(open(CONFIG_FILE).read())
        return

    if Config['GLOBAL'].getboolean('version'):
        from GoogleScraper.version import __version__
        print(__version__)
        return

    kwfile = Config['SCRAPING'].get('keyword_file', '')
    keyword = Config['SCRAPING'].get('keyword')
    keywords = {
        keyword
        for keyword in set(Config['SCRAPING'].get('keywords', []).split('\n'))
        if keyword
    }
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    if Config['GLOBAL'].getboolean('shell', False):
        namespace = {}
        Session = get_session(scoped=False, create=False)
        namespace['session'] = Session()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        logger.error(
            'No keywords to scrape for. Please provide either a keyword file (option: --keyword-file) or specify a keyword with --keyword.'
        )
        get_command_line(False, True)
        return

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    keywords = [
        keyword,
    ] if keyword else keywords
    if kwfile:
        if not os.path.exists(kwfile):
            raise InvalidConfigurationException(
                'The keyword file {} does not exist.'.format(kwfile))
        else:
            # Clean the keywords of duplicates right in the beginning
            keywords = set([
                line.strip() for line in open(kwfile, 'r').read().split('\n')
            ])

    search_engines = list({
        search_engine
        for search_engine in Config['SCRAPING'].get(
            'search_engines', 'google').split(',') if search_engine
    })
    assert search_engines, 'No search engine specified'

    if Config['GLOBAL'].getboolean('clean_cache_files', False):
        clean_cachefiles()
        return

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise InvalidConfigurationException(
            'No more than 100 results per page are available for searches.')

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if Config['SCRAPING'].getboolean('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise InvalidConfigurationException(
            "No proxies available and using own IP is prohibited by configuration. Turning down."
        )

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise InvalidConfigurationException(
            'Invalid search type! Select one of {}'.format(
                repr(valid_search_types)))

    if Config['GLOBAL'].getboolean('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info(
            'If GoogleScraper would have been run without the --simulate flag, it would have:'
        )
        logger.info(
            'Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'
            .format(len(keywords),
                    Config['SCRAPING'].getint('num_results_per_page', 0),
                    Config['SCRAPING'].getint('num_pages_for_keyword')))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(
            len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([
                    proxy.host + ':' + proxy.port for proxy in proxies if proxy
                ])))

        logger.info('By using {} mode with {} worker instances'.format(
            Config['SCRAPING'].get('scrapemethod'),
            Config['SCRAPING'].getint('num_workers')))
        return

    # get a scoped sqlalchemy session
    Session = get_session(scoped=False, create=True)
    session = Session()

    # ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.

    scraper_search = None
    if kwfile:
        searches = session.query(ScraperSearch).\
            filter(ScraperSearch.keyword_file == kwfile).\
            order_by(ScraperSearch.started_searching).\
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.fromtimestamp(
                os.path.getmtime(last_search.keyword_file))

            # if the last modification is older than the start of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=kwfile,
            number_search_engines_used=1,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines))

    # First of all, lets see how many keywords remain to scrape after parsing the cache
    if Config['GLOBAL'].getboolean('do_caching'):
        remaining = parse_all_cached_files(keywords, search_engines, session,
                                           scraper_search)
    else:
        remaining = keywords

    # remove duplicates and empty keywords
    remaining = [keyword for keyword in set(remaining) if keyword]

    if remaining:

        kwgroups = assign_keywords_to_scrapers(remaining)

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # final check before going into the loop
        num_workers_to_allocate = len(kwgroups) * len(search_engines)
        if num_workers_to_allocate > Config['SCRAPING'].getint('maximum_workers'):
            logger.error('Too many workers: {}, might crash the app'.format(
                num_workers_to_allocate))

        out('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'
            .format(num_keywords=len(remaining),
                    num_proxies=len(proxies),
                    num_threads=Config['SCRAPING'].getint('num_workers', 1)),
            lvl=1)

        # Show the progress of the scraping
        q = queue.Queue()
        progress_thread = ShowProgressQueue(q, len(remaining))
        progress_thread.start()

        # Let the games begin
        if Config['SCRAPING'].get('scrapemethod') in ('selenium', 'http'):
            # A lock to prevent multiple threads from solving captcha.
            captcha_lock = threading.Lock()

            # Distribute the proxies evenly on the keywords to search for
            scrapejobs = []

            for k, search_engine in enumerate(search_engines):
                for i, keyword_group in enumerate(kwgroups):

                    proxy_to_use = proxies[i % len(proxies)]

                    if Config['SCRAPING'].get('scrapemethod',
                                              'http') == 'selenium':
                        scrapejobs.append(
                            SelScrape(
                                search_engine=search_engine,
                                session=session,
                                keywords=keyword_group,
                                db_lock=db_lock,
                                cache_lock=cache_lock,
                                scraper_search=scraper_search,
                                captcha_lock=captcha_lock,
                                browser_num=i,
                                proxy=proxy_to_use,
                                progress_queue=q,
                            ))
                    elif Config['SCRAPING'].get('scrapemethod') == 'http':
                        scrapejobs.append(
                            HttpScrape(
                                search_engine=search_engine,
                                keywords=keyword_group,
                                session=session,
                                scraper_search=scraper_search,
                                cache_lock=cache_lock,
                                db_lock=db_lock,
                                proxy=proxy_to_use,
                                progress_queue=q,
                            ))

            for t in scrapejobs:
                t.start()

            for t in scrapejobs:
                t.join()

        elif Config['SCRAPING'].get('scrapemethod') == 'http-async':
            raise NotImplementedError('soon my dear friends :)')

        else:
            raise InvalidConfigurationException(
                'No such scrapemethod. Use "http" or "selenium".')

        scraper_search.stopped_searching = datetime.datetime.utcnow()
        session.add(scraper_search)
        session.commit()

        progress_thread.join()

        if return_results:
            return session
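The keyword groups passed to the scraper threads come from assign_keywords_to_scrapers(). A plausible chunking sketch; the real function most likely derives the group count from the configuration, so the explicit num_groups parameter is an assumption:

def assign_keywords_to_scrapers(keywords, num_groups=1):
    # Split the keywords into num_groups chunks of roughly equal size.
    keywords = list(keywords)
    size = max(1, -(-len(keywords) // max(1, num_groups)))  # ceiling division
    return [keywords[i:i + size] for i in range(0, len(keywords), size)]

# assign_keywords_to_scrapers(['a', 'b', 'c', 'd', 'e'], 2)  ->  [['a', 'b', 'c'], ['d', 'e']]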
Example #48
0
    def search(self):
        """Search with webdriver.

        Fills out the search form of the search engine for each keyword.
        Clicks the next link while pages_per_keyword is not reached.
        """
        for self.query, self.pages_per_keyword in self.jobs.items():

            self.search_input = self._wait_until_search_input_field_appears()

            if self.search_input is False and Config['PROXY_POLICY'].getboolean('stop_on_detection'):
                self.status = 'Malicious request detected'
                super().after_search()
                return

            if self.search_input is False:
                # @todo: pass status_code
                self.search_input = self.handle_request_denied()

            if self.search_input:
                self.search_input.clear()
                time.sleep(.25)

                self.search_param_fields = self._get_search_param_fields()

                if self.search_param_fields:
                    wait_res = self._wait_until_search_param_fields_appears()
                    if wait_res is False:
                        raise Exception('Timed out waiting for the search parameter input fields')
                    for param, field in self.search_param_fields.items():
                        if field[0] == By.ID:
                            js_tpl = '''
                            var field = document.getElementById("%s");
                            field.setAttribute("value", "%s");
                            '''
                        elif field[0] == By.NAME:
                            js_tpl = '''
                            var fields = document.getElementsByName("%s");
                            for (var i = 0; i < fields.length; i++) {
                                fields[i].setAttribute("value", "%s");
                            }
                            '''
                        js_str = js_tpl % (field[1], self.search_param_values[param])
                        self.webdriver.execute_script(js_str)

                try:
                    self.search_input.send_keys(self.query + Keys.ENTER)
                except ElementNotVisibleException:
                    time.sleep(2)
                    self.search_input.send_keys(self.query + Keys.ENTER)

                self.requested_at = datetime.datetime.utcnow()
            else:
                out('{}: Cannot get handle to the input form for keyword {}.'.format(self.name, self.query), lvl=4)
                continue

            super().detection_prevention_sleep()
            super().keyword_info()

            for self.page_number in self.pages_per_keyword:

                self.wait_until_serp_loaded()

                try:
                    self.html = self.webdriver.execute_script('return document.body.innerHTML;')
                except WebDriverException as e:
                    self.html = self.webdriver.page_source

                super().after_search()

                # Click the next page link, unless we are about to leave
                # the loop in the next iteration.
                if self.page_number in self.pages_per_keyword:
                    next_url = self._goto_next_page()
                    self.requested_at = datetime.datetime.utcnow()

                    if not next_url:
                        break
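search() above relies on _wait_until_search_input_field_appears() returning either the input element or False. A minimal illustration of that contract with an explicit WebDriverWait; the function name and the locator are placeholders, since the real scraper resolves the selector per search engine:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_search_input(webdriver, timeout=5):
    # Placeholder locator; Google's search box is named 'q'.
    locator = (By.NAME, 'q')
    try:
        return WebDriverWait(webdriver, timeout).until(
            EC.visibility_of_element_located(locator))
    except TimeoutException:
        # Mirrors the False return value that search() checks above.
        return False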
Example #49
0
    def _parse(self, cleaner=None):
        """Internal parse the dom according to the provided css selectors.
        
        Raises: InvalidSearchTypeException if no css selectors for the searchtype could be found.
        """
        self._parse_lxml(cleaner)

        # try to parse the number of results.
        attr_name = self.searchtype + '_search_selectors'
        selector_dict = getattr(self, attr_name, None)

        # get the appropriate css selectors for the num_results for the keyword
        num_results_selector = getattr(self, 'num_results_search_selectors', None)

        self.num_results_for_query = self.first_match(num_results_selector, self.dom)
        if not self.num_results_for_query:
            out('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__,
                                                                                       num_results_selector), lvl=4)

        # get the current page we are at. Sometimes search engines don't show this.
        try:
            self.page_number = int(self.first_match(self.page_number_selectors, self.dom))
        except ValueError:
            self.page_number = -1

        # check whether the query produced no results and the engine substituted a different one
        self.effective_query = self.first_match(self.effective_query_selector, self.dom)
        if self.effective_query:
            out('{}: There was no search hit for the search query. Search engine used {} instead.'.format(
                self.__class__.__name__, self.effective_query), lvl=4)

        # the element that notifies the user about no results.
        self.no_results_text = self.first_match(self.no_results_selector, self.dom)

        # get the stuff that is of interest in SERP pages.
        if not selector_dict and not isinstance(selector_dict, dict):
            raise InvalidSearchTypeException('There is no such attribute: {}. No selectors found'.format(attr_name))

        for result_type, selector_class in selector_dict.items():

            self.search_results[result_type] = []

            for selector_specific, selectors in selector_class.items():

                if 'result_container' in selectors and selectors['result_container']:
                    css = '{container} {result_container}'.format(**selectors)
                else:
                    css = selectors['container']

                results = self.dom.xpath(
                    self.css_to_xpath(css)
                )

                to_extract = set(selectors.keys()) - {'container', 'result_container'}
                selectors_to_use = {key: selectors[key] for key in to_extract if key in selectors.keys()}

                for index, result in enumerate(results):
                    # Let's add primitive support for CSS3 pseudo selectors
                    # We just need two of them
                    # ::text
                    # ::attr(attribute)

                    # You say we should use xpath expressions instead?
                    # Maybe you're right, but they are complicated when it comes to classes,
                    # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
                    serp_result = {}
                    # keys are for example 'link', 'snippet', 'visible-url', ...
                    # selector is the selector to grab these items
                    for key, selector in selectors_to_use.items():
                        serp_result[key] = self.advanced_css(selector, result)

                    serp_result['rank'] = index + 1

                    # only add items whose links are not None.
                    # Avoid duplicates. Detect them by the link.
                    # If statement below: Lazy evaluation. The more probable case first.
                    if 'link' in serp_result and serp_result['link'] and \
                            not [e for e in self.search_results[result_type] if e['link'] == serp_result['link']]:
                        self.search_results[result_type].append(serp_result)
                        self.num_results += 1
Example #50
0
def main(return_results=False, parse_cmd_line=True):
    """Runs the GoogleScraper application as determined by the various configuration points.

    The main() function encompasses the core functionality of GoogleScraper. But it
    shouldn't be the main() function's job to check the validity of the provided
    configuration.

    Args:
        return_results: When GoogleScraper is used from within another program, don't print results to stdout,
                        store them in a database instead.
        parse_cmd_line: Whether to get options from the command line or not.
    Returns:
        A database session to the results when return_results is True
    """
    if parse_cmd_line:
        parse_cmd_args()

    # If the configuration file to use is explicitly specified, update the current configuration
    # with it.
    if Config['GLOBAL'].get('config_file', None):
        update_config_with_file(Config['GLOBAL'].get('config_file', None))

    if Config['GLOBAL'].getboolean('view_config'):
        from GoogleScraper.config import CONFIG_FILE

        print(open(CONFIG_FILE).read())
        return

    if Config['GLOBAL'].getboolean('version'):
        from GoogleScraper.version import __version__

        print(__version__)
        return

    if Config['GLOBAL'].getboolean('clean', False):
        try:
            os.remove('google_scraper.db')
            if sys.platform == 'linux':
                os.system('rm {}/*'.format(Config['GLOBAL'].get('cachedir')))
        except:
            pass
        return

    init_outfile(force_reload=True)

    kwfile = Config['SCRAPING'].get('keyword_file', '')
    if kwfile:
        kwfile = os.path.abspath(kwfile)

    keyword = Config['SCRAPING'].get('keyword')
    keywords = {
        keyword
        for keyword in set(Config['SCRAPING'].get('keywords', []).split('\n'))
        if keyword
    }
    proxy_file = Config['GLOBAL'].get('proxy_file', '')
    proxy_db = Config['GLOBAL'].get('mysql_proxy_db', '')

    se = Config['SCRAPING'].get('search_engines', 'google')
    if se.strip() == '*':
        se = Config['SCRAPING'].get('supported_search_engines', 'google')

    search_engines = list({
        search_engine.strip()
        for search_engine in se.split(',') if search_engine.strip()
    })
    assert search_engines, 'No search engine specified'
    num_search_engines = len(search_engines)
    num_workers = Config['SCRAPING'].getint('num_workers')
    scrape_method = Config['SCRAPING'].get('scrape_method')
    pages = Config['SCRAPING'].getint('num_pages_for_keyword', 1)
    method = Config['SCRAPING'].get('scrape_method', 'http')

    if Config['GLOBAL'].getboolean('shell', False):
        namespace = {}
        session_cls = get_session(scoped=False)
        namespace['session'] = session_cls()
        namespace['ScraperSearch'] = ScraperSearch
        namespace['SERP'] = SERP
        namespace['Link'] = Link
        namespace['Proxy'] = GoogleScraper.database.Proxy
        print('Available objects:')
        print('session - A sqlalchemy session of the results database')
        print('ScraperSearch - Search/Scrape job instances')
        print('SERP - A search engine results page')
        print('Link - A single link belonging to a SERP')
        print('Proxy - Proxies stored for scraping projects.')
        start_python_console(namespace)
        return

    if not (keyword or keywords) and not kwfile:
        raise_or_log(
            'No keywords to scrape for. Please provide either a keyword file (option: --keyword-file) or specify a '
            'keyword with --keyword.')
        # Just print the help.
        get_command_line(True)
        return

    if Config['GLOBAL'].getboolean('fix_cache_names'):
        fix_broken_cache_names()
        logger.info('renaming done. restart for normal use.')
        return

    keywords = [
        keyword,
    ] if keyword else keywords
    scrape_jobs = {}
    if kwfile:
        if not os.path.exists(kwfile):
            raise_or_log('The keyword file {} does not exist.'.format(kwfile),
                         exception_obj=InvalidConfigurationException)
        else:
            if kwfile.endswith('.py'):
                # we need to import the variable "scrape_jobs" from the module.
                sys.path.append(os.path.dirname(kwfile))
                try:
                    modname = os.path.splitext(os.path.split(kwfile)[-1])[0]
                    scrape_jobs = getattr(
                        __import__(modname, fromlist=['scrape_jobs']),
                        'scrape_jobs')
                except ImportError as e:
                    logger.warning(e)
            else:
                # Clean the keywords of duplicates right in the beginning
                keywords = set([
                    line.strip()
                    for line in open(kwfile, 'r').read().split('\n')
                    if line.strip()
                ])

    if not scrape_jobs:
        scrape_jobs = default_scrape_jobs_for_keywords(keywords,
                                                       search_engines,
                                                       scrape_method, pages)

    scrape_jobs = list(scrape_jobs)

    if Config['GLOBAL'].getboolean('clean_cache_files', False):
        clean_cachefiles()
        return

    if Config['GLOBAL'].getboolean('check_oto', False):
        _caching_is_one_to_one(keyword)

    if Config['SCRAPING'].getint('num_results_per_page') > 100:
        raise_or_log(
            'No more than 100 results per page are available for searches.',
            exception_obj=InvalidConfigurationException)

    proxies = []

    if proxy_db:
        proxies = get_proxies_from_mysql_db(proxy_db)
    elif proxy_file:
        proxies = parse_proxy_file(proxy_file)

    if Config['SCRAPING'].getboolean('use_own_ip'):
        proxies.append(None)

    if not proxies:
        raise InvalidConfigurationException(
            'No proxies available and using own IP is prohibited by configuration. Shutting down.'
        )

    valid_search_types = ('normal', 'video', 'news', 'image')
    if Config['SCRAPING'].get('search_type') not in valid_search_types:
        raise_or_log('Invalid search type! Select one of {}'.format(
            repr(valid_search_types)),
                     exception_obj=InvalidConfigurationException)

    if Config['GLOBAL'].getboolean('simulate', False):
        print('*' * 60 + 'SIMULATION' + '*' * 60)
        logger.info(
            'If GoogleScraper would have been run without the --simulate flag, it would have:'
        )
        logger.info(
            'Scraped for {} keywords, with {} results a page, in total {} pages for each keyword'
            .format(len(keywords),
                    Config['SCRAPING'].getint('num_results_per_page', 0),
                    Config['SCRAPING'].getint('num_pages_for_keyword')))
        if None in proxies:
            logger.info('Also using own ip address to scrape.')
        else:
            logger.info('Not scraping with own ip address.')
        logger.info('Used {} unique ip addresses in total'.format(
            len(proxies)))
        if proxies:
            logger.info('The following proxies are used: \n\t\t{}'.format(
                '\n\t\t'.join([
                    proxy.host + ':' + proxy.port for proxy in proxies if proxy
                ])))

        logger.info('By using {} mode with {} worker instances'.format(
            Config['SCRAPING'].get('scrape_method'),
            Config['SCRAPING'].getint('num_workers')))
        return

    # get a scoped sqlalchemy session
    session_cls = get_session(scoped=False)
    session = session_cls()

    # add fixtures
    fixtures(session)

    # add proxies to the database
    add_proxies_to_db(proxies, session)

    # ask the user to continue the last scrape. We detect a continuation of a
    # previously established scrape, if the keyword-file is the same and unmodified since
    # the beginning of the last scrape.
    scraper_search = None
    if kwfile and Config['GLOBAL'].getboolean('continue_last_scrape', False):
        searches = session.query(ScraperSearch). \
            filter(ScraperSearch.keyword_file == kwfile). \
            order_by(ScraperSearch.started_searching). \
            all()

        if searches:
            last_search = searches[-1]
            last_modified = datetime.datetime.utcfromtimestamp(
                os.path.getmtime(last_search.keyword_file))

            # if the last modification is older than the start of the search
            if last_modified < last_search.started_searching:
                scraper_search = last_search
                logger.info('Continuing last scrape.')

    if not scraper_search:
        scraper_search = ScraperSearch(
            keyword_file=os.path.abspath(kwfile),
            number_search_engines_used=num_search_engines,
            number_proxies_used=len(proxies),
            number_search_queries=len(keywords),
            started_searching=datetime.datetime.utcnow(),
            used_search_engines=','.join(search_engines))

    # First of all, lets see how many requests remain to issue after searching the cache.
    if Config['GLOBAL'].getboolean('do_caching'):
        scrape_jobs = parse_all_cached_files(scrape_jobs, session,
                                             scraper_search)

    if scrape_jobs:

        # Create a lock to synchronize database access in the sqlalchemy session
        db_lock = threading.Lock()

        # create a lock to cache results
        cache_lock = threading.Lock()

        # A lock to prevent multiple threads from solving captcha, used in selenium instances.
        captcha_lock = threading.Lock()

        out('Going to scrape {num_keywords} keywords with {num_proxies} proxies by using {num_threads} threads.'
            .format(num_keywords=len(list(scrape_jobs)),
                    num_proxies=len(proxies),
                    num_threads=num_search_engines),
            lvl=1)

        progress_thread = None

        # Let the games begin
        if method in ('selenium', 'http'):

            # Show the progress of the scraping
            q = queue.Queue()
            progress_thread = ShowProgressQueue(q, len(scrape_jobs))
            progress_thread.start()

            workers = queue.Queue()
            num_worker = 0
            for search_engine in search_engines:

                for proxy in proxies:

                    for worker in range(num_workers):
                        num_worker += 1
                        workers.put(
                            ScrapeWorkerFactory(mode=method,
                                                proxy=proxy,
                                                search_engine=search_engine,
                                                session=session,
                                                db_lock=db_lock,
                                                cache_lock=cache_lock,
                                                scraper_search=scraper_search,
                                                captcha_lock=captcha_lock,
                                                progress_queue=q,
                                                browser_num=num_worker))

            for job in scrape_jobs:

                while True:
                    worker = workers.get()
                    workers.put(worker)
                    if worker.is_suitabe(job):
                        worker.add_job(job)
                        break

            threads = []

            while not workers.empty():
                worker = workers.get()
                thread = worker.get_worker()
                if thread:
                    threads.append(thread)

            for t in threads:
                t.start()

            for t in threads:
                t.join()

            # after threads are done, stop the progress queue.
            q.put('done')

        elif method == 'http-async':
            scheduler = AsyncScrapeScheduler(scrape_jobs,
                                             session=session,
                                             scraper_search=scraper_search,
                                             db_lock=db_lock)
            scheduler.run()

        else:
            raise InvalidConfigurationException(
                'No such scrape_method {}'.format(
                    Config['SCRAPING'].get('scrape_method')))

        if method in ('selenium', 'http'):
            # progress_thread can be None
            try:
                progress_thread.join()
            except AttributeError:
                pass

    # in the end, close the json file.
    from GoogleScraper.output_converter import outfile, output_format

    if output_format == 'json':
        outfile.end()

    scraper_search.stopped_searching = datetime.datetime.utcnow()
    session.add(scraper_search)
    session.commit()

    if return_results:
        return scraper_search
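
The dispatch loop above rotates worker factories through a queue.Queue until one accepts the job. The following minimal sketch illustrates that pattern in isolation; WorkerFactory and its is_suitable() check are hypothetical stand-ins for ScrapeWorkerFactory and its suitability test, not the library's API.

import queue

class WorkerFactory:
    """Hypothetical stand-in for ScrapeWorkerFactory."""
    def __init__(self, search_engine):
        self.search_engine = search_engine
        self.jobs = []

    def is_suitable(self, job):
        # a worker only accepts jobs for its own search engine
        return job['search_engine'] == self.search_engine

    def add_job(self, job):
        self.jobs.append(job)

workers = queue.Queue()
for engine in ('google', 'bing'):
    workers.put(WorkerFactory(engine))

jobs = [{'query': 'foo', 'search_engine': 'google'},
        {'query': 'bar', 'search_engine': 'bing'}]

for job in jobs:
    # rotate through the queue until a suitable worker takes the job,
    # mirroring the dispatch loop above
    while True:
        worker = workers.get()
        workers.put(worker)
        if worker.is_suitable(job):
            worker.add_job(job)
            break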
Example #51
0
def parse_all_cached_files(keywords, search_engines, session, scraper_search):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        keywords: The keywords that were requested.
        search_engines: The search engines the keywords were requested on.
        session: An SQLAlchemy session used to add the parsed entities.
        scraper_search: The ScraperSearch object to link the parsed SERPs to.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    num_cached = 0
    # a keyword is requested once for each search engine
    num_total_keywords = len(keywords) * len(search_engines)

    for kw in keywords:
        for search_engine in search_engines:
            key = cached_file_name(kw, search_engine, scrapemethod)

            out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'.format(
                    kw=kw,
                    se=search_engine,
                    sm=scrapemethod,
                    hash=key
                ), lvl=5)

            mapping[key] = (kw, search_engine)

    for path in files:
        # strip off the extension of the path if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith(ext):
                # remove the '.<ext>' suffix; rstrip() would strip characters, not the suffix
                clean_filename = fname[:-(len(ext) + 1)]

        query = search_engine = None
        val = mapping.get(clean_filename, None)
        if val:
            query, search_engine = val

        if query and search_engine:
            # We found a file that contains the keyword, search engine name and
            # searchmode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            serp = None  # get_serp_from_database(session, query, search_engine, scrapemethod)

            if not serp:
                serp = parse_again(fname, search_engine, scrapemethod, query)

            serp.scraper_searches.append(scraper_search)
            session.add(serp)
            session.commit()

            mapping.pop(clean_filename)
            num_cached += 1

    out('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir')), lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'.format(
        num_cached, num_total_keywords, num_total_keywords - num_cached), lvl=1)

    session.add(scraper_search)
    session.commit()
    # return the remaining keywords to scrape
    return [e[0] for e in mapping.values()]
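
parse_all_cached_files() relies on the cache file name being a deterministic function of (keyword, search engine, scrape method), so a file found on disk can be mapped back to the request that produced it. The sketch below illustrates that idea with a hypothetical cache_name() helper; it is an assumption for illustration, not the real cached_file_name() implementation.

import hashlib

def cache_name(keyword, search_engine, scrapemethod):
    # hypothetical stand-in for cached_file_name(): hash the request parameters
    raw = '{}{}{}'.format(keyword, search_engine, scrapemethod)
    return hashlib.sha256(raw.encode()).hexdigest()

scrapemethod = 'http'
keywords = ['apple', 'banana']
search_engines = ['google', 'bing']

# same idea as the mapping built above: cache file name -> (keyword, search engine)
mapping = {cache_name(kw, se, scrapemethod): (kw, se)
           for kw in keywords for se in search_engines}

# a cache file found on disk can now be resolved back to its original request
found = cache_name('apple', 'bing', scrapemethod)
print(mapping[found])  # ('apple', 'bing')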