def _search(self, searchtype='normal'):
    """Run the actual search and parse the results.

    Private, internal method. Parsing is done with lxml and cssselect.
    The html structure of the Google Search results may change over time.
    Effective: February 2014.

    There are several parts of a SERP results page the average user is
    most likely interested in (probably in this order):

    - Non-advertisement links, as well as their little snippet and title
    - The message that indicates how many results were found. For example:
      "About 834,000,000 results (0.39 seconds)"
    - Advertisement search results (links, titles, snippets like above)

    Problem: This data comes in a wide range of different formats, depending
    on the parameters set in the search. Investigations over the different
    formats are done in the directory tests/serp_formats.

    Args:
        searchtype: Kind of query to build (default 'normal').

    Returns:
        False when the HTTP request failed with a non-OK status; otherwise
        None. Parsed results are stored in self.search_results as a side
        effect, and self.parser holds the GoogleParser instance.

    Raises:
        requests.ConnectionError: on network problems (logged, re-raised).
        requests.Timeout: on connection timeout (logged, re-raised).
    """
    self._build_query(searchtype)

    # After building the query, all parameters are set, so we know what
    # we're requesting.
    logger.debug("Created new GoogleScrape object with searchparams={}".format(
        pprint.pformat(self.search_params)))

    html = get_cached(self.search_query,
                      Config['GLOBAL'].get('base_search_url'),
                      params=self.search_params)
    # Record where the cached copy of this SERP lives (whether it already
    # exists or is written below — the file name is deterministic).
    self.search_results['cache_file'] = os.path.join(
        Config['GLOBAL'].get('cachedir'),
        cached_file_name(self.search_query,
                         Config['GLOBAL'].get('base_search_url'),
                         self.search_params))

    if not html:
        try:
            r = self.requests.get(Config['GLOBAL'].get('base_search_url'),
                                  headers=self._HEADERS,
                                  params=self.search_params,
                                  timeout=3.0)
            logger.debug("Scraped with url: {} and User-Agent: {}".format(
                r.url, self._HEADERS['User-Agent']))
        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not r.ok:
            logger.error('HTTP Error: {}'.format(r.status_code))
            # FIX: compare the status code numerically instead of testing
            # the first character of its string form — same semantics for
            # 5xx responses, without the string round-trip.
            if 500 <= r.status_code < 600:
                print('Maybe google recognizes you as sneaky spammer after'
                      ' you requested their services too inexhaustibly :D')
            return False

        html = r.text

        if Config['HTTP'].getboolean('view', False):
            self.browserview(html)

        # cache fresh results
        # FIX: the original recomputed search_results['cache_file'] here
        # with exactly the same arguments as above; the value is identical,
        # so the redundant second assignment was dropped.
        cache_results(html, self.search_query,
                      url=Config['GLOBAL'].get('base_search_url'),
                      params=self.search_params)

    self.parser = GoogleParser(html, searchtype=self.searchtype)
    self.search_results.update(self.parser.all_results)
def search(self, *args, rand=False, **kwargs):
    """The actual search for the search engine.

    Builds the query parameters, optionally picks a random User-Agent,
    serves the page from the cache when one exists, otherwise fetches it
    over HTTP and caches the fresh copy, then parses and stores the
    results.
    """
    self.build_search()

    if rand:
        self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

    html = get_cached(self.current_keyword,
                      self.base_search_url,
                      params=self.search_params)

    if not html:
        # Nothing cached — go to the network.
        try:
            if Config['GLOBAL'].getint('verbosity', 0) > 1:
                logger.info(
                    '[HTTP] Base_url: {base_url}, headers={headers}, params={params}'.format(
                        base_url=self.base_search_url,
                        headers=self.headers,
                        params=self.search_params))

            response = self.requests.get(self.base_search_url,
                                         headers=self.headers,
                                         params=self.search_params,
                                         timeout=3.0)
        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not response.ok:
            logger.error('HTTP Error: {}'.format(response.status_code))
            self.handle_request_denied(response.status_code)
            return False

        html = response.text

        # cache fresh results
        cache_results(html, self.current_keyword,
                      url=self.base_search_url,
                      params=self.search_params)

    self.parser.parse(html)
    self.store()
    print(self.parser)
def search(self, *args, rand=False, **kwargs):
    """The actual search for the search engine.

    Builds the query parameters, optionally picks a random User-Agent,
    serves the page from the cache when one exists, otherwise fetches it
    over HTTP and caches the fresh copy (under the cache lock), then
    parses and stores the results.

    Returns:
        False when the HTTP request failed with a non-OK status; otherwise
        None. Results are stored via self.store() as a side effect.

    Raises:
        requests.ConnectionError: on network problems (logged, re-raised).
        requests.Timeout: on connection timeout (logged, re-raised).
    """
    self.build_search()

    if rand:
        self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

    # FIX: read the cache with the same key (self.scrapemethod) that
    # cache_results() writes with below. The original read with the
    # literal 'http', so cached pages were never found whenever
    # scrapemethod was anything other than 'http'.
    html = get_cached(self.current_keyword, self.search_engine, self.scrapemethod)

    if not html:
        try:
            if Config['GLOBAL'].getint('verbosity', 0) > 1:
                logger.info('[HTTP] Base_url: {base_url}, headers={headers}, params={params}'.format(
                    base_url=self.base_search_url,
                    headers=self.headers,
                    params=self.search_params)
                )

            request = self.requests.get(self.base_search_url,
                                        headers=self.headers,
                                        params=self.search_params,
                                        timeout=3.0)
        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not request.ok:
            logger.error('HTTP Error: {}'.format(request.status_code))
            self.handle_request_denied(request.status_code)
            return False

        html = request.text

        # cache fresh results; hold the lock because sibling scraper
        # threads may write to the cache concurrently
        with self.cache_lock:
            cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

    self.parser.parse(html)
    self.store()
    out(str(self.parser), lvl=2)
def _search(self, searchtype='normal'):
    """Run the actual search and parse the results.

    Private, internal method. Parsing is done with lxml and cssselect.
    The html structure of the Google Search results may change over time.
    Effective: February 2014.

    There are several parts of a SERP results page the average user is
    most likely interested in (probably in this order):

    - Non-advertisement links, as well as their little snippet and title
    - The message that indicates how many results were found. For example:
      "About 834,000,000 results (0.39 seconds)"
    - Advertisement search results (links, titles, snippets like above)

    Problem: This data comes in a wide range of different formats, depending
    on the parameters set in the search. Investigations over the different
    formats are done in the directory tests/serp_formats.

    Args:
        searchtype: Kind of query to build (default 'normal').

    Returns:
        False when the HTTP request failed with a non-OK status; otherwise
        None. Parsed results are stored in self.search_results as a side
        effect, and self.parser holds the GoogleParser instance.

    Raises:
        requests.ConnectionError: on network problems (logged, re-raised).
        requests.Timeout: on connection timeout (logged, re-raised).
    """
    self._build_query(searchtype)

    # After building the query, all parameters are set, so we know what
    # we're requesting.
    logger.debug(
        "Created new GoogleScrape object with searchparams={}".format(
            pprint.pformat(self.search_params)))

    html = get_cached(self.search_query,
                      Config['GLOBAL'].get('base_search_url'),
                      params=self.search_params)
    # Record where the cached copy of this SERP lives (whether it already
    # exists or is written below — the file name is deterministic).
    self.search_results['cache_file'] = os.path.join(
        Config['GLOBAL'].get('cachedir'),
        cached_file_name(self.search_query,
                         Config['GLOBAL'].get('base_search_url'),
                         self.search_params))

    if not html:
        try:
            r = self.requests.get(Config['GLOBAL'].get('base_search_url'),
                                  headers=self._HEADERS,
                                  params=self.search_params,
                                  timeout=3.0)
            logger.debug("Scraped with url: {} and User-Agent: {}".format(
                r.url, self._HEADERS['User-Agent']))
        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not r.ok:
            logger.error('HTTP Error: {}'.format(r.status_code))
            # FIX: compare the status code numerically instead of testing
            # the first character of its string form — same semantics for
            # 5xx responses, without the string round-trip.
            if 500 <= r.status_code < 600:
                print('Maybe google recognizes you as sneaky spammer after'
                      ' you requested their services too inexhaustibly :D')
            return False

        html = r.text

        if Config['HTTP'].getboolean('view', False):
            self.browserview(html)

        # cache fresh results
        # FIX: the original recomputed search_results['cache_file'] here
        # with exactly the same arguments as above; the value is identical,
        # so the redundant second assignment was dropped.
        cache_results(html, self.search_query,
                      url=Config['GLOBAL'].get('base_search_url'),
                      params=self.search_params)

    self.parser = GoogleParser(html, searchtype=self.searchtype)
    self.search_results.update(self.parser.all_results)