def run(self):
    config = {
        'SCRAPING': {
            'use_own_ip': 'True',
            'keywords': '\n'.join(self.searches.values()),
            'num_of_pages': "%s" % self.num_pages,
            'scrapemethod': self.scrapemethod
        },
        'SELENIUM': {
            'sel_browser': 'chrome',
            'manual_captcha_solving': 'True',
            # 'sleeping_ranges': '5; 1, 2',  # more aggressive than the defaults
        },
        'GLOBAL': {
            'do_caching': 'True',
            # 'do_caching': 'False',
            # 'cachedir': 'dc
            'db': "results_{asctime}.db",
            # 'debug': 'WARNING',
            'debug': 'ERROR',
        },
        'GOOGLE_SEARCH_PARAMS': {
            'start': "0",
            'num': "30",
        }
    }

    if self.proxyfile:
        print("Using proxies from %s" % self.proxyfile)
        config['GLOBAL']['proxy_file'] = self.proxyfile

    # GoogleScraper.config.update_config(config)
    # hack: GoogleScraper's 'db' config path is broken the second time around
    db = GoogleScraper.scrape_with_config(config, usrcb_result=self.cb_results)
    # The returned object is queried like a sqlite connection; pull every scraped link.
    urls = db.execute('SELECT * FROM link').fetchall()
    db.close()
    self.urls.extend(urls)
    return urls
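# A minimal usage sketch for run() above, assuming it belongs to a job object that
# carries the attributes it reads. The SimpleNamespace stand-in, the attribute
# values, and the _print_result callback are illustrative assumptions, not part of
# the original code.
from types import SimpleNamespace


def _print_result(search):
    # Result callback handed to GoogleScraper via usrcb_result in run().
    print('got a result batch:', search)


job = SimpleNamespace(
    searches={'example': 'site:example.com python'},  # label -> keyword string
    num_pages=1,
    scrapemethod='sel',
    proxyfile=None,
    urls=[],
    cb_results=_print_result,
)
# urls = run(job)  # would perform a live scrape; needs the (older) GoogleScraper
#                  # version this snippet targets to be installed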
import GoogleScraper


def scrape(keywords, search_engine: str, num_pages: int) -> object:
    scrape_results = []
    titles = []
    links = []
    snippets = []

    config = {
        'use_own_ip': True,
        'keywords': keywords,
        'search_engines': [search_engine],
        'num_pages_for_keyword': num_pages,
        'scrape_method': 'http',
        'sel_browser': 'chrome',
        'do_caching': False,
        'num_workers': 1
    }

    search = GoogleScraper.scrape_with_config(config)

    for serp in search.serps:
        print(serp)
        for link in serp.links:
            titles.append(link.title)
            links.append(link.link)
            snippets.append(str(link.snippet))
            # print(link.title + '\n')
            # print(link.link + '\n')
            # print(link.snippet + '\n')

    scrape_results.append(titles)
    scrape_results.append(links)
    scrape_results.append(snippets)

    return scrape_results
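# Usage sketch for scrape() above (assumption: GoogleScraper is installed and the
# chosen search engine is reachable; the keyword, engine, and page count below are
# illustrative only).
if __name__ == '__main__':
    titles, links, snippets = scrape(['python web scraping'], 'google', 1)
    for title, link, snippet in zip(titles, links, snippets):
        print(title)
        print(link)
        print(snippet)
        print()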
def search_via_googler(self, query_paraments):
    """
    args['config']['last_update']
        Applications: tbm=app
        Blogs: tbm=blg
        Books: tbm=bks
        Discussions: tbm=dsc
        Images: tbm=isch
        News: tbm=nws
        Patents: tbm=pts
        Places: tbm=plcs
        Recipes: tbm=rcp
        Shopping: tbm=shop
        Video: tbm=vid
    """
    def fix_urls(url):
        # Strip AMP artifacts and make sure the link carries an http:// scheme.
        url = url.replace('/amp/', '') if '/amp/' in url else url
        url = url.replace('/amp.html', '') if '/amp.html' in url else url
        url = urllib.parse.urljoin('http://', url) if 'http://' not in url else url
        return url

    google_search_url = 'https://www.google.com/search?tbs=qdr:%s&'
    dateRestrict = query_paraments.get('dateRestrict', 'd')

    config = {
        'use_own_ip': 'True',
        'keywords': [query_paraments['q']],
        'google_search_url': google_search_url % dateRestrict,
        'num_results_per_page': query_paraments.get('results_per_page', 25),
        'num_pages_for_keyword': query_paraments.get('num_pages', 4),
        'num_workers': 2,
        'search_engines': ['google'],
        'search_type': 'normal',
        'scrape_method': 'http',
        'do_caching': False,
        'print_results': None,
    }

    logger.debug('Making a search with the GoogleScraper lib with this configuration')

    try:
        google_search = GoogleScraper.scrape_with_config(config)

        urls_without_fix = []
        urls = []
        for serp in google_search.serps:
            # Accumulate links across every result page instead of keeping only the last SERP.
            urls_without_fix.extend(r.link for r in serp.links)
            urls.extend(fix_urls(r.link) for r in serp.links)

        logger.debug(
            ('Google Search fixed links successfully extracted with '
             'query "{}": {:d} links extracted').format(
                query_paraments['q'], len(urls)))
        logger.debug(
            ('Google Search links without fix successfully extracted '
             'with query "{}":\n{}').format(query_paraments['q'],
                                            urls_without_fix))
        logger.debug(('List of links extracted from Google Search with the '
                      'query "{}":\n{}').format(query_paraments['q'], urls))

        return urls
    except GoogleScraper.GoogleSearchError as e:
        logger.error(str(e))
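# Usage sketch for search_via_googler() above (hypothetical call: the method never
# reads `self`, so any object, even None, works as the first argument; it does rely
# on module-level `GoogleScraper`, `urllib`, and `logger` objects being available.
# The query values below are illustrative; the keys are the ones the method reads
# from query_paraments).
example_query = {
    'q': 'python asyncio tutorial',
    'dateRestrict': 'w',        # becomes tbs=qdr:w, i.e. results from the past week
    'results_per_page': 25,
    'num_pages': 2,
}
# urls = search_via_googler(None, example_query)  # would perform a live scrape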