def run(self): print("----------------------------start def run(self):-----------------------------") while True: self.get_requests() if not self.requests: break self.results = self.loop.run_until_complete(asyncio.wait([r() for r in self.requests])) for task in self.results[0]: scrape = task.result() if scrape: if self.cache_manager: self.cache_manager.cache_results(scrape.parser, scrape.query, scrape.search_engine_name, scrape.scrape_method, scrape.page_number) if scrape.parser: serp = parse_serp(self.config, parser=scrape.parser, scraper=scrape, query=scrape.query) if self.scraper_search: self.scraper_search.serps.append(serp) if self.session: self.session.add(serp) self.session.commit() store_serp_result(serp, self.config) print("----------------------------end def run(self):-----------------------------")
def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session.' if self.html: self.parser.parse(self.html) else: self.parser = None with self.db_lock: serp = parse_serp(self.config, parser=self.parser, scraper=self, query=self.query) self.scraper_search.serps.append(serp) self.session.add(serp) self.session.commit() store_serp_result(serp, self.config) if serp.num_results: return True else: return False
def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session.' if self.html: self.parser.parse(self.html) else: print("Nothing to parse for {keyword}! (page len = {pagelen})". format(keyword=self.query, pagelen=len(self.html))) self.parser = None with self.db_lock: serp = parse_serp(parser=self.parser, scraper=self, query=self.query) self.scraper_search.serps.append(serp) self.session.add(serp) self.session.commit() store_serp_result(serp) if serp.num_results: return True else: return False
def run(self):
    while True:
        self.get_requests()

        if not self.requests:
            break

        self.results = self.loop.run_until_complete(asyncio.wait([r()() for r in self.requests]))

        for task in self.results[0]:
            scrape = task.result()

            if scrape:
                if self.cache_manager:
                    self.cache_manager.cache_results(scrape.parser, scrape.query, scrape.search_engine_name,
                                                     scrape.scrape_method, scrape.page_number)

                if scrape.parser:
                    serp = parse_serp(self.config, parser=scrape.parser, scraper=scrape, query=scrape.query)

                    if self.scraper_search:
                        self.scraper_search.serps.append(serp)

                    if self.session:
                        self.session.add(serp)
                        self.session.commit()

                    store_serp_result(serp, self.config)

def parse_all_cached_files(self, scrape_jobs, session, scraper_search):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        scrape_jobs: The scrape jobs to look up in the cache.
        session: An sqlalchemy session to add the entities to.
        scraper_search: Abstract object representing the current search.

    Returns:
        The scrape jobs that couldn't be parsed from the cache directory.
    """
    files = self._get_all_cache_files()
    num_cached = num_total = 0
    mapping = {}

    for job in scrape_jobs:
        cache_name = self.cached_file_name(
            job['query'],
            job['search_engine'],
            job['scrape_method'],
            job['page_number']
        )
        mapping[cache_name] = job
        num_total += 1

    for path in files:
        # Strip off the extension of the path if it has any.
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith(ext):
                # Slice the suffix off; str.rstrip() strips a *set of
                # characters*, not a suffix, and would mangle some names.
                clean_filename = fname[:-(len(ext) + 1)]

        job = mapping.get(clean_filename, None)

        if job:
            # We found a file that contains the keyword, search engine name and
            # search mode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            serp = self.get_serp_from_database(session, job['query'], job['search_engine'],
                                               job['scrape_method'], job['page_number'])

            if not serp:
                serp = self.parse_again(fname, job['search_engine'], job['scrape_method'], job['query'])

            serp.scraper_searches.append(scraper_search)
            session.add(serp)

            if num_cached % 200 == 0:
                session.commit()

            store_serp_result(serp, self.config)
            num_cached += 1
            scrape_jobs.remove(job)

    logger.info('{} cache files found in {}'.format(len(files), self.config.get('cachedir')))
    logger.info('{}/{} objects have been read from the cache. {} remain to get scraped.'.format(
        num_cached, num_total, num_total - num_cached))

    session.add(scraper_search)
    session.commit()

    return scrape_jobs

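# A minimal demonstration of why the suffix stripping above must not use
# str.rstrip(): rstrip() removes any of the given *characters* from the end,
# not the suffix as a whole, so filenames ending in those characters get
# mangled and the cache lookup silently misses.
assert 'blog.gz'.rstrip('.gz') == 'blo'    # the trailing 'g' of 'blog' is eaten too
assert 'blog.gz'[:-len('.gz')] == 'blog'   # slicing removes exactly the suffix
# On Python >= 3.9, 'blog.gz'.removesuffix('.gz') is the idiomatic spelling.
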
def run(self):
    while True:
        self.get_requests()

        if not self.requests:
            break

        self.results = self.loop.run_until_complete(asyncio.wait([r()() for r in self.requests]))

        for task in self.results[0]:
            scrape = task.result()

            if scrape:
                cache_results(scrape.parser, scrape.query, scrape.search_engine_name,
                              scrape.scrape_method, scrape.page_number)

                if scrape.parser:
                    serp = parse_serp(parser=scrape.parser, scraper=scrape, query=scrape.query)

                    self.scraper_search.serps.append(serp)
                    self.session.add(serp)
                    self.session.commit()

                    store_serp_result(serp)

def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session. Turning down.' with self.db_lock: serp = SearchEngineResultsPage( search_engine_name=self.search_engine, scrapemethod=self.scrapemethod, page_number=self.current_page, requested_at=self.current_request_time, requested_by=self.ip, query=self.current_keyword, num_results_for_keyword=self.parser. search_results['num_results'], ) self.scraper_search.serps.append(serp) serp, parser = parse_serp(serp=serp, parser=self.parser) self.session.add(serp) self.session.commit() store_serp_result(dict_from_scraping_object(self), self.parser)
def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session.' if self.html: self.parser.parse(self.html) else: self.parser = None with self.db_lock: serp = parse_serp(parser=self.parser, scraper=self, query=self.query) self.scraper_search.serps.append(serp) self.session.add(serp) self.session.commit() store_serp_result(serp) if serp.num_results: return True else: return False
def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session. Turning down.' with self.db_lock: num_results = 0 serp = SearchEngineResultsPage( search_engine_name=self.search_engine, scrapemethod=self.scrapemethod, page_number=self.current_page, requested_at=self.current_request_time, requested_by=self.ip, query=self.current_keyword, num_results_for_keyword=self.parser.search_results['num_results'], ) self.scraper_search.serps.append(serp) serp, parser = parse_serp(serp=serp, parser=self.parser) self.session.add(serp) self.session.commit() store_serp_result(dict_from_scraping_object(self), self.parser)
def parse_all_cached_files(keywords, search_engines, session, scraper_search):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        keywords: The keywords that should be scraped.
        search_engines: The search engines to scrape with.
        session: An sqlalchemy session to add the entities to.
        scraper_search: Abstract object representing the current search.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    num_cached = 0
    # A keyword is requested once for each search engine.
    num_total_keywords = len(keywords) * len(search_engines)

    for kw in keywords:
        for search_engine in search_engines:
            key = cached_file_name(kw, search_engine, scrapemethod)
            out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'.format(
                kw=kw, se=search_engine, sm=scrapemethod, hash=key), lvl=5)
            mapping[key] = (kw, search_engine)

    for path in files:
        # Strip off the extension of the path if it has any.
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith(ext):
                # Slice the suffix off; str.rstrip() would strip characters, not the suffix.
                clean_filename = fname[:-(len(ext) + 1)]

        query = search_engine = None
        val = mapping.get(clean_filename, None)

        if val:
            query, search_engine = val

        if query and search_engine:
            # We found a file that contains the keyword, search engine name and
            # search mode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            serp = None  # get_serp_from_database(session, query, search_engine, scrapemethod)

            if not serp:
                serp, parser = parse_again(fname, search_engine, scrapemethod, query)

            serp.scraper_searches.append(scraper_search)
            session.add(serp)
            session.commit()

            store_serp_result(dict_from_serp_object(serp), parser)
            mapping.pop(clean_filename)
            num_cached += 1

    out('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir')), lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'.format(
        num_cached, num_total_keywords, num_total_keywords - num_cached), lvl=1)

    session.add(scraper_search)
    session.commit()

    # Return the remaining keywords to scrape.
    return [e[0] for e in mapping.values()]

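# The cache lookup above hinges on cached_file_name() being a deterministic
# digest of the request parameters, so that writers and readers of the cache
# agree on the key.  A hedged sketch of such a function (the hash choice and
# parameter handling are assumptions, not the original implementation):
import hashlib

def cached_file_name(keyword, search_engine, scrapemethod):
    unique = ' '.join([keyword, search_engine, scrapemethod])
    return hashlib.sha256(unique.encode()).hexdigest()

# The same parameters always map to the same cache file name.
assert cached_file_name('foo', 'google', 'http') == cached_file_name('foo', 'google', 'http')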