def handle_response(self, domain, response):
    app_log_process('handle response %s' % domain)
    error = u''
    body = u''
    effective_url = response.effective_url
    if response.body:
        body = to_unicode(response.body)
    if response.error:
        error = to_unicode(str(response.error))
    app_log_process('handle response result %s %s' % (response.code, error))
    return error, effective_url, body
def run(self, domain_id):
    app_log_process('start parsing domain_id %s' % domain_id)
    crawling_result = None
    try:
        crawling_result = self.storage.get_crawling_result(domain_id)
        app_log_process('found response %s %s' % (crawling_result[0], crawling_result[1]))
    except IOError:
        app_log_process('skip: crawling sources not found')
        return_by_raise()

    domain_name, error, effective_url, source = crawling_result
    parsing_result = self.parse_result(domain_name, error, effective_url, source)
    app_log_process('parsed result %s' % parsing_result[0])

    if parsing_result[0] == RESULT_ERROR:
        yield self.storage.update_by_parser(domain_id, False)
        yield self.storage.clear_relations_from(domain_id)
    elif parsing_result[0] == RESULT_FULL_REDIRECT:
        new_domain_id = yield self.storage.add_domain_custom(parsing_result[1])
        yield self.storage.update_by_parser(domain_id, True)
        yield self.storage.clear_relations_from(domain_id)
        yield self.storage.add_relations_from([(domain_id, new_domain_id)])
    elif parsing_result[0] == RESULT_LINKS:
        yield self.storage.update_by_parser(domain_id, True)
        relations = []
        for link in parsing_result[1]:
            new_domain_id = yield self.storage.add_domain_custom(link)
            relations.append((domain_id, new_domain_id))
        yield self.storage.clear_relations_from(domain_id)
        yield self.storage.add_relations_from(relations)
    else:
        raise RuntimeError('Unknown parsing result type %s' % parsing_result[0])

    self.storage.clear_crawling_result(domain_id)
    app_log_process('end parsing process')
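# Assumed placeholder definitions for the RESULT_* constants that run() and
# parse_result() branch on; only the names appear in this excerpt, the values
# below are illustrative, not the project's real ones.
RESULT_ERROR = 'error'
RESULT_FULL_REDIRECT = 'full_redirect'
RESULT_LINKS = 'links'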
def run(self):
    if pycurl.version.find('c-ares') < 0:
        app_log_process('c-ares not installed (%s)' % pycurl.version, logging.ERROR)

    num_conn = min(len(self.domains), options.crawler_curl_conn)
    app_log_process('start crawling process %d domains, %d conn, %d timeout' % (
        len(self.domains), num_conn, options.crawler_curl_timeout))

    tornado.httpclient.AsyncHTTPClient.configure(
        "tornado.curl_httpclient.CurlAsyncHTTPClient", max_clients=num_conn)
    http_client = tornado.httpclient.AsyncHTTPClient()

    for i, domain in enumerate(self.domains):
        domain_id, url = domain
        request = tornado.httpclient.HTTPRequest(
            url,
            connect_timeout=options.crawler_curl_timeout,
            request_timeout=options.crawler_curl_timeout,
            follow_redirects=True,
            max_redirects=options.crawler_curl_max_redirects)
        http_client.fetch(request, callback=(yield tornado.gen.Callback(i)))

    keys = set(range(len(self.domains)))
    while keys:
        key, response = yield yieldpoints.WaitAny(keys)
        domain = self.domains[key]
        error, effective_url, body = self.handle_response(domain, response)
        keys.remove(key)
        self.storage.save_crawling_result(str(domain[0]), domain[1], error,
                                          effective_url, body)
        self.q.add_parser_task(domain[0])

    http_client.close()
    app_log_process('end crawling process')
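# For reference only: on newer Tornado releases (4.1+) the same
# "handle whichever fetch finishes first" pattern can be written with
# tornado.gen.WaitIterator instead of gen.Callback / yieldpoints.WaitAny.
# A rough, hedged equivalent of the loop above, not the project's code;
# the function name and parameters are assumptions.
import tornado.gen
import tornado.httpclient


@tornado.gen.coroutine
def crawl_first_finished(domains, http_client):
    # domains: list of (domain_id, url) pairs, like self.domains above.
    futures = [http_client.fetch(url, raise_error=False)
               for domain_id, url in domains]
    wait = tornado.gen.WaitIterator(*futures)
    results = []
    while not wait.done():
        try:
            response = yield wait.next()   # responses arrive in completion order
        except Exception:
            response = None                # network-level errors still raise here
        domain_id, url = domains[wait.current_index]
        results.append((domain_id, response))
    raise tornado.gen.Return(results)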
def parser_process():
    app_log_process('start parser process')
    log_fds('start')
    log_mem('start')
    q = Q()
    s = Storage()
    parser = Parser(s)
    while True:
        log_fds('start loop')
        log_mem('start loop')
        task = q.get_parser_task()
        if task:
            yield parser.run(task[2])
            q.complete_task(task[0])
        else:
            app_log_process('no task found')
            time.sleep(options.parser_sleep_period_sec)
    app_log_process('end parser process')
def crawler_process():
    app_log_process('start crawler process')
    log_fds('start')
    log_mem('start')
    q = Q()
    s = Storage()
    while True:
        log_fds('start loop')
        log_mem('start loop')
        task = q.get_crawler_task()
        if task:
            crawler = Crawler(task[2], q, s)
            yield crawler.run()
            q.complete_task(task[0])
            del crawler
        else:
            app_log_process('no task found')
            time.sleep(options.crawler_sleep_period_sec)
    app_log_process('end crawler process')
    log_fds('end')
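# Hypothetical sketches of the log_fds()/log_mem() helpers used in the process
# loops above; the real implementations are not shown in this excerpt. They
# reuse the module's app_log_process() logger.
import logging
import os
import resource


def log_fds(stage):
    # Count open file descriptors via /proc (Linux only).
    try:
        fd_count = len(os.listdir('/proc/self/fd'))
    except OSError:
        fd_count = -1
    app_log_process('%s: open fds %d' % (stage, fd_count), logging.DEBUG)


def log_mem(stage):
    # Peak resident set size; reported in kilobytes on Linux.
    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    app_log_process('%s: max rss %d KB' % (stage, rss), logging.DEBUG)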
def parse_result(self, domain_name, error, effective_url, source):
    if error:
        app_log_process('parse unknown error %s' % error, logging.DEBUG)
        return RESULT_ERROR, error

    final_domain = self.e.extract(effective_url)
    if not final_domain:
        app_log_process('parse redirect error %s' % effective_url, logging.DEBUG)
        return RESULT_ERROR, 'invalid redirect to %s' % effective_url
    if final_domain != domain_name:
        app_log_process('parse full redirect %s -> %s' % (domain_name, final_domain),
                        logging.DEBUG)
        return RESULT_FULL_REDIRECT, final_domain

    s = source.lower().strip().encode('utf-8')
    if not s:
        app_log_process('parse empty source error %d' % len(s), logging.DEBUG)
        return RESULT_ERROR, 'empty source %s' % len(s)
    if len(s) > options.parser_max_source_size_mb * 1024 * 1024:
        app_log_process('parse large source error %d' % len(s), logging.DEBUG)
        return RESULT_ERROR, 'large source %s' % len(s)

    try:
        document = self.create_html_doc(s)
    except lxml.etree.ParserError as e:
        app_log_process('parser error %s' % e, logging.WARNING)
        return RESULT_ERROR, 'parser error %s' % e

    # a href
    href_links_source = document.xpath('//a/@href')
    app_log_process('found a@href links %d (%s)' % (
        len(href_links_source), ','.join(href_links_source[:10])), logging.DEBUG)
    href_links = self._links_domain_filter(href_links_source, domain_name)
    app_log_process('filtered a@href links %d (%s)' % (
        len(href_links), ','.join(href_links[:10])), logging.DEBUG)

    # # script src js
    # script_links = self._links_domain_filter(document.xpath('//script/@src'))
    # app_log_process('found script@src links %d (%s)' % (len(script_links), ','.join(script_links)), logging.DEBUG)
    #
    # # link href css
    # link_links = self._links_domain_filter(document.xpath('//link[contains(@rel, "stylesheet")]/@href'))
    # app_log_process('found link@href links %d (%s)' % (len(link_links), ','.join(link_links)), logging.DEBUG)
    #
    # # img src
    # img_links = self._links_domain_filter(document.xpath('//img/@src'))
    # app_log_process('found img@src links %d (%s)' % (len(img_links), ','.join(img_links)), logging.DEBUG)

    if len(href_links) > options.parser_max_link_count:
        app_log_process('parse too many links error %d' % len(href_links), logging.DEBUG)
        return RESULT_ERROR, 'too many links %s' % len(href_links)

    return RESULT_LINKS, href_links
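# Hypothetical sketch of the _links_domain_filter() helper referenced in
# parse_result(); the real implementation is not shown in this excerpt. It is
# assumed to map raw href values to registered domain names via the same
# extractor used above (self.e, e.g. a tldextract-style wrapper that returns a
# domain string), dropping self-links and duplicates.
def _links_domain_filter(self, links, domain_name):
    domains = []
    seen = set()
    for link in links:
        link_domain = self.e.extract(link.strip())
        if not link_domain or link_domain == domain_name:
            continue  # skip unparsable links and links back to the same domain
        if link_domain in seen:
            continue  # deduplicate while preserving order
        seen.add(link_domain)
        domains.append(link_domain)
    return domains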