def parse(self, response):
    # print(response.headers)
    # print(response.status)
    if response.status == 504:
        # no response
        # print('504 detected')
        pass

    # LUA ERROR # # TODO: print/display errors
    elif 'error' in response.data:
        if response.data['error'] == 'network99':
            ## splash restart ##
            error_retry = response.meta.get('error_retry', 0)
            if error_retry < 3:
                error_retry += 1
                url = response.meta['current_url']
                father = response.meta['father']
                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
                yield SplashRequest(url, self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='execute',
                                    cache_args=['lua_source'],
                                    meta={'father': father, 'current_url': url,
                                          'error_retry': error_retry},
                                    args=self.build_request_arg(response.cookiejar))
            else:
                print('Connection to proxy refused')
        else:
            print(response.data['error'])

    elif response.status != 200:
        print('other response: {}'.format(response.status))
        # detect connection to proxy refused
        error_log = json.loads(response.body.decode())
        print(error_log)
    # elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
    #     pass  # ignore response
    else:
        item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
        self.save_crawled_item(item_id, response.data['html'])
        crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'],
                                      self.port, response.meta['father'])

        if self.root_key is None:
            self.root_key = item_id
            crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0],
                                          self.date_epoch, self.port)
            crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port,
                                            self.full_date, self.date_month)

        if 'cookies' in response.data:
            all_cookies = response.data['cookies']
        else:
            all_cookies = []

        # SCREENSHOT
        if 'png' in response.data and self.png:
            sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000,
                                                              f_save=self.requested_mode)
            if sha256_string:
                Screenshot.save_item_relationship(sha256_string, item_id)
                Screenshot.save_domain_relationship(sha256_string, self.domains[0])

        # HAR
        if 'har' in response.data and self.har:
            crawlers.save_har(self.har_dir, item_id, response.data['har'])

        le = LinkExtractor(allow_domains=self.domains, unique=True)
        for link in le.extract_links(response):
            l_cookies = self.build_request_arg(all_cookies)
            yield SplashRequest(link.url, self.parse,
                                errback=self.errback_catcher,
                                endpoint='execute',
                                meta={'father': item_id, 'current_url': link.url},
                                args=l_cookies)
def parse(self, response):
    # print(response.headers)
    # print(response.status)
    # print(response.meta)
    # print(response.data)

    # # TODO: handle lua script error
    # {'type': 'ScriptError', 'info': {'error': "'}' expected (to close '{' at line 47) near 'error_retry'",
    #  'message': '[string "..."]:53: \'}\' expected (to close \'{\' at line 47) near \'error_retry\'',
    #  'type': 'LUA_INIT_ERROR', 'source': '[string "..."]', 'line_number': 53},
    #  'error': 400, 'description': 'Error happened while executing Lua script'}
    if response.status == 504:
        # no response
        # print('504 detected')
        pass

    # LUA ERROR # # TODO: log errors
    elif 'error' in response.data:
        if response.data['error'] == 'network99':
            ## splash restart ##
            error_retry = response.meta.get('error_retry', 0)
            if error_retry < 3:
                error_retry += 1
                url = response.data['last_url']
                father = response.meta['father']
                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']  # # TODO: use initial cookie ?????
                else:
                    all_cookies = []
                l_cookies = self.build_request_arg(all_cookies)
                yield SplashRequest(url, self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='execute',
                                    dont_filter=True,
                                    meta={'father': father, 'current_url': url,
                                          'error_retry': error_retry},
                                    args=l_cookies)
            else:
                if self.requested_mode == 'test':
                    crawlers.save_test_ail_crawlers_result(False, 'Connection to proxy refused')
                print('Connection to proxy refused')
        elif response.data['error'] == 'network3':
            if self.requested_mode == 'test':
                crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
            print('HostNotFoundError: the remote host name was not found (invalid hostname)')
        else:
            if self.requested_mode == 'test':
                crawlers.save_test_ail_crawlers_result(False, response.data['error'])
            print(response.data['error'])

    elif response.status != 200:
        print('other response: {}'.format(response.status))
        # detect connection to proxy refused
        error_log = json.loads(response.body.decode())
        print(error_log)
    # elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
    #     pass  # ignore response
    else:
        ## TEST MODE ##
        if self.requested_mode == 'test':
            if 'It works!' in response.data['html']:
                crawlers.save_test_ail_crawlers_result(True, 'It works!')
            else:
                print('TEST ERROR')
                crawlers.save_test_ail_crawlers_result(False, 'TEST ERROR')
            return
        ## -- ##

        item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
        self.save_crawled_item(item_id, response.data['html'])
        crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'],
                                      self.port, response.meta['father'])

        if self.root_key is None:
            self.root_key = item_id
            crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0],
                                          self.date_epoch, self.port)
            crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port,
                                            self.full_date, self.date_month)

        if 'cookies' in response.data:
            all_cookies = response.data['cookies']
        else:
            all_cookies = []

        # SCREENSHOT
        if 'png' in response.data and self.png:
            sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000,
                                                              f_save=self.requested_mode)
            if sha256_string:
                Screenshot.save_item_relationship(sha256_string, item_id)
                Screenshot.save_domain_relationship(sha256_string, self.domains[0])

        # HAR
        if 'har' in response.data and self.har:
            crawlers.save_har(self.har_dir, item_id, response.data['har'])

        le = LinkExtractor(allow_domains=self.domains, unique=True)
        for link in le.extract_links(response):
            l_cookies = self.build_request_arg(all_cookies)
            yield SplashRequest(link.url, self.parse,
                                errback=self.errback_catcher,
                                endpoint='execute',
                                meta={'father': item_id, 'current_url': link.url},
                                args=l_cookies)
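# NOTE (editor's sketch): build_request_arg() is referenced above but not shown in
# this section. Because its return value is passed to SplashRequest(...,
# endpoint='execute', args=...), it presumably builds the Splash argument dict:
# the 'lua_source' key required by scrapy-splash's 'execute' endpoint, plus the
# cookies to replay (readable from the Lua script via splash.args). A minimal
# sketch, assuming the Lua script is stored on the spider as self.l_script
# (hypothetical attribute name), could look like:
#
#     def build_request_arg(self, cookies):
#         return {'wait': 10,        # assumed timeouts, not confirmed by this section
#                 'timeout': 30,
#                 'cookies': cookies,
#                 'lua_source': self.l_script}
#
# Only 'lua_source' and 'cookies' are implied by the calls above; every other key
# and value here is an assumption.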