def process_get_response(self, response):
    """Handle the response to the GET request issued for this page.

    On an error response, ``response.reason`` is stored in
    ``self.failure_message``. Otherwise the body is decoded and, when
    ``self.is_page_internal()`` is true, every ``<a href>`` value found in
    the document is turned into a child ``TornadoClientPage`` that is added
    to ``self.links`` -- unless the link's domain is listed in
    ``self.domains_to_skip``. ``self.finalize_process(self.spider)`` is
    called last in both cases.

    Args:
        response: Object exposing ``error``, ``reason`` and ``body``
            (presumably a Tornado ``HTTPResponse`` -- confirm with caller).
    """
    logger.debug(u"Called {} for {} ".format('process_get_response', self.encoded_url))

    if response.error:
        # The message uses str.format placeholders, so it has to be filled
        # with .format(); applying the % operator to it raises TypeError,
        # which would prevent the failure from being recorded at all.
        logger.debug(
            u"Error processing get request: {} with error : {} ( {} )".format(
                self.encoded_url, response.error, response.reason))
        self.failure_message = response.reason
    else:
        html_source = decode_to_unicode(response.body)
        if self.is_page_internal():
            dom = html.fromstring(html_source)
            for href_value in dom.xpath('//a/@href'):
                href_value = decode_to_unicode(href_value)
                logger.debug(u"Entering for loop for for {} with href {}".format(self.encoded_url, href_value))
                self._process_hardcoded_url(href_value)

                link = self._format_link(href_value)
                logger.debug(u"obtained link object{} for {}".format(link, self.encoded_url))
                if not link:
                    continue

                # Links pointing at an excluded domain are not followed.
                parsed_link = obtain_domain_with_subdomain_for_page(link)
                if parsed_link in self.domains_to_skip:
                    continue

                link_page = TornadoClientPage(link, self, self.base_site,
                                              self.base_domain, self.domains_to_skip)
                self.links.add(link_page)
                link_page.parent = self

    self.finalize_process(self.spider)
def skip_page(self):
    """Tell whether this page should be left out of processing.

    Returns:
        True when the domain computed from ``self.url`` equals one of the
        entries in ``self.domains_to_skip``, or when ``self.url`` contains
        any of the ``URL_SEGMENTS_TO_SKIP`` fragments; False otherwise.
    """
    own_domain = obtain_domain_with_subdomain_for_page(self.url)

    # Domain check first, mirroring the evaluation order callers rely on.
    if any(own_domain == blocked for blocked in self.domains_to_skip):
        return True

    return any(fragment in self.url for fragment in URL_SEGMENTS_TO_SKIP)