def parse_item(self, response):
    '''
    Handle the response of a leaf (item) page.

    The request URL is passed through ``self.sourcelinkprocessor_class``
    when one is configured, ``_process_response`` validates and reports the
    response, and an item loader is then seeded with the ``source`` and
    ``source_link`` values.

    ``response`` must expose ``request.url`` and ``request.meta``; the meta
    mapping must contain the ``'domain'`` key.

    Always returns None.
    '''
    request = response.request
    meta = request.meta
    source = meta['domain']
    raw_url = request.url

    url = raw_url
    if self.sourcelinkprocessor_class:
        url = self.sourcelinkprocessor_class().process(raw_url)

    if not self._process_response(response, source, LinkType.LEAF):
        # Report the failure against the first URL of the redirect chain.
        # Fall back to the request URL when meta carries no redirect history
        # (presumably the key is only set after a redirect -- confirm), so
        # that a missing key does not abort the clean-up below.
        redirect_urls = meta.get('redirect_urls') or [raw_url]
        # The link type is given explicitly: no local ``type`` exists in
        # this method, so the bare name would resolve to the builtin.
        service.report_status(
            [LinkStatus(redirect_urls[0], source, Status.FAIL, LinkType.LEAF)])
        career.remove_item(url, source)
        return

    if not self.itemloader_class:
        return

    try:
        selector = HtmlXPathSelector(response)
        loader = self.itemloader_class(selector)
        loader.add_value('source', source)
        loader.add_value('source_link', url)
        # NOTE(review): the populated loader is neither loaded nor returned
        # here, so this callback yields no item -- confirm this is intended.
    except Exception as e:
        service.report_status(
            [LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
        # Single-argument form prints "<error> <url>" whether ``print`` is
        # the statement or the function.
        print('%s %s' % (e, url))
        log_error(url)
def _process_response(self, response, source, type):
    '''
    Returns True if response can be further processed otherwise False.

    The request URL is first passed through the optional source filter
    processor and then through the optional source link processor. If
    either of them returns None, a FAIL status is reported and False is
    returned. Otherwise a SUCCEED status is reported when the HTTP status
    is 200 and a FAIL status for anything else.

    ``source`` and ``type`` are forwarded unchanged to every ``LinkStatus``
    that is reported.
    '''
    url = response.request.url

    if self.sourcefilterprocessor_class:
        filtered = self.sourcefilterprocessor_class().process(self, url)
        if filtered is None:
            # Report the URL that was rejected; the processor's None result
            # would leave the status entry without any URL.
            service.report_status([LinkStatus(url, source, Status.FAIL, type)])
            return False
        url = filtered

    if self.sourcelinkprocessor_class:
        processed = self.sourcelinkprocessor_class().process(url)
        if processed is None:
            # Same as above: keep the last known URL in the failure report.
            service.report_status([LinkStatus(url, source, Status.FAIL, type)])
            return False
        url = processed

    succeeded = response.status == 200
    status = Status.SUCCEED if succeeded else Status.FAIL
    service.report_status([LinkStatus(url, source, status, type)])
    return succeeded
def parse(self, response):
    '''
    Handle the response of a catalog page.

    After ``_process_response`` accepts the response, every rule obtained
    from ``self._get_rules(source, meta['rules'])`` extracts links from it
    (optionally post-processed by the rule's ``process_links``). Each
    distinct link is reported once with status FOUND, and finally the page
    itself is reported as SUCCEED together with the number of distinct
    links.

    ``response`` must expose ``request.url`` and ``request.meta``; the meta
    mapping must contain the ``'domain'`` and ``'rules'`` keys.

    Always returns None.
    '''
    request = response.request
    meta = request.meta
    source = meta['domain']
    url = request.url

    if not self._process_response(response, source, LinkType.CATELOG):
        return

    # Parallel lists: found_types[i] is the link type of the rule that first
    # produced found_links[i]. Membership is tested on a list because only
    # equality comparison is known to work for the link objects.
    found_links = []
    found_types = []
    for rule in self._get_rules(source, meta['rules']):
        links = list(rule.link_extractor.extract_links(response))
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            if link not in found_links:
                found_links.append(link)
                # Remember the type now; reading ``rule`` after the loop
                # would tag every link with the last rule's type.
                found_types.append(rule.link_type)

    if found_links:
        service.report_status([
            LinkStatus(link.url, source, Status.FOUND, link_type)
            for link, link_type in zip(found_links, found_types)
        ])
    service.report_status(
        [LinkStatus(url, source, Status.SUCCEED, LinkType.CATELOG,
                    len(found_links))])