def _should_resolve(self, lru, spider):
    """Tell whether the link behind ``lru`` should have its redirects resolved.

    The answer is truthy when ``has_prefix`` accepts ``lru`` against
    ``spider.discover_prefixes``, or when it accepts ``lru`` against
    ``spider.follow_prefixes`` and ``lru`` contains one of the
    redirect-hinting keywords listed below.
    """
    redirect_hints = ("url", "link", "redir", "target", "orig", "goto")
    # All three checks are evaluated up front, in this order.
    is_discoverable = has_prefix(lru, spider.discover_prefixes)
    is_followable = has_prefix(lru, spider.follow_prefixes)
    hints_redirect = any(hint in lru for hint in redirect_hints)
    if is_discoverable:
        return is_discoverable
    return is_followable and hints_redirect
def process_item(self, item, spider):
    """Resolve redirects for the links of ``item`` that qualify for it.

    Iterates over the ``(url, lru)`` pairs of ``item["lrulinks"]``. For each
    pair accepted by ``self._should_resolve``, the LRU is replaced either by
    the value cached in ``spider.resolved_links`` or by
    ``url_to_lru_clean`` applied to the URL obtained from
    ``ResolverAgent.resolve``; fresh results are cached in
    ``spider.resolved_links``. Any exception raised while resolving is
    logged through ``spider.log`` at ``log.INFO`` level and the original
    LRU is kept.

    This is a generator: it yields whatever ``agent.resolve(url)`` returns
    and uses the value sent back into it as the resolved URL.

    NOTE(review): presumably driven by Twisted's ``inlineCallbacks``; the
    decorator is not visible here -- confirm.
    NOTE(review): in the visible code the collected ``lrulinks`` list is
    neither written back to ``item`` nor returned -- confirm against the
    full file.
    """
    lrulinks = []
    for url, lru in item["lrulinks"]:
        if self._should_resolve(lru, spider):
            if url in spider.resolved_links:
                # Already resolved earlier: reuse the cached LRU.
                lru = spider.resolved_links[url]
            else:
                try:
                    agent = ResolverAgent(proxy=self.proxy)
                    rurl = yield agent.resolve(url)
                    # The URL came back unchanged for a link under the
                    # discover prefixes: resolve it a second time.
                    if rurl == url and has_prefix(lru, spider.discover_prefixes):
                        rurl = yield agent.resolve(url)
                    lru = url_to_lru_clean(rurl)
                    spider.resolved_links[url] = lru
                # "as" form is valid on Python 2.6+ and Python 3, unlike
                # the former "except Exception, e".
                except Exception as e:
                    spider.log("Error resolving redirects from URL %s: %s %s" % (url, type(e), e), log.INFO)
        # Every link is collected, whether or not it was resolved.
        lrulinks.append(lru)
def process_item(self, item, spider):
    """Resolve redirects for the links of ``item`` that qualify for it.

    Iterates over the ``(url, lru)`` pairs of ``item["lrulinks"]``. For each
    pair accepted by ``self._should_resolve``, the LRU is replaced either by
    the value cached in ``spider.resolved_links`` or by
    ``url_to_lru_clean(rurl, TLDS_TREE)`` where ``rurl`` is the URL obtained
    from ``ResolverAgent.resolve``; fresh results are cached in
    ``spider.resolved_links``. Any exception raised while resolving is
    logged through ``spider.log`` at ``log.INFO`` level and the original
    LRU is kept.

    This is a generator: it yields whatever ``agent.resolve(url)`` returns
    and uses the value sent back into it as the resolved URL.

    NOTE(review): presumably driven by Twisted's ``inlineCallbacks``; the
    decorator is not visible here -- confirm.
    NOTE(review): in the visible code the collected ``lrulinks`` list is
    neither written back to ``item`` nor returned -- confirm against the
    full file.
    """
    lrulinks = []
    for url, lru in item["lrulinks"]:
        if self._should_resolve(lru, spider):
            if url in spider.resolved_links:
                # Already resolved earlier: reuse the cached LRU.
                lru = spider.resolved_links[url]
            else:
                try:
                    agent = ResolverAgent(proxy=self.proxy)
                    rurl = yield agent.resolve(url)
                    # The URL came back unchanged for a link under the
                    # discover prefixes: resolve it a second time.
                    if rurl == url and has_prefix(lru, spider.discover_prefixes):
                        rurl = yield agent.resolve(url)
                    lru = url_to_lru_clean(rurl, TLDS_TREE)
                    spider.resolved_links[url] = lru
                # "as" form is valid on Python 2.6+ and Python 3, unlike
                # the former "except Exception, e".
                except Exception as e:
                    spider.log("Error resolving redirects from URL %s: %s %s" % (url, type(e), e), log.INFO)
        # Every link is collected, whether or not it was resolved.
        lrulinks.append(lru)
def _should_follow(self, depth, fromlru, tolru):
    """Decide whether the link pointing to ``tolru`` may be followed.

    All three conditions must hold: ``depth`` is below ``self.maxdepth``,
    ``has_prefix`` accepts ``tolru`` against ``self.follow_prefixes``, and
    it rejects ``tolru`` against ``self.nofollow_prefixes``. ``fromlru`` is
    accepted but not used.
    """
    # Every check is computed before any of them is combined.
    below_max_depth = depth < self.maxdepth
    in_follow_scope = has_prefix(tolru, self.follow_prefixes)
    not_excluded = not has_prefix(tolru, self.nofollow_prefixes)
    if not below_max_depth:
        return below_max_depth
    if not in_follow_scope:
        return in_follow_scope
    return not_excluded
def _should_follow(self, depth, fromlru, tolru):
    """Return a truthy value when the crawl may proceed to ``tolru``.

    Requires ``depth < self.maxdepth``, a ``has_prefix`` hit of ``tolru``
    on ``self.follow_prefixes`` and no ``has_prefix`` hit of ``tolru`` on
    ``self.nofollow_prefixes``. The ``fromlru`` argument is not consulted.
    """
    depth_ok = depth < self.maxdepth
    followable = has_prefix(tolru, self.follow_prefixes)
    allowed = not has_prefix(tolru, self.nofollow_prefixes)
    # Conjunction of the three precomputed checks.
    return depth_ok and followable and allowed
def _should_resolve(self, lru, spider):
    """Return a truthy value when redirects should be resolved for ``lru``.

    That is the case when ``has_prefix`` accepts ``lru`` against
    ``spider.discover_prefixes``, or when it accepts ``lru`` against
    ``spider.follow_prefixes`` while ``lru`` contains at least one of the
    keywords "url", "link", "redir", "target", "orig" or "goto".
    """
    under_discover = has_prefix(lru, spider.discover_prefixes)
    under_follow = has_prefix(lru, spider.follow_prefixes)
    # Substrings hinting that the link carries a redirection target.
    keywords = ["url", "link", "redir", "target", "orig", "goto"]
    has_keyword = any(keyword in lru for keyword in keywords)
    return under_discover or (under_follow and has_keyword)