Exemplo n.º 1
0
 def _should_resolve(self, lru, spider):
     """Tell whether ``lru`` qualifies for resolution.

     The result is truthy when ``lru`` matches ``spider.discover_prefixes``,
     or when it matches ``spider.follow_prefixes`` and also contains one of
     the redirect-style keywords listed below.
     """
     keywords = ("url", "link", "redir", "target", "orig", "goto")

     # Both prefix checks are always performed, in this order.
     in_discover = has_prefix(lru, spider.discover_prefixes)
     in_follow = has_prefix(lru, spider.follow_prefixes)

     # Stop at the first keyword found inside the LRU.
     has_keyword = False
     for keyword in keywords:
         if keyword in lru:
             has_keyword = True
             break

     if in_discover:
         return in_discover
     if not in_follow:
         return in_follow
     return has_keyword
Exemplo n.º 2
0
 def process_item(self, item, spider):
     """Resolve the redirect targets of an item's links.

     Iterates over the ``(url, lru)`` pairs in ``item["lrulinks"]``. For
     every pair accepted by ``self._should_resolve``, the LRU is replaced
     either by the value cached in ``spider.resolved_links`` or by the
     cleaned LRU of the URL obtained from ``ResolverAgent.resolve``; new
     results are stored in that cache. Pairs that are not accepted keep
     their original LRU.

     This is a generator: each ``agent.resolve(url)`` result is yielded and
     the value sent back in is used as the resolved URL.
     NOTE(review): presumably driven by Twisted's ``inlineCallbacks`` --
     confirm against the decorator on the real method.
     NOTE(review): the visible code ends after the loop; ``lrulinks`` is
     built but not stored or returned here -- confirm against the full file.

     Resolution errors are logged at INFO level through ``spider.log`` and
     the link keeps its unresolved LRU.
     """
     lrulinks = []
     for url, lru in item["lrulinks"]:
         if self._should_resolve(lru, spider):
             if url in spider.resolved_links:
                 # Already resolved during this crawl: reuse the cached LRU.
                 lru = spider.resolved_links[url]
             else:
                 try:
                     agent = ResolverAgent(proxy=self.proxy)
                     rurl = yield agent.resolve(url)
                     # An unchanged URL under a discover prefix gets one
                     # more resolution attempt.
                     if rurl == url and has_prefix(lru, spider.discover_prefixes):
                         rurl = yield agent.resolve(url)
                     lru = url_to_lru_clean(rurl)
                     spider.resolved_links[url] = lru
                 # "as" form: valid on Python 2.6+ and Python 3, unlike the
                 # legacy "except Exception, e" spelling.
                 except Exception as e:
                     spider.log("Error resolving redirects from URL %s: %s %s" % (url, type(e), e), log.INFO)
         lrulinks.append(lru)
Exemplo n.º 3
0
 def process_item(self, item, spider):
     """Resolve redirects for the links carried by ``item``.

     Each ``(url, lru)`` pair of ``item["lrulinks"]`` is checked with
     ``self._should_resolve``. Accepted links take their LRU from the
     ``spider.resolved_links`` cache when present; otherwise the URL is
     resolved through a ``ResolverAgent``, converted with
     ``url_to_lru_clean(rurl, TLDS_TREE)`` and cached. Other links keep
     the LRU they came with.

     The method is a generator: it yields what ``agent.resolve(url)``
     returns and treats the value sent back as the resolved URL.
     NOTE(review): looks like a Twisted ``inlineCallbacks`` body -- confirm.
     NOTE(review): ``lrulinks`` is collected but the visible code ends
     before it is used -- confirm against the complete method.

     Any exception raised while resolving is logged at INFO level via
     ``spider.log``; the link then keeps its original LRU.
     """
     lrulinks = []
     for url, lru in item["lrulinks"]:
         if self._should_resolve(lru, spider):
             if url in spider.resolved_links:
                 # Cache hit: skip the network round-trip.
                 lru = spider.resolved_links[url]
             else:
                 try:
                     agent = ResolverAgent(proxy=self.proxy)
                     rurl = yield agent.resolve(url)
                     # Retry once when nothing changed for a URL under a
                     # discover prefix.
                     if rurl == url and has_prefix(lru, spider.discover_prefixes):
                         rurl = yield agent.resolve(url)
                     lru = url_to_lru_clean(rurl, TLDS_TREE)
                     spider.resolved_links[url] = lru
                 # Use the "as" binding so the block parses on Python 3 as
                 # well as on Python 2.6+.
                 except Exception as e:
                     spider.log("Error resolving redirects from URL %s: %s %s" % (url, type(e), e), log.INFO)
         lrulinks.append(lru)
Exemplo n.º 4
0
 def _should_follow(self, depth, fromlru, tolru):
     """Tell whether the link pointing to ``tolru`` may be followed.

     All three conditions must hold: ``depth`` is below ``self.maxdepth``,
     ``tolru`` matches ``self.follow_prefixes``, and ``tolru`` does not
     match ``self.nofollow_prefixes``. ``fromlru`` is accepted but unused.
     """
     # Every check is evaluated up front, in this order.
     within_depth = depth < self.maxdepth
     followable = has_prefix(tolru, self.follow_prefixes)
     excluded = has_prefix(tolru, self.nofollow_prefixes)

     if not within_depth:
         return within_depth
     if not followable:
         return followable
     return not excluded
Exemplo n.º 5
0
 def _should_follow(self, depth, fromlru, tolru):
     """Return a truthy value when ``tolru`` is eligible for following.

     Eligibility requires ``depth < self.maxdepth``, a match of ``tolru``
     against ``self.follow_prefixes`` and no match against
     ``self.nofollow_prefixes``. The ``fromlru`` argument is not consulted.
     """
     depth_ok = depth < self.maxdepth
     is_allowed = has_prefix(tolru, self.follow_prefixes)
     is_blocked = has_prefix(tolru, self.nofollow_prefixes)
     return depth_ok and is_allowed and not is_blocked
Exemplo n.º 6
0
 def _should_resolve(self, lru, spider):
     """Return a truthy value when ``lru`` should be resolved.

     That is the case when ``lru`` matches ``spider.discover_prefixes``,
     or when it matches ``spider.follow_prefixes`` and contains at least
     one of the redirect-like markers below.
     """
     markers = ["url", "link", "redir", "target", "orig", "goto"]
     discover_hit = has_prefix(lru, spider.discover_prefixes)
     follow_hit = has_prefix(lru, spider.follow_prefixes)
     # True as soon as one marker occurs in the LRU, False otherwise.
     marker_hit = next((True for marker in markers if marker in lru), False)
     return discover_hit if discover_hit else (follow_hit and marker_hit)