Example #1
import urlparse  # Python 2 stdlib; example #2 below is the Python 3 form

def normalize_link(link):
    # `url`, `local`, `external` and `common` come from the enclosing scope
    if urlparse.urlsplit(link).scheme in ('http', 'https', ''):
        if '#' in link:
            link = link[:link.index('#')]  # strip the fragment
        if url:
            link = urlparse.urljoin(url, link)
            if not local and common.same_domain(url, link):
                # local links not included
                link = None
            elif not external and not common.same_domain(url, link):
                # external links not included; elif avoids passing a link
                # already set to None into same_domain()
                link = None
    else:
        link = None  # ignore mailto:, javascript:, etc.
    return link
Example #2
from urllib.parse import urljoin, urlsplit  # Python 3 form of example #1

def normalize_link(link):
    # `url`, `local`, `external` and `common` come from the enclosing scope
    if urlsplit(link).scheme in ('http', 'https', ''):
        if '#' in link:
            link = link[:link.index('#')]  # strip the fragment
        if url:
            link = urljoin(url, link)
            if not local and common.same_domain(url, link):
                # local links not included
                link = None
            elif not external and not common.same_domain(url, link):
                # external links not included; elif avoids passing a link
                # already set to None into same_domain()
                link = None
    else:
        link = None  # ignore mailto:, javascript:, etc.
    return link
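
Both variants read `url` (the page the links were extracted from), the `local` and `external` flags, and `common.same_domain` from an enclosing scope. A minimal self-contained sketch of that context, assuming a hypothetical `get_links` wrapper and a stand-in `same_domain` helper (not the original library code):

from urllib.parse import urljoin, urlsplit

def same_domain(url1, url2):
    # stand-in for common.same_domain: compare network locations
    return urlsplit(url1).netloc == urlsplit(url2).netloc

def get_links(links, url, local=True, external=True):
    # assumed wrapper: resolve each link against `url` and drop the
    # ones the local/external flags exclude
    def normalize_link(link):
        if urlsplit(link).scheme in ('http', 'https', ''):
            if '#' in link:
                link = link[:link.index('#')]
            if url:
                link = urljoin(url, link)
                if not local and same_domain(url, link):
                    link = None
                elif not external and not same_domain(url, link):
                    link = None
        else:
            link = None
        return link
    return [l for l in (normalize_link(link) for link in links) if l]

print(get_links(['/about', 'https://other.example/x', 'mailto:a@b.c'],
                url='https://example.com/', external=False))
# prints ['https://example.com/about']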
Example #3
def valid(link):
    """Check whether this link should be crawled."""
    # `self`, `domain`, `D`, `common` and `settings` come from the
    # enclosing crawl method; `valid` is a nested helper closing over them.
    # skip media files
    if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
        # check that it is a proper HTTP link
        if link.lower().startswith('http'):
            # only crawl within this website
            if common.same_domain(domain, link):
                # passes the allow/deny regexes
                if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                    # not blocked by robots.txt
                    if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                        # allowed to recrawl
                        if self.crawl_existing or (D.cache and link not in D.cache):
                            return True
    return False
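
Here `valid` closes over `self`, `domain`, and `D` in the enclosing crawl method. A minimal sketch of that surrounding state, with hypothetical stand-ins for the `common` helpers, `settings.user_agent`, and the `D.cache` download cache (assumed names, not the library's actual API):

import re
from urllib.parse import urlsplit

MEDIA_EXTENSIONS = ['.jpg', '.png', '.gif', '.zip']  # assumed list
USER_AGENT = 'MyCrawler'  # stands in for settings.user_agent

def get_extension(url):
    # stand-in for common.get_extension
    path = urlsplit(url).path
    dot = path.rfind('.')
    return path[dot:].lower() if dot >= 0 else ''

def same_domain(url1, url2):
    # stand-in for common.same_domain
    return urlsplit(url1).netloc == urlsplit(url2).netloc

class Crawler:
    def __init__(self, allowed=r'.*', banned=r'$^',
                 robots=None, crawl_existing=True):
        self.allowed_urls = re.compile(allowed)
        self.banned_urls = re.compile(banned)  # r'$^' never matches a URL
        self.robots = robots  # e.g. a urllib.robotparser.RobotFileParser
        self.crawl_existing = crawl_existing

    def crawl(self, domain, links, cache=None):
        def valid(link):
            """Check whether this link should be crawled."""
            if get_extension(link) not in MEDIA_EXTENSIONS:
                if link.lower().startswith('http'):
                    if same_domain(domain, link):
                        if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                            if not self.robots or self.robots.can_fetch(USER_AGENT, link):
                                if self.crawl_existing or (cache and link not in cache):
                                    return True
            return False
        return [link for link in links if valid(link)]

c = Crawler(banned=r'.*/private/')
print(c.crawl('https://example.com',
              ['https://example.com/a',
               'https://example.com/private/b',
               'https://other.example/c',
               'https://example.com/pic.jpg']))
# prints ['https://example.com/a']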