Example #1
import urlparse  # Python 2 stdlib; example #2 below is the Python 3 form

def normalize_link(link):
    # `url`, `local`, `external` and `common` come from the enclosing scope
    if urlparse.urlsplit(link).scheme in ('http', 'https', ''):
        if '#' in link:
            link = link[:link.index('#')]  # strip the fragment
        if url:
            link = urlparse.urljoin(url, link)
            if not local and common.same_domain(url, link):
                # local links not included
                link = None
            elif not external and not common.same_domain(url, link):
                # external links not included; elif avoids passing a link
                # already set to None into same_domain()
                link = None
    else:
        link = None  # ignore mailto:, javascript:, etc.
    return link
Example #2
from urllib.parse import urljoin, urlsplit  # Python 3 form of example #1

def normalize_link(link):
    # `url`, `local`, `external` and `common` come from the enclosing scope
    if urlsplit(link).scheme in ('http', 'https', ''):
        if '#' in link:
            link = link[:link.index('#')]  # strip the fragment
        if url:
            link = urljoin(url, link)
            if not local and common.same_domain(url, link):
                # local links not included
                link = None
            elif not external and not common.same_domain(url, link):
                # external links not included; elif avoids passing a link
                # already set to None into same_domain()
                link = None
    else:
        link = None  # ignore mailto:, javascript:, etc.
    return link
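
Both variants read `url` (the page the links were extracted from), the `local` and `external` flags, and `common.same_domain` from an enclosing scope. A minimal self-contained sketch of that context, assuming a hypothetical `get_links` wrapper and a stand-in `same_domain` helper (not the original library code):

from urllib.parse import urljoin, urlsplit

def same_domain(url1, url2):
    # stand-in for common.same_domain: compare network locations
    return urlsplit(url1).netloc == urlsplit(url2).netloc

def get_links(links, url, local=True, external=True):
    # assumed wrapper: resolve each link against `url` and drop the
    # ones the local/external flags exclude
    def normalize_link(link):
        if urlsplit(link).scheme in ('http', 'https', ''):
            if '#' in link:
                link = link[:link.index('#')]
            if url:
                link = urljoin(url, link)
                if not local and same_domain(url, link):
                    link = None
                elif not external and not same_domain(url, link):
                    link = None
        else:
            link = None
        return link
    return [l for l in (normalize_link(link) for link in links) if l]

print(get_links(['/about', 'https://other.example/x', 'mailto:a@b.c'],
                url='https://example.com/', external=False))
# prints ['https://example.com/about']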
Example #3
def valid(link):
    """Check whether this link should be crawled."""
    # `self`, `domain`, `D`, `common` and `settings` come from the
    # enclosing crawl method; `valid` is a nested helper closing over them.
    # skip media files
    if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
        # check that it is a proper HTTP link
        if link.lower().startswith('http'):
            # only crawl within this website
            if common.same_domain(domain, link):
                # passes the allow/deny regexes
                if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                    # not blocked by robots.txt
                    if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                        # allowed to recrawl
                        if self.crawl_existing or (D.cache and link not in D.cache):
                            return True
    return False
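
Here `valid` closes over `self`, `domain`, and `D` in the enclosing crawl method. A minimal sketch of that surrounding state, with hypothetical stand-ins for the `common` helpers, `settings.user_agent`, and the `D.cache` download cache (assumed names, not the library's actual API):

import re
from urllib.parse import urlsplit

MEDIA_EXTENSIONS = ['.jpg', '.png', '.gif', '.zip']  # assumed list
USER_AGENT = 'MyCrawler'  # stands in for settings.user_agent

def get_extension(url):
    # stand-in for common.get_extension
    path = urlsplit(url).path
    dot = path.rfind('.')
    return path[dot:].lower() if dot >= 0 else ''

def same_domain(url1, url2):
    # stand-in for common.same_domain
    return urlsplit(url1).netloc == urlsplit(url2).netloc

class Crawler:
    def __init__(self, allowed=r'.*', banned=r'$^',
                 robots=None, crawl_existing=True):
        self.allowed_urls = re.compile(allowed)
        self.banned_urls = re.compile(banned)  # r'$^' never matches a URL
        self.robots = robots  # e.g. a urllib.robotparser.RobotFileParser
        self.crawl_existing = crawl_existing

    def crawl(self, domain, links, cache=None):
        def valid(link):
            """Check whether this link should be crawled."""
            if get_extension(link) not in MEDIA_EXTENSIONS:
                if link.lower().startswith('http'):
                    if same_domain(domain, link):
                        if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                            if not self.robots or self.robots.can_fetch(USER_AGENT, link):
                                if self.crawl_existing or (cache and link not in cache):
                                    return True
            return False
        return [link for link in links if valid(link)]

c = Crawler(banned=r'.*/private/')
print(c.crawl('https://example.com',
              ['https://example.com/a',
               'https://example.com/private/b',
               'https://other.example/c',
               'https://example.com/pic.jpg']))
# prints ['https://example.com/a']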