Example #1
    def get_url_type(self, url):
        #requires tldextract; classifies a url as domain, subdomain, or others
        subdomain, _, _ = tldextract.extract(url)
        if self.is_domain_url(url):
            if len(subdomain) == 0 or subdomain == "www":
                return "domain"
            else:
                return "subdomain"
        else:
            return "others"
Example #2
    def normalize_url(self, url, base_url = None):
        if url is None or len(url) == 0:
            return None

        original_url = url
        #Note: assume all non-unicode urls are utf-8 encoded (Python 2: str here is bytes)
        if isinstance(url, str):
            url = url.decode("utf-8")

        if not isinstance(url, unicode):
            #note: the keyword fields imply a structured-logging wrapper, not the stdlib logger
            logging.error("invalid normalized url, url is not unicode", url = original_url, base_url = base_url)
            return None

        url = url.replace('%20', ' ').strip()

        #fix http scheme:
        url = self._fix_http_scheme(url)

        #handle relative url
        if base_url is not None:
            url = urlparse.urljoin(base_url, url)

        #common normalization
        try:
            url = urlnorm.norm(url)
        except Exception as e:
            logging.warn("invalid normalized url, urlnorm raised exception", url = original_url, base_url = base_url, exception = e)
            return None

        try:
            parse_result = urlparse.urlparse(url)
        except Exception as e:
            logging.warn("invalid normalized url, when parsing url", url = original_url, base_url = base_url)
            return None

        if parse_result.scheme.lower() not in self._settings["general_crawl_policies"]["supported_schemes"]:
            logging.warn("invalid normalized url, not supported schemes", url = original_url, base_url = base_url)
            return None

        netloc = parse_result.netloc
        host = parse_result.netloc.split(':')[0]
        if ip_regex.match(host) is None: #not an IP host: validate and normalize the domain
            #check that domain and tld exist
            subdomain, domain, tld = tldextract.extract(host)
            if len(domain) == 0 or len(tld) == 0:
                logging.warn("invalid normalized url, no domain or tld", url = original_url, base_url = base_url)
                return None

            #replace chinese (full-width) punctuation in the netloc with ascii equivalents
            for src, dst in zip(chinese_punctuation_map[0], chinese_punctuation_map[1]):
                netloc = netloc.replace(src, dst)

            #add www when no subdomain is present
            if len(subdomain) == 0:
                netloc = "www." + netloc

        fragment = parse_result.fragment
        if not fragment.startswith("!"): #keep only "#!" fragments (Google's AJAX-crawling convention)
            fragment = ""
        if len(parse_result.scheme) == 0 or len(netloc) == 0:
            logging.warn("invalid normalized url, scheme or netloc is none", url = original_url, base_url = base_url)
            return None

        url = urlparse.urlunparse((parse_result.scheme, netloc, parse_result.path, parse_result.params, parse_result.query, fragment))

        #canonicalize url
        #Note: it's too strong and sometimes changes the url semantics.
        #url = ccrawler.utils.url.canonicalize_url(url)

        url = url.strip()
        if len(url) > self._settings["general_crawl_policies"]["max_url_length"]:
            logging.warn("invalid normalized url, length exceeded", url = original_url, base_url = base_url)
            return None
        elif len(url) == 0:
            logging.warn("invalid normalized url, length too short", url = original_url, base_url = base_url)
            return None
        else:
            return url
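
A hedged usage sketch: besides Python 2's urlparse and the urlnorm package, the method relies on module-level ip_regex and chinese_punctuation_map objects and a _settings dict that are not shown. The settings shape below is inferred from the keys used above; the class name is hypothetical:

settings = {
    "general_crawl_policies": {
        "supported_schemes": ["http", "https"],  #assumed values
        "max_url_length": 1024,                  #assumed value
    },
}

#normalizer = UrlNormalizer(settings)             #hypothetical class name
#normalizer.normalize_url("a/b?q=1", base_url="http://example.com")
#  -> "http://www.example.com/a/b?q=1"  (relative url resolved, "www" added)
#normalizer.normalize_url("ftp://example.com/")   #-> None, scheme not supported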
Example #3
def get_url_domain_info(url):
    #requires tldextract; section_join is a project helper (a sketch follows below)
    subdomain, domain, tld = tldextract.extract(url)

    full_domain = section_join(domain, tld)
    host = section_join(subdomain, full_domain)
    return domain, full_domain, host
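
section_join is not defined in these examples; from its use here it presumably joins the non-empty name sections with a dot. A minimal sketch under that assumption:

def section_join(*sections):
    #assumed behavior, not the project's actual implementation:
    #join the non-empty sections with "."
    return ".".join(s for s in sections if s)

#get_url_domain_info("http://blog.example.co.uk/post")
#  -> ("example", "example.co.uk", "blog.example.co.uk")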