def test_tld_extract():
    """Check tld_extract against a table of hostnames and expected parts."""
    # (hostname, expected (subdomain, domain, suffix)) pairs; the cases cover
    # leading and trailing dots as well as a unicode literal as input.
    expectations = (
        ("sub.test.com", ("sub", "test", "com")),
        (".test.com", ("", "test", "com")),
        (".test.com.", ("", "test", "com")),
        (".www.test.com.", ("www", "test", "com")),
        (u".www.test.com.", ("www", "test", "com")),
    )
    for hostname, parts in expectations:
        assert tld_extract(hostname) == parts

    # Every returned component must be a plain `str`, even for unicode input.
    part_types = [type(part) for part in tld_extract(u".www.test.com.")]
    assert part_types == [str, str, str]
def _fast_make_domain_id(domain):
    """ Experimental fast version bypassing cosrlib.URL

    Returns the integer id for the hostname `domain`. Results are memoised in
    URL_DOMAIN_IDS_CACHE under the hostname exactly as it was passed in.

    The id is mmh3.hash("<domain>.<suffix>") when the subdomain is empty or
    exactly "www"; otherwise it is
    (mmh3.hash(subdomain) << 32) + mmh3.hash("<domain>.<suffix>"),
    with any leading "www." prefixes removed from the subdomain first.
    """
    if domain not in URL_DOMAIN_IDS_CACHE:
        # Unpack into `registered`, NOT `domain`: rebinding the parameter made
        # the cache key the bare registered name, so lookups by full hostname
        # never hit and unrelated hosts overwrote each other's entries.
        subdomain, registered, suffix = tld_extract(domain)
        base_id = mmh3.hash("%s.%s" % (registered, suffix))

        if subdomain == "www" or not subdomain:
            domain_id = base_id
        else:
            # Drop every leading "www." label before hashing the subdomain.
            while subdomain.startswith("www."):
                subdomain = subdomain[4:]
            domain_id = (mmh3.hash(subdomain) << 32) + base_id

        URL_DOMAIN_IDS_CACHE[domain] = domain_id

    return URL_DOMAIN_IDS_CACHE[domain]
def _fast_make_domain_id(domain):
    """ Experimental fast version bypassing cosrlib.URL

    Returns the integer id for the hostname `domain`. Results are memoised in
    URL_DOMAIN_IDS_CACHE under the hostname exactly as it was passed in.

    The id is mmh3.hash("<domain>.<suffix>") when the subdomain is empty or
    exactly "www"; otherwise it is
    (mmh3.hash(subdomain) << 32) + mmh3.hash("<domain>.<suffix>"),
    with any leading "www." prefixes removed from the subdomain first.
    """
    if domain not in URL_DOMAIN_IDS_CACHE:
        # Unpack into `registered`, NOT `domain`: rebinding the parameter made
        # the cache key the bare registered name, so lookups by full hostname
        # never hit and unrelated hosts overwrote each other's entries.
        subdomain, registered, suffix = tld_extract(domain)
        base_id = mmh3.hash("%s.%s" % (registered, suffix))

        if subdomain == "www" or not subdomain:
            domain_id = base_id
        else:
            # Drop every leading "www." label before hashing the subdomain.
            while subdomain.startswith("www."):
                subdomain = subdomain[4:]
            domain_id = (mmh3.hash(subdomain) << 32) + base_id

        URL_DOMAIN_IDS_CACHE[domain] = domain_id

    return URL_DOMAIN_IDS_CACHE[domain]