Пример #1
0
def extract_twold(url):
    """Return the last registered label of *url* joined to its suffix.

    A new ``etld.etld()`` parser is created on every call. Its ``parse``
    result is unpacked as ``(registered, suffix)``; only the right-most
    dot-separated label of ``registered`` is kept and joined to ``suffix``
    with a dot. Anything ``parse`` raises propagates to the caller.
    """
    parser = etld.etld()
    registered_part, public_suffix = parser.parse(url)
    last_label = registered_part.split('.')[-1]
    return '.'.join((last_label, public_suffix))
Пример #2
0
    def __split_url(self, url):
        """Split a url into pieces and return them as a 5-tuple.

        The url is lower-cased first. The user info
        (http://user:password@domain.com) and the port
        (http://domain.com:8080) are removed from the host before it is
        handed to etld.

        Example: With the url "http://www.google.com/test/extra/index.html",
        the function will return these pieces:

        protocol: The protocol used by the url (in the example, "http").
            Defaults to "http" when the url contains no "://".
        domain: The domain of the url (in the example, "google.com").
        subdomain: The subdomain of the url (in the example, "www"), or
            None when etld reports an empty one.
        firstlevel: The first level of the path (in the example, "test"),
            or None when the path has no non-empty first segment.
        extra: The part of the URL not contained in the previous pieces
            (in the example, "extra/index.html"), or None.

        NOTE(review): how the host is divided between subdomain and domain
        is decided entirely by etld's parse(); confirm the example above
        against the etld version in use.

        Returns:
            (protocol, domain, subdomain, firstlevel, extra). All five
            values are None when the protocol is neither http nor https,
            when the url has no host, or when etld fails to parse the host.
        """
        no_result = (None, None, None, None, None)

        url = url.lower()

        # Split on the first "://" only, so a later "://" inside the path
        # or query (e.g. an embedded redirect target) does not cut the url.
        scheme_and_rest = url.split("://", 1)
        if len(scheme_and_rest) > 1:
            protocol, url = scheme_and_rest
        else:
            protocol = "http"

        if protocol not in ("http", "https"):
            return no_result

        parsed_url = urlparse("%s://%s" % (protocol, url))
        domain_string = parsed_url.netloc
        path_string = parsed_url.path

        if not domain_string:
            return no_result

        # The host follows the LAST "@" of the authority; the user info
        # itself may contain "@" characters (e.g. inside a password).
        if "@" in domain_string:
            domain_string = domain_string.rsplit("@", 1)[-1]
        # Drop the port.
        if ":" in domain_string:
            domain_string = domain_string.split(":")[0]

        # Stripping user info / port can leave nothing (e.g. "http://:80").
        if not domain_string:
            return no_result

        etld_object = etld.etld()
        try:
            subdomain, domain = etld_object.parse(
                "%s://%s" % (protocol, domain_string))
        except Exception:
            # Best effort: an unparsable host yields the all-None result,
            # but KeyboardInterrupt / SystemExit are no longer swallowed.
            return no_result

        if subdomain == "":
            subdomain = None

        firstlevel = None
        extra = None
        if path_string:
            # A path such as "/a/b/c" splits into ['', 'a', 'b', 'c'].
            path_pieces = path_string.split("/")
            if len(path_pieces) > 1 and path_pieces[1]:
                firstlevel = path_pieces[1]
            if len(path_pieces) > 2 and path_pieces[2]:
                extra = "/".join(path_pieces[2:])

        return (protocol, domain, subdomain, firstlevel, extra)
Пример #3
0
def extract_twold(hostname):
    """Return the last registered label of *hostname* joined to its suffix.

    Returns None when *hostname* is None, is empty after stripping
    surrounding whitespace, is recognised by ``isIP4Address``, or when the
    etld lookup fails. Both the successful result and a failure are
    reported on standard output.
    """
    if hostname is None:
        return None

    hostname = hostname.strip()
    if not hostname:
        return None
    if isIP4Address(hostname):
        return None

    try:
        registered, suffix = etld.etld().parse(hostname)
        twold = '.'.join([registered.split('.')[-1], suffix])
        # Parenthesised single-argument print emits the same text under
        # Python 2 and Python 3.
        print("hostname: %s -- twold: %s" % (hostname, twold))
        return twold
    except Exception:
        # Best effort: report and fall through to None, without swallowing
        # KeyboardInterrupt / SystemExit as a bare ``except:`` would.
        print("Unable to compute twold: hostname: %s" % (hostname, ))

    return None
Пример #4
0
    def __split_url(self, url):
        """Split a url into pieces and return them as a 5-tuple.

        The url is lower-cased first. The user info
        (http://user:password@domain.com) and the port
        (http://domain.com:8080) are removed from the host before it is
        handed to etld.

        Example: With the url "http://www.google.com/test/extra/index.html",
        the function will return these pieces:

        protocol: The protocol used by the url (in the example, "http").
            Defaults to "http" when the url contains no "://".
        domain: The domain of the url (in the example, "google.com").
        subdomain: The subdomain of the url (in the example, "www"), or
            None when etld reports an empty one.
        firstlevel: The first level of the path (in the example, "test"),
            or None when the path has no non-empty first segment.
        extra: The part of the URL not contained in the previous pieces
            (in the example, "extra/index.html"), or None.

        NOTE(review): how the host is divided between subdomain and domain
        is decided entirely by etld's parse(); confirm the example above
        against the etld version in use.

        Returns:
            (protocol, domain, subdomain, firstlevel, extra). All five
            values are None when the protocol is neither http nor https,
            when the url has no host, or when etld fails to parse the host.
        """
        no_result = (None, None, None, None, None)

        url = url.lower()

        # Split on the first "://" only, so a later "://" inside the path
        # or query (e.g. an embedded redirect target) does not cut the url.
        scheme_and_rest = url.split("://", 1)
        if len(scheme_and_rest) > 1:
            protocol, url = scheme_and_rest
        else:
            protocol = 'http'

        if protocol not in ("http", "https"):
            return no_result

        parsed_url = urlparse("%s://%s" % (protocol, url))
        domain_string = parsed_url.netloc
        path_string = parsed_url.path

        if not domain_string:
            return no_result

        # The host follows the LAST "@" of the authority; the user info
        # itself may contain "@" characters (e.g. inside a password).
        if "@" in domain_string:
            domain_string = domain_string.rsplit("@", 1)[-1]
        # Drop the port.
        if ":" in domain_string:
            domain_string = domain_string.split(":")[0]

        # Stripping user info / port can leave nothing (e.g. "http://:80").
        if not domain_string:
            return no_result

        etld_object = etld.etld()
        try:
            subdomain, domain = etld_object.parse(
                "%s://%s" % (protocol, domain_string))
        except Exception:
            # Best effort: an unparsable host yields the all-None result,
            # but KeyboardInterrupt / SystemExit are no longer swallowed.
            return no_result

        if subdomain == "":
            subdomain = None

        firstlevel = None
        extra = None
        if path_string:
            # A path such as "/a/b/c" splits into ['', 'a', 'b', 'c'].
            path_pieces = path_string.split("/")
            if len(path_pieces) > 1 and path_pieces[1]:
                firstlevel = path_pieces[1]
            if len(path_pieces) > 2 and path_pieces[2]:
                extra = "/".join(path_pieces[2:])

        return (protocol, domain, subdomain, firstlevel, extra)
    return -(a[1] - b[1])


def calculateDiffs(prevData, data):
    """For each domain in data, calculate the delta between it and the
    domain in prevData.

    A domain missing from prevData is treated as having a previous value
    of 0. Returns a list of (domain, delta) tuples sorted by delta, largest
    first; entries with equal deltas keep the iteration order of data.
    """
    retval = [(domain, count - prevData.get(domain, 0))
              for domain, count in data.items()]
    # A key-based descending sort gives the same order as the old
    # comparison-function sort, but also runs on Python 3 (where list.sort
    # takes no cmp argument) and accepts non-integer deltas.
    retval.sort(key=lambda entry: entry[1], reverse=True)
    return retval


# Module-level etld parser, created once at import time and used by getSLD.
etldService = etld.etld()


def getSLD(domain):
    """Return the "second level domain" of *domain*, e.g. "mozilla.org" or
    "bbc.co.uk".

    The module-level etld service splits the name; the last dot-separated
    label of its first part is joined to its second part. When the service
    raises ``etld.EtldException`` the input is returned unchanged.
    """
    try:
        # parse() yields a pair such as ("5.4.bbc", "co.uk")
        parsed = etldService.parse(domain)
        last_label = parsed[0].rsplit(".", 1)[-1]
        public_suffix = parsed[1]
        return last_label + "." + public_suffix
    except etld.EtldException:
        return domain


# Module-level counters, both starting at zero.
# NOTE(review): presumably the number of MX lookups issued and how many of
# them were served from a cache; the code that updates them is not visible
# here, so confirm.
mx_queries = 0
mx_cache_hit = 0
Пример #6
0
def rank_by_count(a, b):
  """Comparison function ordering (name, count) pairs by count, descending.

  Returns a negative number when a's count is larger than b's, zero when
  the counts are equal and a positive number otherwise.
  """
  count_a = a[1]
  count_b = b[1]
  difference = count_a - count_b
  return -difference

def calculateDiffs(prevData, data):
  """For each domain in data, calculate the delta between it and the
     domain in prevData.

     A domain missing from prevData is treated as having a previous value
     of 0. Returns a list of (domain, delta) tuples sorted by delta, largest
     first; entries with equal deltas keep the iteration order of data."""
  retval = [(domain, count - prevData.get(domain, 0))
            for domain, count in data.items()]
  # A key-based descending sort gives the same order as the old
  # comparison-function sort, but also runs on Python 3 (where list.sort
  # takes no cmp argument) and accepts non-integer deltas.
  retval.sort(key=lambda entry: entry[1], reverse=True)
  return retval

# Module-level etld parser, created once at import time and used by getSLD.
etldService = etld.etld()

def getSLD(domain):
  """Return the "second level domain" of *domain*, e.g. "mozilla.org" or
  "bbc.co.uk".

  The module-level etld service splits the name; the last dot-separated
  label of its first part is joined to its second part. When the service
  raises ``etld.EtldException`` the input is returned unchanged.
  """
  try:
    # parse() yields a pair such as ("5.4.bbc", "co.uk")
    parsed = etldService.parse(domain)
    last_label = parsed[0].rsplit(".", 1)[-1]
    public_suffix = parsed[1]
    return last_label + "." + public_suffix
  except etld.EtldException:
    return domain


# Module-level counters, both starting at zero.
# NOTE(review): presumably the number of MX lookups issued and how many of
# them were served from a cache; the code that updates them is not visible
# here, so confirm.
mx_queries = 0
mx_cache_hit = 0