import sys
import urllib

import url_util   # project-specific helpers: is_valid_url(), regularize_url()
import Filter     # assumed project-specific helper exposing domain_url()


def extract_domain(url):
    # Undo up to three levels of percent-encoding.
    url = urllib.unquote(urllib.unquote(urllib.unquote(url)))
    if "http" not in url:
        print >> sys.stderr, "Error 4, http is not in url, invalid url: %s" % url
        return "-1"
    # Keep everything after the last "http" marker, then drop the scheme separator.
    url = url.split("http")[-1]
    if url.startswith("://"):
        url = url[len("://"):]
    elif url.startswith("s://"):
        url = url[len("s://"):]
    else:
        print >> sys.stderr, "Error 2, bad http prefix, invalid url: %s" % url
        return "-1"
    # Re-attach a normalized scheme before validating and extracting the domain.
    url = "http://" + url
    if url_util.is_valid_url(url):
        url = url_util.regularize_url(url)
        domain = Filter.domain_url(url)
        if domain == "NULL":
            print >> sys.stderr, "Error 3, domain is null, invalid url: %s" % url
            return "-1"
        return domain
    else:
        print >> sys.stderr, "Error 1, invalid url: %s" % url
        return "-1"
Example #3
import sys
import url_util
import urllib

for line in sys.stdin:
	vals = line.strip().split("\t")
	if len(vals) <= 11:
		# Short record: index, url and query are assumed to sit at columns 3, 1 and 2.
		tag = "2"
		index = vals[3]
		url = vals[1]
		query = vals[2]
		ori_val = "\1".join(vals)
		print "%s\t%s\t%s\t%s\t%s" % (index, tag, query, url, ori_val)
	elif len(vals) > 140:
		# Long record: note that vals[150] below assumes at least 151 fields.
		tag = "1"
		index = vals[0]
		query = url_util.regularize_str(vals[4])
		url = urllib.unquote(vals[93])
		if not url_util.is_valid_url(url):
			continue
		url = url_util.regularize_url(url)
		rig_q = vals[140].split("#")[0]
		click_q = vals[150].split("%")[3]
		try:
			# Validate that both quality fields parse as floats before emitting.
			rig_q_val = float(rig_q)
			click_q_val = float(click_q)
		except ValueError:
			print >> sys.stderr, "Error! rig q: %s or click q: %s is invalid." % (rig_q, click_q)
			continue
		print "\t".join([index, tag, query, url, rig_q, click_q])