import sys
import urllib

import url_util
# Filter is a project-local helper that maps a URL to its domain; hypothetical
# stand-ins for it and for url_util are sketched after this function.


def extract_domain(url):
    # Unquote up to three times to unwrap multiply URL-encoded input.
    url = urllib.unquote(urllib.unquote(urllib.unquote(url)))
    if "http" not in url:
        print >> sys.stderr, "Error 4, http is not in url, invalid url: %s" % url
        return "-1"
    # Keep everything after the last "http" marker, then peel off the scheme
    # remnant by slicing; strip("://") would also eat matching characters at
    # the *end* of the string.
    url = url.split("http")[-1]
    if url.startswith("://"):
        url = url[len("://"):]
    elif url.startswith("s://"):
        url = url[len("s://"):]
    else:
        print >> sys.stderr, "Error 2, error http head, invalid url: %s" % url
        return "-1"
    url = "http://" + url
    if not url_util.is_valid_url(url):
        print >> sys.stderr, "Error 1, invalid url: %s" % url
        return "-1"
    url = url_util.regularize_url(url)
    domain = Filter.domain_url(url)
    if domain == "NULL":
        print >> sys.stderr, "Error 3, domain is null, invalid url: %s" % url
        return "-1"
    return domain
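The url_util module and the Filter class are project-local and never shown in these examples. The following is a minimal sketch of what they might provide, just enough for extract_domain to run stand-alone; the method names match the calls above, but every behavior here is an assumption rather than the project's real implementation:

import urlparse


class url_util(object):
    # Hypothetical stand-ins for the project's url_util module.
    @staticmethod
    def is_valid_url(url):
        # Assume a URL is valid when it has both a scheme and a host.
        parts = urlparse.urlparse(url)
        return bool(parts.scheme and parts.netloc)

    @staticmethod
    def regularize_url(url):
        # Assume regularization lower-cases scheme/host and drops the fragment.
        p = urlparse.urlparse(url)
        return urlparse.urlunparse((p.scheme.lower(), p.netloc.lower(),
                                    p.path, p.params, p.query, ""))

    @staticmethod
    def regularize_str(s):
        # Assume string regularization is trim plus lower-case.
        return s.strip().lower()


class Filter(object):
    # Hypothetical stand-in for the project's Filter helper.
    @staticmethod
    def domain_url(url):
        # Return the host of the URL, or "NULL" when there is none.
        host = urlparse.urlparse(url).netloc
        return host if host else "NULL"

Under these stubs, extract_domain("http%3A%2F%2FExample.COM%2Fa") unquotes to "http://Example.COM/a" and returns "example.com".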
Example #3
import sys

import url_util

SOURCE = "word_url_winfoid"

# Streaming mapper: read tab-separated records from stdin and, for level-4
# rows, emit (bidword, landing URL, source tag, winfoid) once for the PC URL
# and once for the wise (mobile) URL, skipping the "DFT" placeholder.
for line in sys.stdin:
    val = line.strip().split("\t")
    level = val[0]
    if level == "4":
        winfoid = val[1]
        bidword = url_util.regularize_str(val[6])
        pc_url = val[13]
        wise_url = val[15]
        if pc_url != "DFT":
            pc_url = url_util.regularize_url(pc_url)
            print "%s\t%s\t%s\t%s" % (bidword, pc_url, SOURCE, winfoid)
        if wise_url != "DFT":
            wise_url = url_util.regularize_url(wise_url)
            print "%s\t%s\t%s\t%s" % (bidword, wise_url, SOURCE, winfoid)
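For local testing, the level-4 branch only needs fields 0, 1, 6, 13, and 15 populated. A fabricated input line (every value invented for illustration) and the records the mapper would emit from it, assuming regularize_str and regularize_url leave these simple values unchanged:

# One invented level-4 record; unused columns are left empty.
fields = [""] * 16
fields[0] = "4"                           # level
fields[1] = "wid001"                      # winfoid
fields[6] = "cheap flights"               # bidword
fields[13] = "http://example.com/pc"      # pc_url ("DFT" would mean absent)
fields[15] = "http://example.com/wise"    # wise_url
sample_line = "\t".join(fields)
# Feeding sample_line through the mapper would print:
#   cheap flights<TAB>http://example.com/pc<TAB>word_url_winfoid<TAB>wid001
#   cheap flights<TAB>http://example.com/wise<TAB>word_url_winfoid<TAB>wid001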
Example #4

import sys

import url_util

# Streaming mapper that normalizes two input formats into one
# (query, url, tag, payload) schema: wide rows are tagged "2" and carry the
# whole original record packed with "\1", while three-column rows are tagged
# "1" and carry a relevance score that must parse as a float.
for line in sys.stdin:
    vals = line.strip().split("\t")
    if len(vals) >= 12:        # this branch reads field 11, so require 12+
        tag = "2"
        query = url_util.regularize_str(vals[0])
        url = url_util.regularize_url(vals[11])
        ori_val = "\1".join(vals)
        print "%s\t%s\t%s\t%s" % (query, url, tag, ori_val)
    elif len(vals) == 3:
        tag = "1"
        string = vals[0]
        url = vals[1]
        rel_q = vals[2]
        try:
            float(rel_q)       # validate only; the raw string is re-emitted
        except ValueError:
            print >> sys.stderr, "Error! relevance q: %s is invalid." % rel_q
            continue
        print "%s\t%s\t%s\t%s" % (string, url, tag, rel_q)
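The "\1".join(vals) call packs the whole original record into the last output column using the \x01 control byte, which cannot collide with the outer tab separators. A sketch of how a downstream consumer might unpack it (line contents and variable names are illustrative):

# Split the outer record on at most three tabs, then unpack the payload.
out_line = "some query\thttp://example.com/\t2\ta\x01b\x01c"   # invented
query, url, tag, ori_val = out_line.split("\t", 3)
orig_fields = ori_val.split("\1")    # -> ["a", "b", "c"], the packed columns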
Example #5
import sys
import urllib

import url_util

# Streaming mapper with two record shapes: short rows (tag "2") pass through
# with the original record packed via "\1"; long rows (tag "1") carry an
# encoded URL plus composite score fields that are unpacked and validated.
for line in sys.stdin:
    vals = line.strip().split("\t")
    if 4 <= len(vals) <= 11:        # short format; fields 1-3 are read below
        tag = "2"
        index = vals[3]
        url = vals[1]
        query = vals[2]
        ori_val = "\1".join(vals)
        print "%s\t%s\t%s\t%s\t%s" % (index, tag, query, url, ori_val)
    elif len(vals) > 150:           # long format; field 150 is read below
        tag = "1"
        index = vals[0]
        query = url_util.regularize_str(vals[4])
        url = urllib.unquote(vals[93])
        if not url_util.is_valid_url(url):
            continue
        url = url_util.regularize_url(url)
        rig_q = vals[140].split("#")[0]      # rig score sits before the '#'
        click_q = vals[150].split("%")[3]    # click score is the 4th '%' chunk
        try:
            float(rig_q)                     # validate both scores as floats
            float(click_q)
        except ValueError:
            print >> sys.stderr, "Error! rig q: %s or click q: %s is invalid." % (rig_q, click_q)
            continue
        print "\t".join([index, tag, query, url, rig_q, click_q])
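Fields 140 and 150 are composite in the long format: the rig score sits before the first '#', and the click score is the fourth '%'-separated chunk. Two invented field values showing how the split calls pick the scores apart:

field_140 = "0.87#debug#extra"      # invented contents of vals[140]
field_150 = "a%b%c%0.42%tail"       # invented contents of vals[150]
rig_q = field_140.split("#")[0]     # -> "0.87"
click_q = field_150.split("%")[3]   # -> "0.42"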