def extract_domain(url):
    """Extract the domain from a (possibly multiply URL-encoded) url.

    Returns the domain string on success, or "-1" (with a diagnostic on
    stderr) when the url is malformed or no domain can be determined.
    """
    # Inputs may be URL-encoded several times; decode up to three layers.
    url = urllib.unquote(urllib.unquote(urllib.unquote(url)))
    if "http" not in url:
        print >> sys.stderr, "Error 4, http is not in url, invalid url: %s" % url
        return "-1"
    # Keep everything after the FIRST "http" occurrence. The original
    # split("http")[-1] kept the text after the LAST occurrence, which
    # broke urls embedding another "http" later (e.g. redirect params).
    url = url.split("http", 1)[1]
    # Remove the scheme remainder as a prefix. The original used
    # str.strip("://"), which removes a character *set* from BOTH ends
    # and therefore also ate trailing '/', ':' (and 's') characters.
    if url.startswith("://"):
        url = url[len("://"):]
    elif url.startswith("s://"):
        url = url[len("s://"):]
    else:
        print >> sys.stderr, "Error 2, error http head, invalid url : %s" % url
        return "-1"
    url = "http://" + url
    if not url_util.is_valid_url(url):
        print >> sys.stderr, "Error 1, invalid url: %s" % url
        return "-1"
    url = url_util.regularize_url(url)
    domain = Filter.domain_url(url)
    if domain == "NULL":
        print >> sys.stderr, "Error 3, domain is null, invalid url : %s" % url
        return "-1"
    return domain
import sys import url_util SOURCE = "word_url_winfoid" for line in sys.stdin: val = line.strip().split("\t") level = val[0] if level == "4": winfoid = val[1] bidword = url_util.regularize_str(val[6]) pc_url = val[13] wise_url = val[15] if pc_url != "DFT": pc_url = url_util.regularize_url(pc_url) print "%s\t%s\t%s\t%s" % (bidword, pc_url, SOURCE, winfoid) if wise_url != "DFT": wise_url = url_util.regularize_url(wise_url) print "%s\t%s\t%s\t%s" % (bidword, wise_url, SOURCE, winfoid)
import sys import url_util for line in sys.stdin: vals = line.strip().split("\t") if len(vals) >= 4: tag = "2" query = url_util.regularize_str(vals[0]) url = url_util.regularize_url(vals[11]) ori_val = "\1".join(vals) print "%s\t%s\t%s\t%s" % (query, url, tag, ori_val) elif len(vals) == 3: tag = "1" string = vals[0] url = vals[1] rel_q = vals[2] try: q_val = float(rel_q) except e: print >> sys.stderr, "Error! relevance q: %s is invalid." % rel_q continue print "%s\t%s\t%s\t%s" % (string, url, tag, rel_q)
import sys import url_util import urllib for line in sys.stdin: vals = line.strip().split("\t") if len(vals) <= 11: tag = "2" index = vals[3] url = vals[1] query = vals[2] ori_val = "\1".join(vals) print "%s\t%s\t%s\t%s\t%s" % (index, tag, query, url, ori_val) elif len(vals) > 140: tag = "1" index = vals[0] query = url_util.regularize_str(vals[4]) url = urllib.unquote(vals[93]) if not url_util.is_valid_url(url): continue url = url_util.regularize_url(url) rig_q = vals[140].split("#")[0] click_q = vals[150].split("%")[3] try: rig_q_val = float(rig_q) click_q_val = float(click_q) except: print >> sys.stderr, "Error! rig q: %s or click q: %s is invalid." % (rig_q, click_q) continue print "\t".join([index, tag, query, url, rig_q, click_q])