def load_blacklist(f):
    """Load a set of blacklisted hosts from a file with one entry per line"""
    bl = set()
    with open(f) as lines:
        for line in lines:
            line = line.strip().lower()
            host = URLUtility.get_host(line)
            print host
            bl.add(host)
    return bl

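# These snippets rely on URLUtility.get_host, which is defined elsewhere in the
# repository. A minimal sketch of the assumed behavior (reduce a url to its host
# name) is given below; the real implementation may differ in details such as
# scheme handling or www-stripping.
import urlparse

def _get_host_sketch(url):
    # Add a scheme if missing so that urlparse puts the host into netloc
    if "://" not in url:
        url = "http://" + url
    return urlparse.urlparse(url).netloc.lower()

# Example: _get_host_sketch("https://Example.com/path?q=1") returns "example.com"
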
def count(infile):
    """Print every unique host found in a JSON-lines file of pages, then the total"""
    sites = set([])
    with open(infile) as lines:
        for line in lines:
            obj = json.loads(line)
            url = obj['url']
            site = URLUtility.get_host(url)
            sites.add(site)
    for site in sites:
        print site
    print len(sites)

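# Hedged usage sketch for count(): the input is assumed to be a JSON-lines file in
# which every line carries at least a 'url' field; the sample file below is made up.
import json
import tempfile

def _demo_count():
    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False)
    for u in ["http://example.com/a", "http://example.com/b", "http://other.org/"]:
        tmp.write(json.dumps({"url": u}) + "\n")
    tmp.close()
    count(tmp.name)  # expected to print example.com and other.org, then 2
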
def __init__(self, url):
    #url = url_normalize.url_normalize(url)
    self.host = URLUtility.get_host(url)
    self.pages = []
    self.word_set = set()
    self.vsm = None
    self.jaccard = None  # Don't use this if seeds are updated
    self.cosine = None  # Don't use this if seeds are updated
    self.clf_vsm = None
    self.bstf_vsm = None
    self.bsbin_vsm = None
    self.cs_vsm = None

def export_host(indir, outfile):
    urls = ExportURL.load_urls(indir)
    uniq_hosts = set([])
    out = open(outfile, "w")
    for url in urls:
        try:
            host = URLUtility.get_host(url)
            if host not in uniq_hosts:
                uniq_hosts.add(host)
                out.write(host.encode('utf-8') + "\n")
        except:
            traceback.print_exc()
    out.close()

def run_filter(infile, outfile):
    blacklists = load_blacklist("blacklist.txt")
    out = open(outfile, "w")
    counter = {}
    with open(infile) as lines:
        for line in lines:
            url = line.strip()
            if is_filter(url, blacklists, counter):
                continue
            else:
                host = URLUtility.get_host(url)
                out.write(line)
    out.close()

def _read_ac_result_file(result_file, max_pages):
    """
    Load all sites from the result file of ACHE
    """
    count = 0
    sites = set()
    with open(result_file) as lines:
        for line in lines:
            count += 1
            url = line.split()[0]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites

def _count_relev(url2label, site2label, urls):
    relev_pages = []
    relev_sites = []
    p = 0
    s = 0
    counted_sites = set()
    for url in urls:
        site = URLUtility.get_host(url)
        if url in url2label:
            if url2label[url]:
                p += 1
            relev_pages.append(p)
            if site2label[site] and site not in counted_sites:
                s += 1
                counted_sites.add(site)
            relev_sites.append(s)
    return relev_pages, relev_sites

def _read_sf_result_file(result_file, max_pages):
    """
    Load all sites from the result file of SEEDFINDER
    """
    sites = set()
    count = 0
    with open(result_file) as lines:
        for line in lines:
            count += 1
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites

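# Assumed result-file formats, inferred only from the parsing code above (not from
# the tools' own documentation); example lines are illustrative:
#   ACHE:       whitespace-separated fields, the first field being the url
#                   http://example.com/page <other fields>
#   SeedFinder: fields separated by ", ", the last field being the url
#                   <field>, <field>, http://example.com/page
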
def main(argv):
    if len(argv) < 4:
        print "Args: [Topic] [All HTML Directory] [Candidate Directory] [Output Directory]"
        print "[Topic]: topic from schema.org"
        print "[All HTML Directory]: Directory that contains collected pages in JSON format"
        print "[Candidate Directory]: Directory that contains candidate pages"
        print "[Output Directory]: Empty directory - if it does not exist, it will be created automatically"
        sys.exit(1)
    topic = argv[0]
    indir = argv[1]  # Directory that contains all collected pages
    posdir = argv[2]  # Directory that contains candidate pages
    outdir = argv[3]  # Output
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    pos_urls = ExportURL.load_urls(posdir)
    pos_sites = set([])
    for url in pos_urls:
        site = URLUtility.get_host(url)
        pos_sites.add(site)
    print "Number of candidate sites: " + str(len(pos_sites))
    pattern = generate_pattern(topic)

    PROCESS_NUMBER = cpu_count() - 2
    if len(argv) == 5:
        PROCESS_NUMBER = int(argv[4])
    jobs = []
    queues = []
    files = os.listdir(indir)
    # Distribute the input files round-robin over the worker processes
    for i in range(PROCESS_NUMBER):
        q = []
        for j in range(i, len(files), PROCESS_NUMBER):
            q.append(files[j])
        queues.append(q)
    for q in queues:
        p = Process(target=select_negative, args=(q, indir, outdir, pattern, pos_sites))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()

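# Hedged example invocation (the script name and paths are hypothetical):
#   python select_negative.py Event data/all_pages data/candidates data/negative 8
# where 'Event' stands for a schema.org topic and the trailing 8 overrides the
# number of worker processes (default: cpu_count() - 2).
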
def search_site(url_file, out_file, keyword):
    """
    Write results as json line objects into out_file
    Format of each json object:
        list<str>: the site host, the query url, then the search-result urls
    """
    urls = URLUtility.load_urls(url_file)
    site2urls = read_json(out_file)  # Previously written results, used to skip sites already searched
    k = 10
    out = open(out_file, "a+")
    for i, url in enumerate(urls):
        site = URLUtility.get_host(url)
        if site not in site2urls:
            results = bing_search.search_site(keyword, url, k)
            results = [site, url] + results
            json.dump(results, out)
            out.write("\n")
    out.close()

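# Example of a single output line written by search_site (urls are illustrative):
#   ["example.com", "http://example.com/", "http://example.com/about", "http://example.com/blog"]
# i.e. the host, the query url, then the urls returned by the site-restricted Bing search.
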
def select_negative(filenames, indir, outdir, pattern, pos_sites):
    for f in filenames:
        infile = indir + "/" + f
        outfile = outdir + "/" + f
        out = open(outfile, "w")
        with open(infile) as lines:
            for line in lines:
                try:
                    data = json.loads(line)
                    url = data['url']
                    html = data['text']
                    site = URLUtility.get_host(url)
                    if site in pos_sites:
                        match = pattern.search(html)
                        if match is None:
                            out.write(line)
                except:
                    traceback.print_exc()
                    print "Error processing " + url
        out.close()

def _read_relev_file(clf_file):
    """
    Load all sites from the classification file
    Note that all classification files from different discovery tools have the same format
    """
    sites = set()
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ''.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label != -1 and label != 1:
                    print "Parsed label is incorrect"
                if label == 1:
                    sites.add(site)
            except:
                traceback.print_exc()
    return sites

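# Assumed classification-file format, inferred from the parsing above: one
# "<url>,<label>" record per line, where the label is 1 (relevant) or -1 (irrelevant), e.g.:
#   http://example.com/page,1
# Note that ''.join(values[:-1]) silently drops any commas occurring inside the url itself.
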
def select_subset(self, urls):
    """
    Each page might contain thousands of external urls which pollute the results,
    so we only keep a fixed number of links from each page
    How this works:
        - Pick one url from each site
        - If not yet reaching max, select random urls
    Returns:
        - list of urls
    """
    if len(urls) <= self.max_urls:
        return urls
    results = []
    """
    cur = urls
    while len(results) < self.max_urls:
        sites = set()
        next = []
        for url in cur:
            site = URLUtility.get_host(url)
            if site not in sites:
                sites.add(site)
                results.append(url)
            else:
                next.append(url)
            if len(results) == self.max_urls:
                break
        cur = next
    """
    # Note: only the one-url-per-site step is implemented below; the docstring's
    # random fill-up step is not performed.
    sites = set()
    for url in urls:
        site = URLUtility.get_host(url)
        if site not in sites:
            sites.add(site)
            results.append(url)
        if len(results) == self.max_urls:
            break
    return results

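# Hedged usage sketch for select_subset (values are illustrative): with
# self.max_urls = 2, the call
#   select_subset(["http://a.com/1", "http://a.com/2", "http://b.com/1"])
# keeps at most one url per host until the cap is reached and returns
#   ["http://a.com/1", "http://b.com/1"]
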
def _read_clf_file(clf_file):
    url2label = {}
    site2label = {}
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ''.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label > 0:
                    url2label[url] = True
                    site2label[site] = True
                else:
                    url2label[url] = False
                    if site not in site2label:
                        site2label[site] = False
            except:
                print line
    return url2label, site2label

def is_filter(url, blacklists, counter):
    # blacklists contains a list of hosts
    host = URLUtility.get_host(url)
    ext = host.strip("/").split(".")[-1]
    if ext in ext_blacklists:
        return True
    if "?" in host:
        return True
    if "=" in host:
        return True
    # Hosts that contribute too many urls get added to the blacklist
    max_count = 1000
    if host in counter:
        counter[host] += 1
        if counter[host] >= max_count:
            blacklists.add(host)
    else:
        counter[host] = 1
    if host in blacklists:
        return True
    else:
        return False

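# Hedged usage sketch for is_filter; it assumes the module defines ext_blacklists as
# a set of unwanted trailing host tokens. The calls below are illustrative only:
#   blacklists = load_blacklist("blacklist.txt")
#   counter = {}
#   is_filter("http://example.com/page", blacklists, counter)   # False for a clean host
#   # once 1000 or more urls have been seen from example.com, the host is added to
#   # blacklists and every later call for it returns True
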
def get_host(self):
    return URLUtility.get_host(self.url)

def evaluate_ranking(seed_file, candidate_file, negative_file, data_dir, rankings,
                     max_cand, representation, test_ratio, online, selection=None,
                     max_pages=1, prf=False, seednumbs=None):
    """
    test_ratio: fraction of the seed urls that is split off as test urls
    """
    t = time.time()
    seed_urls = URLUtility.load_urls(seed_file)
    cand_urls = URLUtility.load_urls(candidate_file)
    neg_urls = URLUtility.load_urls(negative_file)

    # Split train and test urls
    split = int((1 - test_ratio) * len(seed_urls))
    test_urls = seed_urls[split:]
    train_urls = seed_urls[:split]

    # Fetch the train, test and candidate sites
    print "Loading the cache"
    fetcher = Fetcher(data_dir)
    if selection == "mix":
        # This is to demonstrate the (so far) ineffectiveness of the multi-page representation
        train_selection = test_selection = "search"
        cand_selection = "random"
    else:
        train_selection = test_selection = cand_selection = selection
    print "\nFetching train sites"
    train_sites = fetcher.fetch_sites(train_urls, max_pages, train_selection, online)
    print "Time to fetch train sites: ", time.time() - t
    t = time.time()

    if seednumbs:
        seednumbs = get_seednumbs(seednumbs[0], len(train_sites), seednumbs[1])
    else:
        seednumbs = [len(train_sites)]
    print "seednumbs", seednumbs
    for seednumb in seednumbs:
        train_sites = train_sites[:seednumb + 1]
        #for s in train_sites:
        #    for p in s:
        #        print p.get_url()
        print "\nFetching cand sites"
        cand_sites = fetcher.fetch_sites(cand_urls, max_pages, cand_selection, online)
        print "\nFetching test sites"
        test_sites = fetcher.fetch_sites(test_urls, max_pages, test_selection, online)
        print "\nFetching negative sites"
        neg_sites = fetcher.fetch_sites(neg_urls, 1, None, online)
        print "Time to fetch cand, test, neg sites: ", time.time() - t

        cand_sites = cand_sites[:max_cand]
        max_cand -= len(test_sites)
        cand_sites.extend(test_sites)
        print "Number of seed sites: ", len(train_sites)
        print "Number of test sites: ", len(test_sites)
        print "Number of candidate sites: ", len(cand_sites)
        print "Ranking methods: ", rankings
        if online:
            print "Running online mode"
        else:
            print "Running offline mode"

        # Initialize the ranking models
        for ranking in rankings:
            # Train
            print "Ranking..."
            t = time.time()
            ranker = Ranker(copy.deepcopy(train_sites), representation, ranking,
                            neg_sites)  # train_sites might be changed in the object initialization
            print "Time to initialize ranker: ", time.time() - t
            t = time.time()
            top_sites = ranker.rank(cand_sites, prf)
            print "Time to rank: ", time.time() - t

            # Evaluate
            print "Evaluating ranking results"
            site2rank = {}
            site2website = {}
            for i, site_score in enumerate(top_sites):
                site = site_score[0].get_host()
                if site not in site2rank:
                    site2rank[site] = i
                    site2website[site] = site_score[0]
            test_scores = []
            #test_count = 0
            for url in test_urls:
                site = URLUtility.get_host(url)
                if site in site2rank:
                    #test_count += 1
                    print site, site2rank[site]
                    print [p.get_url() for p in site2website[site]]
                    test_scores.append(site2rank[site])
            test_scores = sorted(test_scores)
            mean = sum(test_scores) / float(len(test_scores))
            mean = round(mean, 2)
            median = test_scores[(len(test_scores) - 1) / 2]
            #prec_at_k = round(len([s for s in test_scores if s<=len(test_urls)])/float(test_count), 4)*100
            prec_at_k = round(
                len([s for s in test_scores if s < len(test_scores)]) / float(len(test_scores)), 4) * 100
            precs = compute_prec(test_scores)
            print "RESULTS_SEEDNUMB", len(train_sites)
            print "RESULTS_RAW," + ranking + ',' + ','.join([str(s) for s in test_scores])
            print "RESULTS_AGGREGATION," + ranking + ',' + str(mean) + ',' + str(median) + ',' + str(prec_at_k)
            print "RESULTS_PRECS", ranking + ',' + ','.join([str(p) for p in precs])

            # Debug: print the top 20 sites
            print "Top 20 urls: "
            for item in top_sites[:20]:
                print item[0].get_host(), item[1]
                print [p.get_url() for p in item[0]]

            # Clear the pre-computed vectorization from previous runs
            clear(train_sites)
            clear(cand_sites)
            clear(test_sites)
            clear(neg_sites)