def load_blacklist(f):
    bl = set()
    with open(f) as lines:
        for line in lines:
            line = line.strip().lower()
            host = URLUtility.get_host(line)
            print host
            bl.add(host)
    return bl
def count(infile):
    sites = set([])
    with open(infile) as lines:
        for line in lines:
            obj = json.loads(line)
            url = obj['url']
            site = URLUtility.get_host(url)
            sites.add(site)
    for site in sites:
        print site
    print len(sites)
Example #3
    def __init__(self, url):
        #url = url_normalize.url_normalize(url)
        self.host = URLUtility.get_host(url)
        self.pages = []
        self.word_set = set()
        self.vsm = None
        self.jaccard = None  # Don't use this if seeds are updated
        self.cosine = None  # Don't use this if seeds are updated
        self.clf_vsm = None
        self.bstf_vsm = None
        self.bsbin_vsm = None
        self.cs_vsm = None
def export_host(indir, outfile):
    urls = ExportURL.load_urls(indir)
    uniq_hosts = set([])
    out = open(outfile, "w")
    for url in urls:
        try:
            host = URLUtility.get_host(url)
            if host not in uniq_hosts:
                uniq_hosts.add(host)
                out.write(host.encode('utf-8') + "\n")
        except:
            traceback.print_exc()
    out.close()
def run_filter(infile, outfile):
    blacklists = load_blacklist("blacklist.txt")
    out = open(outfile, "w")
    counter = {}
    with open(infile) as lines:
        for line in lines:
            url = line.strip()
            if is_filter(url, blacklists, counter):
                continue
            else:
                host = URLUtility.get_host(url)

                out.write(line)
    out.close()
Example #6
def _read_ac_result_file(result_file, max_pages):
    """
    Load all sites from the result file of ACHE 
    """
    count = 0
    sites = set()
    with open(result_file) as lines:
        for line in lines:
            count += 1
            url = line.split()[0]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
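# Usage sketch (not from the original source): the reader above only assumes
# that the url is the first whitespace-separated token on each line; the
# trailing fields in this hypothetical ACHE output are made up for illustration.
def _demo_read_ac_result_file():
    with open("ache_results.txt", "w") as out:  # hypothetical input file
        out.write("http://example.com/page1 0.91 1488239471\n")
        out.write("http://example.com/page2 0.87 1488239472\n")
        out.write("http://another.org/index 0.42 1488239473\n")
    sites = _read_ac_result_file("ache_results.txt", max_pages=100)
    print sites  # expected: the hosts of example.com and another.org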
Example #7
def _count_relev(url2label, site2label, urls):
    """
    Compute running counts of relevant pages and of distinct relevant sites
    over the labeled urls, in the order they are given
    """
    relev_pages = []
    relev_sites = []
    p = 0
    s = 0
    counted_sites = set()
    for url in urls:
        site = URLUtility.get_host(url)
        if url in url2label:
            if url2label[url]:
                p += 1
            relev_pages.append(p)
            if site2label[site] and site not in counted_sites:
                s += 1
                counted_sites.add(site)
            relev_sites.append(s)
    return relev_pages, relev_sites
Example #8
def _read_sf_result_file(result_file, max_pages):
    """
    Load all sites from the result file of SEEDFINDER 
    """
    sites = set()
    count = 0
    with open(result_file) as lines:
        for line in lines:
            count += 1
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
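# Usage sketch (assumption, not from the original source): a SEEDFINDER result
# line is ", "-separated with the url in the last field, for example
#   query terms, 42, http://example.com/page
# so only values[-1] is parsed. A hypothetical comparison of the two readers:
#   ac_sites = _read_ac_result_file("ache_results.txt", 1000)
#   sf_sites = _read_sf_result_file("seedfinder_results.txt", 1000)
#   print len(ac_sites & sf_sites)  # sites discovered by both tools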
def main(argv):
    if len(argv) < 4:
        print "Args: [Topic] [All HTML Directory] [Candidate Directory] [Output Directory]"
        print "[Topic]: topic from schema.org"
        print "[All HTML Directory]: Directory that contains collected pages in JSON format"
        print "[Candidate Directory]: Directory that contains candidate pages"
        print "[Output Directory]: Empty directory - if it does not exist, it will be created automatically"
        sys.exit(1)

    topic = argv[0]
    indir = argv[1]  #Directory that contains all collected pages
    posdir = argv[2]  #Directory that contains candidate pages
    outdir = argv[3]  #Output
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    pos_urls = ExportURL.load_urls(posdir)
    pos_sites = set([])
    for url in pos_urls:
        site = URLUtility.get_host(url)
        pos_sites.add(site)

    print "Number of candidate sites: " + str(len(pos_sites))

    pattern = generate_pattern(topic)

    PROCESS_NUMBER = max(1, cpu_count() - 2)
    if len(argv) == 5:
        PROCESS_NUMBER = int(argv[4])
    jobs = []
    queues = []

    files = os.listdir(indir)
    for i in range(PROCESS_NUMBER):
        q = []
        for j in range(i, len(files), PROCESS_NUMBER):
            q.append(files[j])
        queues.append(q)
    for q in queues:
        p = Process(target=select_negative,
                    args=(q, indir, outdir, pattern, pos_sites))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
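# Invocation sketch (the script name, topic and directory names below are
# hypothetical; main is assumed to receive sys.argv[1:]):
#   python select_negative.py Hotel data/all_html data/candidates data/negative 8
# which corresponds to
#   main(["Hotel", "data/all_html", "data/candidates", "data/negative", "8"])
# where the optional fifth argument overrides the number of worker processes.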
Example #10
def search_site(url_file, out_file, keyword):
    """
    Write results as json lines into out_file
    Format of each json object:
        list<str>: list of urls. The first element is the site (host), the second is the query url
    """
    urls = URLUtility.load_urls(url_file)
    site2urls = read_json(out_file)  # Skip sites that already have results
    k = 10

    out = open(out_file, "a+")
    for i, url in enumerate(urls):
        site = URLUtility.get_host(url)
        if site not in site2urls:
            results = bing_search.search_site(keyword, url, k)
            results = [site, url] + results
            json.dump(results, out)
            out.write("\n")
    out.close()
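# Usage sketch (not from the original source): each output line is a JSON array
# whose first element is the site (host), the second the query url, and the
# rest the urls returned by bing_search. A hypothetical reader for that file:
import json

def _load_search_results(out_file):
    # Map site -> list of search result urls, dropping the [site, url] prefix
    site2results = {}
    with open(out_file) as lines:
        for line in lines:
            values = json.loads(line)
            site2results[values[0]] = values[2:]
    return site2results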
def select_negative(filenames, indir, outdir, pattern, pos_sites):
    for f in filenames:
        infile = indir + "/" + f
        outfile = outdir + "/" + f
        out = open(outfile, "w")
        with open(infile) as lines:
            for line in lines:
                try:
                    data = json.loads(line)
                    url = data['url']
                    html = data['text']
                    site = URLUtility.get_host(url)
                    if site in pos_sites:
                        match = pattern.search(html)
                        if match is None:
                            out.write(line)
                except:
                    traceback.print_exc()
                    print "Error processing " + line.strip()
        out.close()
Example #12
def _read_relev_file(clf_file):
    """
    Load all sites from the classification file
    Note that all classification files from different discovery tools have the same format
    """
    sites = set()
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ",".join(values[:-1])  # rejoin in case the url itself contains commas
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label != -1 and label != 1:
                    print "Parsed label is incorrect"
                if label == 1:
                    sites.add(site)
            except:
                traceback.print_exc()
    return sites
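# Usage sketch (assumption): each line of a classification file is "url,label"
# with label 1 (relevant) or -1 (irrelevant), for example
#   http://example.com/page,1
#   http://another.org/index,-1
# so _read_relev_file returns only the hosts of positively labeled urls:
#   relev_sites = _read_relev_file("ache_classification.csv")  # hypothetical path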
Example #13
    def select_subset(self, urls):
        """
        Each page might contain thousand of external urls which pollute the results, so we only keep a fixed number of links from each page
        How this works:
            - Pick one url in each site  
            - If not yet reaching max, select random urls
        Returns:
            - list of urls
        """
        if len(urls) <= self.max_urls:
            return urls

        results = []
        """
        cur = urls
        while len(results)<self.max_urls:
            sites = set()
            next = []
            for url in cur: 
                site = URLUtility.get_host(url)
                if site not in sites:
                    sites.add(site)
                    results.append(url)
                else:
                    next.append(url)
                if len(results) == self.max_urls:
                    break
            cur = next
        """
        sites = set()
        for url in urls:
            site = URLUtility.get_host(url)
            if site not in sites:
                sites.add(site)
                results.append(url)
            if len(results) == self.max_urls:
                break

        return results
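    # Usage sketch (assumption: `selector` is an instance of this class with
    # max_urls == 2; the urls and hosts are hypothetical). Only one url per
    # host is kept until the cap is reached:
    #   selector.select_subset(["http://a.com/1", "http://a.com/2", "http://b.org/1"])
    #   -> ["http://a.com/1", "http://b.org/1"]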
Example #14
def _read_clf_file(clf_file):
    url2label = {}
    site2label = {}

    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ",".join(values[:-1])  # rejoin in case the url itself contains commas
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)

                if label > 0:
                    url2label[url] = True
                    site2label[site] = True
                else:
                    url2label[url] = False
                    if site not in site2label:
                        site2label[site] = False
            except:
                print line
    return url2label, site2label
def is_filter(url, blacklists, counter):
    #blacklists contains list of hosts
    host = URLUtility.get_host(url) 
    ext = host.strip("/").split(".")[-1] 
    if ext in ext_blacklists:
        return True

    if "?" in host:
        return True
    if "=" in host:
        return True

    max_count = 1000
    if host in counter:
        counter[host] += 1
        if counter[host] >= max_count:
            blacklists.add(host)
    else:
        counter[host] = 1

    return host in blacklists
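# Usage sketch (assumption: ext_blacklists is a module-level set of banned host
# extensions, i.e. the last dot-separated token of the host):
#   blacklists = load_blacklist("blacklist.txt")
#   counter = {}
#   for url in ["http://example.com/a", "http://example.com/b"]:
#       if not is_filter(url, blacklists, counter):
#           print url  # url passed the host-level filters
# A host seen 1000 or more times is added to blacklists on the fly, so very
# large sites stop contributing urls once they cross the threshold.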
Example #16
    def get_host(self):
        return URLUtility.get_host(self.url)
Example #17
def evaluate_ranking(seed_file,
                     candidate_file,
                     negative_file,
                     data_dir,
                     rankings,
                     max_cand,
                     representation,
                     test_ratio,
                     online,
                     selection=None,
                     max_pages=1,
                     prf=False,
                     seednumbs=None):
    """
    test_ratio: fraction of the seed urls split off as test urls
    """
    t = time.time()
    seed_urls = URLUtility.load_urls(seed_file)
    cand_urls = URLUtility.load_urls(candidate_file)
    neg_urls = URLUtility.load_urls(negative_file)

    # Split train and test urls
    split = int((1 - test_ratio) * len(seed_urls))
    test_urls = seed_urls[split:]
    train_urls = seed_urls[:split]

    # Fetch the train, test and candidate sites
    print "Loading the cache"
    fetcher = Fetcher(data_dir)
    if selection == "mix":
        # This is to prove the yet ineffectiveness of multipages representation
        train_selection = test_selection = "search"
        cand_selection = "random"
    else:
        train_selection = test_selection = cand_selection = selection

    print "\nFetching train sites"
    train_sites = fetcher.fetch_sites(train_urls, max_pages, train_selection,
                                      online)

    print "Time to fetch train sites: ", time.time() - t
    t = time.time()

    if seednumbs:
        seednumbs = get_seednumbs(seednumbs[0], len(train_sites), seednumbs[1])
    else:
        seednumbs = [len(train_sites)]
    print "seednumbs", seednumbs
    for seednumb in seednumbs:
        train_sites = train_sites[:seednumb + 1]
        #for s in train_sites:
        #    for p in s:
        #        print p.get_url()
        print "\nFetching cand sites"
        cand_sites = fetcher.fetch_sites(cand_urls, max_pages, cand_selection,
                                         online)
        print "\nFetching test sites"
        test_sites = fetcher.fetch_sites(test_urls, max_pages, test_selection,
                                         online)
        print "\nFetching negative sites"
        neg_sites = fetcher.fetch_sites(neg_urls, 1, None, online)
        print "Time to fetch cand, test, neg sites: ", time.time() - t

        cand_sites = cand_sites[:max_cand]
        max_cand -= len(test_sites)
        cand_sites.extend(test_sites)
        print "Number of seed sites: ", len(train_sites)
        print "Number of test sites: ", len(test_sites)
        print "Number of candidate sites: ", len(cand_sites)
        print "Ranking methods: ", rankings
        if online:
            print "Running online mode"
        else:
            print "Running offline mode"

        # Initialize the ranking models
        for ranking in rankings:
            # Train
            print "Ranking..."
            t = time.time()
            ranker = Ranker(
                copy.deepcopy(train_sites), representation, ranking, neg_sites
            )  # train_sites might be changed in the object initialization
            print "Time to initialize ranker: ", time.time() - t
            t = time.time()
            top_sites = ranker.rank(cand_sites, prf)
            print "Time to rank: ", time.time() - t

            # Evaluate
            print "Evaluating ranking results"
            site2rank = {}
            site2website = {}
            for i, site_score in enumerate(top_sites):
                site = site_score[0].get_host()
                if site not in site2rank:
                    site2rank[site] = i
                    site2website[site] = site_score[0]
            test_scores = []
            #test_count = 0
            for url in test_urls:
                site = URLUtility.get_host(url)
                if site in site2rank:
                    #test_count += 1
                    print site, site2rank[site]
                    print [p.get_url() for p in site2website[site]]
                    test_scores.append(site2rank[site])
            test_scores = sorted(test_scores)
            mean = sum(test_scores) / float(len(test_scores))
            mean = round(mean, 2)
            median = test_scores[(len(test_scores) - 1) / 2]
            #prec_at_k = round(len([s for s in test_scores if s<=len(test_urls)])/float(test_count), 4)*100
            prec_at_k = round(
                len([s for s in test_scores if s < len(test_scores)]) /
                float(len(test_scores)), 4) * 100
            precs = compute_prec(test_scores)
            print "RESULTS_SEEDNUMB", len(train_sites)
            print "RESULTS_RAW," + ranking + ',' + ','.join(
                [str(s) for s in test_scores])
            print "RESULTS_AGGREGATION," + ranking + ',' + str(
                mean) + ',' + str(median) + ',' + str(prec_at_k)
            print "RESULTS_PRECS", ranking + ',' + ','.join(
                [str(p) for p in precs])

            # Debug: print the top 20 urls
            print "Top 20 urls: "
            for item in top_sites[:20]:
                print item[0].get_host(), item[1]
                print [p.get_url() for p in item[0]]

            # Clear the pre-computed vectorization from previous runs
            clear(train_sites)
            clear(cand_sites)
            clear(test_sites)
            clear(neg_sites)
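# Invocation sketch (the file names, ranking names and representation below are
# hypothetical; Fetcher, Ranker and URLUtility come from the surrounding project):
#   evaluate_ranking("seeds.txt", "candidates.txt", "negative.txt", "data/cache",
#                    rankings=["cosine", "jaccard"], max_cand=5000,
#                    representation="bow", test_ratio=0.2, online=False,
#                    selection="search", max_pages=5)
# The held-out test seeds are mixed into the candidate pool, and each ranking
# method is scored by where those known-relevant sites appear in the ranked list.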