Example No. 1
def test():
    fetcher = Fetcher("test/fetcher_test_data")
    urls = URLUtility.load_urls("test/data/urls.txt")
    sites = fetcher.fetch(urls)
    for site in sites:
        for page in site:
            print page.get_text('body')[:100].replace("\n", "")
Example No. 2
    def __init__(self, seed_file, result_file, data_dir):
        """
        Args:
            seed_file: contains the list of seed urls
            data_dir: stores crawled data
            result_file: stores urls and their scores
        """
        self.train_urls = URLUtility.load_urls(seed_file)
        self.fetcher = Fetcher(
            data_dir, None, False
        )  # Note: Fetcher contains Bing Search but does not use it (only for website ranking evaluation)
        self.result_file = result_file
        self.ranked_result_file = result_file + ".rank"
        self.searcher = Search_APIs(data_dir, self.fetcher)
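
A minimal usage sketch for the constructor above, assuming the enclosing class is named SeedSearcher (the class name is not shown in this snippet) and using the run method that appears later in this listing; the file paths and the "similarity" ranking value are placeholders:

# SeedSearcher is a placeholder name for the class that owns __init__ above
finder = SeedSearcher("seeds/gun_seeds.txt", "results/gun_scores.txt", "crawl_data")
finder.run(ranking="similarity", searchop="kw", seed_keyword="gun")  # "similarity" is a placeholder ranking name
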
Example No. 3
def search_site(url_file, out_file, keyword):
    """
    Write results as json line objects into out_file
    Format of each json object:
        list<str>: the site host, the main url, then the search result urls
    """
    urls = URLUtility.load_urls(url_file)
    site2urls = read_json(out_file)  # results from previous runs, used to skip finished sites
    k = 10  # maximum number of search results per site

    out = open(out_file, "a+")
    for url in urls:
        site = URLUtility.get_host(url)
        if site not in site2urls:
            results = bing_search.search_site(keyword, url, k)
            results = [site, url] + results
            json.dump(results, out)
            out.write("\n")
    out.close()
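
search_site relies on a read_json helper that is referenced above but not shown; a minimal sketch of a compatible helper, assuming the JSON-lines format written by the loop above (one JSON list per line whose first element is the site host):

import json

def read_json(out_file):
    """Load results from previous runs so finished sites can be skipped.

    Assumption: each line is a JSON list whose first element is the site host.
    """
    site2urls = {}
    try:
        with open(out_file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                results = json.loads(line)
                site2urls[results[0]] = results
    except IOError:  # the output file may not exist yet on the first run
        pass
    return site2urls
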
Example No. 4
                    page = Page(url)
                    if len(res.text) < self.max_html_size:
                        page.add_html(res.text)
                        if extraction:
                            jspage = page.get_json_obj()
                        else:
                            jspage = {'url': url, 'html': res.text}
                        out.write(json.dumps(jspage) + '\n')
                else:
                    print res.status_code, url
            except:
                print "Failed to fetch ", url
                traceback.print_exc()

        out.close()


def test():
    fetcher = Fetcher()
    urls = ['http://nyu.edu', 'http://mit.edu']
    out_file = 'test_fetcher.json'
    fetcher.fetch(urls, out_file)


if __name__ == "__main__":
    url_file = sys.argv[1]
    out_file = sys.argv[2]
    urls = URLUtility.load_urls(url_file)
    fetcher = Fetcher()
    fetcher.fetch(urls, out_file, True)
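
Fetcher.fetch above writes one JSON object per line; a small sketch of reading that file back, assuming the non-extraction format {'url': ..., 'html': ...} shown in the snippet (the helper name load_fetched_pages is made up):

import json

def load_fetched_pages(out_file):
    """Yield (url, html) pairs from a JSON-lines file written by Fetcher.fetch."""
    with open(out_file) as f:
        for line in f:
            obj = json.loads(line)
            yield obj.get('url'), obj.get('html')
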
Example No. 5
def evaluate_ranking(seed_file,
                     candidate_file,
                     negative_file,
                     data_dir,
                     rankings,
                     max_cand,
                     representation,
                     test_ratio,
                     online,
                     selection=None,
                     max_pages=1,
                     prf=False,
                     seednumbs=None):
    """
    test_ratio: fraction of the seed urls held out as test urls
    """
    t = time.time()
    seed_urls = URLUtility.load_urls(seed_file)
    cand_urls = URLUtility.load_urls(candidate_file)
    neg_urls = URLUtility.load_urls(negative_file)

    # Split train and test urls
    split = int((1 - test_ratio) * len(seed_urls))
    test_urls = seed_urls[split:]
    train_urls = seed_urls[:split]

    # Fetch the train, test and candidate sites
    print "Loading the cache"
    fetcher = Fetcher(data_dir)
    if selection == "mix":
        # This is to demonstrate that the multi-page representation is not yet effective
        train_selection = test_selection = "search"
        cand_selection = "random"
    else:
        train_selection = test_selection = cand_selection = selection

    print "\nFetching train sites"
    train_sites = fetcher.fetch_sites(train_urls, max_pages, train_selection,
                                      online)

    print "Time to fetch train sites: ", time.time() - t
    t = time.time()

    if seednumbs:
        seednumbs = get_seednumbs(seednumbs[0], len(train_sites), seednumbs[1])
    else:
        seednumbs = [len(train_sites)]
    print "seednumbs", seednumbs
    for seednumb in seednumbs:
        train_sites = train_sites[:seednumb + 1]
        #for s in train_sites:
        #    for p in s:
        #        print p.get_url()
        print "\nFetching cand sites"
        cand_sites = fetcher.fetch_sites(cand_urls, max_pages, cand_selection,
                                         online)
        print "\nFetching test sites"
        test_sites = fetcher.fetch_sites(test_urls, max_pages, test_selection,
                                         online)
        print "\nFetching negative sites"
        neg_sites = fetcher.fetch_sites(neg_urls, 1, None, online)
        print "Time to fetch cand, test, neg sites: ", time.time() - t

        # Keep the total number of candidates at max_cand, including the held-out test sites
        cand_sites = cand_sites[:max_cand - len(test_sites)]
        cand_sites.extend(test_sites)
        print "Number of seed sites: ", len(train_sites)
        print "Number of test sites: ", len(test_sites)
        print "Number of candidate sites: ", len(cand_sites)
        print "Ranking methods: ", rankings
        if online:
            print "Running online mode"
        else:
            print "Running offline mode"

        # Initialize the ranking models
        for ranking in rankings:
            # Train
            print "Ranking..."
            t = time.time()
            ranker = Ranker(
                copy.deepcopy(train_sites), representation, ranking, neg_sites
            )  # train_sites might be changed in the object initialization
            print "Time to initialize ranker: ", time.time() - t
            t = time.time()
            top_sites = ranker.rank(cand_sites, prf)
            print "Time to rank: ", time.time() - t

            # Evaluate
            print "Evaluating ranking results"
            site2rank = {}
            site2website = {}
            for i, site_score in enumerate(top_sites):
                site = site_score[0].get_host()
                if site not in site2rank:
                    site2rank[site] = i
                    site2website[site] = site_score[0]
            test_scores = []
            #test_count = 0
            for url in test_urls:
                site = URLUtility.get_host(url)
                if site in site2rank:
                    #test_count += 1
                    print site, site2rank[site]
                    print [p.get_url() for p in site2website[site]]
                    test_scores.append(site2rank[site])
            test_scores = sorted(test_scores)
            mean = sum(test_scores) / float(len(test_scores))
            mean = round(mean, 2)
            median = test_scores[(len(test_scores) - 1) / 2]
            #prec_at_k = round(len([s for s in test_scores if s<=len(test_urls)])/float(test_count), 4)*100
            prec_at_k = round(
                len([s for s in test_scores if s < len(test_scores)]) /
                float(len(test_scores)), 4) * 100
            precs = compute_prec(test_scores)
            print "RESULTS_SEEDNUMB", len(train_sites)
            print "RESULTS_RAW," + ranking + ',' + ','.join(
                [str(s) for s in test_scores])
            print "RESULTS_AGGREGATION," + ranking + ',' + str(
                mean) + ',' + str(median) + ',' + str(prec_at_k)
            print "RESULTS_PRECS", ranking + ',' + ','.join(
                [str(p) for p in precs])

            # Debug: print the top 20 ranked urls
            print "Top 20 urls: "
            for item in top_sites[:20]:
                print item[0].get_host(), item[1]
                print [p.get_url() for p in item[0]]

            # Clear the pre-computed vectorization from previous runs
            clear(train_sites)
            clear(cand_sites)
            clear(test_sites)
            clear(neg_sites)
    counter = Counter()
    for site in sites:
        for p in site:
            text = p.get_text('meta')
            text = URLUtility.clean_text(text)
            words = word_tokenize(text)
            words = [
                word for word in words if word not in stop and len(word) > 2
            ]
            counter += Counter(words)

    # Get the top-k words
    counter = [(counter[w], w) for w in counter
               if counter[w] > 1]  # convert to a list of (count, word) pairs
    heapq.heapify(counter)
    topk = heapq.nlargest(k, counter)
    print "Top extracted keywords: ", topk
    return [w[1] for w in topk]
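
The keyword extraction above counts tokens with a Counter and keeps the top-k (count, word) pairs with heapq; a self-contained toy illustration of the same pattern (the word list and k=2 are made up):

import heapq
from collections import Counter

words = ["gun", "rifle", "gun", "ammo", "ammo", "gun", "scope"]
counter = Counter(words)
pairs = [(counter[w], w) for w in counter if counter[w] > 1]  # keep words seen more than once
topk = heapq.nlargest(2, pairs)  # [(3, 'gun'), (2, 'ammo')]
keywords = [w for _, w in topk]  # ['gun', 'ammo']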


def make_output_filename(data_dir, seed_file):
    seed_filename = seed_file.split("/")[-1].split(".")[0]
    return data_dir + "/" + seed_filename + "_candidates.txt"
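
A quick usage check for make_output_filename above (the example paths are placeholders):

assert make_output_filename("crawl_data", "seeds/gun_seeds.txt") == "crawl_data/gun_seeds_candidates.txt"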


if __name__ == "__main__":
    seed_file = sys.argv[1]
    data_dir = sys.argv[2]
    seed_urls = URLUtility.load_urls(seed_file)
    collect_candidates(seed_urls, data_dir)
    def run_mix_search(self,
                       ranking,
                       selection=None,
                       online=True,
                       max_results=50,
                       seed_keyword="gun",
                       search="kw",
                       iters=5,
                       representation='body',
                       negative_file=None):
        """
        seed_sites: sites that will be used for the next search
        selected_urls: urls that have already been used for search

        Only top-ranked urls will become seed urls

        Important Args:
            ranking: a ranking method
            max_results: Maximum number of results to return in related and keyword search
        """
        max_pages = 1  # Always use single page to represent a website
        train_sites = self.fetcher.fetch_sites(self.train_urls, max_pages,
                                               selection, online)

        if negative_file:  # (random) reliably negative examples
            neg_urls = URLUtility.load_urls(negative_file)
            neg_urls = neg_urls[:200]
        else:
            neg_urls = []
        print "neg_urls: ", len(neg_urls)
        neg_sites = self.fetcher.fetch_sites(neg_urls, 1, None, online)
        ranker = Ranker(train_sites, representation, ranking, neg_sites)

        # Data
        scores = []  # Avoid exception when iters=0
        #seed_sites = self.train_urls # topk urls from each search batch
        seed_sites = train_sites  # topk urls from each search batch
        selected_urls = {}  # avoid searching with these urls again
        selected_urls['kw'] = set()
        selected_urls['bl'] = set()
        selected_urls['rl'] = set()
        selected_urls['fw'] = set()
        results = []  # Search results for ranking
        urls = set()  # Avoid fetch and rank these urls again
        sites = set()  # used to compute reward

        # Hyperparameters
        #max_numb_pages = 12000 # stop condition
        max_numb_pages = 51000  # stop condition
        #iters = 500
        iters = 2000  # Note: overrides the iters argument
        k = 20  # number of newly discovered pages to add to the seed list
        max_kw = 20  # maximum number of keywords to select from the seed pages
        self.searcher.set_max_keywords(max_kw)

        # Initialize the search operator selection strategy
        count = {}  # Count the number of results yielded by each search operator
        count['bl'] = count['kw'] = count['rl'] = count['fw'] = 0
        count['bl'] = 20000  # set high so backlink search is effectively never chosen
        #ucb = UCB1(['rl', 'bl', 'kw'])
        ucb = UCB1(['rl', 'bl', 'kw', 'fw'])

        site_mode = False  # used in get_top_ranked_urls function

        for i in xrange(iters):
            t = time.time()

            print "Searching... ", len(seed_sites), "  seed urls"
            searchop = self.select_searchop(count, search, ucb)

            if searchop == 'rl' or searchop == 'bl':
                site_mode = True
            else:
                site_mode = False

            print "\n Iteration ", i, searchop
            new_urls = self.searcher.search(seed_sites, \
                                            searchop, seed_keyword=seed_keyword, \
                                            max_results=max_results)
            new_urls = [url for url in new_urls if url not in urls]

            if len(new_urls) == 0:
                print "Searcher found 0 url"
                seed_sites = self.get_top_ranked_urls(
                    scores, k, selected_urls[searchop], site_mode
                )  # Backlink and related search only use the host name to form the query (i.e. when searchop is 'bl' or 'rl')
                if len(seed_sites) == 0:
                    print "Stop. Running out of seeds"
                    break
                else:
                    continue

            urls.update(new_urls)

            print "Time to search ", i, ": ", time.time() - t
            t = time.time()

            new_sites = self.fetcher.fetch_sites(new_urls, max_pages,
                                                 selection, online)

            print "Time to fetch ", i, ": ", time.time() - t
            t = time.time()

            temp = len(results)
            results.extend(new_sites)
            print "Size of candidates (after): ", len(results)
            print "Number of new candidates (after): ", len(results) - temp
            scores = ranker.rank(results)
            if len(scores) >= max_numb_pages:
                print "Stop. Retrieved ", max_numb_pages, " pages"
                break
            #seed_sites = self.get_top_ranked_urls(scores, k, selected_urls[searchop])
            seed_sites = self.get_top_ranked_urls(
                scores, k, selected_urls[searchop], site_mode
            )  # Backlink and related search only use the host name to form the query (i.e. when searchop is 'bl' or 'rl')
            if len(seed_sites) == 0:
                print "Stop. Running out of seeds"
                break
            self.save_urls(new_sites, i)

            # Update information from the search results to the operation selector
            count[searchop] += len(new_urls)
            if (search == 'bandit') and new_sites:
                reward = self.get_reward(scores, new_sites, sites)
                print "UCB Rewards", searchop, reward
                ucb.update(searchop, reward, len(new_sites))
                sites.update([s.get_host() for s in new_sites])
            print "Time to rank ", i, ": ", time.time() - t

        self.save_scores(scores)
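
run_mix_search above chooses the next search operator with a UCB1 bandit, but the UCB1 class itself is not shown in this listing; the sketch below is only a generic illustration of the UCB1 selection rule, with arm names mirroring the operators above and everything else assumed (the real class may differ, e.g. its update also receives the number of new sites):

import math

class SimpleUCB1(object):
    """Generic UCB1: play each arm once, then pick the arm that maximizes
    mean reward plus an exploration bonus."""

    def __init__(self, arms):
        self.counts = {a: 0 for a in arms}
        self.rewards = {a: 0.0 for a in arms}

    def select(self):
        for arm, n in self.counts.items():
            if n == 0:  # play every arm once before using the bonus
                return arm
        total = sum(self.counts.values())

        def score(arm):
            mean = self.rewards[arm] / self.counts[arm]
            bonus = math.sqrt(2.0 * math.log(total) / self.counts[arm])
            return mean + bonus

        return max(self.counts, key=score)

    def update(self, arm, reward):
        self.counts[arm] += 1
        self.rewards[arm] += reward

# e.g. ucb = SimpleUCB1(['rl', 'bl', 'kw', 'fw'])
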
    def run(self,
            ranking,
            selection=None,
            online=True,
            max_results=50,
            seed_keyword="gun",
            searchop="kw",
            iters=5,
            representation='body',
            negative_file=None):
        """
        seed_sites: sites that will be used for the next search
        selected_urls: urls that have already been used for search

        Only top-ranked urls will become seed urls

        Important Args:
            ranking: a ranking method
            max_results: Maximum number of results to return in related and keyword search
        """
        max_pages = 1  # Always use single page to represent a website
        train_sites = self.fetcher.fetch_sites(self.train_urls, max_pages,
                                               selection, online)

        if negative_file:  # (random) reliably negative examples
            neg_urls = URLUtility.load_urls(negative_file)
            neg_urls = neg_urls[:200]
        else:
            neg_urls = []
        print "neg_urls: ", len(neg_urls)
        neg_sites = self.fetcher.fetch_sites(neg_urls, 1, None, online)
        ranker = Ranker(train_sites, representation, ranking, neg_sites)

        # Data
        scores = []  # Avoid exception when iters=0
        #seed_sites = self.train_urls # topk urls from each search batch
        seed_sites = train_sites  # topk urls from each search batch
        selected_urls = set()  # avoid searching with these urls again
        results = []  # Search results for ranking
        urls = set()  # Avoid fetch and rank these urls again

        # Hyperparameters
        #max_numb_pages = 12000 # stop condition
        max_numb_pages = 51000  # stop condition
        #iters = 500
        iters = 2000  # Note: overrides the iters argument

        k = 20  # number of newly discovered pages to add to the seed list
        max_kw = 20  # maximum number of keywords to select from the seed pages
        self.searcher.set_max_keywords(max_kw)
        """
        # Search Strategy
        blsearch = kwsearch = rlsearch = fwsearch = False
        if search == 'bl':
            blsearch = True
            print "Backlink search enable"
        elif search == 'rl':
            rlsearch = True
            print "Related search enable"
        elif search == 'kw':
            kwsearch =  True
            print "Keyword search enable"
        """
        site_mode = False  # used in get_top_ranked_urls function
        if searchop == 'rl' or searchop == 'bl':
            site_mode = True

        for i in xrange(iters):
            t = time.time()

            print "Searching... ", len(seed_sites), "  seed urls"
            print "\n Iteration ", i, searchop
            new_urls = self.searcher.search(seed_sites, searchop, \
                                            seed_keyword=seed_keyword, \
                                            max_results=max_results)
            new_urls = [url for url in new_urls if url not in urls]
            if len(new_urls) == 0:
                print "Searcher found 0 url"
                seed_sites = self.get_top_ranked_urls(scores, k, selected_urls,
                                                      site_mode)
                if len(seed_sites) == 0:
                    print "Stop. Running out of seeds"
                    break
                else:
                    continue

            urls.update(new_urls)

            print "Time to search ", i, ": ", time.time() - t
            t = time.time()

            new_sites = self.fetcher.fetch_sites(new_urls, max_pages,
                                                 selection, online)

            print "Time to fetch ", i, ": ", time.time() - t
            t = time.time()

            print "Size of candidates (before): ", len(results)
            results.extend(new_sites)
            print "Size of candidates (after): ", len(results)
            scores = ranker.rank(results)
            if len(scores) >= max_numb_pages:
                print "Stop. Retrieved ", max_numb_pages, " pages"
                break
            seed_sites = self.get_top_ranked_urls(scores, k, selected_urls,
                                                  site_mode)
            if len(seed_sites) == 0:
                print "Stop. Running out of seeds"
                break
            self.save_urls(new_sites, i)

            print "Time to rank ", i, ": ", time.time() - t

        self.save_scores(scores)