Example #1
    def get_ground_truth(self,dataset):
        print "our dataset is {0}".format(dataset)
        data = dataset.replace("new_","")
        if os._exists("./crawling/{0}/site.gold/{1}/{1}.gold".format(self.date,data)):
            print "./crawling/{0}/site.gold/{1}/{1}.gold".format(self.date,data)
            gold_file = open("./crawling/{0}/site.gold/{1}/{1}.gold".format(self.date,data)).readlines()
        elif os._exists("./{0}/site.gold/{1}/{1}.gold".format(self.date,data)):
            gold_file = open("./{0}/site.gold/{1}/{1}.gold".format(self.date,data)).readlines()
            print "./{0}/site.gold/{1}/{1}.gold".format(self.date,data)
        else:
            print "annotation starts"
            a = annotator(dataset)
            self.ground_truth = a.get_ground_truth(self.path_list)
            return None

        gold_dict = self.build_gold(gold_file)
        #print self.folder_path
        print gold_dict.keys()
        print "length is ", len(gold_dict.keys())
        for i in range(len(self.pages)):
            # here {}/sample instead of {}_samples
            #path = self.pages[i].path.replace("../Crawler/{0}/samples/{1}/".format(self.date,data),"")
            path = self.pages[i].path.replace("../../Crawler/{0}/samples/{1}/".format(self.date,data),"")
            #print path.strip()
            id = int(gold_dict[path.strip().replace(" ","")])
            self.ground_truth.append(id)
        '''
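This example (and the near-identical Example #2 below) depends on a build_gold helper that turns the lines of the .gold file into a mapping from page filename to cluster id. The file format is not shown in the snippet, so the following is only a minimal stand-in, assuming one tab-separated "filename<TAB>cluster id" pair per line, with keys stored without spaces to match the lookup above.

# Illustrative stand-in for self.build_gold (the real implementation is not
# shown). Assumes each line of the .gold file is "<page filename>\t<cluster id>".
def build_gold(gold_lines):
    gold_dict = {}
    for line in gold_lines:
        parts = line.strip().split("\t")
        if len(parts) != 2:
            continue  # skip blank or malformed lines
        page, cluster_id = parts
        # keys are stored without spaces, matching the lookup in the example
        gold_dict[page.replace(" ", "")] = cluster_id
    return gold_dict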
Example #2
    def get_ground_truth(self, dataset):
        print "our dataset is {0}".format(dataset)
        data = dataset.replace("new_", "")
        if os._exists("./crawling/{0}/site.gold/{1}/{1}.gold".format(
                self.date, data)):
            print "./crawling/{0}/site.gold/{1}/{1}.gold".format(
                self.date, data)
            gold_file = open("./crawling/{0}/site.gold/{1}/{1}.gold".format(
                self.date, data)).readlines()
        elif os._exists("./{0}/site.gold/{1}/{1}.gold".format(self.date,
                                                              data)):
            gold_file = open("./{0}/site.gold/{1}/{1}.gold".format(
                self.date, data)).readlines()
            print "./{0}/site.gold/{1}/{1}.gold".format(self.date, data)
        else:
            print "annotation starts"
            a = annotator(dataset)
            self.ground_truth = a.get_ground_truth(self.path_list)
            return None

        gold_dict = self.build_gold(gold_file)
        #print self.folder_path
        print gold_dict.keys()
        print "length is ", len(gold_dict.keys())
        for i in range(len(self.pages)):
            # here {}/sample instead of {}_samples
            #path = self.pages[i].path.replace("../Crawler/{0}/samples/{1}/".format(self.date,data),"")
            path = self.pages[i].path.replace(
                "../../Crawler/{0}/samples/{1}/".format(self.date, data), "")
            #print path.strip()
            id = int(gold_dict[path.strip().replace(" ", "")])
            self.ground_truth.append(id)
        '''
Example #3
    def crawl(self):
        self.get_pattern(self.dataset, self.cluster_rank)
        self.a = annotator(self.dataset)

        write_file = open(
            "./results/vidal_{0}_{1}_{2}_size{3}.txt".format(
                self.dataset, self.date, self.cluster_rank, self.crawl_size),
            "w")
        num_web_crawl = 0
        entry, prefix = self.entry, self.prefix
        self.url_stack = [(entry, "", 0)]
        self.final_list = []
        size, num = self.crawl_size, 0  # crawl budget and pages crawled so far
        s = sampler(self.dataset, self.entry, self.prefix, 0)
        while (num < size and len(self.url_stack) > 0):
            first_url = self.url_stack[0][0]
            parent_url = self.url_stack[0][1]
            rule_id = self.url_stack[0][2]
            try:
                print "first url is ", first_url
            except:
                traceback.print_exc()

            if first_url not in self.history_set:
                num += 1
                try:
                    url_list, new_rule_id = self.crawl_link(
                        first_url, rule_id, self.history_set, s)
                    self.final_list.append((first_url, parent_url, rule_id))
                except:
                    print "might miss somthing here"
                    traceback.print_exc()
                    flag = s.crawlUrl(first_url, self.dataset, self.url_stack,
                                      self.history_set)
                    if flag == 1:
                        url_list, new_rule_id = self.crawl_link(
                            first_url, rule_id, self.history_set, s)
                        self.final_list.append(
                            (first_url, parent_url, rule_id))
                        random_time_s = random.randint(5, 10)
                        time.sleep(random_time_s)
                        num_web_crawl += 1
                        if num_web_crawl % 10 == 9:
                            random_time_s = random.randint(60, 90)
                            time.sleep(random_time_s)
                    else:
                        num -= 1
                        print "crawl failure"
            if self.url_stack[0][0] == first_url:
                self.url_stack.pop(0)
            print " num is {}".format(num)
            sys.stdout.flush()
            self.history_set.add(first_url)

        print len(self.final_list), "length of final list"

        for pair in self.final_list:
            url, parent_url, cluster_id = pair[0], pair[1], pair[2]
            write_file.write(url + "\t" + str(parent_url) + "\t" +
                             str(cluster_id) + '\n')
Example #4
    def crawl(self):
        self.get_pattern(self.dataset, self.cluster_rank)
        self.a = annotator(self.dataset)

        write_file = open(
            "./results/vidal_{0}_{1}_{2}_size{3}.txt".format(
                self.dataset, self.date, self.cluster_rank, self.crawl_size
            ),
            "w",
        )
        num_web_crawl = 0
        entry, prefix = self.entry, self.prefix
        self.url_stack = [(entry, "", 0)]
        self.final_list = []
        size, num = self.crawl_size, 0  # crawl budget and pages crawled so far
        s = sampler(self.dataset, self.entry, self.prefix, 0)
        while num < size and len(self.url_stack) > 0:
            first_url = self.url_stack[0][0]
            parent_url = self.url_stack[0][1]
            rule_id = self.url_stack[0][2]
            try:
                print "first url is ", first_url
            except:
                traceback.print_exc()

            if first_url not in self.history_set:
                num += 1
                try:
                    url_list, new_rule_id = self.crawl_link(first_url, rule_id, self.history_set, s)
                    self.final_list.append((first_url, parent_url, rule_id))
                except:
                    print "might miss somthing here"
                    traceback.print_exc()
                    flag = s.crawlUrl(first_url, self.dataset, self.url_stack, self.history_set)
                    if flag == 1:
                        url_list, new_rule_id = self.crawl_link(first_url, rule_id, self.history_set, s)
                        self.final_list.append((first_url, parent_url, rule_id))
                        random_time_s = random.randint(5, 10)
                        time.sleep(random_time_s)
                        num_web_crawl += 1
                        if num_web_crawl % 10 == 9:
                            random_time_s = random.randint(60, 90)
                            time.sleep(random_time_s)
                    else:
                        num -= 1
                        print "crawl failure"
            if self.url_stack[0][0] == first_url:
                self.url_stack.pop(0)
            print " num is {}".format(num)
            sys.stdout.flush()
            self.history_set.add(first_url)

        print len(self.final_list), "length of final list"

        for pair in self.final_list:
            url, parent_url, cluster_id = pair[0], pair[1], pair[2]
            write_file.write(url + "\t" + str(parent_url) + "\t" + str(cluster_id) + "\n")
Example #5
    valid_ratio = 1 - float(num)/float(total)
    #print valid_ratio, " valid ratio"

    return valid_ratio


def compare_methods(counter_a, counter_b):
    print counter_a.most_common()
    print counter_b.most_common()

if __name__ == '__main__':
    site = sys.argv[1]
    folder_path = "../../Crawler/July30_samples/{}/".format(site)
    date = "July30"
    #sitemap = pageCluster(site,date,folder_path,0)
    a = annotator(site)

    #c = crawler(site,date,None,None,eps=None,cluster_rank=0,crawl_size=None,rank_algo=None)


    path = "./results/bfs/{}_July30_0_bfs_size10001.txt".format(site)
    #bfs = a.annotate_file(path)
    bfs = get_annotation_cluster(path,a,num=2001)

    # random walk
    rw_path = "./results/evaluate/sampling/sampling_uniform_{0}_{1}_size3001.txt".format(site,date)
    rw_list = get_annotation_cluster(rw_path,a,num=2001)

    # general_crawl
    g_path = "./results/evaluate/general/{}_July30_1_general_size2001.txt".format(site)
    #g_path = "./results/evaluate/general/{}_July30_1_general_size2001.txt".format(site)
Example #6
        "asp", "youtube", "hupu", "douban", "rottentomatoes", "stackexchange"
    ]
    #sites = ["douban"]
    date = "July30"

    no_sim_entropy, no_sim_ratio = [], []
    no_balance_entropy, no_balance_ratio, no_info_entropy = [], [], []
    general_entropy, general_ratio = [], []

    valid_ratio_list = [[], [], [], []]
    entropy_list = [[], [], [], []]

    for site in sites:
        folder_path = "../../Crawler/July30_samples/{}/".format(site)
        #sitemap = pageCluster(site,date,folder_path,0)
        a = annotator(site)

        #c = crawler(site,date,None,None,eps=None,cluster_rank=0,crawl_size=None,rank_algo=None)

        #data_folder = "../../Crawler/full_data/{}".format(site)
        #result_file = "./results/bfs/{}_May1_0_bfs_size5000.txt".format(site)
        #write_path = "./{}_bfs_classify.txt".format(site)
        #classify_results_file(c,result_file,data_folder,write_path)

        #prefix = "http://android.{0}.com".format(site)
        #prefix = "http://forums.asp.net"
        #prefix = "https://www.youtube.com"

        path = "./results/bfs/{}_July30_0_bfs_size10001.txt".format(site)
        bfs = get_cluster_results(path, 5000)
        #bfs = a.annotate_file(path)
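Example #6 accumulates per-site entropy and valid-ratio figures for several crawling strategies; the entropy computation itself is not shown. A minimal sketch, assuming it is the Shannon entropy of the cluster-id distribution observed in a crawl (the function name is illustrative):

import math
from collections import Counter

# Assumed helper: Shannon entropy (base 2) of a list of cluster ids. The
# example does not show how its entropy values are actually computed.
def cluster_entropy(cluster_ids):
    total = float(len(cluster_ids))
    if total == 0:
        return 0.0
    entropy = 0.0
    for count in Counter(cluster_ids).values():
        p = count / total
        entropy -= p * math.log(p, 2)
    return entropy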