def get_ground_truth(self, dataset):
    print "our dataset is {0}".format(dataset)
    data = dataset.replace("new_", "")
    if os.path.exists("./crawling/{0}/site.gold/{1}/{1}.gold".format(self.date, data)):
        print "./crawling/{0}/site.gold/{1}/{1}.gold".format(self.date, data)
        gold_file = open("./crawling/{0}/site.gold/{1}/{1}.gold".format(self.date, data)).readlines()
    elif os.path.exists("./{0}/site.gold/{1}/{1}.gold".format(self.date, data)):
        gold_file = open("./{0}/site.gold/{1}/{1}.gold".format(self.date, data)).readlines()
        print "./{0}/site.gold/{1}/{1}.gold".format(self.date, data)
    else:
        # no gold file on disk: fall back to interactive annotation
        print "annotation starts"
        a = annotator(dataset)
        self.ground_truth = a.get_ground_truth(self.path_list)
        return None
    gold_dict = self.build_gold(gold_file)
    #print self.folder_path
    print gold_dict.keys()
    print "length is ", len(gold_dict.keys())
    for i in range(len(self.pages)):
        # here {}/sample instead of {}_samples
        #path = self.pages[i].path.replace("../Crawler/{0}/samples/{1}/".format(self.date,data),"")
        path = self.pages[i].path.replace("../../Crawler/{0}/samples/{1}/".format(self.date, data), "")
        #print path.strip()
        id = int(gold_dict[path.strip().replace(" ", "")])
        self.ground_truth.append(id)
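# --- Hedged sketch (not part of the original class) ---------------------------
# build_gold() is defined elsewhere in this class; the standalone helper below
# only illustrates the mapping it is assumed to produce. Assumption: each line
# of a *.gold file is "<relative page path> <cluster id>", and get_ground_truth()
# above looks pages up by their path with spaces stripped.
def parse_gold_lines(gold_lines):
    gold_dict = {}
    for line in gold_lines:
        parts = line.strip().split()
        if len(parts) < 2:
            continue  # skip blank or malformed lines
        # assumed format: "<relative page path> <cluster id>"
        gold_dict[parts[0]] = parts[-1]  # the caller casts the id to int
    return gold_dict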
def crawl(self):
    self.get_pattern(self.dataset, self.cluster_rank)
    self.a = annotator(self.dataset)
    write_file = open(
        "./results/vidal_{0}_{1}_{2}_size{3}.txt".format(
            self.dataset, self.date, self.cluster_rank, self.crawl_size),
        "w")
    num_web_crawl = 0
    entry, prefix = self.entry, self.prefix
    self.url_stack = [(entry, "", 0)]  # frontier of (url, parent_url, rule_id)
    self.final_list = []
    size, num = self.crawl_size, 0  # crawl budget and pages crawled so far
    s = sampler(self.dataset, self.entry, self.prefix, 0)
    while num < size and len(self.url_stack) > 0:
        first_url = self.url_stack[0][0]
        parent_url = self.url_stack[0][1]
        rule_id = self.url_stack[0][2]
        try:
            print "first url is ", first_url
        except:
            # printing may fail (e.g. on non-ascii urls); log and continue
            traceback.print_exc()
        if first_url not in self.history_set:
            num += 1
            try:
                url_list, new_rule_id = self.crawl_link(first_url, rule_id, self.history_set, s)
                self.final_list.append((first_url, parent_url, rule_id))
            except:
                print "might miss something here"
                traceback.print_exc()
                # on failure, fetch the page from the live web and retry
                flag = s.crawlUrl(first_url, self.dataset, self.url_stack, self.history_set)
                if flag == 1:
                    url_list, new_rule_id = self.crawl_link(first_url, rule_id, self.history_set, s)
                    self.final_list.append((first_url, parent_url, rule_id))
                    # randomized delay between live requests, with a longer
                    # pause every ten web fetches
                    random_time_s = random.randint(5, 10)
                    time.sleep(random_time_s)
                    num_web_crawl += 1
                    if num_web_crawl % 10 == 9:
                        random_time_s = random.randint(60, 90)
                        time.sleep(random_time_s)
                else:
                    num -= 1
                    print "crawl failure"
        if self.url_stack[0][0] == first_url:
            self.url_stack.pop(0)
        print " num is {}".format(num)
        sys.stdout.flush()
        self.history_set.add(first_url)
    print len(self.final_list), "length of final list"
    for pair in self.final_list:
        url, parent_url, cluster_id = pair[0], pair[1], pair[2]
        write_file.write(url + "\t" + str(parent_url) + "\t" + str(cluster_id) + "\n")
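# --- Hedged sketch (not part of the original class) ---------------------------
# Minimal, standalone version of the frontier loop that crawl() implements above:
# a FIFO url_stack of (url, parent_url, rule_id) triples plus a visited set, with
# randomized delays between fetches. fetch_links() is a hypothetical stand-in for
# self.crawl_link() / sampler.crawlUrl() and is not part of the original code.
import random
import time

def crawl_frontier(entry_url, fetch_links, crawl_size):
    url_stack = [(entry_url, "", 0)]   # frontier of (url, parent_url, rule_id)
    history_set = set()                # urls already processed
    final_list = []
    num = 0
    while num < crawl_size and len(url_stack) > 0:
        first_url, parent_url, rule_id = url_stack.pop(0)
        if first_url in history_set:
            continue
        history_set.add(first_url)
        num += 1
        final_list.append((first_url, parent_url, rule_id))
        for child_url in fetch_links(first_url):
            if child_url not in history_set:
                url_stack.append((child_url, first_url, rule_id))
        time.sleep(random.randint(5, 10))  # polite delay, mirroring crawl() above
    return final_list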
    valid_ratio = 1 - float(num) / float(total)
    #print valid_ratio, " valid ratio"
    return valid_ratio


def compare_methods(counter_a, counter_b):
    print counter_a.most_common()
    print counter_b.most_common()


if __name__ == '__main__':
    site = sys.argv[1]
    folder_path = "../../Crawler/July30_samples/{}/".format(site)
    date = "July30"
    #sitemap = pageCluster(site,date,folder_path,0)
    a = annotator(site)
    #c = crawler(site,date,None,None,eps=None,cluster_rank=0,crawl_size=None,rank_algo=None)
    path = "./results/bfs/{}_July30_0_bfs_size10001.txt".format(site)
    #bfs = a.annotate_file(path)
    bfs = get_annotation_cluster(path, a, num=2001)
    # random walk
    rw_path = "./results/evaluate/sampling/sampling_uniform_{0}_{1}_size3001.txt".format(site, date)
    rw_list = get_annotation_cluster(rw_path, a, num=2001)
    # general_crawl
    g_path = "./results/evaluate/general/{}_July30_1_general_size2001.txt".format(site)
    #g_path = "./results/evaluate/general/{}_July30_1_general_size2001.txt".format(site)
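    # --- Hedged usage sketch ---------------------------------------------------
    # Assumption: get_annotation_cluster() returns one cluster label per crawled
    # page, so the BFS and random-walk results above can be compared by label
    # frequency with compare_methods() defined earlier in this file.
    from collections import Counter
    compare_methods(Counter(bfs), Counter(rw_list))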
"asp", "youtube", "hupu", "douban", "rottentomatoes", "stackexchange" ] #sites = ["douban"] date = "July30" no_sim_entropy, no_sim_ratio = [], [] no_balance_entropy, no_balance_ratio, no_info_entropy = [], [], [] general_entropy, general_ratio = [], [] valid_ratio_list = [[], [], [], []] entropy_list = [[], [], [], []] for site in sites: folder_path = "../../Crawler/July30_samples/{}/".format(site) #sitemap = pageCluster(site,date,folder_path,0) a = annotator(site) #c = crawler(site,date,None,None,eps=None,cluster_rank=0,crawl_size=None,rank_algo=None) #data_folder = "../../Crawler/full_data/{}".format(site) #result_file = "./results/bfs/{}_May1_0_bfs_size5000.txt".format(site) #write_path = "./{}_bfs_classify.txt".format(site) #classify_results_file(c,result_file,data_folder,write_path) #prefix = "http://android.{0}.com".format(site) #prefix = "http://forums.asp.net" #prefix = "https://www.youtube.com" path = "./results/bfs/{}_July30_0_bfs_size10001.txt".format(site) bfs = get_cluster_results(path, 5000) #bfs = a.annotate_file(path)