def __init__(self, dataset, date, entry, prefix, cluster_rank, crawl_size):
    """Set up a crawl run: record parameters, resolve sample-data paths,
    cluster the sampled pages, and build the underlying crawler.

    Args:
        dataset: dataset name; a leading "new_" is stripped when building
            sample-directory paths.
        date: crawl-sample date tag (e.g. "May1") used in directory names.
        entry: entry URL for the crawl.
        prefix: URL prefix restricting which links are followed.
        cluster_rank: rank of the target page cluster to crawl.
        crawl_size: number of pages to crawl.
    """
    self.dataset = dataset
    self.date = date
    # Fix: the original assigned cluster_rank twice; once is enough.
    self.cluster_rank = cluster_rank
    self.crawl_size = crawl_size
    self.entry, self.prefix = entry, prefix
    self.history_set = set()  # URLs already seen during the crawl
    self.group_list = []
    self.group_dict = {}
    # NOTE(review): "May1" resolves to "../../Crawler/..." while every other
    # date uses "../Crawler/..." and keeps the "new_" prefix — presumably a
    # working-directory difference; confirm the single-dot branch is intended.
    if self.date == "May1":
        self.path_prefix = "../../Crawler/{}_samples/{}/".format(
            date, dataset.replace("new_", ""))
    else:
        self.path_prefix = "../Crawler/{}_samples/{}/".format(date, dataset)
    self.folder_path = [
        "../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))
    ]
    # Cluster the sampled pages; DBSCAN() yields the number of clusters found.
    self.sitemap = pageCluster(dataset, date, self.folder_path, 0)
    self.cluster_num = int(self.sitemap.DBSCAN())
    self.full_folder = "../../Crawler/full_data/" + dataset
    # Delegate target-cluster selection to a fully-featured crawler instance.
    c = crawler(
        self.dataset,
        self.date,
        None,
        None,
        eps=None,
        cluster_rank=self.cluster_rank,
        crawl_size=None,
        rank_algo=None,
    )
    self.target_cluster = c.target_cluster
    self.crawler = c
def __init__(self, dataset, date, entry, prefix, cluster_rank, crawl_size):
    """Initialize the crawl wrapper.

    Records the run parameters, builds the sample/data directory paths,
    runs DBSCAN over the sampled pages to count clusters, and constructs
    the underlying crawler whose target cluster this object reuses.

    Args:
        dataset: dataset name ("new_" prefix stripped for sample paths).
        date: sample date tag (e.g. "May1") embedded in directory names.
        entry: crawl entry URL.
        prefix: URL prefix restricting the crawl frontier.
        cluster_rank: rank of the cluster to target.
        crawl_size: number of pages to crawl.
    """
    self.dataset = dataset
    self.date = date
    self.cluster_rank = cluster_rank  # fix: original assigned this twice
    self.crawl_size = crawl_size
    self.entry, self.prefix = entry, prefix
    self.history_set = set()  # visited URLs
    self.group_list = []
    self.group_dict = {}
    # NOTE(review): the two branches differ both in the "../" depth and in
    # whether "new_" is stripped — verify this asymmetry is intentional.
    if self.date == "May1":
        self.path_prefix = "../../Crawler/{}_samples/{}/".format(
            date, dataset.replace("new_", ""))
    else:
        self.path_prefix = "../Crawler/{}_samples/{}/".format(date, dataset)
    self.folder_path = [
        "../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))
    ]
    self.sitemap = pageCluster(dataset, date, self.folder_path, 0)
    # DBSCAN() returns the cluster count for the sampled pages.
    self.cluster_num = int(self.sitemap.DBSCAN())
    self.full_folder = "../../Crawler/full_data/" + dataset
    # Build a full crawler only to obtain its target cluster, then keep it.
    c = crawler(self.dataset,
                self.date,
                None,
                None,
                eps=None,
                cluster_rank=self.cluster_rank,
                crawl_size=None,
                rank_algo=None)
    self.target_cluster = c.target_cluster
    self.crawler = c
def __init__(self, dataset, date, entry, prefix, eps, cluster_rank, crawl_size,
             rank_algo="bfs"):
    """Set up a crawler: record parameters, load crawl rules, cluster the
    sampled pages, and precompute the cluster transition-probability model.

    The statement order below matters: the sitemap must exist and DBSCAN
    must have run before the gold-cluster dict, xpath transitions, and
    transition-probability matrix are built.

    Args:
        dataset: dataset name ("new_" prefix stripped for sample paths).
        date: sample date tag embedded in directory names.
        entry: crawl entry URL.
        prefix: URL prefix restricting the crawl frontier.
        eps: DBSCAN eps value; currently unused — the DBSCAN(eps_val=...)
            call is commented out below.
        cluster_rank: rank of the cluster to target.
        crawl_size: number of pages to crawl.
        rank_algo: frontier ranking strategy (default "bfs").
    """
    self.dataset = dataset
    self.date = date
    self.eps = eps
    self.cluster_rank = cluster_rank
    self.rank_algo = rank_algo
    self.crawl_size = crawl_size
    self.rules = self.get_rules()  # crawl/url rules from project helper
    self.entry, self.prefix = entry, prefix
    self.history_set = set()  # visited URLs
    self.path_prefix = "../../Crawler/{}_samples/{}/".format(
        date, dataset.replace("new_", ""))
    self.folder_path = [
        "../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))
    ]
    self.sitemap = pageCluster(dataset, date, self.folder_path, 0)
    self.full_folder = "../../Crawler/full_data/" + dataset
    self.trans = {}  # observed transitions
    self.queue = {}  # crawl frontier
    self.crawled_cluster_count = defaultdict(int)  # pages crawled per cluster
    self.trans_dict = read_trans_dict(dataset, date)
    #self.cluster_dict = get_cluster_dict(dataset,date)
    #self.cluster_num = int(self.sitemap.DBSCAN(eps_val=self.eps))
    # DBSCAN() run with its default eps; returns the cluster count.
    self.cluster_num = int(self.sitemap.DBSCAN())
    self.build_gold_cluster_dict()
    self.cluster_xpath_trans = self.get_xpath_transition()
    self.trans_prob_mat = self.calculate_trans_prob_mat()
    self.max_score = 500  # score cap used by the ranking model
    self.target_cluster = self.get_sample_cluster()
def cv(self): labels_true = np.array(self.pages.ground_truth) skf = StratifiedKFold(labels_true, n_folds=4) results = [] count = 0 p = pageCluster(self.dataset, self.date) for train, test in skf: #print train, test count += 1 print "this is the {} times for CV".format(count) train_gold, test_gold = labels_true[train], labels_true[test] self.run(train) self.MDL() path_list = self.pages.path_list self.classify(test) self.clustering() print train, "train index list", type(train), len(train) train_y = np.array(self.pages.category)[train] test_y = np.array(self.pages.category)[test] results.append( p.Evaluation_CV(test_gold, test_y, train_gold, train_y, path_list=path_list)) ''' t = KMeans() train_y, final_centroids, final_ite, final_dist = t.k_means(km_train_x, num_clusters, replicates=20) test_y = t.k_means_classify(test_x) path_list = [self.UP_pages.path_list[idx] for idx in test] results.append(self.Evaluation_CV(test_gold,test_y,km_train_gold,train_y, path_list=path_list)) ''' result = np.mean(results, axis=0) cv_batch_file = open("./results/c_cv_baseline.results", "a") algo = "dbscan" dataset = self.dataset prefix = str(dataset) + " classifying \t" metrics = [ 'cv_micro_precision', 'cv_macro_precision', "non outlier ratio" ] for index, metric in enumerate(metrics): line = prefix + "\t" + metric + "\t" + str(result[index]) print line cv_batch_file.write(line + "\n")
def cv(self): labels_true = np.array(self.pages.ground_truth) skf = StratifiedKFold(labels_true, n_folds=4) results = [] count = 0 p = pageCluster(self.dataset,self.date) for train, test in skf: #print train, test count += 1 print "this is the {} times for CV".format(count) train_gold, test_gold = labels_true[train], labels_true[test] self.run(train) self.MDL() path_list = self.pages.path_list self.classify(test) self.clustering() print train, "train index list", type(train), len(train) train_y = np.array(self.pages.category)[train] test_y = np.array(self.pages.category)[test] results.append(p.Evaluation_CV(test_gold,test_y, train_gold, train_y, path_list=path_list)) ''' t = KMeans() train_y, final_centroids, final_ite, final_dist = t.k_means(km_train_x, num_clusters, replicates=20) test_y = t.k_means_classify(test_x) path_list = [self.UP_pages.path_list[idx] for idx in test] results.append(self.Evaluation_CV(test_gold,test_y,km_train_gold,train_y, path_list=path_list)) ''' result = np.mean(results,axis=0) cv_batch_file = open("./results/c_cv_baseline.results","a") algo = "dbscan" dataset = self.dataset prefix = str(dataset) + " classifying \t" metrics = ['cv_micro_precision','cv_macro_precision',"non outlier ratio"] for index,metric in enumerate(metrics): line = prefix + "\t" + metric + "\t" + str(result[index]) print line cv_batch_file.write(line + "\n" )
def __init__(self, dataset, date, entry, prefix, eps, cluster_rank,
             crawl_size, rank_algo="bfs"):
    """Initialize the crawler: parameters, rules, page clustering, and the
    cluster transition-probability model.

    Order-sensitive: DBSCAN must run before build_gold_cluster_dict(),
    get_xpath_transition(), and calculate_trans_prob_mat().

    Args:
        dataset: dataset name ("new_" prefix stripped for sample paths).
        date: sample date tag embedded in directory names.
        entry: crawl entry URL.
        prefix: URL prefix restricting the crawl frontier.
        eps: DBSCAN eps; unused here — the eps_val call is commented out.
        cluster_rank: rank of the target cluster.
        crawl_size: number of pages to crawl.
        rank_algo: frontier ranking strategy (default "bfs").
    """
    self.dataset = dataset
    self.date = date
    self.eps = eps
    self.cluster_rank = cluster_rank
    self.rank_algo = rank_algo
    self.crawl_size = crawl_size
    self.rules = self.get_rules()  # crawl rules from project helper
    self.entry, self.prefix = entry, prefix
    self.history_set = set()  # visited URLs
    self.path_prefix = "../../Crawler/{}_samples/{}/".format(
        date, dataset.replace("new_", ""))
    self.folder_path = [
        "../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))
    ]
    self.sitemap = pageCluster(dataset, date, self.folder_path, 0)
    self.full_folder = "../../Crawler/full_data/" + dataset
    self.trans = {}  # observed transitions
    self.queue = {}  # crawl frontier
    self.crawled_cluster_count = defaultdict(int)  # crawled pages per cluster
    self.trans_dict = read_trans_dict(dataset, date)
    #self.cluster_dict = get_cluster_dict(dataset,date)
    #self.cluster_num = int(self.sitemap.DBSCAN(eps_val=self.eps))
    # DBSCAN with default eps; returns the number of clusters.
    self.cluster_num = int(self.sitemap.DBSCAN())
    self.build_gold_cluster_dict()
    self.cluster_xpath_trans = self.get_xpath_transition()
    self.trans_prob_mat = self.calculate_trans_prob_mat()
    self.max_score = 500  # score cap used by the ranking model
    self.target_cluster = self.get_sample_cluster()