def score(self, learn_test, detect_test, y_test, test_params=None):
    """Learn clusters from the test split and score detection on it.

    A hierarchical clustering model is learned from ``learn_test`` using
    ``test_params["train_inconsistent"]`` as the inconsistent coefficient,
    and the learned sites are then handed to ``self.score_learned``.

    Args:
        learn_test: input passed through ``list_to_sites`` (together with
            ``self.simhash_config``) and used for learning.
        detect_test: input passed through ``list_to_sites`` and used for
            scoring.
        y_test: labels, forwarded unchanged to ``self.score_learned``.
        test_params: dict of parameters; must contain the key
            ``"train_inconsistent"``.  When omitted or empty, a copy of
            ``self.best_params`` is used instead.

    Returns:
        Whatever ``self.score_learned`` returns for the learned sites.

    Raises:
        KeyError: if the effective parameters have no
            ``"train_inconsistent"`` entry.
    """
    if not test_params:
        # Copy so the caller's / instance's dict is never aliased.
        # (The original read ``sef.best_params`` here, a NameError.)
        test_params = dict(self.best_params)

    learn_test = list_to_sites(learn_test, self.simhash_config)
    detect_test = list_to_sites(detect_test, self.simhash_config)

    cluster_learner = ClusterLearning()
    cluster_config = CD.ClusterConfig()
    cluster_config.algorithm.name = CD.Algorithm.HIERARCHICAL_CLUSTERING
    cluster_config.algorithm.inconsistent_coefficient = test_params["train_inconsistent"]

    logger = logging.getLogger("global")
    logger.info("validating now, best parameters are:")
    logger.info(test_params)
    logger.info("learning with coefficient: {0}".format(test_params["train_inconsistent"]))
    learned_test_sites = cluster_learner.learn(learn_test, cluster_config)
    logger.info("learning complete!")

    # NOTE(review): the original comment here read "write to file for
    # plotting", but nothing is written to a file in this method.
    return self.score_learned(learned_test_sites, detect_test, y_test, test_params)
def fit(self, learn_train, detect_train, y_train):
    """Grid-search the clustering and detection thresholds on training data.

    For every training inconsistent coefficient ``t1`` a hierarchical
    clustering model is learned from ``learn_train``.  That model is scored
    with ``self.score_learned`` against ``detect_train`` / ``y_train`` for
    every combination of test inconsistent coefficient ``t2`` and test
    diameter ``r``.  The combination with the lowest score wins; ties are
    broken in favour of the smaller ``t2 - t1``.

    Grid ranges (all half-open, built with ``np.arange``):
        * ``t1``: ``self.coefficient[0]`` .. ``self.coefficient[1]``,
          step ``self.coefficient_step``
        * ``t2``: ``self.coefficient[2]`` .. ``self.coefficient[3]``,
          step ``self.coefficient_step``
        * ``r``:  ``self.radius[0]`` .. ``self.radius[1]``,
          step ``self.radius_step``

    Args:
        learn_train: input passed through ``list_to_sites`` (together with
            ``self.simhash_config``) and used for learning.
        detect_train: input passed through ``list_to_sites`` and used for
            scoring.
        y_train: labels, forwarded unchanged to ``self.score_learned``.

    Returns:
        dict with the keys ``score``, ``distance``, ``train_inconsistent``,
        ``test_inconsistent`` and ``test_diameter`` describing the best
        combination found by this call.  The dict is empty when the grid
        contains no combination at all.

    Side effects:
        ``self.best_params`` is replaced by a copy of the returned dict when
        it has no ``score`` yet, when this call found a strictly lower
        score, or when the score ties and this call's ``distance`` is
        smaller.
    """
    # Bound up front so it is available even when the grid is empty.
    logger = logging.getLogger("global")

    learn_train = list_to_sites(learn_train, self.simhash_config)
    detect_train = list_to_sites(detect_train, self.simhash_config)
    min_params = dict()
    test_params = dict()

    # T_train
    for t1 in np.arange(self.coefficient[0], self.coefficient[1], self.coefficient_step):
        test_params['train_inconsistent'] = t1
        cluster_learner = ClusterLearning()
        cluster_config = CD.ClusterConfig()
        cluster_config.algorithm.name = CD.Algorithm.HIERARCHICAL_CLUSTERING
        cluster_config.algorithm.inconsistent_coefficient = t1
        logger.info("learning with coefficient: {0}".format(t1))
        learned_sites = cluster_learner.learn(learn_train, cluster_config)
        logger.info("learning complete!")

        # T_test
        for t2 in np.arange(self.coefficient[2], self.coefficient[3], self.coefficient_step):
            test_params['test_inconsistent'] = t2

            # R_test
            for r in np.arange(self.radius[0], self.radius[1], self.radius_step):
                test_params['test_diameter'] = r
                logger.info("testing with coefficient: {0}, min_radius: {1}".format(t2, r))
                current_score = self.score_learned(learned_sites, detect_train, y_train, test_params)
                # NOTE(review): the message says "f1 score", yet the
                # selection below treats a lower value as better (i.e. as
                # an error) -- confirm what score_learned returns.
                logger.info("testing complete, f1 score is: {0}".format(current_score))

                # The less the error, and the less the distance between
                # train and test threshold (i.e. the uncertain area), the
                # better.
                distance = t2 - t1
                if (('score' not in min_params) or
                        (current_score < min_params['score']) or
                        (current_score == min_params['score'] and
                         min_params['distance'] > distance)):
                    min_params['score'] = current_score
                    min_params['distance'] = distance
                    min_params['train_inconsistent'] = t1
                    min_params['test_inconsistent'] = t2
                    min_params['test_diameter'] = r

    # Promote this call's winner to the instance-wide best.  The original
    # tie-break clause re-tested ``current_score``/``t2 - t1`` against
    # ``min_params``, which can never be true once the inner check has run;
    # compare against the stored best instead.  An empty grid leaves
    # ``min_params`` empty, in which case there is nothing to promote.
    if min_params:
        best = self.best_params
        if (('score' not in best) or
                (min_params['score'] < best['score']) or
                (min_params['score'] == best['score'] and
                 best.get('distance', float('inf')) > min_params['distance'])):
            self.best_params = dict(min_params)
    logger.info(self.best_params)
    return min_params
def test_learner():
    """Run cluster learning on one crawl log with two algorithms and print both.

    The same ``ClusterLearning`` instance and the same ``ClusterConfig``
    object are used for both runs: first with ``HAMMING_THRESHOLD``
    (``thres = 5``), then with ``HIERARCHICAL_CLUSTERING``.  Only the
    algorithm name and ``left_out_ratio`` are reassigned before the second
    run, so ``thres`` keeps its earlier value on the config.
    """
    in_filenames = [
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_0.text"
    ]
    # Only referenced by the disabled write_proto_to_file calls below.
    out_filename = in_filenames[0] + ".learned"

    learner = ClusterLearning()
    config = CD.ClusterConfig()

    # First pass: hamming threshold clustering.
    config.algorithm.name = CD.Algorithm.HAMMING_THRESHOLD
    config.algorithm.thres = 5
    config.algorithm.left_out_ratio = 5  # left out ratio is 5%
    learned = learner.learn(in_filenames, config)
    # write_proto_to_file(learned, out_filename)
    print("result for hamming threhold clustering")
    print(learned)

    # Second pass: hierarchical clustering on the same config object.
    config.algorithm.name = CD.Algorithm.HIERARCHICAL_CLUSTERING
    config.algorithm.left_out_ratio = 5  # left out ratio is 5%
    learned = learner.learn(in_filenames, config)
    # write_proto_to_file(learned, out_filename)
    print("result for hierarchical clustering")
    print(learned)
def test_computer():
    """Compute and print TEXT, DOM and TEXT_DOM simhashes for one crawl log.

    Each variant uses a fresh ``ClusterLearning`` instance and a fresh
    ``CD.SimhashConfig``.  Only the TEXT variant assigns
    ``discard_failure`` (to ``False``) and enables ``usage.tri_gram``; the
    DOM and TEXT_DOM variants leave ``discard_failure`` untouched and set
    ``usage.tri_gram`` to ``False``.
    """
    site_list_filenames = [
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_2"
    ]

    def _compute_and_print(simhash_type, tri_gram, out_filename, discard_failure=None):
        # One variant: build the config, compute the simhash, print it.
        # ``out_filename`` is only referenced by the disabled write below.
        learner = ClusterLearning()
        config = CD.SimhashConfig()
        config.simhash_type = simhash_type
        if discard_failure is not None:
            config.discard_failure = discard_failure
        config.usage.tri_gram = tri_gram
        computed = learner.compute_simhash(site_list_filenames, config)
        # write_proto_to_file(computed, out_filename)
        print(computed)

    _compute_and_print(
        CD.TEXT,
        True,
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_2.text",
        discard_failure=False,
    )
    _compute_and_print(
        CD.DOM,
        False,
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_2.dom",
    )
    _compute_and_print(
        CD.TEXT_DOM,
        False,
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_2.text_dom",
    )