def score(self, learn_test, detect_test, y_test, test_params = None):
		"""
		Learn clusters from learn_test and score them against detect_test.

		Both inputs are first converted with list_to_sites() using
		self.simhash_config. Learning uses hierarchical clustering with the
		inconsistent coefficient taken from test_params["train_inconsistent"].

		learn_test -- data used to learn the clusters
		detect_test -- data the learned clusters are evaluated on
		y_test -- expected labels, passed through to score_learned()
		test_params -- parameter dict; when omitted (or empty) a copy of
			self.best_params is used instead

		Returns whatever self.score_learned() returns for the learned sites.
		"""
		if not test_params:
			# Work on a copy so the stored best parameters are never mutated
			# by anything downstream.
			test_params = dict(self.best_params)
		learn_test = list_to_sites(learn_test, self.simhash_config)
		detect_test = list_to_sites(detect_test, self.simhash_config)
		cluster_learner = ClusterLearning()
		cluster_config = CD.ClusterConfig()
		cluster_config.algorithm.name = CD.Algorithm.HIERARCHICAL_CLUSTERING
		cluster_config.algorithm.inconsistent_coefficient = test_params["train_inconsistent"]
		logger = logging.getLogger("global")
		logger.info("validating now, best parameters are:")
		logger.info(test_params)
		logger.info("learning with coefficient: {0}".format(test_params["train_inconsistent"]))
		learned_test_sites = cluster_learner.learn(learn_test, cluster_config)
		logger.info("learning complete!")
		return self.score_learned(learned_test_sites, detect_test, y_test,
				test_params)
	def fit(self, learn_train, detect_train, y_train):
		"""
		Grid-search the clustering/detection parameters on the training data.

		Three nested ranges are explored:
		  train_inconsistent -- self.coefficient[0]..self.coefficient[1]
		  test_inconsistent  -- self.coefficient[2]..self.coefficient[3]
		  test_diameter      -- self.radius[0]..self.radius[1]
		Clusters are learned once per train_inconsistent value and then scored
		with self.score_learned() for every (test_inconsistent, test_diameter)
		pair. The combination with the lowest score wins; ties are broken in
		favour of the smaller (test_inconsistent - train_inconsistent) distance.

		If the winner beats (or ties with a smaller distance) the parameters
		stored in self.best_params, self.best_params is replaced by a copy.

		Returns the dict describing the best combination found in this call
		(keys: score, distance, train_inconsistent, test_inconsistent,
		test_diameter), or an empty dict when the grid contained no points.
		"""
		learn_train = list_to_sites(learn_train, self.simhash_config)
		detect_train = list_to_sites(detect_train, self.simhash_config)
		# Fetched once up front so it is bound even when the grid is empty.
		logger = logging.getLogger("global")
		min_params = dict()
		# NOTE: the same dict object is updated in place and handed to
		# score_learned() on every iteration.
		test_params = dict()
		# T_train
		for t1 in np.arange(self.coefficient[0], self.coefficient[1], self.coefficient_step):
			test_params['train_inconsistent'] = t1
			cluster_learner = ClusterLearning()
			cluster_config = CD.ClusterConfig()
			cluster_config.algorithm.name = CD.Algorithm.HIERARCHICAL_CLUSTERING
			cluster_config.algorithm.inconsistent_coefficient = t1
			logger.info("learning with coefficient: {0}".format(t1))
			learned_sites = cluster_learner.learn(learn_train, cluster_config)
			logger.info("learning complete!")
			# T_test
			for t2 in np.arange(self.coefficient[2], self.coefficient[3], self.coefficient_step):
				test_params['test_inconsistent'] = t2
				distance = t2 - t1
				# R_test
				for r in np.arange(self.radius[0], self.radius[1], self.radius_step):
					test_params['test_diameter'] = r
					logger.info("testing with coefficient: {0}, min_radius: {1}".format(t2, r))
					current_score = self.score_learned(learned_sites, detect_train, y_train, test_params)
					# NOTE(review): the message calls this an f1 score, yet the
					# selection below minimizes it -- confirm what
					# score_learned() actually returns.
					logger.info("testing complete, f1 score is: {0}".format(current_score))
					# The less the error the better; on equal error, the less
					# the distance between train and test threshold (i.e. the
					# uncertain area) the better.
					if (('score' not in min_params) or (current_score < min_params['score']) or
							(current_score == min_params['score'] and min_params['distance'] > distance)):
						min_params['score'] = current_score
						min_params['distance'] = distance
						min_params['train_inconsistent'] = t1
						min_params['test_inconsistent'] = t2
						min_params['test_diameter'] = r
		if 'score' not in min_params:
			# Empty grid: nothing was evaluated, so leave best_params untouched.
			logger.warning("parameter grid is empty, no parameters were evaluated")
			return min_params
		# Compare this run's winner against the stored best using the same
		# ordering as above: lower score first, then smaller distance.
		best_distance = self.best_params.get('distance', float('inf')) if 'score' in self.best_params else None
		if (('score' not in self.best_params) or (min_params['score'] < self.best_params['score']) or
				(min_params['score'] == self.best_params['score'] and best_distance > min_params['distance'])):
			self.best_params = dict(min_params)
			logger.info(self.best_params)
		return min_params
def test_learner():
    """
    Smoke-run ClusterLearning.learn() on one crawl log with two algorithms.

    A single ClusterConfig is reused for both passes: it is first configured
    for hamming threshold clustering and then switched to hierarchical
    clustering (fields set in the first pass are left as they are). After
    each pass a banner and the result are printed to stdout.
    """
    in_filenames = [
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_0.text"
    ]
    # Only needed if the commented-out write below is re-enabled.
    out_filename = in_filenames[0] + ".learned"
    learner = ClusterLearning()
    config = CD.ClusterConfig()

    # (banner, algorithm, ordered field assignments); left out ratio is 5%.
    passes = (
        (
            "result for hamming threhold clustering",
            CD.Algorithm.HAMMING_THRESHOLD,
            (("thres", 5), ("left_out_ratio", 5)),
        ),
        (
            "result for hierarchical clustering",
            CD.Algorithm.HIERARCHICAL_CLUSTERING,
            (("left_out_ratio", 5),),
        ),
    )
    for banner, algorithm_name, settings in passes:
        config.algorithm.name = algorithm_name
        for field_name, value in settings:
            setattr(config.algorithm, field_name, value)
        result = learner.learn(in_filenames, config)
        # write_proto_to_file(result, out_filename)
        print(banner)
        print(result)
def test_computer():
    """
    Smoke-run ClusterLearning.compute_simhash() for three simhash types.

    The same site list is processed with TEXT (tri-gram on, discard_failure
    explicitly set to False), DOM (tri-gram off) and TEXT_DOM (tri-gram off)
    configurations. Every pass uses a fresh learner and a fresh
    SimhashConfig, and prints the result to stdout.
    """
    site_list_filenames = [
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_2"
    ]

    def run_pass(out_filename, simhash_type, tri_gram, clear_discard_failure=False):
        # out_filename is only needed if the commented-out write is re-enabled.
        learner = ClusterLearning()
        config = CD.SimhashConfig()
        config.simhash_type = simhash_type
        if clear_discard_failure:
            # Only the TEXT pass assigns this field; the other passes leave
            # it at whatever SimhashConfig() starts with.
            config.discard_failure = False
        config.usage.tri_gram = tri_gram
        result = learner.compute_simhash(site_list_filenames, config)
        # write_proto_to_file(result, out_filename)
        print(result)

    run_pass(
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_2.text",
        CD.TEXT,
        True,
        clear_discard_failure=True,
    )

    run_pass(
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_2.dom",
        CD.DOM,
        False,
    )

    run_pass(
        "../data/abusive_words.20150115-154913.selenium.crawl/91532f0a84878d909e2deed33e9932cf/ad_crawl_log_2.text_dom",
        CD.TEXT_DOM,
        False,
    )