def learn(observed_sites_filenames, outfile = None, inconsistent_coefficient = 1):
	"""
	Learn clusters from the observed sites files with hierarchical
	clustering and write the result to "<prefix>.learned".

	The prefix is outfile when it is truthy; otherwise it is the first
	entry of observed_sites_filenames. inconsistent_coefficient is copied
	into the cluster config's algorithm settings.

	Original author's note: after manual testing, left out ratio may be a
	good candidate.
	"""
	if outfile:
		prefix = outfile
	else:
		prefix = observed_sites_filenames[0]
	learned_path = prefix + ".learned"

	learner = ClusterLearning()

	config = CD.ClusterConfig()
	algorithm = config.algorithm
	algorithm.name = CD.Algorithm.HIERARCHICAL_CLUSTERING
	algorithm.inconsistent_coefficient = inconsistent_coefficient

	learned = learner.learn(observed_sites_filenames, config)
	write_proto_to_file(learned, learned_path)
def _compute_simhash_to_file(site_list_filenames, outfile, suffix, simhash_type_name, tri_gram, is_google):
	"""
	Compute simhashes of one kind for site_list_filenames and write the
	result to "<prefix><suffix>".

	The prefix is outfile when it is truthy; otherwise it is the first
	entry of site_list_filenames. simhash_type_name is the name of the
	attribute on CD to use as the simhash type (e.g. "TEXT" or "DOM").
	"""
	out_filename = (outfile if outfile else site_list_filenames[0]) + suffix
	cluster_learner = ClusterLearning()
	simhash_config = CD.SimhashConfig()
	simhash_config.simhash_type = getattr(CD, simhash_type_name)
	# Both flags are switched off together for google input.
	simhash_config.discard_failure = not is_google
	simhash_config.crawl_log_landing_url_as_observation_landing_url = not is_google
	simhash_config.usage.tri_gram = tri_gram
	res = cluster_learner.compute_simhash(site_list_filenames, simhash_config)
	write_proto_to_file(res, out_filename)

def compute(site_list_filenames, outfile = None, simhash_type = None, is_google = False):
	"""
	Compute text and/or DOM simhashes for the given site list files.

	simhash_type selects what is produced:
	  "TEXT"        -> only the text simhash, written to "<prefix>.text"
	  "DOM"         -> only the DOM simhash, written to "<prefix>.dom"
	  anything else -> both (e.g. None or "TEXT_DOM"), text first

	The prefix is outfile when it is truthy; otherwise it is the first
	entry of site_list_filenames. The text simhash is computed with
	tri_gram enabled, the DOM simhash with tri_gram disabled. When
	is_google is true, discard_failure and
	crawl_log_landing_url_as_observation_landing_url are both set to False.
	"""
	# this branch simhash_type == None, TEXT, TEXT_DOM
	if simhash_type != "DOM":
		_compute_simhash_to_file(site_list_filenames, outfile, ".text", "TEXT", True, is_google)
	# this branch simhash_type == None, DOM, TEXT_DOM
	if simhash_type != "TEXT":
		_compute_simhash_to_file(site_list_filenames, outfile, ".dom", "DOM", False, is_google)