def score_learned(self, learned_sites, detect_test, y_test, test_params):
		valid_instance(learned_sites, CD.LearnedSites)
		valid_instance(detect_test, CD.ObservedSites)
		total_size = len(y_test)
		# If detect_test is already an ObservedSites, list_to_sites is a
		# no-op; when y_test is not None, it instead selects the expected
		# (labeled) subset.
		X_test = list_to_sites(detect_test, self.simhash_config)
		X_expected = list_to_sites(detect_test, self.simhash_config, y_test)
		detection_config = CD.DetectionConfig()
		detection_config.algorithm = CD.DetectionConfig.INCONSISTENT_COEFFICIENT
		detection_config.min_radius = test_params['test_diameter']
		detection_config.inconsistent_coefficient = test_params['test_inconsistent']
		detection_config.simhash_type = self.simhash_config.simhash_type
		detector = CloakingDetection(detection_config, learned_sites)
		cloaking_sites = detector.detect(X_test)

		if self.cloaking_sites:
			cloaking_sites = intersect_observed_sites_util(self.cloaking_sites, cloaking_sites)
		else:
			"""
			This is only useful for integrated testing, text detection set cloaking_sites,
			dom detection read this and intersect its cloaking_sites with the one 
			from text and evaluate it.
			"""
			self.cloaking_sites = CD.ObservedSites()
			self.cloaking_sites.CopyFrom(cloaking_sites)

		rate, pr, errors = compute_metrics(cloaking_sites, X_expected, total_size)
		logger = logging.getLogger("global")
		logger.warning(test_params)
		logger.warning(rate)
		logger.warning(pr)
		logger.warning(errors)

		"""
		Record the latest test scores for further analysis
		"""
		self.rate = rate
		self.pr = pr
		self.errors = errors
		res_str = ",".join([ str(test_params["train_inconsistent"]),
				str(test_params["test_inconsistent"]),
				str(test_params["test_diameter"]) ]) + "\n"
		res_str += ",".join([ str(rate[0]), str(rate[1]) ]) + "\n"
		if self.outf:
			self.outf.write(res_str)
		return self.objective(errors)
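	# A minimal usage sketch (hypothetical, not part of this class): sweep the
	# test-time thresholds and keep the parameters with the best objective
	# (assumed lower is better here). "evaluator" and the grid values are
	# assumptions; only score_learned's real signature is used.
	#
	#   best = None
	#   for diameter in [0, 2, 4]:
	#       for coefficient in [0.4, 0.6, 0.8]:
	#           params = {"train_inconsistent": 0.6,
	#                     "test_inconsistent": coefficient,
	#                     "test_diameter": diameter}
	#           score = evaluator.score_learned(learned_sites, detect_test,
	#                   y_test, params)
	#           if best is None or score < best[0]:
	#               best = (score, params)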
	def learn(self, observed_sites_filenames, cluster_config=None):
		"""
		Learn clusters from observed_sites_filenames using cluster_config

		@parameter
		observed_sites_filenames: list of observed_sites to be learned
		cluster_config: configuration for clustering
		@return
		learned_sites: the learned clusters
		"""
		if (not cluster_config) and (not self.cluster_config):
			raise Exception("Cluster config missing")
		elif cluster_config and valid_instance(cluster_config, CD.ClusterConfig):
			self.cluster_config = CD.ClusterConfig()
			self.cluster_config.CopyFrom(cluster_config)
		# learn the clusters
		if isinstance(observed_sites_filenames, list):
			observed_sites = merge_observed_sites(observed_sites_filenames)
		elif valid_instance(observed_sites_filenames, CD.ObservedSites):
			observed_sites = observed_sites_filenames
		else:
			raise Exception("Wrong argument for learn!")
		
		print "in learning phase"
		print "before de noise {0}".format(len(observed_sites.site))
		de_noise_config = CD.DeNoiseConfig()
		observed_sites = de_noise(observed_sites, de_noise_config)
		print "after de noise {0}".format(len(observed_sites.site))
		learned_sites = CD.LearnedSites()
		self.cluster_config.simhash_type = observed_sites.config.simhash_type
		for observed_site in observed_sites.site:
			# Either TEXT or DOM can be handled now; TEXT_DOM is not supported.
			algorithm_name = self.cluster_config.algorithm.name
			if algorithm_name == CD.Algorithm.HAMMING_THRESHOLD:
				result = HammingTreshold(self.cluster_config, observed_site)
			elif algorithm_name == CD.Algorithm.K_MEANS:
				result = KMeans(self.cluster_config, observed_site)
			elif algorithm_name == CD.Algorithm.SPECTRAL_CLUSTERING:
				result = SpectralClustering(self.cluster_config, observed_site)
			elif algorithm_name == CD.Algorithm.HIERARCHICAL_CLUSTERING:
				# result = HierarchicalClustering(self.cluster_config, observed_site)
				result = ScipyHierarchicalClustering(self.cluster_config,
						observed_site)
			else:
				raise Exception("Unknown clustering algorithm!")
			# The clustering function returns None when no pattern can be
			# extracted; only keep non-empty results.
			if result:
				learned_site = learned_sites.site.add()
				learned_site.CopyFrom(result)
		return learned_sites
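	# A minimal sketch of driving learn() (hypothetical file names; the only
	# ClusterConfig field this method dispatches on is algorithm.name):
	#
	#   config = CD.ClusterConfig()
	#   config.algorithm.name = CD.Algorithm.HIERARCHICAL_CLUSTERING
	#   learned = learner.learn(["site_list.text.observed_sites"], config)
	#   print len(learned.site)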
	def compute_simhash(self, site_list_filenames, simhash_config):
		"""
		Compute simhash for site_list_filenames using configuration simhash_config

		@parameter
		site_list_filenames: a list of site_list_filename (CrawlLog)
		simhash_config: the configuration for simhash computing
		@return
		observed_sites: site observations aggregated by site
		"""

		# Input is a list of site_list_filename
		valid_instance(simhash_config, CD.SimhashConfig)
		if simhash_config.crawl_log_landing_url_as_observation_landing_url:
			url_field = "landing_url"
		else:
			url_field = "url"
		observed_sites, path_list = load_observed_sites(site_list_filenames,
				url_field)
		simhash_computer = HtmlSimhashComputer(simhash_config)
		thread_computer = ThreadComputer(simhash_computer, 'compute_simhash', path_list)
		path_simhash_dict = dict()
		for p, s in thread_computer.result:
			path_simhash_dict[p] = s
		observed_sites.config.CopyFrom(simhash_config)
		for site in observed_sites.site:
			count = 0
			# Iterate over a snapshot of the repeated field: observations
			# may be deleted from site.observation while we walk it.
			for observation in list(site.observation):
				if observation.file_path not in path_simhash_dict:
					# Simhash computation for this observation failed, so
					# drop it: we have no sample and it is not marked as a
					# failure.
					del site.observation[count]
					continue
				result = path_simhash_dict[observation.file_path]
				if simhash_config.simhash_type in [CD.TEXT, CD.TEXT_DOM]:
					observation.text_simhash = result[0][0].value
					observation.text_feature_count = result[0][1]
				if simhash_config.simhash_type in [CD.DOM, CD.TEXT_DOM]:
					observation.dom_simhash = result[-1][0].value
					observation.dom_feature_count = result[-1][1]
				count += 1
		if not simhash_config.discard_failure:
			observed_sites = add_failure(observed_sites, site_list_filenames)
		return observed_sites
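	# Sketch (hypothetical file name and "computer" instance): compute TEXT
	# simhashes over tri-grams for a list of crawl logs and inspect the
	# aggregated observations.
	#
	#   simhash_config = CD.SimhashConfig()
	#   simhash_config.simhash_type = CD.TEXT
	#   simhash_config.usage.tri_gram = True
	#   observed = computer.compute_simhash(["crawl_log_list"], simhash_config)
	#   for site in observed.site:
	#       print len(site.observation)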
	def build_by_dom(self, html_dom):
		if valid_instance(html_dom, CD.HtmlDom):
			features = list()
			if self.simhash_config.usage.gram:
				for feature in html_dom.node:
					features.append(feature.name)
			if self.simhash_config.usage.bi_gram:
				for feature in html_dom.bi_node:
					features.append(feature.name)
			if self.simhash_config.usage.tri_gram:
				for feature in html_dom.tri_node:
					features.append(feature.name)
			return [self.build_by_features(features), len(features)]
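	# Example of what the usage flags select for DOM features (values are
	# illustrative, including the bi_node naming): with usage.gram and
	# usage.bi_gram set, an HtmlDom whose node names are ["html", "body"] and
	# whose bi_node names are ["html,body"] yields
	# features = ["html", "body", "html,body"], and the returned pair is
	# [Simhash(features), 3].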
	def build_by_text(self, html_text):
		if valid_instance(html_text, CD.HtmlText):
			# weighted features are not supported for now
			features = list()
			if self.simhash_config.usage.gram:
				for feature in html_text.word:
					features.append(feature.name)
			if self.simhash_config.usage.bi_gram:
				for feature in html_text.bi_gram:
					features.append(feature.name)
			if self.simhash_config.usage.tri_gram:
				for feature in html_text.tri_gram:
					features.append(feature.name)
			return [self.build_by_features(features), len(features)]
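	# Same pattern for text (values are illustrative): with usage.gram and
	# usage.bi_gram set, an HtmlText whose word list is ["cheap", "pills"]
	# and whose bi_gram list is ["cheap pills"] yields
	# features = ["cheap", "pills", "cheap pills"], and the returned pair is
	# [Simhash(features), 3].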
	def build_by_features(self, features):
		if valid_instance(features, collections.Iterable):
			return Simhash(features)
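	# Note: this defers to the simhash library's Simhash(features), which
	# hashes each feature and combines per-bit votes into a single
	# fingerprint (64 bits by default; this class does not override the
	# width), so similar feature sets end up at a small Hamming distance.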
	def __init__(self, simhash_config):
		if valid_instance(simhash_config, CD.SimhashConfig):
			self.simhash_config = simhash_config
	def __init__(self, cluster_config=None):
		if not cluster_config:
			self.cluster_config = None
		elif valid_instance(cluster_config, CD.ClusterConfig):
			self.cluster_config = CD.ClusterConfig()
			self.cluster_config.CopyFrom(cluster_config)