def learn(observed_sites_filenames, outfile=None, inconsistent_coefficient=1):
    """Run hierarchical clustering on observed sites and save the result.

    The learned result is written with ``write_proto_to_file`` to
    ``<outfile>.learned`` when ``outfile`` is truthy, otherwise to
    ``<observed_sites_filenames[0]>.learned``.

    Args:
        observed_sites_filenames: sequence of input filenames, passed
            unchanged to ``ClusterLearning.learn``. Its first element is
            used to name the output when ``outfile`` is not given.
        outfile: optional base name for the output file.
        inconsistent_coefficient: value stored in
            ``cluster_config.algorithm.inconsistent_coefficient``.

    Note (from the original author): after manual testing, the left-out
    ratio may be a good candidate.
    """
    # Resolve the destination first so a bad/empty input fails before any
    # learning work is started.
    base_name = outfile or observed_sites_filenames[0]
    learned_path = base_name + ".learned"

    learner = ClusterLearning()

    config = CD.ClusterConfig()
    algorithm = config.algorithm
    algorithm.name = CD.Algorithm.HIERARCHICAL_CLUSTERING
    algorithm.inconsistent_coefficient = inconsistent_coefficient

    learned = learner.learn(observed_sites_filenames, config)
    write_proto_to_file(learned, learned_path)
def _compute_and_write_simhash(site_list_filenames, out_filename,
                               simhash_type, tri_gram, is_google):
    """Compute one kind of simhash for the site lists and write it out.

    Builds a ``CD.SimhashConfig`` with the given ``simhash_type`` and
    ``usage.tri_gram`` flag, runs ``ClusterLearning.compute_simhash`` on
    ``site_list_filenames`` and writes the result to ``out_filename``
    with ``write_proto_to_file``.
    """
    cluster_learner = ClusterLearning()
    simhash_config = CD.SimhashConfig()
    simhash_config.simhash_type = simhash_type
    # Both of these flags are simply the negation of is_google.
    simhash_config.discard_failure = not is_google
    simhash_config.crawl_log_landing_url_as_observation_landing_url = not is_google
    simhash_config.usage.tri_gram = tri_gram
    res = cluster_learner.compute_simhash(site_list_filenames, simhash_config)
    write_proto_to_file(res, out_filename)


def compute(site_list_filenames, outfile=None, simhash_type=None, is_google=False):
    """Compute text and/or DOM simhashes for the given site lists.

    Output goes to ``<base>.text`` and/or ``<base>.dom``, where ``<base>``
    is ``outfile`` when it is truthy and ``site_list_filenames[0]``
    otherwise.

    Args:
        site_list_filenames: sequence of input filenames, passed unchanged
            to ``ClusterLearning.compute_simhash``.
        outfile: optional base name for the output files.
        simhash_type: ``"TEXT"`` computes only the text simhash, ``"DOM"``
            only the DOM simhash. Any other value (``None``, ``"TEXT_DOM"``,
            or an unrecognised string) computes both.
        is_google: when true, ``discard_failure`` and
            ``crawl_log_landing_url_as_observation_landing_url`` are
            disabled in the simhash config; otherwise both are enabled.
    """
    # At least one of the two passes below always runs, so resolving the
    # base name up front does not change when a bad input is detected.
    base_name = outfile if outfile else site_list_filenames[0]

    # Text pass: simhash_type is None, "TEXT" or "TEXT_DOM" (tri-grams on).
    if simhash_type != "DOM":
        _compute_and_write_simhash(site_list_filenames, base_name + ".text",
                                   CD.TEXT, True, is_google)

    # DOM pass: simhash_type is None, "DOM" or "TEXT_DOM" (tri-grams off).
    if simhash_type != "TEXT":
        _compute_and_write_simhash(site_list_filenames, base_name + ".dom",
                                   CD.DOM, False, is_google)