def plot_sim_distance(inputfile, outfile, simhash_type, proto_type, avg_dist=True):
    """
    Dump simhash distance values for every site in a proto file as text.

    The proto message class named by proto_type is looked up on CD,
    populated from inputfile, and the resulting distances are written to
    outfile, one value per line.

    Output layout:
      * LearnedSites: for each site a header line
        "<name>,<number of patterns>", then for each pattern a line
        "pattern" followed by the values returned by
        simhash_vector_distance(pattern.item, avg_dist).
      * ObservedSites: for each site a header line
        "<name>,<number of observations>", followed by the values returned
        by simhash_vector_distance for the vector produced by
        aggregate_simhash(site, simhash_type).

    @parameter
    inputfile: path of the serialized proto to read
    outfile: path of the text file to write (overwritten)
    simhash_type: passed through get_simhash_type(simhash_type, True);
        the result is only used for ObservedSites
    proto_type: "LearnedSites" or "ObservedSites"
    avg_dist: forwarded unchanged to simhash_vector_distance

    @raise
    Exception: if proto_type is neither "LearnedSites" nor "ObservedSites".
        The check happens before outfile is opened, so an unsupported
        proto type no longer creates or truncates the output file.
    """
    simhash_type = get_simhash_type(simhash_type, True)
    sites = getattr(CD, proto_type)()
    read_proto_from_file(sites, inputfile)
    # Reject unsupported protos before touching the output file.
    if proto_type not in ("LearnedSites", "ObservedSites"):
        raise Exception("Wrong proto! Only LearnedSites and ObservedSites can be used!")
    # The context manager closes the file even if a write or a helper fails.
    with open(outfile, "w") as out_f:
        if proto_type == "LearnedSites":
            for learned_site in sites.site:
                out_f.write(learned_site.name + "," +
                            str(len(learned_site.pattern)) + "\n")
                for pattern in learned_site.pattern:
                    dist_list = simhash_vector_distance(pattern.item, avg_dist)
                    out_f.write("pattern\n" +
                                "\n".join([str(d) for d in dist_list]) + "\n")
        else:
            for observed_site in sites.site:
                out_f.write(observed_site.name + "," +
                            str(len(observed_site.observation)) + "\n")
                simhash_item_vector = aggregate_simhash(observed_site, simhash_type)
                dist_list = simhash_vector_distance(simhash_item_vector, avg_dist)
                out_f.write("\n".join([str(d) for d in dist_list]) + "\n")
def plot_simhash(inputfile, outfile, simhash_type, proto_type):
    """
    Dump the simhash values of every site in a proto file as hex text.

    The proto message class named by proto_type is looked up on CD,
    populated from inputfile, and the simhashes are written to outfile,
    one "%0.16x"-formatted value per line.

    Output layout:
      * LearnedSites: for each site a header line "<name>,<total>", where
        total is the sum of item.count over all items of all patterns,
        then each item's simhash repeated item.count times.
      * ObservedSites: for each site a header line
        "<name>,<number of observations>", then one line per observation
        holding the attribute selected by get_simhash_type(simhash_type).

    @parameter
    inputfile: path of the serialized proto to read
    outfile: path of the text file to write (overwritten)
    simhash_type: passed through get_simhash_type; the result is used as
        the observation attribute name for ObservedSites
    proto_type: "LearnedSites" or "ObservedSites"

    @raise
    Exception: if proto_type is neither "LearnedSites" nor "ObservedSites".
        The check happens before outfile is opened, so an unsupported
        proto type no longer creates or truncates the output file.
    """
    simhash_type = get_simhash_type(simhash_type)
    sites = getattr(CD, proto_type)()
    read_proto_from_file(sites, inputfile)
    # Reject unsupported protos before touching the output file.
    if proto_type not in ("LearnedSites", "ObservedSites"):
        raise Exception("Wrong proto! Only LearnedSites and ObservedSites can be used!")
    # The context manager closes the file even if a write fails midway.
    with open(outfile, "w") as out_f:
        if proto_type == "LearnedSites":
            for site in sites.site:
                observation_size = sum(item.count
                                       for pattern in site.pattern
                                       for item in pattern.item)
                out_f.write(site.name + "," + str(observation_size) + "\n")
                for pattern in site.pattern:
                    for item in pattern.item:
                        item_str = "%0.16x" % item.simhash
                        # Repeat the hash once per counted occurrence; a
                        # zero count still emits a single empty line.
                        out_f.write("\n".join([item_str] * item.count) + "\n")
        else:
            for site in sites.site:
                out_f.write(site.name + "," + str(len(site.observation)) + "\n")
                for observation in site.observation:
                    simhash_str = "%0.16x" % getattr(observation, simhash_type)
                    out_f.write(simhash_str + "\n")
def build_site_simhash_dict(observed_sites):
    """
    Index an ObservedSites message by site name.

    @parameter
    observed_sites: checked with valid_instance against CD.ObservedSites;
        its config.simhash_type is passed to get_simhash_type to obtain
        the observation attribute to collect

    @return
    (site_simhash_dict, observed_sites_dict)
    site_simhash_dict: site name -> set of that attribute's values over
        all observations of every site entry with that name
    observed_sites_dict: site name -> the first site entry seen with
        that name
    """
    valid_instance(observed_sites, CD.ObservedSites)
    hash_attr = get_simhash_type(observed_sites.config.simhash_type)
    simhashes_by_name = {}
    site_by_name = {}
    for site in observed_sites.site:
        name = site.name
        # setdefault keeps the first site registered under a given name
        # while later duplicates still contribute their simhashes.
        site_by_name.setdefault(name, site)
        collected = simhashes_by_name.setdefault(name, set())
        collected.update(getattr(obs, hash_attr) for obs in site.observation)
    return simhashes_by_name, site_by_name