def get_bad(bar_points, filename, outfilename):
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, filename)
    domain_set = set()
    for site in observed_sites.site:
        for observation in site.observation:
            url_domain = top_domain(observation.landing_url)
            domain_set.add(url_domain)
    domain_list = list(domain_set)
    bad_domains = get_domain_reputation(domain_list, bar_points)
    bad_observed_sites = CD.ObservedSites()
    bad_observed_sites.config.CopyFrom(observed_sites.config)
    for site in observed_sites.site:
        observation_list = list()
        for observation in site.observation:
            if top_domain(observation.landing_url) in bad_domains:
                observation_list.append(observation)
        if len(observation_list) == 0:
            continue
        bad_site = bad_observed_sites.site.add()
        bad_site.name = site.name
        for observation in observation_list:
            to_add = bad_site.observation.add()
            to_add.CopyFrom(observation)
    write_proto_to_file(bad_observed_sites, outfilename)
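
# Usage sketch for get_bad (hedged: the file names and the bar value of 60
# are hypothetical; get_domain_reputation is assumed to return the set of
# domains scoring below bar_points).
def _example_get_bad():
    # Keep only observations whose landing domain has a bad reputation score.
    get_bad(60, "sites.computed.text", "sites.computed.text.filt")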
def sample(text_filenames, outfile, sample_size):
    dom_filenames = _replace_list_by(text_filenames, 'text', 'dom')
    google_text_filenames = _replace_list_by(text_filenames, 'user', 'google')
    google_dom_filenames = _replace_list_by(dom_filenames, 'user', 'google')
    text_observed_sites = merge_observed_sites(text_filenames)
    observed_site_list = list()
    url_set = set()
    for observed_site in text_observed_sites.site:
        observed_site_list.append(observed_site)
        for observation in observed_site.observation:
            url_set.add(observation.landing_url)
    logger = logging.getLogger("global")
    logger.info("there are {0} urls".format(len(url_set)))
    logger.info("there are {0} observed sites".format(len(observed_site_list)))
    random.shuffle(observed_site_list)
    # sample_size is the number of sites; the actual number of observations
    # should be larger than this.
    sample_sites = CD.ObservedSites()
    sample_sites.config.CopyFrom(text_observed_sites.config)
    sample_list = observed_site_list[0:sample_size]
    original_label_list = [observed_site.name for observed_site in sample_list]
    for observed_site in sample_list:
        sample_site = sample_sites.site.add()
        sample_site.CopyFrom(observed_site)
    sample_filename = outfile + ".user.sample.text"
    write_proto_to_file(sample_sites, sample_filename)
    _output_sample_sites(original_label_list, dom_filenames,
            outfile + ".user.sample.dom")
    _output_sample_sites(original_label_list, google_text_filenames,
            outfile + '.google.sample.text')
    _output_sample_sites(original_label_list, google_dom_filenames,
            outfile + '.google.sample.dom')
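
# Naming-convention sketch for sample (hedged: file names are hypothetical;
# _replace_list_by is assumed to do a plain substring replacement on each
# name). Every user-side text file fans out to three sibling files:
#   "a.user.text" -> "a.user.dom", "a.google.text", "a.google.dom"
def _example_sample():
    sample(["a.user.text", "b.user.text"], "groundtruth", 100)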
def update_groundtruth(original_expected, original_u_text, original_u_dom,
        original_g_text, original_g_dom, add_expected, add_u_text, add_u_dom,
        add_g_text, add_g_dom, out_expected, out_u_text, out_u_dom,
        out_g_text, out_g_dom):
    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    # the add google inputs are files listing the split google files
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)
    # add_u_t = CD.ObservedSites()
    # read_proto_from_file(add_u_t, add_u_text)
    # add_u_d = CD.ObservedSites()
    # read_proto_from_file(add_u_d, add_u_dom)
    # add_g_t = merge_observed_sites(add_g_text)
    # add_g_d = merge_observed_sites(add_g_dom)
    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set
    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, "
            "size of diff set: {2}".format(len(in_e_set), len(add_e_set),
            len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)
    _output_sample_sites(diff_e_set, [add_expected], add_expected + ".temp")
    _output_sample_sites(diff_e_set, [add_u_text], add_u_text + ".temp")
    _output_sample_sites(diff_e_set, [add_u_dom], add_u_dom + ".temp")
    add_g_text_fs = filter(bool, open(add_g_text, 'r').read().split('\n'))
    add_g_dom_fs = filter(bool, open(add_g_dom, 'r').read().split('\n'))
    _output_sample_sites(diff_e_set, add_g_text_fs, add_g_text + ".temp")
    _output_sample_sites(diff_e_set, add_g_dom_fs, add_g_dom + ".temp")
    out_expected_sites = merge_observed_sites([original_expected,
            add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites([original_u_text,
            add_u_text + ".temp"])
    out_u_d_sites = merge_observed_sites([original_u_dom,
            add_u_dom + ".temp"])
    out_g_t_sites = merge_observed_sites([original_g_text,
            add_g_text + ".temp"])
    out_g_d_sites = merge_observed_sites([original_g_dom,
            add_g_dom + ".temp"])
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)
    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)
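
# The core of update_groundtruth is a name diff over two site collections; a
# minimal standalone sketch (hedged: sites_name_set is assumed to return the
# set of site names contained in an ObservedSites proto):
def _example_diff_new_sites(original_file, added_file):
    original, added = CD.ObservedSites(), CD.ObservedSites()
    read_proto_from_file(original, original_file)
    read_proto_from_file(added, added_file)
    # Names present in the new collection but absent from the old one.
    return sites_name_set(added) - sites_name_set(original)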
def write_crawl_log(self, counter_suffix=True):
    crawl_log_dir = self.crawl_config.crawl_log_dir
    if not crawl_log_dir:
        crawl_log_dir = self.crawl_config.user_agent_md5_dir
    current_log_filename = crawl_log_dir + self.crawl_config.log_filename
    if counter_suffix:
        current_log_filename += "_" + str(self.counter)
    # Write the global crawl_log.
    write_proto_to_file(self.current_log, current_log_filename)
    # After the write, reset the in-memory log.
    self.current_log = CD.CrawlLog()
    return current_log_filename
def crawl(self):
    has_written = False
    for user_agent in self.user_agents:
        user_agent_md5 = hex_md5(user_agent)
        self.crawl_config.user_agent = user_agent
        self.crawl_config.user_agent_md5_dir = self.base_dir + \
                user_agent_md5 + '/'
        # specify which type of browser to use
        set_browser_type(self.crawl_config)
        mkdir_if_not_exist(self.crawl_config.user_agent_md5_dir)
        # md5 - user agent mapping logs
        md5_UA_f = open(self.md5_UA_filename, 'a')
        md5_UA_f.write(user_agent_md5 + ":" + user_agent + "\n")
        md5_UA_f.close()
        # crawl web pages
        url_fetcher = UrlFetcher(self.crawl_config)
        thread_computer = ThreadComputer(url_fetcher, 'fetch_url', self.urls)
        url_fetcher.quit()
        # Write the log for the current user agent.
        current_log = CD.CrawlLog()
        current_log_filename = self.crawl_config.user_agent_md5_dir + \
                'crawl_log'
        current_search = CD.CrawlSearchTerm()
        for p, s in thread_computer.result:
            result = current_search.result.add()
            result.CopyFrom(s)
        result_search = current_log.result_search.add()
        result_search.CopyFrom(current_search)
        write_proto_to_file(current_log, current_log_filename)
        # Write the global crawl_log: re-read what earlier user agents
        # produced, append the current results, and write it back.
        crawl_log = CD.CrawlLog()
        if has_written:
            read_proto_from_file(crawl_log, self.crawl_log_filename)
        else:
            has_written = True
        for r_s in current_log.result_search:
            result_search = crawl_log.result_search.add()
            result_search.CopyFrom(r_s)
        """
        for s in current_log.result:
            result = crawl_log.result.add()
            result.CopyFrom(s)
        """
        write_proto_to_file(crawl_log, self.crawl_log_filename)
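
# The global-log update in crawl() is a read-append-write pattern over a
# protobuf file; a minimal standalone sketch (hedged: CrawlLog and the proto
# I/O helpers are the ones this module already uses):
def _example_append_crawl_log(new_log, log_filename, first_write):
    merged = CD.CrawlLog()
    if not first_write:
        # Load whatever earlier iterations already wrote.
        read_proto_from_file(merged, log_filename)
    for r_s in new_log.result_search:
        merged.result_search.add().CopyFrom(r_s)
    write_proto_to_file(merged, log_filename)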
def _output_sample_sites(original_label_list, filenames, outfile):
    """
    Output the sample sites, either google or user.

    @parameter
    original_label_list: the selected websites
    filenames: observed sites filenames
    outfile: output filename
    @return
    If the observed_sites from filenames don't contain all urls from
    original_label_list, use the return value new_label_list to get the
    updated label list.
    """
    observed_sites = merge_observed_sites(filenames)
    observed_sites_map = dict()
    for observed_site in observed_sites.site:
        observed_sites_map[observed_site.name] = observed_site
    sample_sites = CD.ObservedSites()
    if observed_sites.HasField("config"):
        sample_sites.config.CopyFrom(observed_sites.config)
    else:
        print "There is no config in the observed_sites, please double check why"
        print "This can only happen to expected sites"
        print filenames
    sample_list = list()
    new_label_list = list()
    for label in original_label_list:
        if label in observed_sites_map:
            sample_list.append(observed_sites_map[label])
            new_label_list.append(label)
    for observed_site in sample_list:
        sample_site = sample_sites.site.add()
        sample_site.CopyFrom(observed_site)
    write_proto_to_file(sample_sites, outfile)
    o_size = len(original_label_list)
    n_size = len(new_label_list)
    if o_size != n_size:
        print "size of the original label list is: {0}".format(o_size)
        print "size of the new label list is: {0}".format(n_size)
    # Always return the surviving labels; callers rely on this value.
    return new_label_list
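
# Behavior sketch for _output_sample_sites (hedged: file names hypothetical).
# Labels missing from the merged files are silently dropped, so callers that
# need the surviving labels should use the return value:
def _example_output_sample_sites():
    labels = ["http://a.example.com", "http://b.example.com"]
    kept = _output_sample_sites(labels, ["a.google.text"], "out.sample.text")
    # kept is the subset of labels actually found in a.google.text.
    return kept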
def generate_test(observed_sites_filename, test_size=5000, positive_size=1000):
    text_observed_sites_filename = observed_sites_filename + ".text"
    dom_observed_sites_filename = observed_sites_filename + ".dom"
    if not (os.path.exists(dom_observed_sites_filename) and
            os.path.exists(text_observed_sites_filename)):
        raise Exception("Computed observed sites file doesn't exist!")
    # select for text simhash first
    computed_observed_sites_filename = text_observed_sites_filename
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, computed_observed_sites_filename)
    observed_site_list = list()
    for observed_site in observed_sites.site:
        observed_site_list.append(observed_site)
    random.shuffle(observed_site_list)
    # test_size is the number of sites; the actual number of observations
    # should be larger than this.
    test_sites = CD.ObservedSites()
    mismatch_sites = CD.ObservedSites()
    test_sites.config.CopyFrom(observed_sites.config)
    mismatch_sites.config.CopyFrom(observed_sites.config)
    test_list = observed_site_list[0:test_size]
    mismatch_list = test_list[0:positive_size]
    # original_label_list and mismatch_label_mapping are used in the dom
    # selection below.
    original_label_list = [observed_site.name for observed_site in test_list]
    mismatch_label_mapping = dict()
    for observed_site in mismatch_list:
        # observed_site in test_list are also changed.
        current_label = observed_site.name
        mismatch_label = random.sample(observed_site_list, 1)[0].name
        while top_domain(current_label) == top_domain(mismatch_label):
            mismatch_label = random.sample(observed_site_list, 1)[0].name
        observed_site.name = mismatch_label
        mismatch_site = mismatch_sites.site.add()
        mismatch_site.CopyFrom(observed_site)
        mismatch_label_mapping[current_label] = mismatch_label
    for observed_site in test_list:
        test_site = test_sites.site.add()
        test_site.CopyFrom(observed_site)
    mismatch_sites_filename = computed_observed_sites_filename + ".mismatch"
    test_sites_filename = computed_observed_sites_filename + ".test"
    write_proto_to_file(mismatch_sites, mismatch_sites_filename)
    write_proto_to_file(test_sites, test_sites_filename)
    # select for dom simhash now
    computed_observed_sites_filename = dom_observed_sites_filename
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, computed_observed_sites_filename)
    observed_sites_map = dict()
    for observed_site in observed_sites.site:
        observed_sites_map[observed_site.name] = observed_site
    test_sites = CD.ObservedSites()
    mismatch_sites = CD.ObservedSites()
    test_sites.config.CopyFrom(observed_sites.config)
    mismatch_sites.config.CopyFrom(observed_sites.config)
    test_list = list()
    for label in original_label_list:
        test_list.append(observed_sites_map[label])
    for label in mismatch_label_mapping:
        observed_sites_map[label].name = mismatch_label_mapping[label]
        mismatch_site = mismatch_sites.site.add()
        mismatch_site.CopyFrom(observed_sites_map[label])
    for observed_site in test_list:
        test_site = test_sites.site.add()
        test_site.CopyFrom(observed_site)
    mismatch_sites_filename = computed_observed_sites_filename + ".mismatch"
    test_sites_filename = computed_observed_sites_filename + ".test"
    write_proto_to_file(mismatch_sites, mismatch_sites_filename)
    write_proto_to_file(test_sites, test_sites_filename)
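
# The mismatch step above relabels a site with a name drawn from a different
# top domain; a minimal standalone sketch of that rejection-sampling idea
# (hedged: top_domain is the helper this module already uses):
def _example_pick_mismatch_label(current_label, candidate_labels):
    mismatch_label = random.choice(candidate_labels)
    # Resample until the label comes from a different top domain.
    while top_domain(current_label) == top_domain(mismatch_label):
        mismatch_label = random.choice(candidate_labels)
    return mismatch_label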
def update_groundtruth_redundant(count, original_expected, original_u_text,
        original_u_dom, original_g_text, original_g_dom, add_expected,
        add_all, out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom):
    valid_instance(count, int)
    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)
    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set
    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, "
            "size of diff set: {2}".format(len(in_e_set), len(add_e_set),
            len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)
    diff_e_list = list(diff_e_set)
    logger.info(len(diff_e_list))
    random.shuffle(diff_e_list)
    diff_e_sample = diff_e_list[:count]
    """
    Get the sites that are in the de-duplicated examples and add them. This
    is necessary because some sites that are cloaking get removed in the
    de-dup phase. Not sure why.
    """
    add_u_text_fs = filter(bool, open(add_all, 'r').read().split('\n'))
    diff_e_sample = set(_output_sample_sites(diff_e_sample, add_u_text_fs,
            add_all + ".u.text.temp"))
    # use the updated diff expected set to generate the new data
    _output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
    add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
    _output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
    add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
    add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")
    out_expected_sites = merge_observed_sites([original_expected,
            add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites([original_u_text,
            add_all + ".u.text.temp"])
    out_u_d_sites = merge_observed_sites([original_u_dom,
            add_all + ".u.dom.temp"])
    out_g_t_sites = merge_observed_sites([original_g_text,
            add_all + ".g.text.temp"])
    out_g_d_sites = merge_observed_sites([original_g_dom,
            add_all + ".g.dom.temp"])
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)
    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)
def main(argv):
    has_function = False
    help_msg = """data_util.py -f <function> [options], valid functions are:
    append_prefix        -p <prefix>
    compute_list         -p <prefix> -o <outfile>
    show_proto           -i <inputfile> -t <proto_type>
    intersect_sites      -o <outfile>
    collect_observations -i <site_list> -l <server_link> -o <outdir> -m <mode>
    plot_simhash         -i <inputfile> -o <outfile> -s <simhash_type> -t <proto_type>
    plot_sim_distance    -i <inputfile> -o <outfile> -s <simhash_type> -t <proto_type> -a
    get_domains          -o <outfile>
    get_domain_scores    -i <inputfile> -o <outfile>
    domain_filter        -i <inputfile>
    dedup                -i <text_filt>
    sample               -i <inputfile> -c <count> -o <outfile>
    merge_sites          -o <outfile>
    get_learned_eval     -i <inputfile> -l <learnedfile> -o <outfile>
    export_db            -i <table_name> -o <outfile>
    de_noise             -i <inputfile> -o <outfile>
    update_groundtruth   -i <inputfile> -c <count>
    merge_user_sites     -i <user observation list, suffix removed>"""
    try:
        opts, args = getopt.getopt(argv, "hf:p:o:t:i:m:l:s:ac:",
                ["function=", "prefix=", "outfile=", "proto_type=", "ifile=",
                 "mode=", "link=", "simhash_type=", "avg_dist", "count="])
    except getopt.GetoptError:
        print help_msg
        sys.exit(2)
    hasinputfile = False
    outfile = None
    avg_dist = False
    for opt, arg in opts:
        if opt == "-h":
            print help_msg
            sys.exit()
        elif opt in ("-f", "--function"):
            function = arg
            has_function = True
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-o", "--outfile"):
            outfile = arg
        elif opt in ("-i", "--ifile"):
            inputfile = arg
            hasinputfile = True
        elif opt in ("-t", "--proto_type"):
            proto_type = arg
        elif opt in ("-m", "--mode"):
            mode = arg
        elif opt in ("-l", "--link"):
            link = arg
        elif opt in ("-s", "--simhash_type"):
            simhash_type = arg
        elif opt in ("-a", "--avg_dist"):
            avg_dist = True
        elif opt in ("-c", "--count"):
            count = arg
        else:
            print help_msg
            sys.exit(2)
    if hasinputfile:
        logging.basicConfig(filename=inputfile + "_running_log_" + function,
                level=logging.DEBUG)
        logging.getLogger("global")
    if not has_function:
        print help_msg
        sys.exit()
    if function == "append_prefix":
        inputfile_list = [line[:-1] for line in sys.stdin]
        append_prefix(inputfile_list, prefix)
    elif function == "compute_list":
        crawl_log_list = [line[:-1] for line in sys.stdin]
        compute_list(crawl_log_list, outfile, prefix)
    elif function == "show_proto":
        show_proto(inputfile, proto_type)
    elif function == "intersect_sites":
        observed_sites_list = [line[:-1] for line in sys.stdin]
        result_sites = intersect_observed_sites(*observed_sites_list)
        write_proto_to_file(result_sites, outfile)
        evaluation_form(outfile, outfile + ".eval", "ObservedSites")
    elif function == "collect_observations":
        if link:
            util.REMOTE_DRIVER = link
        site_list = filter(bool, open(inputfile, 'r').read().split('\n'))
        site_set = set(site_list)
        outdir = outfile
        collect_site_for_plot(site_set, outdir, mode)
    elif function == "plot_simhash":
        if not outfile:
            outfile = inputfile + ".plot_cluster"
        plot_simhash(inputfile, outfile, simhash_type, proto_type)
    elif function == "plot_sim_distance":
        if not outfile:
            outfile = inputfile + ".plot_sim_distance"
        plot_sim_distance(inputfile, outfile, simhash_type, proto_type,
                avg_dist)
    elif function == "get_domains":
        observed_sites_list = [line[:-1] for line in sys.stdin]
        get_domains(observed_sites_list, outfile)
    elif function == "get_domain_scores":
        domains = filter(bool, open(inputfile, 'r').read().split('\n'))
        result = domain_scores(domains, outfile)
    elif function == "domain_filter":
        """
        Three steps for computed sites.
        1. filter known benign
        2. de-duplicate
        3. sample $count number of sites
        """
        bar_points = 60
        observed_sites_list = filter(bool,
                open(inputfile, 'r').read().split('\n'))
        for filename in observed_sites_list:
            get_bad(bar_points, filename, filename + ".filt")
    elif function == "dedup":
        text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
        count = 0
        for filename in text_filenames:
            if ((not 'text' in filename) or ('google' in filename) or
                    ('dom' in filename)):
                response = interact_query("The input file doesn't seem to "
                        "be valid! Press [Yes/No] to continue or exit!")
                if not response:
                    sys.exit(0)
            count += dedup(filename)
        logger = logging.getLogger("global")
        logger.info("total sites after dedup: {0}".format(count))
    elif function == "sample":
        text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
        sample(text_filenames, outfile, int(count))
        evaluation_form(outfile + '.user.sample.text',
                outfile + ".user.sample.text.eval", "ObservedSites")
        evaluation_form(outfile + '.google.sample.text',
                outfile + ".google.sample.text.eval", "ObservedSites")
    elif function == "merge_sites":
        observed_sites_names = [line[:-1] for line in sys.stdin]
        observed_sites = merge_observed_sites(observed_sites_names)
        logger = logging.getLogger("global")
        logger.info("total sites after merge: {0}".format(
                len(observed_sites.site)))
        write_proto_to_file(observed_sites, outfile)
    elif function == "merge_user_sites":
        # -i input_file
        filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
        text_filenames = [filename + '.text' for filename in filenames]
        dom_filenames = [filename + '.dom' for filename in filenames]
        text_observed_sites = merge_observed_sites(text_filenames)
        logger = logging.getLogger("global")
        logger.info("total sites after merge: {0}".format(
                len(text_observed_sites.site)))
        write_proto_to_file(text_observed_sites, inputfile + '.text')
        dom_observed_sites = merge_observed_sites(dom_filenames)
        logger.info("total sites after merge: {0}".format(
                len(dom_observed_sites.site)))
        write_proto_to_file(dom_observed_sites, inputfile + '.dom')
    elif function == "get_learned_eval":
        # -l learned_file -i detected_file
        learned_file = link
        observed_file = inputfile
        result_sites = get_learned_eval(learned_file, observed_file)
        write_proto_to_file(result_sites, outfile)
        evaluation_form(outfile, outfile + ".eval", "LearnedSites")
    elif function == "export_db":
        # -i table_name -o outfile
        export_db_to_file(inputfile, outfile)
        export_db_to_file(inputfile, outfile + ".noise", ["PageBroken"])
    elif function == "de_noise":
        # remove noise: index.html not found, feature count = 0
        if "learn" in inputfile:
            response = interact_query("The input file seems to be learned "
                    "sites, but we only support observed sites! Press "
                    "[Yes/No] to continue or exit!")
            if not response:
                sys.exit(0)
        logger = logging.getLogger("global")
        logger.info("processing {0}".format(inputfile))
        de_noise_config = CD.DeNoiseConfig()
        de_noise_config.zero_feature = True
        original = CD.ObservedSites()
        read_proto_from_file(original, inputfile)
        observed_sites = de_noise(original, de_noise_config)
        logger.info("before de-noise: {0}".format(len(original.site)))
        logger.info("after de-noise: {0}".format(len(observed_sites.site)))
        outfile = outfile if outfile else inputfile
        write_proto_to_file(observed_sites, outfile)
    elif function == "update_groundtruth":
        """
        This function is too specific. It adds more malicious examples to
        the collected groundtruth.
        """
        filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
        if len(filenames) == 15:
            original_expected = filenames[0]
            original_u_text = filenames[1]
            original_u_dom = filenames[2]
            original_g_text = filenames[3]
            original_g_dom = filenames[4]
            # observed sites may share the same URL
            add_count = count
            add_expected = filenames[5]
            add_u_text = filenames[6]
            add_u_dom = filenames[7]
            add_g_text = filenames[8]
            add_g_dom = filenames[9]
            # outfiles
            out_expected = filenames[10]
            out_u_text = filenames[11]
            out_u_dom = filenames[12]
            out_g_text = filenames[13]
            out_g_dom = filenames[14]
            # in this case we add the whole diff set
            update_groundtruth(original_expected, original_u_text,
                    original_u_dom, original_g_text, original_g_dom,
                    add_expected, add_u_text, add_u_dom, add_g_text,
                    add_g_dom, out_expected, out_u_text, out_u_dom,
                    out_g_text, out_g_dom)
        elif len(filenames) == 12:
            original_expected = filenames[0]
            original_u_text = filenames[1]
            original_u_dom = filenames[2]
            original_g_text = filenames[3]
            original_g_dom = filenames[4]
            # observed sites may share the same URL
            add_count = int(count)
            add_expected = filenames[5]
            add_all = filenames[6]
            out_expected = filenames[7]
            out_u_text = filenames[8]
            out_u_dom = filenames[9]
            out_g_text = filenames[10]
            out_g_dom = filenames[11]
            update_groundtruth_redundant(add_count, original_expected,
                    original_u_text, original_u_dom, original_g_text,
                    original_g_dom, add_expected, add_all, out_expected,
                    out_u_text, out_u_dom, out_g_text, out_g_dom)
        else:
            raise Exception("Cannot handle now!")
    else:
        print help_msg
        sys.exit(2)
def dedup(text_file):
    """
    1. dom_file, google_text_file and google_dom_file are derived from
       text_file.
    2. google files can be split. We first check whether the unsplit file
       exists; if not, we merge all the split ones.
    3. The observed sites are output to the corresponding filename + '.dedup'.

    @parameter
    text_file: text observed sites file
    @return
    number of websites after deduplication
    """
    dom_file = text_file.replace('text', 'dom')
    user_text_observed_sites = CD.ObservedSites()
    read_proto_from_file(user_text_observed_sites, text_file)
    logger = logging.getLogger("global")
    logger.info("processing {0}".format(text_file))
    logger.info("before dedup: {0}".format(len(user_text_observed_sites.site)))
    user_dom_observed_sites = CD.ObservedSites()
    read_proto_from_file(user_dom_observed_sites, dom_file)
    google_text_file = text_file.replace('user', 'google')
    google_text_observed_sites = load_split_observed_sites(google_text_file)
    google_dom_file = dom_file.replace('user', 'google')
    google_dom_observed_sites = load_split_observed_sites(google_dom_file)
    user_text_dict, user_text_sites_dict = build_site_simhash_dict(
            user_text_observed_sites)
    user_dom_dict, user_dom_sites_dict = build_site_simhash_dict(
            user_dom_observed_sites)
    google_text_dict, google_text_sites_dict = build_site_simhash_dict(
            google_text_observed_sites)
    google_dom_dict, google_dom_sites_dict = build_site_simhash_dict(
            google_dom_observed_sites)
    # how to define exact match
    user_text_remained = CD.ObservedSites()
    user_dom_remained = CD.ObservedSites()
    google_text_remained = CD.ObservedSites()
    google_dom_remained = CD.ObservedSites()
    text_failure = set([0])
    failure_count = 0
    # if the feature set is empty, then this is the hash value (2**64 - 1)
    text_zero = set([18446744073709551615])
    zero_count = 0
    google_failure_count = 0
    google_zero_count = 0
    for site_name in user_text_dict:
        if ((not site_name in google_text_dict) or
                (not site_name in google_dom_dict)):
            continue
        if user_text_dict[site_name] == text_failure:
            failure_count += 1
            continue
        elif user_text_dict[site_name] == text_zero:
            zero_count += 1
            continue
        elif google_text_dict[site_name] == text_failure:
            google_failure_count += 1
            continue
        elif google_text_dict[site_name] == text_zero:
            google_zero_count += 1
            continue
        text_common = user_text_dict[site_name] & google_text_dict[site_name]
        dom_common = user_dom_dict[site_name] & google_dom_dict[site_name]
        if (text_common == user_text_dict[site_name] and
                dom_common == user_dom_dict[site_name]):
            # every user-side simhash also appears on the google side, so
            # both crawlers saw the same content: drop the site
            continue
        else:
            _add_observed_site(user_text_remained, user_text_sites_dict,
                    site_name)
            _add_observed_site(user_dom_remained, user_dom_sites_dict,
                    site_name)
            _add_observed_site(google_text_remained, google_text_sites_dict,
                    site_name)
            _add_observed_site(google_dom_remained, google_dom_sites_dict,
                    site_name)
    user_text_remained.config.CopyFrom(user_text_observed_sites.config)
    user_dom_remained.config.CopyFrom(user_dom_observed_sites.config)
    google_text_remained.config.CopyFrom(google_text_observed_sites.config)
    google_dom_remained.config.CopyFrom(google_dom_observed_sites.config)
    write_proto_to_file(user_text_remained, text_file + ".dedup")
    write_proto_to_file(user_dom_remained, dom_file + ".dedup")
    write_proto_to_file(google_text_remained, google_text_file + ".dedup")
    write_proto_to_file(google_dom_remained, google_dom_file + ".dedup")
    logger.info("after dedup: {0}".format(len(user_text_remained.site)))
    logger.info("failure count: {0}, zero feature count: {1}".format(
            failure_count, zero_count))
    logger.info("google failure count: {0}, google zero feature count: "
            "{1}".format(google_failure_count, google_zero_count))
    return len(user_text_remained.site)