def update_groundtruth(original_expected, original_u_text, original_u_dom,
                       original_g_text, original_g_dom,
                       add_expected, add_u_text, add_u_dom,
                       add_g_text, add_g_dom,
                       out_expected, out_u_text, out_u_dom,
                       out_g_text, out_g_dom):
    """Extend an existing ground-truth data set with newly labelled sites.

    Only sites named in ``add_expected`` but not in ``original_expected``
    are added.  For each of the five data files the new sites are first
    written to ``<add file> + ".temp"`` and then merged with the matching
    original file; the merged result is written to the matching ``out_*``
    path.

    Args:
        original_expected: Path of the serialized ``CD.ObservedSites`` with
            the current expected sites.
        original_u_text, original_u_dom, original_g_text, original_g_dom:
            Paths of the current serialized ``CD.ObservedSites`` files.
            Their ``config`` is copied onto the corresponding merged output.
        add_expected, add_u_text, add_u_dom: Paths of the observed-sites
            files holding the candidate additions.
        add_g_text, add_g_dom: Paths of plain-text files that list, one per
            line, the observed-sites files to draw the additions from
            (unlike the three arguments above, these are lists of files,
            not observed-sites files themselves).
        out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom: Paths
            the merged results are written to.

    Side effects:
        Creates ``add_expected``, ``add_u_text``, ``add_u_dom``,
        ``add_g_text`` and ``add_g_dom`` each suffixed with ``".temp"``,
        and logs the set sizes and the set of added site names to the
        ``"global"`` logger.
    """
    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)

    # Sites that are new relative to the current expected set.
    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set

    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
        len(in_e_set), len(add_e_set), len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)

    # Write the new sites out of every "add" input into a ".temp" file.
    _output_sample_sites(diff_e_set, [add_expected], add_expected + ".temp")
    _output_sample_sites(diff_e_set, [add_u_text], add_u_text + ".temp")
    _output_sample_sites(diff_e_set, [add_u_dom], add_u_dom + ".temp")

    # The google inputs are list files: one file name per non-empty line.
    # Read them inside ``with`` so the handles are closed, and build real
    # lists so the result does not depend on ``filter`` returning a list.
    with open(add_g_text, 'r') as list_file:
        add_g_text_fs = [name for name in list_file.read().split('\n') if name]
    with open(add_g_dom, 'r') as list_file:
        add_g_dom_fs = [name for name in list_file.read().split('\n') if name]
    _output_sample_sites(diff_e_set, add_g_text_fs, add_g_text + ".temp")
    _output_sample_sites(diff_e_set, add_g_dom_fs, add_g_dom + ".temp")

    # Merge each original file with the matching ".temp" additions.
    out_expected_sites = merge_observed_sites(
        [original_expected, add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites(
        [original_u_text, add_u_text + ".temp"])
    out_u_d_sites = merge_observed_sites(
        [original_u_dom, add_u_dom + ".temp"])
    out_g_t_sites = merge_observed_sites(
        [original_g_text, add_g_text + ".temp"])
    out_g_d_sites = merge_observed_sites(
        [original_g_dom, add_g_dom + ".temp"])

    # The merged observation files take the config of the original files.
    # The expected-sites output is written as merged, without this copy.
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)

    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)
def update_groundtruth_redundant(count, original_expected, original_u_text,
                                 original_u_dom, original_g_text,
                                 original_g_dom, add_expected, add_all,
                                 out_expected, out_u_text, out_u_dom,
                                 out_g_text, out_g_dom):
    """Extend a ground-truth data set with a random sample of new sites.

    Sites named in ``add_expected`` but not in ``original_expected`` are
    shuffled and at most ``count`` of them are kept.  The sample is then
    replaced by whatever ``_output_sample_sites`` returns for the files
    listed in ``add_all``, and that updated sample drives the remaining
    temp files.  Each original file is merged with its temp file and the
    result is written to the matching ``out_*`` path.

    Args:
        count: Maximum number of new sites to sample; must be an ``int``
            (checked with ``valid_instance``).
        original_expected: Path of the serialized ``CD.ObservedSites`` with
            the current expected sites.
        original_u_text, original_u_dom, original_g_text, original_g_dom:
            Paths of the current serialized ``CD.ObservedSites`` files.
            Their ``config`` is copied onto the corresponding merged output.
        add_expected: Path of the observed-sites file holding the candidate
            additions.
        add_all: Path of a plain-text file listing, one per line, the files
            used as the user/text inputs.  The dom and google file lists
            are derived from it with ``_replace_list_by``.
        out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom: Paths
            the merged results are written to.

    Side effects:
        Creates ``add_expected + ".temp"`` and ``add_all`` suffixed with
        ``".u.text.temp"``, ``".u.dom.temp"``, ``".g.text.temp"`` and
        ``".g.dom.temp"``; shuffles with the global ``random`` state; logs
        to the ``"global"`` logger.
    """
    valid_instance(count, int)

    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)

    # Sites that are new relative to the current expected set.
    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set

    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
        len(in_e_set), len(add_e_set), len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)

    # Random sample of at most ``count`` new sites.
    diff_e_list = list(diff_e_set)
    logger.info(len(diff_e_list))
    random.shuffle(diff_e_list)
    diff_e_sample = diff_e_list[:count]

    # Get the sites that are present in the de-duplicated examples and add
    # only those.  This is necessary because some sites are cloaking but
    # were removed in the de-dup phase (reason unknown).
    #
    # The file list is read inside ``with`` so the handle is closed, and is
    # built as a real list because it is consumed several times below; a
    # one-shot iterator would be empty after the first use.
    with open(add_all, 'r') as list_file:
        add_u_text_fs = [name for name in list_file.read().split('\n') if name]
    diff_e_sample = set(_output_sample_sites(
        diff_e_sample, add_u_text_fs, add_all + ".u.text.temp"))

    # Use the updated sample to generate the remaining new data.
    _output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
    add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
    _output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
    add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
    add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")

    # Merge each original file with the matching temp additions.
    out_expected_sites = merge_observed_sites(
        [original_expected, add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites(
        [original_u_text, add_all + ".u.text.temp"])
    out_u_d_sites = merge_observed_sites(
        [original_u_dom, add_all + ".u.dom.temp"])
    out_g_t_sites = merge_observed_sites(
        [original_g_text, add_all + ".g.text.temp"])
    out_g_d_sites = merge_observed_sites(
        [original_g_dom, add_all + ".g.dom.temp"])

    # The merged observation files take the config of the original files.
    # The expected-sites output is written as merged, without this copy.
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)

    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)