def test_intersect_observed_sites():
    """Check intersect_observed_sites against a hand-computed intersection.

    Each fixture file is read into a fresh CD.ObservedSites message and
    reduced with sites_file_path_set; the per-file results are intersected
    with ``&`` (presumably they are sets -- confirm against
    sites_file_path_set).  The same reduction applied to the value returned
    by intersect_observed_sites must give an equal result.
    """
    observed_sites_list = [
        "../../data/abusive_words_9_category.computed/test.user.dom.cloaking",
        "../../data/abusive_words_9_category.computed/test.user.text.cloaking",
    ]
    expected = None
    for filename in observed_sites_list:
        observed_sites = CD.ObservedSites()
        read_proto_from_file(observed_sites, filename)
        files = sites_file_path_set(observed_sites)
        # Compare with None explicitly: an empty intermediate intersection is
        # falsy, and a truthiness test would discard it and restart from the
        # current file's paths.
        expected = files if expected is None else expected & files
    result_sites = intersect_observed_sites(*observed_sites_list)
    assert_equal(expected, sites_file_path_set(result_sites))
def _read_nonempty_lines(path):
    """Return the non-empty lines of the text file at *path* as a list."""
    with open(path, 'r') as handle:
        return [line for line in handle.read().split('\n') if line]


def _read_stdin_lines():
    """Return the lines of standard input without their trailing newline."""
    # rstrip('\n') rather than line[:-1]: a final line that lacks a newline
    # must not lose its last real character.
    return [line.rstrip('\n') for line in sys.stdin]


def main(argv):
    """Parse command-line options and run the function selected with -f.

    Args:
        argv: list of command-line arguments (without the program name),
            parsed with getopt.  Recognised options are -h, -f/--function,
            -p/--prefix, -o/--outfile, -i/--ifile, -t/--proto_type,
            -m/--mode, -l/--link, -s/--simhash_type, -a/--avg_dist and
            -c/--count.

    Exits with status 2 on an unparsable option or an unknown function, and
    with the default status after printing the help text for -h or when no
    function is given.  When an input file is given, logging is written to
    ``<inputfile>_running_log_<function>``.

    Raises:
        Exception: for update_groundtruth when the input list holds neither
            15 nor 12 file names.
    """
    help_msg = (
        "data_util.py -f <function> [-p <prefix>][-p <prefix> -o <outfile>]"
        "[-i <inputfile> -t <proto_type>][-o <outfile>]"
        "[-i <site_list> -l <server_link> -o <outdir> -m <mode>]"
        "[-i <inputfile>-o <outfile> -s <simhash_type> -t <proto_type>]"
        "[-i <inputfile> -o <outfile> -s <simhash_type> -t <proto_type> -a] "
        "[-o <outfile>] [-i <inputfile> -o <outfile>] [-i <inputfile>] "
        "[-i <text_filt>] [-i <inputfile> -c <count> -o <outfile>] "
        "[-o <outfile>] [-i <inputfile> -l <leanredfile> -o <outfile>], "
        "valid functions are append_prefix, compute_list, show_proto, "
        "intersect_sites, collect_observations, plot_simhash, "
        "plot_sim_distance, get_domains, get_domain_scores, domain_filter, "
        "dedup, sample, merge_sites, get_learned_eval, "
        "[-i <table_name> -o <outfie>] export_db "
        "[-i <inputfile> -o <outfile>] de_noise "
        "[-i <inputfile> -c <count>] update_groundtruth "
        "[-i <user observation list, suffix removed>] merge_user_sites"
    )
    try:
        # "count=" needs the trailing '=' so that --count accepts a value,
        # matching the short form "c:".
        opts, _args = getopt.getopt(
            argv, "hf:p:o:t:i:m:l:s:ac:",
            ["function=", "prefix=", "outfile=", "proto_type=", "ifile=",
             "mode=", "link=", "simhash_type=", "avg_dist", "count="])
    except getopt.GetoptError:
        print(help_msg)
        sys.exit(2)

    function = None
    hasinputfile = False
    outfile = None
    # -l is optional for collect_observations, which only tests its truth.
    link = None
    avg_dist = False
    for opt, arg in opts:
        if opt == "-h":
            print(help_msg)
            sys.exit()
        elif opt in ("-f", "--function"):
            function = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-o", "--outfile"):
            outfile = arg
        elif opt in ("-i", "--ifile"):
            inputfile = arg
            hasinputfile = True
        elif opt in ("-t", "--proto_type"):
            proto_type = arg
        elif opt in ("-m", "--mode"):
            mode = arg
        elif opt in ("-l", "--link"):
            link = arg
        elif opt in ("-s", "--simhash_type"):
            simhash_type = arg
        elif opt in ("-a", "--avg_dist"):
            avg_dist = True
        elif opt in ("-c", "--count"):
            count = arg
        else:
            print(help_msg)
            sys.exit(2)

    # Check for the function before configuring logging: the log file name
    # is built from it.
    if function is None:
        print(help_msg)
        sys.exit()
    if hasinputfile:
        logging.basicConfig(
            filename=inputfile + "_running_log_" + function,
            level=logging.DEBUG)
    logger = logging.getLogger("global")

    if function == "append_prefix":
        append_prefix(_read_stdin_lines(), prefix)
    elif function == "compute_list":
        compute_list(_read_stdin_lines(), outfile, prefix)
    elif function == "show_proto":
        show_proto(inputfile, proto_type)
    elif function == "intersect_sites":
        result_sites = intersect_observed_sites(*_read_stdin_lines())
        write_proto_to_file(result_sites, outfile)
        evaluation_form(outfile, outfile + ".eval", "ObservedSites")
    elif function == "collect_observations":
        if link:
            util.REMOTE_DRIVER = link
        site_set = set(_read_nonempty_lines(inputfile))
        # For this function -o names the output directory.
        collect_site_for_plot(site_set, outfile, mode)
    elif function == "plot_simhash":
        if not outfile:
            outfile = inputfile + ".plot_cluster"
        plot_simhash(inputfile, outfile, simhash_type, proto_type)
    elif function == "plot_sim_distance":
        if not outfile:
            outfile = inputfile + ".plot_sim_distance"
        plot_sim_distance(inputfile, outfile, simhash_type, proto_type,
                          avg_dist)
    elif function == "get_domains":
        get_domains(_read_stdin_lines(), outfile)
    elif function == "get_domain_scores":
        domain_scores(_read_nonempty_lines(inputfile), outfile)
    elif function == "domain_filter":
        # Three steps for computed sites.
        # 1. filter known benign
        # 2. de-duplicate
        # 3. sample $count number of sites
        # Only the filtering call is made here, once per listed file.
        bar_points = 60
        for filename in _read_nonempty_lines(inputfile):
            get_bad(bar_points, filename, filename + ".filt")
    elif function == "dedup":
        total = 0
        for filename in _read_nonempty_lines(inputfile):
            if ('text' not in filename or 'google' in filename
                    or 'dom' in filename):
                response = interact_query(
                    "The input file doesn't seem to "
                    "be valid! Press [Yes/No] to continue or exit!")
                if not response:
                    sys.exit(0)
            total += dedup(filename)
        logger.info("total sites after dedup: {0}".format(total))
    elif function == "sample":
        sample(_read_nonempty_lines(inputfile), outfile, int(count))
        evaluation_form(outfile + '.user.sample.text',
                        outfile + ".user.sample.text.eval", "ObservedSites")
        evaluation_form(outfile + '.google.sample.text',
                        outfile + ".google.sample.text.eval", "ObservedSites")
    elif function == "merge_sites":
        observed_sites = merge_observed_sites(_read_stdin_lines())
        logger.info("total sites after merge: {0}".format(
            len(observed_sites.site)))
        write_proto_to_file(observed_sites, outfile)
    elif function == "merge_user_sites":
        # -i input_file
        filenames = _read_nonempty_lines(inputfile)
        text_filenames = [filename + '.text' for filename in filenames]
        dom_filenames = [filename + '.dom' for filename in filenames]
        text_observed_sites = merge_observed_sites(text_filenames)
        logger.info("total sites after merge: {0}".format(
            len(text_observed_sites.site)))
        write_proto_to_file(text_observed_sites, inputfile + '.text')
        dom_observed_sites = merge_observed_sites(dom_filenames)
        logger.info("total sites after merge: {0}".format(
            len(dom_observed_sites.site)))
        write_proto_to_file(dom_observed_sites, inputfile + '.dom')
    elif function == "get_learned_eval":
        # -l learned_file -i detected_file
        result_sites = get_learned_eval(link, inputfile)
        write_proto_to_file(result_sites, outfile)
        evaluation_form(outfile, outfile + ".eval", "LearnedSites")
    elif function == "export_db":
        # -i table_name -o outfile
        export_db_to_file(inputfile, outfile)
        export_db_to_file(inputfile, outfile + ".noise", ["PageBroken"])
    elif function == "de_noise":
        # remove noise: index.html not found, feature count = 0
        if "learn" in inputfile:
            response = interact_query(
                "The input file seems to "
                "be learned sites, we only support observed "
                "sites! Press [Yes/No] to continue or exit!")
            if not response:
                sys.exit(0)
        logger.info("processing {0}".format(inputfile))
        de_noise_config = CD.DeNoiseConfig()
        de_noise_config.zero_feature = True
        original = CD.ObservedSites()
        read_proto_from_file(original, inputfile)
        observed_sites = de_noise(original, de_noise_config)
        logger.info("before de-noise {0}".format(len(original.site)))
        logger.info("after de-noise: {0}".format(len(observed_sites.site)))
        # Without -o the input file is overwritten in place.
        write_proto_to_file(observed_sites, outfile or inputfile)
    elif function == "update_groundtruth":
        # This function is too specific. It is to add more malicious
        # examples to the collected groundtruth.
        filenames = _read_nonempty_lines(inputfile)
        if len(filenames) == 15:
            # Listed in order: original expected/u_text/u_dom/g_text/g_dom,
            # then the same five "add" files, then the five output files.
            # In this case everything is added, so -c is not needed.
            update_groundtruth(*filenames)
        elif len(filenames) == 12:
            # Listed in order: the five original files, add_expected,
            # add_all, then the five output files.  Observed sites may have
            # the same URL; -c supplies the count passed as first argument.
            update_groundtruth_redundant(int(count), *filenames)
        else:
            raise Exception("Cannot handle now!")
    else:
        print(help_msg)
        sys.exit(2)