def plot_sim_distance(inputfile, outfile, simhash_type, proto_type, avg_dist=True):
    simhash_type = get_simhash_type(simhash_type, True)
    sites = getattr(CD, proto_type)()
    read_proto_from_file(sites, inputfile)
    out_f = open(outfile, "w")
    if proto_type == "LearnedSites":
        for learned_site in sites.site:
            out_f.write(learned_site.name + "," + str(len(learned_site.pattern)) + "\n")
            for pattern in learned_site.pattern:
                dist_list = simhash_vector_distance(pattern.item, avg_dist)
                out_f.write("pattern\n" + "\n".join([str(d) for d in dist_list]) + "\n")
        out_f.close()
    elif proto_type == "ObservedSites":
        for observed_site in sites.site:
            out_f.write(observed_site.name + "," + str(len(observed_site.observation)) + "\n")
            simhash_item_vector = aggregate_simhash(observed_site, simhash_type)
            dist_list = simhash_vector_distance(simhash_item_vector, avg_dist)
            out_f.write("\n".join([str(d) for d in dist_list]) + "\n")
        out_f.close()
    else:
        raise Exception("Wrong proto! Only LearnedSites and ObservedSites can be used!")
def get_bad(bar_points, filename, outfilename):
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, filename)
    domain_set = set()
    for site in observed_sites.site:
        for observation in site.observation:
            url_domain = top_domain(observation.landing_url)
            domain_set.add(url_domain)
    domain_list = list(domain_set)
    bad_domains = get_domain_reputation(domain_list, bar_points)
    bad_observed_sites = CD.ObservedSites()
    bad_observed_sites.config.CopyFrom(observed_sites.config)
    for site in observed_sites.site:
        observation_list = list()
        for observation in site.observation:
            if top_domain(observation.landing_url) in bad_domains:
                observation_list.append(observation)
        if len(observation_list) == 0:
            continue
        bad_site = bad_observed_sites.site.add()
        bad_site.name = site.name
        for observation in observation_list:
            to_add = bad_site.observation.add()
            to_add.CopyFrom(observation)
    write_proto_to_file(bad_observed_sites, outfilename)
def plot_simhash(inputfile, outfile, simhash_type, proto_type):
    simhash_type = get_simhash_type(simhash_type)
    sites = getattr(CD, proto_type)()
    read_proto_from_file(sites, inputfile)
    out_f = open(outfile, "w")
    if proto_type == "LearnedSites":
        for site in sites.site:
            observation_size = 0
            for pattern in site.pattern:
                for item in pattern.item:
                    observation_size += item.count
            out_f.write(site.name + "," + str(observation_size) + "\n")
            for pattern in site.pattern:
                for item in pattern.item:
                    item_str = "%0.16x" % item.simhash
                    item_str_array = [item_str for i in range(item.count)]
                    out_f.write("\n".join(item_str_array) + "\n")
        out_f.close()
    elif proto_type == "ObservedSites":
        for site in sites.site:
            out_f.write(site.name + "," + str(len(site.observation)) + "\n")
            for observation in site.observation:
                simhash_str = "%0.16x" % getattr(observation, simhash_type)
                out_f.write(simhash_str + "\n")
        out_f.close()
    else:
        raise Exception("Wrong proto! Only LearnedSites and ObservedSites can be used!")
def get_domains(observed_sites_list, outfile):
    domain_set = set()
    for filename in observed_sites_list:
        observed_sites = CD.ObservedSites()
        read_proto_from_file(observed_sites, filename)
        for site in observed_sites.site:
            for observation in site.observation:
                url_domain = top_domain(observation.landing_url)
                domain_set.add(url_domain)
    open(outfile, 'w').write("\n".join(domain_set))
def test_intersect_observed_sites():
    observed_sites_list = [
        "../../data/abusive_words_9_category.computed/test.user.dom.cloaking",
        "../../data/abusive_words_9_category.computed/test.user.text.cloaking"]
    result = None
    for filename in observed_sites_list:
        observed_sites = CD.ObservedSites()
        read_proto_from_file(observed_sites, filename)
        files = sites_file_path_set(observed_sites)
        result = result & files if result else files
    result_sites = intersect_observed_sites(*observed_sites_list)
    new_set = sites_file_path_set(result_sites)
    assert_equal(result, new_set)
def test_generate_test():
    filename = "../../data/US_web_search_list.Chrome.20141110-185317.selenium.crawl/crawl_log"
    generate_test(filename)
    text_test = filename + ".text.test"
    text_mismatch = filename + ".text.mismatch"
    dom_test = filename + ".dom.test"
    dom_mismatch = filename + ".dom.mismatch"
    text_test_sites = CD.ObservedSites()
    text_mismatch_sites = CD.ObservedSites()
    dom_test_sites = CD.ObservedSites()
    dom_mismatch_sites = CD.ObservedSites()
    read_proto_from_file(text_test_sites, text_test)
    read_proto_from_file(text_mismatch_sites, text_mismatch)
    read_proto_from_file(dom_test_sites, dom_test)
    read_proto_from_file(dom_mismatch_sites, dom_mismatch)
    assert_equal(len(text_test_sites.site), 5000)
    assert_equal(len(text_mismatch_sites.site), 1000)
    assert_equal(len(dom_test_sites.site), 5000)
    assert_equal(len(dom_mismatch_sites.site), 1000)
    text_test_set = set()
    text_mismatch_set = set()
    dom_test_set = set()
    dom_mismatch_set = set()
    for site in text_test_sites.site:
        text_test_set.add(site.name)
    for site in text_mismatch_sites.site:
        text_mismatch_set.add(site.name)
    for site in dom_test_sites.site:
        dom_test_set.add(site.name)
    for site in dom_mismatch_sites.site:
        dom_mismatch_set.add(site.name)
    assert_equal(text_test_set, dom_test_set)
    assert_equal(text_mismatch_set, dom_mismatch_set)
def check_equal(first_file, second_file):
    first_observed_sites = CD.ObservedSites()
    read_proto_from_file(first_observed_sites, first_file)
    second_observed_sites = CD.ObservedSites()
    read_proto_from_file(second_observed_sites, second_file)
    first_observed_sites_map = dict()
    for observed_site in first_observed_sites.site:
        first_observed_sites_map[observed_site.name] = observed_site
    for observed_site in second_observed_sites.site:
        if observed_site.name not in first_observed_sites_map:
            return False
        if not observed_site == first_observed_sites_map[observed_site.name]:
            return False
    return True
def load_split_observed_sites(filename):
    if not os.path.exists(filename):
        count = 0
        split_files = list()
        while True:
            split_file = filename.replace('list', 'list_' + str(count))
            if not os.path.exists(split_file):
                break
            split_files.append(split_file)
            count += 1
        observed_sites = merge_observed_sites(split_files)
    else:
        observed_sites = CD.ObservedSites()
        read_proto_from_file(observed_sites, filename)
    return observed_sites
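
# A minimal usage sketch for load_split_observed_sites (the path below is
# hypothetical). If "abusive_words.list.google.text" is missing, the loader
# merges the split parts "abusive_words.list_0.google.text",
# "abusive_words.list_1.google.text", ... until the first missing index.
def _example_load_split_observed_sites():
    # hypothetical filename; any path containing 'list' follows the same scheme
    sites = load_split_observed_sites("abusive_words.list.google.text")
    return len(sites.site)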
def evaluation_form(sites_filename, out_filename, proto):
    sites = getattr(CD, proto)()
    read_proto_from_file(sites, sites_filename)
    out_f = open(out_filename, "w")
    if proto == "LearnedSites":
        for site in sites.site:
            for pattern in site.pattern:
                out_f.write(site.name + "\n" +
                        pattern.item[0].sample_file_path + "\n")
        out_f.close()
    elif proto == "ObservedSites":
        for site in sites.site:
            for observation in site.observation:
                out_f.write(site.name + "\n" + observation.file_path + "\n")
        out_f.close()
    else:
        raise Exception("Wrong proto! Only LearnedSites and ObservedSites can be used!")
def crawl(self):
    has_written = False
    for user_agent in self.user_agents:
        user_agent_md5 = hex_md5(user_agent)
        self.crawl_config.user_agent = user_agent
        self.crawl_config.user_agent_md5_dir = self.base_dir + user_agent_md5 + '/'
        # specify which type of browser to use
        set_browser_type(self.crawl_config)
        mkdir_if_not_exist(self.crawl_config.user_agent_md5_dir)
        # md5 - user agent mapping logs
        md5_UA_f = open(self.md5_UA_filename, 'a')  # user agent
        md5_UA_f.write(user_agent_md5 + ":" + user_agent + "\n")
        md5_UA_f.close()
        # crawl web pages
        url_fetcher = UrlFetcher(self.crawl_config)
        thread_computer = ThreadComputer(url_fetcher, 'fetch_url', self.urls)
        url_fetcher.quit()
        # Write log for current user agent
        current_log = CD.CrawlLog()
        current_log_filename = self.crawl_config.user_agent_md5_dir + 'crawl_log'
        current_search = CD.CrawlSearchTerm()
        for p, s in thread_computer.result:
            result = current_search.result.add()
            result.CopyFrom(s)
        result_search = current_log.result_search.add()
        result_search.CopyFrom(current_search)
        write_proto_to_file(current_log, current_log_filename)
        # Write global crawl_log
        crawl_log = CD.CrawlLog()
        if has_written:
            read_proto_from_file(crawl_log, self.crawl_log_filename)
        else:
            has_written = True
        for r_s in current_log.result_search:
            result_search = crawl_log.result_search.add()
            result_search.CopyFrom(r_s)
        """
        for s in current_log.result:
            result = crawl_log.result.add()
            result.CopyFrom(s)
        """
        write_proto_to_file(crawl_log, self.crawl_log_filename)
def get_learned_eval(learned_file, observed_file):
    learned_sites = CD.LearnedSites()
    read_proto_from_file(learned_sites, learned_file)
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, observed_file)
    observed_sites_list = list()
    for observed_site in observed_sites.site:
        observed_sites_list.append(observed_site.name)
    learned_sites_map = dict()
    for learned_site in learned_sites.site:
        learned_sites_map[learned_site.name] = learned_site
    result_sites = CD.LearnedSites()
    for site_name in observed_sites_list:
        if site_name not in learned_sites_map:
            print "Detected cloaking: {0} not in learned sites, Strange!".format(site_name)
            continue
        result_site = result_sites.site.add()
        result_site.CopyFrom(learned_sites_map[site_name])
    return result_sites
def revisit(crawl_log_file_list, word_file, n):
    """
    Visit the landing urls in each crawl_log file n times.

    @parameter
    crawl_log_file_list: list of filenames of crawl_log
    word_file: the word file that produced crawl_log_file, used for creating base_dir
    n: number of times to visit
    """
    # google_UA is not used in search and crawl. Used in later visit.
    google_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
    google_suffix = 'google.crawl/'
    for i in range(int(n)):
        # the time label is set for each iteration of visit
        now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
        for crawl_log_file in crawl_log_file_list:
            # compute base_dir and start logging
            base_dir = '.'.join([word_file, google_suffix])
            mkdir_if_not_exist(base_dir)
            logging.basicConfig(filename=base_dir + 'running_log' + now_suffix,
                    level=logging.DEBUG)
            logging.getLogger("global")
            # set crawl_config
            crawl_config = CD.CrawlConfig()
            crawl_config.maximum_threads = 6
            crawl_config.user_agent = google_UA
            crawl_config.user_agent_md5_dir = base_dir + hex_md5(crawl_config.user_agent) \
                    + now_suffix + '/'
            crawl_config.browser_type = CD.CrawlConfig.CHROME
            google_crawl_log = crawl_log_file.split('/')[-1] + '.google'
            crawl_config.log_filename = google_crawl_log + now_suffix
            revisit = Visit(crawl_config)
            crawl_log = CD.CrawlLog()
            read_proto_from_file(crawl_log, crawl_log_file)
            landing_url_set = crawl_log_attr_set(crawl_log, "landing_url")
            revisit.visit_landing_url(landing_url_set)
            revisit.write_crawl_log(False)
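
# A minimal usage sketch for revisit(); the crawl_log path and word file are
# hypothetical. Results land under "<word_file>.google.crawl/" with a
# per-iteration timestamp suffix appended to the log and directory names.
def _example_revisit_usage():
    revisit(["abusive_words.selenium.crawl/ad_crawl_log.20141110-185317"],
            "abusive_words", 3)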
def update_groundtruth(original_expected, original_u_text, original_u_dom,
        original_g_text, original_g_dom, add_expected, add_u_text, add_u_dom,
        add_g_text, add_g_dom, out_expected, out_u_text, out_u_dom,
        out_g_text, out_g_dom):
    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    # add google is list
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)
    #add_u_t = CD.ObservedSites()
    #read_proto_from_file(add_u_t, add_u_text)
    #add_u_d = CD.ObservedSites()
    #read_proto_from_file(add_u_d, add_u_dom)
    #add_g_t = merge_observed_sites(add_g_text)
    #add_g_d = merge_observed_sites(add_g_dom)
    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set
    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
        len(in_e_set), len(add_e_set), len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)
    _output_sample_sites(diff_e_set, [add_expected], add_expected + ".temp")
    _output_sample_sites(diff_e_set, [add_u_text], add_u_text + ".temp")
    _output_sample_sites(diff_e_set, [add_u_dom], add_u_dom + ".temp")
    add_g_text_fs = filter(bool, open(add_g_text, 'r').read().split('\n'))
    add_g_dom_fs = filter(bool, open(add_g_dom, 'r').read().split('\n'))
    _output_sample_sites(diff_e_set, add_g_text_fs, add_g_text + ".temp")
    _output_sample_sites(diff_e_set, add_g_dom_fs, add_g_dom + ".temp")
    out_expected_sites = merge_observed_sites([original_expected, add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites([original_u_text, add_u_text + ".temp"])
    out_u_d_sites = merge_observed_sites([original_u_dom, add_u_dom + ".temp"])
    out_g_t_sites = merge_observed_sites([original_g_text, add_g_text + ".temp"])
    out_g_d_sites = merge_observed_sites([original_g_dom, add_g_dom + ".temp"])
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)
    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)
def dedup(text_file):
    """
    1. dom_file, google_text_file and google_dom_file are derived from text_file.
    2. google files can be split: we first check whether the unsplit file exists;
       if not, we merge all the split ones.
    3. The deduplicated observed sites are written to the corresponding filename + '.dedup'.

    @parameter
    text_file: text observed sites file
    @return
    number of websites left after deduplication
    """
    dom_file = text_file.replace('text', 'dom')
    user_text_observed_sites = CD.ObservedSites()
    read_proto_from_file(user_text_observed_sites, text_file)
    logger = logging.getLogger("global")
    logger.info("processing {0}".format(text_file))
    logger.info("before dedup: {0}".format(len(user_text_observed_sites.site)))
    user_dom_observed_sites = CD.ObservedSites()
    read_proto_from_file(user_dom_observed_sites, dom_file)
    google_text_file = text_file.replace('user', 'google')
    google_text_observed_sites = load_split_observed_sites(google_text_file)
    google_dom_file = dom_file.replace('user', 'google')
    google_dom_observed_sites = load_split_observed_sites(google_dom_file)
    user_text_dict, user_text_sites_dict = build_site_simhash_dict(user_text_observed_sites)
    user_dom_dict, user_dom_sites_dict = build_site_simhash_dict(user_dom_observed_sites)
    google_text_dict, google_text_sites_dict = build_site_simhash_dict(google_text_observed_sites)
    google_dom_dict, google_dom_sites_dict = build_site_simhash_dict(google_dom_observed_sites)
    # how to define exact match
    user_text_remained = CD.ObservedSites()
    user_dom_remained = CD.ObservedSites()
    google_text_remained = CD.ObservedSites()
    google_dom_remained = CD.ObservedSites()
    text_failure = set([0])
    failure_count = 0
    # if the feature set is empty, then this is the hash value.
    text_zero = set([18446744073709551615])
    zero_count = 0
    google_failure_count = 0
    google_zero_count = 0
    for site_name in user_text_dict:
        if ((site_name not in google_text_dict) or
                (site_name not in google_dom_dict)):
            continue
        if (user_text_dict[site_name] == text_failure):
            failure_count += 1
            continue
        elif (user_text_dict[site_name] == text_zero):
            zero_count += 1
            continue
        elif (google_text_dict[site_name] == text_failure):
            google_failure_count += 1
            continue
        elif (google_text_dict[site_name] == text_zero):
            google_zero_count += 1
            continue
        text_common = user_text_dict[site_name] & google_text_dict[site_name]
        dom_common = user_dom_dict[site_name] & google_dom_dict[site_name]
        if (text_common == user_text_dict[site_name] and
                dom_common == user_dom_dict[site_name]):
            continue
        else:
            _add_observed_site(user_text_remained, user_text_sites_dict, site_name)
            _add_observed_site(user_dom_remained, user_dom_sites_dict, site_name)
            _add_observed_site(google_text_remained, google_text_sites_dict, site_name)
            _add_observed_site(google_dom_remained, google_dom_sites_dict, site_name)
    user_text_remained.config.CopyFrom(user_text_observed_sites.config)
    user_dom_remained.config.CopyFrom(user_dom_observed_sites.config)
    google_text_remained.config.CopyFrom(google_text_observed_sites.config)
    google_dom_remained.config.CopyFrom(google_dom_observed_sites.config)
    write_proto_to_file(user_text_remained, text_file + ".dedup")
    write_proto_to_file(user_dom_remained, dom_file + ".dedup")
    write_proto_to_file(google_text_remained, google_text_file + ".dedup")
    write_proto_to_file(google_dom_remained, google_dom_file + ".dedup")
    logger.info("after dedup: {0}".format(len(user_text_remained.site)))
    logger.info("failure count: {0}, zero feature count: {1}".format(failure_count, zero_count))
    logger.info("google failure count: {0}, google zero feature count: {1}".format(
        google_failure_count, google_zero_count))
    return len(user_text_remained.site)
def main(argv):
    has_function = False
    help_msg = """data_util.py -f <function> [-p <prefix>] [-p <prefix> -o <outfile>]
        [-i <inputfile> -t <proto_type>] [-o <outfile>]
        [-i <site_list> -l <server_link> -o <outdir> -m <mode>]
        [-i <inputfile> -o <outfile> -s <simhash_type> -t <proto_type>]
        [-i <inputfile> -o <outfile> -s <simhash_type> -t <proto_type> -a]
        [-o <outfile>] [-i <inputfile> -o <outfile>] [-i <inputfile>]
        [-i <text_file>] [-i <inputfile> -c <count> -o <outfile>] [-o <outfile>]
        [-i <inputfile> -l <learnedfile> -o <outfile>], valid functions are
        append_prefix, compute_list, show_proto, intersect_sites,
        collect_observations, plot_simhash, plot_sim_distance, get_domains,
        get_domain_scores, domain_filter, dedup, sample, merge_sites,
        get_learned_eval,
        [-i <table_name> -o <outfile>] export_db
        [-i <inputfile> -o <outfile>] de_noise
        [-i <inputfile> -c <count>] update_groundtruth
        [-i <user observation list, suffix removed>] merge_user_sites"""
    try:
        opts, args = getopt.getopt(argv, "hf:p:o:t:i:m:l:s:ac:", ["function=",
            "prefix=", "outfile=", "proto_type=", "ifile=", "mode=", "link=",
            "simhash_type=", "avg_dist", "count="])
    except getopt.GetoptError:
        print help_msg
        sys.exit(2)
    hasinputfile = False
    outfile = None
    avg_dist = False
    for opt, arg in opts:
        if opt == "-h":
            print help_msg
            sys.exit()
        elif opt in ("-f", "--function"):
            function = arg
            has_function = True
        elif opt in ("-p", "--prefix"):
            prefix = arg
        elif opt in ("-o", "--outfile"):
            outfile = arg
        elif opt in ("-i", "--ifile"):
            inputfile = arg
            hasinputfile = True
        elif opt in ("-t", "--proto_type"):
            proto_type = arg
        elif opt in ("-m", "--mode"):
            mode = arg
        elif opt in ("-l", "--link"):
            link = arg
        elif opt in ("-s", "--simhash_type"):
            simhash_type = arg
        elif opt in ("-a", "--avg_dist"):
            avg_dist = True
        elif opt in ("-c", "--count"):
            count = arg
        else:
            print help_msg
            sys.exit(2)
    if hasinputfile:
        logging.basicConfig(filename=inputfile + "_running_log_" + function,
                level=logging.DEBUG)
        logging.getLogger("global")
    if not has_function:
        print help_msg
        sys.exit()
    if function == "append_prefix":
        inputfile_list = [line[:-1] for line in sys.stdin]
        append_prefix(inputfile_list, prefix)
    elif function == "compute_list":
        crawl_log_list = [line[:-1] for line in sys.stdin]
        compute_list(crawl_log_list, outfile, prefix)
    elif function == "show_proto":
        show_proto(inputfile, proto_type)
    elif function == "intersect_sites":
        observed_sites_list = [line[:-1] for line in sys.stdin]
        result_sites = intersect_observed_sites(*observed_sites_list)
        write_proto_to_file(result_sites, outfile)
        evaluation_form(outfile, outfile + ".eval", "ObservedSites")
    elif function == "collect_observations":
        if link:
            util.REMOTE_DRIVER = link
        site_list = filter(bool, open(inputfile, 'r').read().split('\n'))
        site_set = set(site_list)
        outdir = outfile
        collect_site_for_plot(site_set, outdir, mode)
    elif function == "plot_simhash":
        if not outfile:
            outfile = inputfile + ".plot_cluster"
        plot_simhash(inputfile, outfile, simhash_type, proto_type)
    elif function == "plot_sim_distance":
        if not outfile:
            outfile = inputfile + ".plot_sim_distance"
        plot_sim_distance(inputfile, outfile, simhash_type, proto_type, avg_dist)
    elif function == "get_domains":
        observed_sites_list = [line[:-1] for line in sys.stdin]
        get_domains(observed_sites_list, outfile)
    elif function == "get_domain_scores":
        domains = filter(bool, open(inputfile, 'r').read().split('\n'))
        result = domain_scores(domains, outfile)
    elif function == "domain_filter":
        """
        Three steps for computed sites.
        1. filter known benign
        2. de-duplicate
        3. sample $count number of sites
        """
        bar_points = 60
        observed_sites_list = filter(bool, open(inputfile, 'r').read().split('\n'))
        for filename in observed_sites_list:
            get_bad(bar_points, filename, filename + ".filt")
    elif function == "dedup":
        text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
        count = 0
        for filename in text_filenames:
            if (('text' not in filename) or ('google' in filename) or
                    ('dom' in filename)):
                response = interact_query("The input file doesn't seem to be "
                        "valid! Press [Yes/No] to continue or exit!")
                if not response:
                    sys.exit(0)
            count += dedup(filename)
        logger = logging.getLogger("global")
        logger.info("total sites after dedup: {0}".format(count))
    elif function == "sample":
        text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
        sample(text_filenames, outfile, int(count))
        evaluation_form(outfile + '.user.sample.text',
                outfile + ".user.sample.text.eval", "ObservedSites")
        evaluation_form(outfile + '.google.sample.text',
                outfile + ".google.sample.text.eval", "ObservedSites")
    elif function == "merge_sites":
        observed_sites_names = [line[:-1] for line in sys.stdin]
        observed_sites = merge_observed_sites(observed_sites_names)
        logger = logging.getLogger("global")
        logger.info("total sites after merge: {0}".format(len(observed_sites.site)))
        write_proto_to_file(observed_sites, outfile)
    elif function == "merge_user_sites":
        """
        -i input_file
        """
        filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
        text_filenames = [filename + '.text' for filename in filenames]
        dom_filenames = [filename + '.dom' for filename in filenames]
        text_observed_sites = merge_observed_sites(text_filenames)
        logger = logging.getLogger("global")
        logger.info("total sites after merge: {0}".format(len(text_observed_sites.site)))
        write_proto_to_file(text_observed_sites, inputfile + '.text')
        dom_observed_sites = merge_observed_sites(dom_filenames)
        logger.info("total sites after merge: {0}".format(len(dom_observed_sites.site)))
        write_proto_to_file(dom_observed_sites, inputfile + '.dom')
    elif function == "get_learned_eval":
        """
        -l learned_file -i detected_file
        """
        learned_file = link
        observed_file = inputfile
        result_sites = get_learned_eval(learned_file, observed_file)
        write_proto_to_file(result_sites, outfile)
        evaluation_form(outfile, outfile + ".eval", "LearnedSites")
    elif function == "export_db":
        """
        -i table_name -o outfile
        """
        export_db_to_file(inputfile, outfile)
        export_db_to_file(inputfile, outfile + ".noise", ["PageBroken"])
    elif function == "de_noise":
        """
        remove noise: index.html not found, feature count = 0
        """
        if "learn" in inputfile:
            response = interact_query("The input file seems to be learned sites, "
                    "we only support observed sites! Press [Yes/No] to continue or exit!")
            if not response:
                sys.exit(0)
        logger = logging.getLogger("global")
        logger.info("processing {0}".format(inputfile))
        de_noise_config = CD.DeNoiseConfig()
        de_noise_config.zero_feature = True
        original = CD.ObservedSites()
        read_proto_from_file(original, inputfile)
        observed_sites = de_noise(original, de_noise_config)
        logger.info("before de-noise: {0}".format(len(original.site)))
        logger.info("after de-noise: {0}".format(len(observed_sites.site)))
        outfile = outfile if outfile else inputfile
        write_proto_to_file(observed_sites, outfile)
    elif function == "update_groundtruth":
        """
        This function is quite specific: it adds more malicious examples to the
        collected groundtruth.
        """
        filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
        if len(filenames) == 15:
            original_expected = filenames[0]
            original_u_text = filenames[1]
            original_u_dom = filenames[2]
            original_g_text = filenames[3]
            original_g_dom = filenames[4]
            # observed site may have same URL.
            add_count = count
            add_expected = filenames[5]
            add_u_text = filenames[6]
            add_u_dom = filenames[7]
            add_g_text = filenames[8]
            add_g_dom = filenames[9]
            # outfile
            out_expected = filenames[10]
            out_u_text = filenames[11]
            out_u_dom = filenames[12]
            out_g_text = filenames[13]
            out_g_dom = filenames[14]
            # in this case we will add all
            update_groundtruth(original_expected, original_u_text, original_u_dom,
                    original_g_text, original_g_dom, add_expected, add_u_text,
                    add_u_dom, add_g_text, add_g_dom, out_expected, out_u_text,
                    out_u_dom, out_g_text, out_g_dom)
        elif len(filenames) == 12:
            original_expected = filenames[0]
            original_u_text = filenames[1]
            original_u_dom = filenames[2]
            original_g_text = filenames[3]
            original_g_dom = filenames[4]
            # observed site may have same URL.
            add_count = int(count)
            add_expected = filenames[5]
            add_all = filenames[6]
            '''
            add_u_text = filenames[6]
            add_u_dom = filenames[7]
            add_g_text = filenames[8]
            add_g_dom = filenames[9]
            # outfile
            out_expected = filenames[10]
            out_u_text = filenames[11]
            out_u_dom = filenames[12]
            out_g_text = filenames[13]
            out_g_dom = filenames[14]
            '''
            out_expected = filenames[7]
            out_u_text = filenames[8]
            out_u_dom = filenames[9]
            out_g_text = filenames[10]
            out_g_dom = filenames[11]
            update_groundtruth_redundant(add_count, original_expected,
                    original_u_text, original_u_dom, original_g_text,
                    original_g_dom, add_expected, add_all, out_expected,
                    out_u_text, out_u_dom, out_g_text, out_g_dom)
        else:
            raise Exception("Cannot handle now!")
    else:
        print help_msg
        sys.exit(2)
def update_groundtruth_redundant(count, original_expected, original_u_text,
        original_u_dom, original_g_text, original_g_dom, add_expected, add_all,
        out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom):
    valid_instance(count, int)
    in_e = CD.ObservedSites()
    read_proto_from_file(in_e, original_expected)
    in_u_t = CD.ObservedSites()
    read_proto_from_file(in_u_t, original_u_text)
    in_u_d = CD.ObservedSites()
    read_proto_from_file(in_u_d, original_u_dom)
    in_g_t = CD.ObservedSites()
    read_proto_from_file(in_g_t, original_g_text)
    in_g_d = CD.ObservedSites()
    read_proto_from_file(in_g_d, original_g_dom)
    add_e = CD.ObservedSites()
    read_proto_from_file(add_e, add_expected)
    in_e_set = sites_name_set(in_e)
    add_e_set = sites_name_set(add_e)
    diff_e_set = add_e_set - in_e_set
    logger = logging.getLogger("global")
    logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
        len(in_e_set), len(add_e_set), len(diff_e_set)))
    logger.info("diff set is")
    logger.info(diff_e_set)
    diff_e_list = list(diff_e_set)
    logger.info(len(diff_e_list))
    random.shuffle(diff_e_list)
    diff_e_sample = diff_e_list[:count]
    """
    Get the sites that are in the de-duplicated examples and add them. This is
    necessary because some sites that are cloaking were removed in the de-dup
    phase; the reason is unclear.
    """
    add_u_text_fs = filter(bool, open(add_all, 'r').read().split('\n'))
    diff_e_sample = set(_output_sample_sites(diff_e_sample, add_u_text_fs,
            add_all + ".u.text.temp"))
    # use the updated diff expected set to generate the new data
    _output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
    add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
    _output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
    add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
    add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
    _output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")
    out_expected_sites = merge_observed_sites([original_expected, add_expected + ".temp"])
    out_u_t_sites = merge_observed_sites([original_u_text, add_all + ".u.text.temp"])
    out_u_d_sites = merge_observed_sites([original_u_dom, add_all + ".u.dom.temp"])
    out_g_t_sites = merge_observed_sites([original_g_text, add_all + ".g.text.temp"])
    out_g_d_sites = merge_observed_sites([original_g_dom, add_all + ".g.dom.temp"])
    out_u_t_sites.config.CopyFrom(in_u_t.config)
    out_u_d_sites.config.CopyFrom(in_u_d.config)
    out_g_t_sites.config.CopyFrom(in_g_t.config)
    out_g_d_sites.config.CopyFrom(in_g_d.config)
    write_proto_to_file(out_expected_sites, out_expected)
    write_proto_to_file(out_u_t_sites, out_u_text)
    write_proto_to_file(out_u_d_sites, out_u_dom)
    write_proto_to_file(out_g_t_sites, out_g_text)
    write_proto_to_file(out_g_d_sites, out_g_dom)
def search_and_revisit(word_file, n, threads=6, ad_only=False):
    """
    This function does the following things.
    1. Search each word in the word file.
    2. Grab the top 200 returned results and the corresponding ads.
    3. Visit all the results and ads with the "chrome user agent", repeated n times.
    4. Visit all the landing pages from step 3 with the "google ads bot user agent".

    @parameter
    word_file: the filename containing the words to search
    n: repeat step 3 for n times
    ad_only: only retrieve the advertisements. In this case, we only view the first 5 pages.

    @output
    The following are outputs of this function.
    Running log:
    [WORD_FILE].selenium.crawl/running_log.[SEARCH_TIME]
    "chrome user agent" result is:
    [WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5]
    [WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5]
    [WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME]/[URL_MD5]/index.html
    "google ads bot user agent" result is:
    [WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5].google
    [WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5].google
    [WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME].revisit.[REVISIT_TIME]/[URL_MD5]/index.html
    """
    valid_instance(threads, int)
    # prepare search and visit
    user_UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/" \
            "537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
    user_suffix = "selenium.crawl/"
    search_now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
    word_md5_delimiter = "WORD_MD5"
    # compute base_dir and start logging
    base_dir = '.'.join([word_file, user_suffix])
    mkdir_if_not_exist(base_dir)
    logging.basicConfig(filename=base_dir + 'running_log' + search_now_suffix,
            level=logging.DEBUG)
    logging.getLogger("global")
    # set search and visit crawl_config
    search_config = CD.CrawlConfig()
    search_config.maximum_threads = threads
    search_config.user_agent = user_UA
    # number of top search results to be inspected
    if ad_only:
        search_config.count = 50
    search_config.browser_type = CD.CrawlConfig.CHROME

    ad_crawl_config = CD.CrawlConfig()
    ad_crawl_config.CopyFrom(search_config)
    ad_crawl_config.result_type = CD.AD
    ad_crawl_config.crawl_log_dir = base_dir
    ad_log_filename_prefix = 'ad_crawl_log' + search_now_suffix
    ad_dir_prefix = base_dir + word_md5_delimiter + "/" + \
            hex_md5(ad_crawl_config.user_agent) + search_now_suffix + '/'

    search_crawl_config = CD.CrawlConfig()
    search_crawl_config.CopyFrom(search_config)
    search_crawl_config.result_type = CD.SEARCH
    search_crawl_config.crawl_log_dir = base_dir
    search_log_filename_prefix = 'search_crawl_log' + search_now_suffix
    search_dir_prefix = base_dir + word_md5_delimiter + "/" + \
            hex_md5(search_crawl_config.user_agent) + search_now_suffix + '/'

    # print crawl_config.user_agent
    words = SearchTerm(word_file)
    search = Search(search_config)
    ad_visit = Visit(ad_crawl_config, 1)
    search_visit = Visit(search_crawl_config, 1)

    # prepare the revisit
    google_ad_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
    google_search_UA = "Googlebot/2.1 (+http://www.google.com/bot.html)"
    # set revisit crawl_config
    revisit_crawl_config = CD.CrawlConfig()
    revisit_crawl_config.maximum_threads = threads
    revisit_crawl_config.browser_type = CD.CrawlConfig.CHROME
    # base directory uses search_now_suffix to correlate these two
    revisit_crawl_config.crawl_log_dir = base_dir

    # search, visit and revisit each word
    for word in words.get_word_list():
        print "Processing {0} word: {1}".format(words.current(), word)
        # update word_md5 related directories
        print word
        word_md5 = hex_md5(word)
        ad_crawl_config.log_filename = ad_log_filename_prefix + "." + word_md5
        ad_crawl_config.user_agent_md5_dir = word_md5.join(
                ad_dir_prefix.split(word_md5_delimiter))
        search_crawl_config.log_filename = search_log_filename_prefix + "." + word_md5
        search_crawl_config.user_agent_md5_dir = word_md5.join(
                search_dir_prefix.split(word_md5_delimiter))
        ad_visit.update_crawl_config(ad_crawl_config)
        search_visit.update_crawl_config(search_crawl_config)
        # search and crawl
        right_click = not ad_only
        ad_set, search_set = search.search(word, right_click)
        ad_crawl_log_filename = ad_visit.visit(ad_set, word)
        if ad_only:
            search_crawl_log_filename = None
        else:
            search_crawl_log_filename = search_visit.visit(search_set, word)
        # revisit
        crawl_log_file_list = list()
        if ad_crawl_log_filename:
            crawl_log_file_list.append(ad_crawl_log_filename)
        if search_crawl_log_filename:
            crawl_log_file_list.append(search_crawl_log_filename)
        for crawl_log_file in crawl_log_file_list:
            if crawl_log_file == ad_crawl_log_filename:
                revisit_crawl_config.user_agent = google_ad_UA
            else:
                revisit_crawl_config.user_agent = google_search_UA
            revisit_dir_prefix = base_dir + word_md5_delimiter + "/" + \
                    hex_md5(revisit_crawl_config.user_agent) + search_now_suffix
            revisit_crawl_config.log_filename = crawl_log_file.split('/')[-1] + '.google'
            revisit = Visit(revisit_crawl_config)
            crawl_log = CD.CrawlLog()
            read_proto_from_file(crawl_log, crawl_log_file)
            revisit.visit_landing_url_n_times(crawl_log, int(n), revisit_dir_prefix,
                    word_md5, word_md5_delimiter)
        words.next()
def generate_test(observed_sites_filename, test_size=5000, positive_size=1000):
    text_observed_sites_filename = observed_sites_filename + ".text"
    dom_observed_sites_filename = observed_sites_filename + ".dom"
    if not (os.path.exists(dom_observed_sites_filename) and
            os.path.exists(text_observed_sites_filename)):
        raise Exception("Computed observed sites file doesn't exist!")
    # select for text simhash first
    computed_observed_sites_filename = text_observed_sites_filename
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, computed_observed_sites_filename)
    observed_site_list = list()
    for observed_site in observed_sites.site:
        observed_site_list.append(observed_site)
    random.shuffle(observed_site_list)
    # test_size is the number of sites; the actual number of observations can be larger.
    test_sites = CD.ObservedSites()
    mismatch_sites = CD.ObservedSites()
    test_sites.config.CopyFrom(observed_sites.config)
    mismatch_sites.config.CopyFrom(observed_sites.config)
    test_list = observed_site_list[0:test_size]
    mismatch_list = test_list[0:positive_size]
    # original_label_list and mismatch_label_mapping are used in the dom selection.
    original_label_list = [observed_site.name for observed_site in test_list]
    mismatch_label_mapping = dict()
    for observed_site in mismatch_list:
        # the corresponding observed_site in test_list is also changed.
        current_label = observed_site.name
        mismatch_label = random.sample(observed_site_list, 1)[0].name
        while (top_domain(current_label) == top_domain(mismatch_label)):
            mismatch_label = random.sample(observed_site_list, 1)[0].name
        observed_site.name = mismatch_label
        mismatch_site = mismatch_sites.site.add()
        mismatch_site.CopyFrom(observed_site)
        mismatch_label_mapping[current_label] = mismatch_label
    for observed_site in test_list:
        test_site = test_sites.site.add()
        test_site.CopyFrom(observed_site)
    mismatch_sites_filename = computed_observed_sites_filename + ".mismatch"
    test_sites_filename = computed_observed_sites_filename + ".test"
    write_proto_to_file(mismatch_sites, mismatch_sites_filename)
    write_proto_to_file(test_sites, test_sites_filename)

    # select for dom simhash now
    computed_observed_sites_filename = dom_observed_sites_filename
    observed_sites = CD.ObservedSites()
    read_proto_from_file(observed_sites, computed_observed_sites_filename)
    observed_sites_map = dict()
    for observed_site in observed_sites.site:
        observed_sites_map[observed_site.name] = observed_site
    test_sites = CD.ObservedSites()
    mismatch_sites = CD.ObservedSites()
    test_sites.config.CopyFrom(observed_sites.config)
    mismatch_sites.config.CopyFrom(observed_sites.config)
    test_list = list()
    for label in original_label_list:
        test_list.append(observed_sites_map[label])
    for label in mismatch_label_mapping:
        observed_sites_map[label].name = mismatch_label_mapping[label]
        mismatch_site = mismatch_sites.site.add()
        mismatch_site.CopyFrom(observed_sites_map[label])
    for observed_site in test_list:
        test_site = test_sites.site.add()
        test_site.CopyFrom(observed_site)
    mismatch_sites_filename = computed_observed_sites_filename + ".mismatch"
    test_sites_filename = computed_observed_sites_filename + ".test"
    write_proto_to_file(mismatch_sites, mismatch_sites_filename)
    write_proto_to_file(test_sites, test_sites_filename)