Example #1
def get_bad(bar_points, filename, outfilename):
	observed_sites = CD.ObservedSites()
	read_proto_from_file(observed_sites, filename)
	domain_set = set()

	for site in observed_sites.site:
		for observation in site.observation:
			url_domain = top_domain(observation.landing_url)
			domain_set.add(url_domain)

	domain_list = list(domain_set)
	bad_domains = get_domain_reputation(domain_list, bar_points)
	bad_observed_sites = CD.ObservedSites()
	bad_observed_sites.config.CopyFrom(observed_sites.config)

	for site in observed_sites.site:
		observation_list = list()
		for observation in site.observation:
			if top_domain(observation.landing_url) in bad_domains:
				observation_list.append(observation)
		if len(observation_list) == 0:
			continue
		bad_site = bad_observed_sites.site.add()
		bad_site.name = site.name
		for observation in observation_list:
			to_add = bad_site.observation.add()
			to_add.CopyFrom(observation)

	write_proto_to_file(bad_observed_sites, outfilename)
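The filter above keys reputation lookups on `top_domain`, which reduces a landing URL to its registrable domain so scores are per site rather than per page. A minimal sketch of that idea using only the standard library (`top_domain_sketch` is a hypothetical name; the real helper likely consults a public-suffix list):

from urlparse import urlparse  # Python 2; use urllib.parse on Python 3

def top_domain_sketch(url):
	# Keep the last two labels of the hostname:
	# "http://news.example.com/today" -> "example.com". Multi-label
	# suffixes such as "example.co.uk" need a real public-suffix list.
	host = urlparse(url).netloc.split(':')[0]
	return '.'.join(host.split('.')[-2:])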
Example #2
def sample(text_filenames, outfile, sample_size):
	dom_filenames = _replace_list_by(text_filenames, 'text', 'dom')
	google_text_filenames = _replace_list_by(text_filenames, 'user',
			'google')
	google_dom_filenames = _replace_list_by(dom_filenames, 'user', 'google')

	text_observed_sites = merge_observed_sites(text_filenames)
	observed_site_list = list()
	url_set = set()
	for observed_site in text_observed_sites.site:
		observed_site_list.append(observed_site)
		for observation in observed_site.observation:
			url_set.add(observation.landing_url)
	logger = logging.getLogger("global")
	logger.info("there are {0} urls".format(len(url_set)))
	logger.info("there are {0} observed sites".format(len(observed_site_list)))
	random.shuffle(observed_site_list)
	# sample_size is the number of sites; the actual observation count can be larger.
	sample_sites = CD.ObservedSites()
	sample_sites.config.CopyFrom(text_observed_sites.config)
	sample_list = observed_site_list[0:sample_size]
	original_label_list = [observed_site.name for observed_site in sample_list]
	for observed_site in sample_list:
		sample_site = sample_sites.site.add()
		sample_site.CopyFrom(observed_site)
	sample_filename = outfile + ".user.sample.text"
	write_proto_to_file(sample_sites, sample_filename)


	_output_sample_sites(original_label_list, dom_filenames, outfile + ".user.sample.dom")
	_output_sample_sites(original_label_list, google_text_filenames,
			outfile + '.google.sample.text')
	_output_sample_sites(original_label_list, google_dom_filenames, outfile
			+ '.google.sample.dom')
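The sampling depends on `_replace_list_by` to derive the parallel dom and google filenames from the user text filenames. A plausible re-implementation under that assumption (the real helper may differ):

def _replace_list_by_sketch(filenames, old, new):
	# e.g. ["a.user.text", "b.user.text"] with ('text', 'dom')
	# becomes ["a.user.dom", "b.user.dom"]
	return [name.replace(old, new) for name in filenames]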
Example #3
def update_groundtruth(original_expected,
		original_u_text, original_u_dom, original_g_text, original_g_dom, 
		add_expected, add_u_text, add_u_dom, add_g_text, add_g_dom, 
		out_expected, out_u_text, out_u_dom, out_g_text, out_g_dom):
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)

	# the google "add" inputs are list files (one filename per line)
	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)
	#add_u_t = CD.ObservedSites()
	#read_proto_from_file(add_u_t, add_u_text)
	#add_u_d = CD.ObservedSites()
	#read_proto_from_file(add_u_d, add_u_dom)
	#add_g_t = merge_observed_sites(add_g_text)
	#add_g_d = merge_observed_sites(add_g_dom)
	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")	
	logger.info(diff_e_set)

	_output_sample_sites(diff_e_set, [add_expected], add_expected + ".temp")
	_output_sample_sites(diff_e_set, [add_u_text], add_u_text + ".temp")
	_output_sample_sites(diff_e_set, [add_u_dom], add_u_dom + ".temp")
	add_g_text_fs = filter(bool, open(add_g_text, 'r').read().split('\n'))
	add_g_dom_fs = filter(bool, open(add_g_dom, 'r').read().split('\n'))
	_output_sample_sites(diff_e_set, add_g_text_fs, add_g_text + ".temp")
	_output_sample_sites(diff_e_set, add_g_dom_fs, add_g_dom + ".temp")
	out_expected_sites = merge_observed_sites([original_expected,
		add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_u_text + ".temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_u_dom + ".temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_g_text + ".temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_g_dom + ".temp"])
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)
	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)
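The diffing above hinges on `sites_name_set`, which presumably flattens an `ObservedSites` message into a set of site names so that plain set arithmetic (`add_e_set - in_e_set`) isolates the genuinely new sites. A sketch of that presumed behavior:

def sites_name_set_sketch(observed_sites):
	# Collect every site name; set subtraction then yields only the
	# sites present in the "add" data but not in the original.
	return set(site.name for site in observed_sites.site)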
Example #4
	def write_crawl_log(self, counter_suffix=True):
		crawl_log_dir = self.crawl_config.crawl_log_dir
		if not crawl_log_dir:
			crawl_log_dir = self.crawl_config.user_agent_md5_dir
		current_log_filename = crawl_log_dir + self.crawl_config.log_filename
		if counter_suffix:
			current_log_filename += "_" + str(self.counter)
		# Write global crawl_log
		write_proto_to_file(self.current_log, current_log_filename)
		# After write, reset variables
		self.current_log = CD.CrawlLog()
		return current_log_filename
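A hypothetical call pattern (the `crawler` object and its `counter` attribute are assumptions based on the method above, and `log_filename` is assumed to be "crawl_log"): each flush writes a suffixed log file and resets the in-memory `CrawlLog`, so observations are never written twice.

crawler.counter = 3
path_a = crawler.write_crawl_log()       # ".../crawl_log_3"
path_b = crawler.write_crawl_log(False)  # ".../crawl_log", no counter suffix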
Example #5
	def crawl(self):
		has_written = False
		for user_agent in self.user_agents:
			user_agent_md5 = hex_md5(user_agent)
			self.crawl_config.user_agent = user_agent
			self.crawl_config.user_agent_md5_dir = self.base_dir + user_agent_md5 + '/'
			# specify which type of browser to use
			set_browser_type(self.crawl_config)
			mkdir_if_not_exist(self.crawl_config.user_agent_md5_dir)
			# md5 - user agent mapping logs
			md5_UA_f = open(self.md5_UA_filename, 'a')  # user agent
			md5_UA_f.write(user_agent_md5 + ":" + user_agent + "\n")
			md5_UA_f.close()
			# crawl web pages
			url_fetcher = UrlFetcher(self.crawl_config)
			thread_computer = ThreadComputer(url_fetcher, 'fetch_url', self.urls)
			url_fetcher.quit()
			# Write log for current user agent
			current_log = CD.CrawlLog()
			current_log_filename = self.crawl_config.user_agent_md5_dir + 'crawl_log'
			current_search = CD.CrawlSearchTerm()
			for p, s in thread_computer.result:
				result = current_search.result.add()
				result.CopyFrom(s)
				result_search = current_log.result_search.add()
				result_search.CopyFrom(current_search)
			write_proto_to_file(current_log, current_log_filename)
			# Write global crawl_log
			crawl_log = CD.CrawlLog()
			if has_written:
				read_proto_from_file(crawl_log, self.crawl_log_filename)
			else:
				has_written = True
			for r_s in current_log.result_search:
				result_search = crawl_log.result_search.add()
				result_search.CopyFrom(r_s)
			"""
			for s in current_log.result:
				result = crawl_log.result.add()
				result.CopyFrom(s)
			"""
			write_proto_to_file(crawl_log, self.crawl_log_filename)
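The global-log update near the end works around the fact that serialized protobufs are not natively appendable: the whole log is re-read, extended, and fully re-written for every user agent. A condensed sketch of that read-merge-write pattern, reusing the `CD`, `read_proto_from_file`, and `write_proto_to_file` helpers from these examples:

def append_result_search_sketch(global_path, current_log, first_write):
	# Re-read the global log (unless this is the first write), copy in the
	# new result_search entries, then re-serialize the whole message.
	crawl_log = CD.CrawlLog()
	if not first_write:
		read_proto_from_file(crawl_log, global_path)
	crawl_log.result_search.extend(current_log.result_search)
	write_proto_to_file(crawl_log, global_path)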
Example #6
def _output_sample_sites(original_label_list, filenames, outfile):
	"""
	Output the sample sites, either google or user

	@parameter
	original_label_list: the selected websites
	filenames: observed sites filenames
	outfile: output filename
	@return
	If the observed_sites read from filenames do not contain all labels in
	original_label_list, use the returned new_label_list as the updated
	label list.
	"""
	observed_sites = merge_observed_sites(filenames)
	observed_sites_map = dict()
	for observed_site in observed_sites.site:
		observed_sites_map[observed_site.name] = observed_site
	sample_sites = CD.ObservedSites()
	if observed_sites.HasField("config"):
		sample_sites.config.CopyFrom(observed_sites.config)
	else:
		print "There is no config in the observed_sites, please double check why"
		print "This can only happen to expected sites"
		print filenames
	sample_list = list()
	new_label_list = list()
	for label in original_label_list:
		if label in observed_sites_map:
			sample_list.append(observed_sites_map[label])
			new_label_list.append(label)
	for observed_site in sample_list:
		sample_site = sample_sites.site.add()
		sample_site.CopyFrom(observed_site)
	write_proto_to_file(sample_sites, outfile)
	o_size = len(original_label_list)
	n_size = len(new_label_list)
	if o_size != n_size:
		print "size of the original label list is: {0}".format(o_size)
		print "size of the new label list is: {0}".format(n_size)
	return new_label_list
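A hypothetical usage of the return value (file names are placeholders, `sample_sites` is from the earlier sampling example): feeding the pruned label list into the next projection keeps the text and dom outputs aligned on exactly the same sites.

labels = [observed_site.name for observed_site in sample_sites.site]
labels = _output_sample_sites(labels, ["obs.user.text"], "out.user.sample.text")
# labels may have shrunk; reuse it so the dom sample matches the text sample
labels = _output_sample_sites(labels, ["obs.user.dom"], "out.user.sample.dom")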
Example #7
def generate_test(observed_sites_filename, test_size=5000, positive_size=1000):
	text_observed_sites_filename = observed_sites_filename + ".text"
	dom_observed_sites_filename = observed_sites_filename + ".dom"
	if not (os.path.exists(dom_observed_sites_filename) and os.path.exists(text_observed_sites_filename)):
		raise Exception("Computed observed sites file doesn't exist!")

	# select for text simhash first
	computed_observed_sites_filename = text_observed_sites_filename
	observed_sites = CD.ObservedSites()
	read_proto_from_file(observed_sites, computed_observed_sites_filename)
	observed_site_list = list()
	for observed_site in observed_sites.site:
		observed_site_list.append(observed_site)
	random.shuffle(observed_site_list)
	# test_size is the number of sites; the actual observation count can be larger.
	test_sites = CD.ObservedSites()
	mismatch_sites = CD.ObservedSites()
	test_sites.config.CopyFrom(observed_sites.config)
	mismatch_sites.config.CopyFrom(observed_sites.config)

	test_list = observed_site_list[0:test_size]
	mismatch_list = test_list[0:positive_size]
	# original_label_list and mismatch_label_mapping is used in dom select.
	original_label_list = [observed_site.name for observed_site in test_list]
	mismatch_label_mapping = dict()
	for observed_site in mismatch_list:
		# renaming below also mutates the corresponding entries in test_list (same objects).
		current_label = observed_site.name
		mismatch_label = random.sample(observed_site_list, 1)[0].name
		while (top_domain(current_label) == top_domain(mismatch_label)):
			mismatch_label = random.sample(observed_site_list, 1)[0].name
		observed_site.name = mismatch_label
		mismatch_site = mismatch_sites.site.add()
		mismatch_site.CopyFrom(observed_site)
		mismatch_label_mapping[current_label] = mismatch_label
	for observed_site in test_list:
		test_site = test_sites.site.add()
		test_site.CopyFrom(observed_site)
	mismatch_sites_filename = computed_observed_sites_filename + ".mismatch"
	test_sites_filename = computed_observed_sites_filename + ".test"
	write_proto_to_file(mismatch_sites, mismatch_sites_filename)
	write_proto_to_file(test_sites, test_sites_filename)

	# select for dom simhash now
	computed_observed_sites_filename = dom_observed_sites_filename
	observed_sites = CD.ObservedSites()
	read_proto_from_file(observed_sites, computed_observed_sites_filename)
	observed_sites_map = dict()
	for observed_site in observed_sites.site:
		observed_sites_map[observed_site.name] = observed_site
	test_sites = CD.ObservedSites()
	mismatch_sites = CD.ObservedSites()
	test_sites.config.CopyFrom(observed_sites.config)
	mismatch_sites.config.CopyFrom(observed_sites.config)

	test_list = list()
	for label in original_label_list:
		test_list.append(observed_sites_map[label])
	for label in mismatch_label_mapping:
		observed_sites_map[label].name = mismatch_label_mapping[label]
		mismatch_site = mismatch_sites.site.add()
		mismatch_site.CopyFrom(observed_sites_map[label])
	for observed_site in test_list:
		test_site = test_sites.site.add()
		test_site.CopyFrom(observed_site)
	mismatch_sites_filename = computed_observed_sites_filename + ".mismatch"
	test_sites_filename = computed_observed_sites_filename + ".test"
	write_proto_to_file(mismatch_sites, mismatch_sites_filename)
	write_proto_to_file(test_sites, test_sites_filename)
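The inner while loop above implements rejection sampling: a planted mismatch label must come from a different registrable domain, so it can never accidentally be a true match. Isolated as a helper (hypothetical name, reusing `top_domain` and `random` from the example):

def pick_mismatch_label_sketch(current_label, observed_site_list):
	# Resample until the fake label's top domain differs from the real one.
	mismatch = random.choice(observed_site_list).name
	while top_domain(mismatch) == top_domain(current_label):
		mismatch = random.choice(observed_site_list).name
	return mismatch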
Example #8
def update_groundtruth_redundant(count, original_expected,
		original_u_text, original_u_dom,
		original_g_text, original_g_dom,
		add_expected, add_all,
		out_expected, out_u_text, out_u_dom,
		out_g_text, out_g_dom):
	valid_instance(count, int)
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)
	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)

	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")	
	logger.info(diff_e_set)
	diff_e_list = list(diff_e_set)
	logger.info(len(diff_e_list))
	random.shuffle(diff_e_list)
	diff_e_sample = diff_e_list[:count]
	

	"""
	get the sites that are in "de-deduplicated" examples and add them
	this is necessary, because there are sites, that are cloaking, but
	remove in de-dup phase. Doesn't know why.
	"""
	add_u_text_fs = filter(bool, open(add_all, 'r').read().split('\n'))
	diff_e_sample = set(_output_sample_sites(diff_e_sample, add_u_text_fs, add_all
		+ ".u.text.temp"))

	# use the updated diff expected set, to generate the new data
	_output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
	add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
	_output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
	add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
	add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")
	out_expected_sites = merge_observed_sites([original_expected, add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_all + ".u.text.temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_all + ".u.dom.temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_all +
		".g.text.temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_all +
		".g.dom.temp"])
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)
	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)
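`valid_instance(count, int)` at the top of this function presumably fails fast on wrong argument types before any files are read. A sketch of that presumed check:

def valid_instance_sketch(obj, cls):
	# Raise early instead of failing deep inside the merge logic.
	if not isinstance(obj, cls):
		raise TypeError("expected {0}, got {1}".format(cls, type(obj)))
	return True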
Example #9
def main(argv):
	has_function = False
	help_msg = """data_util.py -f <function> [-p <prefix>][-p <prefix> -o
	<outfile>][-i <inputfile> -t <proto_type>][-o <outfile>][-i <site_list>
	-l <server_link> -o <outdir> -m <mode>][-i <inputfile>-o <outfile> -s
	<simhash_type> -t <proto_type>][-i <inputfile> -o <outfile> -s
	<simhash_type> -t <proto_type> -a] [-o <outfile>] [-i <inputfile> -o
	<outfile>] [-i <inputfile>] [-i <text_filt>] [-i <inputfile> -c <count>
	-o <outfile>] [-o <outfile>] [-i <inputfile> -l <leanredfile> -o <outfile>], valid functions are
	append_prefix, compute_list, show_proto, intersect_sites,
	collect_observations, plot_simhash, plot_sim_distance, get_domains,
	get_domain_scores, domain_filter, dedup, sample, merge_sites,
	get_learned_eval, [-i <table_name> -o <outfie>] export_db
	[-i <inputfile> -o <outfile>] de_noise
	[-i <inputfile> -c <count>] update_groundtruth
	[-i <user observation list, suffix removed>] merge_user_sites"""
	try:
		opts, args = getopt.getopt(argv, "hf:p:o:t:i:m:l:s:ac:",
				["function=", "prefix=", "outfile=",
					"proto_type=", "ifile=", "mode=",
					"link=", "simhash_type=", "avg_dist",
					"count"])
	except getopt.GetoptError:
		print help_msg
		sys.exit(2)
	hasinputfile = False
	outfile = None
	avg_dist = False
	link = None
	for opt, arg in opts:
		if opt == "-h":
			print help_msg
			sys.exit()
		elif opt in ("-f", "--function"):
			function = arg
			has_function = True
		elif opt in ("-p", "--prefix"):
			prefix = arg
		elif opt in ("-o", "--outfile"):
			outfile = arg
		elif opt in ("-i", "--ifile"):
			inputfile = arg
			hasinputfile = True
		elif opt in ("-t", "--proto_type"):
			proto_type = arg
		elif opt in ("-m", "--mode"):
			mode = arg
		elif opt in ("-l", "--link"):
			link = arg
		elif opt in ("-s", "--simhash_type"):
			simhash_type = arg
		elif opt in ("-a", "--avg_dist"):
			avg_dist = True
		elif opt in ("-c", "--count"):
			count = arg
		else:
			print help_msg
			sys.exit(2)
	if hasinputfile:
		logging.basicConfig(filename=inputfile + "_running_log_" + function, level=logging.DEBUG)
		logging.getLogger("global")
	if not has_function:
		print help_msg
		sys.exit()
	if function == "append_prefix":
		inputfile_list = [line[:-1] for line in sys.stdin]
		append_prefix(inputfile_list, prefix)
	elif function == "compute_list":
		crawl_log_list = [line[:-1] for line in sys.stdin]
		compute_list(crawl_log_list, outfile, prefix)
	elif function == "show_proto":
		show_proto(inputfile, proto_type)
	elif function == "intersect_sites":
		observed_sites_list = [line[:-1] for line in sys.stdin]
		result_sites = intersect_observed_sites(*observed_sites_list)
		write_proto_to_file(result_sites, outfile)
		evaluation_form(outfile, outfile + ".eval", "ObservedSites")
	elif function == "collect_observations":
		if link:
			util.REMOTE_DRIVER = link
		site_list = filter(bool, open(inputfile, 'r').read().split('\n'))
		site_set = set(site_list)
		outdir = outfile
		collect_site_for_plot(site_set, outdir, mode)
	elif function == "plot_simhash":
		if not outfile:
			outfile = inputfile + ".plot_cluster"
		plot_simhash(inputfile, outfile, simhash_type, proto_type)
	elif function == "plot_sim_distance":
		if not outfile:
			outfile = inputfile + ".plot_sim_distance"
		plot_sim_distance(inputfile, outfile, simhash_type, proto_type,
				avg_dist)
	elif function == "get_domains":
		observed_sites_list = [line[:-1] for line in sys.stdin]
		get_domains(observed_sites_list, outfile)
	elif function == "get_domain_scores":
		domains = filter(bool, open(inputfile, 'r').read().split('\n'))
		result = domain_scores(domains, outfile)
	elif function == "domain_filter":
		"""
		Three steps for computed sites.
		1. filter known benign
		2. de-duplicate
		3. sample $count number of sites
		"""
		bar_points = 60
		observed_sites_list = filter(bool, open(inputfile, 'r').read().split('\n'))
		for filename in observed_sites_list:
			get_bad(bar_points, filename, filename + ".filt")
	elif function == "dedup":
		text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		count = 0
		for filename in text_filenames:
			if (('text' not in filename) or ('google' in filename) or
					('dom' in filename)):
				response = interact_query("The input file doesn't seem to "
						"be valid! Press [Yes/No] to continue or exit!")
				if not response:
					sys.exit(0)
			count += dedup(filename)

		logger = logging.getLogger("global")
		logger.info("total sites after dedup: {0}".format(count))
	elif function == "sample":
		text_filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		sample(text_filenames, outfile, int(count))
		evaluation_form(outfile + '.user.sample.text', outfile +
				".user.sample.text.eval", "ObservedSites")
		evaluation_form(outfile + '.google.sample.text', outfile +
				".google.sample.text.eval", "ObservedSites")
	elif function == "merge_sites":
		observed_sites_names = [line[:-1] for line in sys.stdin]
		observed_sites = merge_observed_sites(observed_sites_names)
		logger = logging.getLogger("global")
		logger.info("total sites after merge: {0}".format(len(observed_sites.site)))
		write_proto_to_file(observed_sites, outfile)
	elif function == "merge_user_sites":
		"""
		-i input_file
		"""
		filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		text_filenames = [filename + '.text' for filename in filenames]
		dom_filenames = [filename + '.dom' for filename in filenames]
		text_observed_sites = merge_observed_sites(text_filenames)
		logger = logging.getLogger("global")
		logger.info("total sites after merge: {0}".format(len(text_observed_sites.site)))
		write_proto_to_file(text_observed_sites, inputfile + '.text')
		dom_observed_sites = merge_observed_sites(dom_filenames)
		logger.info("total sites after merge: {0}".format(len(dom_observed_sites.site)))
		write_proto_to_file(dom_observed_sites, inputfile + '.dom')
	elif function == "get_learned_eval":
		"""
		-l learned_file -i detected_file
		"""
		learned_file = link
		observed_file = inputfile
		result_sites = get_learned_eval(learned_file, observed_file)
		write_proto_to_file(result_sites, outfile)
		evaluation_form(outfile, outfile + ".eval", "LearnedSites")
	elif function == "export_db":
		"""
		-i table_name -o outfile
		"""
		export_db_to_file(inputfile, outfile)
		export_db_to_file(inputfile, outfile + ".noise", ["PageBroken"])
	elif function == "de_noise":
		"""
		remove noise: index.html not found, feature count = 0
		"""
		if "learn" in inputfile:
			response = interact_query("The input file seems to "
					"be learned sites, we only support observed "
					"sites! Press [Yes/No] to continue or exit!")
			if not response:
				sys.exit(0)

		logger = logging.getLogger("global")
		logger.info("processing {0}".format(inputfile))
		de_noise_config = CD.DeNoiseConfig()
		de_noise_config.zero_feature = True
		original = CD.ObservedSites()
		read_proto_from_file(original, inputfile)
		observed_sites = de_noise(original, de_noise_config)
		logger.info("before de-noise {0}".format(len(original.site)))
		logger.info("after de-noise: {0}".format(len(observed_sites.site)))
		outfile = outfile if outfile else inputfile
		write_proto_to_file(observed_sites, outfile)
	elif function == "update_groundtruth":
		"""
		This function is too specific. It is to add more malicious
		examples to the collected groundtruth.
		"""
		filenames = filter(bool, open(inputfile, 'r').read().split('\n'))
		if len(filenames) == 15:
			original_expected = filenames[0]
			original_u_text = filenames[1]
			original_u_dom = filenames[2]
			original_g_text = filenames[3]
			original_g_dom = filenames[4]
			# observed site may have same URL.
			add_count = count
			add_expected = filenames[5]
			add_u_text = filenames[6]
			add_u_dom = filenames[7]
			add_g_text = filenames[8]
			add_g_dom = filenames[9]
			# outfile
			out_expected = filenames[10]
			out_u_text = filenames[11]
			out_u_dom = filenames[12]
			out_g_text = filenames[13]
			out_g_dom = filenames[14]
			# in this case we will add all
			update_groundtruth(original_expected,
					original_u_text, original_u_dom,
					original_g_text, original_g_dom,
					add_expected, add_u_text, add_u_dom,
					add_g_text, add_g_dom,
					out_expected, out_u_text, out_u_dom,
					out_g_text, out_g_dom)
		elif len(filenames) == 12:
			original_expected = filenames[0]
			original_u_text = filenames[1]
			original_u_dom = filenames[2]
			original_g_text = filenames[3]
			original_g_dom = filenames[4]
			# observed site may have same URL.
			add_count = int(count)
			add_expected = filenames[5]
			add_all = filenames[6]
			'''
			add_u_text = filenames[6]
			add_u_dom = filenames[7]
			add_g_text = filenames[8]
			add_g_dom = filenames[9]
			# outfile
			out_expected = filenames[10]
			out_u_text = filenames[11]
			out_u_dom = filenames[12]
			out_g_text = filenames[13]
			out_g_dom = filenames[14]
			'''
			out_expected = filenames[7]
			out_u_text = filenames[8]
			out_u_dom = filenames[9]
			out_g_text = filenames[10]
			out_g_dom = filenames[11]

			update_groundtruth_redundant(add_count, original_expected,
					original_u_text, original_u_dom,
					original_g_text, original_g_dom,
					add_expected, add_all,
					out_expected, out_u_text, out_u_dom,
					out_g_text, out_g_dom)
		else:
			raise Exception("Cannot handle {0} input files: expected 15 or 12!".format(len(filenames)))
	else:
		print help_msg
		sys.exit(2)
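For orientation, a few hypothetical invocations of this dispatcher (flag spellings follow the help message above; file names are placeholders):

# cat sites.list | python data_util.py -f merge_sites -o merged.sites
# python data_util.py -f dedup -i text_files.list
# python data_util.py -f sample -i text_files.list -c 100 -o sampled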
Example #10
def dedup(text_file):
	"""
	1. dom_file, google_text_file, google_dom_file are deducted from text_file
	2. google files can be split. we first check whether unsplit exisits, if
	not we merge all the split ones.
	3. The observed sites are output to correponding filename + '.dedup'

	@parameter
	text_file: text observed sites file
	@return
	number of websites after deduplicate
	"""
	dom_file = text_file.replace('text', 'dom')
	user_text_observed_sites = CD.ObservedSites()
	read_proto_from_file(user_text_observed_sites, text_file)
	logger = logging.getLogger("global")
	logger.info("processing {0}".format(text_file))
	logger.info("before dedup: {0}".format(len(user_text_observed_sites.site)))
	user_dom_observed_sites = CD.ObservedSites()
	read_proto_from_file(user_dom_observed_sites, dom_file)
	google_text_file = text_file.replace('user', 'google')
	google_text_observed_sites = load_split_observed_sites(google_text_file)
	google_dom_file = dom_file.replace('user', 'google')
	google_dom_observed_sites = load_split_observed_sites(google_dom_file)

	user_text_dict, user_text_sites_dict = build_site_simhash_dict(user_text_observed_sites)
	user_dom_dict, user_dom_sites_dict = build_site_simhash_dict(user_dom_observed_sites)
	google_text_dict, google_text_sites_dict = build_site_simhash_dict(google_text_observed_sites)
	google_dom_dict, google_dom_sites_dict = build_site_simhash_dict(google_dom_observed_sites)

	# how to define exact match
	user_text_remained = CD.ObservedSites()
	user_dom_remained = CD.ObservedSites()
	google_text_remained = CD.ObservedSites()
	google_dom_remained = CD.ObservedSites()
	# a site whose only simhash is 0 is counted as a failed fetch
	text_failure = set([0])
	failure_count = 0
	# if the feature set is empty, the simhash is 2**64 - 1
	text_zero = set([18446744073709551615])
	zero_count = 0
	google_failure_count = 0
	google_zero_count = 0
	for site_name in user_text_dict:
		if (site_name not in google_text_dict or
				site_name not in google_dom_dict):
			continue
		if (user_text_dict[site_name] == text_failure):
			failure_count += 1
			continue
		elif (user_text_dict[site_name] == text_zero):
			zero_count += 1
			continue
		elif (google_text_dict[site_name] == text_failure):
			google_failure_count += 1
			continue
		elif (google_text_dict[site_name] == text_zero):
			google_zero_count += 1
			continue
		text_common = user_text_dict[site_name] & google_text_dict[site_name] 
		dom_common = user_dom_dict[site_name] & google_dom_dict[site_name]
		if (text_common == user_text_dict[site_name] and 
				dom_common == user_dom_dict[site_name]):
			continue
		else:
			_add_observed_site(user_text_remained, user_text_sites_dict, site_name)
			_add_observed_site(user_dom_remained, user_dom_sites_dict, site_name)
			_add_observed_site(google_text_remained, google_text_sites_dict, site_name)
			_add_observed_site(google_dom_remained, google_dom_sites_dict, site_name)

	user_text_remained.config.CopyFrom(user_text_observed_sites.config)
	user_dom_remained.config.CopyFrom(user_dom_observed_sites.config)
	google_text_remained.config.CopyFrom(google_text_observed_sites.config)
	google_dom_remained.config.CopyFrom(google_dom_observed_sites.config)
	write_proto_to_file(user_text_remained, text_file + ".dedup")
	write_proto_to_file(user_dom_remained, dom_file + ".dedup")
	write_proto_to_file(google_text_remained, google_text_file + ".dedup")
	write_proto_to_file(google_dom_remained, google_dom_file + ".dedup")
	logger.info("after dedup: {0}".format(len(user_text_remained.site)))
	logger.info("failure count: {0}, zero feature count: {1}".format(failure_count, zero_count))
	logger.info("google failure count: {0}, google zero feature count: {1}".format(google_failure_count, google_zero_count))
	return len(user_text_remained.site)
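The dedup logic leans on `build_site_simhash_dict`, which appears to return two maps: site name to the set of observation simhash values, and site name to the site message itself for later re-emission. A sketch of that presumed shape (the observation field name is a guess):

def build_site_simhash_dict_sketch(observed_sites):
	simhash_dict = dict()
	sites_dict = dict()
	for site in observed_sites.site:
		sites_dict[site.name] = site
		# field name "simhash" is assumed; the real proto may differ
		simhash_dict[site.name] = set(
				observation.simhash for observation in site.observation)
	return simhash_dict, sites_dict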