Example #1
	def visit_landing_url_n_times(self, crawl_log, n_times, revisit_dir_prefix,
			word_md5, word_md5_delimiter):
		"""
		@parameter
		crawl_log: crawl log to visit
		n_times: visit crawl_log for n_times
		"""
		valid_instance(crawl_log, CD.CrawlLog)
		valid_instance(n_times, int)
		# prepare landing_url_set
		landing_url_set = crawl_log_attr_set(crawl_log, "landing_url")
		landing_url_set_size = len(landing_url_set)
		if landing_url_set_size < 8:
			record_maximum_threads = self.crawl_config.maximum_threads
			self.crawl_config.maximum_threads = 2
		url_fetcher = UrlFetcher(self.crawl_config)
		for i in range(n_times):
			# the time label is set for each iteration of visit
			revisit_now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
			self.crawl_config.user_agent_md5_dir = word_md5.join(
					revisit_dir_prefix.split(word_md5_delimiter)) + \
					'.revisit_time' + revisit_now_suffix + '/'
			url_fetcher.update_dir(self.crawl_config.user_agent_md5_dir)

			self.visit_landing_url(landing_url_set, url_fetcher)
		self.write_crawl_log(False)
		url_fetcher.quit()
		if landing_url_set_size < 8:
			self.crawl_config.maximum_threads = record_maximum_threads
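The directory substitution above relies on split/join over a placeholder token. A minimal sketch with hypothetical values shows the effect:

# Hypothetical values illustrating the WORD_MD5 placeholder substitution used above.
revisit_dir_prefix = "words.selenium.crawl/WORD_MD5/0a1b2c.20141001-120000/"
word_md5_delimiter = "WORD_MD5"
word_md5 = "5eb63bbbe01eeed093cb22bb8f5acdc3"
resolved = word_md5.join(revisit_dir_prefix.split(word_md5_delimiter))
# equivalent to revisit_dir_prefix.replace(word_md5_delimiter, word_md5)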
Example #2
	def __init__(self, url_file, user_agent_file, crawl_config):
		valid_instance(crawl_config, CD.CrawlConfig)
		self.crawl_config = CD.CrawlConfig()
		self.crawl_config.CopyFrom(crawl_config)

		# Prepare the input
		self.urls = filter(bool, open(url_file, 'r').read().split('\n'))
		self.user_agents = filter(bool, open(user_agent_file, 'r').read().split('\n'))
		# self.referers = filter(bool, open(referer_file, 'r').read().split('\n'))

		# Prepare the output directory
		crawl_type = None
		for user_agent in self.user_agents:
			if "bot" in user_agent:
				crawl_type = "bot"
				break
		if not crawl_type:
			crawl_type = "user"
		now = datetime.now().strftime("%Y%m%d-%H%M%S")
		self.base_dir = url_file + '.' + crawl_type + '.' + now + '.selenium.crawl/'
		mkdir_if_not_exist(self.base_dir)

		# Prepare log files
		# self.htmls_f = open(self.base_dir + 'html_path_list', 'a')
		self.md5_UA_filename = self.base_dir + 'md5_UA.log'
		self.crawl_log_filename = self.base_dir + 'crawl_log'
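Example #3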
	def __init__(self, crawl_config):
		# user_agent should be set.
		valid_instance(crawl_config, CD.CrawlConfig)
		self.crawl_config = CD.CrawlConfig()
		self.crawl_config.CopyFrom(crawl_config)
		switch_vpn_state(True)
		self.connected = False
Example #4
	def visit_landing_url(self, landing_url_set, url_fetcher=None):
		"""
		@parameter
		landing_url_set: the set of landing urls to visit
		url_fetcher: the UrlFetcher (selenium browser pool) to use for the crawl; a new one is created and quit locally if None
		"""
		valid_instance(landing_url_set, set)
		mkdir_if_not_exist(self.crawl_config.user_agent_md5_dir)
		# crawl web pages
		landing_url_set_size = len(landing_url_set)
		if landing_url_set_size < 8:
			record_maximum_threads = self.crawl_config.maximum_threads
			self.crawl_config.maximum_threads = 2
		quit_fetcher_when_done = False
		if not url_fetcher:
			url_fetcher = UrlFetcher(self.crawl_config)
			quit_fetcher_when_done = True
		thread_computer = ThreadComputer(url_fetcher, 'fetch_url',
				landing_url_set)
		if quit_fetcher_when_done:
			url_fetcher.quit()
		if landing_url_set_size < 8:
			self.crawl_config.maximum_threads = record_maximum_threads
		# create and fill current_search, including urls, search_term etc.
		current_search = CD.CrawlSearchTerm()
		for p, s in thread_computer.result:
			result = current_search.result.add()
			result.CopyFrom(s)
		# update current_log
		if self.first:
			self.first = False
			self.current_log = CD.CrawlLog()
		result_search = self.current_log.result_search.add()
		result_search.CopyFrom(current_search)
Example #5
	def __init__(self, crawl_config, max_word_per_file=5):
		# user_agent, user_agent_md5_dir should be set.
		valid_instance(crawl_config, CD.CrawlConfig)
		self.crawl_config = CD.CrawlConfig()
		self.crawl_config.CopyFrom(crawl_config)
		self.first = True 
		self.max_word_per_file = max_word_per_file 
		self.counter = 0
Example #6
	def __init__(self, crawl_config):
		valid_instance(crawl_config, CD.CrawlConfig)
		self.crawl_config = CD.CrawlConfig()
		self.crawl_config.CopyFrom(crawl_config)
		self.browser_queue = Queue.Queue()
		for i in xrange(self.crawl_config.maximum_threads):
			browser = start_browser(self.crawl_config.browser_type, incognito=False, \
					user_agent=self.crawl_config.user_agent)
			browser.set_page_load_timeout(15)
			self.browser_queue.put(browser)
		self.lock = threading.Lock()
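The Queue above acts as a simple browser pool shared across threads. Below is a hypothetical sketch of how a worker method such as fetch_url (referenced in Example #4) might borrow and return a browser; the body is illustrative only, not the class's actual implementation:

	def fetch_url(self, url):
		# Hypothetical worker: borrow a browser from the pool, load the page,
		# and always return the browser so other threads can reuse it.
		browser = self.browser_queue.get()
		try:
			browser.get(url)
			html = browser.page_source
		finally:
			self.browser_queue.put(browser)
		return html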
Example #7
def build_site_simhash_dict(observed_sites):
	"""
	Return two dicts: one maps site name to all of its simhashes,
	the other maps site name to the observed site.
	"""
	valid_instance(observed_sites, CD.ObservedSites)
	site_simhash_dict = dict()
	observed_sites_dict = dict()
	attr_name = get_simhash_type(observed_sites.config.simhash_type)
	for observed_site in observed_sites.site:
		if observed_site.name not in site_simhash_dict:
			site_simhash_dict[observed_site.name] = set()
			observed_sites_dict[observed_site.name] = observed_site
		for observation in observed_site.observation:
			site_simhash_dict[observed_site.name].add(getattr(observation,
				attr_name))
	return site_simhash_dict, observed_sites_dict
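A possible way to call this helper, assuming an ObservedSites protobuf file produced elsewhere in the pipeline (the filename is hypothetical):

observed_sites = CD.ObservedSites()
read_proto_from_file(observed_sites, "user.text.observed_sites")  # hypothetical input file
site_simhash_dict, observed_sites_dict = build_site_simhash_dict(observed_sites)
for name in site_simhash_dict:
	print name, len(site_simhash_dict[name])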
Example #8
def crawl_log_attr_set(crawl_log, attr_name, success_only=True):
	"""
	Get attribute set from CrawlLog.
	@parameter
	crawl_log: the crawl log to extract attribute set from.
	attr_name: the name of attribute in crawl_log to collect.
	@return
	attr_set: the set of attribute values corresponding to attr_name
	"""
	valid_instance(crawl_log, CD.CrawlLog)
	attr_set = set()
	for result_search in crawl_log.result_search:
		for result in result_search.result:
			# when success_only is set, only collect values from successful results
			if success_only:
				if result.success:
					attr_set.add(getattr(result, attr_name))
			else:
				attr_set.add(getattr(result, attr_name))
	return attr_set
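A hypothetical usage sketch: load a crawl log from disk and pull out the landing URLs of successful fetches (the filename is illustrative):

crawl_log = CD.CrawlLog()
read_proto_from_file(crawl_log, "ad_crawl_log.20141001-120000")  # hypothetical file
landing_url_set = crawl_log_attr_set(crawl_log, "landing_url")
print len(landing_url_set), "successful landing urls"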
Example #9
def collect_site_for_plot(site_set, outdir, mode="user"):
	"""
	Collect user and google observations for each site in site_set.
	This is scheduled by a cron job, in order to show how the hash values of
	websites change over time.

	@parameter
	site_set: the set of urls to visit
	outdir: the output directory
	mode: which user agent to use; supported modes are user, google, both
	"""
	valid_instance(site_set, set)
	mkdir_if_not_exist(outdir)

	user_UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/" \
			"537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
	google_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
	crawl_config = CD.CrawlConfig()
	crawl_config.maximum_threads = 1
	crawl_config.browser_type = CD.CrawlConfig.CHROME
	crawl_config.crawl_log_dir = outdir
	now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
	UAs = dict()
	if mode == "user":
		UAs["user"] = user_UA
	elif mode == "google":
		UAs["google"] = google_UA
	elif mode == "both":
		UAs["user"] =  user_UA
		UAs["google"] = google_UA
	else:
		raise Exception("Unknown mode {0}".format(mode))
	for mode in UAs:
		crawl_config.user_agent = UAs[mode]
		crawl_config.user_agent_md5_dir = outdir + hex_md5(crawl_config.user_agent) \
				+ now_suffix + '/'
		crawl_config.log_filename = mode + '_crawl_log' + now_suffix
		mode_visit = Visit(crawl_config)
		mode_visit.visit_landing_url(site_set)
		mode_visit.write_crawl_log(False)
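A hypothetical cron-style driver for this function, assuming a plain text file with one URL per line (filenames are illustrative):

site_set = set(filter(bool, open("plot_sites.txt", 'r').read().split('\n')))  # hypothetical input
collect_site_for_plot(site_set, "plot_data/", mode="both")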
Example #10
	def update_crawl_config(self, crawl_config):
		valid_instance(crawl_config, CD.CrawlConfig)
		self.crawl_config = CD.CrawlConfig()
		self.crawl_config.CopyFrom(crawl_config)
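Example #11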
def search_and_revisit(word_file, n, threads=6, ad_only=False):
	"""
	This function does the following things.
	1. Search each word in word file.
	2. Grab the top 200 returned results and corresponding ads
	3. Visit all the results and ads with "chrome user agent", repeat n times
	4. Visit all the landing pages in step 3 with "google ads bot user agent"

	@parameter
	word_file: the filename containing the words to search
	n: number of times to repeat step 3
	threads: number of crawler threads to use
	ad_only: only retrieve the advertisements; in this case, only the first 5 result pages are viewed

	@output
	The following are the outputs of this function:
	Running log:
	[WORD_FILE].selenium.crawl/running_log.[SEARCH_TIME]
	"chrome user agent" result is:
	[WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5]
	[WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5]
	[WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME]/[URL_MD5]/index.html
	"google ads bot user agent" result is:
	[WORD_FILE].selenium.crawl/ad_crawl_log.[SEARCH_TIME].[WORD_MD5].google
	[WORD_FILE].selenium.crawl/search_crawl_log.[SEARCH_TIME].[WORD_MD5].google
	[WORD_FILE].selenium.crawl/[WORD_MD5]/[UA_MD5].[SEARCH_TIME].revisit.[REVISIT_TIME]/[URL_MD5]/index.html
	"""
	valid_instance(threads, int)
	# prepare search and visit
	user_UA = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/" \
			"537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
	user_suffix = "selenium.crawl/"
	search_now_suffix = datetime.now().strftime(".%Y%m%d-%H%M%S")
	word_md5_delimiter = "WORD_MD5"

	# compute base_dir and start logging
	base_dir = '.'.join([word_file, user_suffix])
	mkdir_if_not_exist(base_dir)
	logging.basicConfig(filename=base_dir+'running_log'+search_now_suffix, level=logging.DEBUG)
	logging.getLogger("global")

	# set search and visit crawl_config
	search_config = CD.CrawlConfig()
	search_config.maximum_threads = threads
	search_config.user_agent = user_UA
	# number of top search results to be inspected
	if ad_only:
		search_config.count = 50
	search_config.browser_type = CD.CrawlConfig.CHROME

	ad_crawl_config = CD.CrawlConfig()
	ad_crawl_config.CopyFrom(search_config)
	ad_crawl_config.result_type = CD.AD
	ad_crawl_config.crawl_log_dir = base_dir
	ad_log_filename_prefix = 'ad_crawl_log' + search_now_suffix
	ad_dir_prefix = base_dir + word_md5_delimiter + "/" + \
			hex_md5(ad_crawl_config.user_agent) + search_now_suffix + '/'
	search_crawl_config = CD.CrawlConfig()
	search_crawl_config.CopyFrom(search_config)
	search_crawl_config.result_type = CD.SEARCH
	search_crawl_config.crawl_log_dir = base_dir
	search_log_filename_prefix = 'search_crawl_log' + search_now_suffix
	search_dir_prefix = base_dir + word_md5_delimiter + "/" + \
			hex_md5(search_crawl_config.user_agent) + search_now_suffix + '/'

	# print crawl_config.user_agent
	words = SearchTerm(word_file)
	search = Search(search_config)
	ad_visit = Visit(ad_crawl_config, 1)
	search_visit = Visit(search_crawl_config, 1)

	# prepare the revisit
	google_ad_UA = "AdsBot-Google (+http://www.google.com/adsbot.html)"
	google_search_UA = "Googlebot/2.1 (+http://www.google.com/bot.html)"

	# set revisit crawl_config
	revisit_crawl_config = CD.CrawlConfig()
	revisit_crawl_config.maximum_threads = threads
	revisit_crawl_config.browser_type = CD.CrawlConfig.CHROME
	# base directory uses search_now_suffix to correlate these two
	revisit_crawl_config.crawl_log_dir = base_dir

	# search, visit and revisit each word
	for word in words.get_word_list():
		print "Processing {0} word: {1}".format(words.current(), word)
		# update word_md5 related directories
		print word
		word_md5 = hex_md5(word)
		ad_crawl_config.log_filename = ad_log_filename_prefix + "." + word_md5
		ad_crawl_config.user_agent_md5_dir = word_md5.join(
				ad_dir_prefix.split(word_md5_delimiter))
		search_crawl_config.log_filename = search_log_filename_prefix + "." + word_md5
		search_crawl_config.user_agent_md5_dir = word_md5.join(
				search_dir_prefix.split(word_md5_delimiter))
		ad_visit.update_crawl_config(ad_crawl_config)
		search_visit.update_crawl_config(search_crawl_config)
		
		# search and crawl
		right_click = not ad_only
		ad_set, search_set = search.search(word, right_click)
		ad_crawl_log_filename = ad_visit.visit(ad_set, word)
		if ad_only:
			search_crawl_log_filename = None
		else:
			search_crawl_log_filename = search_visit.visit(search_set, word)

		# revisit
		crawl_log_file_list = list()
		if ad_crawl_log_filename:
			crawl_log_file_list.append(ad_crawl_log_filename)
		if search_crawl_log_filename:
			crawl_log_file_list.append(search_crawl_log_filename)
		for crawl_log_file in crawl_log_file_list:
			if crawl_log_file == ad_crawl_log_filename:
				revisit_crawl_config.user_agent = google_ad_UA
			else:
				revisit_crawl_config.user_agent = google_search_UA
			revisit_dir_prefix = base_dir + word_md5_delimiter + "/" + \
					hex_md5(revisit_crawl_config.user_agent) + search_now_suffix
			revisit_crawl_config.log_filename = crawl_log_file.split('/')[-1] + '.google'
			revisit = Visit(revisit_crawl_config)
			crawl_log = CD.CrawlLog()
			read_proto_from_file(crawl_log, crawl_log_file)
			revisit.visit_landing_url_n_times(crawl_log, int(n), revisit_dir_prefix,
					word_md5, word_md5_delimiter)
		words.next()
		"""
Example #12
def update_groundtruth_redundant(count, original_expected,
		original_u_text, original_u_dom,
		original_g_text, original_g_dom,
		add_expected, add_all,
		out_expected, out_u_text, out_u_dom,
		out_g_text, out_g_dom):
	"""
	Sample up to `count` site names that appear in add_expected but not in
	original_expected, pull their user/google text and dom observations,
	and merge them into the original groundtruth, writing the merged
	results to the corresponding out_* files.
	"""
	valid_instance(count, int)
	in_e = CD.ObservedSites()
	read_proto_from_file(in_e, original_expected)
	in_u_t = CD.ObservedSites()
	read_proto_from_file(in_u_t, original_u_text)
	in_u_d = CD.ObservedSites()
	read_proto_from_file(in_u_d, original_u_dom)
	in_g_t = CD.ObservedSites()
	read_proto_from_file(in_g_t, original_g_text)
	in_g_d = CD.ObservedSites()
	read_proto_from_file(in_g_d, original_g_dom)
	add_e = CD.ObservedSites()
	read_proto_from_file(add_e, add_expected)

	in_e_set = sites_name_set(in_e)
	add_e_set = sites_name_set(add_e)
	diff_e_set = add_e_set - in_e_set
	logger = logging.getLogger("global")
	logger.info("size of original set: {0}, size of add set: {1}, size of diff set: {2}".format(
		len(in_e_set), len(add_e_set), len(diff_e_set)))
	logger.info("diff set is")	
	logger.info(diff_e_set)
	diff_e_list = list(diff_e_set)
	logger.info(len(diff_e_list))
	random.shuffle(diff_e_list)
	diff_e_sample = diff_e_list[:count]
	

	"""
	get the sites that are in "de-deduplicated" examples and add them
	this is necessary, because there are sites, that are cloaking, but
	remove in de-dup phase. Doesn't know why.
	"""
	add_u_text_fs = filter(bool, open(add_all, 'r').read().split('\n'))
	diff_e_sample = set(_output_sample_sites(diff_e_sample, add_u_text_fs, add_all
		+ ".u.text.temp"))

	# use the updated diff expected set, to generate the new data
	_output_sample_sites(diff_e_sample, [add_expected], add_expected + ".temp")
	add_u_dom_fs = _replace_list_by(add_u_text_fs, 'text', 'dom')
	_output_sample_sites(diff_e_sample, add_u_dom_fs, add_all + ".u.dom.temp")
	add_g_text_fs = _replace_list_by(add_u_text_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_text_fs, add_all + ".g.text.temp")
	add_g_dom_fs = _replace_list_by(add_u_dom_fs, 'user', 'google')
	_output_sample_sites(diff_e_sample, add_g_dom_fs, add_all + ".g.dom.temp")
	out_expected_sites = merge_observed_sites([original_expected, add_expected + ".temp"])
	out_u_t_sites = merge_observed_sites([original_u_text, add_all + ".u.text.temp"])
	out_u_d_sites = merge_observed_sites([original_u_dom, add_all + ".u.dom.temp"])
	out_g_t_sites = merge_observed_sites([original_g_text, add_all +
		".g.text.temp"])
	out_g_d_sites = merge_observed_sites([original_g_dom, add_all +
		".g.dom.temp"])
	out_u_t_sites.config.CopyFrom(in_u_t.config)
	out_u_d_sites.config.CopyFrom(in_u_d.config)
	out_g_t_sites.config.CopyFrom(in_g_t.config)
	out_g_d_sites.config.CopyFrom(in_g_d.config)
	write_proto_to_file(out_expected_sites, out_expected)
	write_proto_to_file(out_u_t_sites, out_u_text)
	write_proto_to_file(out_u_d_sites, out_u_dom)
	write_proto_to_file(out_g_t_sites, out_g_text)
	write_proto_to_file(out_g_d_sites, out_g_dom)
Example #13
def _replace_list_by(to_replace_list, src, dst):
	valid_instance(to_replace_list, list)
	return [item.replace(src, dst) for item in to_replace_list]
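A small illustrative example of this helper (paths are hypothetical), mirroring how the user-crawl file lists in Example #12 are mapped to their google counterparts:

user_text_files = ["a.user.text.sites", "b.user.text.sites"]  # hypothetical paths
google_text_files = _replace_list_by(user_text_files, "user", "google")
# -> ["a.google.text.sites", "b.google.text.sites"]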