def search(self, search_term, right_click=True):
		"""
		Search search_term in the browser and collect advertisement and organic result links.
		@parameter
		search_term: the words to search for
		right_click: whether to right click on each search result
		@return
		ad_set, search_set: the set of advertisement links and the set of search result links
		"""
		# start browser
		self.browser = start_browser(self.crawl_config.browser_type, incognito=False,
				user_agent=self.crawl_config.user_agent)
		self.browser.set_page_load_timeout(15)

		# search, paginating through result pages
		start = 0
		ad_set = set()
		search_set = set()
		while start < self.crawl_config.count:
			try:
				# google search advertisements or results
				url = 'https://www.google.com/?gws_rd=ssl#q='
				url += '+'.join(search_term.split(' '))
				# append the start offset when requesting pages beyond the first
				if start > 0:
					url += '&start={0}'.format(start)
				self.browser.get(url)
				# wait until page load complete
				elem = wait_find_element(self.browser, 'id', 'ires')
				if elem is None:
					raise Exception("Page load failed.")
				time.sleep(random.randint(1, 3))
				ad_set = ad_set | self.ad_links()
				if right_click:
					search_set = search_set | self.search_results()
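				# move to the next page of results (Google paginates 10 results per page)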
				start = start + 10
			except Exception:
				# For robustness, don't throw errors here.
				safe_quit(self.browser)
				logger = logging.getLogger("global")
				logger.error("error in search")
				logger.error(sys.exc_info()[0])
				if switch_vpn_state(self.connected):
					self.connected = not self.connected
				self.browser = restart_browser(self.crawl_config.browser_type,
						incognito=False,
						user_agent=self.crawl_config.user_agent,
						browser=self.browser)
		safe_quit(self.browser)
		return ad_set, search_set 
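
# Example usage (illustrative sketch; `crawler` is a hypothetical instance of this
# class, constructed elsewhere with a populated crawl_config):
#   ad_set, search_set = crawler.search('example query', right_click=True)
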
	def fetch_url(self, url):
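		"""
		Fetch url in a pooled browser and return a CrawlResult recording whether the load succeeded.
		"""
		# Block until a browser instance is available from the shared pool, polling every 5 seconds.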
		while True:
			self.lock.acquire()
			if self.browser_queue.empty():
				self.lock.release()
				time.sleep(5)
			else:
				browser = self.browser_queue.get()
				self.lock.release()
				break
		result = CD.CrawlResult() # record whether url loading failed!
		result.url = url
		result.url_md5 = hex_md5(url)
		result.success = True
		try:
			browser.get(result.url)
			# Handle the <stay on this page> / <leave this page> alert by clearing onbeforeunload.
			browser.execute_script("window.onbeforeunload = function() {};")
			time.sleep(1)
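			# Heuristic: treat common error-page titles (and, on Chrome, the
			# 'chromewebdata' sentinel URL) as a failed page load.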
			error_markers = ['404 Not Found', '403', 'Forbidden', 'not available',
					'Problem loading page', 'Page not found', 'Error', 'Access denied']
			title_has_error = any(marker in browser.title for marker in error_markers)
			if self.crawl_config.browser_type == CD.CrawlConfig.CHROME and \
					(title_has_error or browser.current_url == 'data:text/html,chromewebdata'):
				result.landing_url = browser.current_url
				result.landing_url_md5 = hex_md5(result.landing_url)
				result.success = False
			elif self.crawl_config.browser_type == CD.CrawlConfig.FIREFOX and title_has_error:
				result.landing_url = browser.current_url
				result.landing_url_md5 = hex_md5(result.landing_url)
				result.success = False
			else:
				#############
				# md5 the original url
				url_md5_dir = self.crawl_config.user_agent_md5_dir + result.url_md5 + '/'
				mkdir_if_not_exist(url_md5_dir)
				# get the landing url
				result.landing_url = browser.current_url
				result.landing_url_md5 = hex_md5(result.landing_url)
				# get the whole page source
				response = browser.execute_script("return document.documentElement.innerHTML;")
				result.file_path = url_md5_dir + 'index.html'
				with open(result.file_path, 'w') as f:
					f.write(response.encode('utf-8'))
			browser.delete_all_cookies()
			if len(browser.window_handles) > 1:
				# close all the other windows
				current_window_handle = browser.current_window_handle
				for handle in browser.window_handles:
					if handle != current_window_handle:
						browser.switch_to_window(handle)
						browser.close()
				# switch back to the current window
				browser.switch_to_window(current_window_handle)
		except Exception:
			result.success = False
			browser = restart_browser(self.crawl_config.browser_type, incognito=False,
					user_agent=self.crawl_config.user_agent, browser=browser)
		self.browser_queue.put(browser)
		logger = logging.getLogger("global")
		logger.info("the length of the browser_queue")
		logger.info(self.browser_queue.qsize())
		return result
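
# Example usage (illustrative sketch; `crawler` is a hypothetical instance of this class
# whose browser_queue has been pre-filled with started browsers):
#   result = crawler.fetch_url('http://example.com/')
#   if result.success:
#       print(result.file_path)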