Exemplo n.º 1
0
	def __init__(self, crawl_config):
		"""
		Set up the crawler state from crawl_config.
		@parameter
		crawl_config: configuration message; it is checked with
			valid_instance against CD.CrawlConfig and then copied, so the
			instance keeps its own CD.CrawlConfig object
		"""
		valid_instance(crawl_config, CD.CrawlConfig)
		# Work on a private copy rather than on the caller's object.
		config = self.crawl_config = CD.CrawlConfig()
		config.CopyFrom(crawl_config)
		# One browser per configured thread, all parked in a queue.
		pool = self.browser_queue = Queue.Queue()
		for _ in xrange(config.maximum_threads):
			new_browser = start_browser(
					config.browser_type,
					incognito=False,
					user_agent=config.user_agent)
			new_browser.set_page_load_timeout(15)
			pool.put(new_browser)
		self.lock = threading.Lock()
	def search(self, search_term, right_click = True):
		"""
		Run a Google search for search_term and collect links page by page.

		A fresh browser is started for the call and is always quit before
		the method exits. Result pages are requested with a start offset
		that grows by 10 until it reaches self.crawl_config.count. When a
		page fails, the browser is quit, switch_vpn_state is called, the
		browser is restarted and the same offset is tried again.
		@parameter
		search_term: the words to search; split on single spaces and joined
			with '+' to build the query
		right_click: when True, self.search_results() is also collected for
			each page; when False, the returned search_set stays empty
		@return
		(ad_set, search_set): the union of self.ad_links() over all loaded
			pages, and the union of self.search_results() over all loaded
			pages
		"""
		# start browser
		self.browser = start_browser(self.crawl_config.browser_type, incognito=False,
				user_agent=self.crawl_config.user_agent)
		self.browser.set_page_load_timeout(15)

		# search
		start = 0
		ad_set = set()
		search_set = set()
		try:
			while start < self.crawl_config.count:
				try:
					# google search advertisements or results
					url = 'https://www.google.com/?gws_rd=ssl#q='
					url += '+'.join(search_term.split(' '))
					# append start when the start is greater than zero
					if start > 0:
						url += '&start={0}'.format(start)
					self.browser.get(url)
					# wait until page load complete
					elem = wait_find_element(self.browser, 'id', 'ires')
					if elem is None:
						raise Exception("Page load failed.")
					# random pause between pages
					time.sleep(random.randint(1, 3))
					ad_set |= self.ad_links()
					if right_click:
						search_set |= self.search_results()
					start = start + 10
				except Exception:
					# For robustness, don't throw errors here. Only Exception
					# is caught, so KeyboardInterrupt and SystemExit still
					# stop the crawl instead of triggering a retry.
					# NOTE(review): start is not advanced on failure, so a
					# page that keeps failing is retried without limit.
					safe_quit(self.browser)
					logger = logging.getLogger("global")
					logger.error("error in search")
					logger.error(sys.exc_info()[0])
					# NOTE(review): self.connected must be set before this
					# method is called; confirm where it is initialised.
					if switch_vpn_state(self.connected):
						self.connected = not self.connected
					self.browser = restart_browser(self.crawl_config.browser_type,
							incognito=False,
							user_agent=self.crawl_config.user_agent,
							browser=self.browser)
		finally:
			# Quit the browser even when an error or an interrupt
			# propagates, so no browser process is left behind.
			safe_quit(self.browser)
		return ad_set, search_set