Пример #1
0
	def set_proxy(self):
		"""Select a proxy from the proxy list, apply it to Firefox and test it.

		Asks ``self.proxy_list`` to select a proxy, stores it in
		``self.actual_proxy``, writes the HTTP and SSL proxy preferences
		through a script executed in the browser's "chrome" context, and then
		calls ``self.test_proxy()``.

		When ``self.proxy_list`` is falsy (no proxy configured) only a log
		line is emitted.

		Raises:
			Whatever ``self.driver.execute`` / ``execute_script`` raise; the
			driver context is switched back to "content" before propagating.
		"""
		if not self.proxy_list:
			logger.info("------------------------La cagó pedazo e bola!!!!")
			return

		self.proxy_list.select_proxy()
		self.actual_proxy = self.proxy_list.selected_proxy
		chosen = self.actual_proxy
		logger.info("Setting {}:{} from {} as proxy".format(chosen.host, chosen.port, chosen.geo))

		# The preference script below is run in the "chrome" context.
		self.driver.execute("SET_CONTEXT", {"context": "chrome"})
		try:
			self.driver.execute_script("""
				Services.prefs.setIntPref('network.proxy.type', 1);
				Services.prefs.setCharPref("network.proxy.http", arguments[0]);
				Services.prefs.setIntPref("network.proxy.http_port", arguments[1]);
				Services.prefs.setCharPref("network.proxy.ssl", arguments[0]);
				Services.prefs.setIntPref("network.proxy.ssl_port", arguments[1]);""",
				chosen.host, chosen.port
				)
		finally:
			# Always return to the page context, even if the script failed.
			self.driver.execute("SET_CONTEXT", {"context": "content"})

		# Test only after the preferences were written successfully: running
		# this from ``finally`` would test a proxy that was never applied and
		# could mask the original exception.
		self.test_proxy()

		"""
Пример #2
0
 def get_proxies(self):
     """Fetch a fresh batch of proxies, retrying with a growing back-off.

     Each attempt increments ``self.consults``. An attempt builds an
     ``asyncio.Queue``, hands it to a ``Broker`` and runs
     ``broker.find(...)`` together with ``self.append_proxies(queue)`` on
     the event loop until both finish. On ``RuntimeError`` the attempt is
     repeated after sleeping ``5 * self.retries`` seconds.

     Once ``self.consults`` reaches 10 the method pauses for 5 seconds and
     resets the counter before trying again.

     NOTE(review): the original recursed forever at this point (the
     counter was never reset), ending in ``RecursionError``. Resetting
     after the cool-down is the assumed intent -- confirm.
     """
     # Iterative on purpose: the recursive form grew the call stack on every
     # retry and could never leave the cool-down branch.
     while True:
         self.consults += 1
         if self.consults >= 10:
             sleep(5)
             self.consults = 0
             continue

         try:
             logger.info("ProxyGetter: ---> Starting to get proxies")
             found = asyncio.Queue()
             broker = Broker(found)
             tasks = asyncio.gather(
                 broker.find(types=self.types,
                             limit=self.limit,
                             countries=self.countries_list),
                 self.append_proxies(found))
             loop = asyncio.get_event_loop()
             loop.run_until_complete(tasks)
         except RuntimeError:
             self.retries += 1
             delay = 5 * self.retries
             logger.info(
                 "ProxyGetter: ---> Getproxy fail, waiting {} to the next try"
                 .format(delay))
             sleep(delay)
         else:
             self.retries = 0
             return
Пример #3
0
	def __init__(self,
				 name,
				 urls,
				 str_file,
				 test_url="http://ipv4.plain-text-ip.com/",
				 # Alternative test page:
				 # test_url="https://www.york.ac.uk/teaching/cws/wws/webpage1.html",
				 data_folder="{}/scrapped".format(thispath),
				 proxy=False,
				 headless=False,
				 wait=1,
				 max_nexts=0):
		"""Store the configuration, build the driver and start crawling.

		Construction is not passive: after storing its arguments it calls
		``set_dom()``, ``configure_driver()`` and ``crawl()`` in that order.

		Args:
			name: Scraper name; also the sub-folder appended to ``data_folder``.
			urls: Iterable of URL entries handed to ``crawl()``.
			str_file: Stored as ``self.str_file`` (not used in this method).
			test_url: Page stored as ``self.test_url``.
			data_folder: Base output folder; ``self.data_folder`` becomes
				``"<data_folder>/<name>"``.
			proxy: Falsy for no proxy; a dict with the keys ``"Types"``,
				``"Limit"`` and ``"Countries"`` to configure the proxy list;
				any other truthy value for a default ``proxies()`` list.
			headless: Stored as ``self.headless``.
			wait: Stored as ``self.wait``.
			max_nexts: Stored as ``self.max_nexts``.

		Raises:
			KeyError: If ``proxy`` is a dict missing one of the expected keys.
		"""
		logger.info("")
		logger.info("********************************************************************")
		logger.info("***               Inicializando el scrapper                      ***")
		logger.info("********************************************************************")
		logger.info("")

		self.name = name
		self.urls = urls
		self.str_file = str_file
		self.test_url = test_url
		self.data_folder = "{}/{}".format(data_folder, self.name)

		self.proxy = proxy
		# Defined unconditionally so code that checks ``self.actual_proxy``
		# does not hit AttributeError when no proxy is configured.
		self.actual_proxy = False
		if not self.proxy:
			self.proxy_list = False
		elif isinstance(self.proxy, dict):
			self.proxy_list = proxies(typ=self.proxy["Types"],
									  lim=self.proxy["Limit"],
									  countries_list=self.proxy["Countries"])
		else:
			self.proxy_list = proxies()

		self.headless = headless
		self.wait = wait
		self.max_nexts = max_nexts

		self.data = []
		self.previus_data = []

		self.tread = False

		self.set_dom()
		self.configure_driver()
		self.crawl()
Пример #4
0
	def configure_driver(self):
		"""Create ``self.driver``, a Firefox instance with a randomised user agent.

		Builds ``self.firefox_capabilities``, ``self.options`` and
		``self.firefox_profile``. The profile always enables private browsing
		autostart and overrides the user agent with a random entry from
		``self.UserAgentList``. In headless mode ``--headless`` is added to
		the options; otherwise a handful of plugin/media preferences are
		switched off.

		Raises:
			Whatever the ``Firefox`` constructor raises.
		"""
		# Copy: DesiredCapabilities.FIREFOX is a class-level dict, so storing
		# it directly would let later changes leak into every other user.
		self.firefox_capabilities = DesiredCapabilities.FIREFOX.copy()

		user_agent = random.choice(self.UserAgentList)
		logger.info("User agent: {}".format(user_agent))

		self.options = Options()
		self.firefox_profile = FirefoxProfile()
		self.firefox_profile.set_preference("browser.privatebrowsing.autostart", True)

		if self.headless:
			self.options.add_argument('--headless')
		else:
			# These lines do not work for now; look for alternatives:
			#   self.firefox_profile.set_preference('permissions.default.image', 2)
			#   self.firefox_profile.set_preference("permissions.default.stylesheet", 2)
			disabled_prefs = (
				'dom.ipc.plugins.enabled.libflashplayer.so',
				'media.navigator.video.enabled',
				'media.encoder.webm.enabled',
				'media.ffmpeg.enabled',
				'media.flac.enabled',
			)
			for pref_name in disabled_prefs:
				self.firefox_profile.set_preference(pref_name, False)

		self.firefox_profile.set_preference("general.useragent.override", user_agent)
		self.firefox_profile.update_preferences()

		self.driver = Firefox(capabilities=self.firefox_capabilities,
							  options=self.options,
							  firefox_profile=self.firefox_profile,
							  executable_path='geckodriver')
Пример #5
0
	def test_proxy(self, timeout = 10):
		"""Load ``self.test_url`` through the current proxy to check it works.

		Does nothing when ``self.actual_proxy`` is falsy.

		On a page-load timeout with the default ``timeout`` of 10 the test is
		repeated once with a 20 second timeout. If that also times out, or the
		driver raises any other ``WebDriverException``, the proxy is reported
		to ``self.proxy_list.proxy_notwork`` and ``self.set_proxy()`` is
		called to switch to another one.

		Args:
			timeout: Page-load timeout passed to
				``self.driver.set_page_load_timeout``.
		"""
		if not self.actual_proxy:
			return

		current = self.actual_proxy
		logger.info("Testing {}:{} from {} as proxy".format(current.host, current.port, current.geo))
		try:
			self.driver.set_page_load_timeout(timeout)
			self.driver.get(self.test_url)

		# TimeoutException must be handled before the broader
		# WebDriverException handler below.
		except TimeoutException:
			if timeout == 10:
				# First failure with the default timeout: give it one longer try.
				self.test_proxy(timeout=20)
			else:
				logger.info("{} Time out".format(current.host))
				self.proxy_list.proxy_notwork(current.host, current.port)
				self.set_proxy()

		except WebDriverException:
			logger.info("{} Something goes wrong".format(current.host))
			# proxy_notwork matches entries on ``host``, so pass ``host`` here.
			self.proxy_list.proxy_notwork(current.host, current.port)
			self.set_proxy()

		else:
			logger.info("{} worked in test page".format(current.host))
Пример #6
0
 def proxy_notwork(self, host, port):
     """Drop every proxy matching ``host`` and ``port`` from the list.

     Logs the list size before the removal and, when the list was not
     empty, after it.

     Args:
         host: Host compared against each entry's ``host`` attribute.
         port: Port compared against each entry's ``port`` attribute.
     """
     logger.info("ProxyGetter: -----> Proxies in list before : {} ".format(
         len(self.proxy_list)))
     if len(self.proxy_list) > 0:
         # Iterate over a snapshot: removing from the list being iterated
         # makes the loop skip the element right after each removal.
         for candidate in list(self.proxy_list):
             if candidate.host == host and candidate.port == port:
                 logger.info(
                     "ProxyGetter: ---> {} doesn't work, deleting".format(
                         host))
                 self.proxy_list.remove(candidate)
         logger.info(
             "ProxyGetter: -----> Proxies in list after : {} ".format(
                 len(self.proxy_list)))
Пример #7
0
	def crawl(self):
		"""Visit every entry of ``self.urls`` and then close the driver.

		For each entry the CSV path ``self.data_file`` is derived from the
		entry's ``"PARAM"`` value (spaces removed), ``createDataFile()`` is
		called, the entry is stored in ``self.url`` and ``navigate()`` runs.

		The driver is closed even when an entry fails, so a failed crawl does
		not leave a browser window behind; the exception still propagates.
		"""
		logger.info("Starting crawler")
		try:
			for entry in self.urls:
				logger.info("Crawling {}".format(entry["URL"]))
				filename = entry["PARAM"].replace(" ", "")
				self.data_file = "{}/{}.csv".format(self.data_folder, filename)
				self.createDataFile()
				self.url = entry
				self.navigate()
		finally:
			self.driver.close()
		logger.info("Finishing crawler")
Пример #8
0
 def select_proxy(self):
     """Store the first working proxy of the list in ``self.selected_proxy``.

     Entries at the head of ``self.proxy_list`` whose ``is_working`` flag
     is falsy are removed. Whenever the list is empty ``get_proxies()`` is
     called to refill it before looking again.
     """
     # A loop instead of recursion: each discarded proxy or refill used to
     # add a stack frame.
     while True:
         if len(self.proxy_list) == 0:
             logger.info("ProxyGetter: ---> Proxies list exhausted")
             self.get_proxies()
             continue

         candidate = self.proxy_list[0]
         if candidate.is_working:
             self.selected_proxy = candidate
             logger.info("ProxyGetter: ---> {} selected".format(
                 self.selected_proxy.host))
             return

         # Log and remove the rejected entry itself, not the previously
         # selected proxy (which may not even exist yet).
         logger.info(
             "ProxyGetter: ---> {} reported as not working, deleting".
             format(candidate.host))
         self.proxy_list.remove(candidate)