# NOTE(review): this `def __init__` sits at module level and is a byte-for-byte
# duplicate of RequestProxy.__init__ defined below — it looks like a stray
# copy/paste left above the class. Confirm it is unused and remove it.
def __init__(self, web_proxy_list=[], sustain=False):
    # NOTE(review): `web_proxy_list=[]` is a mutable default argument; the
    # `self.proxy_list += ...` below appends into that shared default list,
    # so proxies accumulate across instances. Flagged, not fixed here.
    self.userAgent = UserAgentManager()
    # Root logger; `handler` is presumably configured elsewhere in this file —
    # TODO confirm it exists at module scope.
    self.logger = logging.getLogger()
    self.logger.addHandler(handler)
    self.logger.setLevel(0)  # level 0 == NOTSET: delegate to handler/root level
    #####
    # Each of the classes below implements a specific URL Parser
    #####
    parsers = []
    parsers.append(freeproxyParser('http://free-proxy-list.net'))
    parsers.append(proxyforeuParser('http://proxyfor.eu/geo.php', 100.0))
    parsers.append(rebroweeblyParser('http://rebro.weebly.com/proxy-list.html'))
    parsers.append(semairproxyParser('http://www.samair.ru/proxy/time-01.htm'))
    self.logger.debug("=== Initialized Proxy Parsers ===")
    for i in range(len(parsers)):
        self.logger.debug("\t {0}".format(parsers[i].__str__()))
    self.logger.debug("=================================")
    self.sustain = sustain
    self.parsers = parsers
    # Seed the pool with caller-supplied proxies, then append scraped ones.
    self.proxy_list = web_proxy_list
    for i in range(len(parsers)):
        self.proxy_list += parsers[i].parse_proxyList()
    self.current_proxy = self.randomize_proxy()
class RequestProxy:
    """Issues HTTP requests through a rotating pool of public proxies.

    On construction, a set of site-specific parsers scrapes proxy lists from
    public sources; each request then goes out through a (by default, freshly
    randomized) proxy with randomized request headers. Proxies that raise
    connection/timeout/encoding errors are evicted from the pool.
    """

    def __init__(self, web_proxy_list=None, sustain=False):
        """Build the proxy pool and pick an initial proxy.

        :param web_proxy_list: optional list of proxies to seed the pool with.
            Defaults to an empty pool. (A fresh list is made per instance —
            the old ``web_proxy_list=[]`` default was mutated by ``+=`` below
            and therefore shared proxies across instances.)
        :param sustain: when True, keep using the same proxy between requests
            instead of re-randomizing before each one.
        """
        self.userAgent = UserAgentManager()
        # `handler` is presumably configured at module scope — TODO confirm.
        self.logger = logging.getLogger()
        self.logger.addHandler(handler)
        self.logger.setLevel(0)  # NOTSET: defer to handler/root level
        #####
        # Each of the classes below implements a specific URL Parser
        #####
        parsers = [
            freeproxyParser('http://free-proxy-list.net'),
            proxyforeuParser('http://proxyfor.eu/geo.php', 100.0),
            rebroweeblyParser('http://rebro.weebly.com/proxy-list.html'),
            semairproxyParser('http://www.samair.ru/proxy/time-01.htm'),
        ]
        self.logger.debug("=== Initialized Proxy Parsers ===")
        for parser in parsers:
            self.logger.debug("\t {0}".format(parser.__str__()))
        self.logger.debug("=================================")
        self.sustain = sustain
        self.parsers = parsers
        # Copy the seed list so we never mutate the caller's (or a default) list.
        self.proxy_list = list(web_proxy_list) if web_proxy_list else []
        for parser in parsers:
            self.proxy_list += parser.parse_proxyList()
        self.current_proxy = self.randomize_proxy()

    def set_logger_level(self, level):
        """Set the verbosity of the instance logger."""
        self.logger.setLevel(level)

    def get_proxy_list(self):
        """Return the current pool of proxies (live list, not a copy)."""
        return self.proxy_list

    def generate_random_request_headers(self):
        """Return headers with a randomly selected User-Agent."""
        headers = {
            "Connection": "close",  # another way to cover tracks
            "User-Agent": self.userAgent.get_random_user_agent()
        }  # select a random user agent
        return headers

    def randomize_proxy(self):
        """Pick a random non-empty proxy from the pool and make it current.

        :raises IndexError: if the pool is empty (same exception type
            ``random.choice`` would raise on an empty sequence).
        """
        if not self.proxy_list:
            raise IndexError("Cannot randomize a proxy from an empty proxy list")
        rand_proxy = random.choice(self.proxy_list)
        # Skip falsy entries (e.g. None/empty strings left in the pool).
        while not rand_proxy:
            rand_proxy = random.choice(self.proxy_list)
        self.current_proxy = rand_proxy
        return rand_proxy

    #####
    # Proxy format:
    # http://<USERNAME>:<PASSWORD>@<IP-ADDR>:<PORT>
    #####
    def generate_proxied_request(self, url, method="GET", params=None,
                                 data=None, headers=None, req_timeout=30):
        """Perform an HTTP request through the current proxy.

        :param url: target URL.
        :param method: HTTP verb, default "GET".
        :param params: query parameters (default empty; mutable defaults
            replaced with None sentinels).
        :param data: request body payload.
        :param headers: extra request headers; the caller's dict is no
            longer mutated in place.
        :param req_timeout: per-request timeout in seconds.
        :returns: the ``requests.Response`` on success, ``None`` if the
            proxy failed and was evicted from the pool.
        """
        params = {} if params is None else params
        data = {} if data is None else data
        headers = {} if headers is None else headers
        try:
            random.shuffle(self.proxy_list)
            # NOTE(review): the original folded `params` into the outgoing
            # headers; preserved for compatibility, but it looks suspicious.
            # (Rewritten from Py2-only `dict(a.items() + b.items())`.)
            req_headers = dict(params)
            req_headers.update(self.generate_random_request_headers())
            if not self.sustain:
                self.randomize_proxy()
            merged_headers = dict(headers)
            merged_headers.update(req_headers)
            self.logger.debug("Using proxy: {0}".format(str(self.current_proxy)))
            request = requests.request(method, url,
                                       proxies={"http": self.current_proxy},
                                       headers=merged_headers, data=data,
                                       params=params, timeout=req_timeout)
            return request
        except ConnectionError:
            self._drop_current_proxy("Proxy unreachable")
        except ReadTimeout:
            self._drop_current_proxy("Read timed out")
        except ChunkedEncodingError:
            self._drop_current_proxy("Wrong server chunked encoding")

    def _drop_current_proxy(self, reason):
        """Evict the current (failed) proxy from the pool and pick a new one.

        :param reason: human-readable failure description for the log line.
        """
        try:
            self.proxy_list.remove(self.current_proxy)
        except ValueError:
            # Already removed (e.g. by a concurrent caller) — nothing to do.
            pass
        self.logger.debug(
            "{0} - Removed Straggling proxy: {1} PL Size = {2}".format(
                reason, self.current_proxy, len(self.proxy_list)))
        self.randomize_proxy()