class TestBaseProxyParsers(unittest.TestCase):
    """Exercises UserAgentManager both with an explicit agents file and
    with no file at all (the 'fake' manager, which returns None)."""

    def setUp(self):
        # Path to the bundled user-agent list, relative to this test file.
        agents_path = os.path.join(
            os.path.dirname(__file__),
            '../http_request_randomizer/requests/data/user_agents.txt')
        self.uafile = UserAgentManager(file=agents_path)
        self.uafake = UserAgentManager()

    def test_agent_size(self):
        # The bundled list ships with at least 899 agents; the file-less
        # manager reports no length at all.
        self.assertTrue(self.uafile.get_len_user_agent() >= 899)
        self.assertIsNone(self.uafake.get_len_user_agent())

    def test_fist_user_agent(self):
        expected = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0"
        self.assertEqual(self.uafile.get_first_user_agent(), expected)
        self.assertIsNone(self.uafake.get_first_user_agent())

    def test_last_user_agent(self):
        expected = "Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.2.15 Version/10.0"
        self.assertEqual(self.uafile.get_last_user_agent(), expected)
        self.assertIsNone(self.uafake.get_last_user_agent())

    def test_random_user_agent(self):
        # Draw two random agents 100 times; they must not collide on
        # every single draw.
        trials = 100
        duplicates = sum(
            1 for _ in range(trials)
            if self.uafile.get_random_user_agent() ==
            self.uafile.get_random_user_agent())
        self.assertNotEqual(duplicates, trials)
class TestBaseProxyParsers(unittest.TestCase):
    """Unit tests for the default (built-in) UserAgentManager."""

    def setUp(self):
        # Manager backed by its bundled user-agent list.
        self.ua = UserAgentManager()

    def test_agent_size(self):
        # The bundled list ships with at least 899 agents.
        self.assertTrue(self.ua.get_len_user_agent() >= 899)

    def test_fist_user_agent(self):
        expected = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0"
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual.
        self.assertEqual(self.ua.get_first_user_agent(), expected)

    def test_last_user_agent(self):
        expected = "Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.2.15 Version/10.0"
        self.assertEqual(self.ua.get_last_user_agent(), expected)

    def test_random_user_agent(self):
        # Two consecutive random draws are overwhelmingly likely to differ.
        self.assertNotEqual(self.ua.get_random_user_agent(),
                            self.ua.get_random_user_agent())
class RequestProxy:
    """Rotating-proxy HTTP client.

    Scrapes public proxy lists at construction time, filters them by the
    requested protocol, and issues requests through a randomly chosen
    proxy with randomized headers.
    """

    def __init__(self, web_proxy_list=None, sustain=False, timeout=5,
                 protocol=Protocol.HTTP, log_level=0):
        """
        :param web_proxy_list: optional seed list of proxies. Default is
            None (start empty); the list is copied so neither the
            caller's list nor a shared default is ever mutated (fixes
            the mutable-default-argument bug).
        :param sustain: when True, keep the same proxy across requests.
        :param timeout: per-parser fetch timeout in seconds.
        :param protocol: keep only proxies supporting this protocol.
        :param log_level: level applied to the root logger.
        """
        self.logger = logging.getLogger()
        self.logger.addHandler(handler)
        self.logger.setLevel(log_level)
        self.userAgent = UserAgentManager(file=os.path.join(
            os.path.dirname(__file__), '../data/user_agents.txt'))
        #####
        # Each of the classes below implements a specific URL Parser
        #####
        parsers = []
        parsers.append(
            FreeProxyParser('FreeProxy', 'http://free-proxy-list.net',
                            timeout=timeout))
        # parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout)) <--doesn't work anymore
        # parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)) <--doesn't work anymore
        parsers.append(
            PremProxyParser('PremProxy', 'https://premproxy.com',
                            timeout=timeout))
        parsers.append(
            SslProxyParser('SslProxy', 'https://www.sslproxies.org',
                           timeout=timeout))
        self.logger.debug("=== Initialized Proxy Parsers ===")
        for parser in parsers:
            self.logger.debug("\t {0}".format(parser.__str__()))
        self.logger.debug("=================================")
        self.sustain = sustain
        self.parsers = parsers
        # Copy the seed list: the += below must not mutate the caller's
        # list (or a shared default).
        self.proxy_list = list(web_proxy_list) if web_proxy_list else []
        for parser in parsers:
            try:
                size = len(self.proxy_list)
                self.proxy_list += parser.parse_proxyList()
                self.logger.debug('Added {} proxies from {}'.format(
                    len(self.proxy_list) - size, parser.id))
            except ReadTimeout:
                self.logger.warning("Proxy Parser: '{}' TimedOut!".format(
                    parser.url))
        self.logger.debug('Total proxies = ' + str(len(self.proxy_list)))
        # filtering the list of available proxies according to user preferences
        self.proxy_list = [
            p for p in self.proxy_list if protocol in p.protocols
        ]
        self.logger.debug('Filtered proxies = ' + str(len(self.proxy_list)))
        self.current_proxy = self.randomize_proxy()

    def set_logger_level(self, level):
        """Adjust the logger's verbosity at runtime."""
        self.logger.setLevel(level)

    def get_proxy_list(self):
        """Return the current (filtered) proxy pool."""
        return self.proxy_list

    def generate_random_request_headers(self):
        """Build request headers carrying a randomly selected User-Agent."""
        headers = {
            "Connection": "close",  # another way to cover tracks
            "User-Agent": self.userAgent.get_random_user_agent()
        }  # select a random user agent
        return headers

    def randomize_proxy(self):
        """Pick a random proxy from the pool and make it current.

        :raises ProxyListException: if the pool is empty.
        """
        if not self.proxy_list:
            raise ProxyListException("list is empty")
        rand_proxy = random.choice(self.proxy_list)
        while not rand_proxy:
            # Skip falsy entries that may linger in the pool.
            rand_proxy = random.choice(self.proxy_list)
        self.current_proxy = rand_proxy
        return rand_proxy

    #####
    # Proxy format:
    # http://<USERNAME>:<PASSWORD>@<IP-ADDR>:<PORT>
    #####
    def generate_proxied_request(self, url, method="GET", params=None,
                                 data=None, headers=None, req_timeout=30):
        """Issue `method` request to `url` through the current proxy.

        Treats 409/403/503 responses as proxy failures; on any proxy
        failure the offending proxy is dropped from the pool and a new
        one is selected.

        :returns: the `requests` Response on success, None if the proxy
            failed and was rotated out.
        """
        # None defaults instead of mutable {} defaults: headers is
        # updated in place below, which would otherwise corrupt the
        # shared default dict across calls.
        params = {} if params is None else params
        data = {} if data is None else data
        headers = {} if headers is None else headers
        try:
            random.shuffle(self.proxy_list)
            req_headers = dict(params.items())
            req_headers_random = dict(
                self.generate_random_request_headers().items())
            req_headers.update(req_headers_random)
            if not self.sustain:
                self.randomize_proxy()
            headers.update(req_headers)
            self.logger.debug("Using headers: {0}".format(str(headers)))
            self.logger.debug("Using proxy: {0}".format(str(
                self.current_proxy)))
            request = requests.request(
                method, url, headers=headers, data=data, params=params,
                timeout=req_timeout,
                proxies={
                    "http":
                    "http://{0}".format(self.current_proxy.get_address()),
                    "https":
                    "https://{0}".format(self.current_proxy.get_address())
                })
            # Avoid HTTP request errors
            if request.status_code == 409:
                raise ConnectionError(
                    "HTTP Response [409] - Possible Cloudflare DNS resolution error"
                )
            elif request.status_code == 403:
                raise ConnectionError(
                    "HTTP Response [403] - Permission denied error")
            elif request.status_code == 503:
                raise ConnectionError(
                    "HTTP Response [503] - Service unavailable error")
            self.logger.info('RR Status {}'.format(request.status_code))
            return request
        except ConnectionError:
            self._discard_current_proxy("Proxy unreachable")
        except ReadTimeout:
            self._discard_current_proxy("Read timed out")
        except ChunkedEncodingError:
            self._discard_current_proxy("Wrong server chunked encoding")
        except TooManyRedirects:
            self._discard_current_proxy("Too many redirects")

    def _discard_current_proxy(self, reason):
        """Drop the failing current proxy from the pool, log the removal,
        and select a replacement. Extracted from four copy-pasted
        exception handlers; the emitted log line is unchanged."""
        try:
            self.proxy_list.remove(self.current_proxy)
        except ValueError:
            # Already removed (e.g. by a concurrent failure); ignore.
            pass
        self.logger.debug(
            "{0} - Removed Straggling proxy: {1} PL Size = {2}".format(
                reason, self.current_proxy, len(self.proxy_list)))
        self.randomize_proxy()
class RequestProxy:
    """Rotating-proxy HTTP client.

    Gathers proxies from several public proxy lists at construction time
    and issues requests through a randomly chosen proxy with randomized
    headers.
    """

    def __init__(self, web_proxy_list=None, sustain=False, timeout=5):
        """
        :param web_proxy_list: optional seed list of proxies. Default is
            None (start empty); the list is copied so neither the
            caller's list nor a shared default is ever mutated (fixes
            the mutable-default-argument bug).
        :param sustain: when True, keep the same proxy across requests.
        :param timeout: per-parser fetch timeout in seconds.
        """
        self.userAgent = UserAgentManager()
        self.logger = logging.getLogger()
        self.logger.addHandler(handler)
        self.logger.setLevel(0)
        #####
        # Each of the classes below implements a specific URL Parser
        #####
        parsers = []
        parsers.append(
            FreeProxyParser('http://free-proxy-list.net', timeout=timeout))
        parsers.append(
            ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0,
                             timeout=timeout))
        parsers.append(
            RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout))
        parsers.append(
            SamairProxyParser('http://samair.ru/proxy/time-01.htm',
                              timeout=timeout))
        self.logger.debug("=== Initialized Proxy Parsers ===")
        for parser in parsers:
            self.logger.debug("\t {0}".format(parser.__str__()))
        self.logger.debug("=================================")
        self.sustain = sustain
        self.parsers = parsers
        # Copy the seed list: the += below must not mutate the caller's
        # list (or a shared default).
        self.proxy_list = list(web_proxy_list) if web_proxy_list else []
        for parser in parsers:
            try:
                self.proxy_list += parser.parse_proxyList()
            except ReadTimeout:
                # logger.warn is a deprecated alias; use warning().
                self.logger.warning("Proxy Parser: '{}' TimedOut!".format(
                    parser.url))
        self.current_proxy = self.randomize_proxy()

    def set_logger_level(self, level):
        """Adjust the logger's verbosity at runtime."""
        self.logger.setLevel(level)

    def get_proxy_list(self):
        """Return the current proxy pool."""
        return self.proxy_list

    def current_proxy_ip(self):
        """Return the current proxy as a string."""
        return str(self.current_proxy)

    def generate_random_request_headers(self):
        """Build request headers carrying a randomly selected User-Agent."""
        headers = {
            "Connection": "close",  # another way to cover tracks
            "User-Agent": self.userAgent.get_random_user_agent()
        }  # select a random user agent
        return headers

    def randomize_proxy(self):
        """Pick a random proxy from the pool and make it current.

        :raises ProxyListException: if the pool is empty.
        """
        if not self.proxy_list:
            raise ProxyListException("list is empty")
        rand_proxy = random.choice(self.proxy_list)
        while not rand_proxy:
            # Skip falsy entries that may linger in the pool.
            rand_proxy = random.choice(self.proxy_list)
        self.current_proxy = rand_proxy
        return rand_proxy

    #####
    # Proxy format:
    # http://<USERNAME>:<PASSWORD>@<IP-ADDR>:<PORT>
    #####
    def generate_proxied_request(self, url, method="GET", params=None,
                                 data=None, headers=None, req_timeout=30):
        """Issue `method` request to `url` through the current proxy.

        Treats 409/403/503 responses as proxy failures; on any proxy
        failure the offending proxy is dropped from the pool and a new
        one is selected.

        :returns: the `requests` Response on success, None if the proxy
            failed and was rotated out.
        """
        # None defaults instead of mutable {} defaults: headers is
        # updated in place below, which would otherwise corrupt the
        # shared default dict across calls.
        params = {} if params is None else params
        data = {} if data is None else data
        headers = {} if headers is None else headers
        try:
            random.shuffle(self.proxy_list)
            req_headers = dict(params.items())
            req_headers_random = dict(
                self.generate_random_request_headers().items())
            req_headers.update(req_headers_random)
            if not self.sustain:
                self.randomize_proxy()
            headers.update(req_headers)
            self.logger.debug("Using proxy: {0}".format(str(
                self.current_proxy)))
            request = requests.request(method,
                                       url,
                                       proxies={"http": self.current_proxy},
                                       headers=headers,
                                       data=data,
                                       params=params,
                                       timeout=req_timeout)
            # Avoid HTTP request errors
            if request.status_code == 409:
                raise ConnectionError(
                    "HTTP Response [409] - Possible Cloudflare DNS resolution error"
                )
            elif request.status_code == 403:
                raise ConnectionError(
                    "HTTP Response [403] - Permission denied error")
            elif request.status_code == 503:
                raise ConnectionError(
                    "HTTP Response [503] - Service unavailable error")
            # Log instead of print(): consistent with the rest of the
            # class, and respects the configured log level.
            self.logger.info('RR Status {}'.format(request.status_code))
            return request
        except ConnectionError:
            self._discard_current_proxy("Proxy unreachable")
        except ReadTimeout:
            self._discard_current_proxy("Read timed out")
        except ChunkedEncodingError:
            self._discard_current_proxy("Wrong server chunked encoding")

    def _discard_current_proxy(self, reason):
        """Drop the failing current proxy from the pool, log the removal,
        and select a replacement. Extracted from three copy-pasted
        exception handlers; the emitted log line is unchanged."""
        try:
            self.proxy_list.remove(self.current_proxy)
        except ValueError:
            # Already removed (e.g. by a concurrent failure); ignore.
            pass
        self.logger.debug(
            "{0} - Removed Straggling proxy: {1} PL Size = {2}".format(
                reason, self.current_proxy, len(self.proxy_list)))
        self.randomize_proxy()