class TestBaseProxyParsers(unittest.TestCase):
    """Tests for ``UserAgentManager`` with and without an explicit agents file.

    ``uafile`` is constructed from the ``user_agents.txt`` data file and
    ``uafake`` with no arguments. The assertions below expect the
    argument-less manager to return ``None`` from its length/first/last
    accessors.
    """

    def setUp(self):
        """Build one file-backed manager and one default manager per test."""
        agentsfile = os.path.join(
            os.path.dirname(__file__),
            '../http_request_randomizer/requests/data/user_agents.txt')
        self.uafile = UserAgentManager(file=agentsfile)
        self.uafake = UserAgentManager()

    def test_agent_size(self):
        """The file-backed manager reports at least 899 agents."""
        # assertGreaterEqual prints the actual length when the check fails.
        self.assertGreaterEqual(self.uafile.get_len_user_agent(), 899)
        self.assertIsNone(self.uafake.get_len_user_agent())

    def test_fist_user_agent(self):
        """The first agent of the file-backed manager matches the known value."""
        expected = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0"
        self.assertEqual(self.uafile.get_first_user_agent(), expected)
        self.assertIsNone(self.uafake.get_first_user_agent())

    def test_last_user_agent(self):
        """The last agent of the file-backed manager matches the known value."""
        expected = "Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.2.15 Version/10.0"
        self.assertEqual(self.uafile.get_last_user_agent(), expected)
        self.assertIsNone(self.uafake.get_last_user_agent())

    def test_random_user_agent(self):
        """Random draws are not identical on every one of 100 paired attempts."""
        attempts = 100
        # A single pair of draws may coincide by chance, so count how many of
        # the pairs collide and only fail when every pair did.
        collisions = sum(
            1 for _ in range(attempts)
            if self.uafile.get_random_user_agent() ==
            self.uafile.get_random_user_agent())
        self.assertLess(collisions, attempts)
示例#2
0
class TestBaseProxyParsers(unittest.TestCase):
    """Tests for a ``UserAgentManager`` constructed with no arguments."""

    def setUp(self):
        """Create a fresh default manager for every test."""
        self.ua = UserAgentManager()

    def test_agent_size(self):
        """The manager reports at least 899 agents."""
        # assertGreaterEqual prints the actual length when the check fails.
        self.assertGreaterEqual(self.ua.get_len_user_agent(), 899)

    def test_fist_user_agent(self):
        """The first agent matches the known value."""
        expected = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0"
        # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
        self.assertEqual(self.ua.get_first_user_agent(), expected)

    def test_last_user_agent(self):
        """The last agent matches the known value."""
        expected = "Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.2.15 Version/10.0"
        self.assertEqual(self.ua.get_last_user_agent(), expected)

    def test_random_user_agent(self):
        """Random draws are not identical on every one of 100 paired attempts."""
        attempts = 100
        # Two independent random draws can legitimately be equal, so a single
        # comparison is flaky. Only fail when every pair of draws collided.
        collisions = sum(
            1 for _ in range(attempts)
            if self.ua.get_random_user_agent() ==
            self.ua.get_random_user_agent())
        self.assertLess(collisions, attempts)
示例#3
0
class RequestProxy:
    """Issue HTTP requests through a randomly selected proxy.

    The proxy pool is built at construction time from an optional
    caller-supplied list plus whatever the configured proxy-list parsers
    return, then filtered by protocol. Proxies that fail during a request are
    removed from the pool.
    """

    def __init__(self,
                 web_proxy_list=None,
                 sustain=False,
                 timeout=5,
                 protocol=Protocol.HTTP,
                 log_level=0):
        """Build the proxy pool and pick an initial proxy.

        Args:
            web_proxy_list: Optional iterable of proxies to seed the pool
                with. It is copied, so the caller's list is never modified.
            sustain: When true, ``generate_proxied_request`` keeps using the
                current proxy instead of picking a new one per request.
            timeout: Timeout handed to every proxy-list parser.
            protocol: Only proxies whose ``protocols`` contain this value are
                kept in the pool.
            log_level: Level applied to the logger. ``logging.getLogger()``
                without a name is the root logger, so this affects it.

        Raises:
            ProxyListException: If no usable proxy remains after filtering.
        """
        self.logger = logging.getLogger()
        self.logger.addHandler(handler)
        self.logger.setLevel(log_level)
        self.userAgent = UserAgentManager(file=os.path.join(
            os.path.dirname(__file__), '../data/user_agents.txt'))

        #####
        # Each of the classes below implements a specific URL Parser
        #####
        # ProxyForEuParser ('http://proxyfor.eu/geo.php') and
        # RebroWeeblyParser ('http://rebro.weebly.com') used to be registered
        # here; they were disabled because those sources no longer work.
        parsers = [
            FreeProxyParser('FreeProxy',
                            'http://free-proxy-list.net',
                            timeout=timeout),
            PremProxyParser('PremProxy',
                            'https://premproxy.com',
                            timeout=timeout),
            SslProxyParser('SslProxy',
                           'https://www.sslproxies.org',
                           timeout=timeout),
        ]

        self.logger.debug("=== Initialized Proxy Parsers ===")
        for parser in parsers:
            self.logger.debug("\t %s", parser)
        self.logger.debug("=================================")

        self.sustain = sustain
        self.parsers = parsers
        # Copy the seed list: a shared default or the caller's own list must
        # not grow every time an instance is created.
        self.proxy_list = list(web_proxy_list) if web_proxy_list is not None else []
        for parser in parsers:
            try:
                fetched = list(parser.parse_proxyList())
            except ReadTimeout:
                self.logger.warning("Proxy Parser: '%s' TimedOut!", parser.url)
            else:
                self.proxy_list.extend(fetched)
                self.logger.debug('Added %s proxies from %s', len(fetched),
                                  parser.id)
        self.logger.debug('Total proxies = %s', len(self.proxy_list))
        # filtering the list of available proxies according to user preferences
        self.proxy_list = [
            p for p in self.proxy_list if protocol in p.protocols
        ]
        self.logger.debug('Filtered proxies = %s', len(self.proxy_list))
        self.current_proxy = self.randomize_proxy()

    def set_logger_level(self, level):
        """Set the level of the logger used by this instance."""
        self.logger.setLevel(level)

    def get_proxy_list(self):
        """Return the live proxy pool (the internal list, not a copy)."""
        return self.proxy_list

    def generate_random_request_headers(self):
        """Return request headers carrying a randomly chosen User-Agent."""
        headers = {
            "Connection": "close",  # another way to cover tracks
            "User-Agent": self.userAgent.get_random_user_agent()
        }  # select a random user agent
        return headers

    def randomize_proxy(self):
        """Pick a random truthy proxy, store it as current, and return it.

        Raises:
            ProxyListException: If the pool holds no usable proxy.
        """
        # Choosing among truthy entries only yields the same distribution as
        # retrying on falsy picks, but cannot spin forever when every entry
        # is falsy.
        candidates = [proxy for proxy in self.proxy_list if proxy]
        if not candidates:
            raise ProxyListException("list is empty")
        rand_proxy = random.choice(candidates)
        self.current_proxy = rand_proxy
        return rand_proxy

    def _discard_current_proxy(self, reason):
        """Drop the current proxy from the pool, log why, and pick another."""
        try:
            self.proxy_list.remove(self.current_proxy)
        except ValueError:
            # The proxy is already gone from the pool; nothing to remove.
            pass
        self.logger.debug(
            "%s - Removed Straggling proxy: %s PL Size = %s", reason,
            self.current_proxy, len(self.proxy_list))
        self.randomize_proxy()

    #####
    # Proxy format:
    # http://<USERNAME>:<PASSWORD>@<IP-ADDR>:<PORT>
    #####
    def generate_proxied_request(self,
                                 url,
                                 method="GET",
                                 params=None,
                                 data=None,
                                 headers=None,
                                 req_timeout=30):
        """Send one request through the current (or a freshly chosen) proxy.

        Args:
            url: Target URL.
            method: HTTP method name passed to ``requests.request``.
            params: Optional query parameters (defaults to an empty dict).
            data: Optional request body (defaults to an empty dict).
            headers: Optional extra headers. The mapping is copied, so the
                caller's dict is left untouched.
            req_timeout: Timeout passed to ``requests.request``.

        Returns:
            The response object on success. ``None`` when the proxy failed
            (connection error, read timeout, chunked-encoding error, too many
            redirects, or an HTTP 403/409/503 status); in that case the
            failing proxy has been removed and a new one selected.

        Raises:
            ProxyListException: If the pool runs out of usable proxies.
        """
        params = {} if params is None else params
        data = {} if data is None else data
        # Fresh dict per call so neither a shared default nor the caller's
        # mapping accumulates headers from earlier requests.
        request_headers = dict(headers) if headers is not None else {}

        random.shuffle(self.proxy_list)

        # NOTE(review): the query params are also merged into the headers;
        # kept as-is because callers may rely on it - confirm it is intended.
        request_headers.update(params)
        random_headers = self.generate_random_request_headers()

        if not self.sustain:
            self.randomize_proxy()

        # Random headers win over both params and caller-supplied headers.
        request_headers.update(random_headers)

        self.logger.debug("Using headers: %s", request_headers)
        self.logger.debug("Using proxy: %s", self.current_proxy)
        try:
            request = requests.request(
                method,
                url,
                headers=request_headers,
                data=data,
                params=params,
                timeout=req_timeout,
                proxies={
                    "http":
                    "http://{0}".format(self.current_proxy.get_address()),
                    "https":
                    "https://{0}".format(self.current_proxy.get_address())
                })
            # Avoid HTTP request errors
            if request.status_code == 409:
                raise ConnectionError(
                    "HTTP Response [409] - Possible Cloudflare DNS resolution error"
                )
            elif request.status_code == 403:
                raise ConnectionError(
                    "HTTP Response [403] - Permission denied error")
            elif request.status_code == 503:
                raise ConnectionError(
                    "HTTP Response [503] - Service unavailable error")
            self.logger.info('RR Status %s', request.status_code)
            return request
        except ConnectionError:
            self._discard_current_proxy("Proxy unreachable")
        except ReadTimeout:
            self._discard_current_proxy("Read timed out")
        except ChunkedEncodingError:
            self._discard_current_proxy("Wrong server chunked encoding")
        except TooManyRedirects:
            self._discard_current_proxy("Too many redirects")
        # Reached only after a proxy failure was handled above.
        return None
示例#4
0
class RequestProxy:
    """Issue HTTP requests through a randomly selected proxy.

    The proxy pool is built at construction time from an optional
    caller-supplied list plus whatever the configured proxy-list parsers
    return. Proxies that fail during a request are removed from the pool.
    """

    def __init__(self, web_proxy_list=None, sustain=False, timeout=5):
        """Build the proxy pool and pick an initial proxy.

        Args:
            web_proxy_list: Optional iterable of proxies to seed the pool
                with. It is copied, so the caller's list is never modified.
            sustain: When true, ``generate_proxied_request`` keeps using the
                current proxy instead of picking a new one per request.
            timeout: Timeout handed to every proxy-list parser.

        Raises:
            ProxyListException: If no usable proxy could be collected.
        """
        self.userAgent = UserAgentManager()
        # logging.getLogger() without a name is the root logger.
        self.logger = logging.getLogger()
        self.logger.addHandler(handler)
        self.logger.setLevel(0)

        #####
        # Each of the classes below implements a specific URL Parser
        #####
        parsers = [
            FreeProxyParser('http://free-proxy-list.net', timeout=timeout),
            ProxyForEuParser('http://proxyfor.eu/geo.php',
                             1.0,
                             timeout=timeout),
            RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout),
            SamairProxyParser('http://samair.ru/proxy/time-01.htm',
                              timeout=timeout),
        ]

        self.logger.debug("=== Initialized Proxy Parsers ===")
        for parser in parsers:
            self.logger.debug("\t %s", parser)
        self.logger.debug("=================================")

        self.sustain = sustain
        self.parsers = parsers
        # Copy the seed list: a shared default or the caller's own list must
        # not grow every time an instance is created.
        self.proxy_list = list(web_proxy_list) if web_proxy_list is not None else []
        for parser in parsers:
            try:
                fetched = list(parser.parse_proxyList())
            except ReadTimeout:
                # Logger.warn is a deprecated alias; use warning().
                self.logger.warning("Proxy Parser: '%s' TimedOut!", parser.url)
            else:
                self.proxy_list.extend(fetched)
        self.current_proxy = self.randomize_proxy()

    def set_logger_level(self, level):
        """Set the level of the logger used by this instance."""
        self.logger.setLevel(level)

    def get_proxy_list(self):
        """Return the live proxy pool (the internal list, not a copy)."""
        return self.proxy_list

    def current_proxy_ip(self):
        """Return the string form of the currently selected proxy."""
        return str(self.current_proxy)

    def generate_random_request_headers(self):
        """Return request headers carrying a randomly chosen User-Agent."""
        headers = {
            "Connection": "close",  # another way to cover tracks
            "User-Agent": self.userAgent.get_random_user_agent()
        }  # select a random user agent
        return headers

    def randomize_proxy(self):
        """Pick a random truthy proxy, store it as current, and return it.

        Raises:
            ProxyListException: If the pool holds no usable proxy.
        """
        # Choosing among truthy entries only yields the same distribution as
        # retrying on falsy picks, but cannot spin forever when every entry
        # is falsy.
        candidates = [proxy for proxy in self.proxy_list if proxy]
        if not candidates:
            raise ProxyListException("list is empty")
        rand_proxy = random.choice(candidates)
        self.current_proxy = rand_proxy
        return rand_proxy

    def _discard_current_proxy(self, reason):
        """Drop the current proxy from the pool, log why, and pick another."""
        try:
            self.proxy_list.remove(self.current_proxy)
        except ValueError:
            # The proxy is already gone from the pool; nothing to remove.
            pass
        self.logger.debug(
            "%s - Removed Straggling proxy: %s PL Size = %s", reason,
            self.current_proxy, len(self.proxy_list))
        self.randomize_proxy()

    #####
    # Proxy format:
    # http://<USERNAME>:<PASSWORD>@<IP-ADDR>:<PORT>
    #####
    def generate_proxied_request(self,
                                 url,
                                 method="GET",
                                 params=None,
                                 data=None,
                                 headers=None,
                                 req_timeout=30):
        """Send one request through the current (or a freshly chosen) proxy.

        Args:
            url: Target URL.
            method: HTTP method name passed to ``requests.request``.
            params: Optional query parameters (defaults to an empty dict).
            data: Optional request body (defaults to an empty dict).
            headers: Optional extra headers. The mapping is copied, so the
                caller's dict is left untouched.
            req_timeout: Timeout passed to ``requests.request``.

        Returns:
            The response object on success. ``None`` (implicitly) when the
            proxy failed with a connection error, read timeout,
            chunked-encoding error, or an HTTP 403/409/503 status; in that
            case the failing proxy has been removed and a new one selected.

        Raises:
            ProxyListException: If the pool runs out of usable proxies.
        """
        params = {} if params is None else params
        data = {} if data is None else data
        # Fresh dict per call so neither a shared default nor the caller's
        # mapping accumulates headers from earlier requests.
        request_headers = dict(headers) if headers is not None else {}

        random.shuffle(self.proxy_list)

        # NOTE(review): the query params are also merged into the headers;
        # kept as-is because callers may rely on it - confirm it is intended.
        request_headers.update(params)
        random_headers = self.generate_random_request_headers()

        if not self.sustain:
            self.randomize_proxy()

        # Random headers win over both params and caller-supplied headers.
        request_headers.update(random_headers)

        self.logger.debug("Using proxy: %s", self.current_proxy)
        try:
            request = requests.request(method,
                                       url,
                                       proxies={"http": self.current_proxy},
                                       headers=request_headers,
                                       data=data,
                                       params=params,
                                       timeout=req_timeout)
            # Avoid HTTP request errors
            if request.status_code == 409:
                raise ConnectionError(
                    "HTTP Response [409] - Possible Cloudflare DNS resolution error"
                )
            elif request.status_code == 403:
                raise ConnectionError(
                    "HTTP Response [403] - Permission denied error")
            elif request.status_code == 503:
                raise ConnectionError(
                    "HTTP Response [503] - Service unavailable error")
            # Report through the logger like every other message of this
            # class instead of printing to stdout.
            self.logger.info('RR Status %s', request.status_code)
            return request
        except ConnectionError:
            self._discard_current_proxy("Proxy unreachable")
        except ReadTimeout:
            self._discard_current_proxy("Read timed out")
        except ChunkedEncodingError:
            self._discard_current_proxy("Wrong server chunked encoding")