Exemplo n.º 1
0
def fetch_ips(q: Queue, validator_queue: Queue):
    """Consume provider classes from *q*, scrape each provider's URLs and
    feed every parsed proxy into *validator_queue* until interrupted."""
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            # The queue hands out provider *classes*; instantiate one here.
            provider: BaseProvider = q.get()()
            name = provider.__class__.__name__
            logger.debug('Get a provider from the provider queue: ' + name)

            for target in provider.urls():
                try:
                    page = worker.get_html(
                        target, render_js=provider.should_render_js())
                except Exception as e:
                    logger.error("worker.get_html failed: %s", e)
                    continue

                if not page:
                    continue

                candidates = provider.parse(page)
                for candidate in candidates:
                    validator_queue.put(candidate)

                logger.info(
                    ' {}: feed {} potential proxies into the validator queue'
                    .format(name, len(candidates)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            # Clean shutdown path: release the worker's resources first.
            worker.stop()
            logger.info('worker_process exited.')
            break
Exemplo n.º 2
0
class ProxylistsProvider(BaseProvider):
    """Scrapes proxy IPs from www.proxylists.net, one country at a time."""

    def __init__(self):
        self.w = Worker()
        # Match country links such as "/France_0.html". The dot must be
        # escaped, otherwise e.g. "_0xhtml" would also match.
        self.country_patten = re.compile(r'^/(.+)_0\.html$')

    def parse(self, html: HTML) -> [ProxyIP]:
        """Extract ProxyIP entries from a country listing page.

        :param html: rendered requests-html HTML of a listing page.
        :return: list of ProxyIP objects (possibly empty).
        """
        ip_list: [ProxyIP] = []

        for tr in html.find('table table tr'):
            ip_element = tr.find('td:nth-of-type(1)', first=True)
            port_element = tr.find('td:nth-of-type(2)', first=True)
            if not (ip_element and port_element):
                continue
            ip_match = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',
                                 ip_element.text)
            port_match = re.search(r'\d{2,5}', port_element.text)
            # Header or malformed rows carry no ip/port text; skip them
            # instead of crashing on `.group(0)` of a None match.
            if not (ip_match and port_match):
                continue
            ip_list.append(
                ProxyIP(ip=ip_match.group(0),
                        port=port_match.group(0),
                        provider=self.__class__.__name__))

        return ip_list

    def urls(self) -> [str]:
        """Collect listing-page URLs for the first matching country link.

        NOTE(review): the `break` means only ONE country is ever fetched —
        presumably intentional sampling; confirm before removing.
        """
        ret = set([])
        country_url = 'http://www.proxylists.net/countries.html'
        country_page = self.w.get_html(country_url, False)
        for a in country_page.find('a'):
            relative_path = a.attrs['href']
            if self.country_patten.match(relative_path):
                ret.update(
                    self.gen_url_for_country(
                        self.country_patten.findall(relative_path)[0]))
                break
        return list(ret)

    def gen_url_for_country(self, country) -> [str]:
        """Return all pagination URLs for *country*, taken from the
        pager row at the bottom of its first listing page."""
        ret = []
        first_page = self.w.get_html(
            'http://www.proxylists.net/{}_0.html'.format(country), False)
        for a in first_page.find('table table tr:last-of-type a'):
            ret.append('http://www.proxylists.net/{}'.format(a.attrs['href']))
        return ret

    @staticmethod
    def should_render_js() -> bool:
        # Listing pages build the table via JavaScript.
        return True
Exemplo n.º 3
0
class ProxyListProvider(BaseProvider):
    """Provider for proxy-list.org, parsed via PyQuery."""

    def __init__(self):
        super().__init__()
        self.w = Worker()

    def parse(self, document: PyQuery) -> List[ProxyIP]:
        """Decode the base64-obfuscated Proxy('...') entries on a page."""
        if document is None:
            return []

        results: List[ProxyIP] = []

        for row in document.find('#proxy-table > div.table-wrap ul'):
            script = row.find('li.proxy script')

            # NOTE: an empty script element aborts the whole page (this
            # mirrors the original behaviour rather than skipping the row).
            if not script:
                return []

            hits = re.findall(r"Proxy\('(.+)'\)", script.text())
            if not hits:
                continue

            decoded = base64.b64decode(hits[0]).decode("utf-8")
            host = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',
                              decoded)[0]
            port = re.findall(r':(\d{2,5})', decoded)[0]
            results.append(ProxyIP(ip=host, port=port))

        return results

    def urls(self) -> List[str]:
        """Return page 1 plus every pagination link discovered on it."""
        first_url = 'http://proxy-list.org/english/index.php?p=1'
        landing = self.w.get_html(first_url, False)
        if not landing:
            return []

        pages = [first_url]
        for link in landing.find('#content div.content div.table-menu a.item'):
            pages.append(
                urllib.parse.urljoin(first_url, link.attrib['href']))
        return pages

    @staticmethod
    def should_render_js() -> bool:
        # The obfuscated entries live in static <script> tags; no JS needed.
        return False
Exemplo n.º 4
0
def fetch_ips(q: Queue, validator_queue: Queue, run_once=False):
    """Pull providers off *q*, scrape their URLs and push parsed proxies
    into *validator_queue*.

    :param q: queue of provider instances to consume.
    :param validator_queue: queue the parsed proxies are fed into.
    :param run_once: when True, exit cleanly once *q* is drained.
    """
    logger.debug('worker_process started.')
    logger.info('fetching ips...')
    worker = Worker()

    while True:
        try:
            if run_once and q.empty():
                # Raising (instead of breaking) routes shutdown through the
                # handler below, so the worker is stopped exactly once.
                raise SystemExit

            provider: BaseProvider = q.get()

            provider_name = provider.__class__.__name__

            logger.info('Get a provider from the provider queue: ' +
                        provider_name)

            for url in provider.urls():

                html = worker.get_html(url,
                                       render_js=provider.should_render_js())

                if html:
                    proxies = provider.parse(html)

                    for p in proxies:
                        validator_queue.put(p)

                    logger.info(
                        ' {}: feed {} potential proxies into the validator queue'
                        .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            worker.stop()
            break
        except pyppeteer.errors.PyppeteerError as e:
            # A broken chromium install is unrecoverable for this process;
            # stop the worker so the browser is not leaked, then exit.
            logger.error(
                'pyppeteer.errors.PyppeteerError detected: %s\n'
                'Please make sure you have installed all the dependencies '
                'for chromium correctly', e)
            worker.stop()
            break

    logger.debug('worker_process exited.')
Exemplo n.º 5
0
class ProxyListProvider(BaseProvider):
    """Provider for proxy-list.org, parsed via requests-html."""

    def __init__(self):
        self.w = Worker()

    def parse(self, html: HTML) -> [ProxyIP]:
        """Decode the base64-obfuscated Proxy('...') entries on a page.

        :param html: rendered page, or None on fetch failure.
        :return: list of ProxyIP objects (possibly empty).
        """
        ip_list: [ProxyIP] = []

        if html is None:
            return []

        for ul in html.find('#proxy-table > div.table-wrap ul'):
            script_el = ul.find('li.proxy script', first=True)
            # Ad rows / layout changes have no script tag; skip them
            # instead of raising AttributeError on `.text` of None.
            if script_el is None:
                continue
            matched = re.findall(r"Proxy\('(.+)'\)", script_el.text)
            if matched:
                encoded = matched[0]
                ip_port = base64.b64decode(encoded).decode("utf-8")
                ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip_port)
                ports = re.findall(r':(\d{2,5})', ip_port)
                # Guard against a decoded payload that isn't "ip:port".
                if not (ips and ports):
                    continue
                proxy = ProxyIP(ip=ips[0], port=ports[0],
                                provider=self.__class__.__name__)
                ip_list.append(proxy)

        return ip_list

    def urls(self) -> [str]:
        """Return page 1 plus all pagination links discovered on it."""
        ret = []
        first_url = 'http://proxy-list.org/english/index.php?p=1'
        sub = first_url[0:first_url.rfind('/')]  # http://proxy-list.org/english
        first_page = self.w.get_html(first_url, False)
        # On fetch failure return no URLs rather than crashing on `.find`.
        if first_page is None:
            return ret

        ret.append(first_url)
        for a in first_page.find('#content div.content div.table-menu a.item'):
            relative_path = a.attrs['href']
            absolute_url = sub + relative_path[relative_path.find('/'):]
            ret.append(absolute_url)
        return ret

    @staticmethod
    def should_render_js() -> bool:
        # The obfuscated entries live in static <script> tags; no JS needed.
        return False
Exemplo n.º 6
0
def fetch_ips(q: Queue, validator_queue: Queue):
    """Consume providers from *q*, scrape their URLs and feed every parsed
    proxy into *validator_queue*; runs until interrupted."""
    logger.debug('fetch_ips...')
    worker = Worker()

    while True:
        try:
            provider: BaseProvider = q.get()

            provider_name = provider.__class__.__name__

            logger.debug('Get a provider from the provider queue: ' +
                         provider_name)

            for url in provider.urls():

                html = worker.get_html(url,
                                       render_js=provider.should_render_js())

                if html:
                    proxies = provider.parse(html)

                    for p in proxies:
                        validator_queue.put(p)
                        logger.debug('Put new proxy ip into queue: {}'.format(
                            p.__str__()))

                    logger.info(
                        ' {}: feed {} potential proxies into the validator queue'
                        .format(provider_name, len(proxies)))
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            # Release the worker's browser process before exiting.
            worker.stop()
            logger.info('worker_process exited.')
            break
        except pyppeteer.errors.PyppeteerError as e:
            logger.debug(
                'pyppeteer.errors.PyppeteerError detected: {}\n'.format(e) +
                'Please make sure you have installed all the dependencies for chromium correctly'
            )
        except Exception as e:
            # Stop the old worker before replacing it, otherwise its
            # browser process is leaked on every reset.
            try:
                worker.stop()
            except Exception:
                pass
            worker = Worker()  # reset worker
            logger.warning('Unhandled exception is detected: {}'.format(e))