Exemplo n.º 1
0
 def run(self) -> None:
     """Crawl every URL in ``self.urls`` and store the proxies found.

     A page whose response is not ``ok`` is logged and skipped (the
     trailing ``gevent.sleep`` is not reached for it). On a fetched page,
     each ``<ul class="l2">`` element is treated as one proxy entry;
     entries yielding fewer than six text nodes are ignored. Any exception
     raised while handling one entry is logged, its traceback is printed,
     and processing moves on to the next entry.
     """
     for url in self.urls:
         response = self.http.get(url)
         if not response.ok:
             self.logger.error(f'request failed {response.status_code}')
             continue

         document = etree.HTML(response.content)
         for entry in document.xpath('.//ul[@class="l2"]'):
             try:
                 texts = entry.xpath('.//li//text()')
                 if len(texts) < 6:
                     continue
                 ip, protocol = texts[0], texts[3]
                 # The second class token of the first <li> is the value
                 # handed to decode_port.
                 first_li_class = entry.xpath('.//li[1]/@class')[0]
                 encoded_port = first_li_class.split(' ')[1]
                 port = decode_port(encoded_port)
                 proxy_ip = ProxyIP(ip, port, protocol)
                 if proxy_ip.ok:
                     self.logger.info(f'got raw proxy_ip {str(proxy_ip)}')
                     db.add_raw(str(proxy_ip))
             except Exception:
                 self.logger.error(f'error occurred when crawl {url}')
                 traceback.print_exc()

         # Pause before requesting the next page.
         gevent.sleep(10)
Exemplo n.º 2
0
 def run(self) -> None:
     """Crawl every URL in ``self.urls`` and store the proxies found.

     A page whose response is not ``ok`` is logged and skipped (the
     trailing ``gevent.sleep`` is not reached for it). On a fetched page,
     every ``<tr>`` of the ``table table-hover`` table is inspected; rows
     with fewer than five ``<td>`` cells are ignored. Any exception raised
     while handling one row is logged, its traceback is printed, and
     processing moves on to the next row.
     """
     for url in self.urls:
         response = self.http.get(url)
         if not response.ok:
             self.logger.error(f'request failed {response.status_code}')
             continue

         document = etree.HTML(response.content)
         rows = document.xpath('..//table[@class="table table-hover"]//tr')
         # Text of the elements inside the "ip" cell, leaving out elements
         # styled "display: none" and the element carrying the port class.
         xpath_str = """./td[@class="ip"]//*[not(contains(@style, 'display: none'))
                                                     and not(contains(@style, 'display:none'))
                                                     and not(contains(@class, 'port'))
                                                     ]/text()
                                                     """
         for row in rows:
             try:
                 cells = row.xpath('./td')
                 if len(cells) < 5:
                     continue
                 protocol = cells[2].xpath('.//text()')[0]
                 ip_fragments = row.xpath(xpath_str)
                 ip = ''.join(ip_fragments)
                 # The second class token of the port <span> is the value
                 # handed to decode_port.
                 port_class = row.xpath(
                     './/span[contains(@class, "port")]/@class')[0]
                 encoded_port = port_class.split(' ')[1]
                 port = decode_port(encoded_port)
                 proxy_ip = ProxyIP(ip, port, protocol)
                 if proxy_ip.ok:
                     self.logger.info(f'got raw proxy_ip {str(proxy_ip)}')
                     db.add_raw(str(proxy_ip))
             except Exception:
                 self.logger.error(f'error occurred when crawl {url}')
                 traceback.print_exc()

         # Pause before requesting the next page.
         gevent.sleep(10)
Exemplo n.º 3
0
 def run(self) -> None:
     """Crawl every URL in ``self.urls`` and store the proxies found.

     A page whose response is not ``ok`` is logged and skipped (the
     trailing ``gevent.sleep`` is not reached for it). On a fetched page,
     each row of the ``ip_list`` table that has at least one ``<td>`` child
     is read once: the IP, port and protocol are taken from the row's text
     nodes at positions 2, 4 and 12. Rows with fewer than 13 text nodes are
     skipped instead of raising ``IndexError``.
     """
     for url in self.urls:
         response = self.http.get(url)
         if not response.ok:
             self.logger.error(f'request failed {response.status_code}')
             continue
         html = etree.HTML(response.content)
         # Select each data row exactly once. Selecting the <td> cells and
         # stepping back to the parent would process the same row once per
         # cell and store the same proxy repeatedly.
         rows = html.xpath('//table[@id="ip_list"]//tr[td]')
         for row in rows:
             texts = row.xpath('.//text()')
             # protocol is read from index 12, so at least 13 text nodes
             # are required; a short row must not abort the whole crawl.
             if len(texts) < 13:
                 continue
             ip = texts[2]
             port = texts[4]
             protocol = texts[12]
             proxy_ip = ProxyIP(ip, port, protocol)
             if proxy_ip.ok:
                 self.logger.info(f'got raw proxy_ip {str(proxy_ip)}')
                 db.add_raw(str(proxy_ip))
         gevent.sleep(10)
Exemplo n.º 4
0
 def run(self) -> None:
     """Crawl every URL in ``self.urls`` and store the proxies found.

     A page whose response is not ``ok`` is logged and skipped (the
     trailing ``gevent.sleep`` is not reached for it). On a fetched page,
     every ``<tr>`` under ``<div id="list">`` is inspected; rows whose
     ``<td>`` cells yield fewer than four direct text nodes are ignored.
     The IP, port and protocol come from text nodes 0, 1 and 3.
     """
     for url in self.urls:
         response = self.http.get(url)
         if not response.ok:
             self.logger.error(f'request failed {response.status_code}')
             continue

         document = etree.HTML(response.content)
         for row in document.xpath('.//div[@id="list"]//tr'):
             cell_texts = row.xpath('./td/text()')
             if len(cell_texts) < 4:
                 continue
             ip, port, protocol = cell_texts[0], cell_texts[1], cell_texts[3]
             proxy_ip = ProxyIP(ip, port, protocol)
             if proxy_ip.ok:
                 self.logger.info(f'got raw proxy_ip {str(proxy_ip)}')
                 db.add_raw(str(proxy_ip))

         # Pause for a randomly chosen duration before the next page.
         pause = randint(11, 23)
         gevent.sleep(pause)
Exemplo n.º 5
0
def test_ProxyIP():
    """Check ``ProxyIP(...).ok`` against the expected flag for each input."""
    expectations = [
        # protocol variations
        ({"ip": "127.0.0.1", "port": "80", "protocol": 'http'}, True),
        ({"ip": "127.0.0.1", "port": "80", "protocol": 'https'}, True),
        ({"ip": "127.0.0.1", "port": "80", "protocol": None}, False),
        ({"ip": "127.0.0.1", "port": "80", "protocol": "whatever"}, False),
        # ip variations
        ({"ip": "827.0.0.1", "port": "80", "protocol": 'https'}, False),
        ({"ip": "xxx.xx.xx.1", "port": "80", "protocol": 'https'}, False),
        ({"ip": "whatever", "port": "80", "protocol": 'https'}, False),
        ({"ip": None, "port": "80", "protocol": 'https'}, False),
        # port variations
        ({"ip": "127.0.0.1", "port": 80, "protocol": 'https'}, True),
        ({"ip": "127.0.0.1", "port": 65536, "protocol": 'https'}, False),
        ({"ip": "127.0.0.1", "port": "80", "protocol": 'https'}, True),
        ({"ip": "127.0.0.1", "port": None, "protocol": 'https'}, False),
        # combined "ip:port" argument
        ({"ip_port": "127.0.0.1:8080", "protocol": 'https'}, True),
        ({"ip_port": "whatever:8080", "protocol": 'https'}, False),
        ({"ip_port": "127.0.0.1:888888", "protocol": 'https'}, False),
        ({"ip_port": "127.0.0.1", "protocol": 'https'}, False),
    ]
    for kwargs, expected in expectations:
        assert ProxyIP(**kwargs).ok == expected