Exemplo n.º 1
0
def validate(item, callback):
    """Probe a proxy and report it through ``callback`` when it works.

    ``item`` is a ``"<type>:<ip>:<port>"`` string whose type is ``http``,
    ``https`` or ``socks5``.  A randomly chosen probe URL is requested
    through the proxy and ``callback(item)`` is invoked only when the
    response's ``ok`` attribute is truthy.

    Malformed items, unsupported proxy types and any exception raised while
    probing (or by ``callback`` itself) are logged and otherwise ignored.

    :param item: proxy descriptor, e.g. ``"http:1.2.3.4:8080"``.
    :param callback: callable invoked with ``item`` on a successful probe.
    """
    http_url_pool = [
        "http://www.baidu.com",
        "http://cn.bing.com",
        "http://www.sohu.com",
        "http://www.sina.com.cn"
    ]
    https_url_pool = [
        "https://www.baidu.com",
        "https://www.zhihu.com"
    ]

    try:
        _type, _ip, _port = item.split(':')
    except ValueError:
        # Wrong number of ':'-separated fields; nothing sensible to probe.
        logging.error("Validate on %s Error: %s"
                      % (item, "expected <type>:<ip>:<port>"))
        return

    address = "%s:%s" % (_ip, _port)

    if _type == "http":
        url = random.choice(http_url_pool)
        proxies = {"http": "http://" + address}
    elif _type == "https":
        url = random.choice(https_url_pool)
        # The proxy URL scheme stays "http" here, as in the original mapping.
        proxies = {"https": "http://" + address}
    elif _type == "socks5":
        url = random.choice(http_url_pool)
        proxies = {
            "http": "socks5://" + address,
            "https": "socks5://" + address
        }
    else:
        # Without this branch ``url``/``proxies`` would be unbound below.
        logging.error("Validate on %s Error: %s"
                      % (item, "unsupported proxy type %r" % _type))
        return

    try:
        req = utils.send_http('get', url, timeout=config.timeout * 3,
                              proxies=proxies, allow_redirects=False)
        if req.ok:
            callback(item)
    except Exception as e:
        logging.error("Validate on %s Error: %s" % (item, str(e)))
Exemplo n.º 2
0
 def run(self):
     """Scrape two xicidaili.com "nn" pages into ``self.items``.

     For every child of the ``ip_list`` table except the first, the IP,
     port and proxy type are read from the 2nd, 3rd and 6th ``td`` cells.
     The type is stripped and lower-cased before being checked against
     ``PROXY_TYPES``; matching rows are appended as
     ``<type><PROXY_SEP><ip>:<port>``.  Rows missing any of the three cells
     are skipped instead of aborting the scrape.
     """
     urls = ["http://www.xicidaili.com/nn/1",
             "http://www.xicidaili.com/nn/2"]
     for url in urls:
         req = send_http('get', url)
         tree = getHtmlTree(req.content)
         # The first child is skipped -- presumably the table's header row.
         rows = tree.xpath('.//table[@id="ip_list"]/*')[1:]
         for row in rows:
             ip = row.xpath('.//td[2]/text()')
             port = row.xpath('.//td[3]/text()')
             raw_type = row.xpath('.//td[6]/text()')
             if not (ip and port and raw_type):
                 continue
             # Normalise the same way the other scrapers do before matching.
             ptype = raw_type[0].strip().lower()
             if ptype in PROXY_TYPES:
                 self.items.append(
                     ptype + PROXY_SEP + ip[0] + ':' + str(port[0]))
Exemplo n.º 3
0
    def numocr(self, url):
        """Return the port number shown in the image at ``url``, read by OCR.

        Results are stored in ``self.cache`` under the text that follows the
        last ``port=`` in ``url``; a cached value is returned without
        fetching the image again.

        Image recognition is not guaranteed to be 100% accurate.
        In testing, 3128 -> 3328 and 137 -> 337.
        """
        cache_key = url.rsplit('port=', 1)[-1]

        if cache_key not in self.cache:
            response = send_http('get', url)
            picture = Image.open(StringIO.StringIO(response.content))
            recognised = pytesseract.image_to_string(
                picture, lang='osd', config="digits")
            self.cache[cache_key] = recognised
            return recognised

        return self.cache.get(cache_key)
Exemplo n.º 4
0
 def run(self):
     """Scrape the free proxy lists on proxy.mimvp.com into ``self.items``.

     The ``td`` cells found under ``tbody`` are consumed in groups of ten,
     one group per proxy.  Within a group, cell 1 holds the IP as text,
     cell 2 holds an image of the port (decoded with ``self.numocr``) and
     cell 3 holds one or more ``/``-separated proxy types.  One
     ``<type><PROXY_SEP><ip>:<port>`` entry is appended for every type
     listed in ``PROXY_TYPES``.  Groups missing any of those pieces are
     skipped instead of aborting the whole scrape.
     """
     urls = ["http://proxy.mimvp.com/free.php?proxy=in_hp&sort=&page=1",
             "http://proxy.mimvp.com/free.php?proxy=out_tp&sort=&page=1",
             "http://proxy.mimvp.com/free.php?proxy=out_hp&sort=&page=1",
             "http://proxy.mimvp.com/free.php?proxy=in_socks&sort=&page=1",
             "http://proxy.mimvp.com/free.php?proxy=out_socks&sort=&page=1"]
     for url in urls:
         req = send_http('get', url)
         tree = getHtmlTree(req.content)
         cells = tree.xpath('.//tbody/td')
         for start in range(0, len(cells), 10):
             row = cells[start:start + 10]
             # A truncated trailing group cannot supply cells 1-3.
             if len(row) < 4:
                 continue
             ip = row[1].text
             raw_types = row[3].text
             # Cell 2 must contain a child element carrying the port image.
             if not ip or not raw_types or len(row[2]) == 0:
                 continue
             src = row[2][0].attrib.get('src')
             if not src:
                 continue
             ptypes = [part.strip().lower() for part in raw_types.split('/')]
             ptypes = [ptype for ptype in ptypes if ptype in PROXY_TYPES]
             if not ptypes:
                 # Avoid the image download and OCR for unsupported types.
                 continue
             port = self.numocr("http://proxy.mimvp.com/" + src)
             for ptype in ptypes:
                 self.items.append(ptype + PROXY_SEP + ip + ':' + str(port))
Exemplo n.º 5
0
 def run(self):
     """Scrape kuaidaili.com proxy list pages 1-10 into ``self.items``.

     Each ``tbody/tr`` row supplies the IP, the port and a ``,``-separated
     list of proxy types, located through the cells' ``data-title``
     attribute.  One ``<type><PROXY_SEP><ip>:<port>`` entry is appended for
     every type listed in ``PROXY_TYPES``.  Rows missing any of the three
     cells are skipped instead of aborting the scrape.
     """
     urls = [
         "http://www.kuaidaili.com/proxylist/%d/" % page
         for page in range(1, 11)
     ]
     for url in urls:
         req = send_http('get', url)
         tree = getHtmlTree(req.content)
         for row in tree.xpath('.//tbody/tr'):
             ip = row.xpath('.//td[@data-title="IP"]/text()')
             port = row.xpath('.//td[@data-title="PORT"]/text()')
             # The data-title of the type column is the Chinese word "type".
             raw_types = row.xpath('.//td[@data-title="%s"]/text()' %
                                   u"类型")
             if not (ip and port and raw_types):
                 continue
             for part in raw_types[0].split(','):
                 ptype = part.strip().lower()
                 if ptype in PROXY_TYPES:
                     self.items.append(ptype + PROXY_SEP + ip[0] + ':' +
                                       str(port[0]))
Exemplo n.º 6
0
 def run(self):
     """Scrape the goubanjia.com free proxy pages into ``self.items``.

     For every ``td`` with class ``ip``, the text of all descendant
     elements whose ``style`` does not contain ``"none"`` is collected: the
     last fragment is taken as the port and the preceding fragments are
     joined to form the IP.  The proxy types come from the link in the
     third cell of the same row (``,``-separated).  One
     ``<type><PROXY_SEP><ip>:<port>`` entry is appended for every type
     listed in ``PROXY_TYPES``.  Cells that do not yield both an IP and a
     port, or rows without a type, are skipped instead of aborting the
     scrape.
     """
     urls = [
         "http://www.goubanjia.com/free/gngn/index.shtml",
         "http://www.goubanjia.com/free/gnpt/index.shtml",
         "http://www.goubanjia.com/free/gwgn/index.shtml",
         "http://www.goubanjia.com/free/gwpt/index.shtml"
     ]
     for url in urls:
         req = send_http('get', url)
         tree = getHtmlTree(req.content)
         for cell in tree.xpath('.//td[@class="ip"]'):
             # Elements styled with "none" are excluded -- presumably decoy
             # fragments hidden from the rendered page.
             fragments = cell.xpath(
                 './/*[not(contains(@style, "none"))]/text()')
             raw_types = cell.xpath('../td[3]/a/text()')
             # Need at least one IP fragment plus the trailing port.
             if len(fragments) < 2 or not raw_types:
                 continue
             ip, port = ''.join(fragments[:-1]), fragments[-1]
             for part in raw_types[0].split(','):
                 ptype = part.strip().lower()
                 if ptype in PROXY_TYPES:
                     self.items.append(ptype + PROXY_SEP + ip + ':' +
                                       str(port))