def validate(item, callback):
    """Check that the proxy described by ``item`` actually works.

    ``item`` has the form ``"<type>:<ip>:<port>"`` where ``<type>`` is one of
    ``http``, ``https`` or ``socks5``.  A test URL matching the proxy scheme is
    fetched through the proxy; if the response is OK, ``callback(item)`` is
    invoked to mark the proxy as alive.  Any request failure is logged and
    swallowed (best-effort validation).
    """
    http_url_pool = [
        "http://www.baidu.com",
        "http://cn.bing.com",
        "http://www.sohu.com",
        "http://www.sina.com.cn"
    ]
    https_url_pool = [
        "https://www.baidu.com",
        "https://www.zhihu.com"
    ]
    _type, _ip, _port = item.split(':')
    # _port is already a str (result of split), so no str() conversion needed.
    if _type == "http":
        url = random.choice(http_url_pool)
        proxies = {"http": "http://%s:%s" % (_ip, _port)}
    elif _type == "https":
        url = random.choice(https_url_pool)
        proxies = {"https": "http://%s:%s" % (_ip, _port)}
    elif _type == "socks5":
        url = random.choice(http_url_pool)
        proxies = {
            "http": "socks5://%s:%s" % (_ip, _port),
            "https": "socks5://%s:%s" % (_ip, _port)
        }
    else:
        # Guard against unknown proxy types: without this branch, `url` and
        # `proxies` would be unbound below and raise NameError *outside* the
        # try/except, crashing the caller instead of being logged.
        logging.error("Validate on %s Error: unsupported proxy type %r"
                      % (item, _type))
        return
    try:
        req = utils.send_http('get', url, timeout=config.timeout*3,
                              proxies=proxies, allow_redirects=False)
        if req.ok:
            callback(item)
    except Exception as e:  # `as` syntax works on Py2.6+ and Py3
        logging.error("Validate on %s Error: %s" % (item, str(e)))
def run(self):
    """Scrape the xicidaili free-proxy listing (pages 1-2) and append each
    usable entry to ``self.items`` as ``"<type><PROXY_SEP><ip>:<port>"``."""
    page_urls = ("http://www.xicidaili.com/nn/1",
                 "http://www.xicidaili.com/nn/2")
    for page in page_urls:
        resp = send_http('get', page)
        tree = getHtmlTree(resp.content)
        # The first child of the table is the header row -> skip it.
        for row in tree.xpath('.//table[@id="ip_list"]/*')[1:]:
            addr = row.xpath('.//td[2]/text()')[0]
            port = row.xpath('.//td[3]/text()')[0]
            scheme = row.xpath('.//td[6]/text()')[0]
            if scheme not in PROXY_TYPES:
                continue
            self.items.append(scheme + PROXY_SEP + addr + ':' + str(port))
def numocr(self, url):
    """OCR the port number rendered as an image at ``url``.

    Image recognition is not guaranteed to be 100% accurate; observed in
    testing: 3128 -> 3328, 137 -> 337.  Results are memoized in
    ``self.cache``, keyed by the ``port=`` query-string suffix of the URL.
    """
    key = url.split('port=')[-1]
    if key not in self.cache:
        raw = send_http('get', url).content
        image = Image.open(StringIO.StringIO(raw))
        self.cache[key] = pytesseract.image_to_string(
            image, lang='osd', config="digits")
    return self.cache[key]
def run(self):
    """Scrape the mimvp free-proxy pages and append each usable entry to
    ``self.items``.  The port is served as an image, so it is recovered
    via OCR (``self.numocr``)."""
    pages = [
        "http://proxy.mimvp.com/free.php?proxy=in_hp&sort=&page=1",
        "http://proxy.mimvp.com/free.php?proxy=out_tp&sort=&page=1",
        "http://proxy.mimvp.com/free.php?proxy=out_hp&sort=&page=1",
        "http://proxy.mimvp.com/free.php?proxy=in_socks&sort=&page=1",
        "http://proxy.mimvp.com/free.php?proxy=out_socks&sort=&page=1",
    ]
    for page in pages:
        resp = send_http('get', page)
        tree = getHtmlTree(resp.content)
        cells = tree.xpath('.//tbody/td')
        # Each proxy row occupies 10 consecutive <td> cells.
        for row in (cells[i:i + 10] for i in xrange(0, len(cells), 10)):
            addr = row[1].text
            # The port cell contains an <img>; OCR the digits from it.
            port = self.numocr(
                "http://proxy.mimvp.com/" + row[2][0].attrib['src'])
            type_field = row[3].text
            # Type cell may list several schemes, e.g. "HTTP/HTTPS".
            for scheme in (t.strip().lower() for t in type_field.split('/')):
                if scheme in PROXY_TYPES:
                    self.items.append(
                        scheme + PROXY_SEP + addr + ':' + str(port))
def run(self):
    """Scrape kuaidaili proxy-list pages 1-10 and append each usable entry
    to ``self.items`` as ``"<type><PROXY_SEP><ip>:<port>"``."""
    for page_no in xrange(1, 11):
        resp = send_http(
            'get', "http://www.kuaidaili.com/proxylist/%d/" % page_no)
        tree = getHtmlTree(resp.content)
        for row in tree.xpath('.//tbody/tr'):
            addr = row.xpath('.//td[@data-title="IP"]/text()')[0]
            port = row.xpath('.//td[@data-title="PORT"]/text()')[0]
            # Column header is the Chinese word for "type" (e.g. "HTTP, HTTPS").
            type_field = row.xpath(
                './/td[@data-title="%s"]/text()' % u"类型")[0]
            for scheme in (t.strip().lower() for t in type_field.split(',')):
                if scheme in PROXY_TYPES:
                    self.items.append(
                        scheme + PROXY_SEP + addr + ':' + str(port))
def run(self):
    """Scrape the four goubanjia free-proxy listings and append each usable
    entry to ``self.items`` as ``"<type><PROXY_SEP><ip>:<port>"``."""
    pages = [
        "http://www.goubanjia.com/free/gngn/index.shtml",
        "http://www.goubanjia.com/free/gnpt/index.shtml",
        "http://www.goubanjia.com/free/gwgn/index.shtml",
        "http://www.goubanjia.com/free/gwpt/index.shtml",
    ]
    for page in pages:
        resp = send_http('get', page)
        tree = getHtmlTree(resp.content)
        for cell in tree.xpath('.//td[@class="ip"]'):
            # The IP is split across child nodes interleaved with decoy nodes
            # hidden via style="...none"; keep only the visible text pieces.
            # The last visible piece is the port, the rest join into the IP.
            visible = cell.xpath(
                './/*[not(contains(@style, "none"))]/text()')
            addr = ''.join(visible[:-1])
            port = visible[-1]
            type_field = cell.xpath('../td[3]/a/text()')[0]
            for scheme in (t.strip().lower() for t in type_field.split(',')):
                if scheme in PROXY_TYPES:
                    self.items.append(
                        scheme + PROXY_SEP + addr + ':' + str(port))