示例#1
0
def getHtmlTree(url, **kwargs):
    """
    Fetch a page and return it parsed by ``etree.HTML``.

    A fixed set of browser-like request headers is sent with the request,
    and the function sleeps for two seconds before every fetch.

    :param url: address of the page to download
    :param kwargs: accepted but currently unused
    :return: the result of ``etree.HTML`` applied to the response content
    """
    browser_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    # TODO: obtain a proxy server and route the request through it
    fetcher = WebRequest()

    # Throttle: wait 2s ahead of each request.
    time.sleep(2)

    response = fetcher.get(url=url, header=browser_headers)
    return etree.HTML(response.content)
示例#2
0
 def freeproxywallthird():
     """
     Scrape the proxylistplus.com "Fresh HTTP Proxy List" page.

     Each page is requested with ``timeout=10``; every IP table cell that is
     followed by a digits-only table cell is emitted as one proxy.

     :return: generator yielding ``ip:port`` strings
     """
     row_pattern = re.compile(
         r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
     fetcher = WebRequest()
     for page_url in ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']:
         page = fetcher.get(page_url, timeout=10)
         for ip, port in row_pattern.findall(page.text):
             yield ':'.join((ip, port))
示例#3
0
 def freeproxysixth():
     """
     Xun proxy http://www.xdaili.cn/ (discontinued).

     Requests the free-IP JSON endpoint with ``timeout=10`` and yields one
     ``ip:port`` string per entry in ``RESULT.rows``.

     The source is best-effort: any ``Exception`` raised while fetching,
     decoding or walking the payload ends the generator instead of
     propagating. The failure is logged at debug level so it is no longer
     completely invisible.

     :return: generator yielding ``ip:port`` strings
     """
     import logging

     url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
     request = WebRequest()
     try:
         res = request.get(url, timeout=10).json()
         for row in res['RESULT']['rows']:
             yield '{}:{}'.format(row['ip'], row['port'])
     except Exception:
         # Deliberately swallowed (the site is marked discontinued), but leave
         # a trace for anyone debugging why this source yields nothing.
         logging.getLogger(__name__).debug(
             'fetching free proxies from %s failed', url, exc_info=True)
示例#4
0
 def freeproxywallfirst():
     """
     Site outside the firewall: cn-proxy.

     Fetches both cn-proxy.com pages with ``timeout=10`` and emits every IP
     table cell that is separated from a digits-only table cell by exactly
     one character.

     :return: generator yielding ``ip:port`` strings
     """
     row_pattern = re.compile(
         r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>')
     fetcher = WebRequest()
     for page_url in ('http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'):
         page = fetcher.get(page_url, timeout=10)
         for ip, port in row_pattern.findall(page.text):
             yield ':'.join((ip, port))
示例#5
0
 def freeproxyten():
     """
     Yun (cloud) proxy http://www.ip3366.net/free/

     Downloads the free-proxy page with ``timeout=10`` and pairs each IP
     table cell with the next digits-only table cell.

     :return: generator yielding ``ip:port`` strings
     """
     row_pattern = re.compile(
         r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
     fetcher = WebRequest()
     for page_url in ['http://www.ip3366.net/free/']:
         page = fetcher.get(page_url, timeout=10)
         for ip, port in row_pattern.findall(page.text):
             yield ":".join((ip, port))
示例#6
0
 def freeproxyninth():
     """
     Coderbusy proxy https://proxy.coderbusy.com/ (discontinued).

     Fetches the listing page with ``timeout=10`` and extracts each
     ``data-ip`` attribute together with the first following digits-only
     value that closes a table cell on the same line.

     :return: generator yielding ``ip:port`` strings
     """
     urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
     request = WebRequest()
     for url in urls:
         r = request.get(url, timeout=10)
         # Raw string: "\d" and "\." are invalid escape sequences in a normal
         # string literal and trigger a warning on current Python versions.
         proxies = re.findall(
             r'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>',
             r.text)
         for proxy in proxies:
             yield ':'.join(proxy)
示例#7
0
 def freeproxywallsecond():
     """
     https://proxy-list.org/english/index.php

     Walks listing pages 1 through 9 (each requested with ``timeout=10``),
     collects the argument of every ``Proxy('...')`` call found in the page
     text, and base64-decodes it.

     :return: generator yielding the decoded strings
     """
     import base64

     encoded_pattern = re.compile(r"Proxy\('(.*?)'\)")
     fetcher = WebRequest()
     for page_number in range(1, 10):
         page_url = 'https://proxy-list.org/english/index.php?p=%s' % page_number
         page = fetcher.get(page_url, timeout=10)
         for encoded in encoded_pattern.findall(page.text):
             decoded_bytes = base64.b64decode(encoded)
             yield decoded_bytes.decode()
示例#8
0
 def freeproxyeleven():
     """
     IP Hai http://www.iphai.com/free/ng

     Visits the four free listings (ng, np, wg, wp) with ``timeout=10`` and
     pairs each IP table cell with the next digits-only table cell,
     tolerating whitespace around the cell contents.

     :return: generator yielding ``ip:port`` strings
     """
     row_pattern = re.compile(
         r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>[\s\S]*?<td>\s*?(\d+)\s*?</td>')
     listing_urls = (
         'http://www.iphai.com/free/ng',
         'http://www.iphai.com/free/np',
         'http://www.iphai.com/free/wg',
         'http://www.iphai.com/free/wp',
     )
     fetcher = WebRequest()
     for page_url in listing_urls:
         page = fetcher.get(page_url, timeout=10)
         for ip, port in row_pattern.findall(page.text):
             yield ":".join((ip, port))
示例#9
0
 def freeproxysecond(count=20):
     """
     66ip proxy http://www.66ip.cn/

     Calls the two 66ip.cn extraction endpoints and yields every
     ``ip:port`` token found in the response text.

     :param count: number of proxies requested from each endpoint
     :return: generator yielding ``ip:port`` strings
     """
     url_templates = [
         "http://www.66ip.cn/mo.php?sxb=&tqsl={count}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=",
         "http://www.66ip.cn/nmtq.php?getnum={count}"
         "&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip",
     ]
     request = WebRequest()
     for template in url_templates:
         url = template.format(count=count)
         # Use the decoded text rather than the raw body: on Python 3 a str
         # pattern cannot be matched against bytes (TypeError).
         # NOTE(review): assumes the response exposes requests-style
         # ``.text`` — confirm against WebRequest.
         html = request.get(url).text
         ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}",
                          html)
         for ip in ips:
             yield ip.strip()
示例#10
0
    def freeproxyeight():
        """
        Mimi ("secret") proxy http://www.mimiip.com (discontinued).

        Fetches page 1 of each domestic listing, in this order:
        high-anonymity, ordinary-anonymity, transparent. Each page is
        requested with ``timeout=10``.

        :return: generator yielding ``ip:port`` strings
        """
        listing_templates = (
            'http://www.mimiip.com/gngao/%s',  # domestic, high anonymity
            'http://www.mimiip.com/gnpu/%s',  # domestic, ordinary anonymity
            'http://www.mimiip.com/gntou/%s',  # domestic, transparent
        )
        page_urls = [
            template % page_number
            for template in listing_templates
            for page_number in range(1, 2)
        ]

        row_pattern = re.compile(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>')
        fetcher = WebRequest()
        for page_url in page_urls:
            page = fetcher.get(page_url, timeout=10)
            for ip, port in row_pattern.findall(page.text):
                yield ':'.join((ip, port))