示例#1
0
 def freeProxycoderbusy(pages=1):
     """
     码农代理 (CoderBusy) https://proxy.coderbusy.com/
     Crawl the China proxy listing pages and yield "ip:port" strings.
     :param pages: number of listing pages to crawl, starting from page 1
     :return: generator of "ip:port" strings
     """
     url = 'https://proxy.coderbusy.com/classical/country/cn.aspx?page={page}'
     for page in range(1, pages + 1):
         page_url = url.format(page=page)
         tree = getHtmlTree(page_url)
         proxy_list = tree.xpath(".//table[@class='table']//tr")
         # Skip the header row; tolerate malformed rows instead of letting a
         # single IndexError kill the whole generator (matches the
         # try/except style used by freeProxyxici / freeProxy66ip).
         for tr in proxy_list[1:]:
             try:
                 td_list = tr.xpath(".//td/text()")
                 ip = td_list[1].strip()
                 port = td_list[3].strip()  # 4th cell is the port, not a hostname
                 yield ":".join([ip, port])
             except IndexError as e:
                 print(e)
示例#2
0
 def freeProxykuai(pages=1):
     """
     快代理 (kuaidaili) https://www.kuaidaili.com
     Crawl the anonymous (inha) and transparent (intr) free listings.
     :param pages: number of pages per listing to crawl, starting from 1
     :return: generator of "ip:port" strings
     """
     url_list = [
         'https://www.kuaidaili.com/free/inha/{page}/',
         'https://www.kuaidaili.com/free/intr/{page}/'
     ]
     for url in url_list:
         for page in range(1, pages + 1):
             page_url = url.format(page=page)
             tree = getHtmlTree(page_url)
             proxy_list = tree.xpath('.//table//tr')
             for tr in proxy_list[1:]:  # first row is the table header
                 # Guard rows with fewer than two text cells, which would
                 # otherwise yield an empty or incomplete proxy string.
                 cells = tr.xpath('./td/text()')
                 if len(cells) >= 2:
                     yield ':'.join(cells[0:2])
示例#3
0
 def freeProxyjiangxianli(page_count=8):
     """
     免费代理库 (jiangxianli) http://ip.jiangxianli.com/?page=
     Large free proxy list; yields "ip:port" strings.
     :param page_count: number of pages to crawl, starting from page 1
     :return: generator of "ip:port" strings
     """
     for i in range(1, page_count + 1):
         url = 'http://ip.jiangxianli.com/?page={}'.format(i)
         html_tree = getHtmlTree(url)
         tr_list = html_tree.xpath(
             "/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr")
         # Iterating an empty list is a no-op, so no explicit emptiness
         # check is needed.
         for tr in tr_list:
             # td[2] is the IP column and td[3] the port column; skip rows
             # where either cell is missing instead of raising IndexError
             # and killing the generator.
             ip = tr.xpath("./td[2]/text()")
             port = tr.xpath("./td[3]/text()")
             if ip and port:
                 yield ip[0] + ":" + port[0]
示例#4
0
    def freeProxymimi(pages=1):
        """
        秘密代理 (mimiip) http://www.mimiip.com
        Crawl the domestic high-anonymity / anonymous / transparent lists.
        :param pages: number of pages per listing to crawl, starting from 1
        :return: generator of "ip:port" strings
        """
        url_list = [
            'http://www.mimiip.com/gngao/{page}',  # domestic high anonymity
            'http://www.mimiip.com/gnpu/{page}',  # domestic anonymous
            'http://www.mimiip.com/gntou/{page}'  # domestic transparent
        ]

        for url in url_list:
            for page in range(1, pages + 1):
                page_url = url.format(page=page)
                tree = getHtmlTree(page_url)
                proxy_list = tree.xpath(".//table[@class='list']//tr")
                for tr in proxy_list[1:]:  # first row is the table header
                    # Skip malformed rows rather than yielding partial data.
                    cells = tr.xpath(".//td/text()")
                    if len(cells) >= 2:
                        yield ":".join(cells[0:2])
示例#5
0
 def freeProxyxici(page=2):
     """
     西刺代理 (xicidaili) http://www.xicidaili.com
     Crawl the high-anonymity and transparent listings.
     :param page: number of pages per listing to crawl, starting from 1
     :return: generator of "ip:port" strings
     """
     base_urls = [
         'http://www.xicidaili.com/nn/',  # high anonymity
         'http://www.xicidaili.com/nt/',  # transparent
     ]
     for base in base_urls:
         for index in range(1, page + 1):
             tree = getHtmlTree(base + str(index))
             # Skip the header row directly in the XPath expression.
             rows = tree.xpath(
                 './/table[@id="ip_list"]//tr[position()>1]')
             for row in rows:
                 try:
                     # First two cells are IP and port.
                     yield ':'.join(row.xpath('./td/text()')[0:2])
                 except Exception as e:
                     print(e)
示例#6
0
    def freeProxy5u():
        """
        无忧代理 (data5u) http://www.data5u.com/
        Almost none of the listed proxies are usable.
        :return: generator of "ip:port" strings
        """
        page_urls = [
            'http://www.data5u.com/',  # the 20 proxies on the front page
            'http://www.data5u.com/free/gngn/index.shtml',  # domestic high anonymity
            'http://www.data5u.com/free/gnpt/index.shtml'  # domestic ordinary
        ]

        for page_url in page_urls:
            tree = getHtmlTree(page_url)
            for entry in tree.xpath(".//ul[@class='l2']"):
                try:
                    # First two <li> texts are IP and port.
                    yield ":".join(entry.xpath(".//li/text()")[0:2])
                except Exception as e:
                    print(e)
示例#7
0
 def freeProxy66ip(area=34, page=1):
     """
     代理66 (66ip) http://www.66ip.cn/
     :param area: number of area index pages to crawl (areaindex_1 is the
                  Beijing page, areaindex_2 Shanghai, ...); capped at 34
     :param page: number of pages to fetch per area
     :return: generator of "ip:port" strings
     """
     area = min(area, 34)  # the site only has 34 area indexes
     for area_no in range(1, area + 1):
         for page_no in range(1, page + 1):
             url = "http://www.66ip.cn/areaindex_{}/{}.html".format(
                 area_no, page_no)
             tree = getHtmlTree(url)
             rows = tree.xpath(
                 ".//div[@id='footer']/div/table//tr[position()>1]")
             if not rows:
                 continue
             for row in rows:
                 try:
                     # First two cells are IP and port.
                     yield ":".join(row.xpath(".//td/text()")[0:2])
                 except Exception as e:
                     print(e)
示例#8
0
 def freeProxygouban():
     """
     全网代理 (goubanjia) http://www.goubanjia.com/
     Scrape the front-page proxy table and yield "ip:port" strings.
     :return: generator of "ip:port" strings
     """
     url = "http://www.goubanjia.com/"
     tree = getHtmlTree(url)
     proxy_list = tree.xpath(".//table//tr//td[@class='ip']")
     # The site injects hidden decoy digits and dots into the IP cell, so a
     # naive text() extraction would pick up extra characters.  Filter out
     # every element styled display:none (both spellings) and every element
     # carrying the 'port' class before collecting text.
     xpath_str = """.//*[not(contains(@style, 'display:none'))
                                         and not(contains(@style, 'display: none'))
                                         and not(contains(@class, 'port'))
                                         ]/text()
                                 """
     for each_proxy in proxy_list:
         try:
             # The ':' separator sits as bare text directly under the <td>,
             # while the IP digits live in nested div/span/p elements — so the
             # filtered descendant text joins into just the IP, and the port
             # is read separately from its own span.
             # NOTE(review): this site has historically obfuscated the real
             # port — confirm the span text is the usable port number.
             ip_addr = ''.join(each_proxy.xpath(xpath_str))
             port = each_proxy.xpath(
                 ".//span[contains(@class, 'port')]/text()")[0]
             yield '{}:{}'.format(ip_addr, port)
         except Exception as e:
             print(e)