Example #1
class YouDaiLi(object):
    def __init__(self):
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.youdaili.net/Daili/http/"

    def parser(self):
        url = etree.HTML(self.getter.rget_data(
            self.url)).xpath('//div[@class="chunlist"]/ul/li[1]/p/a/@href')[0]
        time.sleep(2)
        html = self.getter.rget_data(url)
        soup = BeautifulSoup(html, 'lxml')
        p_tag = soup.find_all('p')
        sql_list = list()
        for p in p_tag:
            ip_list = re.findall(
                '(.*?)    ————    (.*?)    ————    (.*?)    ————    (.*?)    ',
                p.get_text())
            if ip_list:
                # [('61.130.226.39', '20753', '浙江湖州', 'HTTPS')]
                ip = ip_list[0][0]
                port = ip_list[0][1]
                lx = ip_list[0][3]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
                else:
                    pass
        for sql in sql_list:  # run the batched INSERTs in one pass
            # print(sql)
            self.cm.exe(sql)
        self.cm.close()
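
# The crawlers in this listing all rely on three helpers that are not shown
# here: GETTER (an HTTP fetcher with retries), ConnMysql (a thin MySQL
# wrapper) and BloomFilter (a seen-IP de-duplication filter). The stubs below
# are only a minimal sketch of the interfaces the snippets call, with guessed
# implementations (requests + pymysql + an in-memory set) rather than the
# originals, so the call sites above and below can be read in isolation.
import requests
import pymysql


class GETTER(object):
    def __init__(self, rtimes=10):
        self.rtimes = rtimes  # number of retries before giving up

    def rget_data(self, url):
        for _ in range(self.rtimes):
            try:
                resp = requests.get(url, timeout=10)
                resp.raise_for_status()
                return resp.text
            except requests.RequestException:
                continue
        raise Exception("failed to fetch {}".format(url))


class ConnMysql(object):
    def __init__(self):
        # placeholder credentials and database name; the real ConnMysql is not shown
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="", db="proxies", charset="utf8mb4")

    def exe(self, sql):
        with self.conn.cursor() as cur:
            cur.execute(sql)
        self.conn.commit()

    def close(self):
        self.conn.close()


class BloomFilter(object):
    def __init__(self, key):
        self.key = key        # the original is presumably Redis-backed; this is a plain set
        self.seen = set()

    def isContains(self, value):
        return value in self.seen

    def insert(self, value):
        self.seen.add(value)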
Example #2
 def __init__(self):
     """Crawler for the swei360 proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = [
         "http://www.swei360.com/free/?page={}",  # domestic high-anonymity proxies
         "http://www.swei360.com/free/?stype=2&page={}",  # domestic ordinary proxies
         "http://www.swei360.com/free/?stype=3&page={}",  # foreign high-anonymity proxies
         "http://www.swei360.com/free/?stype=4&page={}"  # foreign ordinary proxies
     ]
Example #3
 def __init__(self):
     """
     Crawler for the Horocn (Qingting) proxy site
     https://proxy.horocn.com/free-proxy.html?page={}
     """
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.img = ImagePort()
     self.port = "12345"
     self.url = "https://proxy.horocn.com/free-proxy.html?page={}"
Example #4
 def __init__(self):
     """IP scraper for the MiMi (mimiip.com) proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = [
         "http://www.mimiip.com/gngao/{}",  # high-anonymity proxy IPs
         "http://www.mimiip.com/gnpu/{}",  # ordinary-anonymity proxy IPs
         "http://www.mimiip.com/gntou/{}",  # transparent proxy IPs
         "http://www.mimiip.com/hw/{}"  # foreign proxy IPs
     ]
Example #5
 def __init__(self):
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url_list = [
         'https://www.rmccurdy.com/scripts/proxy/output/http/ALL',
         'https://www.rmccurdy.com/scripts/proxy/output/socks/ALL',
         'https://www.rmccurdy.com/scripts/proxy/proxylist.txt',
         'http://www.proxylists.net/http_highanon.txt',
         'http://ab57.ru/downloads/proxyold.txt'
     ]
Example #6
 def __init__(self):
     """Crawler for the CoderBusy proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = [
         'https://proxy.coderbusy.com/',  # homepage
         'https://proxy.coderbusy.com/classical/https-ready.aspx?page={}',  # HTTPS proxies
         'https://proxy.coderbusy.com/classical/post-ready.aspx?page={}',  # proxies that support POST
         'https://proxy.coderbusy.com/classical/anonymous-type/transparent.aspx?page={}',  # transparent proxies
         'https://proxy.coderbusy.com/classical/anonymous-type/anonymous.aspx?page={}',  # anonymous proxies
         'https://proxy.coderbusy.com/classical/anonymous-type/highanonymous.aspx?page={}',  # high-anonymity proxies
     ]
Example #7
class KuaiDaiLi(object):
    def __init__(self):
        """Crawler for the KuaiDaiLi proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)[0])
        time.sleep(2)
        for pageNum in range(1, total):
            url = format_url.format(pageNum)  # build the page url
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find(
                'table', {
                    'class': 'table table-bordered table-striped'
                }).find('tbody')
            sql_list = list()
            for proxy in proxy_list.find_all('tr'):
                tmp = proxy.find_all('td')
                ip = tmp[0].get_text()
                port = tmp[1].get_text()
                lx = tmp[3].get_text().lower()
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
                else:
                    pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)

    def run(self):
        # domestic high-anonymity section
        self.parser("https://www.kuaidaili.com/free/inha/1/",
                    '//div[@id="listnav"]/ul/li[last()-1]/a/text()',
                    "https://www.kuaidaili.com/free/inha/{}/")
        # domestic ordinary section
        time.sleep(3)
        self.parser("https://www.kuaidaili.com/free/intr/1/",
                    '//div[@id="listnav"]/ul/li[last()-1]/a/text()',
                    "https://www.kuaidaili.com/free/intr/{}/")
        self.cm.close()  # close the database connection
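
# All of these crawlers build their INSERT statements with str.format, which
# breaks (or worse) as soon as a scraped value contains a quote. ConnMysql's
# API is not shown in this listing, but with PyMySQL directly the same batch
# insert can be done with bound parameters; a minimal sketch, with placeholder
# credentials and the same assumed `allip` table:
import pymysql


def save_rows(rows):
    """rows: a list of (ip, port, type) tuples collected by a parser."""
    conn = pymysql.connect(host="localhost", user="root", password="",
                           db="proxies", charset="utf8mb4")
    try:
        with conn.cursor() as cur:
            cur.executemany(
                "INSERT INTO allip (`ip`, `port`, `type`) VALUES (%s, %s, %s)",
                rows)
        conn.commit()
    finally:
        conn.close()

# save_rows([("61.130.226.39", "20753", "https")])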
Example #8
class CoderBusy(object):

    def __init__(self):
        """Crawler for the CoderBusy proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = [
            'https://proxy.coderbusy.com/',  # homepage
            'https://proxy.coderbusy.com/classical/https-ready.aspx?page={}',  # HTTPS proxies
            'https://proxy.coderbusy.com/classical/post-ready.aspx?page={}',  # proxies that support POST
            'https://proxy.coderbusy.com/classical/anonymous-type/transparent.aspx?page={}',  # transparent proxies
            'https://proxy.coderbusy.com/classical/anonymous-type/anonymous.aspx?page={}',  # anonymous proxies
            'https://proxy.coderbusy.com/classical/anonymous-type/highanonymous.aspx?page={}',  # high-anonymity proxies
        ]

    def parser(self, page_lx):
        page = 1
        while True:
            try:
                html = self.getter.rget_data(page_lx.format(page))
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            next_page = etree.HTML(html).xpath('//nav[@class="text-center"]/ul/li[@title="下一页"]/a/@href')
            soup = BeautifulSoup(html, 'lxml')
            proxies_list = soup.find('table', 'table').find_all('tr')
            sql_list = list()
            for proxy in proxies_list:
                temp = proxy.find_all('td')
                if temp:
                    # extract the ip
                    ip = temp[0].get_text().strip()
                    # recover the port: data-i is the real port plus the sum of the ip octets
                    port = int(temp[2].get("data-i"))
                    for num in ip.split('.'):
                        port -= int(num)
                    # determine the type
                    if temp[8].find('i'):
                        lx = 'https'
                    else:
                        lx = 'http'
                    # skip if this ip has already been seen
                    if not self.bf.isContains(ip):
                        sql_list.append("""INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')""".format(ip, port, lx))
                        self.bf.insert(ip)
                    else:
                        pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)
            if next_page:
                page += 1
            else:
                break

    def run(self):
        for page_lx in self.url:
            time.sleep(2)
            self.parser(page_lx)
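
# A hypothetical check of the port de-obfuscation in CoderBusy.parser above:
# the `data-i` attribute holds the real port plus the sum of the ip's octets,
# so subtracting each octet recovers it. The data-i value here is made up.
ip, data_i = "61.130.226.39", 8536
port = data_i
for num in ip.split('.'):
    port -= int(num)
print(port)  # 8536 - (61 + 130 + 226 + 39) == 8080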
Example #9
class ThreeFourSixFour(object):
    def __init__(self):
        """Crawler for the 3464.com proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.3464.com/data/Proxy/http/"

    def parser(self):
        html = self.getter.rget_data(self.url)
        html_ele = etree.HTML(html)
        tr_list = html_ele.xpath(
            '//div[@class="CommonBody"]/table[6]//table//tr')[1:]
        sql_list = list()
        for tr in tr_list:
            try:
                ip = tr.xpath('./td[1]/text()')[0]
                port = tr.xpath('./td[2]/text()')[0]
            except Exception:
                continue
            # skip if this ip has already been seen
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, "http"))
                self.bf.insert(ip)
            else:
                pass
        for sql in sql_list:  # run the batched INSERTs in one pass
            self.cm.exe(sql)
Example #10
class ThreeThreeSixSix(object):
    def __init__(self):
        """Crawler for the ip3366.net proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.ip3366.net/?stype={}&page={}"

    def parser(self):
        for stype in range(1, 6):
            for page in range(1, 11):
                url = self.url.format(stype, page)
                time.sleep(2)
                try:
                    html = self.getter.rget_data(url)
                except Exception:
                    continue
                html_ele = etree.HTML(html)
                tr_list = html_ele.xpath('//table/tbody/tr')
                sql_list = list()
                for tr in tr_list:
                    ip = tr.xpath('./td[1]/text()')[0]
                    port = tr.xpath('./td[2]/text()')[0]
                    lx = tr.xpath('./td[4]/text()')[0]
                    if not self.bf.isContains(ip):
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, lx))
                        self.bf.insert(ip)
                    else:
                        pass
                for sql in sql_list:  # run the batched INSERTs in one pass
                    self.cm.exe(sql)
Example #11
class TXTIPPage(object):
    def __init__(self):
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url_list = [
            'https://www.rmccurdy.com/scripts/proxy/output/http/ALL',
            'https://www.rmccurdy.com/scripts/proxy/output/socks/ALL',
            'https://www.rmccurdy.com/scripts/proxy/proxylist.txt',
            'http://www.proxylists.net/http_highanon.txt',
            'http://ab57.ru/downloads/proxyold.txt'
        ]

    def run(self):
        for url in self.url_list:
            data = self.getter.rget_data(url)
            ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', data)
            temp_l = [[ipport.split(":")[0],
                       ipport.split(":")[1]] for ipport in ip_list]
            sql_list = list()
            for temp in temp_l:
                ip = temp[0]
                port = temp[1]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
                else:
                    pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)
        self.cm.close()
Example #12
class HinkyDink(object):
    def __init__(self):
        """Crawler for Hinky Dink's proxy list"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')

    def parser(self, total_url, xpath_str, format_url):
        total = int(
            etree.HTML(self.getter.rget_data(total_url)).xpath(xpath_str)
            [0].strip("[").strip("]"))
        time.sleep(2)
        for pageNum in range(1, total):
            if pageNum == 1:
                url = total_url
            else:
                url = format_url.format(pageNum)  # build the page url
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            html_ele = etree.HTML(html)
            tr_list = html_ele.xpath(
                '//table[2]//tr[2]/td[3]/table//tr/td//table//tr[@class="text"]'
            )
            sql_list = list()
            for tr in tr_list:
                ip = tr.xpath('./td[1]/text()')[0]
                port = tr.xpath('./td[2]/text()')[0]
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
                else:
                    pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)

    def run(self):
        self.parser(
            "http://www.mrhinkydink.com/proxies.htm",  # first-page url
            '//table[2]//tr[2]/td[3]/table//tr/td//table//tr[last()]/td/a[last()]/text()',
            "http://www.mrhinkydink.com/proxies{}.htm"  # format url used from page 2 onward
        )
Example #13
class Horocn(object):

    def __init__(self):
        """
        Crawler for the Horocn (Qingting) proxy site
        https://proxy.horocn.com/free-proxy.html?page={}
        """
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.img = ImagePort()
        self.port = "12345"
        self.url = "https://proxy.horocn.com/free-proxy.html?page={}"

    def parser(self):
        # page = 1
        page = 3000
        while True:
            try:
                html = self.getter.rget_data(self.url.format(page))
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            html_ele = etree.HTML(html)
            next_page = html_ele.xpath('//ul[@class="pager"]//a[text()="下一页 →"]/@href')[0]
            tr_list = html_ele.xpath('//table/tbody/tr')
            sql_list = list()
            path_list = list()
            for tr in tr_list:
                ip = tr.xpath('./th[1]/text()')[0]
                # save the port image to disk so it can be OCRed
                base_port_image = tr.xpath('./th[2]/img/@src')[0]
                photo = base64.b64decode(re.search(r"data:image/jpeg;base64,(.*)", base_port_image).group(1))
                path = "./{}.jpg".format(tr_list.index(tr))
                path_list.append(path)  # collect the paths so they can all be removed afterwards
                with open(path, "wb") as f:
                    f.write(photo)
                for times in range(10):
                    try:
                        self.port = int(self.img.run(path))
                    except Exception:
                        continue
                    else:
                        break
                # skip if this ip has already been seen
                if not self.bf.isContains(ip):
                    sql_list.append("""INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')""".format(ip, self.port, "http"))
                    self.bf.insert(ip)
                else:
                    pass
            for path in path_list:
                os.remove(path)
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)
            if next_page != "javascript:;":
                page += 1
            else:
                break
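
# ImagePort (the OCR helper used above to read the port out of the base64
# JPEG) is not shown in this listing. A minimal stand-in using Pillow and
# pytesseract might look like the sketch below; the tesseract config is a
# guess and would need tuning against the real images.
import base64
import io
import re

import pytesseract
from PIL import Image


def read_port_image(data_uri):
    """OCR the digits out of a `data:image/jpeg;base64,...` port image."""
    raw = base64.b64decode(re.search(r"base64,(.*)", data_uri).group(1))
    text = pytesseract.image_to_string(Image.open(io.BytesIO(raw)), config="--psm 7")
    return int(re.search(r"\d+", text).group())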
Example #14
class MiMi(object):

    def __init__(self):
        """IP scraper for the MiMi (mimiip.com) proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = [
            "http://www.mimiip.com/gngao/{}",  # high-anonymity proxy IPs
            "http://www.mimiip.com/gnpu/{}",  # ordinary-anonymity proxy IPs
            "http://www.mimiip.com/gntou/{}",  # transparent proxy IPs
            "http://www.mimiip.com/hw/{}"  # foreign proxy IPs
        ]

    def parser(self, page_lx):
        page = 1
        while True:
            try:
                html = self.getter.rget_data(page_lx.format(page))
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            next_page = etree.HTML(html).xpath('//div[@class="pagination"]//*[text()="下一页 ›"]/@href')
            soup = BeautifulSoup(html, 'lxml')
            proxies_list = soup.find('table', 'list').find_all('tr')
            sql_list = list()
            for proxy in proxies_list:
                temp = proxy.find_all('td')
                if temp:
                    # extract the ip
                    ip = temp[0].get_text()
                    # extract the port
                    port = temp[1].get_text()
                    # extract the type
                    lx = temp[4].get_text().lower()
                    # skip if this ip has already been seen
                    if not self.bf.isContains(ip):
                        sql_list.append("""INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')""".format(ip, port, lx))
                        self.bf.insert(ip)
                    else:
                        pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)
            if next_page:
                page += 1
            else:
                break

    def run(self):
        for page_lx in self.url:
            time.sleep(2)
            self.parser(page_lx)
Example #15
class SixSixIP(object):
    def __init__(self):
        """Crawler for the 66ip.cn proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.66ip.cn/{}.html"

    def run(self):
        total = int(
            etree.HTML(
                self.getter.rget_data("http://www.66ip.cn/1.html")).xpath(
                    '//div[@id="PageList"]/a[last()-1]/text()')[0])
        time.sleep(3)
        # for pageNum in range(1, total):
        # for pageNum in range(1176, total):
        for pageNum in range(1200, total):
            url = self.url.format(pageNum)  # build the page url
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(3)  # pause to avoid getting blocked
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find('table', {"border": "2px"})
            sql_list = list()
            for proxy in proxy_list.find_all('tr')[1:]:
                ip = proxy.find_all('td')[0].get_text()  # extract the ip
                port = proxy.find_all('td')[1].get_text()  # extract the port
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, 'http'))
                    self.bf.insert(ip)
                else:
                    pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)
        self.cm.close()
Example #16
class ListProxy(object):

    def __init__(self):
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-{}"

    def parser(self):
        total = int(etree.HTML(self.getter.rget_data(self.url.format(1))).xpath('//div[@id="page"]/table[3]/tr/td[1]/a[last()]/text()')[0].strip('[').strip(']'))
        time.sleep(3)
        for pageNum in range(1, total):
            url = self.url.format(pageNum)  # build the page url
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(3)  # pause to avoid getting blocked
            soup = BeautifulSoup(html, 'lxml')
            proxy_list = soup.find("table", 'bg').find_all('tr')
            sql_list = list()
            for proxy in proxy_list:
                tmp = proxy.find_all("td")
                if tmp:
                    ip = tmp[1].get_text()
                    port = tmp[2].get_text()
                    lx = tmp[6].get_text()
                    if lx == "yes":
                        lx = 'https'
                    else:
                        lx = 'http'
                    if not self.bf.isContains(ip):
                        sql_list.append("""INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')""".format(ip, port, lx))
                        self.bf.insert(ip)
                    else:
                        pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)
        self.cm.close()
Example #17
class ThreeSixZero(object):

    def __init__(self):
        """Crawler for the swei360 proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = [
            "http://www.swei360.com/free/?page={}",  # domestic high-anonymity proxies
            "http://www.swei360.com/free/?stype=2&page={}",  # domestic ordinary proxies
            "http://www.swei360.com/free/?stype=3&page={}",  # foreign high-anonymity proxies
            "http://www.swei360.com/free/?stype=4&page={}"  # foreign ordinary proxies
        ]

    def parser(self, format_url):
        for pageNum in range(1, 8):  # the site only has 7 pages
            url = format_url.format(pageNum)  # build the page url
            try:
                html = self.getter.rget_data(url)  # fetch the page
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            html_ele = etree.HTML(html)
            tr_list = html_ele.xpath('//table/tbody/tr')
            sql_list = list()
            for tr in tr_list:
                ip = tr.xpath('./td[1]/text()')[0]
                port = tr.xpath('./td[2]/text()')[0]
                lx = tr.xpath('./td[4]/text()')[0]
                if not self.bf.isContains(ip):
                    sql_list.append("""INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')""".format(ip, port, lx))
                    self.bf.insert(ip)
                else:
                    pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)

    def run(self):
        for format_url in self.url:
            time.sleep(2)
            self.parser(format_url)
Example #18
class Data5U(object):
    def __init__(self):
        """Crawler for the data5u proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.data5u.com/free/index.shtml"

    def parser(self):
        html = self.getter.rget_data(self.url)
        soup = BeautifulSoup(html, "lxml")
        proxy_list = soup.find_all('ul', {'class': "l2"})
        sql_list = list()  # collect INSERTs so the database is hit in one pass
        for proxy in proxy_list:
            tmp = proxy.find_all('li')
            ip = tmp[0].get_text()
            port_zimu = list(tmp[1].attrs.values())[0][1]
            lx = tmp[3].get_text()
            port = self.mk_port(port_zimu)
            # Bloom-filter de-duplication on the ip
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, lx))
                self.bf.insert(ip)
            else:
                pass
        for sql in sql_list:  # run the batched INSERTs in one pass
            self.cm.exe(sql)
        self.cm.close()  # close the database connection

    def mk_port(self, port_word):
        word = list(port_word)
        num_list = []
        for item in word:
            num = 'ABCDEFGHIZ'.find(item)
            num_list.append(str(num))
        port = int("".join(num_list)) >> 0x3
        return port
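
# A hypothetical walk-through of Data5U.mk_port above: each letter of the
# obfuscated class word indexes into 'ABCDEFGHIZ' to give one digit, the
# digits are concatenated, and the result is shifted right by 3 bits.
# "GEA" is a made-up class word, not taken from the live page.
word = "GEA"
digits = "".join(str('ABCDEFGHIZ'.find(c)) for c in word)  # "640"
print(int(digits) >> 0x3)  # 640 >> 3 == 80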
Example #19
class IP181(object):
    def __init__(self):
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = 'http://www.ip181.com/'

    def parser(self):
        js_data = self.getter.rget_data(self.url)
        sql_list = list()  # collect INSERTs so the database is hit in one pass
        for proxy in json.loads(js_data).get("RESULT"):
            ip = proxy.get('ip')
            port = proxy.get('port')
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, 'http'))
                self.bf.insert(ip)
            else:
                pass
        for sql in sql_list:  # run the batched INSERTs in one pass
            self.cm.exe(sql)
        self.cm.close()  # close the database connection
Example #20
 def __init__(self):
     """Crawler for the ProxyDB proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = "http://proxydb.net/?offset={}"
Example #21
class ProxyDB(object):
    def __init__(self):
        """Crawler for the ProxyDB proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://proxydb.net/?offset={}"

    def parser(self, page_lx):
        # page = 0
        page = 150
        while True:
            try:
                html = self.getter.rget_data(page_lx.format(page * 15))
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(3)  # pause to avoid getting blocked
            html_ele = etree.HTML(html)
            next_page = html_ele.xpath('//nav/ul/li[2]/a/@href')
            add_num = html_ele.xpath('//div[@style="display:none"]/@*')[1]
            td_list = html_ele.xpath(
                '//table[contains(@class, "table")]/tbody/tr/td[1]/script/text()'
            )
            lx_list = html_ele.xpath(
                '//table[contains(@class, "table")]/tbody/tr/td[5]/text()')
            sql_list = list()
            for td in td_list:
                ip_h_reve = re.search(r"'(.*?)'.split", td).group(1)  # reversed head of the ip
                ip_t_b64 = re.search(r"atob\('(.*?)'.replace",
                                     td).group(1)  # base64-encoded tail of the ip
                p = re.search(r"pp =  \((\d+) - \(", td).group(1)  # port value that still needs the offset added
                ip, port = self.mk_ip_port(ip_h_reve, ip_t_b64, p, add_num)
                lx = lx_list[td_list.index(td)].strip().lower()
                if "socket" in lx:
                    lx = "http"
                # skip if this ip has already been seen
                if not self.bf.isContains(ip):
                    sql_list.append(
                        """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                        .format(ip, port, lx))
                    self.bf.insert(ip)
                else:
                    pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)
            if next_page:
                page += 1
            else:
                break

    def mk_ip_port(self, ip_h_reve, ip_t_b64, p, add_n):
        """
        Assemble the ip and port from the obfuscated values scraped off the page.
        :param ip_h_reve: first part of the ip, stored reversed
        :param ip_t_b64: base64-encoded letters making up the tail of the ip
        :param p: port value scraped from the page; the offset still has to be added
        :param add_n: offset to add to the port
        :return: (ip, port)
        """
        l_ip_head = list(ip_h_reve)
        l_ip_head.reverse()
        ip_head = ""
        for char in l_ip_head:
            ip_head += char
        # the codecs.getdecoder("unicode_escape")(ip_t_b64)[0] call below is crucial: it undoes the string escaping before the base64 decode
        ip_tail = base64.b64decode(
            codecs.getdecoder("unicode_escape")(ip_t_b64)[0]).decode()
        ip = ip_head + ip_tail
        port = int(p) + int(add_n)
        return ip, port

    def run(self):
        self.parser(self.url)
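
# A hypothetical round-trip of ProxyDB.mk_ip_port above; all literal values
# are made up for illustration, not scraped from proxydb.net.
import base64

print("".join(reversed("861.291")))           # "192.168"  (the reversed head)
print(base64.b64decode("LjEuMTAw").decode())  # ".1.100"   (the base64 tail)
# mk_ip_port("861.291", "LjEuMTAw", "8000", "80") would therefore return
# ("192.168.1.100", 8080).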
Example #22
 def __init__(self):
     """Crawler for the cool-proxy.net proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = "https://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:{}"
Example #23
class CoolProxy(object):
    def __init__(self):
        """Crawler for the cool-proxy.net proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "https://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:{}"

    def parser(self):
        page = 1
        while True:
            try:
                html = self.getter.rget_data(self.url.format(page))
            except Exception as e:
                print("Error encountered: {}".format(e))
                continue
            time.sleep(2)  # sleep two seconds to avoid getting blocked
            next_page = etree.HTML(html).xpath(
                '//table//tr[last()]//span[last()]/a')
            soup = BeautifulSoup(html, 'lxml')
            tr_list = soup.find('table').find_all('tr')
            sql_list = list()
            for tr in tr_list:
                temp = tr.find_all('td')
                if temp:
                    try:
                        ip = self.mk_ip(
                            re.search(
                                r"str_rot13\(\"(.*?)\"\)",
                                temp[0].find('script').get_text()).group(1))
                    except Exception:
                        continue  # some rows are decoys with obfuscated markup
                    port = temp[1].get_text()
                    # skip if this ip has already been seen
                    if not self.bf.isContains(ip):
                        sql_list.append(
                            """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                            .format(ip, port, "http"))
                        self.bf.insert(ip)
                    else:
                        pass
            for sql in sql_list:  # run the batched INSERTs in one pass
                self.cm.exe(sql)
            if next_page:
                page += 1
            else:
                break

    def mk_ip(self, en_ip):
        """
        Decode an obfuscated ip such as `ZGH5Ywt5YwVlBF42At==` into a usable ip, e.g. 159.89.229.66.
        :param en_ip: the obfuscated ip scraped from the page
        :return: the decoded ip
        """
        letter_str = ""
        for char in en_ip:
            if char in "0123456789=":  # digits and '=' pass through the obfuscation untouched, append them directly
                letter_str += char
            else:
                head = ord(char)  # Unicode code point of the letter
                tail = 13 if char.lower() < 'n' else -13  # ROT13 offset
                letter_str += chr(head + tail)  # shift back to the original letter and append
        return base64.b64decode(letter_str).decode()  # base64-decode the reassembled string
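
# ROT13 leaves digits and '=' unchanged, so mk_ip above is equivalent to a
# plain ROT13 pass followed by a base64 decode. A standalone check using the
# example value from the docstring:
import base64
import codecs


def decode_cool_proxy_ip(en_ip):
    return base64.b64decode(codecs.decode(en_ip, "rot_13")).decode()

print(decode_cool_proxy_ip("ZGH5Ywt5YwVlBF42At=="))  # -> 159.89.229.66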
Example #24
 def __init__(self):
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = "http://www.youdaili.net/Daili/http/"
Example #25
 def __init__(self):
     """Crawler for the 3464.com proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = "http://www.3464.com/data/Proxy/http/"
Example #26
 def __init__(self):
     """Crawler for the data5u proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = "http://www.data5u.com/free/index.shtml"
Example #27
 def __init__(self):
     """Crawler for the goubanjia.com proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = "http://www.goubanjia.com/"
Example #28
class GouBanJia(object):
    def __init__(self):
        """Crawler for the goubanjia.com proxy site"""
        self.getter = GETTER(rtimes=10)
        self.cm = ConnMysql()
        self.bf = BloomFilter(key='allip')
        self.url = "http://www.goubanjia.com/"

    def parser(self):
        """
        This function is adapted from:
        https://blog.csdn.net/weixin_37586648/article/details/78868015
        """
        html = self.getter.rget_data(self.url)
        # parse the html
        soup = BeautifulSoup(html, "lxml")
        # --------- added by me: collect the proxy type for each row ---------
        lx_list = list()
        ip_port_list = list()
        for tr in soup.find_all('tr'):
            temp = tr.find_all('td')
            if temp:
                lx = temp[2].get_text()
                lx_list.append(lx)
        # --------- end of the added type collection ---------
        # grab every td that holds an ip
        td_list = soup.select('td[class="ip"]')
        for td in td_list:
            # get all child tags of the current td
            child_list = td.find_all()
            ip_port = ""
            for child in child_list:
                if 'style' in child.attrs.keys():
                    if child.attrs['style'].replace(
                            ' ', '') == "display:inline-block;":
                        if child.string is not None:
                            ip_port = ip_port + child.string
                # pick out the port number
                elif 'class' in child.attrs.keys():
                    class_list = child.attrs['class']
                    if 'port' in class_list:
                        port = self.mk_port(class_list[1])
                        # append the port
                        ip_port = ip_port + ":" + str(port)
                else:
                    if child.string is not None:
                        ip_port = ip_port + child.string
            # from here on is my own code
            ip_port_list.append(ip_port)
        return lx_list, ip_port_list

    def run(self):
        lx_list, ip_port_list = self.parser()
        sql_list = list()
        for ip_port in ip_port_list:
            lx = lx_list[ip_port_list.index(ip_port)]
            ip = ip_port.split(":")[0]
            port = ip_port.split(":")[1]
            # Bloom-filter de-duplication on the ip
            if not self.bf.isContains(ip):
                sql_list.append(
                    """INSERT INTO allip (`ip`, `port`, `type`) VALUES ('{}', '{}', '{}')"""
                    .format(ip, port, lx))
                self.bf.insert(ip)
            else:
                pass
        for sql in sql_list:  # run the batched INSERTs in one pass
            self.cm.exe(sql)
        self.cm.close()  # close the database connection

    def mk_port(self, port_word):
        word = list(port_word)
        num_list = []
        for item in word:
            num = 'ABCDEFGHIZ'.find(item)
            num_list.append(str(num))
        port = int("".join(num_list)) >> 0x3
        return port
Example #29
 def __init__(self):
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')
     self.url = "https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-{}"
Example #30
 def __init__(self):
     """Crawler for the KuaiDaiLi proxy site"""
     self.getter = GETTER(rtimes=10)
     self.cm = ConnMysql()
     self.bf = BloomFilter(key='allip')