예제 #1
0
    def _crawl_site_6(self):
        '''
        Scrape the Zdaye (ip.zdaye.com) daily proxy posts.

        The index page is fetched first; the link inside every
        ``<h3 class="thread_title">`` is collected and the first three of
        them (per the original note, the three most recent updates) are
        downloaded. Each post body (``<div class="cont">``) is searched for
        ``ip:port@HTTP`` / ``ip:port@HTTPS`` entries, and every match is
        passed to ``ProxyManager.feed_pool`` as a JSON string with the keys
        ``ip``, ``port`` and ``type``.

        :return: None
        '''
        main_url = 'http://ip.zdaye.com/dayProxy.html'

        # The headers must be passed by keyword: the second positional
        # parameter of requests.get() is ``params`` (the query string).
        resp = requests.get(main_url, headers=self.header)
        soup = BeautifulSoup(resp.content, 'lxml')

        urls = [u.find('a').attrs['href'] for u in soup.find_all('h3', class_='thread_title')]

        # Work on a copy so the shared header dict is not modified.
        header = copy(self.header)
        header['referer'] = main_url

        # Raw string so ``\d`` and ``\.`` reach the regex engine untouched.
        # HTTPS is listed before HTTP: alternation is tried left to right, so
        # with HTTP first the match would always stop at the "HTTP" prefix.
        # The port requires at least one digit, and the octets are
        # non-capturing so findall() yields plain (ip, port, type) tuples.
        octet = r'(?:\d{1,2}|1\d\d|2[0-4]\d|25[0-5])'
        pattern = re.compile(
            r'(' + octet + r'(?:\.' + octet + r'){3}):(\d+)@(HTTPS|HTTP)'
        )

        for url in urls[0:3]:
            resp = requests.get('http://ip.zdaye.com' + url, headers=header)
            soup = BeautifulSoup(resp.content, 'lxml')

            content = soup.find('div', class_='cont')
            if content is None:
                # No post body found on this page; move on to the next post
                # instead of failing the whole crawl.
                continue

            for ip, port, proxy_type in pattern.findall(content.text):
                item = {
                    'ip': ip,
                    'port': port,
                    'type': proxy_type
                }
                ProxyManager.feed_pool(json.dumps(item))
예제 #2
0
    def _crawl_site_5(self):
        '''
        Scrape the Xiaoshu (www.xsdaili.com) daily proxy posts.

        The home page is fetched first; the link inside every
        ``<div class="title">`` is collected and the first two of them (per
        the original note, the last two days of updates) are downloaded.
        Each post body (``<div class="cont">``) is searched for
        ``ip:port@HTTP`` / ``ip:port@HTTPS`` entries, and every match is
        passed to ``ProxyManager.feed_pool`` as a JSON string with the keys
        ``ip``, ``port`` and ``type``.

        :return: None
        '''
        main_url = 'http://www.xsdaili.com'
        resp = requests.get(main_url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')

        urls = [u.find('a').attrs['href'] for u in soup.find_all('div', class_='title')]

        # Raw string so ``\d`` and ``\.`` reach the regex engine untouched.
        # HTTPS is listed before HTTP: alternation is tried left to right, so
        # with HTTP first the match would always stop at the "HTTP" prefix.
        # The port requires at least one digit, and the octets are
        # non-capturing so findall() yields plain (ip, port, type) tuples.
        octet = r'(?:\d{1,2}|1\d\d|2[0-4]\d|25[0-5])'
        pattern = re.compile(
            r'(' + octet + r'(?:\.' + octet + r'){3}):(\d+)@(HTTPS|HTTP)'
        )

        for url in urls[0:2]:
            resp = requests.get(main_url + url, headers=self.header)
            soup = BeautifulSoup(resp.content, 'lxml')

            content = soup.find('div', class_='cont')
            if content is None:
                # No post body found on this page; move on to the next post
                # instead of failing the whole crawl.
                continue

            for ip, port, proxy_type in pattern.findall(content.text):
                item = {
                    'ip': ip,
                    'port': port,
                    'type': proxy_type
                }
                ProxyManager.feed_pool(json.dumps(item))
예제 #3
0
    def _crawl_site_3(self):
        '''
        Scrape the free proxies shown on the data5u.com home page (the
        original note says only 10 are available).

        Every ``<ul class="l2">`` element is treated as one proxy entry: the
        text of its 1st, 2nd and 4th ``<li>`` children is used as ip, port
        and type, and the record is passed to ``ProxyManager.feed_pool`` as
        a JSON string.

        :return: None
        '''
        response = requests.get('http://www.data5u.com/', headers=self.header)
        page = BeautifulSoup(response.text, 'lxml')

        for entry in page.find_all('ul', class_='l2'):
            cells = entry.find_all('li')
            ip_text, port_text, type_text = cells[0].text, cells[1].text, cells[3].text
            record = {'ip': ip_text, 'port': port_text, 'type': type_text}
            ProxyManager.feed_pool(json.dumps(record))
예제 #4
0
    def _crawl_site_7(self):
        '''
        Scrape the free proxy table on proxy.mimvp.com (the original note
        says guests can only see the first page).

        All ``<tr>`` rows inside ``<div class="free-list">`` except the
        first one are read; the text of the 1st, 2nd and 4th ``<td>`` cells
        is used as ip, port and type, and the record is passed to
        ``ProxyManager.feed_pool`` as a JSON string.

        :return: None
        '''
        response = requests.get('https://proxy.mimvp.com/freeopen.php', headers=self.header)
        page = BeautifulSoup(response.text, 'lxml')

        container = page.find('div', class_='free-list')
        table_rows = container.find_all('tr')
        # The first row is skipped (presumably the table header).
        for table_row in table_rows[1:]:
            cells = table_row.find_all('td')
            ip_text = cells[0].text
            # Original note: the port needs OCR to turn image content into text.
            port_text = cells[1].text
            type_text = cells[3].text
            record = {'ip': ip_text, 'port': port_text, 'type': type_text}
            ProxyManager.feed_pool(json.dumps(record))
예제 #5
0
    def _crawl_site_4(self):
        '''
        Scrape pages 1 through 10 of the ip3366.net ("Yun" proxy) free list.

        On each page, all ``<tr>`` rows inside the element with
        ``id="list"`` except the first one are read; the text of the 1st,
        2nd and 4th ``<td>`` cells is used as ip, port and type, and the
        record is passed to ``ProxyManager.feed_pool`` as a JSON string.

        :return: None
        '''
        page_template = 'http://www.ip3366.net/?stype=1&page={0}'

        for page_no in range(1, 11):
            response = requests.get(page_template.format(page_no), headers=self.header)
            page = BeautifulSoup(response.text, 'lxml')

            listing = page.find('div', id='list')
            # The first row is skipped (presumably the table header).
            for table_row in listing.find_all('tr')[1:]:
                cells = table_row.find_all('td')
                record = dict(
                    ip=cells[0].text,
                    port=cells[1].text,
                    type=cells[3].text,
                )
                ProxyManager.feed_pool(json.dumps(record))
예제 #6
0
    def _crawl_site_2(self):
        '''
        Scrape pages 1 through 10 of the superfastip.com free proxy list.

        On each page the third ``<div class="row clearfix">`` element is
        taken as the table container. All of its ``<tr>`` rows except the
        first one are read; the text of the 1st, 2nd and 4th ``<td>`` cells
        is used as ip, port and type, and the record is passed to
        ``ProxyManager.feed_pool`` as a JSON string.

        :return: None
        '''
        page_template = 'http://www.superfastip.com/welcome/freeip/{0}'

        for page_no in range(1, 11):
            response = requests.get(page_template.format(page_no), headers=self.header)
            page = BeautifulSoup(response.text, 'lxml')

            sections = page.find_all('div', class_='row clearfix')
            table_rows = sections[2].find_all('tr')
            # The first row is skipped (presumably the table header).
            for table_row in table_rows[1:]:
                cells = table_row.find_all('td')
                ip_text, port_text, type_text = cells[0].text, cells[1].text, cells[3].text
                record = {'ip': ip_text, 'port': port_text, 'type': type_text}
                ProxyManager.feed_pool(json.dumps(record))
예제 #7
0
    def _crawl_site_10(self):
        '''
        Scrape pages 1 through 10 of the wndaili.cn ("Wanneng" proxy) list.

        On each page, all ``<tr>`` rows inside the element with
        ``id="list"`` except the first one are read; the text of the 1st,
        2nd and 4th ``<td>`` cells is used as ip, port and type, and the
        record is passed to ``ProxyManager.feed_pool`` as a JSON string.

        :return: None
        '''
        page_template = 'http://wndaili.cn/?page={0}'

        for page_no in range(1, 11):
            response = requests.get(page_template.format(page_no), headers=self.header)
            page = BeautifulSoup(response.text, 'lxml')

            table_rows = page.find('div', id='list').find_all('tr')
            # The first row is skipped (presumably the table header).
            for table_row in table_rows[1:]:
                cells = table_row.find_all('td')
                record = {
                    'ip': cells[0].text,
                    'port': cells[1].text,
                    'type': cells[3].text,
                }
                serialized = json.dumps(record)
                ProxyManager.feed_pool(serialized)
예제 #8
0
    def _crawl_site_0(self):
        '''
        Scrape pages 1 through 4 of the xicidaili.com ("Xici" proxy) list.

        On each page, all ``<tr>`` rows of the table with ``id="ip_list"``
        except the first one are read; the text of the 2nd, 3rd and 6th
        ``<td>`` cells is used as ip, port and type, and the record is
        passed to ``ProxyManager.feed_pool`` as a JSON string.

        :return: None
        '''
        page_template = 'https://www.xicidaili.com/nn/{0}'

        for page_no in range(1, 5):
            response = requests.get(page_template.format(page_no), headers=self.header)
            page = BeautifulSoup(response.text, 'lxml')

            ip_table = page.find('table', id='ip_list')
            # The first row is skipped (presumably the table header).
            for table_row in ip_table.find_all('tr')[1:]:
                cells = table_row.find_all('td')
                ip_text, port_text, type_text = cells[1].text, cells[2].text, cells[5].text
                record = {'ip': ip_text, 'port': port_text, 'type': type_text}
                ProxyManager.feed_pool(json.dumps(record))
예제 #9
0
    def _crawl_site_1(self):
        '''
        Scrape pages 1 through 3 of the kuaidaili.com free proxy list.

        On each page, all ``<tr>`` rows inside the element with
        ``id="list"`` except the first one are read; the text of the 1st,
        2nd and 4th ``<td>`` cells is used as ip, port and type, and the
        record is passed to ``ProxyManager.feed_pool`` as a JSON string.
        The method sleeps for two seconds after each page.

        :return: None
        '''
        page_template = 'https://www.kuaidaili.com/free/inha/{0}'

        for page_no in range(1, 4):
            response = requests.get(page_template.format(page_no), headers=self.header)
            page = BeautifulSoup(response.text, 'lxml')

            listing = page.find('div', id='list')
            # The first row is skipped (presumably the table header).
            for table_row in listing.find_all('tr')[1:]:
                cells = table_row.find_all('td')
                record = dict(
                    ip=cells[0].text,
                    port=cells[1].text,
                    type=cells[3].text,
                )
                ProxyManager.feed_pool(json.dumps(record))

            # Pause between pages (presumably to avoid hammering the site).
            time.sleep(2)
예제 #10
0
    def _crawl_site_9(self):
        '''
        Scrape the nimadaili.com ("Nima" proxy) home page.

        All ``<div id="overflow">`` elements from the third one onwards are
        processed (per the original note, these are the HTTP and HTTPS
        blocks). In each one, every ``<tr>`` except the first and the last
        is read: the first ``<td>`` holds ``ip:port`` and the third holds
        the type. Each record is passed to ``ProxyManager.feed_pool`` as a
        JSON string.

        :return: None
        '''
        response = requests.get('http://www.nimadaili.com', headers=self.header)
        page = BeautifulSoup(response.text, 'lxml')

        blocks = page.find_all('div', id='overflow')
        for block in blocks[2:]:
            table_rows = block.find_all('tr')
            # First and last rows are skipped.
            for table_row in table_rows[1:-1]:
                cells = table_row.find_all('td')
                address_parts = cells[0].text.split(':')
                # Exactly two parts are expected; anything else raises ValueError.
                host, port_text = address_parts
                record = {'ip': host, 'port': port_text, 'type': cells[2].text}
                ProxyManager.feed_pool(json.dumps(record))
예제 #11
0
    def _crawl_site_8(self):
        '''
        Scrape the xiladaili.com ("Xila" proxy) home page.

        The second and third ``<table class="fl-table">`` elements are
        processed (per the original note, these are the HTTP and HTTPS
        blocks). In each one, every ``<tr>`` from the third row onwards is
        read: the first ``<td>`` holds ``ip:port`` and the third holds the
        type. Each record is passed to ``ProxyManager.feed_pool`` as a JSON
        string.

        :return: None
        '''
        url = 'http://www.xiladaili.com'
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')

        tables = soup.find_all('table', class_='fl-table')
        for t in tables[1:3]:
            # The first two rows of each table are skipped.
            for row in t.find_all('tr')[2:]:
                tds = row.find_all('td')
                ip, port = tds[0].text.split(':')
                item = {
                    'ip': ip,
                    'port': port,
                    'type': tds[2].text
                }
                ProxyManager.feed_pool(json.dumps(item))