Пример #1
0
    def get_proxy_zdaye(self):
        ''' 获取站大爷代理ip'''
        url = 'http://ip.zdaye.com/dayProxy/' + str(
            time.localtime().tm_year) + '/' + str(
                time.localtime().tm_mon) + '/1.html'
        db = RedisClient()
        if USE_GET_PROXY:
            proxy = db.getOnceProxy() if GET_PROXY_TYPE is 0 else db.getProxy()
        else:
            proxy = None
        html = get_html(url, proxy)

        attemp = 1
        while html is None and attemp < 3:
            attemp += 1
            if USE_GET_PROXY:
                proxy = db.getOnceProxy(
                ) if GET_PROXY_TYPE is 0 else db.getProxy()
            else:
                proxy = None
            html = get_html(url, proxy)
            if html:
                break

        if html:
            proxies = list()
            doc = pq(html)
            urls = [
                doc('#J_posts_list .thread_theme_type').eq(0).attr('href'),
                doc('#J_posts_list .thread_theme_type').eq(1).attr('href')
            ]
            if urls:
                for uri in urls:
                    time.sleep(random.uniform(1, 3))
                    ip_url = 'http://ip.zdaye.com' + uri
                    html = get_html(ip_url, proxy, False, referer=url)
                    if html:
                        try:
                            doc = pq(html)
                            trList = doc('div.cont').text()
                            pattern = re.compile(
                                r'(\d{2,3}.\d{2,3}.\d{2,3}.\d{2,3}):(\d{2,5})@HTTP#\[高匿\]',
                                re.M)
                            ip_list = pattern.findall(
                                trList
                            )  #[('117.191.11.74', '80'), ('39.137.168.230', '80'), ('117.191.11.73', '80')]
                            for i in ip_list:
                                ip = i[0]
                                port = i[1]
                                proxies.append(
                                    json.dumps({
                                        'http':
                                        'http' + '://' + ip + ':' + port
                                    }))
                        except Exception as e:
                            # set_log_zh_bytime('analysis_proxy_kuaidaili').debug(e)
                            continue
                if proxies:
                    return proxies
                return None
Пример #2
0
    def get_proxy_kuaidaili(self):
        '''获取快代理ip'''
        db = RedisClient()
        proxies = list()

        if USE_GET_PROXY:
            proxy = db.getOnceProxy() if GET_PROXY_TYPE is 0 else db.getProxy()
        else:
            proxy = None

        for page in range(1, 3):
            time.sleep(random.uniform(1, 3))
            url = 'https://www.kuaidaili.com/free/inha/' + str(page)

            html = get_html(url, proxy)

            attemp = 1
            while html is None and attemp < 3:
                attemp += 1
                if USE_GET_PROXY:
                    proxy = db.getOnceProxy(
                    ) if GET_PROXY_TYPE is 0 else db.getProxy()
                else:
                    proxy = None
                html = get_html(url, proxy, True)
                if html:
                    break

            if html:
                try:
                    doc = pq(html)
                    trList = doc('#list table tr:gt(0)')
                    for i in trList.items():
                        if int(float(i.find('td').eq(5).text()[0])) >= 2:
                            continue
                        ip = i.find('td').eq(0).text()
                        port = i.find('td').eq(1).text()
                        agreement = i.find('td').eq(3).text().lower()
                        if agreement != 'http':  # 只抓取http代理,异步校验只能校验http
                            continue
                        proxies.append(
                            json.dumps({
                                agreement:
                                agreement + '://' + ip + ':' + port
                            }))
                except Exception as e:
                    # set_log_zh_bytime('analysis_proxy_kuaidaili').debug(e)
                    return None
        return proxies
Пример #3
0
    def get_proxy_goubanjia(self):
        '''获取goubanjia代理ip'''

        url = 'http://www.goubanjia.com/'
        db = RedisClient()

        if USE_GET_PROXY:
            proxy = db.getOnceProxy() if GET_PROXY_TYPE is 0 else db.getProxy()
        else:
            proxy = None
        html = get_html(url, proxy)

        attemp = 1
        while html is None and attemp < 3:
            attemp += 1
            if USE_GET_PROXY:
                proxy = db.getOnceProxy(
                ) if GET_PROXY_TYPE is 0 else db.getProxy()
            else:
                proxy = None
            html = get_html(url, proxy, True)
            if html:
                break

        if html:
            proxies = list()
            doc = pq(html)
            try:
                trList = doc(".table.table-hover tr:gt(0)")
                for i in trList.items():
                    ipList = []
                    for v in i.find('.ip').children().items():
                        if v.attr('style') != 'display: none;' and v.attr(
                                'style') != 'display:none;' and v.text(
                                ) is not '':
                            ipList.append(v.text())
                    ip = ''.join(ipList[:-1])
                    port = ipList[-1]
                    agreement = i.find('td').eq(2).text().lower()
                    if agreement != 'http':  # 只抓取http代理,异步校验只能校验http
                        continue
                    proxies.append(
                        json.dumps(
                            {agreement: agreement + '://' + ip + ':' + port}))
                return proxies
            except Exception as e:
                # set_log_zh_bytime('analysis_proxy_goubanjia').debug(e)
                return None
Пример #4
0
    def get_proxy_xici(self):
        '''获取西刺代理ip'''

        url = 'http://www.xicidaili.com/nn/'
        db = RedisClient()
        if USE_GET_PROXY:
            proxy = db.getOnceProxy() if GET_PROXY_TYPE is 0 else db.getProxy()
        else:
            proxy = None
        html = get_html(url, proxy)

        attemp = 1
        while html is None and attemp < 3:
            attemp += 1
            if USE_GET_PROXY:
                proxy = db.getOnceProxy(
                ) if GET_PROXY_TYPE is 0 else db.getProxy()
            else:
                proxy = None
            html = get_html(url, proxy, True)
            if html:
                break

        if html:
            proxies = list()
            doc = pq(html)
            try:
                trList = doc('#ip_list tr:gt(0)')
                for x in trList.items():
                    delay = x.find('td').eq(6).find("div").attr('title')[0]
                    if int(float(delay)) >= 1:
                        continue
                    ip = x.find('td').eq(1).text()
                    port = x.find('td').eq(2).text()
                    agreement = x.find('td').eq(5).text().lower()
                    if agreement != 'http':  # 只抓取http代理,异步校验只能校验http
                        continue
                    proxies.append(
                        json.dumps(
                            {agreement: agreement + '://' + ip + ':' + port}))
                return proxies
            except Exception as e:
                # set_log_zh_bytime('analysis_proxy_xici').debug(e)
                return None
Пример #5
0
    def get_proxy_seofangfa(self):

        url = 'http://proxy.seofangfa.com/'
        db = RedisClient()
        proxies = list()

        if USE_GET_PROXY:
            proxy = db.getOnceProxy() if GET_PROXY_TYPE is 0 else db.getProxy()
        else:
            proxy = None

        html = get_html(url, proxy)

        attemp = 1
        while html is None and attemp < 3:
            attemp += 1
            if USE_GET_PROXY:
                proxy = db.getOnceProxy(
                ) if GET_PROXY_TYPE is 0 else db.getProxy()
            else:
                proxy = None
            html = get_html(url, proxy, True)
            if html:
                break

        if html:
            try:
                doc = pq(html)
                trList = doc('.table tr:gt(0)')
                for x in trList.items():
                    delay = x.find('td').eq(2).text()
                    if int(float(delay)) >= 1:
                        continue
                    # 移除国外代理
                    location = x.find('td').eq(3).text()
                    if '中国' not in location:
                        continue
                    ip = x.find('td').eq(0).text()
                    port = x.find('td').eq(1).text()
                    proxies.append(
                        json.dumps({'http': 'http' + '://' + ip + ':' + port}))
            except Exception as e:
                return None
        return proxies
Пример #6
0
    def get_proxy_data5u(self):

        url = 'http://www.data5u.com/'
        db = RedisClient()
        proxies = list()

        if USE_GET_PROXY:
            proxy = db.getOnceProxy() if GET_PROXY_TYPE is 0 else db.getProxy()
        else:
            proxy = None

        html = get_html(url, proxy)

        attemp = 1
        while html is None and attemp < 3:
            attemp += 1
            if USE_GET_PROXY:
                proxy = db.getOnceProxy(
                ) if GET_PROXY_TYPE is 0 else db.getProxy()
            else:
                proxy = None
            html = get_html(url, proxy, True)
            if html:
                break

        if html:
            try:
                doc = pq(html)
                trList = doc('.wlist ul li ul.l2')
                for x in trList.items():
                    delay = x.find('span').eq(7).text()
                    if int(float(delay[:-1])) >= 2:
                        continue
                    ip = x.find('span').eq(0).text()
                    port = x.find('span').eq(1).text()
                    proxies.append(
                        json.dumps({'http': 'http' + '://' + ip + ':' + port}))
            except Exception as e:
                return None
        return proxies