Exemplo n.º 1
0
def main():
    """Keep the proxy pool topped up to MAX_PROXY_NUM, then re-check forever."""
    def _load_pool():
        # None from redis means the key does not exist yet -> empty list.
        return deserializate_pickle_object(redis_cli.get(_key) or dumps([]))

    while True:
        origin_proxy_data = _load_pool()
        # Fill until the pool reaches the target size.
        while len(origin_proxy_data) < MAX_PROXY_NUM:
            lg.info('Pools已存在proxy_num: {}'.format(len(origin_proxy_data)))
            get_proxy_process_data()
            origin_proxy_data = _load_pool()
        # Pool is full: sleep, then re-validate every stored proxy.
        lg.info('达标!休眠{}s...'.format(WAIT_TIME))
        sleep(WAIT_TIME)
        check_all_proxy(origin_proxy_data)
Exemplo n.º 2
0
def _get_proxies() -> dict:
    '''
    Pick a random high-anonymity proxy (very likely to fail, be patient!).

    Falls back to the bootstrap ip lists, then to the 66ip list, and
    finally to the local ip when nothing usable is available.

    :return: requests-style proxies dict, or {} to crawl with the local ip
    '''
    global ori_ip_list
    proxy_list = deserializate_pickle_object(
        redis_cli.get(_h_key) or dumps([]))
    proxies = choice(proxy_list) if len(proxy_list) > 0 else None
    if proxies is not None:
        ip, port = proxies['ip'], proxies['port']
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port),
        }
        lg.info('正在使用代理 {} crawl...'.format(proxies['http']))
    else:
        # No cached high-anonymity proxy: try the fallback sources once.
        if not ori_ip_list:
            for url in start_up_ip_url_list:
                ori_ip_list += get_start_up_ip_list(url)
            if not ori_ip_list:
                ori_ip_list = _get_66_ip_list()
        ori_ip_list = list(set(ori_ip_list))
        if not ori_ip_list:
            # BUGFIX: the original fell through to choice(ori_ip_list)
            # even when every source came back empty, raising IndexError
            # instead of falling back to the local ip.
            lg.info('正在使用本机ip抓取...')
            return {}
        proxies = {
            'http': 'http://{}'.format(choice(ori_ip_list)),
        }
        lg.info('正在使用代理 {} crawl...'.format(proxies['http']))

    return proxies or {}  # never None for callers
Exemplo n.º 3
0
    def _get_all_ip_proxy(self, _k=high_proxy_list_key_name) -> list:
        '''
        Fetch every proxy record stored under the redis key derived from _k.

        :param _k: raw key name (hashed via get_uuid3 before the lookup)
        :return: deserialized list of proxy entries ([] when key is absent)
        '''
        raw = self.redis_cli.get(name=get_uuid3(_k)) or dumps([])
        return deserializate_pickle_object(raw)
Exemplo n.º 4
0
def _write_into_redis(res):
    '''
    Append freshly collected proxies to the list stored in redis.

    :param res: list of newly collected proxy records
    :return: True
    '''
    # None from redis means the key does not exist yet -> start from [].
    stored = deserializate_pickle_object(redis_cli.get(_key) or dumps([]))
    merged = stored + res
    redis_cli.set(name=_key, value=dumps(merged))

    return True
Exemplo n.º 5
0
def main():
    """Top up the proxy pool, then async-check both pools, forever."""
    global time_str

    def _dedup_pool(key):
        # Load the pickled list stored under `key` (or []) and drop
        # entries that repeat the same 'ip'.
        return list_remove_repeat_dict(
            target=deserializate_pickle_object(
                redis_cli.get(key) or dumps([])),
            repeat_key='ip')

    while True:
        origin_proxy_data = _dedup_pool(_key)
        # Fill until the pool reaches the target size.
        while len(origin_proxy_data) < MAX_PROXY_NUM:
            print('\r' + _get_simulate_log_info() +
                  'Ip Pools --->>> 已存在proxy_num(匿名度未知): {}'.format(
                      len(origin_proxy_data)),
                  end='',
                  flush=True)
            get_proxy_process_data()
            origin_proxy_data = _dedup_pool(_key)

        # Pool is full: sleep, then re-validate the unknown-anonymity pool.
        print()
        lg.info('达标!休眠{}s...'.format(WAIT_TIME))
        sleep(WAIT_TIME)
        lg.info('Async Checking all_proxy(匿名度未知)...')
        origin_proxy_data = list_remove_repeat_dict(
            target=origin_proxy_data, repeat_key='ip')
        check_all_proxy(origin_proxy_data,
                        redis_key_name=_key,
                        delete_score=88)
        # Drop dead entries so only fresh high-anonymity proxies remain.
        high_origin_proxy_list = _dedup_pool(_h_key)
        lg.info('Async Checking hign_proxy(高匿名)状态...')
        check_all_proxy(high_origin_proxy_list,
                        redis_key_name=_h_key,
                        delete_score=MIN_SCORE)
Exemplo n.º 6
0
    def _get_random_ip_proxy(self) -> str:
        '''
        Pick one random proxy from the stored pool.

        :return: e.g. 'http://175.6.2.174:8088', or '' when the pool is empty
        '''
        pool = deserializate_pickle_object(
            self.redis_cli.get(name=self._k) or dumps([]))
        if pool == []:
            return ''

        picked = choice(pool)
        return 'http://{}:{}'.format(picked.get('ip'), picked.get('port'))
Exemplo n.º 7
0
    def _handle_tasks_result_list(**kwargs):
        '''
        Merge finished task results into the proxy list stored in redis.

        :param kwargs: expects 'all' -> list of per-task result lists
        :return: True
        '''
        task_results = kwargs.get('all', [])
        # None from redis means the key does not exist yet -> start from [].
        merged = deserializate_pickle_object(redis_cli.get(_key) or dumps([]))

        for one_result in task_results:
            if one_result != []:
                merged += one_result

        merged = list_remove_repeat_dict(target=merged, repeat_key='ip')
        # Convert items to plain dicts so unpickling never needs ProxyItem.
        merged = serialize_obj_item_2_dict(merged)
        redis_cli.set(name=_key, value=dumps(merged))

        return True
Exemplo n.º 8
0
def read_celery_tasks_result_info(celery_id_list: list) -> list:
    '''
    Read the results of the given celery tasks from redis.

    :param celery_id_list: list of AsyncResult-like objects (.id is used)
    :return: list of 'result' payloads for tasks whose status is 'SUCCESS'
    '''
    res = []
    for item in celery_id_list:
        # Celery stores each task's metadata under this key scheme.
        _k = 'celery-task-meta-' + str(item.id)
        # BUGFIX: redis returns None for a missing key, which the
        # deserializer cannot handle -> fall back to an empty meta dict,
        # matching the `or dumps(...)` guard used everywhere else here.
        result = deserializate_pickle_object(redis_cli.get(_k) or dumps({}))
        if result.get('status', '') == 'SUCCESS':
            res.append(result.get('result', []))
        else:
            lg.info('获取key值为{}失败!'.format(_k))

    return res
Exemplo n.º 9
0
def _get_proxies() -> dict:
    '''
    Pick one random proxy from the stored pool.

    :return: requests-style proxies dict, or {} (crawl with the local ip)
    '''
    raw = redis_cli.get(_key) or dumps([])
    proxy_list = deserializate_pickle_object(raw)
    if len(proxy_list) > 0:
        picked = choice(proxy_list)
        proxies = {
            'http': 'http://{}:{}'.format(picked['ip'], picked['port'])
        }
        lg.info('正在使用代理{}crawl...'.format(proxies['http']))
        return proxies

    lg.info('第一次抓取使用本机ip...')
    return {}
Exemplo n.º 10
0
 def write_hign_proxy_info_2_redis(one_proxy_info):
     '''
     Write a newly found high-anonymity proxy into redis.

     Skips the write when a proxy with the same ip is already stored;
     otherwise bumps its score by 5 and appends it to the stored list.

     :param one_proxy_info: dict-like proxy record (keys incl. 'ip', 'score')
     :return: None
     '''
     old_h_proxy_list = deserializate_pickle_object(
         redis_cli.get(name=_h_key) or dumps([]))
     old_ip_list = [i.get('ip') for i in old_h_proxy_list]
     if one_proxy_info.get('ip') not in old_ip_list:
         # BUGFIX: default to 0 so a record without a 'score' key no
         # longer raises TypeError (None + 5) and still gets the bonus.
         old_score = one_proxy_info.get('score') or 0
         one_proxy_info.update({  # reward the proxy for being new
             'score': old_score + 5,
         })
         old_h_proxy_list.append(one_proxy_info)
         # Convert items to plain dicts so unpickling never needs ProxyItem.
         old_h_proxy_list = serialize_obj_item_2_dict(old_h_proxy_list)
         redis_cli.set(name=_h_key, value=dumps(old_h_proxy_list))
     return None
Exemplo n.º 11
0
def _get_proxies() -> dict:
    '''
    Pick a random high-anonymity proxy (very likely to fail, be patient!).

    :return: requests-style proxies dict, or {} (crawl with the local ip)
    '''
    raw = redis_cli.get(_h_key) or dumps([])
    high_proxy_list = deserializate_pickle_object(raw)
    if len(high_proxy_list) > 0:
        picked = choice(high_proxy_list)
        proxies = {
            'http': 'http://{}:{}'.format(picked['ip'], picked['port'])
        }
        lg.info('正在使用代理 {} crawl...'.format(proxies['http']))
        return proxies

    lg.info('第一次抓取使用本机ip...')
    return {}
Exemplo n.º 12
0
    def _get_all_ip_proxy(self) -> list:
        '''Return every proxy record stored under self._k ([] when absent).'''
        raw = self.redis_cli.get(name=self._k) or dumps([])
        return deserializate_pickle_object(raw)
Exemplo n.º 13
0
    else:
        logger.error('get_goods_data得到的data为空dict!')
        return None

    return _


if __name__ == '__main__':
    # Demo run: dispatch one taobao-item crawl task, then read its
    # celery result back out of redis by hand.
    url = 'https://item.taobao.com/item.htm?id=534498954634'

    _r = get_tb_process_data(tb_object=tb, url=url)

    # logger.info(_r.get(timeout=2))
    _r.get(timeout=2)
    print('tasks的id: {0}, status: {1}'.format(_r.id, _r.status))

    # Fetch the task result back from redis.
    pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
    redis_cli = redis.StrictRedis(connection_pool=pool)
    # Celery stores each task's metadata under this key scheme.
    _k = 'celery-task-meta-' + str(_r)

    # Unpickle the serialized python object stored in redis.
    # NOTE(review): redis_cli.get(_k) may be None for a missing key;
    # no `or dumps(...)` guard here, unlike the other call sites — confirm.
    result = deserializate_pickle_object(redis_cli.get(_k))
    if result.get('status', '') == 'SUCCESS':
        result = result.get('result', '{}')
        print(result)

    else:
        print('获取失败!')
Exemplo n.º 14
0
# coding:utf-8
'''
@author = super_fazai
@File    : utils.py
@connect : [email protected]
'''

from fzutils.sql_utils import BaseRedisCli
from fzutils.safe_utils import get_uuid3
from fzutils.data.pickle_utils import deserializate_pickle_object
from fzutils.linux_utils import kill_process_by_name
from fzutils.time_utils import get_shanghai_time
from fzutils.common_utils import get_random_int_number
from fzutils.common_utils import retry
from pprint import pprint
from pickle import dumps
from time import sleep
from random import choice
from settings import high_proxy_list_key_name

# print(get_uuid3('proxy_tasks'))
# print(get_uuid3(high_proxy_list_key_name))
_ = BaseRedisCli()
# Debug aid: dump the pickled object stored under this fixed uuid3 key
# (falls back to an empty list when the key is absent).
pprint(
    deserializate_pickle_object(
        _.get('5e421d78-a394-3b44-aae1-fd86aa127255') or dumps([])))

# Kill any leftover celery worker processes.
kill_process_by_name(process_name='celery')