def _handle_tasks_result_list(**kwargs):
    all_res = kwargs.get('all', [])
    old = ip_pools_obj._get_all_ip_proxy(_k=proxy_list_key_name)
    for res_content in all_res:
        if res_content != []:
            try:
                for j in res_content:
                    old.append(j)
            except TypeError:
                # Handles "'NotRegistered' object is not iterable": the task
                # 'proxy_tasks._get_proxy' was never registered (make sure it
                # is imported), so skip this result
                continue

    old = list_remove_repeat_dict(target=old, repeat_key='ip')
    # Convert to dicts so ProxyItem does not need to be importable when deserializing
    old = serialize_obj_item_2_dict(old)
    redis_cli.set(name=_key, value=dumps(old))
    return True
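# NOTE: list_remove_repeat_dict is imported from elsewhere in the project. As a
# reading aid, here is a minimal sketch of the behavior this module relies on
# (dedupe a list of dicts on one key, first occurrence wins). The name
# _list_remove_repeat_dict_sketch and the exact semantics are assumptions, not
# the project's actual implementation.
def _list_remove_repeat_dict_sketch(target: list, repeat_key: str) -> list:
    seen = set()
    unique = []
    for item in target:
        # Items may be plain dicts or ProxyItem-like objects
        k = item.get(repeat_key) if isinstance(item, dict) else getattr(item, repeat_key, None)
        if k not in seen:
            seen.add(k)
            unique.append(item)
    return unique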
def main():
    global time_str, local_ip

    _welcome()
    print('Getting local ip...')
    local_ip = get_local_external_network_ip()
    assert local_ip != '', 'Failed to get local_ip!'
    print('[+] local ip: {}'.format(local_ip))

    while True:
        origin_proxy_data = list_remove_repeat_dict(
            target=ip_pools_obj._get_all_ip_proxy(_k=proxy_list_key_name),
            repeat_key='ip')
        while len(origin_proxy_data) < MAX_PROXY_NUM:
            print('\r' + _get_simulate_logger() +
                  'Ip Pools --->>> existing proxy_num (anonymity unknown): {}'.format(
                      len(origin_proxy_data)),
                  end='', flush=True)
            get_proxy_process_data()
            # Refresh
            origin_proxy_data = list_remove_repeat_dict(
                target=ip_pools_obj._get_all_ip_proxy(_k=proxy_list_key_name),
                repeat_key='ip')
        print()
        lg.info('Quota reached! Sleeping for {}s...'.format(WAIT_TIME))
        sleep(WAIT_TIME)

        lg.info('Async checking all_proxy (anonymity unknown)...')
        origin_proxy_data = list_remove_repeat_dict(
            target=origin_proxy_data,
            repeat_key='ip')
        check_all_proxy(origin_proxy_data, redis_key_name=_key, delete_score=90)

        # Drop dead proxies so the pool always holds the latest usable
        # high-anonymity proxies
        high_origin_proxy_list = list_remove_repeat_dict(
            target=ip_pools_obj._get_all_ip_proxy(_k=high_proxy_list_key_name),
            repeat_key='ip')
        lg.info('Async checking high_proxy (high anonymity) status...')
        check_all_proxy(high_origin_proxy_list, redis_key_name=_h_key,
                        delete_score=MIN_SCORE)
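# NOTE: get_local_external_network_ip is defined elsewhere. A minimal sketch of
# the idea, assuming a plain ip-echo endpoint (httpbin.org/ip is used purely for
# illustration and is not necessarily what the project queries):
import requests

def _get_local_external_network_ip_sketch() -> str:
    try:
        return requests.get('https://httpbin.org/ip', timeout=10).json().get('origin', '')
    except Exception:
        # main() asserts on '' and aborts, so return '' on any failure
        return ''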
def main():
    global time_str

    while True:
        origin_proxy_data = list_remove_repeat_dict(
            target=deserializate_pickle_object(redis_cli.get(_key) or dumps([])),
            repeat_key='ip')
        while len(origin_proxy_data) < MAX_PROXY_NUM:
            print('\r' + _get_simulate_log_info() +
                  'Ip Pools --->>> existing proxy_num (anonymity unknown): {}'.format(
                      len(origin_proxy_data)),
                  end='', flush=True)
            get_proxy_process_data()
            # Refresh
            origin_proxy_data = list_remove_repeat_dict(
                target=deserializate_pickle_object(redis_cli.get(_key) or dumps([])),
                repeat_key='ip')
        print()
        lg.info('Quota reached! Sleeping for {}s...'.format(WAIT_TIME))
        sleep(WAIT_TIME)

        lg.info('Async checking all_proxy (anonymity unknown)...')
        origin_proxy_data = list_remove_repeat_dict(
            target=origin_proxy_data,
            repeat_key='ip')
        check_all_proxy(origin_proxy_data, redis_key_name=_key, delete_score=88)

        # Drop dead proxies so the pool always holds the latest usable
        # high-anonymity proxies
        high_origin_proxy_list = list_remove_repeat_dict(
            target=deserializate_pickle_object(redis_cli.get(_h_key) or dumps([])),
            repeat_key='ip')
        lg.info('Async checking high_proxy (high anonymity) status...')
        check_all_proxy(high_origin_proxy_list, redis_key_name=_h_key,
                        delete_score=MIN_SCORE)
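# NOTE: deserializate_pickle_object and dumps are imported from elsewhere; judging
# by the names they presumably wrap the pickle module. A minimal sketch under that
# assumption:
import pickle

def _deserializate_pickle_object_sketch(serialized: bytes) -> list:
    # redis_cli.get(_key) returns bytes, or None (handled above via "or dumps([])")
    return pickle.loads(serialized)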
def _handle_tasks_result_list(**kwargs):
    all_res = kwargs.get('all', [])
    old = ip_pools_obj._get_all_ip_proxy(_k=proxy_list_key_name)
    for res_content in all_res:
        if res_content != []:
            old += res_content

    old = list_remove_repeat_dict(target=old, repeat_key='ip')
    # Convert to dicts so ProxyItem does not need to be importable when deserializing
    old = serialize_obj_item_2_dict(old)
    redis_cli.set(name=_key, value=dumps(old))
    return True
def _handle_tasks_result_list(**kwargs):
    all_res = kwargs.get('all', [])
    # When get returns None, fall back to an empty (serialized) list
    origin_data = redis_cli.get(_key) or dumps([])
    old = deserializate_pickle_object(origin_data)
    for res_content in all_res:
        if res_content != []:
            old += res_content

    old = list_remove_repeat_dict(target=old, repeat_key='ip')
    # Convert to dicts so ProxyItem does not need to be importable when deserializing
    old = serialize_obj_item_2_dict(old)
    redis_cli.set(name=_key, value=dumps(old))
    return True
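# NOTE: serialize_obj_item_2_dict is defined elsewhere; its job, per the comments
# above, is to turn ProxyItem objects into plain dicts so a later deserialization
# does not need the ProxyItem class importable. A minimal sketch, assuming
# ProxyItem behaves like a mapping (dict(item) works on scrapy-style Items; this
# is an assumption, not the project's implementation):
def _serialize_obj_item_2_dict_sketch(items: list) -> list:
    return [i if isinstance(i, dict) else dict(i) for i in items]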
async def _get_ip_proxy_list(self, ip_num=200) -> list:
    '''
    Fetch a batch of proxies
    :return:
    '''
    # http://webapi.http.zhimacangku.com/getip?num=200&type=2&pro=&city=0&yys=0&port=1&time=1&ts=1&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=
    params = (
        ('num', str(ip_num)),   # number of ips to extract
        ('type', '2'),          # data format: 1: TXT, 2: JSON, 3: html
        ('pro', ''),            # province, defaults to nationwide
        ('city', '0'),          # city, defaults to nationwide
        ('yys', '0'),           # carrier: 0: any, 100026: China Unicom, 100017: China Telecom
        ('port', '1'),          # protocol: 1: HTTP, 2: SOCK5, 11: HTTPS
        ('time', '1'),          # stable duration; 1 is the minimum and yields the most ips
        ('ts', '1'),            # show ip expire time: 1: show, 2: hide
        ('ys', '0'),            # show ip carrier: 1: show
        ('cs', '0'),            # show location: 1: show
        ('lb', '1'),            # separator (1: \r\n, 2: /br, 3: \r, 4: \n, 5: \t, 6: custom)
        ('sb', '0'),
        ('pb', '4'),            # port digits (4: 4-digit port, 5: 5-digit port)
        ('mr', '1'),            # dedup (1: 360-day dedup, 2: same-day dedup, 3: no dedup)
        ('regions', ''),        # nationwide mixed-dial regions
    )
    url = 'http://webapi.http.zhimacangku.com/getip'
    ori = json_2_dict(await self._request(
        url=url,
        headers=await self._get_phone_headers(),
        params=params))
    data = ori.get('data', [])
    if data != []:
        self.ip_list = await self._delete_expire_time_ip(data=data)
        self.ip_list = list_remove_repeat_dict(target=self.ip_list, repeat_key='ip')
        # Serialize first, then store in redis
        self.redis_cli.set(name=self._k, value=dumps(self.ip_list))

    msg = ori.get('msg', '')
    if '设置为白名单' in msg:   # the API asks for this ip to be whitelisted
        try:
            _ip = re.compile(r'(\d+\.\d+\.\d+\.\d+)').findall(msg)[0]
            await self._add_local_ip_to_white_list(local_ip=_ip)
            print('{} has been added to the whitelist!'.format(_ip))
        except IndexError:
            pass

    return data
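# NOTE: _delete_expire_time_ip is implemented elsewhere. Since ('ts', '1') makes
# the API return an expire time per ip, the method presumably drops entries that
# have already expired. A minimal sketch, assuming each item carries an
# 'expire_time' field formatted as '%Y-%m-%d %H:%M:%S' (both assumptions, not
# verified against the project):
from datetime import datetime

async def _delete_expire_time_ip_sketch(data: list) -> list:
    now = datetime.now()
    return [
        i for i in data
        if datetime.strptime(i.get('expire_time', '1970-01-01 00:00:00'),
                             '%Y-%m-%d %H:%M:%S') > now
    ]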
def check_all_proxy(origin_proxy_data, redis_key_name, delete_score):
    '''
    Check the status of every fetched proxy
    :param origin_proxy_data:
    :param redis_key_name: the redis key to process
    :param delete_score: score at or below which a proxy is dropped
    :return:
    '''
    def _create_tasks_list(origin_proxy_data):
        '''Build the task set'''
        nonlocal delete_score

        results = []
        for proxy_info in origin_proxy_data:
            last_check_time = proxy_info['last_check_time']
            ip = proxy_info['ip']
            port = proxy_info['port']
            score = proxy_info['score']
            if score <= delete_score:
                # Marked for deletion, skip
                continue

            proxy = ip + ':' + str(port)
            async_obj = check_proxy_status.apply_async(args=[proxy, local_ip])
            results.append({
                'proxy_info': proxy_info,
                'async_obj': async_obj,
            })

        return results

    def _get_tasks_result_list(results):
        '''Collect the result set'''
        def write_high_proxy_info_2_redis(one_proxy_info):
            '''Write a newly found high-anonymity ip to redis'''
            old_h_proxy_list = ip_pools_obj._get_all_ip_proxy(
                _k=high_proxy_list_key_name)
            old_ip_list = [i.get('ip') for i in old_h_proxy_list]
            if one_proxy_info.get('ip') not in old_ip_list:
                old_score = one_proxy_info.get('score')
                # Bonus points for a working high-anonymity proxy
                one_proxy_info.update({
                    'score': old_score + 5,
                })
                old_h_proxy_list.append(one_proxy_info)
                # Convert to dicts so ProxyItem does not need to be importable
                # when deserializing
                old_h_proxy_list = serialize_obj_item_2_dict(old_h_proxy_list)
                redis_cli.set(name=_h_key, value=dumps(old_h_proxy_list))

            return None

        all_res = []
        success_num = 1
        available_num = 0
        results_len = len(results)
        while len(results) > 0:
            for r_index, r in enumerate(results):
                one_proxy_info = r.get('proxy_info', {})
                if r.get('async_obj').ready():
                    async_res = False
                    try:
                        # propagate=False keeps task exceptions from being re-raised;
                        # r.get('async_obj').traceback holds the full traceback.
                        # TimeoutError is celery.exceptions.TimeoutError, imported elsewhere.
                        async_res = r.get('async_obj').get(timeout=2, propagate=False)
                    except TimeoutError:
                        pass
                    if async_res:
                        available_num += 1
                        # Record the high-anonymity ip in redis
                        write_high_proxy_info_2_redis(one_proxy_info)
                    all_res.append({
                        'async_res': async_res,
                        'proxy_info': one_proxy_info,
                    })
                    # Dynamic output; '\r' returns to the start of the line
                    print('\r' + _get_simulate_logger() +
                          'checked ips: {}, remaining: {}, usable high-anonymity: {}'.format(
                              success_num, results_len - success_num, available_num),
                          end='', flush=True)
                    success_num += 1
                    try:
                        # Popping while enumerating skips elements, but the outer
                        # while loop re-scans until the list is empty
                        results.pop(r_index)
                    except IndexError:
                        pass
                # else: task not finished yet; check again on the next pass

        # All async results collected
        print()
        print('\r', end='', flush=True)

        return all_res

    def _handle_tasks_result_list(all_res):
        '''Process the result set'''
        def on_success(res, proxy_info):
            '''Callback: adjust the score and refresh the check timestamp'''
            score = proxy_info.get('score')
            if not res:
                # Failed check: deduct points
                proxy_info.update({
                    'score': score - 2,
                })
            proxy_info.update({
                'last_check_time': str(get_shanghai_time()),
            })

            return proxy_info

        new_proxy_data = []
        for item in all_res:
            new_proxy_info = on_success(res=item.get('async_res'),
                                        proxy_info=item.get('proxy_info'))
            new_proxy_data.append(new_proxy_info)

        return new_proxy_data

    global time_str

    results = _create_tasks_list(origin_proxy_data)
    sleep(.8)
    all_res = _get_tasks_result_list(results)
    # Deduplicate and store the latest data
    new_proxy_data = list_remove_repeat_dict(
        target=_handle_tasks_result_list(all_res),
        repeat_key='ip',
    )
    new_proxy_data = serialize_obj_item_2_dict(new_proxy_data)
    redis_cli.set(name=redis_key_name, value=dumps(new_proxy_data))

    return True
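# NOTE: check_proxy_status is a celery task defined elsewhere (called above with
# args=[proxy, local_ip]). A minimal sketch of the idea, assuming an httpbin-style
# echo endpoint: the proxy counts as usable and high-anonymity only when the
# echoed origin does not leak the caller's real ip. The endpoint, task wiring, and
# exact criteria here are illustrative assumptions, not the project's implementation.
import requests

# @app.task  # hypothetical celery app; would be registered under proxy_tasks
def _check_proxy_status_sketch(proxy: str, local_ip: str) -> bool:
    try:
        r = requests.get('https://httpbin.org/get',
                         proxies={'http': 'http://' + proxy,
                                  'https': 'http://' + proxy},
                         timeout=10)
        origin = r.json().get('origin', '')
        # High anonymity: the target must see the proxy's ip, never local_ip
        return origin != '' and local_ip not in origin
    except Exception:
        return False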