def write_message(max_try):
    """Publish the spider callback message to RabbitMQ.

    :param max_try: remaining delivery attempts
    :return: attempts left after this call
    """
    msg = ''  # defined up front so the except branch can always log it
    try:
        max_try -= 1
        msg = json.dumps({
            'qid': task.req_qid,
            'type': task.callback_type,
            'uid': task.req_uid,
            'query': json.dumps(query),
            'status': spider_status
        })
        credentials = pika.PlainCredentials(
            username=task.master_info['spider_mq_user'],
            password=task.master_info['spider_mq_passwd'])
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(
                host=task.master_info['spider_mq_host'],
                virtual_host=task.master_info['spider_mq_vhost'],
                credentials=credentials,
                # heartbeat_interval=0  (renamed to `heartbeat` in pika 1.x)
            ))
        channel = connection.channel()
        res = channel.basic_publish(
            exchange=task.master_info['spider_mq_exchange'],
            routing_key=task.master_info['spider_mq_routerKey'],
            properties=pika.BasicProperties(delivery_mode=2),  # persistent message
            body=msg,
        )
        connection.process_data_events()
        connection.close()
        if not res:
            warn_msg = 'RabbitMQ Result False: {0}'.format(msg)
            info = warn(str(task.req_qid), 'ex_RabbitMQ', get_local_ip(), warn_msg)
            logger.debug("\n" + info)
            raise Exception('RabbitMQ Result False')
        logger.debug(
            '[callback a verifytask done] qid:{}, source: {}, task_info: {}, status: {}'
            .format(str(task.req_qid), str(task.source), task.content, spider_status))
        return max_try
    except Exception:
        if max_try > 0:
            # retry with the already-decremented attempt count
            return write_message(max_try)
        warn_msg = 'RabbitMQ Result False qid : {}, e_info: {}, msg: {}'.format(
            task.req_qid, traceback.format_exc(), msg)
        info = warn(task.req_qid, 'ex_SpiderMQ', get_local_ip(), warn_msg)
        logger.exception("\n" + info)
        return max_try
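
# --- A minimal sketch of publishing with publisher confirms enabled, so the
# broker's acknowledgment can actually be checked (without confirms, pika gives
# no delivery result to inspect). Host, queue name, and payload are
# placeholders, not the real spider_mq_* configuration.
import json
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
channel.queue_declare(queue='spider.callback', durable=True)
channel.confirm_delivery()  # enable publisher confirms on this channel
try:
    channel.basic_publish(
        exchange='',                   # default exchange routes by queue name
        routing_key='spider.callback',
        properties=pika.BasicProperties(delivery_mode=2),  # persistent message
        body=json.dumps({'qid': 'demo', 'status': 0}),
        mandatory=True,                # unroutable messages raise instead of vanishing
    )
except (pika.exceptions.UnroutableError, pika.exceptions.NackError):
    # pika 1.x raises on a failed confirm; older pika returned a bool from
    # basic_publish, which is what the `if not res` check above relied on.
    print('publish was not confirmed by the broker')
finally:
    connection.close()
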
def run(self):
    while True:
        spider_task = g_task_queue.get(block=True)
        logger.info('coroutine pool size: {0}, idle: {1}'.format(
            g_co_pool.size, g_co_pool.free_count()))
        if g_co_pool.free_count() < 2:
            msg = "tasks backing up in coroutine pool: {}, idle: {}, queued: {}".format(
                g_co_pool.size, g_co_pool.free_count(), g_task_queue.qsize())
            logger.info(msg)
            callback.CallbackResult(task=spider_task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=spider_task.req_qid, type="ex1002",
                                     msg="spider queue is full"))
        else:
            g_co_pool.spawn(doTask, spider_task)
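
# --- A minimal runnable sketch of the pool/queue pattern above, with made-up
# sizes; gevent's Pool.free_count() reports how many more greenlets can be
# spawned before the pool would block, which is what the `< 2` check guards.
import gevent
from gevent.pool import Pool
from gevent.queue import Queue

pool = Pool(4)       # assumption: tiny pool for the demo
tasks = Queue(16)    # assumption: bounded task queue

def doTask(task):
    gevent.sleep(0.01)    # stand-in for real spider work
    print('done: {}'.format(task))

for i in range(8):
    tasks.put(i)

while not tasks.empty():
    task = tasks.get()
    if pool.free_count() < 2:
        print('pool nearly full, rejecting task {}'.format(task))
    else:
        pool.spawn(doTask, task)
pool.join()
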
def do_worker(task_info_list):
    '''
    1. Receive the web request.
    2. Validate and parse the tasks.
    3. One request may carry several tasks; add them to the worker one by one.
    4. Check the queue length when accepting tasks; if it is exceeded, return
       an error in the synchronous callback so the search side re-sends the
       task (in case load balancing is poor).
    '''
    bottle_r_time_0 = time.time()
    task_num = len(task_info_list)
    req_num = g_task_queue.qsize() + task_num  # currently only informational
    bottle_r_time_1 = time.time() - bottle_r_time_0
    for task in task_info_list:
        try:
            # A blocking put never raises Full, so the overflow branch below
            # could never fire; put_nowait makes it reachable.
            g_task_queue.put_nowait(task)
        except Exception:
            # task queue is full
            logger.debug(traceback.format_exc())
            callback.CallbackResult(task=task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=task.req_qid, type="ex1002", msg="spider queue is full"))
    bottle_r_time_2 = time.time() - bottle_r_time_0
    logger.info("bottle_run_time: parsing tasks: {}s, total: {}s".format(
        bottle_r_time_1, bottle_r_time_2))
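
# --- A minimal sketch of the overflow path: put_nowait raises Full (gevent
# re-exports the stdlib queue exception) once a bounded queue is at capacity,
# which is what lets the error callback fire instead of blocking the handler.
from gevent.queue import Queue, Full

q = Queue(maxsize=2)
for t in ['a', 'b', 'c']:
    try:
        q.put_nowait(t)
    except Full:
        print('queue full, rejecting task {}'.format(t))
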
def write_redis_ticket(self, task, result, error_code, extra):
    try:
        begin = time.time()
        params = (task.redis_host, task.redis_port, int(task.redis_db), task.redis_passwd)
        rds = self.get_redis_pool(params)
        conn_cost_time = round(time.time() - begin, 3)
        # Temporary workaround until the search side adds compatibility:
        # force-convert the Google API response format.
        if task.source == "Realtraffic" and result:
            result = json.dumps(result[0])
        result = {"err_code": error_code, "data": result, "extra": extra}
        begin = time.time()
        # Legacy redis-py argument order: setex(name, value, time).
        # Authenticated requests get a 30-minute TTL, others 10 minutes.
        if task.ticket_info.get("auth"):
            rds.setex(task.redis_key, json.dumps(result), 1800)
        else:
            rds.setex(task.redis_key, json.dumps(result), 600)
        write_cost_time = round(time.time() - begin, 3)
        return conn_cost_time, write_cost_time
    except Exception as e:
        warn_msg = 'redis_host:' + task.redis_host + ' ' + str(e)
        info = warn(task.req_qid, 'ex_SpiderRedis', msg=warn_msg)
        logger.exception("\n" + info)
        return str(e)
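
# --- A hedged sketch of what get_redis_pool is assumed to do: cache one
# ConnectionPool per (host, port, db, password) tuple so writes reuse
# connections. Note the argument order: redis-py 3+ expects
# setex(name, time, value); the value-before-time order in the method above
# matches the legacy redis.Redis client.
import json
import redis

_pools = {}

def get_redis_pool(params):
    host, port, db, passwd = params
    if params not in _pools:
        _pools[params] = redis.ConnectionPool(host=host, port=port,
                                              db=db, password=passwd)
    return redis.Redis(connection_pool=_pools[params])

rds = get_redis_pool(('localhost', 6379, 0, None))  # placeholder params
rds.setex('demo_key', 600, json.dumps({'err_code': 0, 'data': None}))  # 10-min TTL
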
def get_proxy(
        source=None,
        allow_ports=[],
        forbid_ports=[],
        allow_regions=[],
        forbid_regions=[],
        user='******',
        passwd='realtime',
        proxy_info={},
        verify_info="verify",
        ip_num=1,
        ip_type="internal",
        task=Task(),
):
    """Fetch a proxy from the proxy service (every task needs one for now)."""
    qid = str(task.ticket_info.get('qid', int(time.time() * 1000)))
    msg = {
        "req": [{
            "source": source,
            "type": verify_info,
            "num": ip_num,
            "ip_type": ip_type,
        }]
    }
    msg = json.dumps(msg)
    ptid = task.ticket_info.get('ptid', "")
    time_st = time.time()
    get_info = '/?type=px001&qid={0}&query={1}&ptid={2}&tid=tid&ccy=AUD'.format(
        qid, msg, ptid)
    logger.info("get proxy info: http://{1}{0}".format(get_info, g_config.proxy_host))
    count = 1
    while 1:
        try:
            p = requests.get("http://{0}".format(g_config.proxy_host) + get_info,
                             timeout=(6, 6), stream=False)
            p_time = p.elapsed.total_seconds()
            p = p.content
            logger.info("proxy service returned: {0}".format(p))
            proxy_ip = json.loads(p)['resp'][0]['ips'][0]['inner_ip']
            break
        except Exception:
            exstr = traceback.format_exc()
            msg = 'error while requesting a proxy: ' + exstr
            info = warn(qid, 'ex_GetProxyFail', ip, msg)
            logger.debug("\n" + info)
            if count == 3:
                raise parser_except.ParserException(21, "failed to fetch a proxy")
            time.sleep(3)
            logger.debug("proxy fetch failed, retry #{}".format(count))
            count += 1
    time_end = time.time() - time_st
    # The proxy service occasionally returns a proxy that is just ":"!
    if len(proxy_ip) < 9:
        msg = "the proxy returned is unusable!"
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "invalid proxy returned: {}".format(p))
    if not proxy_ip:
        msg = 'no proxy returned; request was: ' + get_info
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "no proxy returned")
    if p_time > 1.5:
        msg = 'proxy fetched but slow; total: {0}s, requests elapsed: {1}s'.format(
            time_end, p_time)
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
    p = [proxy_ip, [p, time_end, get_info]]
    return p
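
# --- A minimal sketch of the response shape the retry loop above parses; the
# JSON layout is inferred from json.loads(p)['resp'][0]['ips'][0]['inner_ip'],
# and the sanity check mirrors the len(proxy_ip) < 9 guard that rejects
# degenerate values such as a bare ":".
import json

raw = '{"resp": [{"ips": [{"inner_ip": "10.0.0.1:8888"}]}]}'  # assumed payload
proxy_ip = json.loads(raw)['resp'][0]['ips'][0]['inner_ip']
if not proxy_ip or len(proxy_ip) < 9:
    raise ValueError('proxy service returned an unusable proxy: {!r}'.format(proxy_ip))
print('using proxy {}'.format(proxy_ip))
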