示例#1
0
        def write_message(max_try):
            """
            Publish the task callback message to RabbitMQ, retrying on failure.

            Builds a JSON payload from ``task``, ``query`` and ``spider_status``
            (taken from the enclosing scope), opens a blocking pika connection
            with the settings in ``task.master_info`` and publishes the payload
            with ``delivery_mode=2``. Any exception, including a falsy publish
            result, triggers a recursive retry while attempts remain.

            :param max_try: number of attempts still allowed, this one included
            :return: attempts left once the call chain either succeeded or gave up
            """
            # Bound before the try so the failure report can always format them,
            # even when the error happens before they would have been assigned.
            msg = None
            connection = None
            try:
                max_try -= 1
                msg = json.dumps({
                    'qid': task.req_qid,
                    'type': task.callback_type,
                    'uid': task.req_uid,
                    'query': json.dumps(query),
                    'status': spider_status
                })
                credentials = pika.PlainCredentials(
                    username=task.master_info['spider_mq_user'],
                    password=task.master_info['spider_mq_passwd'])
                connection = pika.BlockingConnection(
                    pika.ConnectionParameters(
                        host=task.master_info['spider_mq_host'],
                        virtual_host=task.master_info['spider_mq_vhost'],
                        credentials=credentials,
                    ))
                channel = connection.channel()

                # NOTE(review): the falsy-result check below relies on
                # basic_publish returning a truthy value on success; whether it
                # does depends on the installed pika version - confirm.
                res = channel.basic_publish(
                    exchange=task.master_info['spider_mq_exchange'],
                    routing_key=task.master_info['spider_mq_routerKey'],
                    properties=pika.BasicProperties(delivery_mode=2),
                    body=msg,
                )
                connection.process_data_events()

                connection.close()
                # Closed cleanly: nothing left for the error handler to release.
                connection = None
                if not res:
                    warn_msg = 'RabbitMQ Result False: {0}'.format(msg)
                    info = warn(str(task.req_qid), 'ex_RabbitMQ',
                                get_local_ip(), warn_msg)
                    logger.debug("\n" + info)
                    raise Exception('RabbitMQ Result False')
                logger.debug(
                    '[callback a verifytask done] qid:{}, source: {}, task_info: {}, status: {}'
                    .format(str(task.req_qid), str(task.source), task.content,
                            spider_status))
                return max_try
            except Exception:
                # Capture the traceback first so a failure during cleanup cannot
                # replace the error that gets reported.
                err_trace = traceback.format_exc()
                if connection is not None:
                    # The attempt failed with the connection still open; release
                    # it so retries do not pile up open connections.
                    try:
                        connection.close()
                    except Exception:
                        # Best-effort cleanup: the original failure is what matters.
                        pass
                if max_try > 0:
                    return write_message(max_try)
                warn_msg = 'RabbitMQ Result False qid : {}, e_info: {}, msg: {}'.format(
                    task.req_qid, err_trace, msg)
                info = warn(task.req_qid, 'ex_SpiderMQ', get_local_ip(),
                            warn_msg)
                logger.exception("\n" + info)
                return max_try
示例#2
0
文件: slave.py 项目: gitPff/sb-
 def run(self):
     """
     Consume spider tasks from the global queue forever.

     Each task is taken from ``g_task_queue`` (blocking). When the coroutine
     pool has fewer than two free slots the task is rejected through the
     callback with error code 98; otherwise it is handed to ``doTask`` in the
     pool.
     """
     while True:
         spider_task = g_task_queue.get(block=True)
         logger.info('协程池大小: {0} 协程池空闲: {1}'.format(g_co_pool.size, g_co_pool.free_count()))
         if g_co_pool.free_count() < 2:
             msg = "协程池中任务堆积:{} 空闲池:{} 任务池:{}".format(g_co_pool.size, g_co_pool.free_count(),
                                                        g_task_queue.qsize())
             # The backlog summary used to be built and then dropped (a bare
             # ``print`` emitted nothing useful); log it so rejections can be traced.
             logger.info(msg)
             callback.CallbackResult(task=spider_task, error_code=98, result_type="END")
             logger.debug("\n" + warn(qid=spider_task.req_qid, type="ex1002", msg="爬虫队列满了"))
         else:
             g_co_pool.spawn(doTask, spider_task)
示例#3
0
文件: slave.py 项目: gitPff/sb-
def do_worker(task_info_list):
    """
    Put every task of one web request onto the global spider task queue.

    Intended flow (translated from the original notes):
    1. Receive the web request.
    2. Check and parse the tasks.
    3. One request may carry several tasks; add them to the worker one by one.
    4. Check the task-queue length on receipt; if it is exceeded, return the
       error through the synchronous callback so the search side resends the
       task (assuming load balancing is poor).

    :param task_info_list: iterable of task objects to enqueue
    """
    bottle_r_time_0 = time.time()
    bottle_r_time_1 = time.time() - bottle_r_time_0
    for task in task_info_list:
        try:
            g_task_queue.put(task)
        except Exception:
            # Enqueue failed (treated as "task queue is full"): record the real
            # error, then tell the caller through the callback.
            logger.debug("\n" + traceback.format_exc())
            callback.CallbackResult(task=task, error_code=98, result_type="END")
            logger.debug("\n" + warn(qid=task.req_qid, type="ex1002", msg="爬虫队列满了"))
    bottle_r_time_2 = time.time() - bottle_r_time_0
    logger.info("bottle_run_time: 解析task: {}秒,总耗时:{}秒".format(bottle_r_time_1, bottle_r_time_2))
示例#4
0
 def write_redis_ticket(self, task, result, error_code, extra):
     """
     Store a task result in Redis under ``task.redis_key`` with an expiry.

     The stored value is the JSON encoding of
     ``{"err_code": error_code, "data": result, "extra": extra}``. The expiry
     is 1800 seconds when ``task.ticket_info`` has a truthy ``"auth"`` entry,
     otherwise 600 seconds.

     :param task: task object providing the redis connection settings,
         ``redis_key``, ``source``, ``ticket_info`` and ``req_qid``
     :param result: result data to store
     :param error_code: error code stored alongside the data
     :param extra: extra information stored alongside the data
     :return: ``(conn_cost_time, write_cost_time)`` in seconds, rounded to
         three decimals, on success; the error text (``str``) on failure
     """
     try:
         begin = time.time()
         params = (task.redis_host, task.redis_port, int(task.redis_db),
                   task.redis_passwd)
         rds = self.get_redis_pool(params)
         conn_cost_time = round(time.time() - begin, 3)
         # Temporary compromise until the search side is made compatible:
         # force-convert the format returned by the Google API.
         if task.source == "Realtraffic" and result:
             result = json.dumps(result[0])
         result = {"err_code": error_code, "data": result, "extra": extra}
         # Tickets carrying auth info are kept longer.
         expire_seconds = 1800 if task.ticket_info.get("auth") else 600
         begin = time.time()
         # NOTE(review): the (key, value, seconds) argument order is kept from
         # the original; confirm it matches the client that get_redis_pool
         # returns, since setex argument order differs between redis clients.
         rds.setex(task.redis_key, json.dumps(result), expire_seconds)
         write_cost_time = round(time.time() - begin, 3)
         return conn_cost_time, write_cost_time
     except Exception as e:
         err_text = str(e)
         # format() instead of "+" so a missing or non-string redis_host cannot
         # raise inside the handler and hide the original error.
         warn_msg = 'redis_host:{0} {1}'.format(
             getattr(task, 'redis_host', None), err_text)
         info = warn(task.req_qid, 'ex_SpiderRedis', msg=warn_msg)
         logger.exception("\n" + info)
         return err_text
示例#5
0
文件: common.py 项目: gitPff/sb-
def get_proxy(
        source=None,
        allow_ports=[],
        forbid_ports=[],
        allow_regions=[],
        forbid_regions=[],
        user='******',
        passwd='realtime',
        proxy_info={},
        verify_info="verify",
        ip_num=1,
        ip_type="internal",
        task=Task(),
):
    """
    Request a proxy address from the proxy service at ``g_config.proxy_host``.

    (Original note: for now every request needs to fetch a proxy.)

    The request is attempted up to three times with a 3 second pause between
    attempts. ``allow_ports``, ``forbid_ports``, ``allow_regions``,
    ``forbid_regions``, ``user``, ``passwd`` and ``proxy_info`` are accepted
    but not used by this function.

    :param source: source name sent to the proxy service
    :param verify_info: value sent as the request ``type``
    :param ip_num: number of addresses requested
    :param ip_type: address type requested
    :param task: task whose ``ticket_info`` supplies ``qid`` and ``ptid``
    :return: ``[proxy_ip, [raw_response_body, total_seconds, request_path]]``
    :raises parser_except.ParserException: code 21 when the request keeps
        failing or the returned proxy is empty or malformed
    """

    qid = str(task.ticket_info.get('qid', int(time.time() * 1000)))

    query = json.dumps({
        "req": [{
            "source": source,
            "type": verify_info,
            "num": ip_num,
            "ip_type": ip_type,
        }]
    })
    ptid = task.ticket_info.get('ptid', "")
    time_st = time.time()
    get_info = '/?type=px001&qid={0}&query={1}&ptid={2}&tid=tid&ccy=AUD'.format(
        qid, query, ptid)
    logger.info("get proxy info :http://{1}{0}".format(get_info,
                                                       g_config.proxy_host))
    # NOTE(review): `ip` used in the warn() calls below is not defined in this
    # function; presumably a module-level value - TODO confirm.
    count = 1
    while True:
        try:
            p = requests.get("http://{0}".format(g_config.proxy_host) +
                             get_info,
                             timeout=(6, 6),
                             stream=False)
            p_time = p.elapsed.total_seconds()
            p = p.content
            logger.info("代理返回内容为{0}".format(p))
            proxy_ip = json.loads(p)['resp'][0]['ips'][0]['inner_ip']
            break
        except Exception:
            # Exception (not a bare except) so interpreter-exit signals such as
            # KeyboardInterrupt are not swallowed by the retry loop.
            exstr = traceback.format_exc()
            msg = '取代理请求时报错,错误信息为:' + exstr
            info = warn(qid, 'ex_GetProxyFail', ip, msg)
            logger.debug("\n" + info)
            if count == 3:
                raise parser_except.ParserException(21, "取代理时失败")
            time.sleep(3)
            logger.debug("取代理失败,进行第{}次重试,".format(count))
            count += 1
    time_end = time.time() - time_st
    # Check for a missing proxy first: an empty value would otherwise always
    # be caught by the length check and None would crash in len().
    if not proxy_ip:
        msg = '未获取到代理,请求信息为:' + get_info
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "未获取到代理")
    # The proxy service sometimes returns a proxy that is just ":".
    if len(proxy_ip) < 9:
        msg = "获取到的代理不可用!"
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "获取到的代理有误:{}".format(p))
    if p_time > 1.5:
        # Slow but successful fetch: report it without failing.
        msg = '获取代理成功耗时, 耗时:{0}, requests 记录超时时间:{1}'.format(time_end, p_time)
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
    p = [proxy_ip, [p, time_end, get_info]]
    return p