예제 #1
0
def k8s_project_update():
    """Handle an API request that schedules a container image update.

    The JSON body must contain ``project``, ``version``, ``access_token``
    and ``context``; ``replicas`` is optional.  When the token matches a
    non-expired row of ``db_op.platform_token`` the update is handed to
    ``k8s_resource.object_update`` through ``produce.SchedulerPublish``.

    Any validation failure or exception is reported through
    ``tools.dingding_msg`` using the ``OPS_TOKEN`` app setting.

    Returns:
        ``jsonify({'result': 'ok'})`` whether or not the update was
        scheduled; problems are only reported via the notification.
    """
    db_token = db_op.platform_token
    params = request.get_json()
    required = ('project', 'version', 'access_token', 'context')
    new_replicas = None
    msg = None
    project = None
    version = None
    try:
        if not params:
            msg = '传递参数不能为空!'
        elif not all(key in params for key in required):
            msg = '传递参数错误!'
        else:
            token = params['access_token']
            project = params['project']
            version = params['version']
            context = params['context']
            new_image = "%s/%s:%s" % (docker_registry[context],
                                      project.split('.')[0], version)
            if 'replicas' in params:
                new_replicas = params['replicas']
            # Check that the token exists and has not expired yet.
            today = time.strftime('%Y-%m-%d', time.localtime())
            vals = db_token.query.filter(
                and_(db_token.token == token,
                     db_token.expire > today)).all()
            if vals:
                redis_key = 'op_k8s_update_%s' % time.strftime(
                    '%Y%m%d%H%M%S', time.localtime())
                publisher = produce.SchedulerPublish()
                job = publisher.Scheduler_mem(
                    k8s_resource.object_update,
                    [context, new_image, new_replicas, version,
                     redis_key, 'api'])
                job.start()
            else:
                msg = '授权验证不通过!'
    except Exception as e:
        # Keep the exception object itself: it is only interpolated into
        # the notification text below.
        msg = e
    # The notification and the return live outside ``finally`` on purpose:
    # a ``return`` inside ``finally`` would silently discard exceptions
    # that ``except Exception`` does not catch (e.g. KeyboardInterrupt).
    if msg:
        ops_token = app.config.get('OPS_TOKEN')
        text = [
            '**容器平台自动上线:**',
            "项目:%s" % project,
            "版本:%s" % version,
            "Error:%s" % msg, '**请确认请求正确!**'
        ]
        tools.dingding_msg(text, ops_token)
    return jsonify({'result': 'ok'})
예제 #2
0
파일: Task2.py 프로젝트: smallc2009/sparrow
def k8s_health_check():
    """Check node readiness and endpoint reachability for every context.

    For each entry of ``contexts`` the kube config is loaded and two
    independent checks run, each inside its own ``try`` so that one
    failing check does not prevent the other:

    * nodes: an alarm is sent through ``tools.dingding_msg`` for every
      node whose ``Ready`` condition is missing or not ``'True'``;
    * endpoints of the ``default`` namespace: every address starting with
      ``172.16.`` or ``10.10.`` is probed with ``tcpping`` on the first
      port of its subset, and an alarm is sent when the probe fails.
    """
    for context in contexts:
        config.load_kube_config(config_file, context=context)
        v1 = client.CoreV1Api()
        try:
            # Node health check.
            ret = v1.list_node(watch=False)
            for node in ret.items:
                if 'node-role.kubernetes.io/master' in (node.metadata.labels or {}):
                    node_type = 'master'
                else:
                    node_type = 'node'
                # A node is healthy when the condition of *type* 'Ready'
                # has *status* 'True'.  Comparing the last condition's
                # type to 'Ready' (as this code used to) stays silent
                # for a NotReady node, whose Ready condition simply
                # carries status 'False' or 'Unknown'.
                ready = next((cond for cond in (node.status.conditions or [])
                              if cond.type == 'Ready'), None)
                if ready is not None and ready.status == 'True':
                    continue
                if ready is None or ready.status == 'False':
                    status = 'NotReady'
                else:
                    status = ready.status
                text = ['**容器平台NODE报警:%s**' % node.metadata.name,
                        '节点类型:%s' % node_type,
                        '节点状态:%s' % status,
                        '需及时处理!']
                tools.dingding_msg(text, token=ops_token)
        except Exception as e:
            logging.error(e)
        try:
            # Endpoints health check.
            ret = v1.list_namespaced_endpoints('default')
            for endpoint in ret.items:
                # ``subsets`` / ``addresses`` may be None; treat that as
                # "nothing to probe" instead of relying on a bare except.
                for subset in endpoint.subsets or []:
                    for address in subset.addresses or []:
                        try:
                            ip_header = '.'.join(str(address.ip).split('.')[:2])
                            if '{}.'.format(ip_header) in ('172.16.', '10.10.'):
                                port = subset.ports[0].port
                                if not tcpping(host=address.ip, port=port, timeout=5):
                                    text = ['**容器平台endpoints报警:**', 'IP:%s' % address.ip,
                                            '服务端口:%s' % port, '服务端口不可用,需及时处理!']
                                    tools.dingding_msg(text)
                        except Exception as e:
                            # Best effort: one broken address must not
                            # stop the remaining probes.
                            logging.error(e)
                            continue
        except Exception as e:
            logging.error(e)
예제 #3
0
def object_update(args):
    """Roll a deployment to a new image, verify it and record the outcome.

    Args:
        args: five-item sequence
            ``(new_image, new_replicas, version, redis_key, channel)``.
            Progress lines are pushed to the Redis list ``redis_key``;
            when ``channel == 'api'`` a summary is also sent through
            ``tools.dingding_msg``.

    Flow: look up the latest non-deleted ``k8s_deploy`` row for the
    deployment named in ``new_image``, download the war, build the image,
    patch the deployment, then ``check_pod``.  On success a new
    ``k8s_deploy`` row with ``action='update'`` is committed; on failed
    verification the deployment is patched back to the previous image.

    The function never raises to its caller on ``Exception``: failures
    are logged and pushed to Redis, and ``'_End_'`` is always pushed last.
    """
    namespace = "default"
    mounts = None
    healthcheck = None
    sidecar = None
    run_args = None
    # Bind every name that the except/finally clauses read, so that an
    # early failure is reported instead of turning into a NameError.
    project = version = redis_key = channel = None
    text = None
    # True once a failure has already been written to the flow log and
    # Redis, so the outer handler does not report it a second time.
    reported = False

    def _api_text(result, advice):
        # Notification lines used when the update was triggered via API.
        return [
            '**容器平台自动上线:**',
            "项目:%s" % project,
            "版本:%s" % version, result, advice
        ]

    try:
        # NOTE(review): k8s_project_update in this file passes six items
        # (with a leading context); confirm which call shape is current.
        new_image, new_replicas, version, redis_key, channel = args
        if new_image and redis_key:
            db_k8s = db_op.k8s_deploy
            db_docker_run = db_op.docker_run
            dm_name = new_image.split('/')[-1].split(':')[0]
            # Latest non-deleted deployment record for this deployment.
            values = db_k8s.query.with_entities(
                db_k8s.project, db_k8s.container_port, db_k8s.image,
                db_k8s.war, db_k8s.replicas, db_k8s.re_requests,
                db_k8s.re_limits).filter(
                    and_(db_k8s.deployment == dm_name,
                         db_k8s.action != 'delete')).order_by(desc(
                             db_k8s.id)).limit(1).all()
            if not values:
                raise ValueError(
                    'no deployment record found for %s' % dm_name)
            (project, container_port, image, war, replicas, re_requests,
             re_limits) = values[0]
            vals = db_docker_run.query.with_entities(
                db_docker_run.run_args, db_docker_run.side_car).filter(
                    db_docker_run.deployment == dm_name).all()
            if vals:
                # NOTE(security): eval() executes whatever text is stored
                # in these columns; keep write access to them restricted.
                # The row comes from ``vals``; indexing ``run_args`` here
                # (still None at this point) always raised TypeError.
                run_args = eval(vals[0][0])
                sidecar = eval(vals[0][1])
            war = download_war(dm_name, version, run_args, redis_key)
            if not war:
                _flow_log("params error,update fail!")
                Redis.lpush(redis_key, "params error,update fail!")
                reported = True
                raise RuntimeError("params error,update fail!")
            if not make_image(new_image, redis_key):
                _flow_log("image record not exists,update fail!")
                Redis.lpush(redis_key,
                            "image record not exists,update fail!")
                reported = True
                raise RuntimeError("image record not exists,update fail!")
            try:
                _flow_log('start deploy image %s   ......' % new_image)
                Redis.lpush(redis_key,
                            'start deploy image %s   ......' % new_image)
                # NOTE(security): same eval() caveat as above.
                re_requests = eval(re_requests)
                re_limits = eval(re_limits)
                container_port = container_port.split(',')
                k8s = k8s_object(dm_name, image, container_port, replicas,
                                 mounts, healthcheck, sidecar, re_requests,
                                 re_limits)
                deployment = k8s.export_deployment()
                # Update container image
                deployment.spec.template.spec.containers[0].image = new_image
                if new_replicas:
                    deployment.spec.replicas = int(new_replicas)
                    replicas = new_replicas
                # Update the deployment
                try:
                    api_instance = client.CoreV1Api()
                    ret = api_instance.list_namespaced_pod(namespace=namespace)
                    # Pods that exist before the patch; handed to
                    # check_pod together with the expected replica count.
                    old_pos = [
                        i.metadata.name for i in ret.items
                        if i.metadata.name.startswith(dm_name)
                    ]
                    api_instance = client.ExtensionsV1beta1Api()
                    api_instance.patch_namespaced_deployment(
                        name=dm_name, namespace=namespace, body=deployment)
                except Exception as e:
                    logging.error(e)
                    _flow_log('deployment parameter fail!')
                    Redis.lpush(redis_key, 'deployment parameter fail!')
                    if channel == 'api':
                        # This branch used to leave ``text`` unbound, so
                        # the final notification raised NameError.
                        text = _api_text("操作:更新未完成", '**需要手动处理!**')
                else:
                    _flow_log('开始进行更新后的结果验证......')
                    Redis.lpush(redis_key, '开始进行更新后的结果验证......')
                    if check_pod(dm_name, replicas, old_pos, redis_key):
                        # NOTE(review): container_port is a list here
                        # (split above) but was read from the table as a
                        # comma-separated string; confirm the column type.
                        v = db_k8s(
                            project=project,
                            deployment=dm_name,
                            image=new_image,
                            war=war,
                            container_port=container_port,
                            replicas=replicas,
                            re_requests=str(re_requests).replace("'", '"'),
                            re_limits=str(re_limits).replace("'", '"'),
                            action='update',
                            update_date=time.strftime('%Y-%m-%d',
                                                      time.localtime()),
                            update_time=time.strftime('%H:%M:%S',
                                                      time.localtime()))
                        db_op.DB.session.add(v)
                        db_op.DB.session.commit()
                        _flow_log('%s 镜像更新成功!' % new_image)
                        Redis.lpush(redis_key, '%s 镜像更新成功!' % new_image)
                        if channel == 'api':
                            text = _api_text("操作:更新成功", '**请关注业务健康状况!**')
                    else:
                        # Verification failed: patch back to the old image.
                        deployment.spec.template.spec.containers[
                            0].image = image
                        if image == new_image:
                            delete_pod(dm_name)
                        api_instance = client.ExtensionsV1beta1Api()
                        api_instance.patch_namespaced_deployment(
                            name=dm_name, namespace=namespace, body=deployment)
                        _flow_log('%s 镜像更新失败并自动回滚!' % new_image)
                        Redis.lpush(redis_key, '%s 镜像更新失败并自动回滚!' % new_image)
                        if channel == 'api':
                            text = _api_text("操作:失败并回滚", '**需要手动处理!**')
            except Exception as e:
                logging.error(e)
                _flow_log('fail:%s' % e)
                Redis.lpush(redis_key, 'fail:%s' % e)
                if channel == 'api':
                    text = _api_text("操作:更新未完成", '**需要手动处理!**')
    except Exception as e:
        logging.error(e)
        if not reported:
            _flow_log('fail:%s' % e)
            if redis_key is not None:
                Redis.lpush(redis_key, 'fail:%s' % e)
        if channel == 'api':
            text = _api_text("操作:更新未完成", '**需要手动处理!**')
    finally:
        db_op.DB.session.remove()
        if redis_key is not None:
            Redis.lpush(redis_key, '_End_')
        if channel == 'api' and text:
            tools.dingding_msg(text, ops_token)
예제 #4
0
def ensure_server_auth():
    """Process a server-permission work order action and render the list.

    Request arguments (read through ``tools.http_args``):

    * ``ticket`` or ``action=activate`` with ``work_number``: move an
      approved order to the accepted state and record it in the Redis
      set ``new_server_auth_work_number_<dt>``;
    * ``action`` in ``complete`` / ``deny`` / ``agree`` with
      ``work_number``: update the order status (``deny`` / ``agree``
      require ``g.grade[0] == '0'``), then send the mail and
      ``tools.dingding_msg`` notifications whose payloads are cached in
      Redis;
    * ``action=query`` with ``work_number``: restrict the rendered list
      to that order.

    Returns:
        The rendered ``ensure_server_auth.html`` template with the order
        table, the status message and the set of new work numbers.
    """
    # Everything the except/finally clauses read is bound before the
    # ``try`` so an early failure cannot turn into a NameError there.
    msg = None
    action = None
    work_number = None
    source = 'ensure_server_auth'
    Key = 'new_server_auth_work_number_%s' % dt
    actions = {'complete': '已完成', 'deny': '审批拒绝', 'agree': '审批通过'}
    db_work_order = db_op.work_order
    db_server_auth = db_op.server_auth
    db_sso = db_op.user_sso
    try:
        action = tools.http_args(request, 'action')
        work_number = tools.http_args(request, 'work_number')
        # Ticket verification.
        ticket = tools.http_args(request, 'ticket')
        if ticket or (action == 'activate' and work_number):
            if ticket:
                work_number = Redis.get('work_order_ticket_%s' % ticket)
            if work_number:
                val = db_work_order.query.filter(
                    db_work_order.work_number == int(work_number)).all()
                if val:
                    val = db_work_order.query.filter(
                        and_(db_work_order.work_number == int(work_number),
                             db_work_order.source == source,
                             db_work_order.status == '审批通过')).all()
                    if val:
                        db_work_order.query.filter(
                            and_(db_work_order.work_number == int(work_number),
                                 db_work_order.source == source)).update({
                                     db_work_order.dingid:
                                     g.dingId,
                                     db_work_order.status:
                                     '受理中'
                                 })
                        db_op.DB.session.commit()
                        Redis.sadd(Key, work_number)
                        if ticket:
                            # Only a real ticket has a key to consume.
                            Redis.delete('work_order_ticket_%s' % ticket)
                        msg = "%s工单已受理!" % work_number
                    else:
                        msg = "工单暂时无法受理!"
                else:
                    msg = "无效的请求验证地址!"
            else:
                msg = "无效的请求验证地址!"
        # Status transition requested by ``action``.
        if action and work_number:
            if action in actions:
                if action in ('deny', 'agree'):
                    if g.grade[0] != '0':
                        msg = '当前用户无审批权限!'
                        # Jumps to the handler below, which keeps ``msg``.
                        raise AssertionError
                val = db_work_order.query.filter(
                    and_(db_work_order.work_number == int(work_number),
                         db_work_order.source == source,
                         db_work_order.status.in_(('待审批', '受理中')))).all()
                if val:
                    db_work_order.query.filter(
                        and_(db_work_order.work_number == int(work_number),
                             db_work_order.source == source)).update(
                                 {db_work_order.status: actions[action]})
                    db_op.DB.session.commit()
                    Redis.srem(Key, work_number)
                    # Progress mail to the applicant.
                    if Redis.exists('op_send_mail_html_%s' % work_number):
                        dingid = db_server_auth.query.with_entities(
                            db_server_auth.dingid).filter(
                                db_server_auth.work_number ==
                                work_number).all()
                        mailer = []
                        if dingid:
                            # Without this guard a missing server_auth
                            # row raised IndexError after the status had
                            # already been committed.
                            mailer = db_sso.query.with_entities(
                                db_sso.mail).filter(
                                    db_sso.dingunionid == dingid[0][0]).all()
                        if mailer:
                            Msg = Message("%s工单进度通知" % work_number,
                                          sender=sender,
                                          recipients=[mailer[0][0]],
                                          cc=[receiver],
                                          charset='utf-8')
                            mail_html = Redis.get('op_send_mail_html_%s' %
                                                  work_number)
                            alarm_html = '<p style="color:red">工单当前进度:%s</p>' % actions[
                                action]
                            html = '%s%s' % (mail_html, alarm_html)
                            if action == 'agree':
                                html = '%s%s%s' % (
                                    mail_html, alarm_html,
                                    Redis.get(
                                        'op_send_mail_url_%s' % work_number))
                            Msg.html = html
                            with mapp.app_context():
                                mail.send(Msg)
                    if Redis.exists(
                            'op_send_dingding_msg_%s' %
                            work_number) and action in ('deny', 'agree'):
                        # NOTE(security): eval() executes the cached
                        # text; this is only safe while nothing but this
                        # application writes that Redis key.
                        text = eval(
                            Redis.get('op_send_dingding_msg_%s' % work_number))
                        text.append("##### 审批结果:%s" % actions[action])
                        if action == 'agree':
                            text.append("##### %s" % Redis.get(
                                'op_send_dingding_url_%s' % work_number))
                        tools.dingding_msg(text, token=work_token)
                    msg = "%s工单当前状态:%s!" % (work_number, actions[action])
                else:
                    msg = "无效操作!"
    except Exception as e:
        logging.error(e)
        if not msg:
            msg = "未知异常错误!"
    finally:
        # Reload the data shown in the table.
        tables = ('工单号', '日期', '申请人', '部门', '系统账号', '服务器列表', '申请权限', '所属用途',
                  '执行人', '详情', '状态', '操作')
        users = db_sso.query.with_entities(db_sso.dingunionid, db_sso.realName,
                                           db_sso.department).all()
        users = {info[0]: info[1:] for info in users}
        servers = db_server_auth.query.with_entities(
            db_server_auth.work_number, db_server_auth.date,
            db_server_auth.account, db_server_auth.servers,
            db_server_auth.auth_level, db_server_auth.purpose,
            db_server_auth.dingid).all()
        servers = {info[0]: info[1:] for info in servers}
        work_orders = db_work_order.query.with_entities(
            db_work_order.work_number, db_work_order.dingid,
            db_work_order.status).filter(
                db_work_order.source == source).order_by(desc(
                    db_work_order.id)).all()
        if action and work_number:
            if action == 'query':
                work_orders = db_work_order.query.with_entities(
                    db_work_order.work_number, db_work_order.dingid,
                    db_work_order.status).filter(
                        and_(db_work_order.source == source,
                             db_work_order.work_number == work_number)).all()
        if work_orders:
            work_orders = [list(info) for info in work_orders]
            for info in work_orders:
                # These dict lookups raise KeyError when an order has no
                # server_auth row or its applicant is not in user_sso.
                info.extend(servers[info[0]][:-1])
                info.insert(4, users[servers[info[0]][-1]][0])
                info.insert(5, users[servers[info[0]][-1]][-1])
                if info[1]:
                    info.append(users[info[1]][0])
                else:
                    info.append('')
        new_work_number = Redis.smembers(Key)
    return render_template('ensure_server_auth.html',
                           tables=tables,
                           work_orders=work_orders,
                           msg=msg,
                           new_work_number=new_work_number,
                           total='服务器权限工单管理')
예제 #5
0
 def check_slave(info):
     """Alarm when a replica Redis lacks the probe key its master holds.

     Args:
         info: ``(server_id, port)`` of a master Redis instance.

     For every replica recorded in ``db_redis`` for that master the value
     of ``Key`` is read from the replica and from the replica's master.
     If the master has a value and the replica has none, an alarm is sent
     through ``tools.dingding_msg``.  Port 10080 is skipped, as are
     replicas whose host id is missing from ``server_ids`` and replica
     addresses listed in ``blacklist``.
     """
     server_id, port = info
     if int(port) in [10080]:
         return
     # Distinct ports of the replicas attached to this master.
     slave_ports = db_redis.query.with_entities(distinct(db_redis.port)).filter(
         and_(db_redis.Master_Host == server_id, db_redis.Master_Port == port)).all()
     for slave_port in [int(row[0]) for row in slave_ports]:
         # Replica instances listening on that port.
         redis_lists = db_redis.query.with_entities(
             db_redis.server_id, db_redis.port, db_redis.requirepass).filter(
                 and_(db_redis.slave == '是', db_redis.port == slave_port)).all()
         # Distinct loop names: the parameters are no longer rebound.
         for slave_id, sport, requirepass in redis_lists:
             try:
                 sip = server_ids[str(slave_id)]
             except LookupError:
                 # Unknown host id: nothing to connect to.
                 continue
             try:
                 # ``password=None`` is the same as omitting it, so the
                 # client only needs to be built once.
                 slave_cli = redis.StrictRedis(sip, int(sport),
                                               password=requirepass or None,
                                               decode_responses=True)
             except Exception:
                 continue
             # Master recorded for this replica.
             mvals = db_redis.query.with_entities(
                 db_redis.Master_Host, db_redis.Master_Port).filter(
                     and_(db_redis.server_id == slave_id, db_redis.port == sport)).all()
             mip, mport = mvals[0]
             mip = server_ids[str(mip)]
             # NOTE(review): the network round trip happens in .get(),
             # which is not guarded; a connection error propagates to
             # the caller exactly as before. Confirm that is intended.
             val = slave_cli.get(Key)
             try:
                 # The replica's password is reused for its master.
                 master_cli = redis.StrictRedis(mip, int(mport),
                                                password=requirepass or None,
                                                decode_responses=True)
             except Exception:
                 continue
             if sip in blacklist:
                 continue
             mval = master_cli.get(Key)
             if not mval or val:
                 # In sync, or the master holds no probe value either.
                 continue
             text = ['**线上Redis同步报警:**',
                     "同步Redis:%s:%s 验证数据:%s" % (mip, mport, mval),
                     "延时Redis:%s:%s 验证数据:%s" % (sip, sport, val),
                     "数据同步异常!",
                     '**请及时进行处理!**']
             alarm_info = '%s:%s' % (slave_id, sport)
             slave_lists = []
             # If this replica is itself a master, collect its replicas.
             if alarm_info in S_Masters:
                 vals = db_redis.query.with_entities(db_redis.server_id, db_redis.port).filter(
                     and_(db_redis.Master_Host == slave_id, db_redis.Master_Port == sport)).all()
                 if vals:
                     slave_lists.extend(['%s:%s' % row for row in vals])
             if alarm_info not in slave_lists:
                 # Send the alarm; three ports go to the Redis channel.
                 token = ops_token
                 if int(sport) in (8379, 6387, 17379):
                     token = redis_token
                 tools.dingding_msg(text, token=token)
예제 #6
0
def k8s_ingress_log():
    """Collect one minute of ingress statistics and act on unhealthy domains.

    Queries the ``k8s-ingress-log-*`` Elasticsearch indices for the last
    minute and stores in Redis (``RC``), keyed by day and ``HH:MM``:

    * the overall request count (multiplied by the average upstream
      response time when that exceeds 1) and the running daily total;
    * per domain: request count, status-code histogram and average
      upstream response time.

    Then, for every domain seen, the previous minute's histogram is read
    back; when the share of status 200 is below 99% a shared counter is
    incremented, and once it exceeds 3 the pods of the matching
    deployment are deleted and an alarm is sent via ``tools.dingding_msg``.

    Every stage is wrapped in its own ``try`` so a failing stage is
    logged and does not stop the following ones.
    """
    td = time.strftime('%Y-%m-%d', time.localtime())
    th = time.strftime('%H:%M', time.localtime())
    Key = 'op_k8s_ingress_log'
    stat_key = 'op_k8s_ingress_stat'
    rt_key = 'op_k8s_ingress_rt'
    # Own name for the alarm counter instead of rebinding ``Key``.
    alarm_key = 'op_k8s_project_alarm'
    k8s_domains_key = 'op_k8s_domains_%s' % td
    k8s_pv_key = 'op_k8s_pv_%s' % td
    now_date = datetime.datetime.now()
    lte_date = now_date.strftime('%Y-%m-%dT%H:%M:%S+08:00')
    prev_minute = now_date - datetime.timedelta(minutes=1)
    gte_date = prev_minute.strftime('%Y-%m-%dT%H:%M:%S+08:00')
    # Hash field of the previous minute, read by the alarm stage.
    prev_th = prev_minute.strftime('%H:%M')
    Domains = []

    def _time_range():
        # Fresh range clause covering the last minute.
        return {"range": {"time_iso8601": {"gte": gte_date, "lte": lte_date}}}

    def auto_delete_pod(pod_name, text, delete_pod_key):
        # Delete every pod in "default" whose name starts with pod_name,
        # waiting 30 seconds after each one, then append the number of
        # deleted pods to ``text``.  ``delete_pod_key`` counts deletions
        # and, while it exists, marks the clean-up as in progress.
        try:
            namespace = "default"
            api_instance = client.CoreV1Api()
            ret = api_instance.list_namespaced_pod(namespace=namespace)
            for i in ret.items:
                if i.metadata.name.startswith(pod_name):
                    RC.incr(delete_pod_key, 1)
                    api_instance.delete_namespaced_pod(name=i.metadata.name,
                                                       namespace=namespace,
                                                       body=client.V1DeleteOptions())
                    time.sleep(30)
        except Exception as e:
            logging.error(e)
        finally:
            # Always release the marker, but return outside ``finally``
            # so exceptions not handled above are not discarded.
            counts = RC.get(delete_pod_key)
            RC.delete(delete_pod_key)
            text.append('**自动处理问题pod数量:{}**'.format(counts or 0))
        return text

    try:
        loging.write('start %s ......' % k8s_ingress_log.__name__)
        # Overall request volume of the container platform.
        try:
            body = {"query": _time_range(),
                    "aggs": {
                        "avg_resp": {
                            "avg": {"field": "upstream_response_time"}
                        }
                    }}
            res = es.search(index='k8s-ingress-log-*', body=body)
            if res['hits']['total']:
                rt = float(res['aggregations']['avg_resp']['value'])
                counts = int(res['hits']['total'])
                day_key = '%s_%s' % (Key, td)
                if rt > 1:
                    # Weight the count by the average response time.
                    RC.hset(day_key, th, counts * rt)
                else:
                    RC.hset(day_key, th, counts)
                RC.expire(day_key, 864000)
                # Running total of requests for the day.
                RC.incr(k8s_pv_key, counts)
                RC.expire(k8s_pv_key, 864000)
        except Exception as e:
            logging.error(e)
        # Status-code histogram per domain for the last minute.
        try:
            body = {
                'size': 0,
                "query": {"bool": {"must": [_time_range()]}},
                "aggs": {
                    "hosts": {
                        "terms": {"field": "host.keyword", "size": 100},
                        "aggs": {
                            "counts": {
                                "terms": {"field": "status", "size": 100}
                            }
                        }
                    }
                }
            }
            res = es.search(index='k8s-ingress-log-*', body=body)
            for infos in res['aggregations']['hosts']['buckets']:
                try:
                    domain = infos['key']
                    Domains.append(domain)
                    counts = int(infos['doc_count'])
                    # Set of domains seen today.
                    RC.sadd(k8s_domains_key, domain)
                    # Requests per domain.
                    RC.hset('%s_%s_%s' % (Key, domain, td), th, counts)
                    RC.expire('%s_%s_%s' % (Key, domain, td), 864000)
                    # Status codes per domain.  Stored as explicit text:
                    # the alarm stage parses it back, and not every
                    # client version accepts a dict as a hash value.
                    vals = {info['key']: info['doc_count'] for info in infos['counts']['buckets']}
                    RC.hset('%s_%s_%s' % (stat_key, domain, td), th, str(vals))
                    RC.expire('%s_%s_%s' % (stat_key, domain, td), 864000)
                except Exception as e:
                    logging.error(e)
                    continue
        except Exception as e:
            logging.error(e)
        # Average upstream response time per domain for the last minute.
        try:
            body = {
                'size': 0,
                "query": {"bool": {"must": [_time_range()]}},
                "aggs": {
                    "hosts": {
                        "terms": {"field": "host.keyword", "size": 100},
                        "aggs": {
                            "avg_resp": {
                                "avg": {"field": "upstream_response_time"}
                            }
                        }
                    }
                }
            }
            res = es.search(index='k8s-ingress-log-*', body=body)
            for infos in res['aggregations']['hosts']['buckets']:
                try:
                    domain = infos['key']
                    RC.hset('%s_%s_%s' % (rt_key, domain, td), th, float('%.3f' % infos['avg_resp']['value']))
                    RC.expire('%s_%s_%s' % (rt_key, domain, td), 864000)
                except Exception as e:
                    logging.error(e)
                    continue
        except Exception as e:
            logging.error(e)
        # Availability alarm and automatic pod clean-up.
        try:
            for domain in Domains:
                text = ['**容器平台业务报警:%s**' % domain]
                vals = RC.hget('%s_%s_%s' % (stat_key, domain, td), prev_th)
                if not vals:
                    continue
                # NOTE(security): eval() runs the stored text; it is only
                # safe while this job is the sole writer of that hash.
                vals = eval(str(vals))
                stat_vals = vals[200] if 200 in vals else 0.0
                if not stat_vals > 0:
                    # NOTE(review): a domain with no 200 responses at all
                    # is skipped here; confirm that is intended.
                    continue
                total_vals = sum(vals.values())
                diff_vals = float(stat_vals) / float(total_vals)
                if diff_vals >= 0.99:
                    continue
                # The counter is shared by all domains and expires after
                # 180 seconds without a new hit.
                RC.incr(alarm_key, 1)
                RC.expire(alarm_key, 180)
                if int(RC.get(alarm_key)) <= 3:
                    continue
                db_project = db_op.project_list
                project = db_project.query.with_entities(distinct(db_project.project)).filter(
                    db_project.domain.like('%{}%'.format(domain))).all()
                if not project:
                    continue
                db_k8s_deploy = db_op.k8s_deploy
                pod_name = db_k8s_deploy.query.with_entities(db_k8s_deploy.deployment).filter(
                    db_k8s_deploy.project == project[0][0]).all()
                if not pod_name:
                    continue
                pod_name = pod_name[0][0]
                text.append("服务可用率:{}%".format('%.2f' % (diff_vals * 100)))
                rt_vals = RC.hget('%s_%s_%s' % (rt_key, domain, td), prev_th)
                if rt_vals:
                    text.append("服务响应时间:{}ms".format(int(float(rt_vals) * 1000)))
                delete_pod_key = 'op_auto_delete_pod_%s_%s' % (pod_name, td)
                if not RC.exists(delete_pod_key):
                    text = auto_delete_pod(pod_name, text, delete_pod_key)
                    tools.dingding_msg(text)
                    RC.delete(alarm_key)
        except Exception as e:
            logging.error(e)
    except Exception as e:
        logging.error(e)
    finally:
        db_op.DB.session.remove()
        for key in (k8s_domains_key, k8s_pv_key):
            RC.expire(key, 864000)
        loging.write('complete %s !' % k8s_ingress_log.__name__)
예제 #7
0
def alarm_load():
    """Find hosts under sustained high CPU load and restart or report the top process.

    Candidates are the ``zabbix_info`` rows with ``cpu_load > 100`` and
    ``icmpping == 1`` whose ``update_time`` contains today's date. Each
    candidate (except addresses starting with ``172.16.19.``) is confirmed
    against the InfluxDB ``server_infos`` mean over the last 10 minutes; hosts
    whose ``mean_cpu_load`` exceeds 100 are then handled one by one:

    * If the SSH login fails, a per-host failure counter is bumped in Redis
      and a DingTalk message is sent (to the ops token once the counter
      exceeds 5, otherwise to the default destination).
    * Otherwise the process at the top of ``ps -aux | sort -k3nr`` is
      inspected. If a project name can be derived and the host is listed in
      ``project_list``, the project is restarted through ``supervisorctl``;
      if the host only has a ``project_other`` row, a warning is sent.

    Hosts whose name starts with ``nj`` never trigger a message, and hosts in
    the ``op_alarm_load_whitelist`` Redis set are never inspected.

    Fixes compared with the previous revision:

    * The SSH-failure branch sent ``text`` without ever building it, so it
      raised ``NameError`` (or re-sent a stale message from another host).
    * ``ops_token = None`` made ``ops_token`` a local of this function, so
      every read before that assignment raised ``UnboundLocalError``; the
      success/failure distinction is now carried by a separate flag.
    * ``text`` is reset for every host so a message can no longer leak from
      one host to the next.
    * An empty ``idc_servers`` lookup no longer raises ``IndexError``.

    Returns:
        None. All results are side effects (Redis counters, SSH commands,
        DingTalk messages, log lines).
    """
    try:
        loging.write("start %s ......" % alarm_load.__name__)
        whitelist = []
        # hostname -> (ip, ssh_port, mean cpu load) for hosts confirmed as overloaded.
        dict_load = {}
        db_server = db_idc.idc_servers
        db_zabbix = db_idc.zabbix_info
        db_project = db_op.project_list
        db_project_other = db_op.project_other
        Influx_cli = InfluxDBClient(influxdb_host, influxdb_port, influxdb_user, influxdb_pw, 'zabbix_infos')
        host_infos = db_zabbix.query.with_entities(
            db_zabbix.ip, db_zabbix.ssh_port, db_zabbix.hostname, db_zabbix.update_time).filter(
                and_(db_zabbix.cpu_load > 100, db_zabbix.icmpping == 1)).all()
        Key = "op_alarm_load_whitelist"
        if RC_CLUSTER.exists(Key):
            whitelist = RC_CLUSTER.smembers(Key)
        # Loop over the servers suspected of having a problem.
        for infos in host_infos:
            host, ssh_port, hostname, update_time = infos
            # Only rows refreshed today are considered.
            if time.strftime('%Y-%m-%d', time.localtime()) in update_time:
                try:
                    if not host.startswith('172.16.19.'):
                        now_time = datetime.datetime.now()
                        dt = now_time - datetime.timedelta(minutes=10)
                        dt = dt.strftime('%Y-%m-%dT%H:%M:%SZ')
                        cmd = "select mean(*) from server_infos where time >='%s' group by hostname" % dt
                        results = Influx_cli.query(cmd)
                        if results:
                            for key in results.keys():
                                # The last element of the series key carries the group-by tags.
                                if hostname == key[-1]['hostname']:
                                    for point in results[key]:
                                        if point['mean_cpu_load'] > 100:
                                            dict_load[hostname] = (host, ssh_port, int(point['mean_cpu_load']))
                except Exception as e:
                    logging.error(e)
                    continue
        # Perform the restart / alerting step.
        for hostname in dict_load:
            host, ssh_port, cpu_load = dict_load[hostname]
            # Check whether an SSH login is possible.
            try:
                Ssh = SSH.ssh(ip=host, ssh_port=ssh_port)
            except Exception as e:
                logging.error(e)
                if not hostname.startswith('nj'):
                    Ssh_Key = "op_ssh_login_fail_%s" % hostname
                    RC.incr(Ssh_Key, 1)
                    RC.expire(Ssh_Key, 350)
                    # This message was never built before the fix; the wording
                    # follows the other alerts in this function.
                    text = ['**线上服务器预警:%s**' % hostname,
                            "CPU平均使用率:{0}%".format(cpu_load),
                            '**SSH登录失败,请及时进行处理!**']
                    if int(RC.get(Ssh_Key)) > 5:
                        # Repeated failures are escalated to the ops token.
                        tools.dingding_msg(text, token=ops_token)
                    else:
                        tools.dingding_msg(text)
            else:
                try:
                    Key = 'op_alarm_load_%s' % hostname
                    Project = None
                    # Reset per host so nothing leaks from a previous iteration.
                    text = None
                    restarted = False
                    # The counter grows by 5 per run and is reported as "minutes" in the message.
                    RC_CLUSTER.incr(Key, 5)
                    RC_CLUSTER.expire(Key, 600)
                    ctime = int(RC_CLUSTER.get(Key))
                    if hostname not in whitelist:
                        # Pick the process that can be restarted: the one with the highest CPU usage.
                        results = Ssh.Run("ps -aux | sort -k3nr |head -n 1")
                        if results['stdout']:
                            results = results['stdout'][0].strip().split()
                            try:
                                if results[-1].endswith('-rpc.jar'):
                                    pro_jar = results[-1]
                                    # Only this jar is eligible for automatic handling.
                                    if pro_jar in ['moji-location-rpc.jar']:
                                        Project = pro_jar.split('.')[0]
                                else:
                                    for line in results:
                                        if '-Dcatalina.home=' in line:
                                            # Last path component of catalina.home is used as the project name.
                                            Project = line.strip().split('/')[-1]
                                            break
                            except Exception as e:
                                logging.error(e)
                            if Project:
                                try:
                                    # Check whether this is a tomcat project.
                                    ret = db_project.query.filter(
                                        and_(db_project.ip == host, db_project.ssh_port == ssh_port)).all()
                                    if ret:
                                        # Restart the problematic tomcat.
                                        result = Ssh.Run("supervisorctl  restart  {0}".format(Project))
                                        if result['stderr']:
                                            text = ['**线上服务重启:%s**' % hostname,
                                                    "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                    "相关进程:{0}".format(Project), '**服务重启失败,需手动处理!**']
                                        else:
                                            text = ['**线上服务重启:%s**' % hostname,
                                                    "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                    "相关进程:{0}".format(Project), '**服务重启成功!**']
                                            restarted = True
                                    else:
                                        # Check whether this is a jar project.
                                        server_id = db_server.query.with_entities(db_server.id).filter(
                                            db_server.hostname == hostname).all()
                                        if server_id:
                                            ret = db_project_other.query.filter(
                                                db_project_other.server_id == int(server_id[0][0])).all()
                                            if ret:
                                                text = ['**线上服务器预警:%s**' % hostname,
                                                        "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                        "相关进程:{0}".format(Project), '**请及时进行处理!**']
                                    if text and not hostname.startswith('nj'):
                                        # A successful restart is sent without a token;
                                        # everything else goes to the ops token.
                                        tools.dingding_msg(text, None if restarted else ops_token)
                                except Exception as e:
                                    logging.error(e)
                finally:
                    Ssh.Close()
    finally:
        loging.write("%s complete!" % alarm_load.__name__)
        db_idc.DB.session.remove()
        db_op.DB.session.remove()
예제 #8
0
class _UpdateAborted(Exception):
    """Signals that object_update stopped after the reason was already reported."""


def object_update(args):
    """Patch a deployment to a new image (or roll it back) and record the result.

    The latest non-deleted ``k8s_deploy`` row for the deployment supplies the
    current settings. Unless ``rollback`` is set, the war is downloaded and
    the image built first. The deployment is then patched in every target
    context, the pods are verified with ``check_pod`` and, on success, a new
    ``k8s_deploy`` row is stored. A failed update (not a failed rollback) is
    reverted by patching the previous image back.

    Progress lines are written both to ``_flow_log`` and to the Redis list
    ``redis_key``; the list is always terminated with ``'_End_'`` once the
    main phase has started. When ``channel == 'api'`` the final outcome is
    also sent through ``tools.dingding_msg``.

    Fixes compared with the previous revision:

    * The API success notification was inverted (a rollback reported
      "更新成功" and an update reported "回滚成功").
    * ``raise Redis.lpush(...)`` raised an int purely to provoke a
      ``TypeError``; an explicit private exception is used instead.
    * No DingTalk message is attempted when no text was produced.

    Args:
        args: 7-item sequence ``(context, new_image, version, rollback,
            redis_key, channel, user)``.
            NOTE(review): a caller visible elsewhere in this file passes a
            6-item list in a different order; confirm callers match this
            layout, otherwise the unpacking fails and is only logged.

    Returns:
        None.
    """
    try:
        db_k8s = db_op.k8s_deploy
        db_docker_run = db_op.docker_run
        namespace = "default"
        mounts = None
        text = None
        labels = None
        allcontexts = []
        context, new_image, version, rollback, redis_key, channel, user = args
        # Deployment name is the image name without registry and tag.
        dm_name = new_image.split('/')[-1].split(':')[0]
        # Fetch the deployment info of the image that is currently deployed.
        values = db_k8s.query.with_entities(
            db_k8s.project, db_k8s.container_port, db_k8s.image, db_k8s.war,
            db_k8s.replicas, db_k8s.re_requests, db_k8s.re_limits,
            db_k8s.healthcheck).filter(
                and_(db_k8s.deployment == dm_name,
                     db_k8s.action != 'delete')).order_by(desc(
                         db_k8s.id)).limit(1).all()
        project, container_port, image, war, replicas, re_requests, re_limits, healthcheck = values[
            0]
    except Exception as e:
        # Nothing is pushed to Redis on this path; the failure is only logged.
        logging.error(e)
    else:

        def _report(message):
            # Write one progress line to both the flow log and the Redis list.
            _flow_log(message)
            Redis.lpush(redis_key, message)

        def _api_text(title, operation, advice):
            # Build the DingTalk message body for the API channel.
            return [
                title,
                "项目:%s" % project,
                "版本:%s" % version, operation, advice
            ]

        try:
            if new_image and redis_key:
                try:
                    vals = db_docker_run.query.with_entities(
                        db_docker_run.dockerfile, db_docker_run.run_args,
                        db_docker_run.side_car).filter(
                            and_(db_docker_run.deployment == dm_name,
                                 db_docker_run.context == context)).all()
                    docker_args, run_args, sidecar = vals[0]
                    # NOTE(review): eval() on database content executes
                    # arbitrary code if the table can be written by untrusted
                    # parties; consider ast.literal_eval once the stored
                    # format is confirmed to be plain literals.
                    if docker_args:
                        docker_args = eval(docker_args)
                    if run_args:
                        run_args = eval(run_args)
                except Exception as e:
                    logging.error(e)
                else:
                    if not rollback:
                        war = download_war(dm_name, version, docker_args,
                                           run_args, redis_key)
                        if not war:
                            _report("params error,update fail!")
                            raise _UpdateAborted("params error,update fail!")
                        if not make_image(new_image, redis_key):
                            _report("image record not exists,update fail!")
                            raise _UpdateAborted(
                                "image record not exists,update fail!")
                    try:
                        # NOTE(review): same eval() concern as above.
                        re_requests = eval(re_requests)
                        re_limits = eval(re_limits)
                        allcontexts.append(context)
                        if 'all-cluster' in context:
                            allcontexts = contexts
                        for context in allcontexts:
                            start_msg = '开始更新 %s image %s   ......' % (
                                context, new_image)
                            _flow_log(start_msg)
                            Redis.lpush(redis_key, '*' * 80)
                            Redis.lpush(redis_key, start_msg)
                            k8s = k8s_object(context, dm_name, image,
                                             container_port.split(','),
                                             replicas, mounts, labels,
                                             healthcheck, sidecar, re_requests,
                                             re_limits)
                            deployment = k8s.export_deployment()
                            # Update container image
                            deployment.spec.template.spec.containers[
                                0].image = new_image
                            # Update the deployment
                            try:
                                api_instance = client.CoreV1Api()
                                ret = api_instance.list_namespaced_pod(
                                    namespace=namespace)
                                # Pods that exist before the patch; handed to check_pod below.
                                old_pos = [
                                    i.metadata.name for i in ret.items
                                    if i.metadata.name.startswith(dm_name)
                                ]
                                api_instance = client.ExtensionsV1beta1Api()
                                api_instance.patch_namespaced_deployment(
                                    name=dm_name,
                                    namespace=namespace,
                                    body=deployment)
                            except Exception as e:
                                logging.error(e)
                                _report('deployment parameter fail!')
                            else:
                                if rollback:
                                    action = 'rollback'
                                    _report('开始进行回滚后的结果验证......')
                                else:
                                    action = 'update'
                                    _report('开始进行更新后的结果验证......')
                                if check_pod(context, dm_name, replicas,
                                             old_pos, redis_key):
                                    v = db_k8s(
                                        project=project,
                                        context=context,
                                        deployment=dm_name,
                                        image=new_image,
                                        war=war,
                                        container_port=container_port,
                                        replicas=replicas,
                                        re_requests=str(re_requests).replace(
                                            "'", '"'),
                                        re_limits=str(re_limits).replace(
                                            "'", '"'),
                                        action=action,
                                        healthcheck=healthcheck,
                                        update_date=time.strftime(
                                            '%Y-%m-%d', time.localtime()),
                                        update_time=time.strftime(
                                            '%H:%M:%S', time.localtime()),
                                        user=user)
                                    db_op.DB.session.add(v)
                                    db_op.DB.session.commit()
                                    if rollback:
                                        _report('%s 镜像回滚成功!' % new_image)
                                        if channel == 'api':
                                            text = _api_text(
                                                '**容器平台自动回滚:**',
                                                "操作:回滚成功",
                                                '**请关注业务健康状况!**')
                                    else:
                                        _report('%s 镜像更新成功!' % new_image)
                                        if channel == 'api':
                                            text = _api_text(
                                                '**容器平台自动上线:**',
                                                "操作:更新成功",
                                                '**请关注业务健康状况!**')
                                elif rollback:
                                    _report('%s 镜像回滚失败!' % new_image)
                                    if channel == 'api':
                                        text = _api_text(
                                            '**容器平台自动回滚:**',
                                            "操作:回滚失败",
                                            '**需要手动处理!**')
                                else:
                                    # The update did not verify: patch the previous image back.
                                    deployment.spec.template.spec.containers[
                                        0].image = image
                                    if image == new_image:
                                        delete_pod(context, dm_name)
                                    api_instance = client.ExtensionsV1beta1Api(
                                    )
                                    api_instance.patch_namespaced_deployment(
                                        name=dm_name,
                                        namespace=namespace,
                                        body=deployment)
                                    _report('%s 镜像更新失败并自动回滚!' % new_image)
                                    if channel == 'api':
                                        text = _api_text(
                                            '**容器平台自动上线:**',
                                            "操作:失败并回滚",
                                            '**需要手动处理!**')
                        Redis.lpush(redis_key, '*' * 80)
                    except Exception as e:
                        logging.error(e)
                        _report('fail:%s' % e)
                        if channel == 'api':
                            text = _api_text('**容器平台自动上线:**',
                                             "操作:更新未完成", '**需要手动处理!**')
        except _UpdateAborted as e:
            # The reason was already written to both logs before raising.
            logging.error(e)
            if channel == 'api':
                text = _api_text('**容器平台自动上线:**', "操作:更新未完成",
                                 '**需要手动处理!**')
        except Exception as e:
            logging.error(e)
            _report('fail:%s' % e)
            if channel == 'api':
                text = _api_text('**容器平台自动上线:**', "操作:更新未完成",
                                 '**需要手动处理!**')
        finally:
            db_op.DB.session.remove()
            # Terminator line for whoever reads the Redis list.
            Redis.lpush(redis_key, '_End_')
            # text stays None when no outcome was produced (for example when
            # the docker_run lookup failed); skip the notification then.
            if channel == 'api' and text:
                tools.dingding_msg(text, ops_token)