Example #1
def remove_node(endpoint, id):
    log = logging.getLogger('pk_k8s')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(M)   DRYRUN enabled. Skipping...')
        return
    kubernetes.config.load_kube_config()
    client = kubernetes.client.CoreV1Api()
    try:
        client.delete_node(id)
    except Exception:
        log.error('(M)   => Removing k8s node failed.')
    return
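All of these snippets assume module-level context that is not shown: pk_config, dryrun_id, and (in the pykube variants) a kube client. A minimal stand-in for the configuration and dry-run parts, sufficient to exercise the snippets in isolation; the names mirror the calls in the examples, the values are hypothetical:

import logging

logging.basicConfig(level=logging.DEBUG)

dryrun_id = 'k8s'  # hypothetical dry-run scope identifier

class pk_config:
    """Sketch of the pk_config module the snippets rely on."""
    _dryrun_flags = {'k8s': False}
    _config = {'optimizer_endpoint': 'http://optimizer:5000'}  # hypothetical

    @staticmethod
    def dryrun_get(component):
        # True when dry-run mode is enabled for the given component id
        return pk_config._dryrun_flags.get(component, False)

    @staticmethod
    def config():
        return pk_config._config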
Example #2
def drop_worker_node(endpoint, infra_name, worker_name, replica):
    log = logging.getLogger('pk_occopus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(S)   DRYRUN enabled. Skipping...')
        return
    log.info('(S)   => node drop: {0}'.format(replica))
    wscall = '{0}/infrastructures/{1}/scaledown/{2}/{3}'.format(
        endpoint, infra_name, worker_name, replica)
    log.debug('-->curl -X POST {0}'.format(wscall))
    response = requests.post(wscall).json()
    log.debug('-->response: {0}'.format(response))
    return
Example #3
def calling_rest_api_sample(sample=dict()):
  log=logging.getLogger('pk_optimizer')
  config = pk_config.config()
  if pk_config.dryrun_get(dryrun_id):
    log.info('(O)   DRYRUN enabled. Skipping...')
    return
  if not m_opt_accessible:
    return
  url = config.get('optimizer_endpoint')+'/sample'
  log.debug('(O) Calling optimizer REST API sample() method: '+url)
  response = requests.post(url, data=yaml.dump(sample))
  log.debug('(O) Response: '+str(response))
  return
Example #4
def remove_alerts_under_prometheus(rules_directory, alerts, stack):
    log = logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(C)   DRYRUN enabled. Skipping...')
        return
    if not alerts:
        return
    try:
        rule_file = os.path.join(rules_directory, stack + '.rules')
        os.remove(rule_file)
    except Exception:
        log.exception('Removing alerts under Prometheus failed:')
    return
Example #5
def query_number_of_worker_nodes(config, worker_name):
    log = logging.getLogger('pk_occopus')
    instances = 1
    if pk_config.dryrun_get(dryrun_id):
        log.info('(C)   DRYRUN enabled. Skipping...')
        return instances
    endpoint, infra_name = config[CONFIG_ENDPOINT], config[CONFIG_INFRA_NAME]
    wscall = '{0}/infrastructures/{1}'.format(endpoint, infra_name)
    log.debug('-->curl -X GET {0}'.format(wscall))
    response = requests.get(wscall).json()
    instances = response.get(worker_name, dict()).get('scaling', dict()).get('target', 0)
    log.debug('-->instances: {0}, response: {1}'.format(instances, response))
    return instances
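The chained .get() calls above walk an Occopus GET /infrastructures/<infra_id> response, which is keyed by worker node name. A hypothetical response showing only the path that is read:

response = {
    'worker': {                    # keyed by the worker node's name
        'scaling': {'target': 3},  # current number of requested instances
    }
}
response.get('worker', dict()).get('scaling', dict()).get('target', 0)  # -> 3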
Example #6
def scale_worker_node(config, scaling_info_list):
    log = logging.getLogger('pk_occopus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(S)   DRYRUN enabled. Skipping...')
        return
    endpoint, infra_name = config[CONFIG_ENDPOINT], config[CONFIG_INFRA_NAME]
    for info in scaling_info_list:
        worker_name, replicas = info.get('node_name'), info.get('replicas')
        log.info('(S) {0}  => m_node_count: {1}'.format(worker_name, replicas))
        wscall = '{0}/infrastructures/{1}/scaleto/{2}/{3}'.format(endpoint, infra_name, worker_name, replicas)
        log.debug('-->curl -X POST {0}'.format(wscall))
        response = requests.post(wscall).json()
        log.debug('-->response: {0}'.format(response))
    return
Example #7
def evaluate_data_queries_and_alerts_for_a_service(endpoint,policy,servicename):
  log=logging.getLogger('pk_prometheus')
  if pk_config.dryrun_get(dryrun_id):
    log.info('(Q)   DRYRUN enabled. Assigning queries as values to metrics...')
  queries, alerts = dict(), dict()
  if 'query_results' not in policy['data']:
    policy['data']['query_results']=dict()
  all_services = policy.get('scaling',dict()).get('services',dict())
  target_service = [ srv for srv in all_services if srv.get('name','')==servicename ]
  scaling_rule_str = target_service[0].get('scaling_rule','') if target_service else ''
  for param,query in policy.get('data',dict()).get('queries',dict()).items():
    try:
      if scaling_rule_str is not None and scaling_rule_str.find(param) != -1:
        if pk_config.dryrun_get(dryrun_id):
          policy['data']['query_results'][param]=query
          queries[param]=query
        else:
          response = requests.get(endpoint+"/api/v1/query?query="+query).json()
          log.debug('Prometheus response query "{0}":{1}'.format(query,response))
          val = extract_value_from_prometheus_response(query,response,dict())
          policy['data']['query_results'][param]=float(val)
          queries[param]=float(val)
    except Exception as e:
      policy['data']['query_results'][param]=None
      queries[param]=None
      log.warning('Evaluating expression for query "{0}" failed: {1}'.format(param,e))
  policy['data']['alert_results']={}
  for item in policy.get('data',dict()).get('alerts',dict()):
    attrname = item['alert']
    if scaling_rule_str is not None and scaling_rule_str.find(attrname) != -1:
      if alerts_query(attrname) is not None:
        policy['data']['alert_results'][attrname]=True
        alerts[attrname]=True
      else:
        policy['data']['alert_results'][attrname]=False
        alerts[attrname]=False
  return queries, alerts
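extract_value_from_prometheus_response() is referenced but not shown in these examples. A minimal sketch against the documented shape of a Prometheus instant-query response; the real helper may handle more result types:

def extract_value_from_prometheus_response(query, response, details):
    # An instant query returns {'status': ..., 'data': {'result': [...]}};
    # for a vector result each entry carries 'value': [timestamp, "value"].
    result = response['data']['result']
    if not result:
        raise ValueError('Empty result for query "{0}"'.format(query))
    return result[0]['value'][1]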
Example #8
def remove_node(endpoint, id):
    log = logging.getLogger("pk_k8s")
    if pk_config.dryrun_get(dryrun_id):
        log.info("(M)   DRYRUN enabled. Skipping...")
        return

    try:
        query = pykube.Node.objects(kube).filter(
            field_selector={"metadata.name": id})
        node = [x for x in query][0]
        node.reload()
        node.delete()
    except Exception:
        log.error("(M)   => Removing k8s node failed.")
    return
Example #9
def query_number_of_worker_nodes(config, worker_name):
    """
    Return the number of instances of a worker node, pulled from tfstate
    """
    log = logging.getLogger("pk_terraform")  # logger name assumed; the snippet omits its module-level definition
    instances = 1
    if pk_config.dryrun_get(dryrun_id):
        log.info("(C)   DRYRUN enabled. Skipping...")
        return instances
    try:
        resources = _get_resources_from_state(config, worker_name)
        instances = len(resources[0]["instances"])
    except Exception:
        log.error("Failed to get no. of instances for {}".format(worker_name))
    log.debug("-->instances: {0}".format(instances))
    return instances
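_get_resources_from_state() is not shown; the indexing above follows Terraform's state schema, where each entry under "resources" carries an "instances" list. A hypothetical filtered result:

resources = [
    {
        'type': 'aws_instance',  # illustrative resource type
        'name': 'micado-worker',
        'instances': [
            {'attributes': {'id': 'i-0aaa'}},
            {'attributes': {'id': 'i-0bbb'}},
        ],
    }
]
len(resources[0]['instances'])  # -> 2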
Example #10
def scale_k8s_deploy(endpoint, service_name, replicas):
    service_name = '-'.join(service_name.split('_')[1:])
    log = logging.getLogger('pk_k8s')
    log.info('(S)   => m_container_count: {0}'.format(replicas))
    if pk_config.dryrun_get(dryrun_id):
        log.info('(S)   DRYRUN enabled. Skipping...')
        return
    kubernetes.config.load_kube_config()
    # extensions/v1beta1 Deployments are deprecated; current clusters serve them from apps/v1 (AppsV1Api)
    client = kubernetes.client.ExtensionsV1beta1Api()
    try:
        dep = client.read_namespaced_deployment(service_name, "default")
        dep.spec.replicas = replicas
        client.patch_namespaced_deployment_scale(service_name, "default", dep)
    except Exception as e:
        log.warning('(S) Scaling of k8s service "{0}" failed: {1}'.format(
            service_name, str(e)))
    return
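The first line of scale_k8s_deploy() strips the stack prefix from the service name and re-joins the remaining parts with hyphens; for illustration (the name is hypothetical):

service_name = 'stack_front_end'       # hypothetical compose-style name
'-'.join(service_name.split('_')[1:])  # -> 'front-end'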
Example #11
def deploy_alerts_under_prometheus(rules_directory, alerts, stack):
    log = logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(C)   DRYRUN enabled. Skipping...')
        return
    if not alerts:
        return
    try:
        content = {'groups': [{'name': 'micado', 'rules': []}]}
        for alert in alerts:
            content['groups'][0]['rules'].append(dict(alert))
        rule_file = os.path.join(rules_directory, stack + '.rules')
        with open(rule_file, 'w') as outfile:
            yaml.round_trip_dump(content, outfile, default_flow_style=False)
    except Exception:
        log.exception('Deploying alerts under Prometheus failed:')
    return
Example #12
def scale_k8s_deploy(endpoint, service_name, replicas):
    service_name = "-".join(service_name.split("_")[1:])
    log = logging.getLogger("pk_k8s")
    log.info("(S)   => m_container_count: {0}".format(replicas))
    if pk_config.dryrun_get(dryrun_id):
        log.info("(S)   DRYRUN enabled. Skipping...")
        return

    try:
        query = pykube.Deployment.objects(kube).filter(
            field_selector={"metadata.name": service_name})
        deployment = [x for x in query][0]
        deployment.reload()
        deployment.scale(replicas)
    except Exception as e:
        log.warning('(S) Scaling of k8s service "{0}" failed: {1}'.format(
            service_name, str(e)))
    return
Example #13
def calling_rest_api_init():
  global m_opt_accessible
  log=logging.getLogger('pk_optimizer')
  config = pk_config.config()
  if pk_config.dryrun_get(dryrun_id):
    log.info('(O)   DRYRUN enabled. Skipping...')
    return
  url = config.get('optimizer_endpoint')+'/init'
  log.debug('(O) Calling optimizer REST API init() method: '+url)
  try:
    response = requests.post(url, data=yaml.dump(m_opt_init_params))
    m_opt_accessible = True
  except Exception as e:
    m_opt_accessible = False
    log.exception('(O) Calling optimizer REST API init() method raised exception: ')
    log.info('(O) WARNING: Optimizer is disabled for the current policy.')
    return
  log.debug('(O) Response: '+str(response))
  return
Example #14
def query_k8s_replicas(endpoint, service_name):
    service_name = '-'.join(service_name.split('_')[1:])
    log = logging.getLogger('pk_k8s')
    instance = 1
    if pk_config.dryrun_get(dryrun_id):
        log.info('(I)   DRYRUN enabled. Skipping...')
        return instance
    kubernetes.config.load_kube_config()
    # extensions/v1beta1 Deployments are deprecated; current clusters serve them from apps/v1 (AppsV1Api)
    client = kubernetes.client.ExtensionsV1beta1Api()
    try:
        dep = client.read_namespaced_deployment(service_name, "default")
        instance = dep.spec.replicas  # store the live replica count so it is returned below
        log.debug('(I)   => m_container_count for {0}: {1}'.format(
            service_name, instance))
    except Exception as e:
        log.warning(
            '(Q) Querying k8s service "{0}" replicas failed: {1}'.format(
                service_name, str(e)))
    return instance
Example #15
def query_k8s_replicas(endpoint, service_name):
    service_name = "-".join(service_name.split("_")[1:])
    log = logging.getLogger("pk_k8s")
    instance = 1
    if pk_config.dryrun_get(dryrun_id):
        log.info("(I)   DRYRUN enabled. Skipping...")
        return instance

    try:
        query = pykube.Deployment.objects(kube).filter(
            field_selector={"metadata.name": service_name})
        deployment = [x for x in query][0]
        deployment.reload()
        instance = deployment.replicas
        log.debug("(I)   => m_container_count for {0}: {1}".format(
            service_name, instance))
    except Exception as e:
        log.warning(
            '(Q) Querying k8s service "{0}" replicas failed: {1}'.format(
                service_name, str(e)))
    return instance
Example #16
def query_list_of_nodes(endpoint, worker_name='micado-worker', status='ready'):
    log = logging.getLogger('pk_k8s')
    list_of_nodes = []
    if pk_config.dryrun_get(dryrun_id):
        log.info('(I)   DRYRUN enabled. Skipping...')
        a = {}
        a['ID'] = 'dummyID'
        a['Addr'] = '127.0.0.1'
        list_of_nodes.append(a.copy())
        return list_of_nodes
    kubernetes.config.load_kube_config()
    client = kubernetes.client.CoreV1Api()
    try:
        nodes = [
            x for x in client.list_node().items
            if MASTER not in x.metadata.labels
        ]
        if status == 'ready':
            nodes = [
                x for x in nodes
                if NOTREADY not in [y.key for y in x.spec.taints or []]
            ]
            nodes = [
                x for x in nodes
                if x.metadata.labels.get('micado.eu/node_type') == worker_name
            ]
        elif status == 'down':
            nodes = [
                x for x in nodes
                if NOTREADY in [y.key for y in x.spec.taints or []]
            ]
        for n in nodes:
            a = {}
            a['ID'] = n.metadata.name
            a['Addr'] = n.status.addresses[0].address
            list_of_nodes.append(a.copy())
        return list_of_nodes
    except Exception:
        log.exception('(Q) Query of k8s nodes failed.')
        return list()  # keep the return type consistent with the success path
Example #17
def collect_init_params_and_variables(policy):
    log = logging.getLogger('pk_optimizer')
    config = pk_config.config()
    if pk_config.dryrun_get(dryrun_id):
        log.info('(O)   DRYRUN enabled. Skipping...')
        return
    reset_variables()
    m_opt_init_params['constants'] = dict()
    for varname, value in policy.get('data', dict()).get('constants',
                                                         dict()).items():
        retvarname = varname_if_init(varname)
        if retvarname:
            log.info('(O)   => INIT: {0}:{1}'.format(retvarname, value))
            m_opt_init_params['constants'][retvarname] = value
    m_opt_init_params['constants']['input_metrics'] = list()
    for varname, query in policy.get('data', dict()).get('queries',
                                                         dict()).items():
        retvarname = varname_if_input(varname)
        if retvarname:
            log.info('(O)   => INPUT: {0}:{1}'.format(retvarname, query))
            m_opt_init_params['constants']['input_metrics'].append(
                dict(name=retvarname))
            m_opt_variables.append(
                dict(lname=varname, sname=retvarname, query=query))
    m_opt_init_params['constants']['target_metrics'] = list()
    for varname, query in policy.get('data', dict()).get('queries',
                                                         dict()).items():
        if check_if_target(varname):
            insert_target_structure(m_opt_init_params, varname, query)
    for onenode in policy.get('scaling', dict()).get('nodes', []):
        if 'm_opt_advice' in onenode.get('scaling_rule', ''):
            _, omin, omax = limit_instances(None, onenode.get('min_instances'),
                                            onenode.get('max_instances'))
            m_opt_init_params['constants']['min_vm_number'] = omin
            m_opt_init_params['constants']['max_vm_number'] = omax
    log.debug('(O) m_opt_init_params (yaml) => {0}'.format(
        yaml.dump(m_opt_init_params)))
    log.debug('(O) m_opt_variables (yaml) => {0}'.format(
        yaml.dump(m_opt_variables)))
    return
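For reference, a sketch of the structure this function builds up; the values are hypothetical and target_metrics is populated by insert_target_structure(), which is not shown:

m_opt_init_params = {
    'constants': {
        'example_constant': 42,                   # hypothetical constant
        'input_metrics': [{'name': 'cpu_load'}],  # hypothetical metric name
        'target_metrics': [],                     # filled by insert_target_structure()
        'min_vm_number': 1,
        'max_vm_number': 10,
    }
}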
Example #18
def query_list_of_nodes(endpoint, worker_name="micado-worker", status="ready"):
    log = logging.getLogger("pk_k8s")
    list_of_nodes = []
    if pk_config.dryrun_get(dryrun_id):
        log.info("(I)   DRYRUN enabled. Skipping...")
        a = {}
        a["ID"] = "dummyID"
        a["Addr"] = "127.0.0.1"
        list_of_nodes.append(a.copy())
        return list_of_nodes

    try:
        if status == "ready":
            query = pykube.Node.objects(kube).filter(
                selector={"micado.eu/node_type__in": {worker_name}})
            nodes = [x for x in query if "taints" not in x.obj["spec"]]
        elif status == "down":
            nodes = []
            worker_nodes = [
                x for x in pykube.Node.objects(kube) if MASTER not in x.labels
            ]
            for node in worker_nodes:
                ready_condition = [
                    x.items() for x in node.obj["status"]["conditions"]
                    if x.get("type") == "Ready"
                ][0]
                if ("status", "Unknown") in ready_condition:
                    nodes.append(node)
        for n in nodes:
            a = {}
            n.reload()
            a["ID"] = n.metadata["name"]
            a["Addr"] = n.obj["status"]["addresses"][0]["address"]
            list_of_nodes.append(a.copy())
        return list_of_nodes
    except Exception:
        log.exception("(Q) Query of k8s nodes failed.")
        return list()  # keep the return type consistent with the success path
Example #19
def remove_exporters_from_prometheus_config(template_file, config_file):
    log = logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(C)   DRYRUN enabled. Skipping...')
        return
    shutil.copyfile(template_file, config_file)
Example #20
def evaluate_data_queries_and_alerts_for_nodes(endpoint, policy, node):
    log = logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        log.info(
            '(Q)   DRYRUN enabled. Assigning queries as values to metrics...')
    queries, alerts = dict(), dict()
    if 'data' not in policy:
        policy['data'] = {}
    if 'query_results' not in policy['data']:
        policy['data']['query_results'] = dict()
    scaling_rule_str = node.get('scaling_rule', '')
    for param, query in policy.get('data', dict()).get('queries',
                                                       dict()).items():
        try:
            if param.find('m_opt') != -1 or \
               (scaling_rule_str is not None and \
               scaling_rule_str.find(param) != -1):
                if pk_config.dryrun_get(dryrun_id) or \
                   param.startswith("m_opt_target_minth_") or \
                   param.startswith("m_opt_target_maxth_"):
                    #TODO: handle dummy value more appropriately
                    policy['data']['query_results'][param] = query
                    queries[param] = query
                else:
                    if isinstance(query, list):
                        response = requests.get(endpoint +
                                                "/api/v1/query?query=" +
                                                query[0]).json()
                        log.debug('Prometheus response query "{0}":{1}'.format(
                            query[0], response))
                        val = extract_value_from_prometheus_response(
                            query, response, dict())
                        policy['data']['query_results'][param] = val
                        queries[param] = val
                    else:
                        response = requests.get(endpoint +
                                                "/api/v1/query?query=" +
                                                query).json()
                        log.debug('Prometheus response query "{0}":{1}'.format(
                            query, response))
                        val = extract_value_from_prometheus_response(
                            query, response, dict())
                        policy['data']['query_results'][param] = float(val)
                        queries[param] = float(val)
        except Exception as e:
            policy['data']['query_results'][param] = None
            queries[param] = None
            log.warning(
                'Evaluating expression for query "{0}" failed: {1}'.format(
                    param, e))
    policy['data']['alert_results'] = {}
    for item in policy.get('data', dict()).get('alerts', dict()):
        attrname = item['alert']
        if scaling_rule_str is not None and scaling_rule_str.find(
                attrname) != -1:
            if alerts_query(attrname) is not None:
                policy['data']['alert_results'][attrname] = True
                alerts[attrname] = True
            else:
                policy['data']['alert_results'][attrname] = False
                alerts[attrname] = False
    return queries, alerts
Example #21
def add_exporters_to_prometheus_config(policy, template_file, config_file):
    log = logging.getLogger('pk_prometheus')
    try:
        config_content = dict()
        if pk_config.dryrun_get(dryrun_id):
            log.info('(C)   DRYRUN enabled. Skipping...')
            return
        shutil.copy(config_file, template_file)
        with open(template_file, 'r') as f:
            config_content = yaml.round_trip_load(f)
        if 'scrape_configs' not in config_content:
            config_content['scrape_configs'] = []
        #Find proper scrape_config or create
        scrape_config = [
            x for x in config_content['scrape_configs']
            if x.get('job_name', '') == 'micado' and 'static_configs' in x
        ]
        if not scrape_config:
            config_content['scrape_configs'].append({
                'job_name': 'micado',
                'static_configs': []
            })
            scrape_config = [
                x for x in config_content['scrape_configs']
                if x.get('job_name', '') == 'micado' and 'static_configs' in x
            ][0]
        else:
            scrape_config = scrape_config[0]
        #Find proper static_config or create
        static_config = [
            x for x in scrape_config['static_configs']
            if 'targets' in x.keys()
        ]
        if not static_config:
            scrape_config['static_configs'].append({'targets': []})
            static_config = [
                x for x in scrape_config['static_configs']
                if 'targets' in x.keys()
            ][0]
        else:
            static_config = static_config[0]

        config_changed = False
        for exporter_endpoint in policy.get('data',
                                            dict()).get('sources', dict()):
            if exporter_endpoint not in static_config['targets']:
                exp = exporter_endpoint.split(':')
                if len(exp) == 1:
                    continue
                elif '.' not in exp[0]:
                    kube_job = [
                        x for x in config_content['scrape_configs']
                        if x.get('job_name') == 'kube-services'
                    ]
                    if not kube_job:
                        continue
                    relabel = kube_job[0].get('relabel_configs', [])
                    old_label = [
                        x for x in relabel if x.get('action') == 'keep'
                    ]
                    if old_label:
                        old_label = old_label[0]
                        old_regex = old_label.get('regex')
                        new_regex = '{}|{}:{}'.format(old_regex, exp[0],
                                                      exp[1])
                        old_label['regex'] = new_regex
                    else:
                        label = {
                            'source_labels': ['endpoint'],
                            'action': 'keep',
                            'regex': '(^a)|{}:{}'.format(exp[0], exp[1])
                        }
                        relabel.append(label)
                else:
                    static_config['targets'].append(exporter_endpoint)
                config_changed = True
                log.info('(C)   => exporter "{0}" added to config'.format(
                    exporter_endpoint))
            else:
                log.info(
                    '(C)   => exporter "{0}" skipped, already part of config'.
                    format(exporter_endpoint))

        if config_changed:
            with open(config_file, 'w') as outfile:
                yaml.round_trip_dump(config_content,
                                     outfile,
                                     default_flow_style=False)

    except Exception:
        log.exception('Adding exporters to prometheus config failed:')

    return
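For reference, the scrape configuration that add_exporters_to_prometheus_config() maintains, shown as the Python dict produced by the YAML round-trip; the exporter endpoint is hypothetical:

config_content = {
    'scrape_configs': [
        {
            'job_name': 'micado',
            'static_configs': [
                {'targets': ['10.0.0.5:9100']},  # hypothetical exporter endpoint
            ],
        },
        # exporters given without a dotted hostname are instead whitelisted via a
        # 'keep' relabel rule on the 'kube-services' job (see the code above)
    ]
}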