Exemplo n.º 1
0
    def _set_autoscaling(self):
        """
        Store the cluster's autoscaling group name in ``self._replacing["ASG_NAME"]``.

        The variable ASG is tried first, then the spot ASG, then the on-demand ASG;
        the first truthy result is used.

        :raises AXPlatformException: if none of the lookups returns an ASG, or the
                                     ASG's "AutoScalingGroupName" value is None
        """
        manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile)

        # Fall through the candidates in priority order, stopping at the first hit.
        group = manager.get_variable_asg()
        if not group:
            group = manager.get_spot_asg()
        if not group:
            group = manager.get_on_demand_asg()
        if not group:
            raise AXPlatformException("Failed to get autoscaling group for cluster {}".format(self._cluster_name_id))

        group_name = group["AutoScalingGroupName"]
        if group_name is None:
            logger.error("Autoscaling group name not found for %s", self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

        self._replacing["ASG_NAME"] = group_name
Exemplo n.º 2
0
 def asgs_to_option(self, asgs):
     """
     Returns the config option based on the names of the ASGs.

     :param asgs: collection of ASG names, or None
     :return: SpotInstanceOption.NO_SPOT when ``asgs`` is None or empty,
              SpotInstanceOption.ALL_SPOT when ``asgs`` contains exactly the names
              returned by ``AXUserASGManager.get_all_asg_names()`` (order and
              duplicates are ignored), SpotInstanceOption.PARTIAL_SPOT otherwise
     """
     if not asgs:
         return SpotInstanceOption.NO_SPOT

     # Only build the manager and list the cluster's ASGs once we know the
     # answer actually depends on them.
     asg_manager = AXUserASGManager(self._cluster_name_id, self._region)
     if set(asgs) == set(asg_manager.get_all_asg_names()):
         return SpotInstanceOption.ALL_SPOT
     return SpotInstanceOption.PARTIAL_SPOT
Exemplo n.º 3
0
 def option_to_asgs(self, option):
     """
     Translate a spot instance config option into a list of ASG names.

     :param option: must be a member of SpotInstanceOption.VALID_SPOT_INSTANCE_OPTIONS
                    (checked with an assert)
     :return: every ASG name for ALL_SPOT, an empty list for NO_SPOT, and a
              one-element list holding the variable ASG's name for any other option
     """
     assert option in SpotInstanceOption.VALID_SPOT_INSTANCE_OPTIONS, \
         "{} is not a valid spot instance option".format(option)

     manager = AXUserASGManager(self._cluster_name_id, self._region)

     if option == SpotInstanceOption.ALL_SPOT:
         return manager.get_all_asg_names()
     if option == SpotInstanceOption.NO_SPOT:
         return []

     # Remaining option: only the variable ASG is reported.
     variable_asg = manager.get_variable_asg()
     return [variable_asg["AutoScalingGroupName"]]
Exemplo n.º 4
0
Arquivo: rest.py Projeto: nuaays/argo
def put_spot_instance_config():
    """
    Forward a spot instance configuration change to the minion-manager.

    Reads two optional request arguments:
      - 'enabled': a boolean, or a string that is treated as true only when it
        equals "true" case-insensitively.
      - 'spot_instances_option': when present, it is converted to ASG names via
        SpotInstanceOptionManager.option_to_asgs() and sent as a space-separated
        'asgs' value. SpotInstanceOption.NO_SPOT additionally forces 'enabled'
        to "False".

    The resulting payload is sent as query parameters of a PUT to the
    minion-manager's /spot_instance_config endpoint.

    :return: JSON response {"status": "ok"}
    :raises AXIllegalOperationException: if the cluster is a user provided cluster
    :raises ValueError: if 'enabled' is neither a boolean nor a string
    :raises requests.HTTPError: if the minion-manager answers with an error status
                                (via ``raise_for_status()``)
    """
    if AXClusterConfig().get_cluster_provider().is_user_cluster():
        raise AXIllegalOperationException("Spot instances not allowed on user provided K8S clusters.")

    (data,) = _get_optional_arguments('enabled')
    if isinstance(data, bool):
        enabled_str = str(data)
    elif isinstance(data, string_types):
        enabled_str = "True" if data.lower() == "true" else "False"
    else:
        raise ValueError("enabled must be string or boolean")
    payload = {'enabled': enabled_str}

    # Get "spot_instances_option" option
    (option,) = _get_optional_arguments('spot_instances_option')
    if option is not None:
        spot_option_mgr = SpotInstanceOptionManager(
            cluster_name_id, AXClusterConfig().get_region())
        payload['asgs'] = " ".join(spot_option_mgr.option_to_asgs(option))

        if option == SpotInstanceOption.NO_SPOT:
            # No ASG may use spot instances, so the feature is switched off
            # regardless of what 'enabled' asked for.
            _app.logger.info("ASGS passed in a \"none\". Disabling minion-manager.")
            enabled_str = "False"
            payload['enabled'] = enabled_str

    # NOTE(review): no timeout is passed, so this call can block for as long as
    # the minion-manager keeps the connection open.
    url = MINION_MANAGER_HOSTNAME + ":" + MINION_MANAGER_PORT + "/spot_instance_config"
    response = requests.put(url, params=payload)
    response.raise_for_status()

    _app.logger.info("Change in Spot instance config: {}".format(enabled_str))
    return jsonify({"status": "ok"})
Exemplo n.º 5
0
    def modify_asg(self, min, max):
        """
        Apply new size limits to the cluster's variable autoscaling group.

        NOTE(review): ``min`` is accepted but never used; the minimum size sent
        to the ASG is always 1. Confirm whether that is intentional.

        :param min: requested minimum size (currently ignored, see note above)
        :param max: maximum size to set on the variable ASG
        :raises AXPlatformException: if the variable ASG cannot be found, or if
                                     updating it raises a ClientError
        """
        logger.info("Modifying autoscaling group ...")

        manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile)

        variable_asg = manager.get_variable_asg()
        if not variable_asg:
            not_found_msg = "Failed to get variable autoscaling group for cluster {}".format(self._cluster_name_id)
            raise AXPlatformException(not_found_msg)

        variable_asg_name = variable_asg["AutoScalingGroupName"]
        try:
            manager.set_asg_spec(name=variable_asg_name, minsize=1, maxsize=max)
        except ClientError as err:
            update_failed_msg = "Failed to set cluster's variable autoscaling group min/max. Error: {}".format(err)
            raise AXPlatformException(update_failed_msg)

        logger.info("Modifying cluster autoscaling group ... DONE")
Exemplo n.º 6
0
    def _recover_auto_scaling_groups(self):
        """
        Bring autoscaling groups back to the sizes recorded before the cluster was paused.

        When a saved cluster status is found:
            - every ASG listed in it gets its recorded min/max/desired sizes re-applied,
            - we wait for the ASGs to reach their desired state,
            - the saved status is deleted.
        When no saved status is found, nothing is resized; this can happen when a
        previous restart failed after this stage, or when the cluster was never paused.

        In both cases ``self._total_nodes`` is increased by the desired capacity of
        each ASG that was looked at.
        :return:
        """
        logger.info("Fetching last cluster status ...")
        saved_status_raw = self._cluster_info.download_cluster_status_before_pause()

        asg_mgr = AXUserASGManager(cluster_name_id=self._name_id,
                                   aws_profile=self._cfg.cloud_profile,
                                   region=self._cluster_config.get_region())

        if not saved_status_raw:
            # Nothing persisted: just count what is currently configured.
            for group in asg_mgr.get_all_asgs():
                self._total_nodes += group["DesiredCapacity"]

            logger.info(
                "Cannot find last cluster status, cluster already resumed with %s nodes",
                self._total_nodes)
            return

        logger.info("Found last cluster status, restoring cluster ...")
        # NOTE(review): yaml.load() without an explicit Loader is deprecated in
        # newer PyYAML releases and can construct arbitrary objects. Moving to
        # safe_load needs the writer side checked as well - confirm before changing.
        saved_status = yaml.load(saved_status_raw)

        # Restore minions
        for asg_name, recorded in saved_status["asg_status"].items():
            min_size = recorded["min_size"]
            max_size = recorded["max_size"]
            desired = recorded["desired_capacity"]
            self._total_nodes += desired
            logger.info(
                "Recovering autoscaling group %s. Min: %s, Max: %s, Desired: %s",
                asg_name, min_size, max_size, desired)
            asg_mgr.set_asg_spec(name=asg_name,
                                 minsize=min_size,
                                 maxsize=max_size,
                                 desired=desired)

        logger.info("Waiting for all auto scaling groups to scale up ...")
        asg_mgr.wait_for_desired_asg_state()
        logger.info("%sAll cluster instances are in service%s",
                    COLOR_GREEN, COLOR_NORM)

        # The snapshot has been applied; remove it so it is not replayed.
        self._cluster_info.delete_cluster_status_before_pause()
Exemplo n.º 7
0
    def _scale_down_auto_scaling_groups(self):
        """
        Pause the cluster's nodes by shrinking every autoscaling group to zero.

        Steps:
            - record each ASG's min size, max size and desired capacity and upload
              that snapshot through ``self._cluster_info``,
            - set min and max size of every ASG to 0,
            - wait until the ASGs report their desired state.
        :return:
        """
        logger.info("Discovering autoscaling groups")
        asg_mgr = AXUserASGManager(cluster_name_id=self._name_id,
                                   aws_profile=self._cfg.cloud_profile,
                                   region=self._cluster_config.get_region())
        groups = asg_mgr.get_all_asgs()

        # Snapshot of the current sizes, keyed by ASG name. It is what a later
        # restart reads to bring back the same number of nodes.
        size_snapshot = {
            group["AutoScalingGroupName"]: {
                "min_size": group["MinSize"],
                "max_size": group["MaxSize"],
                "desired_capacity": group["DesiredCapacity"]
            }
            for group in groups
        }
        serialized_status = yaml.dump({"asg_status": size_snapshot})
        self._cluster_info.upload_cluster_status_before_pause(status=serialized_status)

        logger.info("Scaling down autoscaling groups ...")
        for group in groups:
            asg_mgr.set_asg_spec(name=group["AutoScalingGroupName"], minsize=0, maxsize=0)

        logger.info("Waiting for all auto scaling groups to scale down ...")
        asg_mgr.wait_for_desired_asg_state()
        logger.info("%sAll cluster nodes are terminated%s", COLOR_GREEN,
                    COLOR_NORM)
Exemplo n.º 8
0
    def _generate_replacing(self):
        """
        Build the dictionary of macro names to replacement values.

        Besides collecting values from the cluster config, software info and the
        environment, this method:
            - sums the resource usage of all kube objects and passes the per-node
              daemon reservation to ``self._persist_node_resource_rsvp``,
            - derives the node scale down utilization threshold from that reservation,
            - reads the cluster install version from /kubernetes/cluster/version.txt,
            - looks up the autoscaling group names needed by the autoscaler and
              the minion-manager.

        :return: dict mapping macro name to its replacement value
        :raises AXPlatformException: if no autoscaling group (or no name for it) can
                                     be found, or if the spot option is PARTIAL_SPOT
                                     and the variable autoscaling group is missing
        """
        # Platform code are running in python 2.7, and therefore for trusted cidr list, the str() method
        # will return something like [u'54.149.149.230/32', u'73.70.250.25/32', u'104.10.248.90/32'], and
        # this 'u' prefix cannot be suppressed. With this prefix, our macro replacing would create invalid
        # yaml files, and therefore we construct string manually here
        trusted_cidr = self._cluster_config.get_trusted_cidr()
        if isinstance(trusted_cidr, list):
            # join() also renders an empty list as "[]"; slicing off a trailing
            # comma by hand used to eat the opening bracket in that case.
            trusted_cidr_str = "[{}]".format(
                ",".join("\"{}\"".format(str(cidr)) for cidr in trusted_cidr))
        else:
            trusted_cidr_str = "[{}]".format(trusted_cidr)

        axsys_cpu = 0
        axsys_mem = 0
        daemon_cpu = 0
        daemon_mem = 0
        for kube_object in self._kube_objects.values():
            cpu, mem, dcpu, dmem = kube_object.resource_usage
            axsys_cpu += cpu
            axsys_mem += mem
            daemon_cpu += dcpu
            daemon_mem += dmem

        # kube-proxy (100m CPU and 100Mi memory. Note kube-proxy does not
        # have a memory request, but this is an approximation)
        daemon_cpu += 100
        daemon_mem += 100

        logger.info(
            "Resource Usages: axsys_cpu: %s milicores, axsys_mem: %s Mi, node_daemon_cpu: %s milicores, node_daemon_mem: %s Mi",
            axsys_cpu, axsys_mem, daemon_cpu, daemon_mem)

        # User node counts are the configured cluster counts minus the axsys nodes.
        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = str(
            int(self._cluster_config.get_min_node_count()) - axsys_node_count)
        axuser_max_count = str(
            int(self._cluster_config.get_max_node_count()) - axsys_node_count)
        autoscaler_scan_interval = str(
            self._cluster_config.get_autoscaler_scan_interval())

        # Fraction of a user node taken by per-node daemons; the larger of the
        # CPU and memory fractions (rounded, plus a small margin) is the threshold.
        user_node_params = EC2_PARAMS[self._cluster_config.get_axuser_node_type()]
        usr_node_cpu_rsvp = float(daemon_cpu) / user_node_params["cpu"]
        usr_node_mem_rsvp = float(daemon_mem) / user_node_params["memory"]
        scale_down_util_thresh = round(
            max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001
        logger.info("Setting node scale down utilization threshold to %s",
                    scale_down_util_thresh)

        self._persist_node_resource_rsvp(daemon_cpu, daemon_mem)

        with open("/kubernetes/cluster/version.txt", "r") as f:
            cluster_install_version = f.read().strip()

        # Prepare autoscaler
        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)
        asg = (asg_manager.get_variable_asg()
               or asg_manager.get_spot_asg()
               or asg_manager.get_on_demand_asg())
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]

        if not asg_name:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

        # Prepare minion-manager.
        spot_instances_option = self._cluster_config.get_spot_instances_option()
        minion_manager_asgs = ""
        if spot_instances_option == SpotInstanceOption.ALL_SPOT:
            minion_manager_asgs = " ".join(
                group["AutoScalingGroupName"]
                for group in asg_manager.get_all_asgs())
        elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT:
            variable_asg = asg_manager.get_variable_asg()
            if not variable_asg:
                # Fail with a clear message instead of a TypeError from
                # subscripting a missing ASG.
                raise AXPlatformException(
                    "Failed to get variable autoscaling group for cluster {}".format(
                        self._cluster_name_id))
            minion_manager_asgs = variable_asg["AutoScalingGroupName"]

        return {
            "REGISTRY": self._software_info.registry,
            "REGISTRY_SECRETS": self._software_info.registry_secrets,
            "NAMESPACE": self._software_info.image_namespace,
            "VERSION": self._software_info.image_version,
            "AX_CLUSTER_NAME_ID": self._cluster_name_id,
            "AX_AWS_REGION": self._region,
            "AX_AWS_ACCOUNT": self._account,
            "AX_CUSTOMER_ID": AXCustomerId().get_customer_id(),
            "TRUSTED_CIDR": trusted_cidr_str,
            "NEW_KUBE_SALT_SHA1": os.getenv("NEW_KUBE_SALT_SHA1") or " ",
            "NEW_KUBE_SERVER_SHA1": os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
            "AX_KUBE_VERSION": os.getenv("AX_KUBE_VERSION"),
            "AX_CLUSTER_INSTALL_VERSION": cluster_install_version,
            "SANDBOX_ENABLED": str(self._cluster_config.get_sandbox_flag()),
            "ARGO_LOG_BUCKET_NAME": self._cluster_config.get_support_object_store_name(),
            "ASG_MIN": axuser_min_count,
            "ASG_MAX": axuser_max_count,
            "AUTOSCALER_SCAN_INTERVAL": autoscaler_scan_interval,
            "SCALE_DOWN_UTIL_THRESH": str(scale_down_util_thresh),
            "AX_CLUSTER_META_URL_V1": self._bucket.get_object_url_from_key(
                key=self._cluster_config_path.cluster_metadata()),
            "ASG_NAME": asg_name,
            "DNS_SERVER_IP": os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]),
            "AX_ENABLE_SPOT_INSTANCES": str(spot_instances_option != SpotInstanceOption.NO_SPOT),
            "AX_SPOT_INSTANCE_ASGS": minion_manager_asgs,
        }
Exemplo n.º 9
0
 def ax_asg_helper(self):
     """
     Run ``self.mock_setup()`` and return a new AXUserASGManager for
     ``self.cluster_name_id`` in region 'us-west-2'.
     """
     # The setup must run before the manager is constructed.
     self.mock_setup()
     manager = AXUserASGManager(self.cluster_name_id, 'us-west-2')
     return manager
Exemplo n.º 10
0
Arquivo: rest.py Projeto: zhan849/argo
from werkzeug.exceptions import BadRequest

# Flask application object for this module, registered under the name "AXmon".
_app = Flask("AXmon")
# Placeholder; not assigned anywhere in the visible part of this file.
# NOTE(review): presumably set during startup - confirm against the rest of the module.
axmon = None

# RLock guarding the counter of in-flight requests, plus the counter itself
# and the upper bound it is meant to be compared against.
concurrent_reqs_lock = RLock()
concurrent_reqs = 0
MAX_CONCURRENT_REQS = 100

# Base URL and port (kept as a string so it can be concatenated into a URL)
# of the minion-manager service.
MINION_MANAGER_HOSTNAME = "http://minion-manager.kube-system"
MINION_MANAGER_PORT = "6000"

# The objects below are created at import time, in this order.
kubectl = KubernetesApiClient(use_proxy=True)
# None when AX_CLUSTER_NAME_ID is not set in the environment.
cluster_name_id = os.getenv("AX_CLUSTER_NAME_ID", None)
# Reads the same environment variable as cluster_name_id above.
asg_manager = AXUserASGManager(os.getenv("AX_CLUSTER_NAME_ID"),
                               AXClusterConfig().get_region())

# Need a lock to serialize cluster config operation
cfg_lock = RLock()

# Metrics: request latency labelled by method/endpoint/status, and the number
# of concurrent requests.
axmon_api_latency_stats = Summary("axmon_api_latency", "Latency for axmon REST APIs",
                              ["method", "endpoint", "status"])
axmon_api_concurrent_reqs = Gauge("axmon_api_concurrent_reqs", "Concurrent requests in axmon")


def before_request():
    request.start_time = time.time()
    global concurrent_reqs, MAX_CONCURRENT_REQS, concurrent_reqs_lock
    with concurrent_reqs_lock:
        axmon_api_concurrent_reqs.set(concurrent_reqs)
        # Disabling concurrent request logic for now due to findings in AA-3167