예제 #1
0
 def test_get_asg_tag_fail(self):
     """get_asg_tag must hand back an empty dict for a key no tag carries."""
     # The only tag supplied is keyed 'eks-rolling-update:desired_capacity'.
     desired_capacity_tag = {
         'ResourceId': 'string',
         'ResourceType': 'string',
         'Key': 'eks-rolling-update:desired_capacity',
         'Value': '6',
         'PropagateAtLaunch': True,
     }

     # "foo" is not the 'Key' of any supplied tag.
     lookup_result = get_asg_tag([desired_capacity_tag], "foo")

     self.assertEqual(lookup_result, {})
def scale_up_asg(cluster_name, asg, count):
    """Scale an ASG up by ``count`` instances, recording capacities in tags.

    Scaling processes are resumed first, then the three capacity tags named
    in ``app_config`` are looked up on the ASG and one of four paths is taken:

    * capacity would not change and a desired-state tag value exists:
      return the tagged values, touching nothing.
    * capacity would not change and no such tag value exists: write all
      three tags from the ASG's current values and return those values.
    * capacity would change but a desired-state tag value exists: do not
      scale; validate cluster health against the tagged capacity and return
      the tagged values.
    * otherwise: write the tags, scale the ASG, sleep for
      ``app_config['CLUSTER_HEALTH_WAIT']`` and validate cluster health.

    Args:
        cluster_name: forwarded to ``count_all_cluster_instances``.
        asg: mapping providing 'MaxSize', 'DesiredCapacity', 'Tags' and
            'AutoScalingGroupName'.
        count: amount added to the ASG's current desired capacity.

    Returns:
        A 3-tuple ``(desired capacity, original desired capacity,
        original max size)``.

    Raises:
        Exception: when ``validate_cluster_health`` reports failure.
    """
    asg_old_max_size = asg['MaxSize']
    asg_old_desired_capacity = asg['DesiredCapacity']
    target_capacity = asg_old_desired_capacity + count
    existing_tags = asg['Tags']
    asg_name = asg['AutoScalingGroupName']

    # lift any suspension that may still be in place on the asg
    modify_aws_autoscaling(asg_name, "resume")

    desired_tag = get_asg_tag(
        existing_tags, app_config["ASG_DESIRED_STATE_TAG"])
    orig_capacity_tag = get_asg_tag(
        existing_tags, app_config["ASG_ORIG_CAPACITY_TAG"])
    orig_max_tag = get_asg_tag(
        existing_tags, app_config["ASG_ORIG_MAX_CAPACITY_TAG"])

    def tagged_capacities():
        # Values recorded in the three tags, converted in tuple order.
        return (
            int(desired_tag.get('Value')),
            int(orig_capacity_tag.get('Value')),
            int(orig_max_tag.get('Value')),
        )

    def write_capacity_tags(desired_value):
        # Original capacity and max size always come from the current ASG;
        # only the desired-state value varies between callers.
        save_asg_tags(asg_name, app_config["ASG_ORIG_CAPACITY_TAG"],
                      asg_old_desired_capacity)
        save_asg_tags(asg_name, app_config["ASG_DESIRED_STATE_TAG"],
                      desired_value)
        save_asg_tags(asg_name, app_config["ASG_ORIG_MAX_CAPACITY_TAG"],
                      asg_old_max_size)

    def require_healthy(expected_capacity, instance_count):
        # Abort the whole operation when the cluster health check fails.
        if not validate_cluster_health(asg_name, expected_capacity,
                                       instance_count):
            logger.info('Exiting since ASG healthcheck failed')
            raise Exception('ASG healthcheck failed')

    # Nothing to add: never scale, just make sure the tags exist.
    if target_capacity == asg_old_desired_capacity:
        logger.info(
            f'Desired and current capacity for {asg_name} are equal. Skipping ASG.'
        )
        if desired_tag.get('Value'):
            logger.info(
                'Found capacity tags on ASG from previous run. Leaving alone.')
            return tagged_capacities()

        write_capacity_tags(asg_old_desired_capacity)
        return (asg_old_desired_capacity, asg_old_desired_capacity,
                asg_old_max_size)

    # A desired-state tag value means the scale-up is not repeated.
    if desired_tag.get('Value'):
        logger.info(
            'Found previous desired capacity value tag set on asg from a previous run.'
        )
        logger.info(
            f'Maintaining previous capacity of {asg_old_desired_capacity} to not overscale.'
        )
        instance_count = count_all_cluster_instances(cluster_name)
        require_healthy(int(desired_tag.get('Value')), instance_count)
        return tagged_capacities()

    logger.info('No previous capacity value tags set on ASG; setting tags.')
    write_capacity_tags(target_capacity)

    # the max size is raised only when the new capacity would exceed it
    if target_capacity > asg_old_max_size:
        new_max_size = target_capacity
    else:
        new_max_size = asg_old_max_size
    scale_asg(asg_name, asg_old_desired_capacity, target_capacity,
              new_max_size)

    cluster_health_wait = app_config['CLUSTER_HEALTH_WAIT']
    logger.info(
        f'Waiting for {cluster_health_wait} seconds for ASG to scale before validating cluster health...'
    )
    time.sleep(cluster_health_wait)

    instance_count = count_all_cluster_instances(cluster_name)
    require_healthy(target_capacity, instance_count)

    return target_capacity, asg_old_desired_capacity, asg_old_max_size
def update_asgs(asgs, cluster_name):
    """Replace the outdated instances of each given autoscaling group.

    ASGs with no instance flagged by ``instance_outdated`` are skipped. For
    the others the function resumes scaling processes, determines the
    scale-up target (reusing the capacity tags when a desired-state tag
    value is present, otherwise writing new ones), scales up, sleeps for
    ``app_config['CLUSTER_HEALTH_WAIT']`` and validates cluster health. With
    autoscaling suspended it then drains, deletes, terminates and detaches
    every outdated instance, and finally resumes autoscaling and deletes
    both capacity tags.

    Args:
        asgs: iterable of mappings providing 'AutoScalingGroupName',
            'LaunchConfigurationName', 'MaxSize', 'Instances',
            'DesiredCapacity' and 'Tags'.
        cluster_name: forwarded to ``count_all_cluster_instances``.

    Raises:
        RollingUpdateException: when any step for an outdated instance
            raises. Autoscaling is not resumed by this function on that
            path; presumably the caller does so using the ASG name carried
            by the exception -- TODO confirm.
        Exception: when ``validate_cluster_health`` reports failure.
    """
    for asg in asgs:
        logger.info('\n')
        asg_name = asg['AutoScalingGroupName']
        logger.info(
            '****  Starting rolling update for autoscaling group {}  ****'.
            format(asg_name))
        launch_config_name = asg['LaunchConfigurationName']
        current_max_size = asg['MaxSize']
        members = asg['Instances']
        original_capacity = asg['DesiredCapacity']
        tags = asg['Tags']

        # collect the instances that instance_outdated flags
        stale_instances = [
            member for member in members
            if instance_outdated(member, launch_config_name)
        ]
        logger.info('Found {} outdated instances'.format(
            len(stale_instances)))

        # nothing to roll in this asg
        if not stale_instances:
            continue

        # lift any suspension that may still be in place on the asg
        modify_aws_autoscaling(asg_name, "resume")

        # a desired-state tag value is treated as left by a previous run
        desired_tag = get_asg_tag(tags, app_config["ASG_DESIRED_STATE_TAG"])
        previous_target = desired_tag.get('Value')
        if previous_target:
            logger.info(
                'Found previous desired capacity value tag set on asg from a previous run. Value: {}'
                .format(previous_target))
            logger.info('Maintaining previous capacity to not overscale')
            target_capacity = int(previous_target)

            original_tag = get_asg_tag(tags,
                                       app_config["ASG_ORIG_CAPACITY_TAG"])
            tagged_original = original_tag.get('Value')
            logger.info(
                'Maintaining original old capacity from a previous run so we can scale back down to original size of: {}'
                .format(tagged_original))
            original_capacity = int(tagged_original)
        else:
            logger.info('No previous capacity value tag set on asg')
            # record the capacity the asg had before this run
            logger.info('Setting original capacity on asg')
            save_asg_tags(asg_name, app_config["ASG_ORIG_CAPACITY_TAG"],
                          original_capacity)
            # one extra instance per outdated instance
            target_capacity = original_capacity + len(stale_instances)
            save_asg_tags(asg_name, app_config["ASG_DESIRED_STATE_TAG"],
                          target_capacity)

        # the max size is raised only when the new capacity would exceed it
        new_max_size = (target_capacity
                        if target_capacity > current_max_size
                        else current_max_size)

        # node list is fetched before the scale-up; it is used further down
        # to map each outdated instance id to its k8s node name
        k8s_nodes = get_k8s_nodes()

        scale_asg(asg_name, original_capacity, target_capacity, new_max_size)

        health_wait = app_config['CLUSTER_HEALTH_WAIT']
        logger.info(
            'Waiting for {} seconds for asg {} to scale before validating cluster health...'
            .format(health_wait, asg_name))
        time.sleep(app_config['CLUSTER_HEALTH_WAIT'])

        running_count = count_all_cluster_instances(cluster_name)

        # refuse to touch any instance when the cluster is unhealthy
        if not validate_cluster_health(asg_name, target_capacity,
                                       running_count):
            logger.info('Exiting since asg healthcheck failed')
            raise Exception('Asg healthcheck failed')

        # keep aws autoscaling from launching replacements while the
        # outdated instances are being terminated
        modify_aws_autoscaling(asg_name, "suspend")

        for stale in stale_instances:
            # any failure is logged and re-raised as RollingUpdateException
            try:
                instance_id = stale['InstanceId']
                # k8s operations need the node name, not the instance id
                node_name = get_node_by_instance_id(k8s_nodes, instance_id)
                drain_node(node_name)
                delete_node(node_name)

                terminate_instance(instance_id)
                if not instance_terminated(instance_id):
                    raise Exception(
                        'Instance is failing to terminate. Cancelling out.'
                    )

                detach_instance(instance_id, asg_name)
                if not instance_detached(instance_id):
                    raise Exception(
                        'Instance is failing to detach from ASG. Cancelling out.'
                    )
            except Exception as e:
                logger.info(e)
                raise RollingUpdateException(
                    "Rolling update on asg failed", asg_name)

        # every outdated instance is gone: resume autoscaling and clear tags
        modify_aws_autoscaling(asg_name, "resume")
        delete_asg_tags(asg_name, app_config["ASG_DESIRED_STATE_TAG"])
        delete_asg_tags(asg_name, app_config["ASG_ORIG_CAPACITY_TAG"])
        logger.info('*** Rolling update of asg {} is complete! ***'.format(
            asg_name))

    logger.info('All asgs processed')