def update_asgs(asgs, cluster_name):
    run_mode = app_config['RUN_MODE']
    asg_outdated_instance_dict = plan_asgs(asgs)
    asg_original_state_dict = {}

    if run_mode == 2:
        # Scale up all the ASGs with outdated nodes (by the number of outdated nodes)
        for asg_name, asg_tuple in asg_outdated_instance_dict.items():
            outdated_instances, asg = asg_tuple
            outdated_instance_count = len(outdated_instances)
            logger.info(f'Setting the scale of ASG {asg_name} based on {outdated_instance_count} outdated instances.')
            asg_original_state_dict[asg_name] = scale_up_asg(cluster_name, asg, outdated_instance_count)

    k8s_nodes = get_k8s_nodes()

    if (run_mode == 2) or (run_mode == 3):
        # Cordon every outdated node up front so no new pods are scheduled on them
        for asg_name, asg_tuple in asg_outdated_instance_dict.items():
            outdated_instances, asg = asg_tuple
            for outdated in outdated_instances:
                node_name = ""
                try:
                    # get the k8s node name instead of the instance id
                    node_name = get_node_by_instance_id(k8s_nodes, outdated['InstanceId'])
                    cordon_node(node_name)
                except Exception as cordon_exception:
                    logger.error(f"Encountered an error when cordoning node {node_name}")
                    logger.error(cordon_exception)
                    exit(1)

    # Drain, delete and terminate the outdated nodes and return the ASGs back to their original state
    for asg_name, asg_tuple in asg_outdated_instance_dict.items():
        outdated_instances, asg = asg_tuple
        outdated_instance_count = len(outdated_instances)

        if (run_mode == 1) or (run_mode == 3):
            logger.info(f'Setting the scale of ASG {asg_name} based on {outdated_instance_count} outdated instances.')
            asg_original_state_dict[asg_name] = scale_up_asg(cluster_name, asg, outdated_instance_count)

        if run_mode == 1:
            for outdated in outdated_instances:
                node_name = ""
                try:
                    # get the k8s node name instead of the instance id
                    node_name = get_node_by_instance_id(k8s_nodes, outdated['InstanceId'])
                    cordon_node(node_name)
                except Exception as cordon_exception:
                    logger.error(f"Encountered an error when cordoning node {node_name}")
                    logger.error(cordon_exception)
                    exit(1)

        if len(outdated_instances) != 0:
            # pause aws autoscaling so new instances don't try
            # to spawn while instances are being terminated
            modify_aws_autoscaling(asg_name, "suspend")

        # start draining and terminating
        for outdated in outdated_instances:
            # catch any failures so we can resume aws autoscaling
            try:
                # get the k8s node name instead of the instance id
                node_name = get_node_by_instance_id(k8s_nodes, outdated['InstanceId'])
                drain_node(node_name)
                delete_node(node_name)
                terminate_instance(outdated['InstanceId'])
                if not instance_terminated(outdated['InstanceId']):
                    raise Exception('Instance is failing to terminate. Cancelling out.')
                detach_instance(outdated['InstanceId'], asg_name)
                if app_config['ASG_WAIT_FOR_DETACHMENT'] and not instance_detached(outdated['InstanceId']):
                    raise Exception('Instance is failing to detach from ASG. Cancelling out.')
                between_nodes_wait = app_config['BETWEEN_NODES_WAIT']
                if between_nodes_wait != 0:
                    logger.info(f'Waiting for {between_nodes_wait} seconds before continuing...')
                    time.sleep(between_nodes_wait)
            except Exception as drain_exception:
                logger.info(drain_exception)
                raise RollingUpdateException("Rolling update on ASG failed", asg_name)

        # scale the asg back down to its original state
        logger.info("Scaling asg back down to original state")
        asg_desired_capacity, asg_orig_desired_capacity, asg_orig_max_capacity = asg_original_state_dict[asg_name]
        scale_asg(asg_name, asg_desired_capacity, asg_orig_desired_capacity, asg_orig_max_capacity)

        # resume aws autoscaling
        modify_aws_autoscaling(asg_name, "resume")

        # remove the aws tags used to track state across runs
        delete_asg_tags(asg_name, app_config["ASG_DESIRED_STATE_TAG"])
        delete_asg_tags(asg_name, app_config["ASG_ORIG_CAPACITY_TAG"])
        delete_asg_tags(asg_name, app_config["ASG_ORIG_MAX_CAPACITY_TAG"])
        logger.info(f'*** Rolling update of asg {asg_name} is complete! ***')

    logger.info('All asgs processed')
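# --- Hypothetical usage sketch (not part of the original module) ---
# A minimal driver showing how update_asgs above might be fed ASG descriptions,
# assuming they come straight from boto3's describe_auto_scaling_groups. The
# function name rolling_update_example and the example arguments are illustrative only.
import boto3

def rolling_update_example(cluster_name, asg_names):
    client = boto3.client('autoscaling')
    paginator = client.get_paginator('describe_auto_scaling_groups')
    asgs = []
    for page in paginator.paginate(AutoScalingGroupNames=asg_names):
        asgs.extend(page['AutoScalingGroups'])
    # hand the raw ASG dicts to the rolling-update routine
    update_asgs(asgs, cluster_name)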
def test_instance_terminated_fail(self):
    self.assertFalse(instance_terminated(self.instance_id, 2, 1))
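# --- Illustrative sketch, not the project's implementation ---
# The call instance_terminated(self.instance_id, 2, 1) in the test above suggests
# a signature of (instance_id, max_retry, wait_seconds) that polls EC2 until the
# instance reports the 'terminated' state. A minimal polling loop along those
# lines (signature inferred from the test) could look like this:
import time
import boto3

def instance_terminated_sketch(instance_id, max_retry, wait):
    ec2 = boto3.client('ec2')
    for _ in range(max_retry):
        response = ec2.describe_instances(InstanceIds=[instance_id])
        state = response['Reservations'][0]['Instances'][0]['State']['Name']
        if state == 'terminated':
            return True
        # not terminated yet; wait before the next poll
        time.sleep(wait)
    return False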
def update_asgs(asgs, cluster_name):
    for asg in asgs:
        logger.info('\n')
        logger.info('**** Starting rolling update for autoscaling group {} ****'.format(
            asg['AutoScalingGroupName']))
        asg_name = asg['AutoScalingGroupName']
        asg_lc_name = asg['LaunchConfigurationName']
        asg_old_max_size = asg['MaxSize']
        instances = asg['Instances']
        asg_old_desired_capacity = asg['DesiredCapacity']
        asg_tags = asg['Tags']

        # build a list of outdated instances
        outdated_instances = []
        for instance in instances:
            if instance_outdated(instance, asg_lc_name):
                outdated_instances.append(instance)
        logger.info('Found {} outdated instances'.format(len(outdated_instances)))

        # skip to the next asg if there are no outdated instances
        if len(outdated_instances) == 0:
            continue

        # remove any stale suspensions that may be present on the asg
        modify_aws_autoscaling(asg_name, "resume")

        # check for a previous-run tag on the asg
        asg_tag_desired_capacity = get_asg_tag(asg_tags, app_config["ASG_DESIRED_STATE_TAG"])
        if asg_tag_desired_capacity.get('Value'):
            logger.info('Found previous desired capacity value tag set on asg from a previous run. Value: {}'.format(
                asg_tag_desired_capacity.get('Value')))
            logger.info('Maintaining previous capacity to not overscale')
            asg_new_desired_capacity = int(asg_tag_desired_capacity.get('Value'))
            asg_tag_original_capacity = get_asg_tag(asg_tags, app_config["ASG_ORIG_CAPACITY_TAG"])
            logger.info('Maintaining original old capacity from a previous run so we can scale back down to original size of: {}'.format(
                asg_tag_original_capacity.get('Value')))
            asg_old_desired_capacity = int(asg_tag_original_capacity.get('Value'))
        else:
            logger.info('No previous capacity value tag set on asg')
            # save the original capacity to asg tags
            logger.info('Setting original capacity on asg')
            save_asg_tags(asg_name, app_config["ASG_ORIG_CAPACITY_TAG"], asg_old_desired_capacity)
            asg_new_desired_capacity = asg_old_desired_capacity + len(outdated_instances)
            # save the new capacity to asg tags
            save_asg_tags(asg_name, app_config["ASG_DESIRED_STATE_TAG"], asg_new_desired_capacity)

        # only change the max size if the new capacity is bigger than the current max
        if asg_new_desired_capacity > asg_old_max_size:
            asg_new_max_size = asg_new_desired_capacity
        else:
            # don't change the size
            asg_new_max_size = asg_old_max_size

        # get the number of k8s nodes before we scale; used later
        # to determine how many new nodes have been created
        k8s_nodes = get_k8s_nodes()

        # now scale up
        scale_asg(asg_name, asg_old_desired_capacity, asg_new_desired_capacity, asg_new_max_size)
        logger.info('Waiting for {} seconds for asg {} to scale before validating cluster health...'.format(
            app_config['CLUSTER_HEALTH_WAIT'], asg_name))
        time.sleep(app_config['CLUSTER_HEALTH_WAIT'])

        # check how many instances are running
        asg_instance_count = count_all_cluster_instances(cluster_name)

        # check cluster health before doing anything
        if validate_cluster_health(asg_name, asg_new_desired_capacity, asg_instance_count):
            # pause aws autoscaling so new instances don't try
            # to spawn while instances are being terminated
            modify_aws_autoscaling(asg_name, "suspend")
            # start draining and terminating
            for outdated in outdated_instances:
                # catch any failures so we can resume aws autoscaling
                try:
                    # get the k8s node name instead of the instance id
                    node_name = get_node_by_instance_id(k8s_nodes, outdated['InstanceId'])
                    drain_node(node_name)
                    delete_node(node_name)
                    terminate_instance(outdated['InstanceId'])
                    if not instance_terminated(outdated['InstanceId']):
                        raise Exception('Instance is failing to terminate. Cancelling out.')
                    detach_instance(outdated['InstanceId'], asg_name)
                    if not instance_detached(outdated['InstanceId']):
                        raise Exception('Instance is failing to detach from ASG. Cancelling out.')
                except Exception as e:
                    logger.info(e)
                    raise RollingUpdateException("Rolling update on asg failed", asg_name)

            # resume aws autoscaling
            modify_aws_autoscaling(asg_name, "resume")
            # remove the aws tags used to track state across runs
            delete_asg_tags(asg_name, app_config["ASG_DESIRED_STATE_TAG"])
            delete_asg_tags(asg_name, app_config["ASG_ORIG_CAPACITY_TAG"])
            logger.info('*** Rolling update of asg {} is complete! ***'.format(asg_name))
        else:
            logger.info('Exiting since asg healthcheck failed')
            raise Exception('Asg healthcheck failed')

    logger.info('All asgs processed')
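# --- Illustrative sketches, not the project's implementation ---
# The tag helpers used above (save_asg_tags, get_asg_tag, delete_asg_tags) map
# naturally onto standard boto3 Auto Scaling calls. The versions below are
# minimal sketches with inferred signatures; the real helpers may add logging
# and retries.
import boto3

autoscaling_client = boto3.client('autoscaling')

def save_asg_tags_sketch(asg_name, key, value):
    # create or overwrite a single, non-propagating tag on the ASG
    autoscaling_client.create_or_update_tags(Tags=[{
        'ResourceId': asg_name,
        'ResourceType': 'auto-scaling-group',
        'Key': key,
        'Value': str(value),
        'PropagateAtLaunch': False,
    }])

def delete_asg_tags_sketch(asg_name, key):
    # remove the saved-state tag so a later run does not reuse stale values
    autoscaling_client.delete_tags(Tags=[{
        'ResourceId': asg_name,
        'ResourceType': 'auto-scaling-group',
        'Key': key,
    }])

def get_asg_tag_sketch(tags, key):
    # return the matching tag dict from an ASG's Tags list, or {} if absent
    return next((tag for tag in tags if tag.get('Key') == key), {})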
def test_instance_terminated(self):
    with patch('lib.aws.ec2_client.describe_instances') as describe_instances_mock:
        describe_instances_mock.return_value = self.aws_response_mock_terminated
        self.assertTrue(instance_terminated(self.instance_id, 2, 1))
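# --- Illustrative fixture sketch (an assumption about the test setup) ---
# self.aws_response_mock_terminated presumably mirrors the shape of a genuine
# describe_instances response whose only instance has already terminated.
# A standalone example of that shape, with a made-up instance id:
EXAMPLE_TERMINATED_RESPONSE = {
    'Reservations': [{
        'Instances': [{
            'InstanceId': 'i-0123456789abcdef0',
            'State': {'Code': 48, 'Name': 'terminated'},
        }],
    }],
    'ResponseMetadata': {'HTTPStatusCode': 200},
}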