def test_wait_for_healthy_elbs(self):
    """
    Verify that wait_for_healthy_elbs returns None once every monitored ELB
    reports all of its instances as InService.
    """
    lb_name_one = "healthy-lb-1"
    lb_name_two = "healthy-lb-2"
    lb_one = create_elb(lb_name_one)
    lb_two = create_elb(lb_name_two)
    patched_method = "boto.ec2.elb.loadbalancer.LoadBalancer.get_instance_health"

    lb_one_instances = lb_one.get_instance_health()
    lb_two_instances = lb_two.get_instance_health()

    # Simulate instances gradually coming online in the load balancers:
    # iteration 1 - both ELBs unhealthy; iteration 2 - the first ELB goes
    # InService and is dropped from the poll list; iteration 3 - the second
    # ELB goes InService as well, so the wait completes within 3 attempts.
    health_sequence = [
        clone_elb_instances_with_state(lb_one_instances, "OutOfService"),
        clone_elb_instances_with_state(lb_two_instances, "OutOfService"),
        clone_elb_instances_with_state(lb_one_instances, "InService"),
        clone_elb_instances_with_state(lb_two_instances, "OutOfService"),
        clone_elb_instances_with_state(lb_two_instances, "InService"),
    ]

    with mock.patch(patched_method, side_effect=health_sequence):
        with mock.patch('tubular.ec2.WAIT_SLEEP_TIME', 1):
            self.assertEqual(None, ec2.wait_for_healthy_elbs([lb_name_one, lb_name_two], 3))
def test_wait_for_healthy_elbs(self):
    """
    Ensure wait_for_healthy_elbs completes successfully (returns None)
    when both monitored ELBs eventually become healthy.
    """
    name_a = "healthy-lb-1"
    name_b = "healthy-lb-2"
    elb_a = create_elb(name_a)
    elb_b = create_elb(name_b)
    target = "boto.ec2.elb.loadbalancer.LoadBalancer.get_instance_health"

    instances_a = elb_a.get_instance_health()
    instances_b = elb_b.get_instance_health()

    # Scripted health-check responses, one list per get_instance_health call.
    # Poll 1: neither ELB is healthy. Poll 2: ELB "a" becomes healthy and is
    # removed from the watch list while ELB "b" is still unhealthy. Poll 3:
    # ELB "b" becomes healthy, so the wait succeeds on the third iteration.
    side_effects = []
    side_effects.append(clone_elb_instances_with_state(instances_a, "OutOfService"))
    side_effects.append(clone_elb_instances_with_state(instances_b, "OutOfService"))
    side_effects.append(clone_elb_instances_with_state(instances_a, "InService"))
    side_effects.append(clone_elb_instances_with_state(instances_b, "OutOfService"))
    side_effects.append(clone_elb_instances_with_state(instances_b, "InService"))

    with mock.patch(target, side_effect=side_effects):
        with mock.patch('tubular.ec2.WAIT_SLEEP_TIME', 1):
            self.assertEqual(
                None,
                ec2.wait_for_healthy_elbs([name_a, name_b], 3)
            )
def _red_black_deploy(
        new_cluster_asgs, baseline_cluster_asgs,
        secs_before_old_asgs_disabled=DISABLE_OLD_ASG_WAIT_TIME
):
    """
    Takes two dicts of autoscale groups, new and baseline. Each dict key is a cluster name.
    Each dict value is a list of ASGs for that cluster.
    Enables the new ASGs, then disables the old ASGs.

    Red/black deploy refers to:
        - Existing ASG is "red", meaning active.
        - New ASG begins as "black", meaning inactive.
        - The new ASG is added to the ELB, making it "red".
        - The baseline and new ASGs are now existing as "red/red".
        - The baseline ASG is removed from the ELB.
        - As traffic has ceased to be directed to the baseline ASG, it becomes "black".

    Workflow:
        - enable new ASGs
        - wait for instances to be healthy in the load balancer
        - ensure the new ASGs are not pending delete or disabled
        - tag and disable current asgs

    Args:
        new_cluster_asgs (dict): Lists of new ASGs to be added to the ELB, keyed by cluster.
        baseline_cluster_asgs (dict): Lists of existing ASGs already added to the ELB, keyed by cluster.
        secs_before_old_asgs_disabled (int): Seconds to sleep between the new ASGs passing their
            health checks and the old ASGs being disabled.

    Returns:
        success (bool): True if red/black operation succeeded, else False.
        asgs_enabled (dict): Lists of ASGs that are added to the ELB, keyed by cluster.
        asgs_disabled (dict): Lists of ASGs that are removed from the ELB, keyed by cluster.
    """
    # Start from the baseline state: old ASGs enabled, new ASGs disabled.
    # These two dicts are kept in sync by the helpers below and are returned
    # to the caller on both success and failure.
    asgs_enabled = copy.deepcopy(baseline_cluster_asgs)
    asgs_disabled = copy.deepcopy(new_cluster_asgs)

    def _enable_cluster_asg(cluster, asg):
        """ Enables the ASG, then shifts it from disabled to enabled in the local bookkeeping. """
        enable_asg(asg)
        _move_asg_from_disabled_to_enabled(cluster, asg)

    def _disable_cluster_asg(cluster, asg):
        """ Disables the ASG, then shifts it from enabled to disabled in the local bookkeeping. """
        disable_asg(asg)
        _move_asg_from_enabled_to_disabled(cluster, asg)

    def _move_asg_from_disabled_to_enabled(cluster, asg):
        """ Shifts ASG from disabled to enabled. """
        asgs_enabled[cluster].append(asg)
        asgs_disabled[cluster].remove(asg)

    def _move_asg_from_enabled_to_disabled(cluster, asg):
        """ Shifts ASG from enabled to disabled. """
        asgs_enabled[cluster].remove(asg)
        asgs_disabled[cluster].append(asg)

    def _disable_clustered_asgs(clustered_asgs, failure_msg):
        """ Disable all the ASGs in the lists, keyed by cluster. Failures are logged, not raised. """
        for cluster, asgs in six.iteritems(clustered_asgs):
            for asg in asgs:
                try:
                    _disable_cluster_asg(cluster, asg)
                except:  # pylint: disable=bare-except
                    LOG.warning(failure_msg, asg, exc_info=True)

    # Enable all new ASGs. If any single enable fails, roll back everything
    # enabled so far and abort the deploy.
    elbs_to_monitor = []
    newly_enabled_asgs = defaultdict(list)
    for cluster, asgs in six.iteritems(new_cluster_asgs):
        for asg in asgs:
            try:
                _enable_cluster_asg(cluster, asg)
                elbs_to_monitor.extend(elbs_for_asg(asg))
                newly_enabled_asgs[cluster].append(asg)
            except:  # pylint: disable=bare-except
                LOG.error("Error enabling ASG '%s'. Disabling traffic to all new ASGs.", asg, exc_info=True)
                # Disable the ASG which failed first.
                # NOTE(review): if enable_asg() itself raised, this ASG was never moved
                # into asgs_enabled, so _disable_cluster_asg() may raise ValueError from
                # list.remove() and bypass the return below — confirm intended behavior.
                _disable_cluster_asg(cluster, asg)
                # Then disable any other ASGs that have been newly enabled.
                _disable_clustered_asgs(
                    newly_enabled_asgs,
                    "Unable to disable ASG '%s' after failure."
                )
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs {} are active and will be available after passing the healthchecks.".format(
        dict(newly_enabled_asgs)
    ))

    # Wait for all instances to be in service in all ELBs.
    try:
        ec2.wait_for_healthy_elbs(elbs_to_monitor, 600)
    except:  # pylint: disable=bare-except
        LOG.info("Some ASGs are failing ELB health checks. Disabling traffic to all new ASGs.", exc_info=True)
        _disable_clustered_asgs(
            newly_enabled_asgs,
            "Unable to disable ASG '%s' after waiting for healthy ELBs."
        )
        return (False, asgs_enabled, asgs_disabled)

    # Sleep here to see how the new ASGs react to live traffic.
    # A flawed release would likely make the new ASGs fail the health checks below
    # and, if any new ASGs fail the health checks, the old ASGs would *not* be disabled.
    time.sleep(secs_before_old_asgs_disabled)

    # Ensure the new ASGs are still healthy and not pending delete before disabling the old ASGs.
    for cluster, asgs in six.iteritems(newly_enabled_asgs):
        for asg in asgs:
            err_msg = None
            if is_asg_pending_delete(asg):
                err_msg = "New ASG '{}' is pending delete.".format(asg)
            elif not is_asg_enabled(asg):
                err_msg = "New ASG '{}' is not enabled.".format(asg)
            if err_msg:
                LOG.error("{} Aborting disabling of old ASGs.".format(err_msg))
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs have passed the healthchecks. Now disabling old ASGs.")
    for cluster, asgs in six.iteritems(baseline_cluster_asgs):
        for asg in asgs:
            try:
                if is_asg_enabled(asg):
                    try:
                        _disable_cluster_asg(cluster, asg)
                    except:  # pylint: disable=bare-except
                        LOG.warning("Unable to disable ASG '%s' after enabling new ASGs.", asg, exc_info=True)
                elif asg in asgs_enabled[cluster]:
                    # If the asg is not enabled, but we have it in the enabled list remove it. This may occur by
                    # pulling from 2 different sources of truth at different intervals. The asg could have been
                    # disabled in the intervening time.
                    _move_asg_from_enabled_to_disabled(cluster, asg)
            except ASGDoesNotExistException:
                # This operation should not fail if one of the baseline ASGs was removed during the deployment process
                LOG.info("ASG {asg} in cluster {cluster} no longer exists, removing it from the enabled cluster list"
                         .format(asg=asg, cluster=cluster))
                _move_asg_from_enabled_to_disabled(cluster, asg)
            # Tag the old ASG for eventual deletion, even if disabling it failed above.
            try:
                ec2.tag_asg_for_deletion(asg)
            except ASGDoesNotExistException:
                LOG.info("Unable to tag ASG '{}' as it no longer exists, skipping.".format(asg))

    return (True, asgs_enabled, asgs_disabled)
def _red_black_deploy(
        new_cluster_asgs, baseline_cluster_asgs,
        secs_before_old_asgs_disabled=DISABLE_OLD_ASG_WAIT_TIME
):
    """
    Takes two dicts of autoscale groups, new and baseline. Each dict key is a cluster name.
    Each dict value is a list of ASGs for that cluster.
    Enables the new ASGs, then disables the old ASGs.

    Red/black deploy refers to:
        - Existing ASG is "red", meaning active.
        - New ASG begins as "black", meaning inactive.
        - The new ASG is added to the ELB, making it "red".
        - The baseline and new ASGs are now existing as "red/red".
        - The baseline ASG is removed from the ELB.
        - As traffic has ceased to be directed to the baseline ASG, it becomes "black".

    Workflow:
        - enable new ASGs
        - wait for instances to be healthy in the load balancer
        - ensure the new ASGs are not pending delete or disabled
        - tag and disable current asgs

    Args:
        new_cluster_asgs (dict): Lists of new ASGs to be added to the ELB, keyed by cluster.
        baseline_cluster_asgs (dict): Lists of existing ASGs already added to the ELB, keyed by cluster.
        secs_before_old_asgs_disabled (int): Seconds to sleep between the new ASGs passing their
            health checks and the old ASGs being disabled.

    Returns:
        success (bool): True if red/black operation succeeded, else False.
        asgs_enabled (dict): Lists of ASGs that are added to the ELB, keyed by cluster.
        asgs_disabled (dict): Lists of ASGs that are removed from the ELB, keyed by cluster.
    """
    # Start from the baseline state: old ASGs enabled, new ASGs disabled.
    # These two dicts are kept in sync by the helpers below and are returned
    # to the caller on both success and failure.
    asgs_enabled = copy.deepcopy(baseline_cluster_asgs)
    asgs_disabled = copy.deepcopy(new_cluster_asgs)

    def _enable_cluster_asg(cluster, asg):
        """ Enables the ASG, then shifts it from disabled to enabled in the local bookkeeping. """
        enable_asg(asg)
        _move_asg_from_disabled_to_enabled(cluster, asg)

    def _disable_cluster_asg(cluster, asg):
        """ Disables the ASG, then shifts it from enabled to disabled in the local bookkeeping. """
        disable_asg(asg)
        _move_asg_from_enabled_to_disabled(cluster, asg)

    def _move_asg_from_disabled_to_enabled(cluster, asg):
        """ Shifts ASG from disabled to enabled. """
        asgs_enabled[cluster].append(asg)
        asgs_disabled[cluster].remove(asg)

    def _move_asg_from_enabled_to_disabled(cluster, asg):
        """ Shifts ASG from enabled to disabled. """
        asgs_enabled[cluster].remove(asg)
        asgs_disabled[cluster].append(asg)

    def _disable_clustered_asgs(clustered_asgs, failure_msg):
        """ Disable all the ASGs in the lists, keyed by cluster. Failures are logged, not raised. """
        # .items() instead of the Python 2-only .iteritems() keeps this
        # compatible with both Python 2 and Python 3.
        for cluster, asgs in clustered_asgs.items():
            for asg in asgs:
                try:
                    _disable_cluster_asg(cluster, asg)
                except:  # pylint: disable=bare-except
                    LOG.warning(failure_msg.format(asg))

    # Enable all new ASGs. If any single enable fails, roll back everything
    # enabled so far and abort the deploy.
    elbs_to_monitor = []
    newly_enabled_asgs = defaultdict(list)
    for cluster, asgs in new_cluster_asgs.items():
        for asg in asgs:
            try:
                _enable_cluster_asg(cluster, asg)
                elbs_to_monitor.extend(elbs_for_asg(asg))
                newly_enabled_asgs[cluster].append(asg)
            except:  # pylint: disable=bare-except
                LOG.error("Error enabling ASG '{}'. Disabling traffic to all new ASGs.".format(asg))
                LOG.error(traceback.format_exc())
                # Disable the ASG which failed first. If enable_asg() itself raised,
                # this ASG was never moved into asgs_enabled, so the bookkeeping
                # remove() would raise ValueError — guard the rollback so the
                # failure tuple below is always returned.
                try:
                    _disable_cluster_asg(cluster, asg)
                except:  # pylint: disable=bare-except
                    LOG.warning("Unable to disable ASG '{}' after failure.".format(asg))
                # Then disable any other ASGs that have been newly enabled.
                _disable_clustered_asgs(
                    newly_enabled_asgs,
                    "Unable to disable ASG '{}' after failure."
                )
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs {} are active and will be available after passing the healthchecks.".format(
        dict(newly_enabled_asgs)
    ))

    # Wait for all instances to be in service in all ELBs.
    try:
        ec2.wait_for_healthy_elbs(elbs_to_monitor, 600)
    except:  # pylint: disable=bare-except
        LOG.info("Some ASGs are failing ELB health checks. Disabling traffic to all new ASGs.")
        _disable_clustered_asgs(
            newly_enabled_asgs,
            "Unable to disable ASG '{}' after waiting for healthy ELBs."
        )
        return (False, asgs_enabled, asgs_disabled)

    # Sleep here to see how the new ASGs react to live traffic.
    # A flawed release would likely make the new ASGs fail the health checks below
    # and, if any new ASGs fail the health checks, the old ASGs would *not* be disabled.
    time.sleep(secs_before_old_asgs_disabled)

    # Ensure the new ASGs are still healthy and not pending delete before disabling the old ASGs.
    for cluster, asgs in newly_enabled_asgs.items():
        for asg in asgs:
            err_msg = None
            if is_asg_pending_delete(asg):
                err_msg = "New ASG '{}' is pending delete.".format(asg)
            elif not is_asg_enabled(asg):
                err_msg = "New ASG '{}' is not enabled.".format(asg)
            if err_msg:
                LOG.error("{} Aborting disabling of old ASGs.".format(err_msg))
                return (False, asgs_enabled, asgs_disabled)

    LOG.info("New ASGs have passed the healthchecks. Now disabling old ASGs.")
    for cluster, asgs in baseline_cluster_asgs.items():
        for asg in asgs:
            try:
                if is_asg_enabled(asg):
                    try:
                        _disable_cluster_asg(cluster, asg)
                    except:  # pylint: disable=bare-except
                        LOG.warning("Unable to disable ASG '{}' after enabling new ASGs.".format(asg))
                elif asg in asgs_enabled[cluster]:
                    # If the asg is not enabled, but we have it in the enabled list remove it. This may occur by
                    # pulling from 2 different sources of truth at different intervals. The asg could have been
                    # disabled in the intervening time.
                    _move_asg_from_enabled_to_disabled(cluster, asg)
            except ASGDoesNotExistException:
                # This operation should not fail if one of the baseline ASGs was removed during the deployment process
                LOG.info("ASG {asg} in cluster {cluster} no longer exists, removing it from the enabled cluster list"
                         .format(asg=asg, cluster=cluster))
                _move_asg_from_enabled_to_disabled(cluster, asg)
            # Tag the old ASG for eventual deletion, even if disabling it failed above.
            try:
                ec2.tag_asg_for_deletion(asg)
            except ASGDoesNotExistException:
                LOG.info("Unable to tag ASG '{}' as it no longer exists, skipping.".format(asg))

    return (True, asgs_enabled, asgs_disabled)