def set_spot_fleet_request_capacity(sfr_id, capacity, dry_run, region=None):
    """AWS won't modify a request that is already modifying. This function
    ensures we wait a few seconds in case we've just modified a SFR"""
    ec2_client = boto3.client('ec2', region_name=region)
    with Timeout(seconds=AWS_SPOT_MODIFY_TIMEOUT):
        try:
            state = None
            while True:
                state = get_sfr(sfr_id, region=region)['SpotFleetRequestState']
                if state == 'active':
                    break
                if state == 'cancelled_running':
                    log.info("Not updating target capacity because this is a cancelled SFR, "
                             "we are just draining and killing the instances")
                    return
                log.debug("SFR {0} in state {1}, waiting for state: active".format(sfr_id, state))
                log.debug("Sleep 5 seconds")
                time.sleep(5)
        except TimeoutError:
            log.error("Spot fleet {0} not in active state so we can't modify it.".format(sfr_id))
            raise FailSetSpotCapacity
    if dry_run:
        return True
    try:
        ret = ec2_client.modify_spot_fleet_request(SpotFleetRequestId=sfr_id,
                                                   TargetCapacity=capacity,
                                                   ExcessCapacityTerminationPolicy='noTermination')
    except ClientError as e:
        log.error("Error modifying spot fleet request: {0}".format(e))
        raise FailSetSpotCapacity
    return ret

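# The Timeout context manager used above (and throughout this section) is assumed to raise
# TimeoutError once its time budget is exhausted. A minimal sketch of a signal-based
# implementation with that behaviour -- illustrative only, not necessarily the exact helper
# this code imports:
import signal


class Timeout(object):
    def __init__(self, seconds=1, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        # Arm an alarm that fires after `seconds`, interrupting the wrapped block.
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, type, value, traceback):
        # Cancel the pending alarm so it can't fire after the block exits.
        signal.alarm(0)
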
def perform_http_healthcheck(url, timeout):
    """Returns whether a healthcheck on a url succeeds, and why

    :param url: the healthcheck url
    :param timeout: timeout in seconds
    :returns: a tuple of (status, reason), where status is True if the healthcheck succeeds
              within the number of seconds specified by timeout, False otherwise
    """
    try:
        with Timeout(seconds=timeout):
            try:
                res = requests.get(url)
            except requests.ConnectionError:
                return (False, "http request failed: connection failed")
    except TimeoutError:
        return (False, "http request timed out after %d seconds" % timeout)

    if 'content-type' in res.headers and ',' in res.headers['content-type']:
        paasta_print(PaastaColors.yellow(
            "Multiple content-type headers detected in response."
            " The Mesos healthcheck system will treat this as a failure!"))
        return (False, "http request succeeded, code %d" % res.status_code)
    # check if response code is valid per https://mesosphere.github.io/marathon/docs/health-checks.html
    elif res.status_code >= 200 and res.status_code < 400:
        return (True, "http request succeeded, code %d" % res.status_code)
    else:
        return (False, "http request failed, code %s" % str(res.status_code))

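# A hypothetical usage of the (status, reason) tuple returned above; the URL and port are
# made up for illustration:
status, reason = perform_http_healthcheck('http://localhost:8888/status', timeout=10)
if not status:
    paasta_print("Healthcheck failed: %s" % reason)
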
def perform_http_healthcheck(url, timeout):
    """Returns true if healthcheck on url succeeds, false otherwise

    :param url: the healthcheck url
    :param timeout: timeout in seconds
    :returns: True if healthcheck succeeds within number of seconds specified by timeout, False otherwise
    """
    try:
        with Timeout(seconds=timeout):
            try:
                res = requests.head(url)
            except requests.ConnectionError:
                return False
    except TimeoutError:
        return False

    if 'content-type' in res.headers and ',' in res.headers['content-type']:
        sys.stdout.write(PaastaColors.yellow(
            "Multiple content-type headers detected in response."
            " The Mesos healthcheck system will treat this as a failure!"))
        return False
    # check if response code is valid per https://mesosphere.github.io/marathon/docs/health-checks.html
    elif res.status_code >= 200 and res.status_code < 400:
        return True
    else:
        # Anything outside the 2xx/3xx range is a failure, per the docstring.
        return False

def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    cluster_map = get_cluster_instance_map_for_service(soa_dir, service, deploy_group)
    if not cluster_map:
        line = "Couldn't find any instances for service {0} in deploy group {1}".format(
            service, deploy_group)
        _log(service=service, component='deploy', line=line, level='event')
        raise NoInstancesFound
    for cluster in cluster_map.values():
        cluster['deployed'] = 0
    try:
        with Timeout(seconds=timeout):
            total_instances = sum([len(v["instances"]) for v in cluster_map.values()])
            with progressbar.ProgressBar(maxval=total_instances) as bar:
                while True:
                    for cluster, instances in cluster_map.items():
                        if cluster_map[cluster]['deployed'] != len(cluster_map[cluster]['instances']):
                            cluster_map[cluster]['deployed'] = instances_deployed(
                                cluster=cluster,
                                service=service,
                                instances=instances['instances'],
                                git_sha=git_sha)
                            if cluster_map[cluster]['deployed'] == len(cluster_map[cluster]['instances']):
                                instance_csv = ", ".join(cluster_map[cluster]['instances'])
                                print("Deploy to %s complete! (instances: %s)" % (cluster, instance_csv))
                    bar.update(sum([v["deployed"] for v in cluster_map.values()]))
                    if all([cluster['deployed'] == len(cluster["instances"])
                            for cluster in cluster_map.values()]):
                        break
                    else:
                        time.sleep(10)
    except TimeoutError:
        human_status = ["{0}: {1}".format(cluster, data['deployed'])
                        for cluster, data in cluster_map.items()]
        line = "\nCurrent deployment status of {0} per cluster:\n".format(deploy_group) + "\n".join(human_status)
        _log(service=service, component='deploy', line=line, level='event')
        line = "\n\nTimed out after {0} seconds, waiting for {1} in {2} to be deployed by PaaSTA. \n\n"\
               "This probably means the deploy hasn't succeeded. The new service might not be healthy or one "\
               "or more clusters could be having issues.\n\n"\
               "To debug: try running 'paasta status -s {2} -vv' or 'paasta logs -s {2}' to determine the cause.\n\n"\
               "{3} is still *marked* for deployment. To rollback, you can run: 'paasta rollback --service "\
               "{2} --deploy-group {1}'\n\n"\
               "If the service is known to be slow to start you may wish to increase "\
               "the timeout on this step.".format(timeout, deploy_group, service, git_sha)
        _log(service=service, component='deploy', line=line, level='event')
        raise
    return True

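# A hypothetical example of the cluster_map shape the loop above expects (cluster and
# instance names are made up; the real map comes from get_cluster_instance_map_for_service):
example_cluster_map = {
    'norcal-devc': {'instances': ['main', 'canary'], 'deployed': 0},
    'norcal-prod': {'instances': ['main'], 'deployed': 0},
}
# The polling loop exits once every cluster's 'deployed' count equals the length of its
# 'instances' list.
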
def wait_and_terminate(self, slave, drain_timeout, dry_run, region=None):
    """Waits for slave to be drained and then terminate

    :param slave: slave object to kill
    :param drain_timeout: how long to wait before terminating even if not drained
    :param region: region to connect to ec2
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    ec2_client = boto3.client('ec2', region_name=region)
    try:
        # This loop should always finish because the maintenance window should trigger is_ready_to_kill
        # being true. Just in case though we set a timeout and terminate anyway
        with Timeout(seconds=drain_timeout + 300):
            while True:
                instance_id = slave.instance_id
                if not instance_id:
                    self.log.warning(
                        "Didn't find instance ID for slave: {}. Skipping terminating".format(slave.pid),
                    )
                    continue
                # Check if no tasks are running or we have reached the maintenance window
                if is_safe_to_kill(slave.hostname) or dry_run:
                    self.log.info(
                        "TERMINATING: {} (Hostname = {}, IP = {})".format(
                            instance_id,
                            slave.hostname,
                            slave.ip,
                        ))
                    try:
                        ec2_client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
                    except ClientError as e:
                        if e.response['Error'].get('Code') == 'DryRunOperation':
                            pass
                        else:
                            raise
                    break
                else:
                    self.log.info("Instance {}: NOT ready to kill".format(instance_id))
                    self.log.debug("Waiting 5 seconds and then checking again")
                    time.sleep(5)
    except TimeoutError:
        self.log.error(
            "Timed out after {} waiting to drain {}, now terminating anyway".format(
                drain_timeout,
                slave.pid,
            ))
        try:
            # terminate_instances expects a list of instance IDs
            ec2_client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
        except ClientError as e:
            if e.response['Error'].get('Code') == 'DryRunOperation':
                pass
            else:
                raise

def wait_and_terminate(slave, drain_timeout, dry_run, region=None):
    """Currently kills a slave, will wait for draining to complete soon

    :param slave: dict of slave to kill
    :param drain_timeout: how long to wait before terminating even if not drained
    :param region: region to connect to ec2
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    ec2_client = boto3.client('ec2', region_name=region)
    try:
        # This loop should always finish because the maintenance window should trigger is_ready_to_kill
        # being true. Just in case though we set a timeout and terminate anyway
        with Timeout(seconds=drain_timeout + 300):
            while True:
                instance_id = slave['instance_id']
                if not instance_id:
                    log.warning("Didn't find instance ID for slave: {0}. Skipping terminating".format(slave['pid']))
                    continue
                # Check if no tasks are running or we have reached the maintenance window
                if is_safe_to_kill(slave['hostname']) or dry_run:
                    log.info("TERMINATING: {0} (Hostname = {1}, IP = {2})".format(
                        instance_id,
                        slave['hostname'],
                        slave['ip'],
                    ))
                    try:
                        ec2_client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
                    except ClientError as e:
                        if e.response['Error'].get('Code') == 'DryRunOperation':
                            pass
                        else:
                            raise
                    break
                else:
                    log.info("Instance {0}: NOT ready to kill".format(instance_id))
                    log.debug("Waiting 5 seconds and then checking again")
                    time.sleep(5)
    except TimeoutError:
        log.error("Timed out after {0} waiting to drain {1}, now terminating anyway".format(drain_timeout, slave['pid']))
        try:
            # terminate_instances expects a list of instance IDs
            ec2_client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
        except ClientError as e:
            if e.response['Error'].get('Code') == 'DryRunOperation':
                pass
            else:
                raise

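# Both versions of wait_and_terminate rely on the EC2 DryRun convention: with DryRun=True,
# a permitted call raises a ClientError (botocore.exceptions.ClientError, as in the
# functions above) whose error code is 'DryRunOperation' instead of actually terminating
# anything. A minimal standalone illustration of that pattern; the helper name is made up:
def dry_run_terminate(ec2_client, instance_id):
    try:
        ec2_client.terminate_instances(InstanceIds=[instance_id], DryRun=True)
    except ClientError as e:
        if e.response['Error'].get('Code') == 'DryRunOperation':
            # The request would have succeeded; nothing was terminated.
            return True
        raise
    return False
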
def set_capacity(self, capacity):
    """AWS won't modify a request that is already modifying. This function
    ensures we wait a few seconds in case we've just modified a SFR"""
    rounded_capacity = int(floor(capacity))
    ec2_client = boto3.client('ec2', region_name=self.resource['region'])
    with Timeout(seconds=AWS_SPOT_MODIFY_TIMEOUT):
        try:
            state = None
            while True:
                state = self.get_sfr(self.resource['id'],
                                     region=self.resource['region'])['SpotFleetRequestState']
                if state == 'active':
                    break
                if state == 'cancelled_running':
                    self.log.info(
                        "Not updating target capacity because this is a cancelled SFR, "
                        "we are just draining and killing the instances",
                    )
                    return
                self.log.debug("SFR {} in state {}, waiting for state: active".format(
                    self.resource['id'], state))
                self.log.debug("Sleep 5 seconds")
                time.sleep(5)
        except TimeoutError:
            self.log.error("Spot fleet {} not in active state so we can't modify it.".format(
                self.resource['id']))
            raise FailSetResourceCapacity
    if self.dry_run:
        return True
    try:
        ret = ec2_client.modify_spot_fleet_request(
            SpotFleetRequestId=self.resource['id'],
            TargetCapacity=rounded_capacity,
            ExcessCapacityTerminationPolicy='noTermination',
        )
    except ClientError as e:
        self.log.error("Error modifying spot fleet request: {}".format(e))
        raise FailSetResourceCapacity
    self.capacity = capacity
    return ret

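# TargetCapacity is an integer field in the EC2 API, so set_capacity floors the requested
# value for the modify call while keeping the unrounded number on the object. A tiny
# illustration of that split (the values are made up):
from math import floor

requested_capacity = 4.7
assert int(floor(requested_capacity)) == 4  # what gets sent as TargetCapacity
# self.capacity is stored as 4.7, preserving the fractional request
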