def filter_sfr_slaves(slaves_list, resource):
    """Filter mesos slaves down to the ones backing this resource's spot fleet
    request, annotated with their EC2 instance type and weight.

    :param slaves_list: list of slave dicts; each carries a 'task_counts'
        object whose ``.slave`` mapping holds 'pid', 'hostname' and 'id'
    :param resource: resource dict with 'sfr' and 'region' keys
    :returns: list of dicts with keys ip / task_counts / hostname / id / pid /
        instance_id / instance_type / instance_weight, one per SFR slave that
        could be matched to exactly one EC2 instance
    """
    sfr = resource['sfr']
    sfr_ips = get_sfr_instance_ips(sfr, region=resource['region'])
    log.debug("IPs in SFR: {0}".format(sfr_ips))
    sfr_slaves = [slave for slave in slaves_list
                  if slave_pid_to_ip(slave['task_counts'].slave['pid']) in sfr_ips]
    sfr_slave_ips = [slave_pid_to_ip(slave['task_counts'].slave['pid'])
                     for slave in sfr_slaves]
    sfr_instance_descriptions = describe_instances(
        [], region=resource['region'],
        instance_filters=[{'Name': 'private-ip-address', 'Values': sfr_slave_ips}])
    sfr_slave_instances = []
    for slave in sfr_slaves:
        ip = slave_pid_to_ip(slave['task_counts'].slave['pid'])
        instances = get_instances_from_ip(ip, sfr_instance_descriptions)
        if not instances:
            log.warning("Couldn't find instance for ip {0}".format(ip))
            continue
        if len(instances) > 1:
            # BUG FIX: the message carried a {0} placeholder but .format(ip)
            # was never called, so the raw template string was logged.
            log.error("Found more than one instance with the same private IP {0}. "
                      "This should never happen".format(ip))
            continue
        sfr_slave_instances.append({
            'ip': ip,
            'task_counts': slave['task_counts'],
            'hostname': slave['task_counts'].slave['hostname'],
            'id': slave['task_counts'].slave['id'],
            'pid': slave['task_counts'].slave['pid'],
            'instance_id': instances[0]['InstanceId'],
        })
    ret = []
    instance_type_weights = get_instance_type_weights(sfr)
    for slave in sfr_slave_instances:
        # Safe to index [0]: every instance_id above came from these
        # descriptions via get_instances_from_ip.
        instance_description = [
            instance_description for instance_description in sfr_instance_descriptions
            if instance_description['InstanceId'] == slave['instance_id']
        ][0]
        slave['instance_type'] = instance_description['InstanceType']
        slave['instance_weight'] = instance_type_weights[slave['instance_type']]
        ret.append(slave)
    return ret
def filter_sfr_slaves(sorted_slaves, sfr):
    """Filter an ordered list of mesos slaves down to those in the SFR,
    annotated with their EC2 instance id, type and weight (order preserved).

    :param sorted_slaves: list of slave dicts (each with a 'pid' key),
        ordered by kill preference
    :param sfr: spot fleet request dict
    :returns: list of dicts with keys ip / pid / instance_id / instance_type /
        instance_weight, one per SFR slave matched to exactly one instance
    """
    sfr_ips = get_sfr_instance_ips(sfr)
    log.debug("IPs in SFR: {0}".format(sfr_ips))
    sfr_sorted_slaves = [slave for slave in sorted_slaves
                         if slave_pid_to_ip(slave['pid']) in sfr_ips]
    sfr_sorted_slave_ips = [slave_pid_to_ip(slave['pid'])
                            for slave in sfr_sorted_slaves]
    sfr_instance_descriptions = describe_instances(
        [], instance_filters=[{'Name': 'private-ip-address',
                               'Values': sfr_sorted_slave_ips}])
    sfr_sorted_slave_instances = []
    for slave in sfr_sorted_slaves:
        ip = slave_pid_to_ip(slave['pid'])
        instances = get_instances_from_ip(ip, sfr_instance_descriptions)
        if not instances:
            log.warning("Couldn't find instance for ip {0}".format(ip))
            continue
        if len(instances) > 1:
            # BUG FIX: the {0} placeholder was never filled in — .format(ip)
            # was missing, so the raw template string was logged.
            log.error("Found more than one instance with the same private IP {0}. "
                      "This should never happen".format(ip))
            continue
        sfr_sorted_slave_instances.append({
            'ip': ip,
            'pid': slave['pid'],
            'instance_id': instances[0]['InstanceId'],
        })
    ret = []
    instance_type_weights = get_instance_type_weights(sfr)
    for slave in sfr_sorted_slave_instances:
        # Safe to index [0]: the instance_id came from these descriptions.
        instance_description = [
            instance_description for instance_description in sfr_instance_descriptions
            if instance_description['InstanceId'] == slave['instance_id']
        ][0]
        slave['instance_type'] = instance_description['InstanceType']
        slave['instance_weight'] = instance_type_weights[slave['instance_type']]
        ret.append(slave)
    return ret
def filter_aws_slaves(self, slaves_list):
    """Wrap every mesos slave that runs on one of this resource's AWS
    instances in a PaastaAwsSlave, discarding slaves outside the resource.

    :param slaves_list: list of slave dicts with a 'task_counts' attribute
    :returns: list of PaastaAwsSlave objects
    """
    resource_ips = self.get_instance_ips(self.instances, region=self.resource['region'])
    self.log.debug("IPs in AWS resources: {0}".format(resource_ips))
    # Collect matching slaves and their IPs in one pass.
    matching_slaves = []
    matching_ips = []
    for candidate in slaves_list:
        candidate_ip = slave_pid_to_ip(candidate['task_counts'].slave['pid'])
        if candidate_ip in resource_ips:
            matching_slaves.append(candidate)
            matching_ips.append(candidate_ip)
    descriptions = self.describe_instances(
        [],
        region=self.resource['region'],
        instance_filters=[{
            'Name': 'private-ip-address',
            'Values': matching_ips,
        }],
    )
    weights = self.get_instance_type_weights()
    return [
        PaastaAwsSlave(candidate, descriptions, weights)
        for candidate in matching_slaves
    ]
def scale_aws_spot_fleet_request(resource, delta, target_capacity, sorted_slaves, dry_run):
    """Scales a spot fleet request by delta to reach target capacity

    If scaling up we just set target capacity and let AWS take care of the rest.
    If scaling down we pick the slaves we'd prefer to kill, put them in
    maintenance mode and drain them (via paasta_maintenance and
    setup_marathon_jobs). We then kill them once they are running 0 tasks or
    once a timeout is reached.

    :param resource: resource to scale
    :param delta: integer change in number of servers
    :param target_capacity: target number of instances
    :param sorted_slaves: list of slaves by order to kill
    :param dry_run: Don't drain or make changes to spot fleet if True
    """
    sfr_id = resource['id']
    ec2_client = boto3.client('ec2')
    if delta == 0:
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        if not dry_run:
            ec2_client.modify_spot_fleet_request(
                SpotFleetRequestId=sfr_id,
                TargetCapacity=target_capacity,
                ExcessCapacityTerminationPolicy='noTermination')
        return
    elif delta < 0:
        number_to_kill = delta * -1
        sfr_ips = get_sfr_instance_ips(sfr_id)
        log.debug("IPs in SFR: {0}".format(sfr_ips))
        sfr_sorted_slaves = [slave for slave in sorted_slaves
                             if slave_pid_to_ip(slave['pid']) in sfr_ips]
        log.info("SFR slave kill preference: {0}".format(
            [slave['pid'] for slave in sfr_sorted_slaves]))
        if number_to_kill > len(sfr_sorted_slaves):
            # BUG FIX: this message was broken by a literal newline inside the
            # string (a SyntaxError); rejoined via implicit concatenation.
            log.error("Didn't find enough candidates to kill. "
                      "This shouldn't happen so let's not kill anything!")
            return
        slaves_to_kill = sfr_sorted_slaves[0:number_to_kill]
        log.info("Set to kill: {0}".format([slave['pid'] for slave in slaves_to_kill]))
        instances_to_kill = {}
        for slave in slaves_to_kill:
            ip = slave_pid_to_ip(slave['pid'])
            instances_to_kill[slave['pid']] = {
                'ip': ip,
                'instance_id': get_instance_id_from_ip(ip),
            }
        # The start time of the maintenance window is the point at which
        # we give up waiting for the instance to drain and mark it for
        # termination anyway.
        start = int(time.time() + CLUSTER_DRAIN_TIMEOUT)
        # 600-second window: if we haven't cleaned up and terminated by then,
        # mesos should put the slave back into the pool.
        # (Original comment claimed "an hour", but the value is 10 minutes.)
        duration = 600
        log.info("Draining {0}".format(instances_to_kill))
        log.info("Decreasing spot fleet capacity to: {0}".format(target_capacity))
        if not dry_run:
            # Sort for a deterministic drain order (keeps testing easy).
            # BUG FIX: sorting raw dicts raises TypeError on Python 3;
            # sort by the 'ip' field instead.
            drain(
                [instance['ip'] for instance in
                 sorted(instances_to_kill.values(), key=lambda inst: inst['ip'])],
                start, duration)
            ec2_client.modify_spot_fleet_request(
                SpotFleetRequestId=sfr_id,
                TargetCapacity=target_capacity,
                ExcessCapacityTerminationPolicy='noTermination')
        # NOTE(review): the collapsed source makes the original indentation of
        # these two lines ambiguous; wait_and_terminate takes dry_run itself,
        # which suggests it runs in both modes — confirm against history.
        log.info("Waiting for instances to drain before we terminate")
        wait_and_terminate(instances_to_kill, dry_run)
def filter_aws_slaves(self, slaves_list):
    """Wrap mesos slaves running on this resource's instances in
    PaastaAwsSlave objects, pairing each with its EC2 description and status.

    :param slaves_list: list of slave dicts with a 'task_counts' attribute
    :returns: list of PaastaAwsSlave objects (description/status are None
        for slaves whose instance could not be matched by IP)
    """
    ips = self.get_instance_ips(self.instances, region=self.resource['region'])
    self.log.debug("IPs in AWS resources: {}".format(ips))
    slaves = [
        slave for slave in slaves_list
        if slave_pid_to_ip(slave['task_counts'].slave['pid']) in ips
    ]
    slave_ips = [
        slave_pid_to_ip(slave['task_counts'].slave['pid'])
        for slave in slaves
    ]
    instance_type_weights = self.get_instance_type_weights()
    instance_statuses = self.instance_status_for_instance_ids(
        instance_ids=[
            instance['InstanceId'] for instance in self.instances
        ],
    )
    instance_descriptions = self.instance_descriptions_for_ips(slave_ips)
    paasta_aws_slaves = []
    for slave in slaves:
        slave_ip = slave_pid_to_ip(slave['task_counts'].slave['pid'])
        matching_descriptions = self.filter_instance_description_for_ip(
            slave_ip, instance_descriptions)
        # BUG FIX: matching_status was only assigned inside the branch below,
        # so a slave with no matching description hit an UnboundLocalError
        # (or silently reused the previous iteration's status).
        matching_status = None
        if matching_descriptions:
            # BUG FIX: the assert message used %d on a string IP, which would
            # raise TypeError instead of reporting the real problem.
            assert len(matching_descriptions) == 1, (
                "There should be only one instance with the same IP. "
                "Found instances %s with the same ip %s" % (
                    ",".join(
                        [x['InstanceId'] for x in matching_descriptions],
                    ),
                    slave_ip,
                ))
            description = matching_descriptions[0]
            matching_status = self.filter_instance_status_for_instance_id(
                instance_id=description['InstanceId'],
                instance_statuses=instance_statuses,
            )
            assert len(
                matching_status
            ) == 1, "There should be only one InstanceStatus per instance"
        else:
            description = None
        paasta_aws_slaves.append(
            PaastaAwsSlave(
                slave=slave,
                instance_status=matching_status[0] if matching_status else None,
                instance_description=description,
                instance_type_weights=instance_type_weights,
            ))
    return paasta_aws_slaves
def get_sfr_slaves(resource, mesos_state):
    """Return a mapping of slave id -> mesos slave dict for every slave that
    belongs to this resource's spot fleet request and its configured pool.

    :param resource: resource dict with 'sfr', 'region' and 'pool' keys
    :param mesos_state: mesos master state dict
    :returns: dict keyed by slave id
    """
    valid_ips = get_sfr_instance_ips(resource['sfr'], region=resource['region'])
    matched = {}
    for candidate in mesos_state.get('slaves', []):
        in_sfr = slave_pid_to_ip(candidate['pid']) in valid_ips
        # Slaves with no 'pool' attribute are treated as pool 'default'.
        in_pool = candidate['attributes'].get('pool', 'default') == resource['pool']
        if in_sfr and in_pool:
            matched[candidate['id']] = candidate
    return matched
def __init__(self, slave, instance_descriptions, instance_type_weights=None):
    """Wrap a mesos slave dict together with the EC2 descriptions of the
    instances sharing its IP.

    :param slave: slave dict carrying a 'task_counts' attribute
    :param instance_descriptions: list of EC2 instance description dicts
    :param instance_type_weights: optional mapping of instance type -> weight
    """
    self.wrapped_slave = slave
    # Derive slave identity first, then match it against the descriptions.
    self.task_counts = slave['task_counts']
    self.slave = self.task_counts.slave
    self.ip = slave_pid_to_ip(self.slave['pid'])
    self.instance_descriptions = instance_descriptions
    self.instances = get_instances_from_ip(self.ip, instance_descriptions)
    self.instance_type_weights = instance_type_weights
def get_aws_slaves(self, mesos_state):
    """Return a mapping of slave id -> mesos slave dict for every slave that
    runs on one of this resource's instances and sits in its pool.

    :param mesos_state: mesos master state dict
    :returns: dict keyed by slave id
    """
    valid_ips = self.get_instance_ips(self.instances, region=self.resource['region'])
    matched = {}
    for candidate in mesos_state.get('slaves', []):
        on_resource = slave_pid_to_ip(candidate['pid']) in valid_ips
        # Slaves with no 'pool' attribute are treated as pool 'default'.
        in_pool = candidate['attributes'].get('pool', 'default') == self.resource['pool']
        if on_resource and in_pool:
            matched[candidate['id']] = candidate
    return matched
def __init__(self, slave, instance_description, instance_status=None, instance_type_weights=None):
    """Wrap a mesos slave dict together with its EC2 instance description
    and (optionally) its instance status.

    :param slave: slave dict carrying a 'task_counts' attribute
    :param instance_description: EC2 instance description dict (or None)
    :param instance_status: EC2 InstanceStatus dict; defaults to {}
    :param instance_type_weights: optional mapping of instance type -> weight
    """
    if instance_status is None:
        instance_status = {}
    self.wrapped_slave = slave
    self.instance_description = instance_description
    # BUG FIX: instance_status was normalized to {} above but never stored,
    # silently dropping the value passed in by filter_aws_slaves.
    self.instance_status = instance_status
    self.instance_type_weights = instance_type_weights
    self.task_counts = slave['task_counts']
    self.slave = self.task_counts.slave
    self.ip = slave_pid_to_ip(self.slave['pid'])
def spotfleet_metrics_provider(spotfleet_request_id, resource, pool_settings):
    """Compute current and target capacity for a spot fleet request based on
    how far the pool's utilization is from its target.

    :param spotfleet_request_id: id of the SFR to evaluate
    :param resource: resource dict with 'region' and 'pool' keys; the SFR dict
        is stored back onto it under 'sfr' as a side effect
    :param pool_settings: per-pool settings; 'target_utilization' overrides
        DEFAULT_TARGET_UTILIZATION
    :returns: (current, target) capacity tuple; (0, 0) if the SFR is missing
        or not active
    :raises ClusterAutoscalingError: if more than MISSING_SLAVE_PANIC_THRESHOLD
        of the SFR's instances are not registered in mesos
    """
    mesos_state = get_mesos_master().state
    sfr = get_sfr(spotfleet_request_id, region=resource['region'])
    if not sfr or not sfr['SpotFleetRequestState'] == 'active':
        log.error(
            "Ignoring SFR {0} that does not exist or is not active.".format(
                spotfleet_request_id))
        return 0, 0
    sfr['ActiveInstances'] = get_spot_fleet_instances(
        spotfleet_request_id, region=resource['region'])
    resource['sfr'] = sfr
    desired_instances = len(sfr['ActiveInstances'])
    instance_ips = get_sfr_instance_ips(sfr, region=resource['region'])
    slaves = {
        slave['id']: slave
        for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips and
        slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    # NOTE(review): an active SFR with zero ActiveInstances would make these
    # divisions raise ZeroDivisionError — confirm whether that can occur.
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" %
             (float(float(current_instances) / float(desired_instances)) * 100,
              current_instances, desired_instances))
    if float(current_instances) / desired_instances < (
            1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
                current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)
    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'], mesos_state)[resource['pool']]
    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    # Utilization is driven by the most-constrained resource dimension.
    utilization = 1.0 - min([
        float(float(pair[0]) / float(pair[1]))
        for pair in zip(free_pool_resources, total_pool_resources)
    ])
    target_utilization = pool_settings.get('target_utilization',
                                           DEFAULT_TARGET_UTILIZATION)
    # BUG FIX: this assignment was split across a raw line break in the
    # source ("error =" / "utilization - ..."), a SyntaxError; rejoined.
    error = utilization - target_utilization
    current, target = get_spot_fleet_delta(resource, error)
    return current, target
def spotfleet_metrics_provider(spotfleet_request_id, mesos_state, resource):
    """Compute current and target capacity for a spot fleet request based on
    how far the pool's utilization is from CLUSTER_TARGET_UTILIZATION.

    :param spotfleet_request_id: id of the SFR to evaluate
    :param mesos_state: mesos master state dict
    :param resource: resource dict with a 'pool' key; the SFR dict is stored
        back onto it under 'sfr' as a side effect
    :returns: (current, target) capacity tuple; (0, 0) if the SFR is missing
        or not active
    :raises ClusterAutoscalingError: if more than MISSING_SLAVE_PANIC_THRESHOLD
        of the SFR's instances are not registered in mesos
    """
    sfr = get_sfr(spotfleet_request_id)
    # BUG FIX: the state check previously ran only *after* mutating sfr, so a
    # missing SFR (get_sfr returning a falsy value) raised TypeError instead
    # of being skipped. Guard first, matching the sibling provider.
    if not sfr or not sfr['SpotFleetRequestState'] == 'active':
        log.error("Ignoring SFR {0} that is not yet active or is cancelled etc.".format(spotfleet_request_id))
        return 0, 0
    sfr['ActiveInstances'] = get_spot_fleet_instances(spotfleet_request_id)
    resource['sfr'] = sfr
    desired_instances = len(sfr['ActiveInstances'])
    instance_ips = get_sfr_instance_ips(sfr)
    slaves = {
        slave['id']: slave
        for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips and
        slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    # NOTE(review): zero ActiveInstances would make these divisions raise
    # ZeroDivisionError — confirm whether an active SFR can be empty.
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" % (
        float(float(current_instances) / float(desired_instances)) * 100,
        current_instances, desired_instances))
    if float(current_instances) / desired_instances < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = ("We currently have %d instances active in mesos out of a desired %d.\n"
                         "Refusing to scale because we either need to wait for the requests to be "
                         "filled, or the new instances are not healthy for some reason.\n"
                         "(cowardly refusing to go past %.2f%% missing instances)") % (
            current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)
    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'], mesos_state
    )[resource['pool']]
    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    # Utilization is driven by the most-constrained resource dimension.
    utilization = 1.0 - min([
        float(float(pair[0]) / float(pair[1]))
        for pair in zip(free_pool_resources, total_pool_resources)
    ])
    error = utilization - CLUSTER_TARGET_UTILIZATION
    current, target = get_spot_fleet_delta(resource, error)
    return current, target
def test_slave_pid_to_ip():
    """slave_pid_to_ip extracts the bare IP from a mesos slave pid string."""
    extracted = mesos_tools.slave_pid_to_ip('slave(1)@10.40.31.172:5051')
    expected = '10.40.31.172'
    assert extracted == expected
def test_slave_pid_to_ip():
    """slave_pid_to_ip extracts the bare IP from a mesos slave pid string."""
    pid = "slave(1)@10.40.31.172:5051"
    assert mesos_tools.slave_pid_to_ip(pid) == "10.40.31.172"