Example #1
def filter_sfr_slaves(slaves_list, resource):
    sfr = resource['sfr']
    sfr_ips = get_sfr_instance_ips(sfr, region=resource['region'])
    log.debug("IPs in SFR: {0}".format(sfr_ips))
    sfr_slaves = [slave for slave in slaves_list if slave_pid_to_ip(slave['task_counts'].slave['pid']) in sfr_ips]
    sfr_slave_ips = [slave_pid_to_ip(slave['task_counts'].slave['pid']) for slave in sfr_slaves]
    sfr_instance_descriptions = describe_instances([], region=resource['region'],
                                                   instance_filters=[{'Name': 'private-ip-address',
                                                                      'Values': sfr_slave_ips}])
    sfr_slave_instances = []
    for slave in sfr_slaves:
        ip = slave_pid_to_ip(slave['task_counts'].slave['pid'])
        instances = get_instances_from_ip(ip, sfr_instance_descriptions)
        if not instances:
            log.warning("Couldn't find instance for ip {0}".format(ip))
            continue
        if len(instances) > 1:
            log.error("Found more than one instance with the same private IP {0}. "
                      "This should never happen")
            continue
        sfr_slave_instances.append({'ip': ip,
                                    'task_counts': slave['task_counts'],
                                    'hostname': slave['task_counts'].slave['hostname'],
                                    'id': slave['task_counts'].slave['id'],
                                    'pid': slave['task_counts'].slave['pid'],
                                    'instance_id': instances[0]['InstanceId']})
    ret = []
    instance_type_weights = get_instance_type_weights(sfr)
    for slave in sfr_slave_instances:
        instance_description = [instance_description for instance_description in sfr_instance_descriptions
                                if instance_description['InstanceId'] == slave['instance_id']][0]
        slave['instance_type'] = instance_description['InstanceType']
        slave['instance_weight'] = instance_type_weights[slave['instance_type']]
        ret.append(slave)
    return ret
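
This example and the next rely on two helpers whose behavior can be inferred from how their results are consumed. A minimal sketch of both, assuming EC2 API field names (PrivateIpAddress from DescribeInstances, WeightedCapacity from the spot fleet config); whether the real helpers read these fields this way is an assumption:

def get_instances_from_ip(ip, instance_descriptions):
    # Sketch: keep every instance description whose private IP matches.
    return [desc for desc in instance_descriptions
            if desc.get('PrivateIpAddress') == ip]

def get_instance_type_weights(sfr):
    # Sketch: map each launch specification's instance type to its weight,
    # defaulting to 1 when no WeightedCapacity is set.
    specs = sfr['SpotFleetRequestConfig']['LaunchSpecifications']
    return {spec['InstanceType']: spec.get('WeightedCapacity', 1)
            for spec in specs}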
Example #2
def filter_sfr_slaves(sorted_slaves, sfr):
    sfr_ips = get_sfr_instance_ips(sfr)
    log.debug("IPs in SFR: {0}".format(sfr_ips))
    sfr_sorted_slaves = [slave for slave in sorted_slaves if slave_pid_to_ip(slave['pid']) in sfr_ips]
    sfr_sorted_slave_ips = [slave_pid_to_ip(slave['pid']) for slave in sfr_sorted_slaves]
    sfr_instance_descriptions = describe_instances([], instance_filters=[{'Name': 'private-ip-address',
                                                                          'Values': sfr_sorted_slave_ips}])
    sfr_sorted_slave_instances = []
    for slave in sfr_sorted_slaves:
        ip = slave_pid_to_ip(slave['pid'])
        instances = get_instances_from_ip(ip, sfr_instance_descriptions)
        if not instances:
            log.warning("Couldn't find instance for ip {0}".format(ip))
            continue
        if len(instances) > 1:
            log.error("Found more than one instance with the same private IP {0}. "
                      "This should never happen")
            continue
        sfr_sorted_slave_instances.append({'ip': ip,
                                           'pid': slave['pid'],
                                           'instance_id': instances[0]['InstanceId']})
    ret = []
    instance_type_weights = get_instance_type_weights(sfr)
    for slave in sfr_sorted_slave_instances:
        instance_description = [instance_description for instance_description in sfr_instance_descriptions
                                if instance_description['InstanceId'] == slave['instance_id']][0]
        slave['instance_type'] = instance_description['InstanceType']
        slave['instance_weight'] = instance_type_weights[slave['instance_type']]
        ret.append(slave)
    return ret
Example #3
    def filter_aws_slaves(self, slaves_list):
        ips = self.get_instance_ips(self.instances,
                                    region=self.resource['region'])
        self.log.debug("IPs in AWS resources: {0}".format(ips))
        slaves = [
            slave for slave in slaves_list
            if slave_pid_to_ip(slave['task_counts'].slave['pid']) in ips
        ]
        slave_ips = [
            slave_pid_to_ip(slave['task_counts'].slave['pid'])
            for slave in slaves
        ]
        instance_descriptions = self.describe_instances(
            [],
            region=self.resource['region'],
            instance_filters=[{
                'Name': 'private-ip-address',
                'Values': slave_ips
            }])
        instance_type_weights = self.get_instance_type_weights()
        slave_instances = [
            PaastaAwsSlave(slave, instance_descriptions, instance_type_weights)
            for slave in slaves
        ]
        return slave_instances
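
A hypothetical use of the wrappers this method returns, assuming the attributes set up in the PaastaAwsSlave constructor shown in Example #7 (autoscaler is a stand-in name for whatever object carries this method):

for aws_slave in autoscaler.filter_aws_slaves(slaves_list):
    # Each PaastaAwsSlave resolves its own IP and its matching EC2
    # instance descriptions (see Example #7).
    print(aws_slave.ip, aws_slave.instances)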
Example #4
def scale_aws_spot_fleet_request(resource, delta, target_capacity, sorted_slaves, dry_run):
    """Scales a spot fleet request by delta to reach target capacity
    If scaling up we just set target capacity and let AWS take care of the rest
    If scaling down we pick the slaves we'd prefer to kill, put them in maintenance
    mode and drain them (via paasta_maintenance and setup_marathon_jobs). We then kill
    them once they are running 0 tasks or once a timeout is reached

    :param resource: resource to scale
    :param delta: integer change in number of servers
    :param target_capacity: target number of instances
    :param sorted_slaves: list of slaves by order to kill
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    sfr_id = resource['id']
    ec2_client = boto3.client('ec2')
    if delta == 0:
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        if not dry_run:
            ec2_client.modify_spot_fleet_request(SpotFleetRequestId=sfr_id, TargetCapacity=target_capacity,
                                                 ExcessCapacityTerminationPolicy='noTermination')
            return
    elif delta < 0:
        number_to_kill = delta * -1
        sfr_ips = get_sfr_instance_ips(sfr_id)
        log.debug("IPs in SFR: {0}".format(sfr_ips))
        sfr_sorted_slaves = [slave for slave in sorted_slaves if slave_pid_to_ip(slave['pid']) in sfr_ips]
        log.info("SFR slave kill preference: {0}".format([slave['pid'] for slave in sfr_sorted_slaves]))
        if number_to_kill > len(sfr_sorted_slaves):
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        slaves_to_kill = sfr_sorted_slaves[0:number_to_kill]
        log.info("Set to kill: {0}".format([slave['pid'] for slave in slaves_to_kill]))
        instances_to_kill = {}
        for slave in slaves_to_kill:
            ip = slave_pid_to_ip(slave['pid'])
            instances_to_kill[slave['pid']] = {'ip': ip,
                                               'instance_id': get_instance_id_from_ip(ip)}
        # The start time of the maintenance window is the point at which
        # we give up waiting for the instance to drain and mark it for termination anyway
        start = int(time.time() + CLUSTER_DRAIN_TIMEOUT)
        # Set the duration to ten minutes; if we haven't cleaned up and terminated
        # by then, mesos should put the slave back into the pool
        duration = 600
        log.info("Draining {0}".format(instances_to_kill))
        log.info("Decreasing spot fleet capacity to: {0}".format(target_capacity))
        if not dry_run:
            # sort the IPs to make testing deterministic
            drain(sorted(instance['ip'] for instance in instances_to_kill.values()), start, duration)
            ec2_client.modify_spot_fleet_request(SpotFleetRequestId=sfr_id, TargetCapacity=target_capacity,
                                                 ExcessCapacityTerminationPolicy='noTermination')
        log.info("Waiting for instances to drain before we terminate")
        wait_and_terminate(instances_to_kill, dry_run)
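
A hypothetical dry-run invocation; the resource shape and values here are made up, and sorted_slaves is a list of mesos slave dicts (each with a 'pid') ordered by kill preference:

resource = {'id': 'sfr-0123abcd'}
# delta=-2 asks for two slaves to be drained and terminated; with
# dry_run=True the plan is only logged and nothing is drained or modified.
scale_aws_spot_fleet_request(resource, delta=-2, target_capacity=8,
                             sorted_slaves=sorted_slaves, dry_run=True)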
Example #5
    def filter_aws_slaves(self, slaves_list):
        ips = self.get_instance_ips(self.instances,
                                    region=self.resource['region'])
        self.log.debug("IPs in AWS resources: {}".format(ips))
        slaves = [
            slave for slave in slaves_list
            if slave_pid_to_ip(slave['task_counts'].slave['pid']) in ips
        ]
        slave_ips = [
            slave_pid_to_ip(slave['task_counts'].slave['pid'])
            for slave in slaves
        ]
        instance_type_weights = self.get_instance_type_weights()
        instance_statuses = self.instance_status_for_instance_ids(
            instance_ids=[
                instance['InstanceId'] for instance in self.instances
            ], )
        instance_descriptions = self.instance_descriptions_for_ips(slave_ips)

        paasta_aws_slaves = []
        for slave in slaves:
            slave_ip = slave_pid_to_ip(slave['task_counts'].slave['pid'])
            matching_descriptions = self.filter_instance_description_for_ip(
                slave_ip, instance_descriptions)
            if matching_descriptions:
                assert len(matching_descriptions) == 1, (
                    "There should be only one instance with the same IP. "
                    "Found instances %s with the same ip %s" % (
                        ",".join(
                            [x['InstanceId']
                             for x in matching_descriptions], ),
                        slave_ip,
                    ))
                description = matching_descriptions[0]
                matching_status = self.filter_instance_status_for_instance_id(
                    instance_id=description['InstanceId'],
                    instance_statuses=instance_statuses,
                )
                assert len(
                    matching_status
                ) == 1, "There should be only one InstanceStatus per instance"
            else:
                description = None
                matching_status = None

            paasta_aws_slaves.append(
                PaastaAwsSlave(
                    slave=slave,
                    instance_status=matching_status[0] if matching_status else None,
                    instance_description=description,
                    instance_type_weights=instance_type_weights,
                ))

        return paasta_aws_slaves
Example #6
def get_sfr_slaves(resource, mesos_state):
    instance_ips = get_sfr_instance_ips(resource['sfr'], region=resource['region'])
    slaves = {
        slave['id']: slave for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips and
        slave['attributes'].get('pool', 'default') == resource['pool']
    }
    return slaves
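
The shape of a mesos slave entry this function filters can be read off the code; an illustrative entry (values made up, the 'pid' format matches the tests in Examples #13 and #14):

slave = {
    'id': 'slave-12',                     # used as the dict key
    'pid': 'slave(1)@10.40.31.172:5051',  # the IP is parsed out of this
    'attributes': {'pool': 'default'},    # must match resource['pool']
}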
Example #7
    def __init__(self, slave, instance_descriptions, instance_type_weights=None):
        self.wrapped_slave = slave
        self.instance_descriptions = instance_descriptions
        self.instance_type_weights = instance_type_weights
        self.task_counts = slave['task_counts']
        self.slave = self.task_counts.slave
        self.ip = slave_pid_to_ip(self.slave['pid'])
        self.instances = get_instances_from_ip(self.ip, self.instance_descriptions)
Example #8
    def get_aws_slaves(self, mesos_state):
        instance_ips = self.get_instance_ips(self.instances, region=self.resource['region'])
        slaves = {
            slave['id']: slave for slave in mesos_state.get('slaves', [])
            if slave_pid_to_ip(slave['pid']) in instance_ips and
            slave['attributes'].get('pool', 'default') == self.resource['pool']
        }
        return slaves
Example #10
    def __init__(self, slave, instance_description, instance_status=None, instance_type_weights=None):
        if instance_status is None:
            instance_status = {}
        self.wrapped_slave = slave
        self.instance_description = instance_description
        self.instance_type_weights = instance_type_weights
        self.task_counts = slave['task_counts']
        self.slave = self.task_counts.slave
        self.ip = slave_pid_to_ip(self.slave['pid'])
Example #11
def spotfleet_metrics_provider(spotfleet_request_id, resource, pool_settings):
    mesos_state = get_mesos_master().state
    sfr = get_sfr(spotfleet_request_id, region=resource['region'])
    if not sfr or sfr['SpotFleetRequestState'] != 'active':
        log.error(
            "Ignoring SFR {0} that does not exist or is not active.".format(
                spotfleet_request_id))
        return 0, 0
    sfr['ActiveInstances'] = get_spot_fleet_instances(
        spotfleet_request_id, region=resource['region'])
    resource['sfr'] = sfr
    desired_instances = len(sfr['ActiveInstances'])
    instance_ips = get_sfr_instance_ips(sfr, region=resource['region'])
    slaves = {
        slave['id']: slave
        for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips
        and slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" %
             (float(float(current_instances) / float(desired_instances)) * 100,
              current_instances, desired_instances))
    if float(current_instances) / desired_instances < (
            1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
                current_instances, desired_instances,
                MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state)[resource['pool']]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    utilization = 1.0 - min([
        float(free) / float(total)
        for free, total in zip(free_pool_resources, total_pool_resources)
    ])
    target_utilization = pool_settings.get('target_utilization',
                                           DEFAULT_TARGET_UTILIZATION)
    error = utilization - target_utilization
    current, target = get_spot_fleet_delta(resource, error)
    return current, target
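
A worked sketch of the utilization math above, with made-up numbers:

free_pool_resources = (10.0, 64.0, 200.0)     # e.g. free cpus, mem, disk
total_pool_resources = (40.0, 128.0, 1000.0)  # pool totals
utilization = 1.0 - min(free / total for free, total
                        in zip(free_pool_resources, total_pool_resources))
# min(0.25, 0.50, 0.20) == 0.20, so utilization == 0.80: the pool is
# sized by its most constrained resource (disk here).
error = utilization - 0.80  # zero error at a target_utilization of 0.80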
Example #12
def spotfleet_metrics_provider(spotfleet_request_id, mesos_state, resource):
    sfr = get_sfr(spotfleet_request_id)
    sfr['ActiveInstances'] = get_spot_fleet_instances(spotfleet_request_id)
    resource['sfr'] = sfr
    if sfr['SpotFleetRequestState'] != 'active':
        log.error("Ignoring SFR {0} that is not yet active or has been cancelled.".format(spotfleet_request_id))
        return 0, 0
    desired_instances = len(sfr['ActiveInstances'])
    instance_ips = get_sfr_instance_ips(sfr)
    slaves = {
        slave['id']: slave for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips and
        slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" % (
             float(float(current_instances) / float(desired_instances)) * 100, current_instances, desired_instances))
    if float(current_instances) / desired_instances < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = ("We currently have %d instances active in mesos out of a desired %d.\n"
                         "Refusing to scale because we either need to wait for the requests to be "
                         "filled, or the new instances are not healthy for some reason.\n"
                         "(cowardly refusing to go past %.2f%% missing instances)") % (
            current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state
    )[resource['pool']]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    utilization = 1.0 - min([
        float(free) / float(total)
        for free, total in zip(free_pool_resources, total_pool_resources)
    ])
    error = utilization - CLUSTER_TARGET_UTILIZATION
    current, target = get_spot_fleet_delta(resource, error)
    return current, target
Example #13
def test_slave_pid_to_ip():
    ret = mesos_tools.slave_pid_to_ip('slave(1)@10.40.31.172:5051')
    assert ret == '10.40.31.172'
Example #14
def test_slave_pid_to_ip():
    ret = mesos_tools.slave_pid_to_ip("slave(1)@10.40.31.172:5051")
    assert ret == "10.40.31.172"
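
These tests pin down the behavior of slave_pid_to_ip; a minimal implementation sketch that satisfies them (not necessarily the project's actual code):

def slave_pid_to_ip(pid):
    # A pid looks like 'slave(1)@10.40.31.172:5051'; the IP is the host
    # part between the '@' and the ':'.
    return pid.split('@')[1].split(':')[0]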