Exemplo n.º 1
0
def test_undrain(
    mock_unreserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_operator_api,
):
    """Exercise undrain() against mocked collaborators.

    The first call checks the arguments handed to each mock.  The second
    call makes the unreserve mock raise HTTPError and checks that undrain()
    does not propagate it and that the operator API mock is called again.
    """
    schedule_payload = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule_payload

    undrain(hostnames=['some-host'])

    # The payload is built exactly once, with drain switched off.
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == mock.call(
        ['some-host'], drain=False,
    )

    # Resources on the host are unreserved exactly once.
    assert mock_unreserve_all_resources.call_count == 1
    assert mock_unreserve_all_resources.call_args == mock.call(['some-host'])

    # The object returned by the operator API mock receives the payload.
    assert mock_operator_api.call_count == 1
    operator_client = mock_operator_api.return_value
    assert operator_client.call_count == 1
    assert operator_client.call_args == mock.call(data=schedule_payload)

    # A failing unreserve must not stop undrain() from completing.
    mock_unreserve_all_resources.side_effect = HTTPError()
    undrain(hostnames=['some-host'])
    assert mock_operator_api.call_count == 2
Exemplo n.º 2
0
def test_undrain(
    mock_unreserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    """Exercise undrain() against mocked collaborators.

    The first call checks the arguments handed to each mock, including the
    JSON-encoded POST issued through the schedule client.  The second call
    makes the unreserve mock raise HTTPError and checks that undrain() does
    not propagate it and that the schedule client mock is requested again.
    """
    schedule_payload = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule_payload

    undrain(hostnames=['some-host'])

    # The payload is built exactly once, with drain switched off.
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == mock.call(
        ['some-host'], drain=False,
    )

    # Resources on the host are unreserved exactly once.
    assert mock_unreserve_all_resources.call_count == 1
    assert mock_unreserve_all_resources.call_args == mock.call(['some-host'])

    # The schedule client is obtained once and used for a single POST.
    assert mock_get_schedule_client.call_count == 1
    schedule_client = mock_get_schedule_client.return_value
    assert schedule_client.call_count == 1
    assert schedule_client.call_args == mock.call(
        method="POST",
        endpoint="",
        data=json.dumps(schedule_payload),
    )

    # A failing unreserve must not stop undrain() from completing.
    mock_unreserve_all_resources.side_effect = HTTPError()
    undrain(hostnames=['some-host'])
    assert mock_get_schedule_client.call_count == 2
Exemplo n.º 3
0
def cleanup_forgotten_draining():
    """Undrain every host reported as forgotten in the draining state.

    Hosts are looked up with a grace value of 10 * 60 seconds, converted
    with seconds_to_nanoseconds().  When none are found only a debug
    message is logged.
    """
    log.debug("Cleaning up hosts forgotten draining")
    grace_period = seconds_to_nanoseconds(10 * 60)
    forgotten_hosts = get_hosts_forgotten_draining(grace=grace_period)
    if not forgotten_hosts:
        log.debug("No hosts forgotten draining")
        return
    undrain(hostnames=forgotten_hosts)
Exemplo n.º 4
0
def gracefully_terminate_slave(resource, slave_to_kill, pool_settings,
                               current_capacity, new_capacity, dry_run):
    """Drain a slave, shrink the spot fleet, then terminate the instance.

    :param resource: mapping providing the spot fleet 'id' and 'region'
    :param slave_to_kill: mapping providing 'pid', 'hostname' and 'ip'
    :param pool_settings: mapping; 'drain_timeout' is read from it and
        falls back to DEFAULT_DRAIN_TIMEOUT
    :param current_capacity: capacity restored if termination fails
    :param new_capacity: target capacity; floored to an int before use
    :param dry_run: when true, drain() and undrain() are never called
    :raises HTTPError: re-raised when starting the drain fails
    :raises FailSetSpotCapacity: re-raised when the capacity cannot be
        lowered; the host is undrained first
    """
    sfr_id = resource['id']
    drain_timeout = pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
    # The maintenance window opens at the moment we stop waiting for the
    # instance to drain and mark it for termination regardless.
    window_start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
    # 600 seconds; the value is fairly arbitrary as mesos doesn't actually
    # do anything at the end of the maintenance window.
    window_length = 600 * 1000000000  # nanoseconds
    slave_pid = slave_to_kill['pid']

    def undrain_unless_dry_run():
        # host_spec is only bound (and only read) outside of dry-run mode.
        log.info("Undraining {0}".format(slave_pid))
        if not dry_run:
            undrain([host_spec])

    log.info("Draining {0}".format(slave_pid))
    if not dry_run:
        host_spec = "{0}|{1}".format(slave_to_kill['hostname'],
                                     slave_to_kill['ip'])
        try:
            drain([host_spec], window_start, window_length)
        except HTTPError as e:
            log.error(
                "Failed to start drain on {0}: {1}\n Trying next host".format(
                    slave_to_kill['hostname'], e))
            raise

    log.info("Decreasing spot fleet capacity from {0} to: {1}".format(
        current_capacity, new_capacity))
    # Instance weights can be floats but the target has to be an integer
    # because this is all AWS allows on the API call to set target capacity
    new_capacity = int(floor(new_capacity))
    try:
        set_spot_fleet_request_capacity(
            sfr_id, new_capacity, dry_run, region=resource['region'])
    except FailSetSpotCapacity:
        log.error("Couldn't update spot fleet, stopping autoscaler")
        undrain_unless_dry_run()
        raise

    log.info("Waiting for instance to drain before we terminate")
    try:
        wait_and_terminate(
            slave_to_kill, drain_timeout, dry_run, region=resource['region'])
    except ClientError as e:
        log.error("Failure when terminating: {0}: {1}".format(slave_pid, e))
        log.error(
            "Setting spot fleet capacity back to {0}".format(current_capacity))
        set_spot_fleet_request_capacity(
            sfr_id, current_capacity, dry_run, region=resource['region'])
    finally:
        # Runs whether or not termination succeeded.
        undrain_unless_dry_run()
Exemplo n.º 5
0
def paasta_maintenance():
    """Manipulate or query the maintenance state of a PaaSTA host.

    Parses the command line with parse_args(), configures logging from the
    verbosity count and dispatches on the requested action.

    :returns: "Done" for drain/undrain/down/up, the value produced by the
        underlying call for every other action, or None when no hostname
        was supplied for an action other than 'status'
    :raises NotImplementedError: if the action is not recognised
    """
    args = parse_args()

    if args.verbose >= 2:
        log_level = logging.DEBUG
    elif args.verbose == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    action, hostnames = args.action, args.hostname

    if not hostnames and action != 'status':
        paasta_print("You must specify one or more hostnames")
        return

    start, duration = args.start, args.duration

    # State-changing actions: their result is ignored and "Done" is reported.
    commands = {
        'drain': lambda: mesos_maintenance.drain(hostnames, start, duration),
        'undrain': lambda: mesos_maintenance.undrain(hostnames),
        'down': lambda: mesos_maintenance.down(hostnames),
        'up': lambda: mesos_maintenance.up(hostnames),
    }
    # Read-only actions: whatever they return is printed and returned.
    # The per-host checks only look at the first hostname given.
    queries = {
        'status': lambda: mesos_maintenance.friendly_status(),
        'cluster_status': lambda: mesos_maintenance.status(),
        'schedule': lambda: mesos_maintenance.schedule(),
        'is_safe_to_drain': lambda: is_safe_to_drain(hostnames[0]),
        'is_safe_to_kill': lambda: is_safe_to_kill(hostnames[0]),
        'is_host_drained':
            lambda: mesos_maintenance.is_host_drained(hostnames[0]),
        'is_host_down':
            lambda: mesos_maintenance.is_host_down(hostnames[0]),
        'is_host_draining':
            lambda: mesos_maintenance.is_host_draining(hostnames[0]),
        'is_host_past_maintenance_start':
            lambda: mesos_maintenance.is_host_past_maintenance_start(
                hostnames[0]),
        'is_host_past_maintenance_end':
            lambda: mesos_maintenance.is_host_past_maintenance_end(
                hostnames[0]),
    }

    if action in commands:
        commands[action]()
        ret = "Done"
    elif action in queries:
        ret = queries[action]()
    else:
        raise NotImplementedError("Action: '%s' is not implemented." % action)
    paasta_print(ret)
    return ret
Exemplo n.º 6
0
def paasta_maintenance():
    """Manipulate or query the maintenance state of a PaaSTA host.

    Parses the command line with parse_args(), configures logging from the
    verbosity count and dispatches on the requested action.

    :returns: "Done" for drain/undrain/down/up, the value produced by the
        underlying call for every other action ('status' and 'schedule'
        are %-formatted into a string), or None when no hostname was
        supplied for an action other than 'status'
    :raises NotImplementedError: if the action is not recognised
    """
    args = parse_args()

    if args.verbose >= 2:
        log_level = logging.DEBUG
    elif args.verbose == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    action, hostnames = args.action, args.hostname

    if not hostnames and action != 'status':
        paasta_print("You must specify one or more hostnames")
        return

    start, duration = args.start, args.duration

    # State-changing actions: their result is ignored and "Done" is reported.
    commands = {
        'drain': lambda: mesos_maintenance.drain(hostnames, start, duration),
        'undrain': lambda: mesos_maintenance.undrain(hostnames),
        'down': lambda: mesos_maintenance.down(hostnames),
        'up': lambda: mesos_maintenance.up(hostnames),
    }
    # Read-only actions: whatever they return is printed and returned.
    # The per-host checks only look at the first hostname given.
    queries = {
        'status': lambda: "%s" % mesos_maintenance.status(),
        'schedule': lambda: "%s" % mesos_maintenance.schedule(),
        'is_safe_to_drain': lambda: is_safe_to_drain(hostnames[0]),
        'is_safe_to_kill': lambda: is_safe_to_kill(hostnames[0]),
        'is_host_drained':
            lambda: mesos_maintenance.is_host_drained(hostnames[0]),
        'is_host_down':
            lambda: mesos_maintenance.is_host_down(hostnames[0]),
        'is_host_draining':
            lambda: mesos_maintenance.is_host_draining(hostnames[0]),
        'is_host_past_maintenance_start':
            lambda: mesos_maintenance.is_host_past_maintenance_start(
                hostnames[0]),
        'is_host_past_maintenance_end':
            lambda: mesos_maintenance.is_host_past_maintenance_end(
                hostnames[0]),
    }

    if action in commands:
        commands[action]()
        ret = "Done"
    elif action in queries:
        ret = queries[action]()
    else:
        raise NotImplementedError("Action: '%s' is not implemented." % action)
    paasta_print(ret)
    return ret
Exemplo n.º 7
0
 def gracefully_terminate_slave(self, slave_to_kill, current_capacity,
                                new_capacity):
     """Drain a slave, lower the resource capacity, then terminate it.

     :param slave_to_kill: object exposing pid, hostname and ip
     :param current_capacity: capacity restored if termination fails
     :param new_capacity: target capacity; floored to an int before use
     :raises HTTPError: re-raised when starting the drain fails
     :raises FailSetResourceCapacity: re-raised when the capacity cannot
         be lowered; the host is undrained first if it was drained
     """
     drain_timeout = self.pool_settings.get('drain_timeout',
                                            DEFAULT_DRAIN_TIMEOUT)
     # The maintenance window opens at the moment we stop waiting for the
     # instance to drain and mark it for termination regardless.
     window_start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
     # 600 seconds; the value is fairly arbitrary as mesos doesn't actually
     # do anything at the end of the maintenance window.
     window_length = 600 * 1000000000  # nanoseconds
     slave_pid = slave_to_kill.pid
     self.log.info("Draining {}".format(slave_pid))
     should_drain = self.should_drain(slave_to_kill)

     def undrain_if_drained():
         # host_spec is only bound (and only read) when should_drain is set.
         self.log.info("Undraining {}".format(slave_pid))
         if should_drain:
             undrain([host_spec])

     if should_drain:
         host_spec = "{}|{}".format(slave_to_kill.hostname, slave_to_kill.ip)
         try:
             drain([host_spec], window_start, window_length)
         except HTTPError as e:
             self.log.error(
                 "Failed to start drain on {}: {}\n Trying next host".format(
                     slave_to_kill.hostname, e))
             raise

     self.log.info("Decreasing resource from {} to: {}".format(
         current_capacity, new_capacity))
     # Instance weights can be floats but the target has to be an integer
     # because this is all AWS allows on the API call to set target capacity
     new_capacity = int(floor(new_capacity))
     try:
         self.set_capacity(new_capacity)
     except FailSetResourceCapacity:
         self.log.error(
             "Couldn't update resource capacity, stopping autoscaler")
         undrain_if_drained()
         raise

     self.log.info("Waiting for instance to drain before we terminate")
     try:
         self.wait_and_terminate(
             slave=slave_to_kill,
             drain_timeout=drain_timeout,
             dry_run=self.dry_run,
             region=self.resource['region'],
             should_drain=should_drain,
         )
     except ClientError as e:
         # Termination failed: put the capacity back and release the host.
         self.log.error(
             "Failure when terminating: {}: {}".format(slave_pid, e))
         self.log.error(
             "Setting resource capacity back to {}".format(current_capacity))
         self.set_capacity(current_capacity)
         undrain_if_drained()
Exemplo n.º 8
0
def _clean_up_maintenance(context):
    """Undrain the host recorded on the context, if there is one.

    While the host is undrained, get_principal and get_secret in
    paasta_tools.mesos_maintenance are patched to return the credentials
    loaded from /etc/mesos-slave-secret.  The at_risk_host attribute is
    removed from the context afterwards.
    """
    if not hasattr(context, "at_risk_host"):
        return

    principal_patch = mock.patch(
        "paasta_tools.mesos_maintenance.get_principal", autospec=True)
    secret_patch = mock.patch(
        "paasta_tools.mesos_maintenance.get_secret", autospec=True)
    with principal_patch as mock_get_principal:
        with secret_patch as mock_get_secret:
            creds = load_credentials(mesos_secrets="/etc/mesos-slave-secret")
            mock_get_principal.return_value = creds.principal
            mock_get_secret.return_value = creds.secret
            undrain([context.at_risk_host])
            del context.at_risk_host
Exemplo n.º 9
0
def _clean_up_maintenance(context):
    """If a host is marked as draining/down for maintenance, bring it back up

    While the host is undrained, get_principal and get_secret in
    paasta_tools.mesos_maintenance are patched to return the credentials
    loaded from /etc/mesos-slave-secret.  The at_risk_host attribute is
    removed from the context afterwards.
    """
    if hasattr(context, 'at_risk_host'):
        # contextlib.nested is deprecated in Python 2.7 and gone in Python 3;
        # the multi-manager ``with`` form works on both.
        with mock.patch(
            'paasta_tools.mesos_maintenance.get_principal', autospec=True,
        ) as mock_get_principal, mock.patch(
            'paasta_tools.mesos_maintenance.get_secret', autospec=True,
        ) as mock_get_secret:
            credentials = load_credentials(mesos_secrets='/etc/mesos-slave-secret')
            mock_get_principal.return_value = credentials.principal
            mock_get_secret.return_value = credentials.secret
            undrain([context.at_risk_host])
            del context.at_risk_host
Exemplo n.º 10
0
def gracefully_terminate_slave(resource, slave_to_kill, pool_settings, current_capacity, new_capacity, dry_run):
    """Drain a slave, shrink the spot fleet, then terminate the instance.

    :param resource: mapping providing the spot fleet 'id' and 'region'
    :param slave_to_kill: mapping providing 'pid', 'hostname' and 'ip'
    :param pool_settings: mapping; 'drain_timeout' is read from it and
        falls back to DEFAULT_DRAIN_TIMEOUT
    :param current_capacity: capacity restored if termination fails
    :param new_capacity: target capacity; floored to an int before use
    :param dry_run: when true, drain() and undrain() are never called
    :raises HTTPError: re-raised when starting the drain fails
    :raises FailSetSpotCapacity: re-raised when the capacity cannot be
        lowered; the host is undrained first
    """
    sfr_id = resource['id']
    drain_timeout = pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
    # The maintenance window opens at the moment we stop waiting for the
    # instance to drain and mark it for termination regardless.
    window_start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
    # 600 seconds; the value is fairly arbitrary as mesos doesn't actually
    # do anything at the end of the maintenance window.
    window_length = 600 * 1000000000  # nanoseconds
    slave_pid = slave_to_kill['pid']

    def undrain_unless_dry_run():
        # host_spec is only bound (and only read) outside of dry-run mode.
        log.info("Undraining {0}".format(slave_pid))
        if not dry_run:
            undrain([host_spec])

    log.info("Draining {0}".format(slave_pid))
    if not dry_run:
        host_spec = "{0}|{1}".format(slave_to_kill['hostname'], slave_to_kill['ip'])
        try:
            drain([host_spec], window_start, window_length)
        except HTTPError as e:
            failure = "Failed to start drain on {0}: {1}\n Trying next host"
            log.error(failure.format(slave_to_kill['hostname'], e))
            raise

    log.info("Decreasing spot fleet capacity from {0} to: {1}".format(current_capacity, new_capacity))
    # Instance weights can be floats but the target has to be an integer
    # because this is all AWS allows on the API call to set target capacity
    new_capacity = int(floor(new_capacity))
    try:
        set_spot_fleet_request_capacity(sfr_id, new_capacity, dry_run, region=resource['region'])
    except FailSetSpotCapacity:
        log.error("Couldn't update spot fleet, stopping autoscaler")
        undrain_unless_dry_run()
        raise

    log.info("Waiting for instance to drain before we terminate")
    try:
        wait_and_terminate(slave_to_kill, drain_timeout, dry_run, region=resource['region'])
    except ClientError as e:
        log.error("Failure when terminating: {0}: {1}".format(slave_pid, e))
        log.error("Setting spot fleet capacity back to {0}".format(current_capacity))
        set_spot_fleet_request_capacity(sfr_id, current_capacity, dry_run, region=resource['region'])
    finally:
        # Runs whether or not termination succeeded.
        undrain_unless_dry_run()
Exemplo n.º 11
0
def test_undrain(
    mock_unreserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    """Exercise undrain() against mocked collaborators.

    Checks the arguments handed to each mock, including the JSON-encoded
    POST issued through the schedule client.
    """
    schedule_payload = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule_payload

    undrain(hostnames=['some-host'])

    # The payload is built exactly once, with drain switched off.
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == mock.call(
        ['some-host'], drain=False,
    )

    # Resources on the host are unreserved exactly once.
    assert mock_unreserve_all_resources.call_count == 1
    assert mock_unreserve_all_resources.call_args == mock.call(['some-host'])

    # The schedule client is obtained once and used for a single POST.
    assert mock_get_schedule_client.call_count == 1
    schedule_client = mock_get_schedule_client.return_value
    assert schedule_client.call_count == 1
    assert schedule_client.call_args == mock.call(
        method="POST",
        endpoint="",
        data=json.dumps(schedule_payload),
    )
Exemplo n.º 12
0
 async def gracefully_terminate_slave(self, slave_to_kill, capacity_diff,
                                      timer):
     """Drain a slave, adjust the resource capacity, then terminate it.

     This coroutine can be suspended at its `await`, and another coroutine
     may change the capacity in the meantime.  The target is therefore
     always derived from the capacity currently stored on the object
     (`self.capacity`) plus or minus `capacity_diff`, rather than from a
     value computed once up front.

     :param slave_to_kill: object exposing pid, hostname and ip
     :param capacity_diff: amount added to self.capacity before
         termination and subtracted again if termination fails
     :param timer: passed through to wait_and_terminate
     :raises HTTPError: re-raised when starting the drain fails
     :raises FailSetResourceCapacity: re-raised when the capacity cannot
         be changed; the host is undrained first if it was drained
     """
     drain_timeout = self.pool_settings.get('drain_timeout',
                                            DEFAULT_DRAIN_TIMEOUT)
     # The maintenance window opens at the moment we stop waiting for the
     # instance to drain and mark it for termination regardless.
     window_start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
     # 600 seconds; the value is fairly arbitrary as mesos doesn't actually
     # do anything at the end of the maintenance window.
     window_length = 600 * 1000000000  # nanoseconds
     slave_pid = slave_to_kill.pid
     self.log.info("Draining {}".format(slave_pid))
     should_drain = self.should_drain(slave_to_kill)

     def undrain_if_drained():
         # host_spec is only bound (and only read) when should_drain is set.
         self.log.info("Undraining {}".format(slave_pid))
         if should_drain:
             undrain([host_spec])

     if should_drain:
         host_spec = "{}|{}".format(slave_to_kill.hostname, slave_to_kill.ip)
         try:
             drain([host_spec], window_start, window_length)
         except HTTPError as e:
             self.log.error(
                 "Failed to start drain on {}: {}\n Trying next host".format(
                     slave_to_kill.hostname, e))
             raise

     lowered_capacity = self.capacity + capacity_diff
     self.log.info("Decreasing resource from {} to: {}".format(
         self.capacity, lowered_capacity))
     # NOTE(review): no integer rounding is applied to the target here;
     # presumably set_capacity deals with that -- confirm.
     try:
         self.set_capacity(lowered_capacity)
     except FailSetResourceCapacity:
         self.log.error(
             "Couldn't update resource capacity, stopping autoscaler")
         undrain_if_drained()
         raise

     self.log.info("Waiting for instance to drain before we terminate")
     try:
         await self.wait_and_terminate(
             slave=slave_to_kill,
             drain_timeout=drain_timeout,
             dry_run=self.dry_run,
             timer=timer,
             region=self.resource['region'],
             should_drain=should_drain,
         )
     except ClientError as e:
         # Termination failed: recompute from the capacity stored right now,
         # since it may have changed while this coroutine was suspended.
         self.log.error(
             "Failure when terminating: {}: {}".format(slave_pid, e))
         restored_capacity = self.capacity - capacity_diff
         self.log.error(
             "Setting resource capacity back to {}".format(restored_capacity))
         self.set_capacity(restored_capacity)
         undrain_if_drained()