Example #1
def _collect_ramdisk_logs_storage_fail(
        self, expected_exception, mock_collect, mock_store):
    # mock_collect and mock_store are injected by mock.patch decorators on
    # the test case (not shown in this snippet); expected_exception is the
    # storage error being simulated.
    mock_store.side_effect = expected_exception
    logs = 'Gary the Snail'
    mock_collect.return_value = {'command_result': {'system_logs': logs}}
    driver_utils.collect_ramdisk_logs(self.node)
    mock_store.assert_called_once_with(self.node, logs)
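
The mock_collect and mock_store arguments here are supplied by mock.patch
decorators applied outside the snippet. A minimal sketch of such a setup,
assuming plausible patch targets rather than quoting the real test module:

from unittest import mock

# Decorators apply bottom-up, so the innermost patch (collect_system_logs)
# becomes mock_collect and the outermost becomes mock_store. The exact
# targets below are assumptions for illustration.
@mock.patch.object(driver_utils, 'store_ramdisk_logs', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'collect_system_logs',
                   autospec=True)
def test_collect_ramdisk_logs_example(self, mock_collect, mock_store):
    ...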
Example #2
def test_collect_ramdisk_logs_IPA_command_fail(
        self, mock_collect, mock_store, mock_log):
    # mock_log patches the module logger (injected by a mock.patch
    # decorator, not shown in this snippet).
    error_str = 'MR. KRABS! I WANNA GO TO BED!'
    mock_collect.return_value = {'faultstring': error_str}
    driver_utils.collect_ramdisk_logs(self.node)
    # assert store was never invoked
    self.assertFalse(mock_store.called)
    mock_log.assert_called_once_with(
        mock.ANY, {'node': self.node.uuid, 'error': error_str})
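
These tests, together with Examples #4 and #5 below, pin down the contract
of driver_utils.collect_ramdisk_logs: an agent 'faultstring' is logged and
nothing is stored, a successful 'command_result' hands 'system_logs' to the
storage helper, and both collection and storage failures are swallowed. A
rough sketch consistent with that contract, not the actual ironic
implementation (the storage exception handling in particular is an
assumption):

def collect_ramdisk_logs(node):
    client = agent_client.AgentClient()
    try:
        result = client.collect_system_logs(node)
    except exception.IronicException as e:
        LOG.error('Failed to invoke collect_system_logs command on node '
                  '%(node)s. Error: %(error)s',
                  {'node': node.uuid, 'error': e})
        return

    error = result.get('faultstring')
    if error:
        LOG.error('Failed to collect logs from the node %(node)s '
                  'deployment. Error: %(error)s',
                  {'node': node.uuid, 'error': error})
        return

    try:
        store_ramdisk_logs(node, result['command_result']['system_logs'])
    except Exception as e:  # assumed: a narrower tuple in the real code
        LOG.exception('Could not store the ramdisk logs for node %(node)s. '
                      'Error: %(error)s', {'node': node.uuid, 'error': e})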
Example #3
def set_failed_state(task, msg, collect_logs=True):
    """Sets the deploy status as failed with relevant messages.

    This method sets the deployment as fail with the given message.
    It sets node's provision_state to DEPLOYFAIL and updates last_error
    with the given error message. It also powers off the baremetal node.

    :param task: a TaskManager instance containing the node to act on.
    :param msg: the message to set in last_error of the node.
    :param collect_logs: boolean indicating whether to attempt collect
                         logs from IPA-based ramdisk. Defaults to True.
                         Actual log collection is also affected by
                         CONF.agent.deploy_logs_collect config option.
    """
    node = task.node

    if (collect_logs
            and CONF.agent.deploy_logs_collect in ('on_failure', 'always')):
        driver_utils.collect_ramdisk_logs(node)

    try:
        task.process_event('fail')
    except exception.InvalidState:
        msg2 = (_LE('Internal error. Node %(node)s in provision state '
                    '"%(state)s" could not transition to a failed state.') % {
                        'node': node.uuid,
                        'state': node.provision_state
                    })
        LOG.exception(msg2)

    if CONF.deploy.power_off_after_deploy_failure:
        try:
            manager_utils.node_power_action(task, states.POWER_OFF)
        except Exception:
            msg2 = (_LE('Node %s failed to power off while handling deploy '
                        'failure. This may be a serious condition. Node '
                        'should be removed from Ironic or put in maintenance '
                        'mode until the problem is resolved.') % node.uuid)
            LOG.exception(msg2)
    # NOTE(deva): node_power_action() erases node.last_error
    #             so we need to set it here.
    node.last_error = msg
    node.save()
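
A minimal usage sketch, assuming a hypothetical deploy helper: the caller's
error path only builds a message, and set_failed_state takes care of the
state transition, optional log collection and power-off:

try:
    write_image_to_disk(task)  # hypothetical deploy step
except Exception as e:
    # last_error is set after the power action inside set_failed_state,
    # so the message survives node_power_action() erasing it.
    set_failed_state(task, _('Deploy failed: %s') % e)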
Example #4
def test_collect_ramdisk_logs_storage_command_fail(
        self, mock_collect, mock_store):
    mock_collect.side_effect = exception.IronicException('boom')
    self.assertIsNone(driver_utils.collect_ramdisk_logs(self.node))
    self.assertFalse(mock_store.called)
Example #5
def test_collect_ramdisk_logs(self, mock_collect, mock_store):
    logs = 'Gary the Snail'
    mock_collect.return_value = {'command_result': {'system_logs': logs}}
    driver_utils.collect_ramdisk_logs(self.node)
    mock_store.assert_called_once_with(self.node, logs)
Example #6
def do_next_clean_step(task, step_index):
    """Do cleaning, starting from the specified clean step.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first clean step in the list to execute. This
        is the index (from 0) into the list of clean steps in the node's
        driver_internal_info['clean_steps']. Is None if there are no steps
        to execute.
    """
    node = task.node
    # For manual cleaning, the target provision state is MANAGEABLE,
    # whereas for automated cleaning, it is AVAILABLE.
    manual_clean = node.target_provision_state == states.MANAGEABLE

    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['clean_steps'][step_index:]

    LOG.info('Executing %(state)s on node %(node)s, remaining steps: '
             '%(steps)s', {'node': node.uuid, 'steps': steps,
                           'state': node.provision_state})

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.clean_step = step
        driver_internal_info = node.driver_internal_info
        driver_internal_info['clean_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s',
                 {'step': step, 'node': node.uuid})
        try:
            result = interface.execute_clean_step(task, step)
        except Exception as e:
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get('cleaning_reboot'):
                    LOG.info('Agent is not yet running on node %(node)s '
                             'after cleaning reboot, waiting for agent to '
                             'come up to run next clean step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    driver_internal_info['skip_current_clean_step'] = False
                    node.driver_internal_info = driver_internal_info
                    target_state = (states.MANAGEABLE if manual_clean
                                    else None)
                    task.process_event('wait', target_state=target_state)
                    return

            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') %
                   {'node': node.uuid, 'exc': e,
                    'step': node.clean_step})
            LOG.exception(msg)
            driver_utils.collect_ramdisk_logs(task.node, label='cleaning')
            utils.cleaning_error_handler(task, msg)
            return

        # Check if the step is done or not. The step should return
        # states.CLEANWAIT if the step is still being executed, or
        # None if the step is done.
        if result == states.CLEANWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_clean to continue cleaning
            LOG.info('Clean step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            target_state = states.MANAGEABLE if manual_clean else None
            task.process_event('wait', target_state=target_state)
            return
        elif result is not None:
            msg = (_('While executing step %(step)s on node '
                     '%(node)s, step returned invalid value: %(val)s')
                   % {'step': step, 'node': node.uuid, 'val': result})
            LOG.error(msg)
            return utils.cleaning_error_handler(task, msg)
        LOG.info('Node %(node)s finished clean step %(step)s',
                 {'node': node.uuid, 'step': step})

    if CONF.agent.deploy_logs_collect == 'always':
        driver_utils.collect_ramdisk_logs(task.node, label='cleaning')

    # Clear clean_step
    node.clean_step = None
    driver_internal_info = node.driver_internal_info
    driver_internal_info['clean_steps'] = None
    driver_internal_info.pop('clean_step_index', None)
    driver_internal_info.pop('cleaning_reboot', None)
    driver_internal_info.pop('cleaning_polling', None)
    driver_internal_info.pop('agent_secret_token', None)
    driver_internal_info.pop('agent_secret_token_pregenerated', None)

    # Remove agent_url
    if not utils.fast_track_able(task):
        driver_internal_info.pop('agent_url', None)
    node.driver_internal_info = driver_internal_info
    node.save()
    try:
        task.driver.deploy.tear_down_cleaning(task)
    except Exception as e:
        msg = (_('Failed to tear down from cleaning for node %(node)s, '
                 'reason: %(err)s')
               % {'node': node.uuid, 'err': e})
        LOG.exception(msg)
        return utils.cleaning_error_handler(task, msg,
                                            tear_down_cleaning=False)

    LOG.info('Node %s cleaning complete', node.uuid)
    event = 'manage' if manual_clean or node.retired else 'done'
    # NOTE(rloo): No need to specify target prov. state; we're done
    task.process_event(event)
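
For orientation, a sketch of the data this function walks. The keys below
are assumptions based on the lookups above: 'interface' selects a driver
interface via getattr(task.driver, ...), and the whole step dict is handed
to execute_clean_step():

node.driver_internal_info['clean_steps'] = [
    {'interface': 'deploy', 'step': 'erase_devices', 'priority': 10},
    {'interface': 'raid', 'step': 'create_configuration', 'priority': 0},
]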
Example #7
    def reboot_and_finish_deploy(self, task):
        """Helper method to trigger reboot on the node and finish deploy.

        This method initiates a reboot on the node. On success, it
        marks the deploy as complete. On failure, it logs the error
        and marks the deploy as failed.

        :param task: a TaskManager object containing the node
        :raises: InstanceDeployFailure, if node reboot failed.
        """
        wait = CONF.agent.post_deploy_get_power_state_retry_interval * 1000
        attempts = CONF.agent.post_deploy_get_power_state_retries + 1

        @retrying.retry(
            stop_max_attempt_number=attempts,
            retry_on_result=lambda state: state != states.POWER_OFF,
            wait_fixed=wait
        )
        def _wait_until_powered_off(task):
            return task.driver.power.get_power_state(task)

        node = task.node

        if CONF.agent.deploy_logs_collect == 'always':
            driver_utils.collect_ramdisk_logs(node)

        # Whether ironic should power off the node via out-of-band or
        # in-band methods
        oob_power_off = strutils.bool_from_string(
            node.driver_info.get('deploy_forces_oob_reboot', False))

        try:
            if not oob_power_off:
                try:
                    self._client.power_off(node)
                    _wait_until_powered_off(task)
                except Exception as e:
                    LOG.warning('Failed to soft power off node %(node_uuid)s '
                                'in at least %(timeout)d seconds. '
                                '%(cls)s: %(error)s',
                                {'node_uuid': node.uuid,
                                 'timeout': (wait * (attempts - 1)) / 1000,
                                 'cls': e.__class__.__name__, 'error': e},
                                exc_info=not isinstance(
                                    e, exception.IronicException))
                    manager_utils.node_power_action(task, states.POWER_OFF)
            else:
                # Flush the file system prior to hard rebooting the node
                result = self._client.sync(node)
                error = result.get('faultstring')
                if error:
                    if 'Unknown command' in error:
                        error = _('The version of the IPA ramdisk used in '
                                  'the deployment does not support the '
                                  'command "sync"')
                    LOG.warning(
                        'Failed to flush the file system prior to hard '
                        'rebooting the node %(node)s. Error: %(error)s',
                        {'node': node.uuid, 'error': error})

                manager_utils.node_power_action(task, states.POWER_OFF)
        except Exception as e:
            msg = (_('Error rebooting node %(node)s after deploy. '
                     '%(cls)s: %(error)s') %
                   {'node': node.uuid, 'cls': e.__class__.__name__,
                    'error': e})
            log_and_raise_deployment_error(task, msg, exc=e)

        try:
            power_state_to_restore = (
                manager_utils.power_on_node_if_needed(task))
            task.driver.network.remove_provisioning_network(task)
            task.driver.network.configure_tenant_networks(task)
            manager_utils.restore_power_state_if_needed(
                task, power_state_to_restore)
            manager_utils.node_power_action(task, states.POWER_ON)
        except Exception as e:
            msg = (_('Error rebooting node %(node)s after deploy. '
                     '%(cls)s: %(error)s') %
                   {'node': node.uuid, 'cls': e.__class__.__name__,
                    'error': e})
            # NOTE(mgoddard): Don't collect logs since the node has been
            # powered off.
            log_and_raise_deployment_error(task, msg, collect_logs=False,
                                           exc=e)

        if not node.deploy_step:
            # TODO(rloo): delete this 'if' part after deprecation period, when
            # we expect all (out-of-tree) drivers to support deploy steps.
            # After which we will always notify_conductor_resume_deploy().
            task.process_event('done')
            LOG.info('Deployment to node %s done', task.node.uuid)
        else:
            manager_utils.notify_conductor_resume_deploy(task)
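
The soft power-off window in this method is driven entirely by two config
options. A worked sketch of the arithmetic, assuming a 6-second interval and
10 retries (assumed defaults; real deployments may configure other values):

interval = 6   # CONF.agent.post_deploy_get_power_state_retry_interval (assumed)
retries = 10   # CONF.agent.post_deploy_get_power_state_retries (assumed)

wait = interval * 1000      # retrying's wait_fixed is in milliseconds: 6000
attempts = retries + 1      # the first call plus 10 retries: 11
timeout = (wait * (attempts - 1)) / 1000  # the logged timeout: 60.0 seconds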
Example #8
    def reboot_and_finish_deploy(self, task):
        """Helper method to trigger reboot on the node and finish deploy.

        This method initiates a reboot on the node. On success, it
        marks the deploy as complete. On failure, it logs the error
        and marks the deploy as failed.

        :param task: a TaskManager object containing the node
        :raises: InstanceDeployFailure, if node reboot failed.
        """
        wait = CONF.agent.post_deploy_get_power_state_retry_interval * 1000
        attempts = CONF.agent.post_deploy_get_power_state_retries + 1

        @retrying.retry(
            stop_max_attempt_number=attempts,
            retry_on_result=lambda state: state != states.POWER_OFF,
            wait_fixed=wait)
        def _wait_until_powered_off(task):
            return task.driver.power.get_power_state(task)

        node = task.node

        if CONF.agent.deploy_logs_collect == 'always':
            driver_utils.collect_ramdisk_logs(node)

        # Whether ironic should power off the node via out-of-band or
        # in-band methods
        oob_power_off = strutils.bool_from_string(
            node.driver_info.get('deploy_forces_oob_reboot', False))

        try:
            if not oob_power_off:
                try:
                    self._client.power_off(node)
                    _wait_until_powered_off(task)
                except Exception as e:
                    LOG.warning(
                        _LW('Failed to soft power off node %(node_uuid)s '
                            'in at least %(timeout)d seconds. '
                            'Error: %(error)s'), {
                                'node_uuid': node.uuid,
                                'timeout': (wait * (attempts - 1)) / 1000,
                                'error': e
                            })
                    manager_utils.node_power_action(task, states.POWER_OFF)
            else:
                # Flush the file system prior to hard rebooting the node
                result = self._client.sync(node)
                error = result.get('faultstring')
                if error:
                    if 'Unknown command' in error:
                        error = _('The version of the IPA ramdisk used in '
                                  'the deployment does not support the '
                                  'command "sync"')
                    LOG.warning(
                        _LW('Failed to flush the file system prior to hard '
                            'rebooting the node %(node)s. Error: %(error)s'), {
                                'node': node.uuid,
                                'error': error
                            })

                manager_utils.node_power_action(task, states.POWER_OFF)

            task.driver.network.remove_provisioning_network(task)
            task.driver.network.configure_tenant_networks(task)

            manager_utils.node_power_action(task, states.POWER_ON)
        except Exception as e:
            msg = (_('Error rebooting node %(node)s after deploy. '
                     'Error: %(error)s') % {
                         'node': node.uuid,
                         'error': e
                     })
            log_and_raise_deployment_error(task, msg)

        task.process_event('done')
        LOG.info(_LI('Deployment to node %s done'), task.node.uuid)
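
Both revisions delegate failure handling to log_and_raise_deployment_error.
A rough sketch of what such a helper does, consistent with the docstring's
':raises: InstanceDeployFailure' (the body is an assumption, not the real
ironic code):

def log_and_raise_deployment_error(task, msg, collect_logs=True, exc=None):
    # Sketch only: record the failure, move the node to DEPLOYFAIL via
    # set_failed_state (Example #3), then surface the error to the caller.
    LOG.error(msg)
    set_failed_state(task, msg, collect_logs=collect_logs)
    raise exception.InstanceDeployFailure(msg)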