Example #1
    def logger(self, request, **routeArguments):
        logger = _get_logger(self)

        # If this is ever more than ASCII we might have issues? or maybe
        # this is pre-url decoding?
        # https://clusterhq.atlassian.net/browse/FLOC-1602
        action = REQUEST(logger, request_path=request.path,
                         method=request.method)

        # Generate a serialized action context that uniquely identifies
        # position within the logs, though there won't actually be any log
        # message with that particular task level:
        incidentIdentifier = action.serialize_task_id()

        with action.context():
            d = DeferredContext(original(self, request, **routeArguments))

        def failure(reason):
            if reason.check(BadRequest):
                code = reason.value.code
                result = reason.value.result
            else:
                writeFailure(reason, logger, LOG_SYSTEM)
                code = INTERNAL_SERVER_ERROR
                result = incidentIdentifier
            request.setResponseCode(code)
            request.responseHeaders.setRawHeaders(
                b"content-type", [b"application/json"])
            return dumps(result)
        d.addErrback(failure)
        d.addActionFinish()
        return d.result
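The examples on this page all follow the same basic Eliot/Twisted pattern: start an action, construct the DeferredContext while the action's context is active, add callbacks through it, and finish the action when the Deferred fires. Below is a minimal, self-contained sketch of that pattern; the do_work helper and action name are hypothetical, not part of Flocker.

from eliot import start_action
from eliot.twisted import DeferredContext
from twisted.internet.defer import succeed


def do_work():
    # Hypothetical stand-in for any function returning a Deferred.
    return succeed(u"done")


def traced_operation():
    action = start_action(action_type=u"example:traced_operation")
    # The context manager makes ``action`` the current Eliot context, so
    # the DeferredContext created inside it is bound to this action.
    with action.context():
        d = DeferredContext(do_work())
    # Callbacks added via the DeferredContext run in the action's context.
    d.addCallback(lambda result: result)
    # Finish the action (on success or failure) when the Deferred fires,
    # returning the underlying Deferred to the caller.
    return d.addActionFinish()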
Example #2
    def output_CONVERGE(self, context):
        known_local_state = self.cluster_state.get_node(
            self.deployer.node_uuid, hostname=self.deployer.hostname)

        with LOG_CONVERGE(self.fsm.logger,
                          cluster_state=self.cluster_state,
                          desired_configuration=self.configuration).context():
            d = DeferredContext(
                self.deployer.discover_state(known_local_state))

        def got_local_state(state_changes):
            # Current cluster state is likely out of date as regards the local
            # state, so update it accordingly.
            for state in state_changes:
                self.cluster_state = state.update_cluster_state(
                    self.cluster_state)
            with LOG_SEND_TO_CONTROL_SERVICE(
                self.fsm.logger, connection=self.client,
                local_changes=list(state_changes),
            ) as context:
                self.client.callRemote(
                    NodeStateCommand, state_changes=state_changes,
                    eliot_context=context)
            action = self.deployer.calculate_changes(
                self.configuration, self.cluster_state)
            LOG_CALCULATED_ACTIONS(calculated_actions=action).write(
                self.fsm.logger)
            return run_state_change(action, self.deployer)

        d.addCallback(got_local_state)
        # If an error occurred we just want to log it and then try
        # converging again; hopefully next time we'll have more success.
        d.addErrback(writeFailure, self.fsm.logger, u"")

        # It would be better to have a "quiet time" state in the FSM and
        # transition to that next, then have a timeout input kick the machine
        # back around to the beginning of the loop in the FSM.  However, we're
        # not going to keep this sleep-for-a-bit solution in the long term.
        # Instead, we'll be more event driven.  So just going with the simple
        # solution and inserting a side-effect-y delay directly here.

        d.addCallback(
            lambda _: self.reactor.callLater(
                1.0, self.fsm.receive, ConvergenceLoopInputs.ITERATION_DONE))
        d.addActionFinish()
Example #3
File: _loop.py Project: uedzen/flocker
    def _send_state_to_control_service(self, state_changes):
        context = LOG_SEND_TO_CONTROL_SERVICE(
            self.fsm.logger, connection=self.client,
            local_changes=list(state_changes),
        )
        with context.context():
            d = DeferredContext(self.client.callRemote(
                NodeStateCommand,
                state_changes=state_changes,
                eliot_context=context)
            )

            def record_acknowledged_state(ignored):
                self._last_acknowledged_state = state_changes

            def clear_acknowledged_state(failure):
                # We don't know if the control service has processed the update
                # or not. So we clear the last acknowledged state so that we
                # always send the state on the next iteration.
                self._last_acknowledged_state = None
                return failure

            d.addCallbacks(record_acknowledged_state, clear_acknowledged_state)
            d.addErrback(
                writeFailure, self.fsm.logger,
                u"Failed to send local state to control node.")
            return d.addActionFinish()
Example #4
def sample(operation, metric, name):
    """
    Perform sampling of the operation.

    :param IOperation operation: An operation to perform.
    :param IMetric metric: A quantity to measure.
    :param int name: Identifier for individual sample.
    :return: Deferred firing with a sample. A sample is a dictionary
        containing a ``success`` boolean.  If ``success is True``, the
        dictionary also contains a ``value`` for the sample measurement.
        If ``success is False``, the dictionary also contains a
        ``reason`` for failure.
    """
    with start_action(action_type=u'flocker:benchmark:sample', sample=name):
        sampling = DeferredContext(maybeDeferred(operation.get_probe))

        def run_probe(probe):
            probing = metric.measure(probe.run)
            probing.addCallback(
                lambda measurement: dict(success=True, value=measurement)
            )
            probing.addCallback(bypass, probe.cleanup)

            return probing
        sampling.addCallback(run_probe)

        # Convert an error running the probe into a failed sample.
        def convert_to_result(failure):
            return dict(success=False, reason=failure.getTraceback())
        sampling.addErrback(convert_to_result)

        return sampling.addActionFinish()
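A hedged usage sketch for the sample function above, assuming the module that defines it (including its bypass helper, which is not shown here) is importable; StubOperation and StubMetric are hypothetical stand-ins for real IOperation/IMetric providers.

from twisted.internet.defer import succeed


class StubProbe(object):
    """Hypothetical probe that does no real work."""
    def run(self):
        return succeed(None)

    def cleanup(self):
        return succeed(None)


class StubOperation(object):
    """Hypothetical IOperation provider handing out StubProbe instances."""
    def get_probe(self):
        return StubProbe()


class StubMetric(object):
    """Hypothetical IMetric provider reporting a constant measurement."""
    def measure(self, f):
        d = f()
        d.addCallback(lambda _: 1.0)
        return d


# Per the docstring, the returned Deferred fires with a dict containing
# success=True and the measured value, or success=False and a reason.
d = sample(StubOperation(), StubMetric(), name=1)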
Example #5
    def stop(self):
        """
        Stop the scenario from being maintained by stopping all the
        loops that may be executing.

        :return Deferred[Optional[Dict[unicode, Any]]]: Scenario metrics.
        """
        self.is_started = False
        if self.monitor_loop.running:
            self.monitor_loop.stop()

        if self.loop.running:
            self.loop.stop()

        outstanding_requests = self.rate_measurer.outstanding()

        if outstanding_requests > 0:
            msg = (
                "There are {num_requests} outstanding requests. "
                "Waiting {num_seconds} seconds for them to complete."
            ).format(
                num_requests=outstanding_requests,
                num_seconds=self.timeout
            )
            Message.log(key="outstanding_requests", value=msg)

        with start_action(
            action_type=u"flocker:benchmark:scenario:stop",
            scenario="request_load"
        ):

            def no_outstanding_requests():
                return self.rate_measurer.outstanding() == 0

            scenario_stopped = loop_until(self.reactor,
                                          no_outstanding_requests,
                                          repeat(1))
            timeout(self.reactor, scenario_stopped, self.timeout)
            scenario = DeferredContext(scenario_stopped)

            def handle_timeout(failure):
                failure.trap(CancelledError)
                msg = (
                    "Force stopping the scenario. "
                    "There are {num_requests} outstanding requests"
                ).format(
                    num_requests=outstanding_requests
                )
                Message.log(key="force_stop_request", value=msg)

            scenario.addErrback(handle_timeout)

            def scenario_cleanup(ignored):
                """
                Calls the scenario cleanup, and wraps it inside an eliot
                start action, so we can see the logs if something goes
                wrong within the cleanup

                :return Deferred: that will fire once the cleanup has been
                    completed
                """
                with start_action(
                    action_type=u"flocker:benchmark:scenario:cleanup",
                    scenario="request_load"
                ):
                    return self.request.run_cleanup()

            scenario.addBoth(scenario_cleanup)

            def return_metrics(_ignore):
                return self.rate_measurer.get_metrics()

            scenario.addCallback(return_metrics)

            return scenario.addActionFinish()
Example #6
File: _api.py Project: WUMUXIAN/flocker
    def volumedriver_mount(self, Name):
        """
        Move a volume with the given name to the current node and mount it.

        Since we need to return the filesystem path we wait until the
        dataset is mounted locally.

        :param unicode Name: The name of the volume.

        :return: Result that includes the mountpoint.
        """
        d = DeferredContext(self._dataset_id_for_name(Name))
        d.addCallback(
            lambda dataset_id: self._flocker_client.move_dataset(
                self._node_id, dataset_id))
        d.addCallback(lambda dataset: dataset.dataset_id)

        d.addCallback(
            lambda dataset_id: loop_until(
                self._reactor,
                lambda: self._get_path_from_dataset_id(dataset_id),
                repeat(self._POLL_INTERVAL)))
        d.addCallback(lambda p: {u"Err": u"", u"Mountpoint": p.path})

        timeout(self._reactor, d.result, self._MOUNT_TIMEOUT)

        def handleCancel(failure):
            failure.trap(CancelledError)
            return {u"Err": u"Timed out waiting for dataset to mount.",
                    u"Mountpoint": u""}

        d.addErrback(handleCancel)
        return d.result
Example #7
File: _loop.py Project: uedzen/flocker
    def output_CONVERGE(self, context):
        known_local_state = self.cluster_state.get_node(
            self.deployer.node_uuid, hostname=self.deployer.hostname)

        with LOG_CONVERGE(self.fsm.logger, cluster_state=self.cluster_state,
                          desired_configuration=self.configuration).context():
            d = DeferredContext(maybeDeferred(
                self.deployer.discover_state, known_local_state))

        def got_local_state(local_state):
            cluster_state_changes = local_state.shared_state_changes()
            # Current cluster state is likely out of date as regards the local
            # state, so update it accordingly.
            #
            # XXX This somewhat side-steps the whole explicit-state-machine
            # thing we're aiming for here.  It would be better for these state
            # changes to arrive as an input to the state machine.
            for state in cluster_state_changes:
                self.cluster_state = state.update_cluster_state(
                    self.cluster_state
                )

            # XXX And for this update to be the side-effect of an output
            # resulting.
            sent_state = self._maybe_send_state_to_control_service(
                cluster_state_changes)

            action = self.deployer.calculate_changes(
                self.configuration, self.cluster_state, local_state
            )
            LOG_CALCULATED_ACTIONS(calculated_actions=action).write(
                self.fsm.logger)
            ran_state_change = run_state_change(action, self.deployer)
            DeferredContext(ran_state_change).addErrback(
                writeFailure, self.fsm.logger)

            # Wait for the control node to acknowledge the new
            # state, and for the convergence actions to run.
            return gather_deferreds([sent_state, ran_state_change])
        d.addCallback(got_local_state)

        # If an error occurred we just want to log it and then try
        # converging again; hopefully next time we'll have more success.
        d.addErrback(writeFailure, self.fsm.logger)

        # It would be better to have a "quiet time" state in the FSM and
        # transition to that next, then have a timeout input kick the machine
        # back around to the beginning of the loop in the FSM.  However, we're
        # not going to keep this sleep-for-a-bit solution in the long term.
        # Instead, we'll be more event driven.  So just going with the simple
        # solution and inserting a side-effect-y delay directly here.

        d.addCallback(
            lambda _:
                self.reactor.callLater(
                    1.0, self.fsm.receive, ConvergenceLoopInputs.ITERATION_DONE
                )
        )
        d.addActionFinish()
Example #8
    def startService(self):
        with start_action(action_type=u"asyncservice:start"):
            self.running = True
            self._d = self._factory()
            d = DeferredContext(self._d)
            d.addCallback(self._created)
            d.addErrback(self._failed)
            d.addActionFinish()
Example #9
    def stop(self):
        """
        Stop the scenario from being maintained by stopping all the
        loops that may be executing.

        :return Deferred[Optional[Dict[unicode, Any]]]: Scenario metrics.
        """
        self.is_started = False
        if self.monitor_loop.running:
            self.monitor_loop.stop()

        if self.loop.running:
            self.loop.stop()

        outstanding_requests = self.rate_measurer.outstanding()

        if outstanding_requests > 0:
            msg = (
                "There are {num_requests} outstanding requests. "
                "Waiting {num_seconds} seconds for them to complete."
            ).format(
                num_requests=outstanding_requests,
                num_seconds=self.timeout
            )
            Message.log(key='outstanding_requests', value=msg)

        with start_action(
            action_type=u'flocker:benchmark:scenario:stop',
            scenario='request_load'
        ):
            def no_outstanding_requests():
                return self.rate_measurer.outstanding() == 0

            scenario_stopped = loop_until(self.reactor,
                                          no_outstanding_requests,
                                          repeat(1))
            timeout(self.reactor, scenario_stopped, self.timeout)
            scenario = DeferredContext(scenario_stopped)

            def handle_timeout(failure):
                failure.trap(CancelledError)
                msg = (
                    "Force stopping the scenario. "
                    "There are {num_requests} outstanding requests"
                ).format(
                    num_requests=outstanding_requests
                )
                Message.log(key='force_stop_request', value=msg)
            scenario.addErrback(handle_timeout)

            def return_metrics(_ignore):
                return self.rate_measurer.get_metrics()
            scenario.addCallback(return_metrics)

            return scenario.addActionFinish()
Example #10
def retry_failure(reactor, function, expected=None, steps=None):
    """
    Retry ``function`` until it returns successfully.

    If it raises one of the expected exceptions, then retry.

    :param IReactorTime reactor: The reactor implementation to use to delay.
    :param callable function: A callable that returns a value.
    :param expected: Iterable of exceptions that trigger a retry. Passed
        through to ``Failure.check``.
    :param [float] steps: An iterable of delay intervals, measured in seconds.
        If not provided, will default to retrying every 0.1 seconds.

    :return: A ``Deferred`` that fires with the first successful return value
        of ``function``.
    """
    if steps is None:
        steps = repeat(0.1)
    steps = iter(steps)

    action = LOOP_UNTIL_ACTION(predicate=function)
    with action.context():
        d = DeferredContext(maybeDeferred(function))

    def loop(failure):
        if expected and not failure.check(*expected):
            return failure

        try:
            interval = steps.next()
        except StopIteration:
            return failure

        d = deferLater(reactor, interval, action.run, function)
        d.addErrback(loop)
        return d

    d.addErrback(loop)

    def got_result(result):
        action.add_success_fields(result=result)
        return result
    d.addCallback(got_result)
    d.addActionFinish()
    return d.result
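A hedged usage sketch for retry_failure as documented above; FlakyService is a hypothetical example class, not part of the project.

from twisted.internet import reactor


class FlakyService(object):
    """Hypothetical service that fails twice before succeeding."""
    def __init__(self):
        self.calls = 0

    def fetch(self):
        self.calls += 1
        if self.calls < 3:
            raise IOError("not ready yet")
        return "payload"


service = FlakyService()
results = []
# Retry only on IOError, waiting half a second between attempts.
d = retry_failure(reactor, service.fetch, expected=[IOError],
                  steps=[0.5] * 10)
d.addCallback(results.append)
# After roughly one second of reactor time, results == ["payload"].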
Example #11
def _execute_converge_output(jobs):
    if not jobs:
        return succeed(None)

    a = start_action(action_type=u"execute-converge-step")
    with a.context():
        job = jobs.pop(0)
        d = DeferredContext(job())
        d.addErrback(write_failure)
        d = d.addActionFinish()

    if jobs:
        # Capture whatever action context is active now and make sure it is
        # also active when we get back here to process the next job.
        DeferredContext(d).addCallback(
            lambda ignored: _execute_converge_output(jobs),
        )
    return d
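A hedged usage sketch for _execute_converge_output: jobs are zero-argument callables returning Deferreds and run one after another, each inside its own execute-converge-step action. The outer start_action and the make_job helper are illustrative assumptions.

from eliot import start_action
from twisted.internet.defer import succeed


def make_job(name, log):
    # Hypothetical job factory: record the name, pretend to do async work.
    def job():
        log.append(name)
        return succeed(None)
    return job


log = []
# Run inside an action so the "capture whatever action context is active"
# step above has a context to capture.
with start_action(action_type=u"example:converge-output"):
    d = _execute_converge_output(
        [make_job(u"first", log), make_job(u"second", log)])
# Once d fires, log == [u"first", u"second"].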
Example #12
    def _send_state_to_control_service(self, state_changes):
        context = LOG_SEND_TO_CONTROL_SERVICE(
            self.fsm.logger, connection=self.client,
            local_changes=list(state_changes),
        )
        with context.context():
            d = DeferredContext(self.client.callRemote(
                NodeStateCommand,
                state_changes=state_changes,
                eliot_context=context)
            )

            def record_acknowledged_state(ignored):
                self._last_acknowledged_state = state_changes

            d.addCallback(record_acknowledged_state)
            d.addErrback(
                writeFailure, self.fsm.logger,
                u"Failed to send local state to control node.")
            d.addActionFinish()
Example #13
    def logger(self, request, **routeArguments):
        try:
            logger = self.logger
        except AttributeError:
            logger = _logger
        else:
            if logger is None:
                logger = _logger

        path = repr(request.path).decode("ascii")
        action = REQUEST(logger, request_path=path)

        # Can't construct a good identifier without using private things.
        # See https://github.com/ClusterHQ/eliot/issues/29
        uuid = action._identification[u"task_uuid"]
        level = action._identification[u"task_level"]
        incidentIdentifier = uuid + u"," + level

        with action.context():
            d = DeferredContext(original(self, request, **routeArguments))

        def failure(reason):
            if reason.check(BadRequest):
                code = reason.value.code
                result = reason.value.result
            else:
                writeFailure(reason, logger, LOG_SYSTEM)
                code = INTERNAL_SERVER_ERROR
                result = incidentIdentifier
            request.setResponseCode(code)
            request.responseHeaders.setRawHeaders(
                b"content-type", [b"application/json"])
            return dumps({u"error": True, u"result": result})
        d.addErrback(failure)
        d.addActionFinish()
        return d.result
Example #14
    def _async_get_node(self, reactor, instance, metadata):
        """
        Configure the given AWS instance, wait until it's running
        and create an ``AWSNode`` object for it.

        :param reactor: The reactor.
        :param boto.ec2.instance.Instance instance: The instance to set up.
        :param dict metadata: The metadata to set for the instance.
        :return: Deferred that fires when the instance is ready.
        """
        def instance_error(failure):
            Message.log(
                message_type="flocker:provision:aws:async_get_node:failed"
            )
            instance.terminate()
            write_failure(failure)
            return failure

        action = start_action(
            action_type=u"flocker:provision:aws:async_get_node",
            name=metadata['Name'],
            instance_id=instance.id,
        )
        with action.context():
            d = loop_until(
                reactor,
                lambda: maybeDeferred(self._set_metadata, instance, metadata),
                repeat(5, INSTANCE_TIMEOUT),
            )
            d = DeferredContext(d)
            d.addCallback(
                lambda _: _async_wait_until_running(reactor, instance)
            )
            d.addErrback(instance_error)
            d.addActionFinish()
            return d.result
Example #15
    def g(*a, **kw):
        action = start_action(action_type=scope + u":" + f.__name__)
        with action.context():
            d = DeferredContext(maybeDeferred(f, *a, **kw))
            d.addErrback(write_failure)
            return d.addActionFinish()
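The fragment above refers to scope and f from an enclosing scope. A plausible enclosing decorator factory, sketched here as an assumption (log_method_calls is a hypothetical name, not the project's API), might look like this:

from functools import wraps

from eliot import start_action, write_failure
from eliot.twisted import DeferredContext
from twisted.internet.defer import maybeDeferred


def log_method_calls(scope):
    """
    Hypothetical decorator factory: run each call to ``f`` inside an Eliot
    action named ``<scope>:<function name>``, logging any failure.
    """
    def decorator(f):
        @wraps(f)
        def g(*a, **kw):
            action = start_action(action_type=scope + u":" + f.__name__)
            with action.context():
                d = DeferredContext(maybeDeferred(f, *a, **kw))
                d.addErrback(write_failure)
                return d.addActionFinish()
        return g
    return decorator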
Example #16
    def stop(self):
        """
        Stop the scenario from being maintained by stopping all the
        loops that may be executing.

        :return Deferred[Optional[Dict[unicode, Any]]]: Scenario metrics.
        """
        self.is_started = False
        if self.monitor_loop.running:
            self.monitor_loop.stop()

        if self.loop.running:
            self.loop.stop()

        outstanding_requests = self.rate_measurer.outstanding()

        if outstanding_requests > 0:
            msg = (
                "There are {num_requests} outstanding requests. "
                "Waiting {num_seconds} seconds for them to complete."
            ).format(
                num_requests=outstanding_requests,
                num_seconds=self.timeout
            )
            Message.log(key='outstanding_requests', value=msg)

        with start_action(
            action_type=u'flocker:benchmark:scenario:stop',
            scenario='request_load'
        ):
            def no_outstanding_requests():
                return self.rate_measurer.outstanding() == 0

            scenario_stopped = loop_until(self.reactor,
                                          no_outstanding_requests,
                                          repeat(1))
            timeout(self.reactor, scenario_stopped, self.timeout)
            scenario = DeferredContext(scenario_stopped)

            def handle_timeout(failure):
                failure.trap(CancelledError)
                msg = (
                    "Force stopping the scenario. "
                    "There are {num_requests} outstanding requests"
                ).format(
                    num_requests=outstanding_requests
                )
                Message.log(key='force_stop_request', value=msg)
            scenario.addErrback(handle_timeout)

            def scenario_cleanup(ignored):
                """
                Calls the scenario cleanup, and wraps it inside an eliot
                start action, so we can see the logs if something goes
                wrong within the cleanup

                :return Deferred: that will fire once the cleanup has been
                    completed
                """
                with start_action(
                    action_type=u'flocker:benchmark:scenario:cleanup',
                    scenario='request_load'
                ):
                    return self.request.run_cleanup()

            scenario.addBoth(scenario_cleanup)

            def return_metrics(_ignore):
                return self.rate_measurer.get_metrics()
            scenario.addCallback(return_metrics)

            return scenario.addActionFinish()
Example #17
    def output_CONVERGE(self, context):
        known_local_state = self.cluster_state.get_node(
            self.deployer.node_uuid, hostname=self.deployer.hostname)

        with LOG_CONVERGE(self.fsm.logger,
                          cluster_state=self.cluster_state,
                          desired_configuration=self.configuration).context():
            d = DeferredContext(
                maybeDeferred(self.deployer.discover_state, known_local_state))

        def got_local_state(local_state):
            self._last_discovered_local_state = local_state
            cluster_state_changes = local_state.shared_state_changes()
            # Current cluster state is likely out of date as regards the local
            # state, so update it accordingly.
            #
            # XXX This somewhat side-steps the whole explicit-state-machine
            # thing we're aiming for here.  It would be better for these state
            # changes to arrive as an input to the state machine.
            for state in cluster_state_changes:
                self.cluster_state = state.update_cluster_state(
                    self.cluster_state)

            # XXX And for this update to be the side-effect of an output
            # resulting.
            sent_state = self._maybe_send_state_to_control_service(
                cluster_state_changes)

            action = self.deployer.calculate_changes(self.configuration,
                                                     self.cluster_state,
                                                     local_state)
            if action == NoOp():
                # We've converged, we can sleep for deployer poll
                # interval. We add some jitter so not all agents wake up
                # at exactly the same time, to reduce load on system:
                sleep_duration = _Sleep.with_jitter(
                    self.deployer.poll_interval.total_seconds())
            else:
                # We're going to do some work, we should do another
                # iteration quickly in case there's followup work:
                sleep_duration = _UNCONVERGED_DELAY

            LOG_CALCULATED_ACTIONS(calculated_actions=action).write(
                self.fsm.logger)
            ran_state_change = run_state_change(action, self.deployer)
            DeferredContext(ran_state_change).addErrback(
                writeFailure, self.fsm.logger)

            # Wait for the control node to acknowledge the new
            # state, and for the convergence actions to run.
            result = gather_deferreds([sent_state, ran_state_change])
            result.addCallback(lambda _: sleep_duration)
            return result

        d.addCallback(got_local_state)

        # If an error occurred we just want to log it and then try
        # converging again; hopefully next time we'll have more success.
        def error(failure):
            writeFailure(failure, self.fsm.logger)
            # We should retry quickly to redo the failed work:
            return _UNCONVERGED_DELAY

        d.addErrback(error)

        # We're done with the iteration:
        d.addCallback(lambda delay: self.fsm.receive(delay))
        d.addActionFinish()
Example #18
def scp(reactor, username, host, remote_path, local_path,
        direction, port=22, identity_file=None):
    """
    :param reactor: A ``twisted.internet.reactor``.
    :param bytes username: The SSH username.
    :param bytes host: The SSH host.
    :param FilePath remote_path: The path to the remote file.
    :param FilePath local_path: The path to the local file.
    :param direction: One of ``DOWNLOAD`` or ``UPLOAD``.
    :param int port: The SSH TCP port.
    :param FilePath identity_file: The path to an SSH private key.
    :returns: A ``Deferred`` that fires when the process is ended.
    """
    if direction not in (DOWNLOAD, UPLOAD):
        raise ValueError(
            "Invalid direction argument {!r}. "
            "Must be one of ``runner.DOWNLOAD`` "
            "or ``runner.UPLOAD``.".format(direction)
        )

    remote_host_path = username + b'@' + host + b':' + remote_path.path
    scp_command = [
        b"scp",
        # XXX Seems safe to use -r for both files and directories.
        b"-r",
        b"-P", bytes(port),
    ] + SSH_OPTIONS

    if identity_file is not None:
        scp_command += [
            b"-i", identity_file.path
        ]
    if direction is DOWNLOAD:
        scp_command += [
            remote_host_path,
            local_path.path,
        ]
    else:
        scp_command += [
            local_path.path,
            remote_host_path,
        ]

    action = SCP_ACTION(
        username=username,
        host=host,
        remote_path=remote_path,
        local_path=local_path,
        port=port,
        identity_file=identity_file,
    )

    # A place to hold failure state between parsing stderr and needing to fire
    # a Deferred.
    failed_reason = []

    def handle_stdout(line):
        SCP_OUTPUT_MESSAGE(
            line=line,
        ).write(action=action)

    def handle_stderr(line):
        """
        Notice scp's particular way of describing the file-not-found condition
        and turn it into a more easily recognized form.
        """
        if b"No such file or directory" in line:
            failed_reason.append(RemoteFileNotFound(remote_path))
        if b"lost connection" in line:
            failed_reason.append(SCPConnectionError())
        SCP_ERROR_MESSAGE(
            line=line,
        ).write(action=action)

    def scp_failed(reason):
        """
        Check for a known error with the scp attempt and turn the normal
        failure into a more meaningful one.
        """
        reason.trap(ProcessTerminated)
        if failed_reason:
            return Failure(failed_reason[-1])
        return reason

    with action.context():
        context = DeferredContext(
            run(
                reactor,
                scp_command,
                handle_stdout=handle_stdout,
                handle_stderr=handle_stderr,
            )
        )

        context.addErrback(scp_failed)

        return context.addActionFinish()
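A hedged usage sketch for scp; the host, paths, and key location are made-up values, and DOWNLOAD is assumed to be the module-level constant mentioned in the docstring.

from twisted.internet import reactor
from twisted.python.filepath import FilePath

# Hypothetical call: download a remote log file over SSH as root.
d = scp(
    reactor,
    username=b"root",
    host=b"203.0.113.10",
    remote_path=FilePath(b"/var/log/flocker/flocker-agent.log"),
    local_path=FilePath(b"/tmp/flocker-agent.log"),
    direction=DOWNLOAD,
    identity_file=FilePath(b"/root/.ssh/id_rsa"),
)
# The Deferred fires when the scp process exits; a missing remote file is
# reported as RemoteFileNotFound and a dropped connection as
# SCPConnectionError (see scp_failed above).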
Example #19
File: _loop.py Project: sysuwbs/flocker
    def output_CONVERGE(self, context):
        known_local_state = self.cluster_state.get_node(
            self.deployer.node_uuid, hostname=self.deployer.hostname)

        with LOG_CONVERGE(self.fsm.logger, cluster_state=self.cluster_state,
                          desired_configuration=self.configuration).context():
            with LOG_DISCOVERY(self.fsm.logger).context():
                discover = DeferredContext(maybeDeferred(
                    self.deployer.discover_state, known_local_state))
                discover.addActionFinish()
            d = DeferredContext(discover.result)

        def got_local_state(local_state):
            self._last_discovered_local_state = local_state
            cluster_state_changes = local_state.shared_state_changes()
            # Current cluster state is likely out of date as regards the local
            # state, so update it accordingly.
            #
            # XXX This somewhat side-steps the whole explicit-state-machine
            # thing we're aiming for here.  It would be better for these state
            # changes to arrive as an input to the state machine.
            for state in cluster_state_changes:
                self.cluster_state = state.update_cluster_state(
                    self.cluster_state
                )

            # XXX And for this update to be the side-effect of an output
            # resulting.
            sent_state = self._maybe_send_state_to_control_service(
                cluster_state_changes)

            action = self.deployer.calculate_changes(
                self.configuration, self.cluster_state, local_state
            )
            if action == NoOp():
                # We've converged, we can sleep for deployer poll
                # interval. We add some jitter so not all agents wake up
                # at exactly the same time, to reduce load on system:
                sleep_duration = _Sleep.with_jitter(
                    self.deployer.poll_interval.total_seconds())
            else:
                # We're going to do some work, we should do another
                # iteration quickly in case there's followup work:
                sleep_duration = _UNCONVERGED_DELAY

            LOG_CALCULATED_ACTIONS(calculated_actions=action).write(
                self.fsm.logger)
            ran_state_change = run_state_change(action, self.deployer)
            DeferredContext(ran_state_change).addErrback(
                writeFailure, self.fsm.logger)

            # Wait for the control node to acknowledge the new
            # state, and for the convergence actions to run.
            result = gather_deferreds([sent_state, ran_state_change])
            result.addCallback(lambda _: sleep_duration)
            return result
        d.addCallback(got_local_state)

        # If an error occurred we just want to log it and then try
        # converging again; hopefully next time we'll have more success.
        def error(failure):
            writeFailure(failure, self.fsm.logger)
            # We should retry quickly to redo the failed work:
            return _UNCONVERGED_DELAY
        d.addErrback(error)

        # We're done with the iteration:
        d.addCallback(
            lambda delay: self.fsm.receive(delay))
        d.addActionFinish()
Example #20
    def output_CONVERGE(self, context):
        with LOG_CONVERGE(self.fsm.logger, cluster_state=self.cluster_state,
                          desired_configuration=self.configuration).context():
            log_discovery = LOG_DISCOVERY(self.fsm.logger)
            with log_discovery.context():
                discover = DeferredContext(maybeDeferred(
                    self.deployer.discover_state, self.cluster_state,
                    persistent_state=self.configuration.persistent_state))

                def got_local_state(local_state):
                    log_discovery.addSuccessFields(state=local_state)
                    return local_state
                discover.addCallback(got_local_state)
                discover.addActionFinish()
            d = DeferredContext(discover.result)

        def got_local_state(local_state):
            self._last_discovered_local_state = local_state
            cluster_state_changes = local_state.shared_state_changes()
            # Current cluster state is likely out of date as regards the local
            # state, so update it accordingly.
            #
            # XXX This somewhat side-steps the whole explicit-state-machine
            # thing we're aiming for here.  It would be better for these state
            # changes to arrive as an input to the state machine.
            for state in cluster_state_changes:
                self.cluster_state = state.update_cluster_state(
                    self.cluster_state
                )

            # XXX And for this update to be the side-effect of an output
            # resulting.
            sent_state = self._maybe_send_state_to_control_service(
                cluster_state_changes)

            action = self.deployer.calculate_changes(
                self.configuration, self.cluster_state, local_state
            )
            if isinstance(action, NoOp):
                # If we have converged, we need to reset the sleep delay
                # in case there were any incremental back offs while
                # waiting to converge.
                self._unconverged_sleep.reset_delay()
                # We add some jitter so not all agents wake up at exactly
                # the same time, to reduce load on system:
                sleep_duration = _Sleep.with_jitter(
                    action.sleep.total_seconds())
            else:
                # We're going to do some work, we should do another
                # iteration, but chances are that if, for any reason,
                # the backend is saturated, by looping too fast, we
                # will only make things worse, so there is an incremental
                # back off in the sleep interval.
                sleep_duration = self._unconverged_sleep.sleep()

            LOG_CALCULATED_ACTIONS(calculated_actions=action).write(
                self.fsm.logger)
            ran_state_change = run_state_change(
                action,
                deployer=self.deployer,
                state_persister=RemoteStatePersister(client=self.client),
            )
            DeferredContext(ran_state_change).addErrback(
                writeFailure, self.fsm.logger)

            # Wait for the control node to acknowledge the new
            # state, and for the convergence actions to run.
            result = gather_deferreds([sent_state, ran_state_change])
            result.addCallback(lambda _: sleep_duration)
            return result
        d.addCallback(got_local_state)

        # If an error occurred we just want to log it and then try
        # converging again; hopefully next time we'll have more success.
        def error(failure):
            writeFailure(failure, self.fsm.logger)
            # We should retry to redo the failed work:
            return self._unconverged_sleep.sleep()
        d.addErrback(error)

        # We're done with the iteration:
        d.addCallback(
            lambda delay: self.fsm.receive(delay))
        d.addActionFinish()
Example #21
    def _wait_until_running(self, reactor, node):
        """
        Wait until the node is running and its network interface is configured.

        This method fails if the node does not reach the expected state
        before the predefined timeout expires, or if the node goes into an
        error state or an unexpected state.

        :param node: A libcloud node object representing the compute instance
            of interest.
        :return: Deferred that fires with a libcloud node object representing
            the latest state of the instance in the case of success.
        """
        # Overall retry sleep time (not quite the same as timeout since
        # it doesn't count time spent in the predicate) is just copied
        # from the default libcloud timeout for wait_until_running.
        # Maybe some other value would be better.
        action = start_action(
            action_type=u"flocker:provision:libcloud:wait_until_running",
            name=node.name,
            id=node.id,
        )
        with action.context():
            steps = iter([15] * (600 / 15))
            d = loop_until(
                reactor,
                lambda: self._async_node_in_state(
                    reactor,
                    node,
                    self.TERMINAL_STATES
                ),
                steps=steps,
            )
            d = DeferredContext(d)

            def got_ip_addresses():
                d = self._async_refresh_node(reactor, node)
                d = DeferredContext(d)

                def is_running(updated_node):
                    if updated_node.state is not NodeState.RUNNING:
                        raise Exception("Node failed to run")
                    return updated_node

                def check_addresses(updated_node):
                    """
                    Check if the node has got at least one IPv4 public address
                    and, if requested, an IPv4 private address.  If yes, then
                    return the node object with the addresses, None otherwise.
                    """
                    public_ips = _filter_ipv4(updated_node.public_ips)
                    if len(public_ips) > 0:
                        if self._use_private_addresses:
                            private_ips = _filter_ipv4(
                                updated_node.private_ips
                            )
                            if len(private_ips) == 0:
                                return None
                        return updated_node
                    else:
                        return None

                d.addCallback(is_running)
                d.addCallback(check_addresses)
                return d.result

            # Once node is in a stable state ensure that it is running
            # and it has necessary IP addresses assigned.
            d.addCallback(
                lambda _: loop_until(reactor, got_ip_addresses, steps=steps)
            )

            def failed_to_run(failure):
                d = _retry_exception_async(reactor, node.destroy)
                d.addCallback(lambda _: failure)
                return d

            d.addErrback(failed_to_run)
            return d.addActionFinish()
Example #23
File: _loop.py Project: zendad/flocker
    def output_CONVERGE(self, context):
        with LOG_CONVERGE(self.fsm.logger).context():
            log_discovery = LOG_DISCOVERY(self.fsm.logger)
            with log_discovery.context():
                discover = DeferredContext(maybeDeferred(
                    self.deployer.discover_state, self.cluster_state,
                    persistent_state=self.configuration.persistent_state))

                def got_local_state(local_state):
                    log_discovery.addSuccessFields(state=local_state)
                    return local_state
                discover.addCallback(got_local_state)
                discover.addActionFinish()
            d = DeferredContext(discover.result)

        def got_local_state(local_state):
            self._last_discovered_local_state = local_state
            cluster_state_changes = local_state.shared_state_changes()
            # Current cluster state is likely out of date as regards the local
            # state, so update it accordingly.
            #
            # XXX This somewhat side-steps the whole explicit-state-machine
            # thing we're aiming for here.  It would be better for these state
            # changes to arrive as an input to the state machine.
            for state in cluster_state_changes:
                self.cluster_state = state.update_cluster_state(
                    self.cluster_state
                )

            # XXX And for this update to be the side-effect of an output
            # resulting.
            sent_state = self._maybe_send_state_to_control_service(
                cluster_state_changes)

            action = self.deployer.calculate_changes(
                self.configuration, self.cluster_state, local_state
            )
            if isinstance(action, NoOp):
                # If we have converged, we need to reset the sleep delay
                # in case there were any incremental back offs while
                # waiting to converge.
                self._unconverged_sleep.reset_delay()
                # We add some jitter so not all agents wake up at exactly
                # the same time, to reduce load on system:
                sleep_duration = _Sleep.with_jitter(
                    action.sleep.total_seconds())
            else:
                # Log the Node configuration that we are converging upon:
                log_info(desired_config=to_unserialized_json(
                    self.configuration.get_node(self.deployer.node_uuid)
                ))
                # We're going to do some work, we should do another
                # iteration, but chances are that if, for any reason,
                # the backend is saturated, by looping too fast, we
                # will only make things worse, so there is an incremental
                # back off in the sleep interval.
                sleep_duration = self._unconverged_sleep.sleep()

            LOG_CALCULATED_ACTIONS(calculated_actions=action).write(
                self.fsm.logger)
            ran_state_change = run_state_change(
                action,
                deployer=self.deployer,
                state_persister=RemoteStatePersister(client=self.client),
            )
            DeferredContext(ran_state_change).addErrback(
                writeFailure, self.fsm.logger)

            # Wait for the control node to acknowledge the new
            # state, and for the convergence actions to run.
            result = gather_deferreds([sent_state, ran_state_change])
            result.addCallback(lambda _: sleep_duration)
            return result
        d.addCallback(got_local_state)

        # If an error occurred we just want to log it and then try
        # converging again; hopefully next time we'll have more success.
        def error(failure):
            writeFailure(failure, self.fsm.logger)
            # We should retry to redo the failed work:
            return self._unconverged_sleep.sleep()
        d.addErrback(error)

        # We're done with the iteration:
        def send_delay_to_fsm(sleep):
            Message.log(
                message_type=u'flocker:node:_loop:CONVERGE:delay',
                log_level=u'INFO',
                message=u'Delaying until next convergence loop.',
                delay=sleep.delay_seconds
            )
            return self.fsm.receive(sleep)

        d.addCallback(send_delay_to_fsm)
        d.addActionFinish()