def get_background_pools(experiment: Experiment) -> ThreadPoolExecutor:
    """
    Create the thread pools used to run background activities.

    One pool is considered for the `method` activities and another one for
    the `rollbacks`. Each pool gets as many workers as its section declares
    entries flagged with `background`. When a section declares none, `None`
    stands in for its pool.

    Returns the pair `(activity_pool, rollback_pool)`. Note that the return
    annotation names a single executor although a two-item tuple is
    returned.
    """
    def count_background(entries) -> int:
        # falsy entries (such as `None`) are skipped rather than rejected
        return sum(
            1 for entry in entries if entry and entry.get("background"))

    method = experiment.get("method", [])
    rollbacks = experiment.get("rollbacks", [])

    activity_pool = None
    activity_workers = count_background(method)
    if activity_workers:
        logger.debug(
            "{c} activities will be run in the background".format(
                c=activity_workers))
        activity_pool = ThreadPoolExecutor(activity_workers)

    rollback_pool = None
    rollback_workers = count_background(rollbacks)
    if rollback_workers:
        logger.debug(
            "{c} rollbacks will be run in the background".format(
                c=rollback_workers))
        rollback_pool = ThreadPoolExecutor(rollback_workers)

    return activity_pool, rollback_pool
def get_all_activities(experiment: Experiment) -> List[Activity]:
    """
    Collect every activity of the experiment into a single new list.

    The result holds, in this order, the probes of the steady-state
    hypothesis, the method's activities and the rollbacks. Missing
    sections contribute nothing.
    """
    hypothesis = experiment.get("steady-state-hypothesis", {})
    return [
        *hypothesis.get("probes", []),
        *experiment.get("method", []),
        *experiment.get("rollbacks", []),
    ]
def get_controls(experiment: Experiment) -> List[Control]:
    """
    Gather all the controls declared anywhere in the experiment.

    Controls are listed in this order: those of the experiment itself,
    those of the steady-state hypothesis, then those of each activity as
    returned by `get_all_activities`.
    """
    found = list(experiment.get("controls", []))
    found += experiment.get("steady-state-hypothesis", {}).get("controls", [])

    for activity in get_all_activities(experiment):
        found += activity.get("controls", [])

    return found
def configure(self, experiment: Experiment, settings: Settings,
              experiment_vars: Dict[str, Any]) -> None:
    """
    Load the settings, configuration and secrets onto this instance.

    `experiment_vars` is unpacked as a `(config_vars, secret_vars)` pair;
    when it is falsy both are `None`. When `settings` is `None`, the
    settings returned by `get_loaded_settings()` are used instead.
    """
    if experiment_vars:
        config_vars, secret_vars = experiment_vars
    else:
        config_vars = secret_vars = None

    if settings is None:
        settings = get_loaded_settings()
    self.settings = settings

    # secrets are loaded last because they receive the loaded configuration
    declared_configuration = experiment.get("configuration", {})
    self.config = load_configuration(declared_configuration, config_vars)

    declared_secrets = experiment.get("secrets", {})
    self.secrets = load_secrets(declared_secrets, self.config, secret_vars)
def cache_activities(experiment: Experiment) -> List[Activity]:
    """
    Store the named activities of the method and of the steady-state
    hypothesis probes into the module-level `_cache`, keyed by their name,
    so that they can be looked up quickly later on.

    Activities without a name are not cached. Nothing is returned.
    """
    logger.debug("Building activity cache...")

    method_activities = experiment.get("method", [])
    probes = experiment.get("steady-state-hypothesis", {}).get("probes", [])

    for activity in method_activities + probes:
        name = activity.get("name")
        if not name:
            continue
        _cache[name] = activity

    logger.debug("Cached {d} activities".format(d=len(_cache)))
def run_rollbacks(experiment: Experiment, configuration: Configuration,
                  secrets: Secrets, pool: ThreadPoolExecutor,
                  dry: bool = False) -> Iterator[Run]:
    """
    Generator running the experiment's rollbacks in their declared order.

    A rollback flagged with `background` is submitted to `pool` and what
    `pool.submit` returns is yielded. Any other rollback is executed right
    away and its run is yielded once it is done.
    """
    rollbacks = experiment.get("rollbacks", [])

    if not rollbacks:
        logger.info("No declared rollbacks, let's move on.")

    for rollback in rollbacks:
        logger.info("Rollback: {t}".format(t=rollback.get("name")))

        if not rollback.get("background"):
            yield execute_activity(
                experiment, rollback, configuration=configuration,
                secrets=secrets, dry=dry)
            continue

        logger.debug("rollback activity will run in the background")
        yield pool.submit(
            execute_activity, experiment=experiment, activity=rollback,
            configuration=configuration, secrets=secrets, dry=dry)
def has_steady_state_hypothesis_with_probes(experiment: Experiment) -> bool:
    """
    Tell whether the experiment declares a steady-state hypothesis that
    holds at least one probe.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    if not hypothesis:
        return False

    probes = hypothesis.get("probes")
    if not probes:
        return False

    return len(probes) > 0
def run_activities(experiment: Experiment, configuration: Configuration,
                   secrets: Secrets, pool: ThreadPoolExecutor,
                   dry: bool = False) -> Iterator[Run]:
    """
    Internal generator executing the activities of the method in order.

    For an activity flagged with `background`, the activity is submitted to
    `pool` and the resulting :class:`concurrent.futures.Future` is yielded.
    Otherwise, the activity is executed immediately and its run is yielded.
    """
    method = experiment.get("method", [])

    if not method:
        logger.info("No declared activities, let's move on.")

    for step in method:
        if not step.get("background"):
            yield execute_activity(
                experiment=experiment, activity=step,
                configuration=configuration, secrets=secrets, dry=dry)
            continue

        logger.debug("activity will run in the background")
        yield pool.submit(
            execute_activity, experiment=experiment, activity=step,
            configuration=configuration, secrets=secrets, dry=dry)
def ensure_hypothesis_is_valid(experiment: Experiment):
    """
    Check the steady-state hypothesis entry of the experiment.

    Nothing is checked when the experiment has no such entry. Otherwise
    the hypothesis must have a title, or :exc:`InvalidExperiment` is
    raised. Each probe is validated as an activity and must carry a
    `tolerance` of a supported type, or :exc:`InvalidActivity` is raised.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    if hypothesis is None:
        return

    if not hypothesis.get("title"):
        raise InvalidExperiment("hypothesis requires a title")

    supported_tolerance_types = (bool, int, list, str, dict)

    for probe in hypothesis.get("probes") or []:
        ensure_activity_is_valid(probe)

        if "tolerance" not in probe:
            raise InvalidActivity(
                "hypothesis probe must have a tolerance entry")

        if not isinstance(probe["tolerance"], supported_tolerance_types):
            raise InvalidActivity(
                "hypothesis probe tolerance must either be an integer, "
                "a string, a boolean or a pair of values for boundaries. "
                "It can also be a dictionary which is a probe activity "
                "definition that takes an argument called `value` with "
                "the value of the probe itself to be validated")

        # NOTE(review): this validates the probe a second time; it may have
        # been meant to validate a dictionary tolerance instead -- confirm
        if isinstance(probe, dict):
            ensure_activity_is_valid(probe)
def get_all_activities_in_experiment(experiment: Experiment) -> List[Activity]:
    """
    Return a new list with all the activities of the given experiment:
    first the probes of the steady-state hypothesis (when one is declared),
    then the method's activities, then the rollbacks.

    Handy when every activity must be visited regardless of its section.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    probes = hypothesis.get("probes", []) if hypothesis else []

    return [
        *probes,
        *experiment.get("method", []),
        *experiment.get("rollbacks", []),
    ]
def run_steady_state_hypothesis(experiment: Experiment,
                                configuration: Configuration,
                                secrets: Secrets, dry: bool = False):
    """
    Run all probes in the hypothesis and stop as soon as any of the probes
    fails or is outside its tolerance zone.

    Returns `None` when the experiment declares no steady-state hypothesis.
    Otherwise returns a state dictionary with two keys: `steady_state_met`
    (a boolean) and `probes` (the runs of the probes executed so far, each
    carrying a `tolerance_met` flag).

    In dry mode, probes are executed but their tolerance is not checked.
    """
    state = {
        "steady_state_met": None,
        "probes": []
    }
    hypo = experiment.get("steady-state-hypothesis")
    if not hypo:
        logger.info(
            "No steady state hypothesis defined. That's ok, just exploring.")
        return

    logger.info("Steady state hypothesis: {h}".format(h=hypo.get("title")))

    with controls(level="hypothesis", experiment=experiment, context=hypo,
                  configuration=configuration, secrets=secrets) as control:
        probes = hypo.get("probes", [])
        # hand the state over before running so it is the very object that
        # gets filled in while probes are executed
        control.with_state(state)

        for activity in probes:
            run = execute_activity(
                experiment=experiment, activity=activity,
                configuration=configuration, secrets=secrets, dry=dry)

            state["probes"].append(run)

            if run["status"] == "failed":
                run["tolerance_met"] = False
                state["steady_state_met"] = False
                # `Logger.warn` is deprecated, `warning` is its replacement
                logger.warning("Probe terminated unexpectedly, "
                               "so its tolerance could not be validated")
                return state

            run["tolerance_met"] = True

            if dry:
                # do not check for tolerance when dry mode is on
                continue

            tolerance = activity.get("tolerance")
            logger.debug("allowed tolerance is {t}".format(t=str(tolerance)))
            checked = within_tolerance(
                tolerance, run["output"], configuration=configuration,
                secrets=secrets)
            if not checked:
                run["tolerance_met"] = False
                state["steady_state_met"] = False
                return state

        state["steady_state_met"] = True
        logger.info("Steady state hypothesis is met!")

    return state
def before_experiment_control(context: Experiment, **kwargs):
    """
    Start a tracing span named after the experiment's title when the
    experiment's execution begins.

    The span is kept on the tracer as `experiment_span`. It is tagged with
    its type, with the experiment's tags (when any) and with one tag per
    contribution (when any). Extra keyword arguments are logged on the span.
    """
    tracer = local.tracer
    span = tracer.start_span(context.get("title"))
    tracer.experiment_span = span

    span.set_tag('type', 'experiment')

    tags = context.get("tags")
    if tags:
        span.set_tag('target', ', '.join(tags))

    contributions = context.get("contributions")
    for contribution in contributions or ():
        span.set_tag(contribution, contributions[contribution])

    if kwargs:
        span.log_kv(kwargs)
def add_contribution_model(experiment: Experiment):
    """
    Copy the contributions declared by an extension onto the experiment
    itself, under the `contributions` key, so they are available at the
    experiment's level (for rendering in the report, for instance).

    Only the first extension with non-empty contributions is considered.
    The experiment is left untouched when there is none.
    """
    for extension in experiment.get("extensions", []):
        contributions = extension.get("contributions")
        if not contributions:
            continue

        experiment["contributions"] = contributions
        return
def validate_extensions(experiment: Experiment):
    """
    Ensure that every declared extension has a name which is neither
    missing nor blank, or raise :exc:`InvalidExperiment`.

    An experiment without extensions is valid.
    """
    for extension in experiment.get("extensions") or []:
        name = extension.get("name")
        if not (name and name.strip()):
            raise InvalidExperiment("All extensions require a non-empty name")
def get_context_controls(  # noqa: C901
    level: str,
    experiment: Experiment = None,
    context: Union[Activity, Experiment] = None,
) -> List[Control]:
    """
    Get the controls applying at the given level by merging the global
    controls, those declared by the current context and those declared at
    the top level of the experiment.

    * without an experiment, only the global controls are returned
    * without top-level controls, the context's controls followed by the
      global ones are returned
    * when there is nothing to merge with, or when the level is `method`
      or `rollback`, copies of the automatic top-level controls are
      returned (a control is automatic unless its `automatic` property is
      set to a falsy value)
    * otherwise, a control of the context with a `ref` brings in a copy of
      the top-level control having that name

    The context's own list of controls is never modified.
    """
    glbl_controls = get_global_controls()

    if not experiment:
        return glbl_controls

    top_level_controls = experiment.get("controls", [])
    # `context` defaults to None, treat that as a context without controls
    controls = copy((context or {}).get("controls", []))
    controls.extend(glbl_controls)

    # do we even have something at the top level to be merged?
    if not top_level_controls:
        return controls

    if not controls or level in ("method", "rollback"):
        return [
            deepcopy(c) for c in top_level_controls
            if c.get("automatic", True)
        ]

    # iterate over a snapshot: the loop appends to `controls` and the
    # appended copies must not be visited in turn
    for c in list(controls):
        if "ref" in c:
            for top_level_control in top_level_controls:
                if c["ref"] == top_level_control["name"]:
                    controls.append(deepcopy(top_level_control))
                    break
        else:
            for tc in top_level_controls:
                if c.get("name") == tc.get("name"):
                    break
            else:
                # NOTE(review): only the last top-level control is looked
                # at here, as in the original code -- confirm the intent
                if tc.get("automatic", True):
                    controls.append(deepcopy(tc))

    return controls
def get_extension(experiment: Experiment, name: str) -> Optional[Extension]:
    """
    Look up an extension of the experiment by its name.

    The first extension with that name is returned. `None` is returned
    when the experiment has no extensions or none of them has that name.
    """
    extensions = experiment.get("extensions") or []
    return next(
        (extension for extension in extensions
         if extension.get("name") == name),
        None)
def apply_python_control(  # noqa: C901
    level: str,
    control: Control,
    experiment: Experiment,
    context: Union[Activity, Experiment],
    state: Union[Journal, Run, List[Run]] = None,
    configuration: Configuration = None,
    secrets: Secrets = None,
    settings: Settings = None,
):
    """
    Apply a control by calling its function matching the given level.

    Nothing happens when `load_func` finds no such function. The function
    is called with the `context` and a copy of the provider's `arguments`,
    after substitution when a configuration or secrets are available. Only
    when its signature declares them does the function also receive the
    `secrets`, `configuration`, `state`, `experiment`, `extensions` and
    `settings` parameters.
    """
    provider = control["provider"]
    func = load_func(control, _level_mapping.get(level))
    if not func:
        return

    arguments = deepcopy(provider.get("arguments", {}))
    if configuration or secrets:
        arguments = substitute(arguments, configuration, secrets)

    declared = inspect.signature(func).parameters
    optional_values = (
        ("secrets", secrets),
        ("configuration", configuration),
        ("state", state),
        ("experiment", experiment),
    )
    for param_name, value in optional_values:
        if param_name in declared:
            arguments[param_name] = value

    # looked up lazily: the experiment is only read when actually needed
    if "extensions" in declared:
        arguments["extensions"] = experiment.get("extensions")

    if "settings" in declared:
        arguments["settings"] = settings

    func(context=context, **arguments)
def get_context_controls(level: str, experiment: Experiment,
                         context: Union[Activity, Experiment]) \
        -> List[Control]:
    """
    Get the controls applying at the given level by merging those declared
    at the top level of the experiment with the ones of the current
    context.

    * when the context declares no controls, or when the level is `method`
      or `rollback`, copies of the automatic top-level controls are
      returned (a control is automatic unless its `automatic` property is
      set to a falsy value)
    * otherwise, the context's controls are returned, followed by a copy of
      each top-level control they reference through `ref`

    The context's own list of controls is never modified: a new list is
    always returned.
    """
    top_level_controls = experiment.get("controls", [])
    # work on a shallow copy so that merged controls do not pile up in the
    # context's declaration every time this function is called
    controls = list(context.get("controls", []))

    if not controls or level in ("method", "rollback"):
        return [
            deepcopy(c) for c in top_level_controls
            if c.get("automatic", True)
        ]

    # iterate over a snapshot since the loop appends to `controls`
    for c in list(controls):
        if "ref" in c:
            for top_level_control in top_level_controls:
                if c["ref"] == top_level_control["name"]:
                    controls.append(deepcopy(top_level_control))
                    break
        else:
            for tc in top_level_controls:
                if c.get("name") == tc.get("name"):
                    break
            else:
                # NOTE(review): only the last top-level control is looked
                # at here, as in the original code -- confirm the intent
                if tc.get("automatic", True):
                    controls.append(deepcopy(tc))

    return controls
def ensure_hypothesis_is_valid(experiment: Experiment):
    """
    Check the steady-state hypothesis entry of the experiment.

    Nothing is checked when the experiment has no such entry. Otherwise
    the hypothesis must have a title, or :exc:`InvalidExperiment` is
    raised. Each probe is validated as an activity and must have a
    `tolerance` entry, or :exc:`InvalidActivity` is raised. The tolerance
    itself is then validated by `ensure_hypothesis_tolerance_is_valid`.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    if hypothesis is None:
        return

    if not hypothesis.get("title"):
        raise InvalidExperiment("hypothesis requires a title")

    for probe in hypothesis.get("probes") or []:
        ensure_activity_is_valid(probe)

        if "tolerance" not in probe:
            raise InvalidActivity(
                "hypothesis probe must have a tolerance entry")

        ensure_hypothesis_tolerance_is_valid(probe["tolerance"])
def run_steady_state_hypothesis(experiment: Experiment,
                                configuration: Configuration,
                                secrets: Secrets, dry: bool = False):
    """
    Execute the probes of the steady-state hypothesis one after the other
    and stop at the first one whose output is not within its tolerance.

    Returns `None` when the experiment declares no steady-state hypothesis.
    Otherwise returns a state dictionary with two keys: `steady_state_met`
    (a boolean) and `probes` (the runs of the probes executed so far, each
    carrying a `tolerance_met` flag).

    In dry mode, probes are executed but their tolerance is not checked.
    """
    hypothesis = experiment.get("steady-state-hypothesis")
    if not hypothesis:
        logger.info(
            "No steady state hypothesis defined. That's ok, just exploring.")
        return

    state = {"steady_state_met": None, "probes": []}

    logger.info(
        "Steady state hypothesis: {h}".format(h=hypothesis.get("title")))

    for probe in hypothesis.get("probes", []):
        run = execute_activity(
            probe, configuration=configuration, secrets=secrets, dry=dry)
        run["tolerance_met"] = True
        state["probes"].append(run)

        # tolerances are only verified when dry mode is off
        if not dry:
            tolerance = probe.get("tolerance")
            logger.debug("allowed tolerance is {t}".format(t=str(tolerance)))

            if not within_tolerance(tolerance, run["output"]):
                run["tolerance_met"] = False
                state["steady_state_met"] = False
                return state

    state["steady_state_met"] = True
    logger.info("Steady state hypothesis is met!")

    return state
def warn_about_deprecated_features(experiment: Experiment):
    """
    Warn about the deprecated features used by the experiment.

    This is done globally so that each feature is warned about only once,
    however many times the experiment relies on it. Each warning is both
    issued as a :class:`DeprecationWarning` and logged.

    Two features are looked for:

    * `process` providers whose `arguments` are given as a dictionary
    * `vault` secrets declaring a `key` but no `path`
    """
    warned_deprecations = {
        DeprecatedDictArgsMessage: False,
        DeprecatedVaultMissingPathMessage: False
    }
    activities = get_all_activities_in_experiment(experiment)

    for activity in activities:
        provider = activity.get("provider")
        if not provider:
            continue

        provider_type = provider.get("type")
        if provider_type == "process":
            arguments = provider.get("arguments")
            if not warned_deprecations[DeprecatedDictArgsMessage] and \
                    isinstance(arguments, dict):
                warned_deprecations[DeprecatedDictArgsMessage] = True
                warnings.warn(DeprecatedDictArgsMessage, DeprecationWarning)
                logger.warning(DeprecatedDictArgsMessage)

    # vault now expects the path property
    # see https://github.com/chaostoolkit/chaostoolkit-lib/issues/77
    for keys in experiment.get("secrets", {}).values():
        # one warning is enough, stop at the first offending secret
        if warned_deprecations[DeprecatedVaultMissingPathMessage]:
            break

        for value in keys.values():
            if not (isinstance(value, dict) and value.get("type") == "vault"):
                continue

            if "key" in value and "path" not in value:
                warned_deprecations[DeprecatedVaultMissingPathMessage] = True
                warnings.warn(DeprecatedVaultMissingPathMessage,
                              DeprecationWarning)
                logger.warning(DeprecatedVaultMissingPathMessage)
                break
def initialize_execution(session: Session, experiment: Experiment,
                         journal: Journal) -> Optional[Response]:
    """
    Build the execution payload from the journal and post it.

    The journal gets the experiment attached and its status set to
    `running` before it is serialized. The `experiment_path` extension
    value is treated as sensitive while the journal is serialized.

    Returns the response of the request. `None` is returned when the
    experiment has no identifier or when the request could not be made. On
    a 200 or 201 status, the identifier found in the response's payload is
    set on the experiment through `set_execution_id`.
    """
    experiment_id = get_experiment_id(experiment.get('extensions'))
    if not experiment_id:
        logger.info("Missing experiment identifier")
        return

    journal["experiment"] = experiment
    journal["status"] = "running"

    experiment_url = urls.experiment(
        session.base_url, experiment_id=experiment_id)
    execution_url = urls.execution(experiment_url)

    try:
        with remove_sensitive_extension_values(
                journal["experiment"], ["experiment_path"]):
            body = json.dumps(
                {"journal": journal}, ensure_ascii=False,
                default=json_encoder)
        response = session.post(
            execution_url, data=body,
            headers={"content-type": "application/json"})
    except Exception:
        # best effort: a failure here is logged and reported as `None`
        logger.debug("Failed to create execution", exc_info=True)
        return

    if response.status_code in (200, 201):
        logger.info("Execution available at {}".format(
            urls.clean(response.headers["Content-Location"])))
        payload = response.json()
        set_execution_id(payload["id"], experiment)
    else:
        content_type = response.headers.get("content-type", '')
        if 'application/json' in content_type:
            error = response.json()
        else:
            error = response.text
        logger.warning("Execution failed to be published: {}".format(error))

    return response
def ensure_verification_is_valid(experiment: Experiment):
    """
    Validate the experiment, then validate that it is a well-formed
    verification, or raise :exc:`InvalidVerification`.

    A verification must have an `extensions` block with exactly one
    extension named `chaosiq`. That extension must hold a `verification`
    block with an `id`, a `frequency-of-measurement` and a
    `duration-of-conditions`.
    """
    ensure_experiment_is_valid(experiment)

    extensions = experiment.get("extensions")
    if extensions is None:
        raise InvalidVerification(
            "a verification must have an extensions block")

    chaosiq_blocks = [
        extension for extension in extensions
        if extension.get("name", "") == "chaosiq"
    ]
    if len(chaosiq_blocks) != 1:
        raise InvalidVerification(
            "a verification must have a single chaosiq extension block")

    verification = chaosiq_blocks[0].get("verification")
    if verification is None:
        raise InvalidVerification(
            "a verification must have a verification block")

    if verification.get("id") is None:
        raise InvalidVerification("a verification must have an id")

    required_blocks = (
        ("frequency-of-measurement",
         "a verification must have a frequency-of-measurement block"),
        ("duration-of-conditions",
         "a verification must have a duration-of-conditions block"),
    )
    for key, message in required_blocks:
        if verification.get(key) is None:
            raise InvalidVerification(message)

    logger.info("Verification looks valid")
def ensure_experiment_is_valid(experiment: Experiment):
    """
    Validate the syntax of the given experiment.

    The following is checked, in this order:

    * the experiment is not empty and has a title and a description
    * its tags, when any, are all non-empty strings
    * its extensions are valid
    * its configuration and secrets can be loaded
    * its steady-state hypothesis is valid
    * its method has at least one activity, each activity is valid and
      each `ref` points to an activity that can be looked up
    * its rollbacks are valid activities
    * its controls are valid

    Deprecated features are warned about along the way.

    This function itself raises :exc:`InvalidExperiment` or
    :exc:`InvalidActivity`; the validators it calls may raise as well.
    """
    logger.info("Validating the experiment's syntax")

    if not experiment:
        raise InvalidExperiment("an empty experiment is not an experiment")

    required_fields = (
        ("title", "experiment requires a title"),
        ("description", "experiment requires a description"),
    )
    for field, message in required_fields:
        if not experiment.get(field):
            raise InvalidExperiment(message)

    tags = experiment.get("tags")
    if tags and any(tag == '' or not isinstance(tag, str) for tag in tags):
        raise InvalidExperiment(
            "experiment tags must be a non-empty string")

    validate_extensions(experiment)

    # loading them is how the configuration and secrets get validated
    config = load_configuration(experiment.get("configuration", {}))
    load_secrets(experiment.get("secrets", {}), config)

    ensure_hypothesis_is_valid(experiment)

    method = experiment.get("method")
    if not method:
        raise InvalidExperiment("an experiment requires a method with "
                                "at least one activity")

    for activity in method:
        ensure_activity_is_valid(activity)

        # let's see if a ref is indeed found in the experiment
        ref = activity.get("ref")
        if ref and not lookup_activity(ref):
            raise InvalidActivity("referenced activity '{r}' could not be "
                                  "found in the experiment".format(r=ref))

    for rollback in experiment.get("rollbacks", []):
        ensure_activity_is_valid(rollback)

    warn_about_deprecated_features(experiment)

    validate_controls(experiment)

    logger.info("Experiment looks valid")
def hypothesis(experiment: Experiment) -> Hypothesis:
    """
    Return the steady-state hypothesis entry of the experiment, or `None`
    when the experiment does not declare one.
    """
    steady_state_hypothesis = experiment.get("steady-state-hypothesis")
    return steady_state_hypothesis
def get_org_id(experiment: Experiment) -> str:
    """
    Return the `org_id` property of the first extension named `chaosiq`.

    `None` is returned when the experiment has no such extension, or when
    that extension has no `org_id`. Extensions without a name are skipped.
    """
    for extension in experiment.get("extensions") or []:
        # `.get` so that an unnamed extension is skipped, not a KeyError
        if extension.get("name") == "chaosiq":
            return extension.get("org_id")
    return None
def set_execution_id(execution_id: str, experiment: Experiment) -> None:
    """
    Store the given execution identifier, as `execution_id`, on the first
    extension of the experiment named `chaosiq`.

    Nothing happens when the experiment has no such extension. Extensions
    without a name are skipped.
    """
    for extension in experiment.get("extensions") or []:
        # `.get` so that an unnamed extension is skipped, not a KeyError
        if extension.get("name") == "chaosiq":
            extension["execution_id"] = execution_id
            break
def apply_activities(experiment: Experiment, configuration: Configuration,
                     secrets: Secrets, pool: ThreadPoolExecutor,
                     journal: Journal, dry: bool = False) -> List[Run]:
    """
    Run the activities of the method, under the `method` level controls,
    and return their runs in the order the activities were started.

    The activities stop being started as soon as the journal's status is
    `aborted`, `failed` or `interrupted`. Background activities are waited
    for before their results are collected, unless a :exc:`SystemExit`
    with the code 30 was raised. The collected runs are also stored as the
    `run` entry of the journal.
    """
    with controls(level="method", experiment=experiment, context=experiment,
                  configuration=configuration, secrets=secrets) as control:
        result = []
        runs = []
        method = experiment.get("method", [])
        wait_for_background_activities = True

        try:
            for run in run_activities(
                    experiment, configuration, secrets, pool, dry):
                runs.append(run)
                # the journal's status is changed from elsewhere; such a
                # status means no further activity should be started
                if journal["status"] in ["aborted", "failed", "interrupted"]:
                    break
        except SystemExit as x:
            # when we got a signal for an ungraceful exit, we can decide
            # not to wait for background activities. Their statuses will
            # remain failed.
            wait_for_background_activities = x.code != 30  # see exit.py
            raise
        finally:
            # None means waiting for as long as it takes
            background_activity_timeout = None

            if wait_for_background_activities and pool:
                logger.debug("Waiting for background activities to complete")
                pool.shutdown(wait=True)
            elif pool:
                harshly_terminate_pending_background_activities(pool)
                logger.debug(
                    "Do not wait for the background activities to finish "
                    "as per signal")
                background_activity_timeout = 0.2
                pool.shutdown(wait=False)

            for index, run in enumerate(runs):
                if not run:
                    continue

                if isinstance(run, dict):
                    # activity that ran in the foreground
                    result.append(run)
                else:
                    try:
                        # background activities
                        result.append(
                            run.result(timeout=background_activity_timeout))
                    except TimeoutError:
                        # we want an entry for the background activity in our
                        # results anyway, we won't have anything meaningful
                        # to say about it
                        result.append({
                            "activity": method[index],
                            "status": "failed",
                            "output": None,
                            "duration": None,
                            "start": None,
                            "end": None,
                            "exception": None
                        })

            # now let's ensure the journal has all activities in their correct
            # order (background ones included)
            journal["run"] = result

        control.with_state(result)

    return result
def _run(  # noqa: C901
        self, strategy: Strategy, schedule: Schedule,
        experiment: Experiment, journal: Journal,
        configuration: Configuration, secrets: Secrets,
        settings: Settings,
        event_registry: EventHandlerRegistry) -> Journal:
    """
    Run the experiment and return its journal.

    The sequence is as follows: the steady-state hypothesis is run before
    the method, during the method and after the method, each one only when
    the strategy asks for it and the hypothesis has probes. The rollbacks
    are then run, unless a :exc:`SystemExit` with the code 30 was raised.
    Finally, the journal is completed and the controls are cleaned up.

    A new journal is initialized when none is given. The experiment's
    title is substituted in place with the configuration and secrets.

    :exc:`InterruptExecution`, :exc:`KeyboardInterrupt` and
    :exc:`SystemExit` are not propagated: the journal's status is set to
    `interrupted` instead.
    """
    experiment["title"] = substitute(experiment["title"], configuration,
                                     secrets)
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    started_at = time.time()
    journal = journal or initialize_run_journal(experiment)
    event_registry.started(experiment, journal)

    control = Control()
    activity_pool, rollback_pool = get_background_pools(experiment)
    hypo_pool = get_hypothesis_pool()
    continous_hypo_event = threading.Event()

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    initialize_global_controls(experiment, configuration, secrets, settings)
    initialize_controls(experiment, configuration, secrets)

    logger.info("Steady-state strategy: {}".format(strategy.value))
    rollback_strategy = settings.get("runtime", {}).get("rollbacks", {}).get(
        "strategy", "default")
    logger.info("Rollbacks strategy: {}".format(rollback_strategy))

    exit_gracefully_with_rollbacks = True
    with_ssh = has_steady_state_hypothesis_with_probes(experiment)
    if not with_ssh:
        logger.info("No steady state hypothesis defined. That's ok, just "
                    "exploring.")

    try:
        try:
            control.begin("experiment", experiment, experiment,
                          configuration, secrets)

            # any value but None lets the method run; the gate hypothesis
            # below is what may turn the state into None
            state = object()
            if with_ssh and should_run_before_method(strategy):
                state = run_gate_hypothesis(experiment, journal,
                                            configuration, secrets,
                                            event_registry, dry)

            if state is not None:
                if with_ssh and should_run_during_method(strategy):
                    run_hypothesis_during_method(
                        hypo_pool, continous_hypo_event, strategy, schedule,
                        experiment, journal, configuration, secrets,
                        event_registry, dry)

                state = run_method(strategy, activity_pool, experiment,
                                   journal, configuration, secrets,
                                   event_registry, dry)

                # the method is over, set the event shared with the
                # hypothesis started during the method
                continous_hypo_event.set()
                if journal["status"] not in ["interrupted", "aborted"]:
                    if with_ssh and (state is not None) and \
                            should_run_after_method(strategy):
                        run_deviation_validation_hypothesis(
                            experiment, journal, configuration, secrets,
                            event_registry, dry)
        except InterruptExecution as i:
            journal["status"] = "interrupted"
            logger.fatal(str(i))
            event_registry.interrupted(experiment, journal)
        except KeyboardInterrupt:
            journal["status"] = "interrupted"
            logger.warning("Received a termination signal (Ctrl-C)...")
            event_registry.signal_exit()
        except SystemExit as x:
            journal["status"] = "interrupted"
            logger.warning("Received the exit signal: {}".format(x.code))

            # the exit code 30 is the one asking not to play the rollbacks
            exit_gracefully_with_rollbacks = x.code != 30
            if not exit_gracefully_with_rollbacks:
                logger.warning("Ignoring rollbacks as per signal")
            event_registry.signal_exit()
        finally:
            hypo_pool.shutdown(wait=True)

        # just in case a signal overrode everything else to tell us not to
        # play them anyway (see the exit.py module)
        if exit_gracefully_with_rollbacks:
            run_rollback(rollback_strategy, rollback_pool, experiment,
                         journal, configuration, secrets, event_registry,
                         dry)

        journal["end"] = datetime.utcnow().isoformat()
        journal["duration"] = time.time() - started_at

        # the spec only allows these statuses, so if it's anything else
        # we override to "completed"
        if journal["status"] not in ("completed", "failed", "aborted",
                                     "interrupted"):
            journal["status"] = "completed"

        has_deviated = journal["deviated"]
        status = "deviated" if has_deviated else journal["status"]
        logger.info("Experiment ended with status: {s}".format(s=status))
        if has_deviated:
            logger.info(
                "The steady-state has deviated, a weakness may have been "
                "discovered")

        control.with_state(journal)
        try:
            control.end("experiment", experiment, experiment,
                        configuration, secrets)
        except ChaosException:
            logger.debug("Failed to close controls", exc_info=True)
    finally:
        # the registry is told about the end even if cleaning up fails
        try:
            cleanup_controls(experiment)
            cleanup_global_controls()
        finally:
            event_registry.finish(journal)

    return journal
def run_experiment(experiment: Experiment,
                   settings: Settings = None) -> Journal:
    """
    Run the given `experiment` step by step, in the following sequence:
    steady-state hypothesis, method, steady-state hypothesis again, then
    rollbacks. The journal of the run is returned.

    Activities can be executed in background when they have the
    `"background"` property set to `true`. In that case, the activity is
    run in a thread taken from the pools created for the experiment.

    If the experiment has the `"dry"` property set to `True`, dry mode is
    enabled and the flag is handed to the hypothesis, the activities and
    the rollbacks.

    The journal's status ends up as one of:

    * `failed` when a steady-state probe failed or fell out of its
      tolerance, before or after the method. The method is not applied
      when that happens before it
    * `aborted` when applying the method raised an unexpected error
    * `interrupted` when the execution was interrupted
    * `completed` otherwise

    NOTE: It is tricky to decide whether we should rollback when exiting
    abnormally (Ctrl-C, SIGTERM...). After all, there is a chance we
    actually cannot afford to rollback properly, so we take the
    conservative approach: rollbacks are not applied in that case. This
    also means :exc:`KeyboardInterrupt` and :exc:`SystemExit` are swallowed
    and do not bubble back up to the caller: the journal's status is set
    to `interrupted` to notify the caller this did not terminate properly.
    """
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    started_at = time.time()
    settings = settings if settings is not None else get_loaded_settings()
    config = load_configuration(experiment.get("configuration", {}))
    secrets = load_secrets(experiment.get("secrets", {}), config)
    initialize_global_controls(experiment, config, secrets, settings)
    initialize_controls(experiment, config, secrets)
    activity_pool, rollback_pool = get_background_pools(experiment)

    control = Control()
    journal = initialize_run_journal(experiment)

    try:
        try:
            control.begin("experiment", experiment, experiment, config,
                          secrets)
            # bound upfront so the handler below can always read it, even
            # when running the hypothesis is what raised
            state = None
            # this may fail the entire experiment right there if any of the
            # probes fail or fall out of their tolerance zone
            try:
                state = run_steady_state_hypothesis(
                    experiment, config, secrets, dry=dry)
                journal["steady_states"]["before"] = state
                if state is not None and not state["steady_state_met"]:
                    p = state["probes"][-1]
                    raise ActivityFailed(
                        "Steady state probe '{p}' is not in the given "
                        "tolerance so failing this experiment".format(
                            p=p["activity"]["name"]))
            except ActivityFailed as a:
                journal["steady_states"]["before"] = state
                journal["status"] = "failed"
                logger.fatal(str(a))
            else:
                try:
                    journal["run"] = apply_activities(
                        experiment, config, secrets, activity_pool, dry)
                except Exception:
                    journal["status"] = "aborted"
                    logger.fatal(
                        "Experiment ran into an un expected fatal error, "
                        "aborting now.", exc_info=True)
                else:
                    try:
                        state = run_steady_state_hypothesis(
                            experiment, config, secrets, dry=dry)
                        journal["steady_states"]["after"] = state
                        if state is not None and \
                                not state["steady_state_met"]:
                            journal["deviated"] = True
                            p = state["probes"][-1]
                            raise ActivityFailed(
                                "Steady state probe '{p}' is not in the "
                                "given tolerance so failing this "
                                "experiment".format(
                                    p=p["activity"]["name"]))
                    except ActivityFailed as a:
                        journal["status"] = "failed"
                        logger.fatal(str(a))
        except InterruptExecution as i:
            journal["status"] = "interrupted"
            logger.fatal(str(i))
        except (KeyboardInterrupt, SystemExit):
            journal["status"] = "interrupted"
            # `Logger.warn` is deprecated, `warning` is its replacement
            logger.warning("Received an exit signal, "
                           "leaving without applying rollbacks.")
        else:
            # rollbacks are only applied when we were not interrupted
            journal["status"] = journal["status"] or "completed"
            journal["rollbacks"] = apply_rollbacks(
                experiment, config, secrets, rollback_pool, dry)

        journal["end"] = datetime.utcnow().isoformat()
        journal["duration"] = time.time() - started_at

        has_deviated = journal["deviated"]
        status = "deviated" if has_deviated else journal["status"]

        logger.info("Experiment ended with status: {s}".format(s=status))

        if has_deviated:
            logger.info(
                "The steady-state has deviated, a weakness may have been "
                "discovered")

        control.with_state(journal)

        try:
            control.end("experiment", experiment, experiment, config,
                        secrets)
        except ChaosException:
            logger.debug("Failed to close controls", exc_info=True)

    finally:
        cleanup_controls(experiment)
        cleanup_global_controls()

    return journal