def _run(self, strategy: Strategy, schedule: Schedule,  # noqa: C901
         experiment: Experiment, journal: Journal,
         configuration: Configuration, secrets: Secrets,
         settings: Settings,
         event_registry: EventHandlerRegistry) -> Journal:
    """
    Execute `experiment` end to end and return the run journal.

    The sequence is: substitute the title, set up controls, optionally
    check the steady-state hypothesis before the method (gate), run the
    method (optionally with the hypothesis checked in the background
    during it), optionally check the hypothesis after the method, play the
    rollbacks, finalize the journal and tear controls down.

    :param strategy: decides when the steady-state hypothesis is checked
        (before, during and/or after the method)
    :param schedule: forwarded to the during-method hypothesis runner
    :param experiment: the experiment to run; its ``"title"`` is replaced
        in place with the substituted value
    :param journal: an existing journal to fill in; when falsy, a new one
        is created with `initialize_run_journal`
    :param configuration: configuration used for substitution and passed
        to every step
    :param secrets: secrets used for substitution and passed to every step
    :param settings: read for ``runtime.rollbacks.strategy`` (defaults to
        ``"default"``) and passed to the global controls
    :param event_registry: notified of started / interrupted /
        signal_exit / finish events
    :return: the journal, with ``"end"``, ``"duration"`` and a
        normalized ``"status"`` set

    :exc:`InterruptExecution`, :exc:`KeyboardInterrupt` and
    :exc:`SystemExit` raised while running are swallowed and recorded as
    an ``"interrupted"`` status. Any other exception propagates after the
    controls are cleaned up and `event_registry.finish` is called.
    """
    experiment["title"] = substitute(
        experiment["title"], configuration, secrets)
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    started_at = time.time()
    journal = journal or initialize_run_journal(experiment)
    event_registry.started(experiment, journal)

    control = Control()
    activity_pool, rollback_pool = get_background_pools(experiment)
    hypo_pool = get_hypothesis_pool()
    continous_hypo_event = threading.Event()

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    initialize_global_controls(experiment, configuration, secrets, settings)
    initialize_controls(experiment, configuration, secrets)

    logger.info("Steady-state strategy: {}".format(strategy.value))
    rollback_strategy = settings.get("runtime", {}).get(
        "rollbacks", {}).get("strategy", "default")
    logger.info("Rollbacks strategy: {}".format(rollback_strategy))

    # flipped to False only when an exit signal explicitly asks us to skip
    # the rollbacks
    exit_gracefully_with_rollbacks = True
    with_ssh = has_steady_state_hypothesis_with_probes(experiment)
    if not with_ssh:
        logger.info(
            "No steady state hypothesis defined. That's ok, just "
            "exploring.")

    try:
        try:
            control.begin(
                "experiment", experiment, experiment, configuration,
                secrets)

            # any non-None placeholder: when the gate hypothesis is not
            # run at all, the method must still be executed
            state = object()
            if with_ssh and should_run_before_method(strategy):
                state = run_gate_hypothesis(
                    experiment, journal, configuration, secrets,
                    event_registry, dry)

            # a None state from the gate means we do not go any further
            if state is not None:
                if with_ssh and should_run_during_method(strategy):
                    run_hypothesis_during_method(
                        hypo_pool, continous_hypo_event, strategy,
                        schedule, experiment, journal, configuration,
                        secrets, event_registry, dry)

                state = run_method(
                    strategy, activity_pool, experiment, journal,
                    configuration, secrets, event_registry, dry)

                # NOTE(review): the event is only set on this normal path;
                # if run_method raises, the pool shutdown below presumably
                # waits for the background hypothesis to end on its own --
                # confirm against run_hypothesis_during_method
                continous_hypo_event.set()

                if journal["status"] not in ["interrupted", "aborted"]:
                    if with_ssh and (state is not None) and \
                            should_run_after_method(strategy):
                        run_deviation_validation_hypothesis(
                            experiment, journal, configuration, secrets,
                            event_registry, dry)
        except InterruptExecution as i:
            journal["status"] = "interrupted"
            logger.fatal(str(i))
            event_registry.interrupted(experiment, journal)
        except KeyboardInterrupt:
            journal["status"] = "interrupted"
            logger.warning("Received a termination signal (Ctrl-C)...")
            event_registry.signal_exit()
        except SystemExit as x:
            journal["status"] = "interrupted"
            logger.warning("Received the exit signal: {}".format(x.code))

            # exit code 30 is the request to leave without rollbacks
            exit_gracefully_with_rollbacks = x.code != 30
            if not exit_gracefully_with_rollbacks:
                logger.warning("Ignoring rollbacks as per signal")
            event_registry.signal_exit()
        finally:
            hypo_pool.shutdown(wait=True)

        # just in case a signal overrode everything else to tell us not to
        # play them anyway (see the exit.py module)
        if exit_gracefully_with_rollbacks:
            run_rollback(
                rollback_strategy, rollback_pool, experiment, journal,
                configuration, secrets, event_registry, dry)

        journal["end"] = datetime.utcnow().isoformat()
        journal["duration"] = time.time() - started_at

        # the spec only allows these statuses, so if it's anything else
        # we override to "completed"
        if journal["status"] not in (
                "completed", "failed", "aborted", "interrupted"):
            journal["status"] = "completed"

        has_deviated = journal["deviated"]
        status = "deviated" if has_deviated else journal["status"]
        logger.info("Experiment ended with status: {s}".format(s=status))
        if has_deviated:
            logger.info(
                "The steady-state has deviated, a weakness may have been "
                "discovered")

        control.with_state(journal)
        try:
            control.end(
                "experiment", experiment, experiment, configuration,
                secrets)
        except ChaosException:
            # closing controls is best effort, it must not fail the run
            logger.debug("Failed to close controls", exc_info=True)
    finally:
        # always release the controls, and always notify the registry even
        # when releasing the controls fails
        try:
            cleanup_controls(experiment)
            cleanup_global_controls()
        finally:
            event_registry.finish(journal)

    return journal
def run_experiment(experiment: Experiment,
                   settings: Settings = None) -> Journal:
    """
    Run the given `experiment` method step by step, in the following
    sequence: steady probe, action, close probe.

    Activities can be executed in background when they have the
    `"background"` property set to `true`. In that case, the activity is run
    in a thread. By the end of runs, those threads block until they are
    all complete.

    If the experiment has the `"dry"` property set to `true`, the experiment
    runs without actually executing the activities.

    When `settings` is `None`, the settings returned by
    `get_loaded_settings` are used.

    The returned journal has its `"status"` set to `"failed"` when the
    steady-state hypothesis is not met (before or after the method),
    `"aborted"` when applying the activities raised an unexpected error,
    `"interrupted"` when the run was interrupted and `"completed"`
    otherwise.

    NOTE: Tricky to make a decision whether we should rollback when exiting
    abnormally (Ctrl-C, SIGTERM...). After all, there is a chance we
    actually cannot afford to rollback properly. Better bailing to a
    conservative approach. This means we swallow :exc:`KeyboardInterrupt`
    and :exc:`SystemExit` and do not bubble them back up to the caller.
    When we were interrupted, we set the journal `"status"` to
    `"interrupted"` to notify the caller this was indeed not terminated
    properly, and the rollbacks are not applied.
    """
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    started_at = time.time()
    settings = settings if settings is not None else get_loaded_settings()
    config = load_configuration(experiment.get("configuration", {}))
    secrets = load_secrets(experiment.get("secrets", {}), config)
    initialize_global_controls(experiment, config, secrets, settings)
    initialize_controls(experiment, config, secrets)
    activity_pool, rollback_pool = get_background_pools(experiment)

    control = Control()
    journal = initialize_run_journal(experiment)

    try:
        try:
            control.begin(
                "experiment", experiment, experiment, config, secrets)

            # bound up front so the ActivityFailed handler below can always
            # record it, even when the hypothesis call itself raised before
            # returning any state
            state = None

            # this may fail the entire experiment right there if any of the
            # probes fail or fall out of their tolerance zone
            try:
                state = run_steady_state_hypothesis(
                    experiment, config, secrets, dry=dry)
                journal["steady_states"]["before"] = state
                if state is not None and not state["steady_state_met"]:
                    p = state["probes"][-1]
                    raise ActivityFailed(
                        "Steady state probe '{p}' is not in the given "
                        "tolerance so failing this experiment".format(
                            p=p["activity"]["name"]))
            except ActivityFailed as a:
                journal["steady_states"]["before"] = state
                journal["status"] = "failed"
                logger.fatal(str(a))
            else:
                try:
                    journal["run"] = apply_activities(
                        experiment, config, secrets, activity_pool, dry)
                except Exception:
                    journal["status"] = "aborted"
                    logger.fatal(
                        "Experiment ran into an un expected fatal error, "
                        "aborting now.", exc_info=True)
                else:
                    # the method went through, check we are still steady
                    try:
                        state = run_steady_state_hypothesis(
                            experiment, config, secrets, dry=dry)
                        journal["steady_states"]["after"] = state
                        if state is not None and \
                                not state["steady_state_met"]:
                            journal["deviated"] = True
                            p = state["probes"][-1]
                            raise ActivityFailed(
                                "Steady state probe '{p}' is not in the "
                                "given tolerance so failing this "
                                "experiment".format(
                                    p=p["activity"]["name"]))
                    except ActivityFailed as a:
                        journal["status"] = "failed"
                        logger.fatal(str(a))
        except InterruptExecution as i:
            journal["status"] = "interrupted"
            logger.fatal(str(i))
        except (KeyboardInterrupt, SystemExit):
            journal["status"] = "interrupted"
            logger.warning(
                "Received an exit signal, "
                "leaving without applying rollbacks.")
        else:
            # rollbacks are only played when we were not interrupted
            journal["status"] = journal["status"] or "completed"
            journal["rollbacks"] = apply_rollbacks(
                experiment, config, secrets, rollback_pool, dry)

        journal["end"] = datetime.utcnow().isoformat()
        journal["duration"] = time.time() - started_at

        has_deviated = journal["deviated"]
        status = "deviated" if has_deviated else journal["status"]

        logger.info("Experiment ended with status: {s}".format(s=status))
        if has_deviated:
            logger.info(
                "The steady-state has deviated, a weakness may have been "
                "discovered")

        control.with_state(journal)

        try:
            control.end(
                "experiment", experiment, experiment, config, secrets)
        except ChaosException:
            # closing controls is best effort, it must not fail the run
            logger.debug("Failed to close controls", exc_info=True)
    finally:
        cleanup_controls(experiment)
        cleanup_global_controls()

    return journal