def run_gate_hypothesis(
    experiment: Experiment,
    journal: Journal,
    configuration: Configuration,
    secrets: Secrets,
    event_registry: EventHandlerRegistry,
    dry: bool = False,
) -> Dict[str, Any]:
    """
    Evaluate the steady-state hypothesis ahead of the method and gate the
    execution on its outcome.

    The hypothesis state is stored in `journal["steady_states"]["before"]`
    and the event registry is told when the hypothesis starts and completes.

    Returns the hypothesis state when it is `None` or when the steady state
    is met. Returns `None` after flagging the journal as `"failed"` when the
    steady state is not met.
    """
    logger.debug("Running steady-state hypothesis before the method")
    event_registry.start_hypothesis_before(experiment)

    steady_state = run_steady_state_hypothesis(
        experiment, configuration, secrets, dry=dry
    )
    journal["steady_states"]["before"] = steady_state
    event_registry.hypothesis_before_completed(
        experiment, steady_state, journal
    )

    # nothing to gate on: either no state at all or the steady state holds
    if steady_state is None or steady_state["steady_state_met"]:
        return steady_state

    # written a second time, after the completion handlers were handed the
    # journal
    journal["steady_states"]["before"] = steady_state
    journal["status"] = "failed"

    last_probe = steady_state["probes"][-1]
    logger.fatal(
        "Steady state probe '{p}' is not in the given "
        "tolerance so failing this experiment".format(
            p=last_probe["activity"]["name"]
        )
    )
    return None
def run_method(
    strategy: Strategy,
    activity_pool: ThreadPoolExecutor,
    experiment: Experiment,
    journal: Journal,
    configuration: Configuration,
    secrets: Secrets,
    event_registry: EventHandlerRegistry,
    dry: bool = False,
) -> Optional[List[Run]]:
    """
    Apply the activities of the experiment's method.

    The event registry is notified when the method starts and, on every
    path, when it completes. An `InterruptExecution` is re-raised to the
    caller. Any other `Exception` is logged, marks the journal as
    `"aborted"` and makes this function return `None`.

    Returns whatever `apply_activities` produced on success.
    """
    logger.info("Playing your experiment's method now...")
    event_registry.start_method(experiment)

    try:
        runs = apply_activities(
            experiment, configuration, secrets, activity_pool, journal, dry
        )
        event_registry.method_completed(experiment, runs)
        return runs
    except InterruptExecution:
        event_registry.method_completed(experiment)
        raise
    except Exception:
        journal["status"] = "aborted"
        event_registry.method_completed(experiment)
        logger.fatal(
            "Experiment ran into an un expected fatal error, "
            "aborting now.",
            exc_info=True,
        )

    return None
def discover(
    ctx: click.Context,
    package: str,
    discovery_path: str = "./discovery.json",
    no_system_info: bool = False,
    no_install: bool = False,
) -> Discovery:
    """
    Discover capabilities and experiments.

    Settings are loaded from the path stored in the click context, and
    notifications are sent when the discovery starts, fails or completes.
    On success the discovery is written as JSON to `discovery_path` and
    returned. On `DiscoveryFailed` the error is logged and `None` is
    returned.
    """
    settings = load_settings(ctx.obj["settings_path"])

    try:
        notify(settings, DiscoverFlowEvent.DiscoverStarted, package)
        discovery = disco(
            package_name=package,
            discover_system=not no_system_info,
            download_and_install=not no_install,
        )
    except DiscoveryFailed as err:
        notify(settings, DiscoverFlowEvent.DiscoverFailed, package, err)
        logger.debug(f"Failed to discover {package}", exc_info=err)
        logger.fatal(str(err))
        return None

    with open(discovery_path, "w") as outfile:
        serialized = json.dumps(discovery, indent=2, default=encoder)
        outfile.write(serialized)
    logger.info(f"Discovery outcome saved in {discovery_path}")

    notify(settings, DiscoverFlowEvent.DiscoverCompleted, discovery)
    return discovery
def changePassword(self, newPassword):
    """
    Replace the current password with a hash of `newPassword`.

    The change is refused when `newPassword` verifies against the current
    password or is already present in the password history. On success the
    hashed password is stored in `self.curPwd` and `newPassword` is appended
    to the history, dropping the oldest entry once the history reaches
    `MAX_HISTORICAL_PWDS` entries.

    Raises:
        PasswordMatchException: `newPassword` matches the current password.
        HistoricalPasswordMatchException: `newPassword` is in the history.
    """
    # Secrets must never reach the logs: the current, new and historical
    # passwords are deliberately left out of every message below.
    logger.debug("Password change requested")

    # check against cur pwd
    if self._verify_password(self.getCurrentPassword(), str(newPassword)):
        logger.fatal("Passwords match, heading back.")
        raise PasswordMatchException("Matches Current Go Home")

    # check against historical pwds
    historicalPasswords = self.getPasswordHistory()
    if newPassword in historicalPasswords:
        logger.fatal("Password is in historical list, try again.")
        raise HistoricalPasswordMatchException("Found in historical list")
    logger.debug(
        "Current historical list size: {}".format(len(historicalPasswords)))

    # got here, so lets update the pwd and add it to the history
    # the current password is only ever stored hashed
    self.curPwd = self._hash_password(str(newPassword))

    # TODO: historical passwords are still kept in clear text; hashing them
    # also requires changing the membership check above.
    historicalPasswords.append(newPassword)
    if len(historicalPasswords) >= MAX_HISTORICAL_PWDS:
        # drop the oldest entry
        del historicalPasswords[0]

    logger.info("History: {}".format(len(historicalPasswords)))
    logger.debug("Password changed")
def run_deviation_validation_hypothesis(
    experiment: Experiment,
    journal: Journal,
    configuration: Configuration,
    secrets: Secrets,
    event_registry: EventHandlerRegistry,
    dry: bool = False,
) -> Dict[str, Any]:
    """
    Evaluate the steady-state hypothesis once the method has been played.

    The hypothesis state is stored in `journal["steady_states"]["after"]`
    and the event registry is told when the hypothesis starts and completes.
    When the steady state is not met, the journal is flagged as deviated and
    its status set to `"failed"`.

    Always returns the hypothesis state.
    """
    logger.debug("Running steady-state hypothesis after the method")
    event_registry.start_hypothesis_after(experiment)

    steady_state = run_steady_state_hypothesis(
        experiment, configuration, secrets, dry=dry
    )
    journal["steady_states"]["after"] = steady_state
    event_registry.hypothesis_after_completed(
        experiment, steady_state, journal
    )

    has_deviated = (
        steady_state is not None and not steady_state["steady_state_met"]
    )
    if has_deviated:
        journal["deviated"] = True
        journal["status"] = "failed"

        last_probe = steady_state["probes"][-1]
        logger.fatal(
            "Steady state probe '{p}' is not in the "
            "given tolerance so failing this "
            "experiment".format(p=last_probe["activity"]["name"])
        )

    return steady_state
def create_repository():
    """
    Issue the PUT request that registers the snapshot repository.

    Any exception raised while building or sending the request is logged
    and re-raised. The outcome is announced either way.

    Returns True when the response status code is 200, False otherwise.
    """
    logger.info(f"creating repository {settings.REPOSITORY_NAME}")

    try:
        url = f"{settings.ES_HOST}/_snapshot/{settings.REPOSITORY_NAME}"
        logger.debug(
            f"request: url={url} data={settings.REPOSITORY_SETTINGS} timeout={settings.REQUEST_TIMEOUT_SECONDS}"
        )
        r = requests.put(
            url=url,
            data=settings.REPOSITORY_SETTINGS,
            timeout=settings.REQUEST_TIMEOUT_SECONDS,
        )
    except Exception as e:
        logger.fatal(
            f"problem while creating repository on {settings.ES_HOST}: {str(e)}"
        )
        raise

    logger.debug(f"response: {r} {r.text}")

    created = r.status_code == 200
    if not created:
        announce(
            f"repository {settings.REPOSITORY_NAME} not created! reason: {r} {r.text}"
        )
    else:
        announce(f"repository {settings.REPOSITORY_NAME} created")
    return created
def _should_play_rollbacks(rollback_strategy: str, journal: Journal) -> bool:
    """Decide from the strategy and the journal whether rollbacks are played."""
    has_deviated = journal["deviated"]
    journal_status = journal["status"]

    if rollback_strategy == "always":
        logger.warning("Rollbacks were explicitly requested to be played")
        return True

    if rollback_strategy == "never":
        logger.warning("Rollbacks were explicitly requested to not be played")
        return False

    if rollback_strategy == "default":
        # by default, rollbacks are skipped for failed/interrupted runs
        return journal_status not in ("failed", "interrupted")

    if rollback_strategy == "deviated":
        if has_deviated:
            logger.warning(
                "Rollbacks will be played only because the experiment "
                "deviated"
            )
            return True
        logger.warning(
            "Rollbacks were explicitly requested to be played "
            "only if the experiment deviated. Since this is not "
            "the case, we will not play them."
        )
        return False

    # unknown strategies never play rollbacks
    return False


def run_rollback(
    rollback_strategy: str,
    rollback_pool: ThreadPoolExecutor,
    experiment: Experiment,
    journal: Journal,
    configuration: Configuration,
    secrets: Secrets,
    event_registry: EventHandlerRegistry,
    dry: bool = False,
) -> None:
    """
    Play the experiment's rollbacks when the rollback strategy allows it.

    Strategies:

    * `"always"`: rollbacks are played
    * `"never"`: rollbacks are not played
    * `"default"`: rollbacks are played unless the journal status is
      `"failed"` or `"interrupted"`
    * `"deviated"`: rollbacks are played only when the journal is flagged
      as deviated

    Any other value means rollbacks are not played.

    When rollbacks are played, their result is stored in
    `journal["rollbacks"]`. The event registry is notified before they
    start and, in all cases, once they are done. An `InterruptExecution`,
    `KeyboardInterrupt` or `SystemExit` raised meanwhile is not propagated:
    the journal status is set to `"interrupted"` instead.
    """
    if not _should_play_rollbacks(rollback_strategy, journal):
        return

    event_registry.start_rollbacks(experiment)
    try:
        journal["rollbacks"] = apply_rollbacks(
            experiment, configuration, secrets, rollback_pool, dry
        )
    except InterruptExecution as i:
        journal["status"] = "interrupted"
        logger.fatal(str(i))
    except (KeyboardInterrupt, SystemExit):
        journal["status"] = "interrupted"
        # the original literals were concatenated without a separating
        # space ("signal.Terminating")
        logger.warning(
            "Received an exit signal. "
            "Terminating now without running the "
            "remaining rollbacks."
        )
    finally:
        event_registry.rollbacks_completed(experiment, journal)
def completed(f: Future):
    """
    Handle the termination of the continuous steady-state hypothesis.

    Reads the exception (if any) carried by the future `f`, reports it to
    the event registry and updates the journal status: `"interrupted"` for
    an `InterruptExecution`, `"aborted"` for any other `Exception`.

    `event_registry`, `experiment` and `journal` come from the enclosing
    scope.
    """
    exc = f.exception()
    event_registry.continous_hypothesis_completed(experiment, journal, exc)

    # isinstance() is False for None, so a clean run leaves the journal
    # untouched
    if isinstance(exc, InterruptExecution):
        failure_status = "interrupted"
    elif isinstance(exc, Exception):
        failure_status = "aborted"
    else:
        failure_status = None

    if failure_status is not None:
        journal["status"] = failure_status
        logger.fatal(str(exc))

    logger.info("Continous steady state hypothesis terminated")
def create_snapshot():
    """
    Request a new snapshot in the configured repository.

    The snapshot is named `settings.SNAPSHOT_PREFIX` followed by the current
    UTC time formatted as `%Y%m%d%H%M%S`. When
    `settings.SNAPSHOT_TIMEOUT_SECONDS` is non-zero the request waits for the
    snapshot to complete, using that value as the request timeout.
    Otherwise it returns as soon as the request is accepted, using
    `settings.REQUEST_TIMEOUT_SECONDS`.

    When `settings.INDEX_NAMES` is set, only those indices are requested.
    They are sent in the `indices` field of the request body, which is the
    field defined by the Elasticsearch create-snapshot API.

    Any exception raised while building or sending the request is logged
    and re-raised.

    Returns True when the response status code is 200, False otherwise.
    """
    # time.gmtime() yields the same UTC timestamp as the deprecated
    # datetime.utcfromtimestamp(time.time())
    timestamp = time.strftime('%Y%m%d%H%M%S', time.gmtime())
    snapshot_name = f"{settings.SNAPSHOT_PREFIX}{timestamp}"
    logger.info(
        f"creating snapshot {snapshot_name} in {settings.REPOSITORY_NAME}")

    wait_for_completion = settings.SNAPSHOT_TIMEOUT_SECONDS != 0
    if wait_for_completion:
        timeout = settings.SNAPSHOT_TIMEOUT_SECONDS
        logger.info(
            f"shutterbug will wait {settings.SNAPSHOT_TIMEOUT_SECONDS} second(s) maximum"
        )
    else:
        timeout = settings.REQUEST_TIMEOUT_SECONDS
        logger.info("shutterbug will not wait for snapshot to be complete")

    # named `payload` so the `json` module is not shadowed
    payload = {
        "ignore_unavailable": settings.IGNORE_UNAVAILABLE,
        "include_global_state": settings.INCLUDE_GLOBAL_STATE,
    }
    if not settings.INDEX_NAMES:
        logger.info("all indices will be included")
    else:
        logger.info(
            f"only these indices will be included: {settings.INDEX_NAMES}")
        # the API field is "indices"; "index_names" is not part of it, so
        # the filter would not be applied
        payload["indices"] = settings.INDEX_NAMES

    try:
        url = f"{settings.ES_HOST}/_snapshot/{settings.REPOSITORY_NAME}/{snapshot_name}?wait_for_completion={str(wait_for_completion).lower()}"
        logger.debug(f"request: url={url} data={payload} timeout={timeout}")
        r = requests.put(url=url, json=payload, timeout=timeout)
    except Exception as e:
        logger.fatal(
            f"problem while creating snapshot {snapshot_name} on {settings.ES_HOST}: {str(e)}"
        )
        raise

    logger.debug(f"response: {r} {r.text}")

    succeeded = r.status_code == 200
    if not succeeded:
        announce(f"snapshot {snapshot_name} not created! reason: {r} {r.text}")
    elif wait_for_completion:
        announce(f"snapshot {snapshot_name} created")
    else:
        announce(f"snapshot {snapshot_name} accepted")
    return succeeded
def check_required():
    """Exit with status 1 when a required config environment variable is unset.

    Every missing variable is reported before exiting.
    """
    missing = 0
    for key in REQUIRED_CONFIG:
        var = _key_to_var(key)
        if os.getenv(var) is not None:
            continue
        missing += 1
        logger.fatal(("Please make sure that the required environment "
                      "variable '{}' is set!").format(var))

    if missing:
        sys.exit(1)
def repository_exists():
    """
    Check whether the configured snapshot repository exists.

    Any exception raised by the request is logged and re-raised.

    Returns False when the response status code is 404, True for any other
    status code.
    """
    logger.info(f"checking if repository {settings.REPOSITORY_NAME} exists")

    try:
        r = requests.get(
            url=f"{settings.ES_HOST}/_snapshot/{settings.REPOSITORY_NAME}",
            timeout=settings.REQUEST_TIMEOUT_SECONDS,
        )
    except Exception as e:
        logger.fatal(f"problem while contacting {settings.ES_HOST}: {str(e)}")
        raise

    logger.debug(f"response: {r} {r.text}")

    not_found = r.status_code == 404
    return not not_found
def main():
    """
    Idle until asked to quit, then optionally touch a semaphore file.

    The semaphore file path defaults to `settings.SEMAPHORE_FILE`. With
    `settings.USE_ECS_TASK_STRATEGY`, the path is derived from
    `settings.ECS_TASK_STRATEGY_SEMAPHORE_FILE_TEMPLATE` by replacing
    `##task_id##` with the task id. The task id is the last segment of the
    `com.amazonaws.ecs.task-arn` label in the metadata fetched from
    `settings.ECS_TASK_STRATEGY_ENDPOINT`.

    With `settings.USE_SEMAPHORE_FILE_STRATEGY`, a stale semaphore file is
    removed at startup (when `settings.SEMAPHORE_FILE_ENSURE_REMOVED` is
    set) and the file is touched once the module-level `requested_to_quit`
    flag stops the loop.

    Returns early, after logging, when the ECS strategy is enabled but no
    endpoint is configured.
    """
    logger.info("starting...")
    setup_signal_handling()

    last_hello_emitted = time.time()
    semaphore_file = settings.SEMAPHORE_FILE

    if settings.USE_ECS_TASK_STRATEGY:
        logger.info("using ECS Task strategy for semaphore token")

        # `len(...) is None` can never be true (and fails on None), so test
        # for a missing or empty endpoint directly
        if not settings.ECS_TASK_STRATEGY_ENDPOINT:
            logger.fatal("ECS_TASK_STRATEGY_ENDPOINT was empty")
            return

        r = requests.get(settings.ECS_TASK_STRATEGY_ENDPOINT)
        metadata = r.json()
        logger.debug("metadata was: " + json.dumps(metadata))

        task_arn = metadata["Labels"]["com.amazonaws.ecs.task-arn"]
        task_id = task_arn.split("/")[-1]
        logger.debug("task_id: " + task_id)

        semaphore_file = settings.ECS_TASK_STRATEGY_SEMAPHORE_FILE_TEMPLATE.replace(
            "##task_id##", task_id)
        logger.info("semaphore file set to " + semaphore_file)

    if settings.USE_SEMAPHORE_FILE_STRATEGY:
        if settings.SEMAPHORE_FILE_ENSURE_REMOVED:
            logger.info("ensuring semaphore file at " + semaphore_file +
                        " is removed")
            if os.path.isfile(semaphore_file):
                logger.info("semaphore file exists, removing")
                os.unlink(semaphore_file)

    while not requested_to_quit:
        # emit a periodic heartbeat so the logs show the process is alive
        age = int(time.time() - last_hello_emitted)
        if age > settings.SAY_HELLO_SECONDS:
            logger.info("running...")
            last_hello_emitted = time.time()
        time.sleep(settings.SLEEP_SECONDS)

    if requested_to_quit and settings.USE_SEMAPHORE_FILE_STRATEGY:
        logger.info("touching semaphore file at " + semaphore_file)
        # opening in append mode creates the file without truncating it
        with open(semaphore_file, 'a'):
            pass

    logger.info("done")
def remove_snapshot(snapshot_name):
    """
    Issue the DELETE request for `snapshot_name` in the configured repository.

    Any exception raised while building or sending the request is logged
    and re-raised.

    Returns True when the response status code is 200, False otherwise.
    """
    logger.info(f"remove_snapshot {snapshot_name}")

    try:
        url = f"{settings.ES_HOST}/_snapshot/{settings.REPOSITORY_NAME}/{snapshot_name}"
        r = requests.delete(url, timeout=settings.REQUEST_TIMEOUT_SECONDS)
    except Exception as e:
        logger.fatal(
            f"problem during removal of snapshot {snapshot_name}: {str(e)}")
        raise
    else:
        logger.debug(f"response: {r} {r.text}")
        removed = r.status_code == 200
        return removed
def main():
    """
    Run one snapshot cycle.

    Steps, in order: make sure the repository exists (creating it when it
    does not), remove old snapshots when `settings.REMOVE_OLDER_THAN_DAYS`
    is positive, then create a new snapshot.

    Any `Exception` raised along the way is logged, and also announced on
    Slack when `settings.ENABLE_SLACK` is set. It is not propagated.
    """
    announce("starting...")

    try:
        # creation is only attempted when the repository is missing
        if not repository_exists() and not create_repository():
            raise Exception("couldn't create repository")

        if settings.REMOVE_OLDER_THAN_DAYS > 0:
            remove_old_snapshots()

        if not create_snapshot():
            raise Exception("couldn't create snapshot")
    except Exception as e:
        message = str(e)
        logger.fatal(message)
        if settings.ENABLE_SLACK:
            slack_announce(message=message)
def discover(
    package: str,
    discovery_path: str = "./discovery.json",
    no_system_info: bool = False,
    no_install: bool = False,
) -> Discovery:
    """
    Discover capabilities and experiments.

    On success the discovery is written as JSON to `discovery_path` and
    returned. On `DiscoveryFailed` the error is logged and `None` is
    returned.
    """
    try:
        discovery = disco(
            package_name=package,
            discover_system=not no_system_info,
            download_and_install=not no_install,
        )
    except DiscoveryFailed as err:
        logger.debug("Failed to discover {}".format(package), exc_info=err)
        logger.fatal(str(err))
        return None

    with open(discovery_path, "w") as outfile:
        serialized = json.dumps(discovery, indent=2)
        outfile.write(serialized)
    logger.info("Discovery outcome saved in {p}".format(p=discovery_path))

    return discovery
def remove_old_snapshots():
    """
    Remove the repository snapshots older than the retention period.

    The snapshot list is fetched from the repository's `_all` endpoint. A
    snapshot is removed when `settings.REMOVE_OLDER_THAN_DAYS` is positive
    and the whole number of days since its `start_time` exceeds that value.

    Raises:
        Exception: the request failed (re-raised after logging), the
            response has no `snapshots` property, or a removal did not
            succeed.
    """
    logger.debug("remove_old_snapshots()")

    # get list of repository snapshots
    try:
        url = f"{settings.ES_HOST}/_snapshot/{settings.REPOSITORY_NAME}/_all"
        logger.debug(f"request: {url}")
        r = requests.get(url=url, timeout=settings.REQUEST_TIMEOUT_SECONDS)
    except Exception as e:
        logger.fatal(f"problem while getting list of snapshots: {str(e)}")
        raise

    logger.debug(f"response: {r} {r.text}")

    listing = r.json()
    if "snapshots" not in listing:
        logger.fatal(
            f"couldn't find 'snapshots' property in response: {r} {r.text}")
        raise Exception(
            f"problem parsing reply from list of snapshots: {r} {r.text}")

    for entry in listing["snapshots"]:
        start_time = entry["start_time"]
        snapshot_name = entry["snapshot"]

        # start_time is in Zulu time - e.g. "start_time": "2019-02-23T19:39:28.043Z"
        # this is ISO-8601 compliant
        snapshot_date = dateutil.parser.parse(start_time)
        age = datetime.now(timezone.utc) - snapshot_date

        expired = 0 < settings.REMOVE_OLDER_THAN_DAYS < age.days
        if not expired:
            continue

        logger.info(f"snapshot {snapshot_name} is a candidate for removal")
        if not remove_snapshot(snapshot_name):
            raise Exception("snapshot removal failed")
        announce(f"snapshot {snapshot_name} has been removed")
def changePassword(self, newPassword):
    """
    Replace the current password with `newPassword`.

    The change is refused when `newPassword` equals the current password or
    is already present in the password history. On success `newPassword` is
    stored in `self.curPwd` and appended to the history, dropping the oldest
    entry once the history reaches `MAX_HISTORICAL_PWDS` entries.

    Raises:
        PasswordMatchException: `newPassword` equals the current password.
        HistoricalPasswordMatchException: `newPassword` is in the history.
    """
    # Secrets must never reach the logs: the current, new and historical
    # passwords are deliberately left out of every message below.
    logger.debug("Password change requested")

    # check against cur pwd
    if newPassword == self.getCurrentPassword():
        logger.fatal("Passwords match, heading back.")
        raise PasswordMatchException("Matches Current Go Home")

    # check against historical pwds
    historicalPasswords = self.getPasswordHistory()
    if newPassword in historicalPasswords:
        logger.fatal("Password is in historical list, try again.")
        raise HistoricalPasswordMatchException("Found in historical list")
    logger.debug(
        "Current historical list size: {}".format(len(historicalPasswords)))

    # got here, so lets update the pwd and add it to the history
    # NOTE(review): the current and historical passwords are kept in clear
    # text here; they should be hashed before being stored.
    self.curPwd = newPassword
    historicalPasswords.append(newPassword)
    if len(historicalPasswords) >= MAX_HISTORICAL_PWDS:
        # drop the oldest entry
        del historicalPasswords[0]

    logger.debug("Password changed")
def run_experiment(experiment: Experiment,
                   settings: Settings = None) -> Journal:
    """
    Run the given `experiment` method step by step, in the following sequence:
    steady probe, action, close probe.

    Activities can be executed in background when they have the
    `"background"` property set to `true`. In that case, the activity is run in
    a thread. By the end of runs, those threads block until they are all
    complete.

    If the experiment has the `"dry"` property set to `True`, the experiment
    runs without actually executing the activities.

    NOTE: Tricky to make a decision whether we should rollback when exiting
    abnormally (Ctrl-C, SIGTERM...). After all, there is a chance we actually
    cannot afford to rollback properly. Better bailing to a conservative
    approach. This means we swallow :exc:`KeyboardInterrupt` and
    :exc:`SystemExit` and do not bubble it back up to the caller. When we
    were interrupted, we set the journal status to `"interrupted"` to notify
    the caller this was indeed not terminated properly.

    Returns the journal of the run.
    """
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    started_at = time.time()
    settings = settings if settings is not None else get_loaded_settings()
    config = load_configuration(experiment.get("configuration", {}))
    secrets = load_secrets(experiment.get("secrets", {}), config)
    initialize_global_controls(experiment, config, secrets, settings)
    initialize_controls(experiment, config, secrets)
    activity_pool, rollback_pool = get_background_pools(experiment)

    control = Control()
    journal = initialize_run_journal(experiment)

    try:
        try:
            control.begin(
                "experiment", experiment, experiment, config, secrets)

            # bound up front so the handler below can always read it, even
            # when the hypothesis raises before returning a state
            state = None

            # this may fail the entire experiment right there if any of the
            # probes fail or fall out of their tolerance zone
            try:
                state = run_steady_state_hypothesis(
                    experiment, config, secrets, dry=dry)
                journal["steady_states"]["before"] = state
                if state is not None and not state["steady_state_met"]:
                    p = state["probes"][-1]
                    raise ActivityFailed(
                        "Steady state probe '{p}' is not in the given "
                        "tolerance so failing this experiment".format(
                            p=p["activity"]["name"]))
            except ActivityFailed as a:
                journal["steady_states"]["before"] = state
                journal["status"] = "failed"
                logger.fatal(str(a))
            else:
                # the method only runs when the steady state held before it
                try:
                    journal["run"] = apply_activities(
                        experiment, config, secrets, activity_pool, dry)
                except Exception:
                    journal["status"] = "aborted"
                    logger.fatal(
                        "Experiment ran into an un expected fatal error, "
                        "aborting now.", exc_info=True)
                else:
                    # the method completed: check for a deviation
                    try:
                        state = run_steady_state_hypothesis(
                            experiment, config, secrets, dry=dry)
                        journal["steady_states"]["after"] = state
                        if state is not None and \
                                not state["steady_state_met"]:
                            journal["deviated"] = True
                            p = state["probes"][-1]
                            raise ActivityFailed(
                                "Steady state probe '{p}' is not in the "
                                "given tolerance so failing this "
                                "experiment".format(
                                    p=p["activity"]["name"]))
                    except ActivityFailed as a:
                        journal["status"] = "failed"
                        logger.fatal(str(a))
        except InterruptExecution as i:
            journal["status"] = "interrupted"
            logger.fatal(str(i))
        except (KeyboardInterrupt, SystemExit):
            journal["status"] = "interrupted"
            # Logger.warn() is a deprecated alias of Logger.warning()
            logger.warning("Received an exit signal, "
                           "leaving without applying rollbacks.")
        else:
            # rollbacks are only applied when the run was not interrupted
            journal["status"] = journal["status"] or "completed"
            journal["rollbacks"] = apply_rollbacks(
                experiment, config, secrets, rollback_pool, dry)

        journal["end"] = datetime.utcnow().isoformat()
        journal["duration"] = time.time() - started_at

        has_deviated = journal["deviated"]
        status = "deviated" if has_deviated else journal["status"]

        logger.info(
            "Experiment ended with status: {s}".format(s=status))

        if has_deviated:
            logger.info(
                "The steady-state has deviated, a weakness may have been "
                "discovered")

        control.with_state(journal)

        try:
            control.end(
                "experiment", experiment, experiment, config, secrets)
        except ChaosException:
            logger.debug("Failed to close controls", exc_info=True)

    finally:
        cleanup_controls(experiment)
        cleanup_global_controls()

    return journal
def switch_team_during_verification_run(source: str,  # noqa: C901
                                        settings: Settings) -> bool:
    """
    Verification may be run in a different team than the active team
    the user selected. Rather than preventing the verification from running,
    try to switch to the appropriate team's context for the duration of this
    run. It's all in memory and not changed on disk.

    `source` is either an HTTP(S) URL or a local file path to the
    verification.

    Returns True when the run can proceed (switching team when needed).
    Returns False, after logging the reason, when the user is not signed
    in, has no default team, has no auth token, cannot fetch the
    verification, or cannot access the owning team or its organization.

    Raises:
        InvalidSource: `source` is a local path that does not exist.
    """
    if not has_chaosiq_extension_configured(settings):
        logger.fatal(
            "Please signin to ChaosIQ services first with `$ chaos signin`")
        return False

    base_url = get_endpoint_url(settings)
    verify_tls = get_verify_tls(settings)
    default_org = get_default_org(settings)
    team = get_default_team(default_org)
    if not team:
        logger.fatal("Please select a default team with `$chaos team`")
        return False

    team_id = team["id"]

    token = get_auth_token(settings, base_url)
    if not token:
        logger.fatal(
            "Please signin to ChaosIQ services first with `$ chaos signin`")
        # without a token every request below would be sent with
        # "Bearer None", so bail out like the other guards do
        return False

    p = urlparse(source)
    if p.scheme.lower() in ["http", "https"]:
        r = requests.get(
            source, headers={"Authorization": "Bearer {}".format(token)},
            verify=verify_tls)
        if r.status_code != 200:
            logger.fatal("Failed to retrieve verification at '{}': {}".format(
                source, r.text))
            return False
        experiment = r.json()
    else:
        if not os.path.exists(p.path):
            raise InvalidSource('Path "{}" does not exist.'.format(source))
        experiment = parse_experiment_from_file(source)

    # the team recorded in the verification wins over the default team
    experiment_team_id = get_team_id(experiment)
    if experiment_team_id:
        team_id = experiment_team_id

    if not team_id:
        logger.fatal(
            "Failed to lookup the team identifier from the verification. "
            "Are you trying to run a verification using an experiment you "
            "created manually? This is not possible right now unfortunately.")
        return False

    if team["id"] != team_id:
        team_url = urls.team(
            urls.org(urls.base(base_url), organization_id=default_org["id"]),
            team_id=team_id)
        r = request_team(team_url, token, verify_tls)
        if r.status_code != 200:
            # the original literals were concatenated without a separating
            # space ("verification.Please")
            logger.fatal("You cannot access the team owning this verification. "
                         "Please request them to join the team.")
            return False

        team = r.json()
        if default_org["id"] != team["org_id"]:
            logger.fatal(
                "You must be signed in to the appropriate organization to run "
                "this verification. Please run `$ chaos signin`.")
            return False

        logger.debug("Running a verification in a team different from the "
                     "active one. Activating '{}' for this run.".format(
                         team["name"]))
        set_default_team(default_org, {
            "id": team_id,
            "default": True,
            "name": team["name"]
        })

    return True
def _run(self, strategy: Strategy, schedule: Schedule,  # noqa: C901
         experiment: Experiment, journal: Journal,
         configuration: Configuration, secrets: Secrets,
         settings: Settings,
         event_registry: EventHandlerRegistry) -> Journal:
    """
    Run the experiment and return its journal.

    The sequence is: optional steady-state hypothesis before the method
    (which gates the rest of the run), optional hypothesis during the
    method, the method itself, optional hypothesis after the method, then
    the rollbacks. Which hypothesis runs are played depends on `strategy`
    and on whether the experiment declares a steady-state hypothesis with
    probes.

    `InterruptExecution`, `KeyboardInterrupt` and `SystemExit` raised while
    running are not propagated: the journal status is set to
    `"interrupted"` and the event registry is notified. Controls are
    cleaned up and `event_registry.finish` is called in all cases.

    When `journal` is falsy, a new one is initialized from the experiment.
    """
    experiment["title"] = substitute(experiment["title"], configuration, secrets)
    logger.info("Running experiment: {t}".format(t=experiment["title"]))

    started_at = time.time()
    journal = journal or initialize_run_journal(experiment)
    event_registry.started(experiment, journal)

    control = Control()
    activity_pool, rollback_pool = get_background_pools(experiment)
    hypo_pool = get_hypothesis_pool()
    # set once the method has returned; it is also handed to the hypothesis
    # run played during the method
    continous_hypo_event = threading.Event()

    dry = experiment.get("dry", False)
    if dry:
        logger.warning("Dry mode enabled")

    initialize_global_controls(experiment, configuration, secrets, settings)
    initialize_controls(experiment, configuration, secrets)

    logger.info("Steady-state strategy: {}".format(strategy.value))
    # read from settings["runtime"]["rollbacks"]["strategy"], falling back
    # to "default" when any level is missing
    rollback_strategy = settings.get("runtime", {}).get("rollbacks", {}).get(
        "strategy", "default")
    logger.info("Rollbacks strategy: {}".format(rollback_strategy))

    exit_gracefully_with_rollbacks = True
    with_ssh = has_steady_state_hypothesis_with_probes(experiment)
    if not with_ssh:
        logger.info("No steady state hypothesis defined. That's ok, just "
                    "exploring.")

    try:
        try:
            control.begin("experiment", experiment, experiment, configuration,
                          secrets)

            # non-None placeholder: the method must still be played when no
            # hypothesis is run before it
            state = object()
            if with_ssh and should_run_before_method(strategy):
                state = run_gate_hypothesis(experiment, journal,
                                            configuration, secrets,
                                            event_registry, dry)

            # a None state here means the gate hypothesis produced no
            # state, so the method is skipped
            if state is not None:
                if with_ssh and should_run_during_method(strategy):
                    run_hypothesis_during_method(hypo_pool,
                                                 continous_hypo_event,
                                                 strategy, schedule,
                                                 experiment, journal,
                                                 configuration, secrets,
                                                 event_registry, dry)

                state = run_method(strategy, activity_pool, experiment,
                                   journal, configuration, secrets,
                                   event_registry, dry)

                # signal that the method is over
                continous_hypo_event.set()

                if journal["status"] not in ["interrupted", "aborted"]:
                    if with_ssh and (state is not None) and \
                            should_run_after_method(strategy):
                        run_deviation_validation_hypothesis(
                            experiment, journal, configuration, secrets,
                            event_registry, dry)
        except InterruptExecution as i:
            journal["status"] = "interrupted"
            logger.fatal(str(i))
            event_registry.interrupted(experiment, journal)
        except KeyboardInterrupt:
            journal["status"] = "interrupted"
            logger.warning("Received a termination signal (Ctrl-C)...")
            event_registry.signal_exit()
        except SystemExit as x:
            journal["status"] = "interrupted"
            logger.warning("Received the exit signal: {}".format(x.code))

            # exit code 30 asks for the rollbacks to be skipped
            exit_gracefully_with_rollbacks = x.code != 30
            if not exit_gracefully_with_rollbacks:
                logger.warning("Ignoring rollbacks as per signal")
            event_registry.signal_exit()
        finally:
            # wait for the hypothesis pool to finish before moving on to the
            # rollbacks
            hypo_pool.shutdown(wait=True)

        # just in case a signal overrode everything else to tell us not to
        # play them anyway (see the exit.py module)
        if exit_gracefully_with_rollbacks:
            run_rollback(rollback_strategy, rollback_pool, experiment,
                         journal, configuration, secrets, event_registry,
                         dry)

        journal["end"] = datetime.utcnow().isoformat()
        journal["duration"] = time.time() - started_at

        # the spec only allows these statuses, so if it's anything else
        # we override to "completed"
        if journal["status"] not in ("completed", "failed", "aborted",
                                     "interrupted"):
            journal["status"] = "completed"

        has_deviated = journal["deviated"]
        # "deviated" is only used for this log line; the journal status
        # itself is left as is
        status = "deviated" if has_deviated else journal["status"]
        logger.info("Experiment ended with status: {s}".format(s=status))
        if has_deviated:
            logger.info(
                "The steady-state has deviated, a weakness may have been "
                "discovered")

        control.with_state(journal)
        try:
            control.end("experiment", experiment, experiment, configuration,
                        secrets)
        except ChaosException:
            # failing to close the controls must not fail the run
            logger.debug("Failed to close controls", exc_info=True)
    finally:
        try:
            cleanup_controls(experiment)
            cleanup_global_controls()
        finally:
            # always notify the handlers, even when the cleanup fails
            event_registry.finish(journal)

    return journal