示例#1
0
def test_delete_helm_release_failure(mocker):
    """delete_helm_release raises RuntimeError when the patched system command returns ("", 1, "")."""
    # Replace the Config class used by util.helm so that instances expose a fixed config path.
    config_cls = mocker.patch('util.helm.Config')
    config_cls.return_value.config_path = '/usr/ogorek/nctl_config'

    # The middle element (1) is presumably the command's exit code - confirm against util.helm.
    failing_result = ("", 1, "")
    mocker.patch("util.helm.execute_system_command", return_value=failing_result)

    with pytest.raises(RuntimeError):
        delete_helm_release(test_username)
示例#2
0
def test_delete_helm_release_success(mocker):
    """delete_helm_release succeeds and issues exactly one system command when deletion is reported."""
    deleted_output = f"release \"{test_username}\" deleted"
    not_found_output = f"release: \"{test_username}\" not found"

    # Two canned responses are queued, but only the first one is expected to be consumed.
    canned_responses = [
        (deleted_output, 0, deleted_output),
        (not_found_output, 0, not_found_output),
    ]
    command_mock = mocker.patch("util.helm.execute_system_command",
                                side_effect=canned_responses)

    # Replace the Config class used by util.helm so that instances expose a fixed config path.
    config_cls = mocker.patch('util.helm.Config')
    config_cls.return_value.config_path = '/usr/ogorek/nctl_config'

    delete_helm_release(test_username)

    assert command_mock.call_count == 1
示例#3
0
def cancel_experiment_runs(runs_to_cancel: List[Run],
                           namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Cancel every Run from the given list; all of them are expected to reside in one namespace.

    For a Run that is not yet in the CANCELLED state the helm release is deleted (without purge),
    the state is set to CANCELLED, the end timestamp is set to the current UTC time and update()
    is called on the Run. A Run that is already CANCELLED is reported as cancelled without any
    further action. A failure for one Run is logged and echoed and does not stop the processing
    of the remaining ones.

    :param runs_to_cancel: Runs to be cancelled
    :param namespace: namespace where Run instances reside
    :return: tuple of two lists - Runs that were cancelled successfully and Runs that were not
    """
    cancelled: List[Run] = []
    failed: List[Run] = []

    try:
        for run in runs_to_cancel:
            logger.debug(f"Cancelling {run.name} run ...")
            # NOTE: experiment_name is not a parameter of this function - it is expected to be
            # available in the enclosing module scope.
            start_message = Texts.CANCELING_RUNS_START_MSG.format(
                run_name=run.name, experiment_name=experiment_name)
            click.echo(start_message)

            try:
                # Runs that are already cancelled need no further work.
                if run.state != RunStatus.CANCELLED:
                    progress_text = Texts.CANCEL_SETTING_STATUS_MSG.format(
                        run_name=run.name)
                    with spinner(text=progress_text):
                        delete_helm_release(release_name=run.name,
                                            namespace=namespace,
                                            purge=False)
                        # Mark the run as cancelled and record when it ended.
                        run.state = RunStatus.CANCELLED
                        finished_at = datetime.utcnow()
                        run.end_timestamp = finished_at.strftime(
                            "%Y-%m-%dT%H:%M:%SZ")
                        run.update()
                cancelled.append(run)
            except Exception:
                error_message = Texts.INCOMPLETE_CANCEL_ERROR_MSG.format(
                    run_name=run.name, experiment_name=experiment_name)
                logger.exception(error_message)
                click.echo(error_message)
                failed.append(run)
    except Exception:
        # Anything raised outside of the per-run handling ends the loop; whatever has been
        # collected so far is still returned.
        logger.exception("Error during cancelling experiments")

    return cancelled, failed
示例#4
0
def ctrl_c_handler_for_submit(sig, frame):
    """
    Clean up after ctrl-c has been pressed while an experiment was being submitted.

    Removes the run objects and purges the helm releases of all runs stored in the module-level
    submitted_runs, removes the experiment object named by submitted_experiment, sends SIGKILL
    to all (recursive) child processes of the current process and finally exits with code 1.
    Errors raised during the cleanup are only logged.

    :param sig: not used by this handler
    :param frame: not used by this handler
    """
    log.debug("ctrl-c pressed while submitting")
    try:
        with spinner(text=Texts.CTRL_C_PURGING_PROGRESS_MSG):
            # A falsy submitted_runs (e.g. None or an empty list) means there is nothing to remove.
            for submitted_run in submitted_runs or ():
                try:
                    # delete run
                    delete_k8s_object("run", submitted_run.name)
                    # purge helm release
                    delete_helm_release(submitted_run.name,
                                        namespace=submitted_namespace,
                                        purge=True)
                except Exception:
                    # A failure for one run must not prevent the removal of the others.
                    log.exception(Texts.ERROR_WHILE_REMOVING_RUNS)
            delete_k8s_object("experiment", submitted_experiment)
    except Exception:
        log.exception(Texts.ERROR_WHILE_REMOVING_EXPERIMENT)

    # Kill every descendant process of the current process before exiting.
    current_process = psutil.Process(os.getpid())
    for child in current_process.children(recursive=True):
        child.send_signal(signal.SIGKILL)

    exit(1)
示例#5
0
def purge_experiment(exp_name: str, runs_to_purge: List[Run],
                     k8s_es_client: K8sElasticSearchClient,
                     namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Purge the given runs of an experiment and, when they make up the whole experiment, the
    experiment itself.

    The runs are first cancelled with cancel_experiment_runs; every run that was cancelled then has
    its helm release purged and its "run" Kubernetes object deleted. If the number of runs listed
    for the experiment equals the number of runs passed in runs_to_purge, the whole experiment is
    treated as purged: its state is set to CANCELLING, its Argo workflows and its git repo manager
    tag are removed and - provided that no run failed to be purged - the "experiment" object is
    deleted. If the experiment contains more runs than were passed, the experiment object itself
    is left untouched.

    Errors raised after the cancelling step has started are logged and the lists collected so
    far are returned. Errors raised earlier (fetching the experiment, listing its runs, updating
    its state) propagate to the caller.

    :param exp_name: name of the experiment to which the runs passed in runs_to_purge belong
    :param runs_to_purge: list of runs that should be purged, they have to belong to exp_name experiment
    :param k8s_es_client: Kubernetes ElasticSearch client used to delete logs of purged runs
    :param namespace: namespace where experiment is located
    :return: two lists - the first contains runs that were purged successfully, the second contains
             runs that were not cancelled or not purged
    :raises RuntimeError: when the experiment with the given name cannot be found
    """
    logger.debug(f"Purging {exp_name} experiment ...")

    purged_runs: List[Run] = []
    not_purged_runs: List[Run] = []

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    experiment_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name])
    # The whole experiment is purged only when the number of its runs equals the number of runs
    # requested to be purged (only the counts are compared, not the runs themselves).
    cancel_whole_experiment = (len(experiment_runs) == len(runs_to_purge))
    if cancel_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        cancelled_runs, not_cancelled_runs = cancel_experiment_runs(
            runs_to_cancel=runs_to_purge, namespace=namespace)
        # From now on not_purged_runs refers to the very same list object as not_cancelled_runs,
        # so runs appended below are added to the runs that failed to be cancelled.
        not_purged_runs = not_cancelled_runs

        if cancel_whole_experiment:
            # Delete associated workflows
            experiment_associated_workflows = [
                wf for wf in ArgoWorkflow.list(namespace=namespace)
                if wf.labels.get('experimentName') == experiment.name
            ]
            for wf in experiment_associated_workflows:
                wf.delete()

            # Remove tags from git repo manager
            try:
                delete_exp_tag_from_git_repo_manager(
                    experiment_name=experiment.name,
                    username=namespace,
                    experiments_workdir=get_run_environment_path(''))
            except Exception:
                # The error is reported through handle_error and re-raised; the outer handler
                # below then logs it and returns, so no run is purged in that case.
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG)
                raise

        # Only runs that were cancelled successfully are purged.
        for run in cancelled_runs:
            logger.debug(f"Purging {run.name} run ...")
            click.echo(Texts.PURGING_START_MSG.format(run_name=run.name))
            try:
                with spinner(text=Texts.PURGING_PROGRESS_MSG.format(
                        run_name=run.name)):
                    # purge helm release
                    delete_helm_release(run.name,
                                        namespace=namespace,
                                        purge=True)
                    # delete run
                    kubectl.delete_k8s_object("run", run.name)
                    purged_runs.append(run)
            except Exception as exe:
                not_purged_runs.append(run)
                logger.exception("Error during purging runs.")
                # occurrence of NotFound error may mean, that run has been removed earlier;
                # any other error is echoed and re-raised, which ends the loop because the
                # outer handler returns the lists collected so far
                # NOTE: experiment_name is not defined in this function - it is expected to be
                # available in the enclosing module scope.
                if "NotFound" not in str(exe):
                    click.echo(
                        Texts.INCOMPLETE_PURGE_ERROR_MSG.format(
                            experiment_name=experiment_name))
                    raise exe
            try:
                # clear run logs - attempted only when the current user is an administrator;
                # a failure here is logged and does not affect the returned lists
                if is_current_user_administrator():
                    logger.debug(f"Clearing logs for {run.name} run.")
                    with spinner(text=Texts.PURGING_LOGS_PROGRESS_MSG.format(
                            run_name=run.name)):
                        k8s_es_client.delete_logs_for_run(run=run.name,
                                                          namespace=namespace)
            except Exception:
                logger.exception("Error during clearing run logs.")

            # CAN-1099 - docker garbage collector has errors that prevent from correct removal of images
            # try:
            # try to remove images from docker registry
            #    delete_images_for_experiment(exp_name=run.name)
            # except Exception:
            #    logger.exception("Error during removing images.")

        # The experiment object is removed only when the whole experiment was requested and
        # every run has been cancelled and purged without errors.
        if cancel_whole_experiment and not not_purged_runs:
            try:
                kubectl.delete_k8s_object("experiment", exp_name)
            except Exception:
                # problems during deleting experiments are hidden as if runs were
                # cancelled user doesn't have a possibility to remove them
                logger.exception("Error during purging experiment.")

    except Exception:
        # Best effort: log the problem and report what has been collected up to this point.
        logger.exception("Error during purging experiment.")
        return purged_runs, not_purged_runs

    return purged_runs, not_purged_runs