def create_jobs(job_request):
    # NOTE: Similar but non-identical logic is implemented for running jobs
    # locally in `jobrunner.cli.local_run.create_job_request_and_jobs`. If you
    # make changes below then consider what the appropriate corresponding
    # changes are for locally run jobs.
    validate_job_request(job_request)
    project_file = get_project_file(job_request)
    pipeline_config = load_pipeline(project_file)
    latest_jobs = get_latest_jobs_for_actions_in_project(
        job_request.workspace, pipeline_config
    )
    new_jobs = get_new_jobs_to_run(job_request, pipeline_config, latest_jobs)
    assert_new_jobs_created(new_jobs, latest_jobs)
    resolve_reusable_action_references(new_jobs)
    # There is a delay between getting the current jobs (which we fetch from
    # the database and the disk) and inserting our new jobs below. This means
    # the state of the world may have changed in the meantime. Why is this OK?
    #
    # Because we're single threaded and because this function is the only place
    # jobs are created, we can guarantee that no *new* jobs were created. So
    # the only state change that's possible is that some active jobs might have
    # completed. That's unproblematic: any new jobs which are waiting on these
    # now-already-completed jobs will see they have completed the first time
    # they check and then proceed as normal.
    #
    # (It is also possible that someone could delete files off disk that are
    # needed by a particular job, but there's not much we can do about that
    # other than fail gracefully when trying to start the job.)
    insert_into_database(job_request, new_jobs)
    return len(new_jobs)
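A minimal caller sketch for create_jobs, assuming a JobRequest built like the one in Example #4 below (JobRequest and random_id come from the same jobrunner code); the repo URL, commit, workspace and database names here are placeholders, not values from the original project.

job_request = JobRequest(
    id=random_id(),
    repo_url="https://example.com/org/repo",  # placeholder, not a real repo
    commit="abc1234",                         # placeholder commit SHA
    requested_actions=["generate_cohort"],
    cancelled_actions=[],
    workspace="my-workspace",
    database_name="default",
    force_run_dependencies=False,
    force_run_failed=False,
    branch="main",
    original={"created_by": "example-user"},
)
created_count = create_jobs(job_request)
print(f"Queued {created_count} new job(s)")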
Example #2
    def dispatch(self, request, *args, **kwargs):
        try:
            self.workspace = Workspace.objects.get(
                project__org__slug=self.kwargs["org_slug"],
                project__slug=self.kwargs["project_slug"],
                name=self.kwargs["workspace_slug"],
            )
        except Workspace.DoesNotExist:
            return redirect("/")

        if not has_permission(
                request.user, "job_run", project=self.workspace.project):
            raise Http404

        if self.workspace.is_archived:
            msg = ("You cannot create Jobs for an archived Workspace."
                   "Please contact an admin if you need to have it unarchved.")
            messages.error(request, msg)
            return redirect(self.workspace)

        # Some backends might need to be disabled. This view only uses
        # backends the user can see, so we look them up here and remove the
        # relevant ones from the queryset before checking below whether any
        # remain. The form (in get_form_kwargs) also uses the backends
        # constructed here, to stay consistent; a sketch of that method
        # follows dispatch() below.
        self.backends = request.user.backends.all()
        if settings.DISABLE_CREATING_JOBS:
            self.backends = self.backends.exclude(
                Q(slug="emis") | Q(slug="tpp"))

        # jobs need to be run on a backend so the user needs to have access to
        # at least one
        if not self.backends.exists():
            raise Http404

        # Build the actions as a list, or render the exception to the page.
        ref = self.kwargs.get("ref", self.workspace.branch)
        try:
            self.project = get_project(
                self.workspace.repo_owner,
                self.workspace.repo_name,
                ref,
            )
            data = load_pipeline(self.project)
        except Exception as e:
            self.actions = []
            # this is a bit nasty, need to mirror what get/post would set up for us
            self.object = None
            context = self.get_context_data(actions_error=str(e))
            return self.render_to_response(context=context)

        self.actions = list(get_actions(data))
        return super().dispatch(request, *args, **kwargs)
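
    # Hypothetical companion method, not shown in the original snippet. The
    # comment in dispatch() notes that the form reuses the backends built
    # there via get_form_kwargs; one plausible shape is sketched below, and
    # the "backends"/"actions" kwarg names are assumptions.
    def get_form_kwargs(self):
        kwargs = super().get_form_kwargs()
        kwargs["backends"] = self.backends
        kwargs["actions"] = self.actions
        return kwargs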
Example #3
def test_local_run_copes_with_detritus_of_earlier_interrupted_run(
    extraction_tool, tmp_path
):
    # This test simulates the case where an earlier run has been interrupted
    # (for example by the user pressing Ctrl-C). In particular we put a couple
    # of jobs in unfinished states, which they could never be left in under
    # normal operation. The correct behaviour of the local run, which this
    # test checks, is for such unfinished jobs to be marked as cancelled on
    # the next run.
    project_dir = tmp_path / "project"
    shutil.copytree(str(FIXTURE_DIR / "full_project"), project_dir)
    config.DATABASE_FILE = project_dir / "metadata" / "db.sqlite"

    project = load_pipeline(project_dir / "project.yaml")
    database.insert(SavedJobRequest(id="previous-request", original={}))

    def job(job_id, action, state):
        spec = get_action_specification(
            project,
            action,
            using_dummy_data_backend=config.USING_DUMMY_DATA_BACKEND,
        )
        return Job(
            id=job_id,
            job_request_id="previous-request",
            state=state,
            status_message="",
            repo_url=str(project_dir),
            workspace=project_dir.name,
            database_name="a-database",
            action=action,
            wait_for_job_ids=[],
            requires_outputs_from=spec.needs,
            run_command=spec.run,
            output_spec=spec.outputs,
            created_at=int(time.time()),
            updated_at=int(time.time()),
            outputs={},
        )

    # FIXME: consolidate these when databuilder supports more columns in dummy data
    if extraction_tool == "cohortextractor":
        actions = ["generate_cohort", "prepare_data_m_cohortextractor"]
    else:
        actions = ["generate_dataset", "analyse_data_databuilder"]

    database.insert(job(job_id="123", action=actions[0], state=State.RUNNING))
    database.insert(job(job_id="456", action=actions[1], state=State.PENDING))
    assert local_run.main(project_dir=project_dir, actions=[actions[1]])

    assert database.find_one(Job, id="123").cancelled
    assert database.find_one(Job, id="123").state == State.FAILED
    assert database.find_one(Job, id="456").cancelled
    assert database.find_one(Job, id="456").state == State.FAILED
Example #4
def create_job_request_and_jobs(project_dir, actions, force_run_dependencies):
    job_request = JobRequest(
        id=random_id(),
        repo_url=str(project_dir),
        commit=None,
        requested_actions=actions,
        cancelled_actions=[],
        workspace=project_dir.name,
        database_name="dummy",
        force_run_dependencies=force_run_dependencies,
        # The default behaviour of refusing to run if a dependency has failed
        # makes for an awkward workflow when iterating in development
        force_run_failed=True,
        branch="",
        original={"created_by": getuser()},
    )

    project_file_path = project_dir / "project.yaml"
    if not project_file_path.exists():
        raise ProjectValidationError(
            f"No project.yaml file found in {project_dir}")
    # NOTE: Similar but non-identical logic is implemented for running jobs in
    # production in `jobrunner.create_or_update_jobs.create_jobs`. If you make
    # changes below then consider what, if any, the appropriate corresponding
    # changes might be for production jobs.
    pipeline_config = load_pipeline(project_file_path)
    latest_jobs = calculate_workspace_state(job_request.workspace)

    # On the server, out-of-band deletion of an existing output is considered
    # an error, so we ignore that case when scheduling and allow jobs with
    # missing dependencies to fail loudly when they are actually run. However
    # for local running we should allow researchers to delete outputs on disk
    # and automatically rerun the actions that create them if they are needed.
    # So here we check whether any files are missing for completed actions
    # and, if so, treat those actions as though they had not been run -- this
    # will automatically trigger a rerun (a hypothetical sketch of the check
    # appears after this function).
    latest_jobs_with_files_present = [
        job for job in latest_jobs
        if all_output_files_present(project_dir, job)
    ]

    try:
        if not actions:
            raise UnknownActionError("At least one action must be supplied")
        new_jobs = get_new_jobs_to_run(job_request, pipeline_config,
                                       latest_jobs_with_files_present)
    except UnknownActionError as e:
        # Annotate the exception with a list of valid action names so we can
        # show them to the user
        e.valid_actions = [RUN_ALL_COMMAND] + pipeline_config.all_actions
        raise e
    assert_new_jobs_created(new_jobs, latest_jobs_with_files_present)
    resolve_reusable_action_references(new_jobs)
    insert_into_database(job_request, new_jobs)
    return job_request, new_jobs
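
The all_output_files_present helper used above is not shown in this listing, so the sketch below is a hypothetical stand-in only. It assumes Job.outputs maps relative output paths to metadata (as suggested by the outputs={} field in Example #3), which may differ from the real implementation.

def all_output_files_present(project_dir, job):
    # Hypothetical sketch: treat the keys of job.outputs as paths relative to
    # the project directory and require each one to exist on disk.
    return all((project_dir / path).exists() for path in job.outputs)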
Example #5
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scikitplot as skplt
import itertools

from titanic_model.config import config
from titanic_model import __version__ as _version
from titanic_model import logger
from pipeline import load_pipeline
import typing as t

pipeline_file_name = f"{config.PIPELINE_SAVE_FILE}{_version}.pkl"
_titanic_pipe = load_pipeline(file_name=pipeline_file_name)


def make_prediction(*, input_data: t.Union[pd.DataFrame, dict]) -> dict:
    """Make predictions with the persisted pipeline."""
    results = _titanic_pipe.predict(input_data)
    # Wrap the raw predictions so the return value matches the annotated
    # dict return type; include the model version for traceability.
    return {"predictions": results, "model_version": _version}


if __name__ == '__main__':

    # test pipeline
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score