def test_pipeline_many_valid_inputs_clean(self):
    """A Pipeline with multiple, properly indexed inputs is clean."""
    p = Pipeline(family=PipelineFamily())
    self.add_inputs(p,
                    TransformationInput(dataset_idx=2),
                    TransformationInput(dataset_idx=1),
                    TransformationInput(dataset_idx=3))
    p.clean()
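# The ``add_inputs`` helper used throughout these tests is defined elsewhere
# in the suite. A minimal sketch of what it might do, assuming it only has to
# attach unsaved TransformationInput objects to an in-memory transformation
# (the sketch's name and body are illustrative assumptions, not the project's
# actual helper):
def add_inputs_sketch(self, transformation, *inputs):
    """Attach each input to the transformation's ``inputs`` relation."""
    for transformation_input in inputs:
        # Point each input back at its owning transformation before adding it.
        transformation_input.transformation = transformation
        transformation.inputs.add(transformation_input)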
def test_pipeline_no_inputs_no_steps(self):
    """A Pipeline with no inputs and no steps is clean but not complete."""
    p = Pipeline(family=PipelineFamily())
    p.clean()
    self.assertRaisesRegexp(
        ValidationError,
        re.escape("Pipeline {} has no steps".format(p)),
        p.complete_clean)
def test_pipeline_one_valid_input_no_steps(self):
    """A Pipeline with one valid input, but no steps, is clean but not complete."""
    p = Pipeline(family=PipelineFamily())
    self.add_inputs(p, TransformationInput(dataset_idx=1))
    p.clean()
    self.assertRaisesRegexp(
        ValidationError,
        re.escape("Pipeline {} has no steps".format(p)),
        p.complete_clean)
def test_pipeline_many_valid_steps_clean(self):
    """Test step index check, well-indexed multi-step case."""
    p = Pipeline(family=PipelineFamily())
    self.add_inputs(p, TransformationInput(dataset_idx=1))
    m = Method()
    self.add_inputs(m, TransformationInput(dataset_idx=1))
    p.steps.add(PipelineStep(pipeline=p, transformation=m, step_num=2))
    p.steps.add(PipelineStep(pipeline=p, transformation=m, step_num=1))
    p.steps.add(PipelineStep(pipeline=p, transformation=m, step_num=3))
    p.clean()
def create_valid_pipeline(self):
    """Build a one-step pipeline whose input is cabled through to an output."""
    p = Pipeline(family=PipelineFamily())
    self.add_inputs(p, self.create_input(datatypes.STR_PK, dataset_idx=1))
    m = Method()
    # Point the unsaved Method's downcast attribute back at itself so code
    # that asks the transformation for its Method finds this instance.
    m.method = m
    self.add_inputs(m, self.create_input(datatypes.STR_PK, dataset_idx=1))
    self.add_outputs(m, self.create_output(datatypes.STR_PK, dataset_idx=1))
    step1 = PipelineStep(pipeline=p, transformation=m, step_num=1)
    p.steps.add(step1)
    cable = PipelineStepInputCable(pipelinestep=step1,
                                   source_step=0,
                                   source=p.inputs.all()[0],
                                   dest=m.inputs.all()[0])
    # Same trick for the cable's downcast pointer.
    cable.pipelinestepinputcable = cable
    step1.cables_in.add(cable)
    outcable = PipelineOutputCable(pipeline=p,
                                   output_idx=1,
                                   source_step=1,
                                   source=m.outputs.all()[0],
                                   output_cdt=m.outputs.all()[0].get_cdt())
    p.outcables.add(outcable)
    return p
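# A caller of the helper above might look like this (an illustrative sketch;
# the test name and assertion are assumptions, not taken from the suite):
def test_create_valid_pipeline_is_clean_sketch(self):
    p = self.create_valid_pipeline()
    p.clean()  # a fully cabled one-step pipeline should validate cleanly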
def test_get_action_specification_without_dummy_data_file_flag(tmp_path):
    dummy_data_file = tmp_path / "test.csv"
    with dummy_data_file.open("w") as f:
        f.write("test")

    config = Pipeline(
        **{
            "version": 1,
            "actions": {
                "generate_cohort": {
                    "run": "cohortextractor:latest generate_cohort",
                    "outputs": {
                        "moderately_sensitive": {"cohort": "output/input.csv"}
                    },
                    "dummy_data_file": dummy_data_file,
                }
            },
        }
    )

    action_spec = get_action_specification(config, "generate_cohort")

    expected = "cohortextractor:latest generate_cohort --output-dir=output"
    assert action_spec.run == expected
def test_get_action_specification_for_databuilder_action():
    config = Pipeline(
        **{
            "version": 3,
            "expectations": {"population_size": 1000},
            "actions": {
                "generate_dataset": {
                    "run": "databuilder:latest generate_dataset "
                           "--dataset_definition=dataset_definition.py "
                           "--output=output/dataset.csv "
                           "--dummy-data-file=dummy.csv",
                    "outputs": {
                        "highly_sensitive": {"dataset": "output/dataset.csv"}
                    },
                }
            },
        }
    )

    action_spec = get_action_specification(
        config, "generate_dataset", using_dummy_data_backend=True
    )

    assert action_spec.run == (
        "databuilder:latest generate_dataset "
        "--dataset_definition=dataset_definition.py "
        "--output=output/dataset.csv "
        "--dummy-data-file=dummy.csv"
    )
def test_get_action_specification_databuilder_has_output_flag():
    config = Pipeline(
        **{
            "version": 3,
            "expectations": {"population_size": 1000},
            "actions": {
                "generate_dataset": {
                    "run": "databuilder:latest generate_dataset --output=output/dataset.csv",
                    "outputs": {
                        "highly_sensitive": {
                            "cohort": "output/dataset.csv",
                            "cohort2": "output/input2.csv",
                        }
                    },
                },
            },
        }
    )

    action_spec = get_action_specification(config, "generate_dataset")

    assert action_spec.run == (
        "databuilder:latest generate_dataset --output=output/dataset.csv"
    )
def test_get_actions_missing_needs():
    dummy = Pipeline(
        **{
            "version": 3,
            "expectations": {},
            "actions": {
                "frobnicate": {
                    "run": "test",
                    "outputs": {"highly_sensitive": {"cohort": "/some/path"}},
                },
            },
        }
    )

    output = list(get_actions(dummy))

    expected = [
        {"name": "frobnicate", "needs": []},
        {"name": "run_all", "needs": ["frobnicate"]},
    ]
    assert output == expected
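# Judging from the expected output above, ``get_actions`` yields one entry per
# configured action (filling in an empty ``needs`` list when none is given)
# plus a synthetic ``run_all`` action that depends on every other action. A
# minimal sketch under those assumptions (not the library's actual
# implementation):
def get_actions_sketch(pipeline):
    for name, action in pipeline.actions.items():
        # ``action.needs`` is assumed to be None or a list of action names.
        yield {"name": name, "needs": getattr(action, "needs", None) or []}
    yield {"name": "run_all", "needs": list(pipeline.actions)}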
def test_get_action_specification_for_databuilder_errors():
    config = Pipeline(
        **{
            "version": 3,
            "expectations": {"population_size": 1_000},
            "actions": {
                "generate_dataset": {
                    "run": "databuilder:latest generate_dataset "
                           "--dataset_definition=dataset_definition.py "
                           "--output=output/dataset.csv",
                    "outputs": {
                        "highly_sensitive": {"dataset": "output/dataset.csv"}
                    },
                }
            },
        }
    )

    msg = "--dummy-data-file is required for a local run"
    with pytest.raises(ProjectValidationError, match=msg):
        get_action_specification(
            config,
            "generate_dataset",
            using_dummy_data_backend=True,
        )
def test_get_action_specification_for_cohortextractor_generate_cohort_action():
    config = Pipeline(
        **{
            "version": 3,
            "expectations": {"population_size": 1000},
            "actions": {
                "generate_cohort": {
                    "run": "cohortextractor:latest generate_cohort",
                    "outputs": {
                        "highly_sensitive": {"cohort": "output/input.csv"}
                    },
                }
            },
        }
    )

    action_spec = get_action_specification(
        config, "generate_cohort", using_dummy_data_backend=True
    )

    assert (
        action_spec.run
        == """cohortextractor:latest generate_cohort --expectations-population=1000 --output-dir=output"""
    )
def test_pipeline_one_invalid_input_clean(self):
    """A Pipeline with one input not numbered "1" is not clean."""
    p = Pipeline(family=PipelineFamily())
    self.add_inputs(p, TransformationInput(dataset_idx=4))

    error = "Inputs are not consecutively numbered starting from 1"
    self.assertRaisesRegexp(ValidationError, error, p.clean)
    self.assertRaisesRegexp(ValidationError, error, p.complete_clean)
def test_pipeline_many_invalid_inputs_clean(self):
    """A Pipeline with multiple, badly indexed inputs is not clean."""
    p = Pipeline(family=PipelineFamily())
    self.add_inputs(p,
                    TransformationInput(dataset_idx=2),
                    TransformationInput(dataset_idx=3),
                    TransformationInput(dataset_idx=4))
    self.assertRaisesRegexp(
        ValidationError,
        "Inputs are not consecutively numbered starting from 1",
        p.clean)
def __init__(self, *args, **kwargs):
    super(PipelineSerializer, self).__init__(*args, **kwargs)

    # Restrict the querysets of the related model fields to what the
    # requesting user is allowed to see.
    curr_user = self.context["request"].user

    revision_parent_field = self.fields["revision_parent"]
    revision_parent_field.queryset = Pipeline.filter_by_user(curr_user)

    family_field = self.fields["family"]
    family_field.queryset = PipelineFamily.filter_by_user(curr_user)
def test_pipeline_many_invalid_steps_clean(self):
    """Test step index check, badly-indexed multi-step case."""
    p = Pipeline(family=PipelineFamily())
    self.add_inputs(p, TransformationInput(dataset_idx=1))
    m = Method()
    self.add_inputs(m, TransformationInput(dataset_idx=1))
    p.steps.add(PipelineStep(pipeline=p, transformation=m, step_num=1))
    p.steps.add(PipelineStep(pipeline=p, transformation=m, step_num=4))
    p.steps.add(PipelineStep(pipeline=p, transformation=m, step_num=5))
    self.assertRaisesRegexp(
        ValidationError,
        "Steps are not consecutively numbered starting from 1",
        p.clean)
def test_build_removal_plan_for_used_image(self):
    image = DockerImage(id=99, name='doomed')
    method = image.methods.create(transformation_ptr_id=100)
    step = method.pipelinesteps.create(id=101)
    step.pipeline = Pipeline(transformation_ptr_id=102)
    step.pipeline.family = PipelineFamily()

    expected_plan = empty_removal_plan()
    expected_plan['DockerImages'].add(image)
    expected_plan['Methods'].add(method)
    expected_plan['Pipelines'].add(step.pipeline)

    plan = image.build_removal_plan()

    self.assertEqual(expected_plan, plan)
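# ``empty_removal_plan`` is used above but not defined in this section. A
# minimal sketch, assuming it simply maps each removable model class to an
# empty set (the exact key list is an assumption based on the keys the test
# touches):
def empty_removal_plan_sketch():
    return {key: set() for key in ('DockerImages', 'Methods', 'Pipelines')}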
def test_get_action_specification_with_unknown_action():
    config = Pipeline(
        **{
            "version": 1,
            "actions": {
                "known_action": {
                    "run": "python:latest python test.py",
                    "outputs": {
                        "moderately_sensitive": {"cohort": "output/input.csv"}
                    },
                }
            },
        }
    )

    msg = "Action 'unknown_action' not found in project.yaml"
    with pytest.raises(UnknownActionError, match=msg):
        get_action_specification(config, "unknown_action")
def _choose_inputs_for_batch(request, pipeline_pk, start_form=None,
                             input_error_message=''):
    """Load the input selection page."""
    template = loader.get_template("sandbox/choose_inputs.html")

    pipeline_qs = Pipeline.filter_by_user(request.user).filter(pk=pipeline_pk)
    pipeline = pipeline_qs.first()
    if pipeline is None:
        raise Http404("ID {} is not accessible".format(pipeline_pk))

    if start_form is None:
        start_form = StartRunBatchForm({"pipeline": pipeline},
                                       pipeline_qs=pipeline_qs)

    context = {"inputs": pipeline.inputs.order_by("dataset_idx"),
               "start_form": start_form,
               "input_error_msg": input_error_message,
               "pipeline": pipeline,
               "priolist": [t[0] for t in settings.SLURM_QUEUES]}
    return HttpResponse(template.render(context, request))
def test_get_action_specification_with_config():
    config = Pipeline(
        **{
            "version": 3,
            "expectations": {"population_size": 1_000},
            "actions": {
                "my_action": {
                    "run": "python:latest python action/__main__.py output/input.csv",
                    "config": {"option": "value"},
                    "outputs": {
                        "moderately_sensitive": {
                            "my_figure": "output/my_figure.png"
                        }
                    },
                }
            },
        }
    )

    action_spec = get_action_specification(config, "my_action")

    assert (
        action_spec.run
        == """python:latest python action/__main__.py output/input.csv --config '{"option": "value"}'"""
    )

    # Does argparse accept options after arguments?
    parser = argparse.ArgumentParser()
    parser.add_argument("--config")  # option
    parser.add_argument("input_files", nargs="*")  # argument
    # If parser were in __main__.py, then parser.parse_args would receive
    # sys.argv by default. sys.argv[0] is the script name (either with or
    # without a path, depending on the OS), so we slice the run command to
    # mimic this.
    parser.parse_args(shlex.split(action_spec.run)[2:])
def test_pipeline_oneStep_invalid_cabling_incorrect_cdt_clean(self):
    """Bad cabling: input is of wrong CompoundDatatype."""
    p = Pipeline(family=PipelineFamily())
    self.add_inputs(p, self.create_input(datatypes.INT_PK, dataset_idx=1))
    m = Method()
    self.add_inputs(m, self.create_input(datatypes.STR_PK, dataset_idx=1))

    step1 = PipelineStep(pipeline=p, transformation=m, step_num=1)
    p.steps.add(step1)

    cable = PipelineStepInputCable(pipelinestep=step1,
                                   source_step=0,
                                   source=p.inputs.all()[0],
                                   dest=m.inputs.all()[0])
    cable.pipelinestepinputcable = cable
    step1.cables_in.add(cable)

    cable.clean()
    self.assertRaisesRegexp(
        ValidationError,
        'Custom wiring required for cable "{}"'.format(cable),
        cable.clean_and_completely_wired)
def form_valid(self, form):
    # This method is only called once the form has validated, so there is no
    # need to re-check form.is_valid() here. It must return an HttpResponse.
    requests = form.cleaned_data.pop('requests')

    obj = Pipeline(**form.cleaned_data)
    obj.owner = self.request.user
    obj.save()

    # Many-to-many relations can only be populated after the object is saved.
    for request in requests:
        obj.requests.add(request)

    return HttpResponseRedirect(
        reverse('pipeline:pipeline', kwargs={'pk': obj.pk}))
def test_no_steps(self):
    pipeline = Pipeline()
    updates = pipeline.find_step_updates()
    self.assertEqual([], updates)