def fixture():
    # We can't use pytest's tempdir because that is limited to
    # scope=function.
    tmpdir = tempfile.mkdtemp(prefix="reproman-tests-")
    repodir = os.path.realpath(os.path.join(tmpdir, "repo0"))
    os.mkdir(repodir)

    retval = repodir

    with chpwd(repodir):
        runner.run(["git", "init"])
        setup_user()
        if kind != "empty":
            add_and_commit("foo")
            add_and_commit("bar")
            runner.run(["git", "tag", "tag0"])
            add_and_commit("subdir/baz")

        if kind == "pair":
            localdir = os.path.realpath(os.path.join(tmpdir, "repo1"))
            runner.run(["git", "clone", repodir, localdir],
                       expect_stderr=True)
            with chpwd(localdir):
                setup_user()
            retval = localdir, repodir

    yield retval
    shutil.rmtree(tmpdir)
def test_orc_datalad_abort_if_dirty(job_spec, dataset, ssh):
    subds = dataset.create(path="sub")
    subds.create(path="subsub")
    dataset.save()

    job_spec["inputs"] = []
    job_spec["outputs"] = []

    def get_orc(jspec=None):
        return orcs.DataladPairRunOrchestrator(
            ssh, submission_type="local",
            job_spec=jspec or job_spec)

    def run(**spec_kwds):
        jspec = dict(job_spec, **spec_kwds)
        with chpwd(dataset.path):
            orc = get_orc(jspec)
            # Run one job so that we create the remote repository.
            orc.prepare_remote()
            orc.submit()
            orc.follow()
            orc.fetch()
            return orc

    with chpwd(dataset.path):
        # We abort if the local dataset is dirty.
        create_tree(dataset.path, {"local-dirt": ""})
        with pytest.raises(OrchestratorError) as exc:
            get_orc()
        assert "dirty" in str(exc.value)
        os.unlink("local-dirt")

    # Run one job so that we create the remote repository.
    run(_resolved_command_str="echo one >one")

    with chpwd(dataset.path):
        orc1 = get_orc()
        create_tree(orc1.working_directory, {"dirty": ""})
        with pytest.raises(OrchestratorError) as exc:
            orc1.prepare_remote()
        assert "dirty" in str(exc.value)
        os.unlink(op.join(orc1.working_directory, "dirty"))

    # We can run if the submodule simply has a different commit checked out.
    run(_resolved_command_str="echo two >two")

    create_tree(op.join(dataset.path, "sub"), {"for-local-commit": ""})
    dataset.add(".", recursive=True)

    run(_resolved_command_str="echo three >three")

    # But we abort if subdataset is actually dirty.
    with chpwd(dataset.path):
        orc2 = get_orc()
        create_tree(orc2.working_directory,
                    {"sub": {"subsub": {"subdirt": ""}}})
        with pytest.raises(OrchestratorError) as exc:
            orc2.prepare_remote()
        assert "dirty" in str(exc.value)
        os.unlink(op.join(orc2.working_directory,
                          "sub", "subsub", "subdirt"))
def test_orc_datalad_abort_if_dirty(job_spec, dataset, shell):
    with chpwd(dataset.path):
        orc0 = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        # Run one job so that we create the remote repository.
        orc0.prepare_remote()
        orc0.submit()
        orc0.follow()

    with chpwd(dataset.path):
        orc1 = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        create_tree(orc1.working_directory, {"dirty": ""})
        with pytest.raises(OrchestratorError) as exc:
            orc1.prepare_remote()
        assert "dirty" in str(exc.value)
def test_orc_datalad_pair_need_follow_parent(job_spec, dataset, shell):
    # An example of a scenario that fails without DataLad's --follow=parentds
    with chpwd(dataset.path):
        dataset.create("sub")
        dataset.save()

        job_spec["_resolved_command_str"] = "sh -c 'echo baz >baz'"
        job_spec["inputs"] = []
        job_spec["outputs"] = []

        orc0 = orcs.DataladPairOrchestrator(shell, submission_type="local",
                                            job_spec=job_spec)
        orc0.prepare_remote()
        orc0.submit()
        orc0.follow()

        job_spec["_resolved_command_str"] = "sh -c 'echo bar >sub/bar'"
        output = op.join("sub", "bar")
        job_spec["outputs"] = [output]
        orc1 = orcs.DataladPairOrchestrator(shell, submission_type="local",
                                            job_spec=job_spec)
        orc1.prepare_remote()
        orc1.submit()
        orc1.follow()
        orc1.fetch()
        assert op.exists(output)
def test_orc_datalad_pair_new_submodule(job_spec, dataset, shell):
    with chpwd(dataset.path):
        orc = orcs.DataladPairOrchestrator(shell, submission_type="local",
                                           job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()

        # prepare_remote() doesn't fail when a new subdataset is added after
        # the first run.
        sub = dataset.create("sub")
        dataset.save()
        job_spec["_resolved_command_str"] = "sh -c 'echo a >sub/a'"
        job_spec["inputs"] = []
        job_spec["outputs"] = []
        orc = orcs.DataladPairOrchestrator(shell, submission_type="local",
                                           job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()
        assert sub.repo.is_under_annex("a")
def test_orc_datalad_no_remote_get(tmpdir, shell, should_pass):
    import datalad.api as dl
    topdir = str(tmpdir)
    ds_a = dl.create(op.join(topdir, "a"))
    if should_pass:
        (ds_a.pathobj / "foo").write_text("data")
        ds_a.save()

    ds_b = dl.clone(ds_a.path, op.join(topdir, "b"))
    assert not ds_b.repo.file_has_content("foo")
    with chpwd(ds_b.path):
        orc = orcs.DataladNoRemoteOrchestrator(
            shell, submission_type="local",
            job_spec={"root_directory": op.join(topdir, "run-root"),
                      "inputs": ["foo"],
                      "outputs": ["out"],
                      "_resolved_command_str": 'sh -c "cat foo foo >out"'})
        if should_pass:
            orc.prepare_remote()
            orc.submit()
            orc.follow()

            finish_fn = MagicMock()
            orc.fetch(on_remote_finish=finish_fn)
            finish_fn.assert_called_once_with(orc.resource, [])
            assert (ds_b.pathobj / "out").read_text() == "datadata"
        else:
            with pytest.raises(OrchestratorError):
                orc.prepare_remote()
def test_orc_datalad_concurrent(job_spec, dataset, ssh, orc_class, sub_type):
    names = ["paul", "rosa"]

    job_spec["inputs"] = ["{p[name]}.in"]
    job_spec["outputs"] = ["{p[name]}.out"]
    job_spec["_resolved_command_str"] = "sh -c 'cat {inputs} {inputs} >{outputs}'"
    job_spec["_resolved_batch_parameters"] = [{"name": n} for n in names]

    in_files = [n + ".in" for n in names]
    for fname in in_files:
        with open(op.join(dataset.path, fname), "w") as fh:
            fh.write(fname[0])
    dataset.save(path=in_files)

    with chpwd(dataset.path):
        orc = orc_class(ssh, submission_type=sub_type, job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()

        # Just make sure each fetch() seems to have wired up
        # on_remote_finish.  test_run.py tests the actual --follow actions.
        remote_fn = MagicMock()
        orc.fetch(on_remote_finish=remote_fn)
        remote_fn.assert_called_once_with(orc.resource, [])

        out_files = [n + ".out" for n in names]
        for ofile in out_files:
            assert dataset.repo.file_has_content(ofile)
            with open(ofile) as ofh:
                assert ofh.read() == ofile[0] * 2
def test_orc_datalad_no_remote_only_local(dataset, job_spec, ssh):
    with chpwd(dataset.path):
        orc = orcs.DataladNoRemoteOrchestrator(
            ssh, submission_type="local", job_spec=job_spec)
        with pytest.raises(OrchestratorError):
            orc.prepare_remote()
def test_orc_datalad_pair_run_ontop(job_spec, dataset, ssh):
    # Run one orchestrator and fetch, then run another and fetch:
    #
    #   orc 1, master
    #   |
    #   o  orc 0
    #   |
    #   o
    ds = dataset
    create_tree(ds.path, {"in": "content\n"})
    ds.add(".")

    js0 = job_spec
    js1 = dict(job_spec, _resolved_command_str='bash -c "echo other >other"')
    with chpwd(ds.path):
        def do(js):
            orc = orcs.DataladPairRunOrchestrator(
                ssh, submission_type="local", job_spec=js)
            orc.prepare_remote()
            orc.submit()
            orc.follow()
            orc.fetch()
            return orc

        orc0 = do(js0)
        orc1 = do(js1)

    ref0 = "refs/reproman/{}".format(orc0.jobid)
    ref1 = "refs/reproman/{}".format(orc1.jobid)

    assert ds.repo.is_ancestor(ref0, ref1)
    assert ds.repo.get_hexsha(ref0) != ds.repo.get_hexsha(ref1)
    assert ds.repo.get_hexsha(ref1) == ds.repo.get_hexsha("master")
    assert ds.repo.get_active_branch() == "master"
def _expand_globs(self):
    def normalize_hit(h):
        normalized = op.relpath(h) + ("" if op.basename(h) else op.sep)
        if h == op.curdir + op.sep + normalized:
            # Don't let relpath prune "./fname" (gh-3034).
            return h
        return normalized

    expanded = []
    with chpwd(self.pwd):
        for pattern in self._paths["patterns"]:
            hits = glob.glob(pattern)
            if hits:
                expanded.extend(sorted(map(normalize_hit, hits)))
            else:
                lgr.debug("No matching files found for '%s'", pattern)
                # We didn't find a hit for the complete pattern.  If we find
                # a sub-pattern hit, that may mean we have an uninstalled
                # subdataset.
                for sub_pattern in self._get_sub_patterns(pattern):
                    sub_hits = glob.glob(sub_pattern)
                    if sub_hits:
                        expanded.extend(
                            sorted(map(normalize_hit, sub_hits)))
                        break
                # ... but we still want to retain the original pattern
                # because we don't know for sure at this point, and it
                # won't bother the "install, reglob" routine.
                expanded.extend([pattern])
    return expanded
def fetch(self):
    """Fetch the results from the remote dataset sibling.
    """
    lgr.info("Fetching results for %s", self.jobid)
    if self.resource.type == "ssh":
        ref = self.job_refname
        self.ds.repo.fetch(self.resource.name, "{0}:{0}".format(ref))
        self.ds.update(sibling=self.resource.name,
                       merge=True, recursive=True)
        with head_at(self.ds, ref):
            outputs = self.job_spec.get("outputs")
            if outputs:
                self.ds.get(path=outputs)
        if not self.ds.repo.is_ancestor(ref, "HEAD"):
            lgr.info("Results stored on %s. "
                     "Bring them into this branch with "
                     "'git merge %s'",
                     ref, ref)
    elif self.resource.type == "shell":
        # Below is just for local testing.  It doesn't support actually
        # getting the content.
        with chpwd(self.ds.path):
            self.session.execute_command(
                ["git", "fetch", self.working_directory,
                 "{0}:{0}".format(self.job_refname)])
            self.session.execute_command(["git", "merge", "FETCH_HEAD"])

    def get_metadir(mdir):
        if self.resource.type == "ssh":
            self.ds.get(path=mdir)

    self.log_failed(get_metadir)
def _resurrect_orc(job):
    resource = get_manager().get_resource(job["resource_id"], "id")
    with chpwd(job["local_directory"]):
        orchestrator_class = ORCHESTRATORS[job["orchestrator"]]
        orc = orchestrator_class(resource, job["submitter"], job,
                                 resurrection=True)
        orc.submitter.submission_id = job.get("submission_id")
    return orc
def test_orc_datalad_abort_if_detached(job_spec, dataset, shell):
    dataset.repo.checkout("HEAD^{}")
    with chpwd(dataset.path):
        orc = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        with pytest.raises(OrchestratorError):
            orc.prepare_remote()
def test_dataset_as_dict(shell, dataset, job_spec):
    with chpwd(dataset.path):
        orc = orcs.DataladLocalRunOrchestrator(shell, submission_type="local",
                                               job_spec=job_spec)
    d = orc.as_dict()
    # Check for keys that DataladOrchestrator should extend
    # Orchestrator.as_dict() with.
    assert "head" in d
    assert "dataset_id" in d
def run(**spec_kwds):
    jspec = dict(job_spec, **spec_kwds)
    with chpwd(dataset.path):
        orc = get_orc(jspec)
        # Run one job so that we create the remote repository.
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()
        return orc
def test_orc_datalad_resurrect(job_spec, dataset, shell):
    for k in ["jobid",
              "working_directory", "root_directory", "local_directory"]:
        job_spec[k] = "doesn't matter"
    job_spec["head"] = "deadbee"
    with chpwd(dataset.path):
        orc = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec,
            resurrection=True)
    assert orc.head == "deadbee"
def test_orc_datalad_nonrun(job_spec, dataset, shell, orc_class):
    with chpwd(dataset.path):
        orc = orc_class(shell, submission_type="local", job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()
        assert dataset.repo.is_under_annex("out")
        assert (dataset.pathobj / "out").exists()
def test_orc_datalad_run(job_spec, dataset, shell, orc_class, sub_type):
    with chpwd(dataset.path):
        orc = orc_class(shell, submission_type=sub_type, job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()
        assert dataset.repo.file_has_content("out")
        assert open("out").read() == "content\nmore\n"
def run_and_check(spec):
    with chpwd(dataset.path):
        orc = orc_class(shell, submission_type=sub_type, job_spec=spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()
        assert dataset.repo.file_has_content("out")
        assert open("out").read() == "content\nmore\n"
        return orc
def test_orc_datalad_run_results_missing(job_spec, dataset, shell):
    with chpwd(dataset.path):
        orc = orcs.DataladLocalRunOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        os.unlink(op.join(orc.root_directory, "outputs",
                          "{}.tar.gz".format(orc.jobid)))
        with pytest.raises(OrchestratorError):
            orc.fetch()
def run_fn(*args, **kwargs):
    with contextlib.ExitStack() as stack:
        stack.enter_context(chpwd(path))
        # Patch home to avoid populating testing machine with jobs when
        # using local shell.
        stack.enter_context(patch.dict(os.environ, {"HOME": home}))
        stack.enter_context(patch("reproman.interface.run.get_manager",
                                  return_value=resource_manager))
        stack.enter_context(patch("reproman.interface.run.LocalRegistry",
                                  job_registry))
        return run(*args, **kwargs)
def test_combine_batch_params_glob(tmpdir):
    tmpdir = str(tmpdir)
    create_tree(tmpdir, {"aaa": "a",
                         "subdir": {"b": "b", "c": "c"}})
    with chpwd(tmpdir):
        res = sorted(_combine_batch_params(["foo=a*,subdir/*,other"]),
                     key=lambda d: d["foo"])
        assert list(res) == [
            {"foo": "aaa"},
            {"foo": "other"},
            {"foo": "subdir/b"},
            {"foo": "subdir/c"}]
def test_orc_plain_failure(tmpdir, job_spec, shell):
    job_spec["_resolved_command_str"] = "iwillfail"
    job_spec["inputs"] = []
    local_dir = str(tmpdir)
    with chpwd(local_dir):
        orc = orcs.PlainOrchestrator(shell, submission_type="local",
                                     job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()

        for fname in "status", "stderr", "stdout":
            assert op.exists(op.join(orc.meta_directory, fname + ".0"))
def test_orc_datalad_pair(job_spec, dataset, shell):
    with chpwd(dataset.path):
        orc = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()
        # The local fetch variant doesn't currently get the content, so just
        # check that the file is under annex.
        assert dataset.repo.is_under_annex("out")
def test_venv_identify_distributions(venv_test_dir):
    libpaths = {p[-1]: os.path.join("lib", PY_VERSION, *p)
                for p in [("abc.py",),
                          ("importlib", "yaml", "machinery.py"),
                          ("site-packages", "yaml", "parser.py"),
                          ("site-packages", "attr", "filters.py")]}
    with chpwd(venv_test_dir):
        path_args = [
            # Both full ...
            os.path.join(venv_test_dir, "venv0", libpaths["parser.py"]),
            # ... and relative paths work.
            os.path.join("venv1", libpaths["filters.py"]),
            # A virtualenv file that isn't part of any particular package.
            os.path.join("venv1", "bin", "python"),
            # A link to the outside world ...
            os.path.join("venv1", libpaths["abc.py"]),
            # or in a directory that is a link to the outside world.
            os.path.join("venv1", libpaths["machinery.py"]),
        ]
        path_args.append(COMMON_SYSTEM_PATH)

        tracer = VenvTracer()
        dists = list(tracer.identify_distributions(path_args))
        assert len(dists) == 1

        distributions, unknown_files = dists[0]
        # Unknown files do not include "venv1/bin/python", which is a link to
        # another path within venv1, but they do include the link to the
        # system abc.py.
        assert unknown_files == {
            COMMON_SYSTEM_PATH,
            op.realpath(os.path.join("venv1", libpaths["abc.py"])),
            op.realpath(os.path.join("venv1", libpaths["machinery.py"])),
            # The editable package was added by VenvTracer as an unknown
            # file.
            os.path.join(venv_test_dir, "minimal_pymodule")}

        assert len(distributions.environments) == 2

        expect = {"environments":
                  [{"packages": [{"files": [libpaths["parser.py"]],
                                  "name": "PyYAML",
                                  "editable": False},
                                 {"files": [],
                                  "name": "nmtest",
                                  "editable": True}],
                    "system_site_packages": False},
                   {"packages": [{"files": [libpaths["filters.py"]],
                                  "name": "attrs",
                                  "editable": False}],
                    "system_site_packages": False}]}
        assert_is_subset_recur(expect, attr.asdict(distributions),
                               [dict, list])
def test_venv_identify_distributions(venv_test_dir):
    paths = ["lib/" + PY_VERSION + "/site-packages/yaml/parser.py",
             "lib/" + PY_VERSION + "/site-packages/attr/filters.py"]
    with chpwd(venv_test_dir):
        path_args = [
            # Both full ...
            os.path.join(venv_test_dir, "venv0", paths[0]),
            # ... and relative paths work.
            os.path.join("venv1", paths[1]),
        ]
        path_args.append("/sbin/iptables")

        tracer = VenvTracer()
        dists = list(tracer.identify_distributions(path_args))
        assert len(dists) == 1

        distributions, unknown_files = dists[0]
        assert unknown_files == {
            "/sbin/iptables",
            # The editable package was added by VenvTracer as an unknown
            # file.
            os.path.join(venv_test_dir, "minimal_pymodule")}

        assert len(distributions.environments) == 2

        expect = {"environments":
                  [{"packages": [{"files": [paths[0]],
                                  "name": "PyYAML",
                                  "editable": False},
                                 {"files": [],
                                  "name": "nmtest",
                                  "editable": True}]},
                   {"packages": [{"files": [paths[1]],
                                  "name": "attrs",
                                  "editable": False}]}]}
        assert_is_subset_recur(expect, attr.asdict(distributions),
                               [dict, list])
def fixture(tmpdir_factory):
    skipif.no_network()
    skipif.no_singularity()

    # Change to a temporary directory so that we don't pollute the current
    # directory with image files.
    with chpwd(str(tmpdir_factory.mktemp("singularity-resource"))):
        from reproman.resource.singularity import Singularity
        resource = Singularity(name=name or str(uuid.uuid4().hex)[:11],
                               image=image)
        resource.connect()
        list(resource.create())
        yield resource
        resource.delete()
def test_orc_datalad_run_failed(job_spec, dataset, shell):
    job_spec["command_str"] = "iwillfail"
    job_spec["inputs"] = []

    with chpwd(dataset.path):
        orc = orcs.DataladLocalRunOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        with swallow_logs(new_level=logging.INFO) as log:
            orc.fetch()
            assert "Job status" in log.out
            assert "stderr:" in log.out
def test_orc_datalad_pair_existing_remote(job_spec, dataset, shell):
    root_directory = job_spec["root_directory"]
    dataset.repo.add_remote("localshell", "i-dont-match")
    with chpwd(dataset.path):
        orc = orcs.DataladPairOrchestrator(shell, submission_type="local",
                                           job_spec=job_spec)
        # If a remote with the resource name exists, we abort if the
        # URL doesn't match the expected target...
        with pytest.raises(OrchestratorError):
            orc.prepare_remote()
        # ... and continue if it does.
        dataset.repo.set_remote_url("localshell", orc.working_directory)
        orc.prepare_remote()
def fn(resource, jspec):
    create_tree(local_dir, {"d": {"in": "content\n"}})
    with chpwd(local_dir):
        orc = orcs.PlainOrchestrator(resource, submission_type="local",
                                     job_spec=jspec)
        orc.prepare_remote()
        assert orc.session.exists(
            op.join(orc.working_directory, "d", "in"))
        orc.submit()
        orc.follow()
        assert orc.session.exists(op.join(orc.working_directory, "out"))
        orc.fetch()
        assert open("out").read() == "content\nmore\n"