def _datalad_format_command(ds, spec):
    """Adjust `spec` to use `datalad run`-style formatting.

    The "inputs", "outputs", and "command_str" keys in `spec` are
    replaced and the original are moved under the `*_unexpanded` key.
    """
    from datalad.interface.run import format_command
    from datalad.interface.run import GlobbedPaths

    format_kwargs = {}
    for field in ("inputs", "outputs"):
        if field not in spec:
            continue
        # preserve the raw patterns before replacing them with their
        # glob expansion
        spec["{}_unexpanded".format(field)] = spec[field]
        globbed = GlobbedPaths(spec[field])
        spec[field] = globbed.expand(dot=False)
        format_kwargs[field] = globbed

    expanded_cmd = format_command(ds, spec["command_str"], **format_kwargs)
    spec["command_str_unexpanded"] = spec["command_str"]
    spec["command_str"] = expanded_cmd
def test_globbedpaths_get_sub_patterns():
    gp = GlobbedPaths([], "doesn't matter")
    cases = [
        # No glob magic in the directory component yields no
        # sub-patterns.
        ("", []),
        ("nodir", []),
        (op.join("nomagic", "path"), []),
        (op.join("nomagic", "path*"), []),
        # A magic leading path produces sub-patterns, successively
        # dropping the right-most component.
        (op.join("s*", "path"), ["s*" + op.sep]),
        (op.join("s", "ss*", "path"), [op.join("s", "ss*") + op.sep]),
        (op.join("s", "ss*", "path*"), [op.join("s", "ss*") + op.sep]),
        (op.join("s", "ss*" + op.sep), []),
        (op.join("s*", "ss", "path*"),
         [op.join("s*", "ss") + op.sep,
          "s*" + op.sep]),
        (op.join("s?", "ss", "sss*", "path*"),
         [op.join("s?", "ss", "sss*") + op.sep,
          op.join("s?", "ss") + op.sep,
          "s?" + op.sep]),
    ]
    for pattern, expected in cases:
        eq_(gp._get_sub_patterns(pattern), expected)
def test_globbedpaths(path):
    cases = [(["1.txt", "2.dat"], {"1.txt", "2.dat"}),
             (["*.txt", "*.dat"], {"1.txt", "2.dat", "3.txt"}),
             (["*.txt"], {"1.txt", "3.txt"})]
    for patterns, matched in cases:
        gp = GlobbedPaths(patterns, pwd=path)
        eq_(set(gp.expand()), matched)
        eq_(set(gp.expand(full=True)), {opj(path, p) for p in matched})

    # Full patterns still get returned as relative to pwd.
    gp = GlobbedPaths([opj(path, "*.dat")], pwd=path)
    eq_(gp.expand(), ["2.dat"])

    # "." gets special treatment.
    gp = GlobbedPaths([".", "*.dat"], pwd=path)
    eq_(set(gp.expand()), {"2.dat", "."})
    eq_(gp.expand(dot=False), ["2.dat"])
    gp = GlobbedPaths(["."], pwd=path, expand=False)
    eq_(gp.expand(), ["."])
    eq_(gp.paths, ["."])

    # The glob results can be stubbed out by patching `glob`.
    glob_results = {"z": "z",
                    "a": ["x", "d", "b"]}
    with patch('datalad.interface.run.glob', glob_results.get):
        gp = GlobbedPaths(["z", "a"])
        eq_(gp.expand(), ["z", "b", "d", "x"])

    # Whether the paths property reflects glob expansion is determined
    # by the `expand` argument.
    for do_expand, paths in [(True, ["2.dat"]), (False, ["*.dat"])]:
        gp = GlobbedPaths(["*.dat"], pwd=path, expand=do_expand)
        eq_(gp.paths, paths)

    with swallow_logs(new_level=logging.WARN) as cml:
        GlobbedPaths(["not here"], pwd=path).expand()
        assert_in("No matching files found for 'not here'", cml.out)
        # Warnings can be silenced.
        GlobbedPaths(["also not"], pwd=path, warn=False).expand()
        assert_not_in("No matching files found for 'also not'", cml.out)
def __call__(cmd=None,
             dataset=None,
             inputs=None,
             outputs=None,
             expand=None,
             explicit=False,
             message=None,
             sidecar=None,
             jobcfg='default',
             submit=False):
    """Prepare (and optionally submit) an HTCondor job for `cmd`.

    Builds a self-contained submission directory under the dataset's
    submissions directory: runner/pre/post scripts, a NUL-delimited
    list of input files to transfer, output globs, the source dataset
    location, a generated ``cluster.submit`` file, and a
    ``runargs.json`` dump for a later result merge via `run`.

    Yields DataLad status dicts: one for 'htc_prepare' and, when
    ``submit`` is true, one for 'htc_submit' (status 'error' if
    ``condor_submit`` fails).
    """
    # TODO makes sure a different rel_pwd is handled properly on the remote end
    pwd, rel_pwd = get_command_pwds(dataset)
    ds = require_dataset(
        dataset, check_installed=True,
        purpose='preparing a remote command execution')
    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        # the command contains a placeholder we cannot fill in
        yield get_status_dict(
            'htcprepare',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s', exc))
        return

    transfer_files_list = ['pre.sh', 'post.sh']

    # where all the submission packs live
    subroot_dir = get_submissions_dir(ds)
    subroot_dir.mkdir(parents=True, exist_ok=True)

    # location of to-be-created submission
    submission_dir = ut.Path(
        tempfile.mkdtemp(prefix='submit_', dir=text_type(subroot_dir)))
    # submission ID is the directory name minus the 'submit_' prefix
    submission = submission_dir.name[7:]

    split_cmd = shlex.split(cmd_expanded)
    # is this a singularity job?
    singularity_job = get_singularity_jobspec(split_cmd)
    if not singularity_job:
        # plain command: run it directly on the execute side
        with (submission_dir / 'runner.sh').open('wb') as f:
            f.write(
                resource_string('datalad_htcondor',
                                'resources/scripts/runner_direct.sh'))
        job_args = split_cmd
    else:
        # link the container into the submission dir
        (submission_dir / 'singularity.simg').symlink_to(
            ut.Path(singularity_job[0]).resolve())
        transfer_files_list.append('singularity.simg')
        # arguments of the job
        job_args = singularity_job[1]
        job_args.insert(0, 'singularity.simg')

        # TODO conditional on run_as_user=false
        with (submission_dir / 'runner.sh').open('wb') as f:
            f.write(
                resource_string(
                    'datalad_htcondor',
                    'resources/scripts/runner_singularity_anon.sh'))
    make_executable(submission_dir / 'runner.sh')

    # htcondor wants the log dir to exist at submit time
    # TODO ATM we only support a single job per cluster submission
    (submission_dir / 'job_0' / 'logs').mkdir(parents=True)

    # TODO make job pre/post script selection configurable
    with (submission_dir / 'pre.sh').open('wb') as f:
        f.write(
            resource_string('datalad_htcondor',
                            'resources/scripts/pre_posix_chirp.sh'))
    make_executable(submission_dir / 'pre.sh')

    with (submission_dir / 'post.sh').open('wb') as f:
        f.write(
            resource_string('datalad_htcondor',
                            'resources/scripts/post_posix.sh'))
    make_executable(submission_dir / 'post.sh')

    # API support selection (bound dataset methods and such)
    # internal import to avoid circularities
    from datalad.api import (
        rev_status as status,
    )

    inputs = GlobbedPaths(inputs, pwd=pwd)
    prepare_inputs(ds, inputs)

    # it could be that an input expression does not expand,
    # because it doesn't match anything. In such a case
    # we need to filter out such globs to not confuse
    # the status() call below that only takes real paths
    inputs = [p for p in inputs.expand(full=True) if op.lexists(p)]

    # now figure out what matches the remaining paths in the
    # entire repo and dump a list of files to transfer
    if inputs:
        with (submission_dir / 'input_files').open('w') as f:
            # TODO disable output renderer
            for p in ds.rev_status(
                    path=inputs,
                    # TODO do we really want that True? I doubt it
                    # this might pull in the world
                    recursive=False,
                    # we would have otherwise no idea
                    untracked='no',
                    result_renderer=None):
                f.write(text_type(p['path']))
                # NUL-delimited to be safe against unusual file names
                f.write(u'\0')
        transfer_files_list.append('input_files')

    if outputs:
        # write the output globs to a file for eval on the execute
        # side
        # XXX we may not want to eval them on the remote side
        # at all, however. This would make things different
        # than with local execute, where we also just write to
        # a dataset and do not have an additional filter
        (submission_dir / 'output_globs').write_text(
            # we need a final trailing delimiter as a terminator
            u'\0'.join(outputs) + u'\0')
        transfer_files_list.append('output_globs')

    (submission_dir / 'source_dataset_location').write_text(
        text_type(ds.pathobj) + op.sep)
    transfer_files_list.append('source_dataset_location')

    with (submission_dir / 'cluster.submit').open('w') as f:
        f.write(
            submission_template.format(
                executable='runner.sh',  # TODO if singularity_job else 'job.sh',
                transfer_files_list=','.join(
                    op.join(op.pardir, f) for f in transfer_files_list),
                **submission_defaults))

        f.write(u'\narguments = "{}"\nqueue\n'.format(
            # TODO deal with single quotes in the args
            ' '.join("'{}'".format(a) for a in job_args)))

    # dump the run command args into a file for re-use
    # when the result is merged
    # include even args that are already evaluated and
    # acted upon, to be able to convince `run` to create
    # a full run record that maybe could be re-run
    # locally
    json_py.dump(
        dict(
            cmd=cmd,
            inputs=inputs,
            outputs=outputs,
            expand=expand,
            explicit=explicit,
            message=message,
            sidecar=sidecar,
            # report the PWD too, to give `run` a chance
            # to be correct after the fact
            pwd=pwd,
        ),
        text_type(submission_dir / 'runargs.json'))

    # we use this file to inspect what state this submission is in
    (submission_dir / 'status').write_text(u'prepared')
    yield get_status_dict(
        action='htc_prepare',
        status='ok',
        refds=text_type(ds.pathobj),
        submission=submission,
        path=text_type(submission_dir),
        logger=lgr)

    if submit:
        try:
            Runner(cwd=text_type(submission_dir)).run(
                ['condor_submit', 'cluster.submit'],
                log_stdout=False,
                log_stderr=False,
                expect_stderr=True,
                expect_fail=True,
            )
            (submission_dir / 'status').write_text(u'submitted')
            yield get_status_dict(
                action='htc_submit',
                status='ok',
                submission=submission,
                refds=text_type(ds.pathobj),
                path=text_type(submission_dir),
                logger=lgr)
        except CommandError as e:
            yield get_status_dict(
                action='htc_submit',
                status='error',
                submission=submission,
                message=('condor_submit failed: %s', exc_str(e)),
                refds=text_type(ds.pathobj),
                path=text_type(submission_dir),
                logger=lgr)
def _apply_output(ds, jdir, sdir):
    """Merge a completed job's results from `jdir` back into dataset `ds`.

    Loads the recorded run arguments from the submission dir `sdir`,
    prepares the declared outputs (unlock/remove), un-tars the job
    output into the dataset, and replays `run_command` with
    ``inject=True`` to fake a local run record.  Yields DataLad status
    dicts; on success the last one reports the merge result.
    """
    common = dict(
        action='htc_result_merge',
        refds=text_type(ds.pathobj),
        path=text_type(jdir),
        logger=lgr,
    )
    args_path = sdir / 'runargs.json'
    try:
        # anything below PY3.6 needs stringification
        runargs = json_py.load(str(args_path))
    except Exception as e:
        yield dict(
            common,
            status='error',
            message=("could not load submission arguments from '%s': %s",
                     args_path, exc_str(e)))
        return
    # TODO check recursive status to have dataset clean
    # TODO have query limited to outputs if explicit was given
    # prep outputs (unlock or remove)
    # COPY: this is a copy of the code from run_command
    outputs = GlobbedPaths(runargs['outputs'],
                           pwd=runargs['pwd'],
                           expand=runargs['expand'] in ["outputs", "both"])
    if outputs:
        for res in _install_and_reglob(ds, outputs):
            yield res
        for res in _unlock_or_remove(ds, outputs.expand(full=True)):
            yield res
    # END COPY
    # TODO need to imitate PWD change, if needed
    # -> extract tarball
    try:
        stdout, stderr = Runner().run(
            ['tar', '-xf', '{}'.format(jdir / 'output')],
            cwd=ds.path)
    except CommandError as e:
        yield dict(
            common,
            status='error',
            message=("could not un-tar job results from '%s' at '%s': %s",
                     str(jdir / 'output'), ds.path, exc_str(e)))
        return
    # fake a run record, as if we would have executed locally
    for res in run_command(
            runargs['cmd'],
            dataset=ds,
            inputs=runargs['inputs'],
            outputs=runargs['outputs'],
            expand=runargs['expand'],
            explicit=runargs['explicit'],
            message=runargs['message'],
            sidecar=runargs['sidecar'],
            # TODO pwd, exit code
            extra_info=None,
            inject=True):
        yield res

    res = list(_remove_dir(ds, jdir))[0]
    # NOTE(review): 'htc_results_merge' differs from the 'htc_result_merge'
    # action used in `common` above -- confirm which spelling consumers
    # expect
    res['action'] = 'htc_results_merge'
    res['status'] = 'ok'
    res.pop('message', None)
    # not removing the submission files (for now), even if the last job
    # output might be removed now. Those submissions are tiny and could
    # be resubmitted
    yield res