def test_runner_log_stdout():
    """Runner logs stdout when log_stdout=True, and not otherwise."""
    # TODO: no idea of how to check correct logging via any kind of assertion yet.
    runner = Runner(dry=False)
    cmd_parts = ['echo', 'stdout-Message should be logged']
    # should be identical runs, either as a string or as a list
    for cmd in (cmd_parts, ' '.join(cmd_parts)):
        kw = {}
        if on_windows and isinstance(cmd, list):
            # on Windows it can't find echo if ran outside the shell
            kw['shell'] = True
        with swallow_logs(logging.DEBUG) as cm:
            runner.run(cmd, log_stdout=True, **kw)
            eq_(cm.lines[0], "Running: %s" % cmd)
            if not on_windows:
                # we can just count on sanity
                eq_(cm.lines[1], "stdout| stdout-Message should be logged")
            else:
                # echo outputs quoted lines for some reason, so relax check
                ok_("stdout-Message should be logged" in cm.lines[1])
        assert_equal(runner.commands, [],
                     "Run of: %s resulted in non-empty buffer: %s"
                     % (cmd, str(runner.commands)))

    cmd = 'echo stdout-Message should not be logged'
    with swallow_outputs() as cmo:
        with swallow_logs(new_level=logging.INFO) as cml:
            runner.run(cmd, log_stdout=False)
            eq_(cmo.out, "stdout-Message should not be logged\n")
            eq_(cml.out, "")
    assert_equal(runner.commands, [],
                 "Run of: %s resulted in non-empty buffer: %s"
                 % (cmd, str(runner.commands)))
def check_runner_heavy_output(log_online):
    """Check that Runner copes with commands producing heavy stdout/stderr.

    Parameters
    ----------
    log_online : bool
      Whether online (incremental) logging is being exercised; the massive
      single-line command is skipped in that mode since it is known to get
      stuck there.
    """
    # TODO: again, no automatic detection of this resulting in being stucked yet.
    runner = Runner()
    cmd = '%s -c "import datalad.tests.heavyoutput;"' % sys.executable
    with swallow_outputs() as cm:
        ret = runner.run(cmd, log_stderr=False, log_stdout=False,
                         expect_stderr=True)
        eq_(cm.err, cm.out)  # they are identical in that script
        eq_(cm.out[:10], "[0, 1, 2, ")
        eq_(cm.out[-15:], "997, 998, 999]\n")

    # do it again with capturing:
    ret = runner.run(cmd, log_stderr=True, log_stdout=True, expect_stderr=True)

    # and now original problematic command with a massive single line
    if not log_online:
        # We know it would get stuck in online mode
        # BUG FIX: the subprocess runs under the same (python3) interpreter,
        # where py2-only `xrange` raises NameError -- use `range` instead.
        cmd = '%s -c "import sys; x=str(list(range(1000))); ' \
              '[(sys.stdout.write(x), sys.stderr.write(x)) ' \
              'for i in range(100)];"' % sys.executable
        ret = runner.run(cmd, log_stderr=True, log_stdout=True,
                         expect_stderr=True)
def compress_files(files, archive, path=None, overwrite=True):
    """Compress `files` into an `archive` file

    Parameters
    ----------
    files : list of str
    archive : str
    path : str
      Alternative directory under which compressor will be invoked, to e.g.
      take into account relative paths of files and/or archive
    overwrite : bool
      Whether to allow overwriting the target archive file if one already
      exists
    """
    runner = Runner(cwd=path)
    target = Path(archive)
    if target.exists():
        if not overwrite:
            raise ValueError(
                'Target archive {} already exists and overwrite is forbidden'.
                format(target))
        target.unlink()
    suffixes = target.suffixes
    if len(suffixes) > 1 and suffixes[-2] == '.tar':
        # compressed tarball: pack to tar and pipe through the compressor
        quoted_files = ' '.join(quote_cmdlinearg(f) for f in files)
        cmd = '7z u .tar -so -- {} | 7z u -si -- {}'.format(
            quoted_files,
            quote_cmdlinearg(str(target)),
        )
    else:
        cmd = ['7z', 'u', str(target), '--'] + files
    runner.run(cmd)
def test_runner_log_stdout():
    """Exercise stdout logging by Runner for both list and string commands."""
    # TODO: no idea of how to check correct logging via any kind of assertion yet.
    runner = Runner(dry=False)
    cmd_as_list = ['echo', 'stdout-Message should be logged']
    for cmd in (cmd_as_list, ' '.join(cmd_as_list)):
        # should be identical runs, either as a string or as a list
        run_kwargs = {}
        if on_windows and isinstance(cmd, list):
            # on Windows it can't find echo if ran outside the shell
            run_kwargs['shell'] = True
        with swallow_logs(logging.DEBUG) as cm:
            runner.run(cmd, log_stdout=True, **run_kwargs)
            eq_(cm.lines[0], "Running: %s" % cmd)
            if on_windows:
                # echo outputs quoted lines for some reason, so relax check
                ok_("stdout-Message should be logged" in cm.lines[1])
            else:
                # we can just count on sanity
                eq_(cm.lines[1], "stdout| stdout-Message should be logged")
        assert_equal(
            runner.commands, [],
            "Run of: %s resulted in non-empty buffer: %s"
            % (cmd, str(runner.commands)))

    cmd = 'echo stdout-Message should not be logged'
    with swallow_outputs() as cmo:
        with swallow_logs(new_level=logging.INFO) as cml:
            runner.run(cmd, log_stdout=False)
            eq_(cmo.out, "stdout-Message should not be logged\n")
            eq_(cml.out, "")
    assert_equal(
        runner.commands, [],
        "Run of: %s resulted in non-empty buffer: %s"
        % (cmd, str(runner.commands)))
def get_singularity_jobspec(cmd): """Extract the runscript of a singularity container used as an executable Parameters ---------- cmd : list A command as an argument list. Returns ------- None or str, None or list If no singularity is available, or the executable in the command is not a singularity image given by its path, None is return. Otherwise the runscript of the container is returned a string. The second value is None if the first is None, or a list of arguments to the runscript. """ # get the path to the command's executable exec_path = cmd[0] runner = Runner() if not op.exists(exec_path): # probably a command from PATH return # this is a real file, not just a command on the path try: stdout, stderr = runner.run( ['singularity', '--version'], log_stdout=True, log_stderr=True, expect_stderr=True, expect_fail=True, ) # TODO could be used to tailor handling to particular versions except CommandError as e: # pragma: no cover # we do not have a singularity installation that we can handle # log debug, because there is no guarantee that the executable # actually was a singularity container lgr.debug('No suitable singularity version installed: %s', exc_str(e)) return # we have singularity try: stdout, stderr = runner.run( # stringification only needed for pythons older than 3.6 ['singularity', 'exec', exec_path, 'cat', '/singularity'], log_stdout=True, log_stderr=True, expect_stderr=True, expect_fail=True, ) # TODO could be used to tailor handling to particular versions except CommandError as e: # we do not have a singularity installation that we can handle # log debug, because there is no guarantee that the executable # actually was a singularity container lgr.debug('%s is not a singularity image: %s', exec_path, exc_str(e)) return # all but the container itself are the arguments return exec_path, cmd[1:]
def create_info_file(self):
    """Create an INFO.txt recording git/annex/datalad versions (in git, not annex)."""
    runner = Runner()
    # third whitespace-separated token of each version report is the version
    annex_version = runner.run("git annex version")[0].split()[2]
    git_version = runner.run("git --version")[0].split()[2]
    content = "git: %s\nannex: %s\ndatalad: %s\n" % (
        git_version, annex_version, __version__)
    self.create_file('INFO.txt', content, annex=False)
def test_runner_failure(dir):
    """A failing command must raise CommandError carrying exit code and stderr."""
    runner = Runner()
    failing_cmd = ['git', 'annex', 'add', 'notexistent.dat']
    assert_raises(CommandError, runner.run, failing_cmd, cwd=dir)
    try:
        runner.run(failing_cmd, cwd=dir)
    # BUG FIX: original used py2-only `except CommandError, e` syntax,
    # which is a SyntaxError on python3
    except CommandError as e:
        assert_equal(1, e.code)
        assert_in('notexistent.dat not found', e.stderr)
def test_runner_log_stderr():
    """Runner logs stderr when log_stderr=True, and passes it through otherwise."""
    # TODO: no idea of how to check correct logging via any kind of assertion yet.
    runner = Runner(dry=False)

    cmd = 'echo stderr-Message should be logged >&2'
    runner.run(cmd, log_stderr=True, expect_stderr=True)
    assert_equal(runner.commands, [],
                 "Run of: %s resulted in non-empty buffer: %s"
                 % (cmd, str(runner.commands)))

    cmd = 'echo stderr-Message should not be logged >&2'
    with swallow_outputs() as cmo:
        with swallow_logs(new_level=logging.INFO) as cml:
            runner.run(cmd, log_stderr=False)
            eq_(cmo.err.rstrip(), "stderr-Message should not be logged")
            eq_(cml.out, "")
    assert_equal(runner.commands, [],
                 "Run of: %s resulted in non-empty buffer: %s"
                 % (cmd, str(runner.commands)))
def check_run_and_get_output(cmd):
    """Run `datalad --help` and return its output, failing the test if it errors."""
    runner = Runner()
    try:
        output = runner.run(["datalad", "--help"])
    # BUG FIX: original used py2-only `except CommandError, e` syntax,
    # which is a SyntaxError on python3
    except CommandError as e:
        raise AssertionError("'datalad --help' failed to start normally. "
                             "Exited with %d and output %s" %
                             (e.code, (e.stdout, e.stderr)))
    # consistent with the sibling implementations of this helper, hand the
    # captured output back to the caller (original implicitly returned None)
    return output
class runner(SuprocBenchmarks):
    """Some rudimentary tests to see if there is no major slowdowns from Runner
    """

    def setup(self):
        # benchmark fixture: instantiate the runners once per benchmark run
        from datalad.cmd import Runner
        self.runner = Runner()
        # older versions might not have it
        try:
            from datalad.cmd import GitRunner
            self.git_runner = GitRunner()
        except ImportError:
            pass

    def time_echo(self):
        # baseline: cost of running a trivial command through Runner
        self.runner.run("echo")

    def time_echo_gitrunner(self):
        # same trivial command through GitRunner for comparison
        # NOTE(review): will raise AttributeError if GitRunner import failed
        # in setup() -- presumably acceptable for the benchmarked versions
        self.git_runner.run("echo")
def test_cfg_passthrough(path):
    """-c config options given to the datalad CLI must reach the created dataset."""
    runner = Runner()
    runner.run(['datalad',
                '-c', 'annex.tune.objecthash1=true',
                '-c', 'annex.tune.objecthashlower=true',
                'create', path])
    ds = Dataset(path)
    for opt in ('annex.tune.objecthash1', 'annex.tune.objecthashlower'):
        eq_(ds.config.get(opt, None), 'true')
def test_quoting(path):
    """Quoted arguments must survive both the Python API and the CLI."""
    ds = Dataset(op.join(path, OBSCURE_FILENAME)).create(force=True)
    # Our custom procedure fails if it receives anything other than two
    # procedure arguments (so the script itself receives 3). Check a few cases
    # from the Python API and CLI.
    ds.config.add("datalad.locations.dataset-procedures", "code",
                  where="dataset")
    good_spec = ["just2args", "with ' sing", 'with " doub']
    with swallow_outputs():
        ds.run_procedure(spec=good_spec)
    with assert_raises(CommandError):
        ds.run_procedure(spec=["just2args", "still-one arg"])

    runner = Runner(cwd=ds.path)
    runner.run(
        "datalad run-procedure just2args \"with ' sing\" 'with \" doub'")
    with assert_raises(CommandError):
        runner.run("datalad run-procedure just2args 'still-one arg'")
def test_runner_log_stderr():
    """Exercise stderr handling by Runner with and without logging."""
    # TODO: no idea of how to check correct logging via any kind of assertion yet.
    runner = Runner(dry=False)

    logged_cmd = 'echo stderr-Message should be logged >&2'
    runner.run(logged_cmd, log_stderr=True, expect_stderr=True)
    assert_equal(
        runner.commands, [],
        "Run of: %s resulted in non-empty buffer: %s"
        % (logged_cmd, str(runner.commands)))

    unlogged_cmd = 'echo stderr-Message should not be logged >&2'
    with swallow_outputs() as cmo:
        with swallow_logs(new_level=logging.INFO) as cml:
            runner.run(unlogged_cmd, log_stderr=False)
            eq_(cmo.err.rstrip(), "stderr-Message should not be logged")
            eq_(cml.out, "")
    assert_equal(
        runner.commands, [],
        "Run of: %s resulted in non-empty buffer: %s"
        % (unlogged_cmd, str(runner.commands)))
def check_run_and_get_output(cmd):
    """Run `datalad --help` and return its output, failing the test if it errors."""
    runner = Runner()
    # suppress log output happen it was set to high values
    env_override = {'DATALAD_LOG_LEVEL': 'WARN'}
    try:
        with patch.dict('os.environ', env_override):
            output = runner.run(["datalad", "--help"])
    except CommandError as e:
        raise AssertionError("'datalad --help' failed to start normally. "
                             "Exited with %d and output %s" %
                             (e.code, (e.stdout, e.stderr)))
    return output
def test_cfg_passthrough(path):
    """CLI -c options must end up in the created dataset's config."""
    runner = Runner()
    create_cmd = ['datalad',
                  '-c', 'annex.tune.objecthash1=true',
                  '-c', 'annex.tune.objecthashlower=true',
                  'create', path]
    runner.run(create_cmd)
    ds = Dataset(path)
    eq_(ds.config.get('annex.tune.objecthash1', None), 'true')
    eq_(ds.config.get('annex.tune.objecthashlower', None), 'true')
class RunnerSuite(SuprocBenchmarks):
    """Basic benchmarks ensuring Runner introduces no major slowdowns."""

    def setup(self):
        from datalad.cmd import Runner
        self.runner = Runner()
        try:
            # GitRunner is absent in older datalad versions
            from datalad.cmd import GitRunner
            self.git_runner = GitRunner()
        except ImportError:
            pass

    def time_echo(self):
        # cost of a trivial command through the generic Runner
        self.runner.run("echo")

    def time_echo_gitrunner(self):
        # cost of the same trivial command through GitRunner
        self.git_runner.run("echo")
def check_run_and_get_output(cmd):
    """Run `datalad --help` under a quiet log level and return its output."""
    runner = Runner()
    try:
        # suppress log output happen it was set to high values
        with patch.dict('os.environ', {'DATALAD_LOGLEVEL': 'WARN'}):
            output = runner.run(["datalad", "--help"])
    except CommandError as e:
        msg = ("'datalad --help' failed to start normally. "
               "Exited with %d and output %s" % (e.code, (e.stdout, e.stderr)))
        raise AssertionError(msg)
    return output
def decompress_file(archive, dir_):
    """Decompress `archive` into a directory `dir_`

    This is an alternative implementation without patool, but directly
    calling 7z.

    Parameters
    ----------
    archive: str
    dir_: str
    """
    runner = Runner(cwd=dir_)
    suffixes = Path(archive).suffixes
    if len(suffixes) > 1 and suffixes[-2] == '.tar':
        # we have a compressed tar file that needs to be fed through the
        # decompressor first; passing the pipeline as a list argument hangs
        # somehow, so do it via a single string command
        cmd = '7z x {} -so | 7z x -si -ttar'.format(quote_cmdlinearg(archive))
    else:
        # fire and forget
        cmd = ['7z', 'x', archive]
    runner.run(cmd)
def _execute_command(command, pwd, expected_exit=None):
    """Execute `command` in directory `pwd`.

    Returns (exitcode, CommandError-or-None); re-raises if the exit code
    differs from `expected_exit` (when given).
    """
    from datalad.cmd import Runner

    exc = None
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            command,
            # immediate output
            log_online=True,
            # `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just have to exitcode in the same way
        exc = e
        cmd_exitcode = e.code
        if expected_exit is not None and expected_exit != cmd_exitcode:
            # we failed in a different way during a rerun. This can easily
            # happen if we try to alter a locked file
            #
            # TODO add the ability to `git reset --hard` the dataset tree
            # on failure; we know that we started clean, so we could easily
            # go back, needs gh-1424 to be able to do it recursively
            raise exc
    lgr.info("== Command exit (modification check follows) =====")
    return cmd_exitcode or 0, exc
def populate(self):
    """Populate the repo with two annex submodules, committed and initialized."""
    super(SubmoduleDataset, self).populate()
    # add submodules
    annex = BasicAnnexTestRepo()
    annex.create()
    from datalad.cmd import Runner
    runner = Runner()
    kw = dict(cwd=self.path, expect_stderr=True)
    for name in ('sub1', 'sub2'):
        runner.run(['git', 'submodule', 'add', annex.url, name], **kw)
    runner.run(['git', 'commit', '-m', 'Added sub1 and sub2.'], **kw)
    runner.run(['git', 'submodule', 'update', '--init', '--recursive'], **kw)
    # init annex in subdatasets
    for name in ('sub1', 'sub2'):
        runner.run(['git', 'annex', 'init'],
                   cwd=opj(self.path, name), expect_stderr=True)
def test_runner(tempfile):
    """Non-dry Runner executes shell commands and python calls for real."""
    runner = Runner(dry=False)
    # test non-dry command call
    cmd = 'echo Testing real run > %s' % tempfile
    runner.run(cmd)
    assert_equal(runner.commands, [],
                 "Run of: %s resulted in non-empty buffer: %s"
                 % (cmd, str(runner.commands)))
    assert_true(os.path.exists(tempfile),
                "Run of: %s resulted with non-existing file %s"
                % (cmd, tempfile))

    # test non-dry python function call
    output = runner.call(os.path.join, 'foo', 'bar')
    assert_equal(os.path.join('foo', 'bar'), output,
                 "Drycall of: os.path.join, 'foo', 'bar' returned %s" % output)
    assert_equal(str(runner.commands).find('os.path.join'), -1,
                 "Drycall of: os.path.join, 'foo', 'bar' resulted in buffer: %s"
                 % str(runner.commands))
def test_runner_dry(tempfile):
    """Dry Runner only records commands/calls and never executes them."""
    runner = Runner(dry=True)

    # test dry command call
    cmd = 'echo Testing dry run > %s' % tempfile
    ret = runner.run(cmd)
    assert_equal(("DRY", "DRY"), ret,
                 "Dry run of: %s resulted in output %s" % (cmd, ret))
    assert_greater(str(runner.commands).find('echo Testing dry run'), -1,
                   "Dry run of: %s resulted in buffer: %s"
                   % (cmd, str(runner.commands)))
    assert_false(os.path.exists(tempfile))

    # test dry python function call
    output = runner.call(os.path.join, 'foo', 'bar')
    assert_is(None, output,
              "Drycall of: os.path.join, 'foo', 'bar' returned %s" % output)
    assert_greater(str(runner.commands).find('join'), -1,
                   "Drycall of: os.path.join, 'foo', 'bar' resulted in buffer: %s"
                   % str(runner.commands))
def test_create(path):
    """Dataset creation honors description and custom git init options."""
    ds = Dataset(path)
    ds.create(
        description="funny",
        # custom git init option
        initopts=dict(shared='world'))
    ok_(ds.is_installed())
    assert_repo_status(ds.path, annex=True)

    # check default backend
    eq_(ds.config.get("annex.backends"), 'MD5E')
    eq_(ds.config.get("core.sharedrepository"), '2')
    # check description in `info`
    info_output = Runner().run(['git', 'annex', 'info'], cwd=path)
    assert_in('funny [here]', info_output[0])
    # check datset ID
    eq_(ds.config.get_value('datalad.dataset', 'id'), ds.id)
def test_create(path):
    """Dataset creation honors description, metadata types and shared access."""
    ds = Dataset(path)
    ds.create(description="funny",
              native_metadata_type=['bim', 'bam', 'bum'],
              shared_access='world')
    ok_(ds.is_installed())
    ok_clean_git(ds.path, annex=True)

    # check default backend
    eq_(ds.config.get("annex.backends"), 'MD5E')
    eq_(ds.config.get("core.sharedrepository"), '2')
    # check description in `info`
    info_output = Runner().run(['git-annex', 'info'], cwd=path)
    assert_in('funny [here]', info_output[0])
    # check datset ID
    eq_(ds.config.get_value('datalad.dataset', 'id'), ds.id)
    assert_equal(ds.config.get_value('datalad.metadata', 'nativetype'),
                 ('bim', 'bam', 'bum'))
def test_runner(tempfile):
    """Non-dry Runner really runs commands and python callables."""
    runner = Runner(dry=False)

    # test non-dry command call
    cmd = 'echo Testing real run > %s' % tempfile
    runner.run(cmd)
    buffer_repr = str(runner.commands)
    assert_equal(
        runner.commands, [],
        "Run of: %s resulted in non-empty buffer: %s" % (cmd, buffer_repr))
    assert_true(
        os.path.exists(tempfile),
        "Run of: %s resulted with non-existing file %s" % (cmd, tempfile))

    # test non-dry python function call
    output = runner.call(os.path.join, 'foo', 'bar')
    assert_equal(os.path.join('foo', 'bar'), output,
                 "Drycall of: os.path.join, 'foo', 'bar' returned %s" % output)
    buffer_repr = str(runner.commands)
    assert_equal(
        buffer_repr.find('os.path.join'), -1,
        "Drycall of: os.path.join, 'foo', 'bar' resulted in buffer: %s"
        % buffer_repr)
def test_runner_dry(tempfile):
    """Dry Runner records but does not execute commands and callables."""
    runner = Runner(dry=True)

    # test dry command call
    cmd = 'echo Testing dry run > %s' % tempfile
    ret = runner.run(cmd)
    assert_equal(("DRY", "DRY"), ret,
                 "Dry run of: %s resulted in output %s" % (cmd, ret))
    buffer_repr = str(runner.commands)
    assert_greater(
        buffer_repr.find('echo Testing dry run'), -1,
        "Dry run of: %s resulted in buffer: %s" % (cmd, buffer_repr))
    assert_false(os.path.exists(tempfile))

    # test dry python function call
    output = runner.call(os.path.join, 'foo', 'bar')
    assert_is(None, output,
              "Drycall of: os.path.join, 'foo', 'bar' returned %s" % output)
    buffer_repr = str(runner.commands)
    assert_greater(
        buffer_repr.find('join'), -1,
        "Drycall of: os.path.join, 'foo', 'bar' resulted in buffer: %s"
        % buffer_repr)
def test_create(probe, path):
    """Dataset creation works, with shared access skipped on crippled FS."""
    # only as a probe whether this FS is a crippled one
    ar = AnnexRepo(probe, create=True)
    ds = Dataset(path)
    ds.create(
        description="funny",
        # custom git init option
        initopts=dict(shared='world') if not ar.is_managed_branch() else None)
    ok_(ds.is_installed())
    assert_repo_status(ds.path, annex=True)

    # check default backend
    eq_(ds.config.get("annex.backends"), 'MD5E')
    if not ar.is_managed_branch():
        eq_(ds.config.get("core.sharedrepository"), '2')
    # check description in `info`
    info_output = Runner().run(['git', 'annex', 'info'], cwd=path)
    assert_in('funny [here]', info_output[0])
    # check datset ID
    eq_(ds.config.get_value('datalad.dataset', 'id'), ds.id)
def __call__(
        # it is optional, because `rerun` can get a recorded one
        cmd=None,
        dataset=None,
        message=None,
        rerun=False):
    """Run (or re-run) a command on a clean dataset and record the outcome.

    Generator yielding result dicts; on success ends by saving all
    modifications with a structured [DATALAD RUNCMD] commit message that
    embeds the run record as JSON.
    """
    if rerun and cmd:
        lgr.warning('Ignoring provided command in --rerun mode')
        cmd = None
    if not dataset:
        # act on the whole dataset if nothing else was specified
        dataset = get_dataset_root(curdir)
    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')
    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner
    from datalad.tests.utils import ok_clean_git

    lgr.debug('tracking command output underneath %s', ds)
    try:
        # base assumption is that the animal smells superb
        ok_clean_git(ds.path)
    except AssertionError:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message='unsaved modifications present, cannot detect changes by command')
        return

    if not cmd and not rerun:
        # TODO here we would need to recover a cmd when a rerun is attempted
        return

    if rerun:
        # pull run info out of the last commit message
        err_info = get_status_dict('run', ds=ds)
        if not ds.repo.get_hexsha():
            yield dict(
                err_info,
                status='impossible',
                message='cannot re-run command, nothing recorded')
            return
        last_commit_msg = ds.repo.repo.head.commit.message
        # structured run record embedded between the sentinel lines
        cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'
        runinfo = re.match(cmdrun_regex, last_commit_msg,
                           re.MULTILINE | re.DOTALL)
        if not runinfo:
            yield dict(
                err_info,
                status='impossible',
                message='cannot re-run command, last saved state does not look like a recorded command run')
            return
        rec_msg, runinfo = runinfo.groups()
        if message is None:
            # re-use commit message, if nothing new was given
            message = rec_msg
        try:
            runinfo = json.loads(runinfo)
        except Exception as e:
            # NOTE(review): `e.message` is py2-only; on py3 this would raise
            # AttributeError -- confirm target python version
            yield dict(
                err_info,
                status='error',
                message=('cannot re-run command, command specification is not valid JSON: %s',
                         e.message))
            return
        if 'cmd' not in runinfo:
            yield dict(
                err_info,
                status='error',
                message='cannot re-run command, command specification missing in recorded state')
            return
        cmd = runinfo['cmd']
        rec_exitcode = runinfo.get('exit', 0)
        rel_pwd = runinfo.get('pwd', None)
        if rel_pwd:
            # recording is relative to the dataset
            pwd = normpath(opj(ds.path, rel_pwd))
        else:
            rel_pwd = None  # normalize, just in case
            pwd = None

        # now we have to find out what was modified during the last run,
        # and enable re-modification ideally, we would bring back the
        # entire state of the tree with #1424, but we limit ourself to
        # file addition/not-in-place-modification for now
        to_unlock = []
        for r in ds.diff(
                recursive=True,
                revision='HEAD~1...HEAD',
                return_type='generator',
                result_renderer=None):
            if r.get('type', None) == 'file' and \
                    r.get('state', None) in ('added', 'modified'):
                r.pop('status', None)
                to_unlock.append(r)
        if to_unlock:
            for r in ds.unlock(to_unlock,
                               return_type='generator',
                               result_xfm=None):
                yield r
    else:
        # not a rerun, figure out where we are running
        pwd = ds.path
        rel_pwd = curdir

    # anticipate quoted compound shell commands
    cmd = cmd[0] if isinstance(cmd, list) and len(cmd) == 1 else cmd

    # TODO do our best to guess which files to unlock based on the command string
    # in many cases this will be impossible (but see --rerun). however,
    # generating new data (common case) will be just fine already

    # we have a clean dataset, let's run things
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just have to exitcode in the same way
        cmd_exitcode = e.code
        if not rerun or rec_exitcode != cmd_exitcode:
            # we failed during a fresh run, or in a different way during a rerun
            # the latter can easily happen if we try to alter a locked file
            #
            # let's fail here, the command could have had a typo or some
            # other undesirable condition. If we would `add` nevertheless,
            # we would need to rerun and aggregate annex content that we
            # likely don't want
            # TODO add switch to ignore failure (some commands are stupid)
            # TODO add the ability to `git reset --hard` the dataset tree on failure
            # we know that we started clean, so we could easily go back, needs gh-1424
            # to be able to do it recursively
            raise CommandError(code=cmd_exitcode)

    lgr.info("== Command exit (modification check follows) =====")

    # ammend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd

    # compose commit message
    cmd_shorty = (' '.join(cmd) if isinstance(cmd, list) else cmd)
    cmd_shorty = '{}{}'.format(
        cmd_shorty[:40],
        '...' if len(cmd_shorty) > 40 else '')
    # NOTE(review): `sort_keys`, `ensure_ascii` and `encoding` are passed to
    # str.format (where they are silently ignored as unused named fields),
    # not to json.dumps as presumably intended -- verify
    msg = '[DATALAD RUNCMD] {}\n\n=== Do not change lines below ===\n{}\n^^^ Do not change lines above ^^^'.format(
        message if message is not None else cmd_shorty,
        json.dumps(run_info, indent=1),
        sort_keys=True, ensure_ascii=False, encoding='utf-8')

    for r in ds.add('.', recursive=True, message=msg):
        yield r
def test_clone(src, tempdir):
    """Verify that all our repos are clonable via plain `git clone`."""
    cloner = Runner()
    cloner.run(["git", "clone", src, tempdir], log_online=True)
    ok_(os.path.exists(os.path.join(tempdir, ".git")))
class runner(SuprocBenchmarks):
    """Some rudimentary tests to see if there is no major slowdowns from Runner
    """

    def setup(self):
        # benchmark fixture: instantiate runners once per benchmark run
        self.runner = Runner()
        # older versions might not have it
        try:
            from datalad.cmd import GitRunner
            self.git_runner = GitRunner()
        except ImportError:
            pass

    def time_echo(self):
        # baseline cost of a trivial command via Runner
        self.runner.run("echo")

    def time_echo_gitrunner(self):
        # same trivial command via GitRunner for comparison
        self.git_runner.run("echo")

    # Following "track" measures computing overhead comparing to the simplest
    # os.system call on the same command without carrying for in/out
    unit = "% overhead"

    def _get_overhead(self, cmd, nrepeats=3, **run_kwargs):
        """Estimate overhead over running command via the simplest os.system
        and to not care about any output
        """
        # asv does not repeat tracking ones I think, so nrepeats
        overheads = []
        for _ in range(nrepeats):
            t0 = time()
            # baseline: raw shell execution with output discarded
            os.system(cmd + " >/dev/null 2>&1")
            t1 = time()
            # measured: same command through Runner
            self.runner.run(cmd, **run_kwargs)
            t2 = time()
            # percent overhead of Runner time over os.system time
            overhead = 100 * ((t2 - t1) / (t1 - t0) - 1.0)
            # print("O :", t1 - t0, t2 - t0, overhead)
            overheads.append(overhead)
        # report the mean over repeats, rounded to 2 decimals
        overhead = round(sum(overheads) / len(overheads), 2)
        #overhead = round(min(overheads), 2)
        return overhead

    def track_overhead_echo(self):
        return self._get_overhead("echo")

    # 100ms chosen below as providing some sensible stability for me.
    # at 10ms -- too much variability
    def track_overhead_100ms(self):
        return self._get_overhead("sleep 0.1")

    def track_overhead_heavyout(self):
        # run busyloop for 100ms outputing as much as it could
        return self._get_overhead(heavyout_cmd)

    def track_overhead_heavyout_online_through(self):
        return self._get_overhead(
            heavyout_cmd,
            log_stderr='offline',  # needed to would get stuck
            log_online=True)

    def track_overhead_heavyout_online_process(self):
        return self._get_overhead(
            heavyout_cmd,
            # discard stdout via a callback while streaming
            log_stdout=lambda s: '',
            log_stderr='offline',  # needed to would get stuck
            log_online=True)
def run_command(cmd, dataset=None, message=None, rerun_info=None):
    """Run `cmd` on a clean dataset and save its outcome with a run record.

    Generator yielding result dicts. When `rerun_info` is given, the
    run is a re-execution of a previously recorded command and the
    dirty-dataset check is skipped (rerun handles it itself).
    """
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    elif dataset:
        pwd = dataset.path
        rel_pwd = curdir
    else:
        # act on the whole dataset if nothing else was specified
        dataset = get_dataset_root(curdir)
        # Follow our generic semantic that if dataset is specified,
        # paths are relative to it, if not -- relative to pwd
        pwd = getpwd()
        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave handling on deciding either we
                           # deal with it or crash to checks below

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')
    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner

    lgr.debug('tracking command output underneath %s', ds)
    if not rerun_info and ds.repo.dirty:  # Rerun already takes care of this.
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('unsaved modifications present, '
                     'cannot detect changes by command'))
        return

    # anticipate quoted compound shell commands
    cmd = cmd[0] if isinstance(cmd, list) and len(cmd) == 1 else cmd

    # TODO do our best to guess which files to unlock based on the command string
    # in many cases this will be impossible (but see rerun). however,
    # generating new data (common case) will be just fine already

    # we have a clean dataset, let's run things
    exc = None
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just have to exitcode in the same way
        exc = e
        cmd_exitcode = e.code
        if rerun_info and rerun_info.get("exit", 0) != cmd_exitcode:
            # we failed in a different way during a rerun. This can easily
            # happen if we try to alter a locked file
            #
            # TODO add the ability to `git reset --hard` the dataset tree on failure
            # we know that we started clean, so we could easily go back, needs gh-1424
            # to be able to do it recursively
            raise exc

    lgr.info("== Command exit (modification check follows) =====")

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
        'chain': rerun_info["chain"] if rerun_info else [],
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd

    # compose commit message
    # NOTE(review): `sort_keys`, `ensure_ascii` and `encoding` are passed to
    # str.format (where they are silently ignored as unused named fields),
    # not to json.dumps as presumably intended -- verify
    msg = '[DATALAD RUNCMD] {}\n\n=== Do not change lines below ===\n{}\n^^^ Do not change lines above ^^^'.format(
        message if message is not None else _format_cmd_shorty(cmd),
        json.dumps(run_info, indent=1),
        sort_keys=True, ensure_ascii=False, encoding='utf-8')

    if not rerun_info and cmd_exitcode:
        # fresh run failed: stash the prepared commit message so the user
        # can still save manually, then propagate the failure
        msg_path = opj(relpath(ds.repo.repo.git_dir), "COMMIT_EDITMSG")
        with open(msg_path, "w") as ofh:
            ofh.write(msg)
        lgr.info("The command had a non-zero exit code. "
                 "If this is expected, you can save the changes with "
                 "'datalad save -r -F%s .'",
                 msg_path)
        raise exc
    else:
        for r in ds.add('.', recursive=True, message=msg):
            yield r
def __call__(sshurl, target=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None, recursive=False,
             existing='raise', shared=False):
    """Create dataset sibling(s) on a remote SSH server and optionally add
    them as siblings of the local dataset(s).

    Resolves the dataset (from `dataset` or by searching upward from CWD),
    sets up an SSH ControlMaster connection, creates/initializes a bare-ish
    git repo per (sub)dataset on the remote, configures it for pushing to a
    checked-out branch, and finally registers the sibling via AddSibling.
    """
    if sshurl is None:
        raise ValueError("""insufficient information for target creation
        (needs at least a dataset and a SSH URL).""")

    if target is None and (target_url is not None
                           or target_pushurl is not None):
        raise ValueError("""insufficient information for adding the target
        as a sibling (needs at least a name)""")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if ds is None:
        # try to find a dataset at or above CWD
        dspath = GitRepo.get_toppath(abspath(getpwd()))
        if dspath is None:
            raise ValueError("""No dataset found
                             at or above {0}.""".format(getpwd()))
        ds = Dataset(dspath)
        lgr.debug("Resolved dataset for target creation: {0}".format(ds))
    assert(ds is not None and sshurl is not None)

    if not ds.is_installed():
        raise ValueError("""Dataset {0} is not installed yet.""".format(ds))
    assert(ds.repo is not None)

    # determine target parameters:
    parsed_target = urlparse(sshurl)
    host_name = parsed_target.netloc

    # TODO: Sufficient to fail on this condition?
    if not parsed_target.netloc:
        raise ValueError("Malformed URL: {0}".format(sshurl))

    if target_dir is None:
        if parsed_target.path:
            target_dir = parsed_target.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = False
    if "%NAME" not in target_dir:
        replicate_local_structure = True

    # collect datasets to use (name -> Dataset):
    datasets = dict()
    datasets[basename(ds.path)] = ds
    if recursive:
        for subds in ds.get_dataset_handles(recursive=True):
            sub_path = opj(ds.path, subds)
            # TODO: when enhancing Dataset/*Repo classes and therefore
            # adapt to moved code, make proper distinction between name and
            # path of a submodule, which are technically different. This
            # probably will become important on windows as well as whenever
            # we want to allow for moved worktrees.
            datasets[basename(ds.path) + '/' + subds] = \
                Dataset(sub_path)

    # setup SSH Connection:
    # TODO: Make the entire setup a helper to use it when pushing via
    # publish?

    # - build control master:
    from datalad.utils import assure_dir
    not_supported_on_windows("TODO")
    from os import geteuid  # Linux specific import
    var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
    assure_dir(var_run_user_datalad)
    control_path = "%s/%s" % (var_run_user_datalad, host_name)
    control_path += ":%s" % parsed_target.port if parsed_target.port else ""

    # - start control master:
    cmd = "ssh -o ControlMaster=yes -o \"ControlPath=%s\" " \
          "-o ControlPersist=yes %s exit" % (control_path, host_name)
    lgr.debug("Try starting control master by calling:\n%s" % cmd)
    import subprocess
    proc = subprocess.Popen(cmd, shell=True)
    # NOTE(review): passing a str to communicate() requires text-mode pipes;
    # under py3 with default (bytes) streams this would raise -- confirm.
    proc.communicate(input="\n")  # why the f.. this is necessary?

    runner = Runner()
    ssh_cmd = ["ssh", "-S", control_path, host_name]

    lgr.info("Creating target datasets ...")
    for current_dataset in datasets:
        if not replicate_local_structure:
            # expand the %NAME template; '/' in hierarchical names becomes '-'
            path = target_dir.replace("%NAME",
                                      current_dataset.replace("/", "-"))
        else:
            # TODO: opj depends on local platform, not the remote one.
            # check how to deal with it. Does windows ssh server accept
            # posix paths? vice versa? Should planned SSH class provide
            # tools for this issue?
            path = normpath(opj(target_dir,
                                relpath(datasets[current_dataset].path,
                                        start=ds.path)))

        if path != '.':
            # check if target exists
            # TODO: Is this condition valid for != '.' only?
            path_exists = True
            cmd = ssh_cmd + ["ls", path]
            try:
                out, err = runner.run(cmd, expect_fail=True,
                                      expect_stderr=True)
            except CommandError as e:
                if "No such file or directory" in e.stderr and \
                        path in e.stderr:
                    path_exists = False
                else:
                    raise  # It's an unexpected failure here

            if path_exists:
                if existing == 'raise':
                    raise RuntimeError(
                        "Target directory %s already exists." % path)
                elif existing == 'skip':
                    continue
                elif existing == 'replace':
                    pass
                else:
                    raise ValueError("Do not know how to hand existing=%s" %
                                     repr(existing))

            cmd = ssh_cmd + ["mkdir", "-p", path]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Remotely creating target directory failed at "
                          "%s.\nError: %s" % (path, str(e)))
                continue

        # init git repo
        cmd = ssh_cmd + ["git", "-C", path, "init"]
        if shared:
            cmd.append("--shared=%s" % shared)
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Remotely initializing git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, str(e)))
            continue

        # check git version on remote end:
        cmd = ssh_cmd + ["git", "version"]
        try:
            out, err = runner.run(cmd)
            # NOTE(review): str.lstrip strips a *character set*, not the
            # literal prefix "git version " -- fragile; also the `< "2.4"`
            # comparison is lexicographic and would misjudge e.g. "2.10".
            git_version = out.lstrip("git version").strip()
            lgr.debug("Detected git version on server: %s" % git_version)
            if git_version < "2.4":
                lgr.error("Git version >= 2.4 needed to configure remote."
                          " Version detected on server: %s\nSkipping ..."
                          % git_version)
                continue
        except CommandError as e:
            lgr.warning(
                "Failed to determine git version on remote.\n"
                "Error: {0}\nTrying to configure anyway "
                "...".format(e.message))

        # allow for pushing to checked out branch
        cmd = ssh_cmd + ["git", "-C", path, "config",
                         "receive.denyCurrentBranch",
                         "updateInstead"]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.warning("git config failed at remote location %s.\n"
                        "You will not be able to push to checked out "
                        "branch." % path)

        # enable post-update hook:
        cmd = ssh_cmd + ["mv",
                         opj(path, ".git/hooks/post-update.sample"),
                         opj(path, ".git/hooks/post-update")]
        try:
            runner.run(cmd)
        except CommandError as e:
            # NOTE(review): e.message is py2-only; confirm target interpreter
            lgr.error("Failed to enable post update hook.\n"
                      "Error: %s" % e.message)

        # initially update server info "manually":
        cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Failed to update server info.\n"
                      "Error: %s" % e.message)

    # stop controlmaster (close ssh connection):
    cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
    out, err = runner.run(cmd, expect_stderr=True)

    if target:
        # add the sibling(s):
        if target_url is None:
            target_url = sshurl
        if target_pushurl is None:
            target_pushurl = sshurl
        result_adding = AddSibling()(dataset=ds,
                                     name=target,
                                     url=target_url,
                                     pushurl=target_pushurl,
                                     recursive=recursive,
                                     force=existing in {'replace'})
class runner(SuprocBenchmarks):
    """Rudimentary asv benchmarks to guard against major Runner slowdowns."""

    def setup(self):
        self.runner = Runner()
        # older versions might not have it
        try:
            from datalad.cmd import GitRunner
            self.git_runner = GitRunner()
        except ImportError:
            pass

    def time_echo(self):
        self.runner.run("echo")

    def time_echo_gitrunner(self):
        self.git_runner.run("echo")

    # The "track" benchmarks below report the relative overhead of going
    # through Runner compared to a bare os.system of the same command,
    # ignoring any input/output handling.
    unit = "% overhead"

    def _get_overhead(self, cmd, nrepeats=3, **run_kwargs):
        """Estimate Runner overhead vs. a plain os.system call of `cmd`.

        Returns the mean percent overhead over `nrepeats` repetitions.
        """
        # asv does not repeat tracking ones I think, so nrepeats
        samples = []
        for _ in range(nrepeats):
            baseline_start = time()
            os.system(cmd + " >/dev/null 2>&1")
            baseline_end = time()
            self.runner.run(cmd, **run_kwargs)
            runner_end = time()
            baseline = baseline_end - baseline_start
            via_runner = runner_end - baseline_end
            samples.append(100 * (via_runner / baseline - 1.0))
        # mean of the samples; min() was considered as an alternative
        return round(sum(samples) / len(samples), 2)

    def track_overhead_echo(self):
        return self._get_overhead("echo")

    # 100ms chosen below as providing some sensible stability for me.
    # at 10ms -- too much variability
    def track_overhead_100ms(self):
        return self._get_overhead("sleep 0.1")

    def track_overhead_heavyout(self):
        # run busyloop for 100ms outputing as much as it could
        return self._get_overhead(heavyout_cmd)

    def track_overhead_heavyout_online_through(self):
        return self._get_overhead(heavyout_cmd,
                                  log_stderr='offline',  # needed to would get stuck
                                  log_online=True)

    def track_overhead_heavyout_online_process(self):
        return self._get_overhead(heavyout_cmd,
                                  log_stdout=lambda s: '',
                                  log_stderr='offline',  # needed to would get stuck
                                  log_online=True)

    # # Probably not really interesting, and good lord wobbles around 0
    # def track_overhead_heavyout_offline(self):
    #     return self._get_overhead(heavyout_cmd,
    #                               log_stdout='offline',
    #                               log_stderr='offline')

    # TODO: track the one with in/out, i.e. for those BatchedProcesses
def populate(self): super(NestedDataset, self).populate() ds = SubmoduleDataset() ds.create() from datalad.cmd import Runner runner = Runner() kw = dict(expect_stderr=True) runner.run(['git', 'submodule', 'add', ds.url, 'subdataset'], cwd=self.path, **kw) runner.run(['git', 'submodule', 'add', ds.url, 'subsubdataset'], cwd=opj(self.path, 'subdataset'), **kw) runner.run(['git', 'commit', '-m', 'Added subdataset.'], cwd=opj(self.path, 'subdataset'), **kw) runner.run(['git', 'commit', '-a', '-m', 'Added subdatasets.'], cwd=self.path, **kw) runner.run(['git', 'submodule', 'update', '--init', '--recursive'], cwd=self.path, **kw) # init all annexes for s in ('', 'subdataset', opj('subdataset', 'subsubdataset')): runner.run(['git', 'annex', 'init'], cwd=opj(self.path, s), expect_stderr=True)
def __call__(name, url=None, dataset=None, call_fmt=None, image=None,
             update=False):
    """Add (or update) a containerized environment to a dataset.

    Obtains the image (docker hub, singularity hub/`shub://`, `docker://`
    build, local file copy, or plain URL via git-annex), registers it under
    ``datalad.containers.<name>.*`` in the dataset config, and saves.
    Yields DataLad result records.
    """
    if not name:
        raise InsufficientArgumentsError("`name` argument is required")

    ds = require_dataset(dataset, check_installed=True,
                         purpose='add container')
    runner = Runner()

    # prevent madness in the config file
    if not re.match(r'^[0-9a-zA-Z-]+$', name):
        raise ValueError(
            "Container names can only contain alphanumeric characters "
            "and '-', got: '{}'".format(name))

    cfgbasevar = "datalad.containers.{}".format(name)
    if cfgbasevar + ".image" in ds.config:
        if not update:
            yield get_status_dict(
                action="containers_add", ds=ds, logger=lgr,
                status="impossible",
                message=("Container named %r already exists. "
                         "Use --update to reconfigure.",
                         name))
            return

        if not (url or image or call_fmt):
            # No updated values were provided. See if an update url is
            # configured (currently relevant only for Singularity Hub).
            url = ds.config.get(cfgbasevar + ".updateurl")
            if not url:
                yield get_status_dict(
                    action="containers_add", ds=ds, logger=lgr,
                    status="impossible",
                    message="No values to update specified")
                return

        # fall back on the currently configured values for anything
        # not explicitly provided
        call_fmt = call_fmt or ds.config.get(cfgbasevar + ".cmdexec")
        image = image or ds.config.get(cfgbasevar + ".image")

    if not image:
        loc_cfg_var = "datalad.containers.location"
        # TODO: We should provide an entry point (or sth similar) for extensions
        # to get config definitions into the ConfigManager. In other words an
        # easy way to extend definitions in datalad's common_cfgs.py.
        container_loc = \
            ds.config.obtain(
                loc_cfg_var,
                where=definitions[loc_cfg_var]['destination'],
                # if not False it would actually modify the
                # dataset config file -- undesirable
                store=False,
                default=definitions[loc_cfg_var]['default'],
                dialog_type=definitions[loc_cfg_var]['ui'][0],
                valtype=definitions[loc_cfg_var]['type'],
                **definitions[loc_cfg_var]['ui'][1]
            )
        image = op.join(ds.path, container_loc, name, 'image')
    else:
        image = op.join(ds.path, image)

    result = get_status_dict(
        action="containers_add",
        path=image,
        type="file",
        logger=lgr,
    )

    if call_fmt is None:
        # maybe built in knowledge can help
        call_fmt = _guess_call_fmt(ds, name, url)

    # collect bits for a final and single save() call
    to_save = []
    imgurl = url
    was_updated = False
    if url:
        if update and op.lexists(image):
            was_updated = True
            # XXX: check=False is used to avoid dropping the image. It
            # should use drop=False if remove() gets such an option (see
            # DataLad's gh-2673).
            for r in ds.remove(image, save=False,
                               check=False,
                               return_type="generator"):
                yield r

        imgurl = _resolve_img_url(url)
        lgr.debug('Attempt to obtain container image from: %s', imgurl)
        if url.startswith("dhub://"):
            # pull via a local docker installation and dump the image
            from .adapters import docker

            docker_image = url[len("dhub://"):]
            lgr.debug("Running 'docker pull %s and saving image to %s",
                      docker_image, image)
            runner.run(["docker", "pull", docker_image])
            docker.save(docker_image, image)
        elif url.startswith("docker://"):
            # build a Singularity image from a docker source
            image_dir, image_basename = op.split(image)
            if not image_basename:
                raise ValueError("No basename in path {}".format(image))
            if image_dir and not op.exists(image_dir):
                os.makedirs(image_dir)

            lgr.info("Building Singularity image for %s "
                     "(this may take some time)",
                     url)
            runner.run(["singularity", "build", image_basename, url],
                       cwd=image_dir or None)
        elif op.exists(url):
            lgr.info("Copying local file %s to %s", url, image)
            image_dir = op.dirname(image)
            if image_dir and not op.exists(image_dir):
                os.makedirs(image_dir)
            copyfile(url, image)
        else:
            # generic URL: let git-annex fetch it; for shub:// make sure
            # the datalad special remote is available first
            if _HAS_SHUB_DOWNLOADER and url.startswith('shub://'):
                _ensure_datalad_remote(ds.repo)

            try:
                ds.repo.add_url_to_file(image, imgurl)
            except Exception as e:
                result["status"] = "error"
                result["message"] = str(e)
                yield result
        # TODO do we have to take care of making the image executable
        # if --call_fmt is not provided?
        to_save.append(image)
    # continue despite a remote access failure, the following config
    # setting will enable running the command again with just the name
    # given to ease a re-run
    if not op.lexists(image):
        result["status"] = "error"
        result["message"] = ('no image at %s', image)
        yield result
        return

    # store configs
    if imgurl != url:
        # store originally given URL, as it resolves to something
        # different and maybe can be used to update the container
        # at a later point in time
        ds.config.set("{}.updateurl".format(cfgbasevar), url)
    # force store the image, and prevent multiple entries
    ds.config.set(
        "{}.image".format(cfgbasevar),
        # relative path makes the dataset relocatable
        op.relpath(image, start=ds.path),
        force=True)
    if call_fmt:
        ds.config.set("{}.cmdexec".format(cfgbasevar), call_fmt, force=True)
    # store changes
    to_save.append(op.join(".datalad", "config"))
    for r in ds.save(
            path=to_save,
            message="[DATALAD] {do} containerized environment '{name}'".format(
                do="Update" if was_updated else "Configure",
                name=name)):
        yield r
    result["status"] = "ok"
    yield result
def test_clone(src, tempdir): # Verify that all our repos are clonable r = Runner() output = r.run(["git", "clone", src, tempdir], log_online=True) #status, output = getstatusoutput("git clone %(src)s %(tempdir)s" % locals()) ok_(os.path.exists(os.path.join(tempdir, ".git")))
def __call__(dataset=None, name=None, url=None,
             pushurl=None, recursive=False, force=False):
    """Register a sibling (git remote) `name` with `url`/`pushurl` on a
    dataset and, optionally, on all its subdatasets.

    `%NAME` in the URLs is expanded per-repository; without it the
    subdataset URLs are derived by joining the relative subdataset path.
    Conflicting pre-existing remotes raise unless `force` is given.
    Returns the list of repositories where the sibling was (re)configured.
    """
    # TODO: Detect malformed URL and fail?

    if name is None or (url is None and pushurl is None):
        raise ValueError("""insufficient information to add a sibling
            (needs at least a dataset, a name and an URL).""")
    if url is None:
        # fall back: fetch URL mirrors the push URL
        url = pushurl

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if ds is None:
        # try to find a dataset at or above CWD
        dspath = GitRepo.get_toppath(abspath(getpwd()))
        if dspath is None:
            raise ValueError("No dataset found at or above {0}.".format(
                getpwd()))
        ds = Dataset(dspath)
        lgr.debug("Resolved dataset for target creation: {0}".format(ds))

    assert (ds is not None and name is not None and url is not None)

    if not ds.is_installed():
        raise ValueError("Dataset {0} is not installed yet.".format(ds))
    assert (ds.repo is not None)

    ds_basename = basename(ds.path)
    # map "name" (ds_basename[/subds]) -> dict with 'repo' and later
    # the computed 'url'/'pushurl'
    repos = {ds_basename: {'repo': ds.repo}}
    if recursive:
        for subds in ds.get_dataset_handles(recursive=True):
            sub_path = opj(ds.path, subds)
            repos[ds_basename + '/' + subds] = {
                #                repos[subds] = {
                'repo': GitRepo(sub_path, create=False)
            }

    # Note: This is copied from create_publication_target_sshwebserver
    # as it is the same logic as for its target_dir.
    # TODO: centralize and generalize template symbol handling
    # TODO: Check pushurl for template symbols too. Probably raise if only
    #       one of them uses such symbols

    replicate_local_structure = False
    if "%NAME" not in url:
        replicate_local_structure = True

    for repo in repos:
        if not replicate_local_structure:
            # hierarchical names are flattened with '-' for %NAME expansion
            repos[repo]['url'] = url.replace("%NAME",
                                             repo.replace("/", "-"))
            if pushurl:
                repos[repo]['pushurl'] = pushurl.replace(
                    "%NAME", repo.replace("/", "-"))
        else:
            repos[repo]['url'] = url
            if pushurl:
                repos[repo]['pushurl'] = pushurl

            if repo != ds_basename:
                # append the relative subdataset path to the base URL
                repos[repo]['url'] = _urljoin(repos[repo]['url'],
                                              repo[len(ds_basename) + 1:])
                if pushurl:
                    repos[repo]['pushurl'] = _urljoin(
                        repos[repo]['pushurl'],
                        repo[len(ds_basename) + 1:])

    # collect existing remotes:
    already_existing = list()
    conflicting = list()
    for repo in repos:
        if name in repos[repo]['repo'].git_get_remotes():
            already_existing.append(repo)
            lgr.debug("""Remote '{0}' already exists
                      in '{1}'.""".format(name, repo))

            existing_url = repos[repo]['repo'].git_get_remote_url(name)
            existing_pushurl = \
                repos[repo]['repo'].git_get_remote_url(name, push=True)

            # a trailing-slash-insensitive mismatch of either URL counts
            # as a conflict, as does a pushurl that should exist but doesn't
            if repos[repo]['url'].rstrip('/') != existing_url.rstrip('/') \
                    or (pushurl and existing_pushurl and
                        repos[repo]['pushurl'].rstrip('/') !=
                        existing_pushurl.rstrip('/')) \
                    or (pushurl and not existing_pushurl):
                conflicting.append(repo)

    if not force and conflicting:
        raise RuntimeError("Sibling '{0}' already exists with conflicting"
                           " URL for {1} dataset(s). {2}".format(
                               name, len(conflicting), conflicting))

    runner = Runner()
    successfully_added = list()
    for repo in repos:
        if repo in already_existing:
            if repo not in conflicting:
                lgr.debug("Skipping {0}. Nothing to do.".format(repo))
                continue
            # rewrite url
            cmd = ["git", "remote", "set-url", name, repos[repo]['url']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        else:
            # add the remote
            cmd = ["git", "remote", "add", name, repos[repo]['url']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        if pushurl:
            cmd = ["git", "remote", "set-url", "--push", name,
                   repos[repo]['pushurl']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        successfully_added.append(repo)

    return successfully_added
def __call__(sshurl, target=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None, recursive=False,
             existing='raise', shared=False):
    """Create dataset sibling(s) on an SSH-reachable server and optionally
    register them as siblings of the local dataset(s).

    Near-duplicate of the implementation above: resolves the dataset, opens
    an SSH ControlMaster, creates/initializes one remote git repo per
    (sub)dataset, configures push-to-checked-out-branch, then delegates
    sibling registration to AddSibling.
    """
    if sshurl is None:
        raise ValueError("""insufficient information for target creation
        (needs at least a dataset and a SSH URL).""")

    if target is None and (target_url is not None
                           or target_pushurl is not None):
        raise ValueError("""insufficient information for adding the target
        as a sibling (needs at least a name)""")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if ds is None:
        # try to find a dataset at or above CWD
        dspath = GitRepo.get_toppath(abspath(getpwd()))
        if dspath is None:
            raise ValueError("""No dataset found
                             at or above {0}.""".format(getpwd()))
        ds = Dataset(dspath)
        lgr.debug("Resolved dataset for target creation: {0}".format(ds))
    assert (ds is not None and sshurl is not None)

    if not ds.is_installed():
        raise ValueError(
            """Dataset {0} is not installed yet.""".format(ds))
    assert (ds.repo is not None)

    # determine target parameters:
    parsed_target = urlparse(sshurl)
    host_name = parsed_target.netloc

    # TODO: Sufficient to fail on this condition?
    if not parsed_target.netloc:
        raise ValueError("Malformed URL: {0}".format(sshurl))

    if target_dir is None:
        if parsed_target.path:
            target_dir = parsed_target.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = False
    if "%NAME" not in target_dir:
        replicate_local_structure = True

    # collect datasets to use (name -> Dataset):
    datasets = dict()
    datasets[basename(ds.path)] = ds
    if recursive:
        for subds in ds.get_dataset_handles(recursive=True):
            sub_path = opj(ds.path, subds)
            # TODO: when enhancing Dataset/*Repo classes and therefore
            # adapt to moved code, make proper distinction between name and
            # path of a submodule, which are technically different. This
            # probably will become important on windows as well as whenever
            # we want to allow for moved worktrees.
            datasets[basename(ds.path) + '/' + subds] = \
                Dataset(sub_path)

    # setup SSH Connection:
    # TODO: Make the entire setup a helper to use it when pushing via
    # publish?

    # - build control master:
    from datalad.utils import assure_dir
    not_supported_on_windows("TODO")
    from os import geteuid  # Linux specific import
    var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
    assure_dir(var_run_user_datalad)
    control_path = "%s/%s" % (var_run_user_datalad, host_name)
    control_path += ":%s" % parsed_target.port if parsed_target.port else ""

    # - start control master:
    cmd = "ssh -o ControlMaster=yes -o \"ControlPath=%s\" " \
          "-o ControlPersist=yes %s exit" % (control_path, host_name)
    lgr.debug("Try starting control master by calling:\n%s" % cmd)
    import subprocess
    proc = subprocess.Popen(cmd, shell=True)
    # NOTE(review): a str `input` needs text-mode pipes; bytes expected on
    # py3 default streams -- confirm target interpreter.
    proc.communicate(input="\n")  # why the f.. this is necessary?

    runner = Runner()
    ssh_cmd = ["ssh", "-S", control_path, host_name]

    lgr.info("Creating target datasets ...")
    for current_dataset in datasets:
        if not replicate_local_structure:
            # expand %NAME; '/' in hierarchical names becomes '-'
            path = target_dir.replace("%NAME",
                                      current_dataset.replace("/", "-"))
        else:
            # TODO: opj depends on local platform, not the remote one.
            # check how to deal with it. Does windows ssh server accept
            # posix paths? vice versa? Should planned SSH class provide
            # tools for this issue?
            path = normpath(
                opj(target_dir,
                    relpath(datasets[current_dataset].path,
                            start=ds.path)))

        if path != '.':
            # check if target exists
            # TODO: Is this condition valid for != '.' only?
            path_exists = True
            cmd = ssh_cmd + ["ls", path]
            try:
                out, err = runner.run(cmd, expect_fail=True,
                                      expect_stderr=True)
            except CommandError as e:
                if "No such file or directory" in e.stderr and \
                        path in e.stderr:
                    path_exists = False
                else:
                    raise  # It's an unexpected failure here

            if path_exists:
                if existing == 'raise':
                    raise RuntimeError(
                        "Target directory %s already exists." % path)
                elif existing == 'skip':
                    continue
                elif existing == 'replace':
                    pass
                else:
                    raise ValueError(
                        "Do not know how to hand existing=%s" %
                        repr(existing))

            cmd = ssh_cmd + ["mkdir", "-p", path]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Remotely creating target directory failed at "
                          "%s.\nError: %s" % (path, str(e)))
                continue

        # init git repo
        cmd = ssh_cmd + ["git", "-C", path, "init"]
        if shared:
            cmd.append("--shared=%s" % shared)
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Remotely initializing git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, str(e)))
            continue

        # check git version on remote end:
        cmd = ssh_cmd + ["git", "version"]
        try:
            out, err = runner.run(cmd)
            # NOTE(review): lstrip strips a character set, not the prefix
            # "git version "; the `< "2.4"` check is lexicographic and
            # misorders versions like "2.10" -- confirm and fix upstream.
            git_version = out.lstrip("git version").strip()
            lgr.debug("Detected git version on server: %s" % git_version)
            if git_version < "2.4":
                lgr.error("Git version >= 2.4 needed to configure remote."
                          " Version detected on server: %s\nSkipping ..."
                          % git_version)
                continue
        except CommandError as e:
            lgr.warning("Failed to determine git version on remote.\n"
                        "Error: {0}\nTrying to configure anyway "
                        "...".format(e.message))

        # allow for pushing to checked out branch
        cmd = ssh_cmd + ["git", "-C", path, "config",
                         "receive.denyCurrentBranch",
                         "updateInstead"]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.warning("git config failed at remote location %s.\n"
                        "You will not be able to push to checked out "
                        "branch." % path)

        # enable post-update hook:
        cmd = ssh_cmd + ["mv",
                         opj(path, ".git/hooks/post-update.sample"),
                         opj(path, ".git/hooks/post-update")]
        try:
            runner.run(cmd)
        except CommandError as e:
            # NOTE(review): e.message is py2-only
            lgr.error("Failed to enable post update hook.\n"
                      "Error: %s" % e.message)

        # initially update server info "manually":
        cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
        try:
            runner.run(cmd)
        except CommandError as e:
            lgr.error("Failed to update server info.\n"
                      "Error: %s" % e.message)

    # stop controlmaster (close ssh connection):
    cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
    out, err = runner.run(cmd, expect_stderr=True)

    if target:
        # add the sibling(s):
        if target_url is None:
            target_url = sshurl
        if target_pushurl is None:
            target_pushurl = sshurl
        result_adding = AddSibling()(dataset=ds,
                                     name=target,
                                     url=target_url,
                                     pushurl=target_pushurl,
                                     recursive=recursive,
                                     force=existing in {'replace'})
class ConfigManager(object):
    """Thin wrapper around `git-config` with support for a dataset
    configuration.

    The general idea is to have an object that is primarily used to
    read/query configuration options.  Upon creation, current configuration
    is read via one (or max two, in the case of the presence of
    dataset-specific configuration) calls to `git config`.  If this class
    is initialized with a Dataset instance, it supports reading and writing
    configuration from ``.datalad/config`` inside a dataset too.  This file
    is committed to Git and hence useful to ship certain configuration
    items with a dataset.

    The API aims to provide the most significant read-access API of a
    dictionary, the Python ConfigParser, and GitPython's config parser
    implementations.

    This class is presently not capable of efficiently writing multiple
    configuration items at once.  Instead, each modification results in a
    dedicated call to `git config`.  This author thinks this is OK, as he
    cannot think of a situation where a large number of items need to be
    written during normal operation.  If such need arises, various
    solutions are possible (via GitPython, or an independent writer).

    Any DATALAD_* environment variable is also presented as a configuration
    item.  Settings read from environment variables are not stored in any
    of the configuration files, but are read dynamically from the
    environment at each `reload()` call.  Their values take precedence over
    any specification in configuration files.

    Parameters
    ----------
    dataset : Dataset, optional
      If provided, all `git config` calls are executed in this dataset's
      directory.  Moreover, any modifications are, by default, directed to
      this dataset's configuration file (which will be created on demand)
    dataset_only : bool
      If True, configuration items are only read from a dataset's
      persistent configuration file, if any present (the one in
      ``.datalad/config``, not ``.git/config``).
    """
    def __init__(self, dataset=None, dataset_only=False):
        # store in a simple dict
        # no subclassing, because we want to be largely read-only, and implement
        # config writing separately
        self._store = {}
        self._dataset = dataset
        self._dataset_only = dataset_only
        # Since configs could contain sensitive information, to prevent
        # any "facilitated" leakage -- just disable logging of outputs for
        # this runner
        run_kwargs = dict(log_outputs=False)
        if dataset is not None:
            # make sure we run the git config calls in the dataset
            # to pick up the right config files
            run_kwargs['cwd'] = dataset.path
        self._runner = Runner(**run_kwargs)
        self.reload()

    def reload(self):
        """Reload all configuration items from the configured sources"""
        self._store = {}
        # 2-step strategy:
        #   - load datalad dataset config from dataset
        #   - load git config from all supported by git sources
        # in doing so we always stay compatible with where Git gets its
        # config from, but also allow to override persistent information
        # from dataset locally or globally
        if self._dataset:
            # now any dataset config
            dscfg_fname = opj(self._dataset.path, '.datalad', 'config')
            if exists(dscfg_fname):
                stdout, stderr = self._run(['-z', '-l', '--file', dscfg_fname],
                                           log_stderr=True)
                # overwrite existing value, do not amend to get multi-line
                # values
                # NOTE(review): comment says "overwrite", but replace=False
                # is passed here (and replace=True below) -- confirm the
                # intended semantics of `replace` in _parse_gitconfig_dump
                self._store = _parse_gitconfig_dump(
                    stdout, self._store, replace=False)

        if not self._dataset_only:
            stdout, stderr = self._run(['-z', '-l'], log_stderr=True)
            self._store = _parse_gitconfig_dump(
                stdout, self._store, replace=True)

            # override with environment variables
            self._store = _parse_env(self._store)

    @_where_reload
    def obtain(self, var, default=None, dialog_type=None, valtype=None,
               store=False, where=None, reload=True, **kwargs):
        """
        Convenience method to obtain settings interactively, if needed

        A UI will be used to ask for user input in interactive sessions.
        Questions to ask, and additional explanations can be passed directly
        as arguments, or retrieved from a list of pre-configured items.

        Additionally, this method allows for type conversion and storage
        of obtained settings. Both aspects can also be pre-configured.

        Parameters
        ----------
        var : str
          Variable name including any section like `git config` expects them,
          e.g. 'core.editor'
        default : any type
          In interactive sessions and if `store` is True, this default value
          will be presented to the user for confirmation (or modification).
          In all other cases, this value will be silently assigned unless
          there is an existing configuration setting.
        dialog_type : {'question', 'yesno', None}
          Which dialog type to use in interactive sessions. If `None`,
          pre-configured UI options are used.
        store : bool
          Whether to store the obtained value (or default)
        %s
        `**kwargs`
          Additional arguments for the UI function call, such as a question
          `text`.
        """
        # do local import, as this module is imported prominently and
        # could theoretically import all kind of weird things for type
        # conversion
        from datalad.interface.common_cfg import definitions as cfg_defs
        # fetch what we know about this variable
        cdef = cfg_defs.get(var, {})
        # type conversion setup
        if valtype is None and 'type' in cdef:
            valtype = cdef['type']
        if valtype is None:
            # identity conversion as a fallback
            valtype = lambda x: x

        # any default?
        if default is None and 'default' in cdef:
            default = cdef['default']

        _value = None
        if var in self:
            # nothing needs to be obtained, it is all here already
            _value = self[var]
        elif store is False and default is not None:
            # nothing will be stored, and we have a default -> no user confirmation
            # we cannot use logging, because we want to use the config to configure
            # the logging
            #lgr.debug('using default {} for config setting {}'.format(default, var))
            _value = default

        if _value is not None:
            # we got everything we need and can exit early
            try:
                return valtype(_value)
            except Exception as e:
                raise ValueError(
                    "value '{}' of existing configuration for '{}' cannot be "
                    "converted to the desired type '{}' ({})".format(
                        _value, var, valtype, exc_str(e)))

        # now we need to try to obtain something from the user
        from datalad.ui import ui

        # configure UI
        dialog_opts = kwargs
        if dialog_type is None:  # no override
            # check for common knowledge on how to obtain a value
            if 'ui' in cdef:
                dialog_type = cdef['ui'][0]
                # pull standard dialog settings
                dialog_opts = cdef['ui'][1]
                # update with input
                dialog_opts.update(kwargs)

        if (not ui.is_interactive or dialog_type is None) and default is None:
            raise RuntimeError(
                "cannot obtain value for configuration item '{}', "
                "not preconfigured, no default, no UI available".format(var))

        if not hasattr(ui, dialog_type):
            raise ValueError("UI '{}' does not support dialog type '{}'".format(
                ui, dialog_type))

        # configure storage destination, if needed
        if store:
            if where is None and 'destination' in cdef:
                where = cdef['destination']
            if where is None:
                raise ValueError(
                    "request to store configuration item '{}', but no "
                    "storage destination specified".format(var))

        # obtain via UI
        dialog = getattr(ui, dialog_type)
        _value = dialog(default=default, **dialog_opts)

        if _value is None:
            # we got nothing
            if default is None:
                raise RuntimeError(
                    "could not obtain value for configuration item '{}', "
                    "not preconfigured, no default".format(var))
            # XXX maybe we should return default here, even it was returned
            # from the UI -- if that is even possible
            # NOTE(review): when the dialog returns None and a default
            # exists, execution falls through with _value=None -- confirm
            # valtype(None) below is the intended outcome

        # execute type conversion before storing to check that we got
        # something that looks like what we want
        try:
            value = valtype(_value)
        except Exception as e:
            raise ValueError(
                "cannot convert user input `{}` to desired type ({})".format(
                    _value, exc_str(e)))
            # XXX we could consider "looping" until we have a value of
            # proper type in case of a user typo...

        if store:
            # store value as it was before any conversion, needs to be str
            # anyway
            # needs string conversion nevertheless, because default could come
            # in as something else
            self.add(var, '{}'.format(_value), where=where, reload=reload)
        return value

    #
    # Compatibility with dict API
    #
    def __len__(self):
        return len(self._store)

    def __getitem__(self, key):
        return self._store.__getitem__(key)

    def __contains__(self, key):
        return self._store.__contains__(key)

    def keys(self):
        """Returns list of configuration item names"""
        return self._store.keys()

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        return self._store.get(key, default)

    #
    # Compatibility with ConfigParser API
    #
    def sections(self):
        """Returns a list of the sections available"""
        return list(set([cfg_section_regex.match(k).group(1)
                         for k in self._store]))

    def options(self, section):
        """Returns a list of options available in the specified section."""
        opts = []
        for k in self._store:
            sec, opt = cfg_sectionoption_regex.match(k).groups()
            if sec == section:
                opts.append(opt)
        return opts

    def has_section(self, section):
        """Indicates whether a section is present in the configuration"""
        for k in self._store:
            if k.startswith(section):
                return True
        return False

    def has_option(self, section, option):
        """If the given section exists, and contains the given option"""
        for k in self._store:
            sec, opt = cfg_sectionoption_regex.match(k).groups()
            if sec == section and opt == option:
                return True
        return False

    def getint(self, section, option):
        """A convenience method which coerces the option value to an integer"""
        return int(self.get_value(section, option))

    def getbool(self, section, option, default=None):
        """A convenience method which coerces the option value to a bool

        Values "on", "yes", "true" and any int!=0 are considered True
        Values which evaluate to bool False, "off", "no", "false" are
        considered False
        TypeError is raised for other values.
        """
        val = self.get_value(section, option, default=default)
        return anything2bool(val)

    def getfloat(self, section, option):
        """A convenience method which coerces the option value to a float"""
        return float(self.get_value(section, option))

    # this is a hybrid of ConfigParser and dict API
    def items(self, section=None):
        """Return a list of (name, value) pairs for each option

        Optionally limited to a given section.
        """
        if section is None:
            return self._store.items()
        return [(k, v) for k, v in self._store.items()
                if cfg_section_regex.match(k).group(1) == section]

    #
    # Compatibility with GitPython's ConfigParser
    #
    def get_value(self, section, option, default=None):
        """Like `get()`, but with an optional default value

        If the default is not None, the given default value will be returned
        in case the option did not exist. This behavior imitates GitPython's
        config parser.
        """
        try:
            return self['.'.join((section, option))]
        except KeyError as e:
            # this strange dance is needed because gitpython does it this way
            if default is not None:
                return default
            else:
                raise e

    #
    # Modify configuration (proxy respective git-config call)
    #
    @_where_reload
    def _run(self, args, where=None, reload=False, **kwargs):
        """Centralized helper to run "git config" calls

        Parameters
        ----------
        args : list
          Arguments to pass for git config
        %s
        **kwargs
          Keywords arguments for Runner's call
        """
        if where:
            args = self._get_location_args(where) + args
        out = self._runner.run(['git', 'config'] + args, **kwargs)
        if reload:
            self.reload()
        return out

    def _get_location_args(self, where, args=None):
        """Translate a `where` label into the matching git-config CLI args"""
        if args is None:
            args = []
        cfg_labels = ('dataset', 'local', 'global')
        if where not in cfg_labels:
            raise ValueError(
                "unknown configuration label '{}' (not in {})".format(
                    where, cfg_labels))
        if where == 'dataset':
            if not self._dataset:
                raise ValueError(
                    'ConfigManager cannot store to configuration to dataset, '
                    'none specified')
            # create an empty config file if none exists, `git config` will
            # fail otherwise
            dscfg_dirname = opj(self._dataset.path, '.datalad')
            dscfg_fname = opj(dscfg_dirname, 'config')
            if not exists(dscfg_dirname):
                os.makedirs(dscfg_dirname)
            if not exists(dscfg_fname):
                open(dscfg_fname, 'w').close()
            args.extend(['--file',
                         opj(self._dataset.path, '.datalad', 'config')])
        elif where == 'global':
            args.append('--global')
        elif where == 'local':
            args.append('--local')
        return args

    @_where_reload
    def add(self, var, value, where='dataset', reload=True):
        """Add a configuration variable and value

        Parameters
        ----------
        var : str
          Variable name including any section like `git config` expects them,
          e.g. 'core.editor'
        value : str
          Variable value
        %s"""
        self._run(['--add', var, value], where=where, reload=reload,
                  log_stderr=True)

    @_where_reload
    def rename_section(self, old, new, where='dataset', reload=True):
        """Rename a configuration section

        Parameters
        ----------
        old : str
          Name of the section to rename.
        new : str
          Name of the section to rename to.
        %s"""
        self._run(['--rename-section', old, new], where=where, reload=reload)

    @_where_reload
    def remove_section(self, sec, where='dataset', reload=True):
        """Remove a configuration section

        Parameters
        ----------
        sec : str
          Name of the section to remove.
        %s"""
        self._run(['--remove-section', sec], where=where, reload=reload)

    @_where_reload
    def unset(self, var, where='dataset', reload=True):
        """Remove all occurrences of a variable

        Parameters
        ----------
        var : str
          Name of the variable to remove
        %s"""
        # use unset all as it is simpler for now
        self._run(['--unset-all', var], where=where, reload=reload)
class ConfigManager(object):
    """Thin wrapper around `git-config` with support for a dataset configuration.

    The general idea is to have an object that is primarily used to read/query
    configuration options. Upon creation, current configuration is read via one
    (or max two, in the case of the presence of dataset-specific configuration)
    calls to `git config`. If this class is initialized with a Dataset
    instance, it supports reading and writing configuration from
    ``.datalad/config`` inside a dataset too. This file is committed to Git and
    hence useful to ship certain configuration items with a dataset.

    The API aims to provide the most significant read-access API of a
    dictionary, the Python ConfigParser, and GitPython's config parser
    implementations.

    This class is presently not capable of efficiently writing multiple
    configuration items at once. Instead, each modification results in a
    dedicated call to `git config`. This author thinks this is OK, as he
    cannot think of a situation where a large number of items need to be
    written during normal operation. If such need arises, various solutions are
    possible (via GitPython, or an independent writer).

    Any DATALAD_* environment variable is also presented as a configuration
    item. Settings read from environment variables are not stored in any of the
    configuration files, but are read dynamically from the environment at each
    `reload()` call. Their values take precedence over any specification in
    configuration files.

    Parameters
    ----------
    dataset : Dataset, optional
      If provided, all `git config` calls are executed in this dataset's
      directory. Moreover, any modifications are, by default, directed to
      this dataset's configuration file (which will be created on demand)
    dataset_only : bool
      If True, configuration items are only read from a dataset's persistent
      configuration file, if any present (the one in ``.datalad/config``, not
      ``.git/config``).
    """
    def __init__(self, dataset=None, dataset_only=False):
        # store in a simple dict
        # no subclassing, because we want to be largely read-only, and implement
        # config writing separately
        self._store = {}
        self._dataset_path = dataset.path if dataset else None
        self._dataset_only = dataset_only
        # Since configs could contain sensitive information, to prevent
        # any "facilitated" leakage -- just disable logging of outputs for
        # this runner
        run_kwargs = dict(log_outputs=False)
        if dataset is not None:
            # make sure we run the git config calls in the dataset
            # to pick up the right config files
            run_kwargs['cwd'] = dataset.path
        self._runner = Runner(**run_kwargs)
        self.reload()

    def reload(self):
        """Reload all configuration items from the configured sources"""
        self._store = {}
        # 2-step strategy:
        #   - load datalad dataset config from dataset
        #   - load git config from all sources supported by git
        # in doing so we always stay compatible with where Git gets its
        # config from, but also allow to override persistent information
        # from dataset locally or globally
        if self._dataset_path:
            # now any dataset config
            dscfg_fname = opj(self._dataset_path, '.datalad', 'config')
            if exists(dscfg_fname):
                stdout, stderr = self._run(['-z', '-l', '--file', dscfg_fname],
                                           log_stderr=True)
                # overwrite existing value, do not amend to get multi-line
                # values
                self._store = _parse_gitconfig_dump(
                    stdout, self._store, replace=False)

        if not self._dataset_only:
            stdout, stderr = self._run(['-z', '-l'], log_stderr=True)
            self._store = _parse_gitconfig_dump(
                stdout, self._store, replace=True)

            # override with environment variables, which take precedence
            self._store = _parse_env(self._store)

    @_where_reload
    def obtain(self, var, default=None, dialog_type=None, valtype=None,
               store=False, where=None, reload=True, **kwargs):
        """
        Convenience method to obtain settings interactively, if needed

        A UI will be used to ask for user input in interactive sessions.
        Questions to ask, and additional explanations can be passed directly
        as arguments, or retrieved from a list of pre-configured items.

        Additionally, this method allows for type conversion and storage
        of obtained settings. Both aspects can also be pre-configured.

        Parameters
        ----------
        var : str
          Variable name including any section like `git config` expects them,
          e.g. 'core.editor'
        default : any type
          In interactive sessions and if `store` is True, this default value
          will be presented to the user for confirmation (or modification).
          In all other cases, this value will be silently assigned unless
          there is an existing configuration setting.
        dialog_type : {'question', 'yesno', None}
          Which dialog type to use in interactive sessions. If `None`,
          pre-configured UI options are used.
        store : bool
          Whether to store the obtained value (or default)
        %s
        `**kwargs`
          Additional arguments for the UI function call, such as a question
          `text`.
        """
        # do local import, as this module is imported prominently and the
        # latter could theoretically import all kinds of weird things for
        # type conversion
        from datalad.interface.common_cfg import definitions as cfg_defs
        # fetch what we know about this variable
        cdef = cfg_defs.get(var, {})
        # type conversion setup
        if valtype is None and 'type' in cdef:
            valtype = cdef['type']
        if valtype is None:
            valtype = lambda x: x

        # any default?
        if default is None and 'default' in cdef:
            default = cdef['default']

        _value = None
        if var in self:
            # nothing needs to be obtained, it is all here already
            _value = self[var]
        elif store is False and default is not None:
            # nothing will be stored, and we have a default -> no user confirmation
            # we cannot use logging, because we want to use the config to configure
            # the logging
            #lgr.debug('using default {} for config setting {}'.format(default, var))
            _value = default

        if _value is not None:
            # we got everything we need and can exit early
            try:
                return valtype(_value)
            except Exception as e:
                raise ValueError(
                    "value '{}' of existing configuration for '{}' cannot be "
                    "converted to the desired type '{}' ({})".format(
                        _value, var, valtype, exc_str(e)))

        # now we need to try to obtain something from the user
        from datalad.ui import ui

        # configure UI
        dialog_opts = kwargs
        if dialog_type is None:  # no override
            # check for common knowledge on how to obtain a value
            if 'ui' in cdef:
                dialog_type = cdef['ui'][0]
                # pull standard dialog settings
                dialog_opts = cdef['ui'][1]
                # update with input
                dialog_opts.update(kwargs)

        if (not ui.is_interactive or dialog_type is None) and default is None:
            raise RuntimeError(
                "cannot obtain value for configuration item '{}', "
                "not preconfigured, no default, no UI available".format(var))

        # NOTE(review): if `default` is given but `dialog_type` is still None,
        # the hasattr() below would receive None -- looks unreachable when UI
        # options are preconfigured, but worth confirming
        if not hasattr(ui, dialog_type):
            raise ValueError("UI '{}' does not support dialog type '{}'".format(
                ui, dialog_type))

        # configure storage destination, if needed
        if store:
            if where is None and 'destination' in cdef:
                where = cdef['destination']
            if where is None:
                raise ValueError(
                    "request to store configuration item '{}', but no "
                    "storage destination specified".format(var))

        # obtain via UI
        dialog = getattr(ui, dialog_type)
        _value = dialog(default=default, **dialog_opts)

        if _value is None:
            # we got nothing
            if default is None:
                raise RuntimeError(
                    "could not obtain value for configuration item '{}', "
                    "not preconfigured, no default".format(var))
            # XXX maybe we should return default here, even it was returned
            # from the UI -- if that is even possible

        # execute type conversion before storing to check that we got
        # something that looks like what we want
        try:
            value = valtype(_value)
        except Exception as e:
            raise ValueError(
                "cannot convert user input `{}` to desired type ({})".format(
                    _value, exc_str(e)))
            # XXX we could consider "looping" until we have a value of proper
            # type in case of a user typo...

        if store:
            # store value as it was before any conversion, needs to be str
            # anyway
            # needs string conversion nevertheless, because default could come
            # in as something else
            self.add(var, '{}'.format(_value), where=where, reload=reload)
        return value

    #
    # Compatibility with dict API
    #
    def __len__(self):
        return len(self._store)

    def __getitem__(self, key):
        return self._store.__getitem__(key)

    def __contains__(self, key):
        return self._store.__contains__(key)

    def keys(self):
        """Returns list of configuration item names"""
        return self._store.keys()

    def get(self, key, default=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        return self._store.get(key, default)

    #
    # Compatibility with ConfigParser API
    #
    def sections(self):
        """Returns a list of the sections available"""
        return list(set([cfg_section_regex.match(k).group(1)
                         for k in self._store]))

    def options(self, section):
        """Returns a list of options available in the specified section."""
        opts = []
        for k in self._store:
            sec, opt = cfg_sectionoption_regex.match(k).groups()
            if sec == section:
                opts.append(opt)
        return opts

    def has_section(self, section):
        """Indicates whether a section is present in the configuration"""
        # match on "section." to avoid false positives where one section name
        # is a prefix of another (e.g. has_section('core') previously matched
        # a 'coreutils.x' key via a bare startswith check)
        section_prefix = section + '.'
        for k in self._store:
            if k.startswith(section_prefix):
                return True
        return False

    def has_option(self, section, option):
        """If the given section exists, and contains the given option"""
        for k in self._store:
            sec, opt = cfg_sectionoption_regex.match(k).groups()
            if sec == section and opt == option:
                return True
        return False

    def getint(self, section, option):
        """A convenience method which coerces the option value to an integer"""
        return int(self.get_value(section, option))

    def getbool(self, section, option, default=None):
        """A convenience method which coerces the option value to a bool

        Values "on", "yes", "true" and any int!=0 are considered True
        Values which evaluate to bool False, "off", "no", "false" are
        considered False
        TypeError is raised for other values.
        """
        val = self.get_value(section, option, default=default)
        return anything2bool(val)

    def getfloat(self, section, option):
        """A convenience method which coerces the option value to a float"""
        return float(self.get_value(section, option))

    # this is a hybrid of ConfigParser and dict API
    def items(self, section=None):
        """Return a list of (name, value) pairs for each option

        Optionally limited to a given section.
        """
        if section is None:
            return self._store.items()
        return [(k, v) for k, v in self._store.items()
                if cfg_section_regex.match(k).group(1) == section]

    #
    # Compatibility with GitPython's ConfigParser
    #
    def get_value(self, section, option, default=None):
        """Like `get()`, but with an optional default value

        If the default is not None, the given default value will be returned in
        case the option did not exist. This behavior imitates GitPython's
        config parser.
        """
        try:
            return self['.'.join((section, option))]
        except KeyError as e:
            # this strange dance is needed because gitpython does it this way
            if default is not None:
                return default
            else:
                raise e

    #
    # Modify configuration (proxy respective git-config call)
    #
    @_where_reload
    def _run(self, args, where=None, reload=False, **kwargs):
        """Centralized helper to run "git config" calls

        Parameters
        ----------
        args : list
          Arguments to pass for git config
        %s
        **kwargs
          Keywords arguments for Runner's call
        """
        if where:
            args = self._get_location_args(where) + args
        out = self._runner.run(['git', 'config'] + args, **kwargs)
        if reload:
            self.reload()
        return out

    def _get_location_args(self, where, args=None):
        if args is None:
            args = []
        cfg_labels = ('dataset', 'local', 'global')
        if where not in cfg_labels:
            raise ValueError(
                "unknown configuration label '{}' (not in {})".format(
                    where, cfg_labels))
        if where == 'dataset':
            if not self._dataset_path:
                raise ValueError(
                    'ConfigManager cannot store configuration to dataset, '
                    'none specified')
            # create an empty config file if none exists, `git config` will
            # fail otherwise
            dscfg_dirname = opj(self._dataset_path, '.datalad')
            dscfg_fname = opj(dscfg_dirname, 'config')
            if not exists(dscfg_dirname):
                os.makedirs(dscfg_dirname)
            if not exists(dscfg_fname):
                open(dscfg_fname, 'w').close()
            args.extend(['--file',
                         opj(self._dataset_path, '.datalad', 'config')])
        elif where == 'global':
            args.append('--global')
        elif where == 'local':
            args.append('--local')
        return args

    @_where_reload
    def add(self, var, value, where='dataset', reload=True):
        """Add a configuration variable and value

        Parameters
        ----------
        var : str
          Variable name including any section like `git config` expects them,
          e.g. 'core.editor'
        value : str
          Variable value
        %s"""
        self._run(['--add', var, value], where=where, reload=reload,
                  log_stderr=True)

    @_where_reload
    def set(self, var, value, where='dataset', reload=True, force=False):
        """Set a variable to a value.

        In opposition to `add`, this replaces the value of `var` if there is
        one already.

        Parameters
        ----------
        var : str
          Variable name including any section like `git config` expects them,
          e.g. 'core.editor'
        value : str
          Variable value
        force: bool
          if set, replaces all occurrences of `var` by a single one with the
          given `value`. Otherwise raise if multiple entries for `var` exist
          already
        %s"""
        from datalad.support.gitrepo import to_options

        self._run(to_options(replace_all=force) + [var, value],
                  where=where, reload=reload, log_stderr=True)

    @_where_reload
    def rename_section(self, old, new, where='dataset', reload=True):
        """Rename a configuration section

        Parameters
        ----------
        old : str
          Name of the section to rename.
        new : str
          Name of the section to rename to.
        %s"""
        self._run(['--rename-section', old, new], where=where, reload=reload)

    @_where_reload
    def remove_section(self, sec, where='dataset', reload=True):
        """Remove a configuration section

        Parameters
        ----------
        sec : str
          Name of the section to remove.
        %s"""
        self._run(['--remove-section', sec], where=where, reload=reload)

    @_where_reload
    def unset(self, var, where='dataset', reload=True):
        """Remove all occurrences of a variable

        Parameters
        ----------
        var : str
          Name of the variable to remove
        %s"""
        # use unset all as it is simpler for now
        self._run(['--unset-all', var], where=where, reload=reload)
def __call__(dataset=None, name=None, url=None,
             pushurl=None, recursive=False, force=False):
    """Register a sibling (git remote) `name` with `url` on a dataset.

    Optionally recurses into subdatasets, replicating the sibling there with
    a derived URL. Returns the list of repos for which the remote was
    (re)configured.

    NOTE(review): takes no `self`/`cls` -- presumably the static `__call__`
    of an Interface subclass defined outside this view; confirm against the
    enclosing class.
    """
    # TODO: Detect malformed URL and fail?

    # at minimum a sibling name and one URL are required
    if name is None or (url is None and pushurl is None):
        raise ValueError("""insufficient information to add a sibling (needs at least a dataset, a name and an URL).""")
    if url is None:
        # fall back to the push URL for fetching as well
        url = pushurl  # shortcut

    ds = dataset
    if ds is not None and not isinstance(ds, Dataset):
        # coerce a path-like into a Dataset instance
        ds = Dataset(ds)
    if ds is None:
        # try to find a dataset at or above CWD
        dspath = GitRepo.get_toppath(abspath(getpwd()))
        if dspath is None:
            raise ValueError(
                "No dataset found at or above {0}.".format(getpwd()))
        ds = Dataset(dspath)
        lgr.debug("Resolved dataset for target creation: {0}".format(ds))
    assert(ds is not None and name is not None and url is not None)

    if not ds.is_installed():
        raise ValueError("Dataset {0} is not installed yet.".format(ds))
    assert(ds.repo is not None)

    # map of "<basename>[/<subds path>]" -> per-repo config being built up
    ds_basename = basename(ds.path)
    repos = {
        ds_basename: {'repo': ds.repo}
    }
    if recursive:
        for subds in ds.get_dataset_handles(recursive=True):
            sub_path = opj(ds.path, subds)
            repos[ds_basename + '/' + subds] = {
                #repos[subds] = {
                'repo': GitRepo(sub_path, create=False)
            }

    # Note: This is copied from create_publication_target_sshwebserver
    # as it is the same logic as for its target_dir.
    # TODO: centralize and generalize template symbol handling
    # TODO: Check pushurl for template symbols too. Probably raise if only
    # one of them uses such symbols

    # without a %NAME template in the URL, sub-repo URLs are derived by
    # appending the relative subdataset path to the base URL
    replicate_local_structure = False
    if "%NAME" not in url:
        replicate_local_structure = True

    for repo in repos:
        if not replicate_local_structure:
            # substitute the template symbol; '/' is flattened to '-' so the
            # result stays a single path component
            repos[repo]['url'] = url.replace("%NAME",
                                             repo.replace("/", "-"))
            if pushurl:
                repos[repo]['pushurl'] = pushurl.replace("%NAME",
                                                         repo.replace("/", "-"))
        else:
            repos[repo]['url'] = url
            if pushurl:
                repos[repo]['pushurl'] = pushurl

            if repo != ds_basename:
                # subdataset: append its path relative to the top dataset
                repos[repo]['url'] = _urljoin(repos[repo]['url'],
                                              repo[len(ds_basename)+1:])
                if pushurl:
                    repos[repo]['pushurl'] = _urljoin(repos[repo]['pushurl'],
                                                      repo[len(ds_basename)+1:])

    # collect existing remotes:
    already_existing = list()
    conflicting = list()
    for repo in repos:
        if name in repos[repo]['repo'].git_get_remotes():
            already_existing.append(repo)
            lgr.debug("""Remote '{0}' already exists in '{1}'.""".format(name, repo))

            existing_url = repos[repo]['repo'].git_get_remote_url(name)
            existing_pushurl = \
                repos[repo]['repo'].git_get_remote_url(name, push=True)

            # a remote counts as conflicting when its fetch URL differs, its
            # push URL differs, or a push URL is requested but none exists yet
            # (trailing '/' is ignored in the comparison)
            if repos[repo]['url'].rstrip('/') != existing_url.rstrip('/') \
                    or (pushurl and existing_pushurl and
                        repos[repo]['pushurl'].rstrip('/') !=
                        existing_pushurl.rstrip('/')) \
                    or (pushurl and not existing_pushurl):
                conflicting.append(repo)

    if not force and conflicting:
        raise RuntimeError("Sibling '{0}' already exists with conflicting"
                           " URL for {1} dataset(s). {2}".format(
                               name, len(conflicting), conflicting))

    runner = Runner()
    successfully_added = list()
    for repo in repos:
        if repo in already_existing:
            if repo not in conflicting:
                # identical remote already present -- nothing to change
                lgr.debug("Skipping {0}. Nothing to do.".format(repo))
                continue
            # rewrite url
            cmd = ["git", "remote", "set-url", name, repos[repo]['url']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        else:
            # add the remote
            cmd = ["git", "remote", "add", name, repos[repo]['url']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        if pushurl:
            # push URL is configured separately via --push
            cmd = ["git", "remote", "set-url", "--push", name,
                   repos[repo]['pushurl']]
            runner.run(cmd, cwd=repos[repo]['repo'].path)
        successfully_added.append(repo)

    return successfully_added
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                message=None, rerun_info=None, rerun_outputs=None,
                sidecar=None):
    """Run `cmd` in (or for) `dataset` and record the outcome in git history.

    Generator yielding datalad result records: results from fetching
    `inputs`, unlocking `outputs`, and finally from saving all modifications
    via ``ds.add``. On command failure (outside a rerun) the commit message
    is stashed in COMMIT_EDITMSG and the CommandError is re-raised.
    """
    # `pwd` is an absolute working directory for the command; `rel_pwd` its
    # dataset-relative form (recorded in the run record when inside the ds)
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner

    lgr.debug('tracking command output underneath %s', ds)

    if not rerun_info and ds.repo.dirty:  # Rerun already takes care of this.
        # refuse to run on a dirty tree -- post-run modification detection
        # relies on a clean starting state
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('unsaved modifications present, '
                     'cannot detect changes by command'))
        return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    if inputs:
        # make sure declared inputs are locally available before running
        for res in ds.get(inputs.expand(full=True), on_failure="ignore"):
            yield res

    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"],
                           warn=not rerun_info)
    if outputs:
        # declared outputs must be writable (unlocked) or absent
        for res in _unlock_or_remove(ds, outputs.expand(full=True)):
            yield res

    if rerun_outputs is not None:
        # These are files we need to unlock/remove for a rerun that aren't
        # included in the explicit outputs. Unlike inputs/outputs, these are
        # full paths, so we can pass them directly to unlock.
        for res in _unlock_or_remove(ds, rerun_outputs):
            yield res

    # substitute {pwd}/{dspath}/{inputs}/{outputs} placeholders in the command
    sfmt = SequenceFormatter()
    cmd_expanded = sfmt.format(cmd, pwd=pwd, dspath=ds.path,
                               inputs=inputs.expand(dot=False),
                               outputs=outputs.expand(dot=False))

    # we have a clean dataset, let's run things
    exc = None
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd_expanded,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just have to exitcode in the same way
        exc = e
        cmd_exitcode = e.code

        if rerun_info and rerun_info.get("exit", 0) != cmd_exitcode:
            # we failed in a different way during a rerun. This can easily
            # happen if we try to alter a locked file
            #
            # TODO add the ability to `git reset --hard` the dataset tree on failure
            # we know that we started clean, so we could easily go back, needs gh-1424
            # to be able to do it recursively
            raise exc

    lgr.info("== Command exit (modification check follows) =====")

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    # sidecar=None means: defer to the dataset's configuration
    use_sidecar = sidecar or (
        sidecar is None and
        ds.config.get('datalad.run.record-sidecar', default=False))

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory',
                                   default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference,
            # despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd),
        '"{}"'.format(record_id) if use_sidecar else record)
    msg = assure_bytes(msg)

    if not rerun_info and cmd_exitcode:
        # the command failed on a fresh run: keep the prepared message around
        # so the user can still save manually, then propagate the failure
        msg_path = opj(relpath(ds.repo.repo.git_dir), "COMMIT_EDITMSG")
        with open(msg_path, "wb") as ofh:
            ofh.write(msg)
        lgr.info("The command had a non-zero exit code. "
                 "If this is expected, you can save the changes with "
                 "'datalad save -r -F%s .'",
                 msg_path)
        raise exc
    else:
        for r in ds.add('.', recursive=True, message=msg):
            yield r
def run_command(cmd, dataset=None, message=None, rerun_info=None):
    """Run `cmd` in (or for) `dataset` and record the outcome in git history.

    Generator yielding datalad result records. Refuses to run on a dirty
    dataset (unless part of a rerun), executes the command with live output,
    and saves all resulting modifications with a structured commit message
    that embeds the machine-readable run record.

    Parameters
    ----------
    cmd : str or list
      Command to execute; a single-element list is unwrapped to support
      quoted compound shell commands.
    dataset : Dataset or path, optional
      Dataset to operate on; if None, it is discovered from CWD.
    message : str, optional
      Custom commit message; defaults to a shortened form of `cmd`.
    rerun_info : dict, optional
      Run record of a previous execution being re-run; alters dirty-tree
      checking, pwd resolution, and failure handling.

    Raises
    ------
    CommandError
      If the command fails (on a fresh run), or fails differently than the
      recorded run during a rerun.
    """
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    elif dataset:
        pwd = dataset.path
        rel_pwd = curdir
    else:
        # act on the whole dataset if nothing else was specified
        dataset = get_dataset_root(curdir)
        # Follow our generic semantic that if dataset is specified,
        # paths are relative to it, if not -- relative to pwd
        pwd = getpwd()
        if dataset:
            rel_pwd = relpath(pwd, dataset)
        else:
            rel_pwd = pwd  # and leave handling on deciding either we
                           # deal with it or crash to checks below

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner
    from datalad.tests.utils import ok_clean_git

    lgr.debug('tracking command output underneath %s', ds)
    if not rerun_info:  # Rerun already takes care of this.
        try:
            # base assumption is that the animal smells superb
            ok_clean_git(ds.path)
        except AssertionError:
            # refuse to run on a dirty tree -- post-run modification
            # detection relies on a clean starting state
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    # anticipate quoted compound shell commands
    cmd = cmd[0] if isinstance(cmd, list) and len(cmd) == 1 else cmd

    # TODO do our best to guess which files to unlock based on the command string
    #      in many cases this will be impossible (but see rerun). however,
    #      generating new data (common case) will be just fine already

    # we have a clean dataset, let's run things
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just have to exitcode in the same way
        cmd_exitcode = e.code

        if not rerun_info or rerun_info.get("exit", 0) != cmd_exitcode:
            # we failed during a fresh run, or in a different way during a rerun
            # the latter can easily happen if we try to alter a locked file
            #
            # let's fail here, the command could have had a typo or some
            # other undesirable condition. If we would `add` nevertheless,
            # we would need to rerun and aggregate annex content that we
            # likely don't want
            # TODO add switch to ignore failure (some commands are stupid)
            # TODO add the ability to `git reset --hard` the dataset tree on failure
            # we know that we started clean, so we could easily go back, needs gh-1424
            # to be able to do it recursively
            raise CommandError(code=cmd_exitcode)

    lgr.info("== Command exit (modification check follows) =====")

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
        'chain': rerun_info["chain"] if rerun_info else [],
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd

    # compose commit message
    cmd_shorty = (' '.join(cmd) if isinstance(cmd, list) else cmd)
    cmd_shorty = '{}{}'.format(
        cmd_shorty[:40],
        '...' if len(cmd_shorty) > 40 else '')
    # FIX: `sort_keys`/`ensure_ascii` were previously passed (along with an
    # invalid `encoding` kwarg) to str.format(), which silently ignored them;
    # they clearly belong to json.dumps so the embedded record is stable and
    # keeps non-ASCII characters readable
    msg = '[DATALAD RUNCMD] {}\n\n=== Do not change lines below ===\n{}\n^^^ Do not change lines above ^^^'.format(
        message if message is not None else cmd_shorty,
        json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False))

    for r in ds.add('.', recursive=True, message=msg):
        yield r