def __call__(types, files=None, dataset=None):
    dataset = require_dataset(dataset or curdir,
                              purpose="extract metadata",
                              check_installed=not files)
    if not files:
        ds = require_dataset(dataset, check_installed=True)
        subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
        files = list(_get_metadatarelevant_paths(ds, subds))
    dsmeta, contentmeta, error = _get_metadata(
        dataset,
        types,
        global_meta=True,
        content_meta=bool(files),
        paths=files)
    if dataset is not None and dataset.is_installed():
        res = get_status_dict(
            action='metadata',
            ds=dataset,
            refds=dataset.path,
            metadata=dsmeta,
            status='error' if error else 'ok')
        yield res
    for p in contentmeta:
        res = get_status_dict(
            action='metadata',
            path=opj(dataset.path, p) if dataset else p,
            refds=dataset.path,
            metadata=contentmeta[p],
            type='file',
            status='error' if error else 'ok')
        if dataset:
            res['parentds'] = dataset.path
        yield res
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    dataset = require_dataset(dataset, purpose='clean-up')
    for dirpath, flag, msg, sing_pl in [
            (ARCHIVES_TEMP_DIR, "cached-archives",
             "temporary archive", ("directory", "directories")),
            (ANNEX_TEMP_DIR, "annex-tmp",
             "temporary annex", ("file", "files")),
    ]:
        lgr.info("Considering to clean %s:%s", dataset, dirpath)
        if not ((what is None) or (flag in what)):
            continue
        topdir = opj(dataset.path, dirpath)
        paths = glob(opj(topdir, '*'))
        if paths:
            pl = len(paths) > 1
            pwd = getpwd()
            # relative version if possible
            rtopdir = topdir[len(pwd) + 1:] \
                if topdir.startswith(pwd) else topdir
            ui.message("Removing %d %s %s under %s: %s"
                       % (len(paths), msg, sing_pl[int(pl)], rtopdir,
                          ", ".join(sorted([x[len(topdir) + 1:] for x in paths]))))
            rmtree(topdir)
    if recursive:
        for sub in dataset.get_subdatasets(
                recursive=True,
                recursion_limit=recursion_limit,
                absolute=False):
            Dataset(sub).clean(what=what)
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    dataset = require_dataset(dataset, purpose='clean-up')
    for dirpath, flag, msg, sing_pl in [
            (ARCHIVES_TEMP_DIR, "cached-archives",
             "temporary archive", ("directory", "directories")),
            (ANNEX_TEMP_DIR, "annex-tmp",
             "temporary annex", ("file", "files")),
    ]:
        lgr.info("Considering to clean %s:%s", dataset, dirpath)
        if not ((what is None) or (flag in what)):
            continue
        topdir = opj(dataset.path, dirpath)
        paths = glob(opj(topdir, '*'))
        if paths:
            pl = len(paths) > 1
            pwd = getpwd()
            # relative version if possible
            rtopdir = topdir[len(pwd) + 1:] \
                if topdir.startswith(pwd) else topdir
            ui.message(
                "Removing %d %s %s under %s: %s"
                % (len(paths), msg, sing_pl[int(pl)], rtopdir,
                   ", ".join(
                       sorted([x[len(topdir) + 1:] for x in paths]))))
            rmtree(topdir)
    if recursive:
        for sub in dataset.get_subdatasets(recursive=True,
                                           recursion_limit=recursion_limit,
                                           absolute=False):
            Dataset(sub).clean(what=what)
def __call__(astype, dataset, getcmdhelp=False, output=None, **kwargs):
    # get a handle on the relevant plugin module
    import datalad.export as export_mod
    try:
        exmod = import_module('.%s' % (astype, ), package=export_mod.__package__)
    except ImportError as e:
        raise ValueError("cannot load exporter '{}': {}".format(
            astype, exc_str(e)))
    if getcmdhelp:
        # no result, but return the module to make the renderer do the rest
        return (exmod, None)
    ds = require_dataset(dataset, check_installed=True, purpose='exporting')
    # call the plugin, either with the argv array from the cmdline call
    # or directly with the kwargs
    if 'datalad_unparsed_args' in kwargs:
        result = exmod._datalad_export_plugin_call(
            ds, argv=kwargs['datalad_unparsed_args'], output=output)
    else:
        result = exmod._datalad_export_plugin_call(
            ds, output=output, **kwargs)
    return (exmod, result)
def __call__(spec, dataset=None):
    if not isinstance(spec, (tuple, list)):
        # maybe coming from config
        import shlex
        spec = shlex.split(spec)
    name = spec[0]
    args = spec[1:]
    procedure_file = _get_procedure_implementation(name, ds=dataset)
    if not procedure_file:
        # TODO error result
        raise ValueError("Cannot find procedure with name '%s'" % name)
    ds = require_dataset(
        dataset, check_installed=False,
        purpose='run a procedure') if dataset else None
    cmd_tmpl = _guess_exec(procedure_file)
    cmd = cmd_tmpl.format(
        script=procedure_file,
        ds=ds.path if ds else '',
        args=u' '.join(u'"{}"'.format(a) for a in args) if args else '')
    lgr.debug('Attempt to run procedure {} as: {}'.format(name, cmd))
    for r in Run.__call__(
            cmd=cmd,
            dataset=ds,
            # See gh-2593 for discussion on run feature extension
            #explicit=True,
            #inputs=None,
            #outputs=None,
            # pass through here
            on_failure='ignore',
    ):
        yield r
def no_annex(ds):
    ds = require_dataset(
        ds,
        check_installed=True,
        purpose='configuration')
    if isinstance(ds.repo, AnnexRepo):
        repo = ds.repo
        # TODO: if procedures can have options -- add --force handling/passing
        #
        # annex uninit unlocks files for which there is content (nice) but just
        # proceeds and leaves broken symlinks for files without content. For the
        # current purpose of this procedure we just prevent "uninit" of any annex
        # with some files already annexed.
        if any(repo.call_annex_items_(['whereis', '--all'])):
            raise RuntimeError("Annex has some annexed files, unsafe")
        # remove annex
        repo.call_annex(['uninit'])
    noannex_file = ds.pathobj / ".noannex"
    if not noannex_file.exists():
        lgr.info("Creating and committing a .noannex file")
        noannex_file.touch()
        ds.save(noannex_file,
                message="Added .noannex to prevent accidental initialization "
                        "of git-annex")
def __call__(title, name="osf", dataset=None, mode="annex"):
    ds = require_dataset(dataset,
                         purpose="create OSF remote",
                         check_installed=True)
    # we need an annex
    if not isinstance(ds.repo, AnnexRepo):
        yield get_status_dict(action="create-sibling-osf",
                              type="dataset",
                              status="impossible",
                              message="dataset has no annex")
        return
    # NOTES:
    # - we prob. should check osf-special-remote availability upfront to
    #   fail early
    # - publish-depends option?
    # - (try to) detect github/gitlab/bitbucket to suggest linking it on
    #   OSF and configure publish dependency
    #   -> prob. overkill; just make it clear in the doc
    # - add --recursive option
    #   - recursive won't work easily. Need to think that through.
    #   - would need a naming scheme for subdatasets
    #   - flat on OSF or a tree?
    #   - how do we detect something is there already, so we can skip
    #     rather than duplicate (with a new name)?
    #     osf-type-special-remote sufficient to decide it's not needed?
    # - adapt to conclusions in issue #30
    #   -> create those subcomponents
    # - results need to report URL for created projects suitable for datalad
    #   output formatting!
    #   -> result_renderer
    #   -> needs to be returned by create_project
    # - option: Make public!
    cred = get_credentials(allow_interactive=True)
    osf = OSF(**cred)
    proj_id, proj_url = create_project(osf_session=osf.session, title=title)
    yield get_status_dict(action="create-project-osf",
                          type="dataset",
                          url=proj_url,
                          id=proj_id,
                          status="ok")
    init_opts = ["encryption=none",
                 "type=external",
                 "externaltype=osf",
                 "autoenable=true",
                 "project={}".format(proj_id)]
    if mode == "export":
        init_opts += ["exporttree=yes"]
    ds.repo.init_remote(name, options=init_opts)
    # TODO: add special remote name to result?
    #       need to check w/ datalad-siblings conventions
    yield get_status_dict(action="add-sibling-osf",
                          type="dataset",
                          status="ok")
def __call__(query=None,
             dataset=None,
             force_reindex=False,
             max_nresults=None,
             mode=None,
             full_record=False,
             show_keys=None,
             show_query=False):
    try:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='dataset search')
        if ds.id is None:
            raise NoDatasetArgumentFound(
                "This does not seem to be a dataset (no DataLad dataset ID "
                "found). 'datalad create --force %s' can initialize "
                "this repository as a DataLad dataset" % ds.path)
    except NoDatasetArgumentFound:
        for r in _search_from_virgin_install(dataset, query):
            yield r
        return
    if mode is None:
        # let's get inspired by what the dataset/user think is
        # default
        mode = ds.config.obtain('datalad.search.default-mode')
    if mode == 'egrep':
        searcher = _EGrepSearch
    elif mode == 'egrepcs':
        searcher = _EGrepCSSearch
    elif mode == 'textblob':
        searcher = _BlobSearch
    elif mode == 'autofield':
        searcher = _AutofieldSearch
    else:
        raise ValueError(
            'unknown search mode "{}"'.format(mode))
    searcher = searcher(ds, force_reindex=force_reindex)
    if show_keys:
        searcher.show_keys(show_keys)
        return
    if not query:
        return
    if show_query:
        print(repr(searcher.get_query(query)))
        return
    nhits = 0
    for r in searcher(
            query,
            max_nresults=max_nresults,
            full_record=full_record):
        nhits += 1
        yield r
    if not nhits:
        lgr.info(searcher.get_nohits_msg() or 'no hits')
def __call__(name, url=None, dataset=None, execute=None, image=None):
    ds = require_dataset(dataset, check_installed=True,
                         purpose='add container')
    loc_cfg_var = "datalad.containers.location"
    # TODO: We should provide an entry point (or sth similar) for extensions
    # to get config definitions into the ConfigManager. In other words an
    # easy way to extend definitions in datalad's common_cfgs.py.
    container_loc = \
        ds.config.obtain(
            loc_cfg_var,
            where=definitions[loc_cfg_var]['destination'],
            store=True,
            default=definitions[loc_cfg_var]['default'],
            dialog_type=definitions[loc_cfg_var]['ui'][0],
            valtype=definitions[loc_cfg_var]['type'],
            **definitions[loc_cfg_var]['ui'][1]
        )
    result = {"action": "containers_add",
              "path": op.join(ds.path, container_loc, name),
              "type": "file"}
    if not url:
        url = ds.config.get("datalad.containers.{}.url".format(name))
    if not url:
        raise InsufficientArgumentsError(
            "URL is required and can be provided either via parameter "
            "'url' or config key 'datalad.containers.{}.url'"
            "".format(name))
    try:
        ds.repo.add_url_to_file(op.join(container_loc, name), url)
        ds.save(
            op.join(container_loc, name),
            message="[DATALAD] Added container {name}".format(name=name))
        result["status"] = "ok"
    except Exception as e:
        result["status"] = "error"
        result["message"] = str(e)
    yield result
    # store configs
    ds.config.set("datalad.containers.{}.url".format(name), url)
    if execute:
        ds.config.add("datalad.containers.{}.exec".format(name), execute)
    if image:
        ds.config.add("datalad.containers.{}.image".format(name), image)
    # TODO: Just save won't work in direct mode, since save fails to detect
    # changes
    ds.add(op.join(".datalad", "config"),
           message="[DATALAD] Store config for container '{name}'"
                   "".format(name=name))
def __call__(dataset=None, sensitive=None, clipboard=None):
    from datalad.distribution.dataset import require_dataset
    from datalad.support.exceptions import NoDatasetArgumentFound
    from datalad.interface.results import get_status_dict
    ds = None
    try:
        ds = require_dataset(dataset, check_installed=False,
                             purpose='reporting')
    except NoDatasetArgumentFound:
        # failure is already logged
        pass
    if ds and not ds.is_installed():
        # we don't deal with absent datasets
        ds = None
    if sensitive:
        if ds is None:
            from datalad import cfg
        else:
            cfg = ds.config
    else:
        cfg = None
    from datalad.ui import ui
    from datalad.support.external_versions import external_versions
    infos = {}
    res = get_status_dict(
        action='wtf',
        path=ds.path if ds else op.abspath(op.curdir),
        type='dataset' if ds else 'directory',
        status='ok',
        logger=lgr,
        infos=infos,
    )
    infos['datalad'] = _describe_datalad()
    infos['git-annex'] = _describe_annex()
    infos['system'] = _describe_system()
    infos['environment'] = _describe_environment()
    infos['configuration'] = _describe_configuration(cfg, sensitive)
    infos['extensions'] = _describe_extensions()
    infos['metadata_extractors'] = _describe_metadata_extractors()
    infos['dependencies'] = _describe_dependencies()
    if ds:
        try:
            infos['dataset'] = _describe_dataset(ds, sensitive)
        except InvalidGitRepositoryError as e:
            infos['dataset'] = {"invalid": exc_str(e)}
    if clipboard:
        external_versions.check(
            'pyperclip', msg="It is needed to be able to use clipboard")
        import pyperclip
        report = _render_report(res)
        pyperclip.copy(report)
        ui.message("WTF information of length %s copied to clipboard"
                   % len(report))
    yield res
    return
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    ds = require_dataset(dataset, purpose='clean-up')
    res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
    for ap in AnnotatePaths.__call__(
            dataset=ds.path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='clean',
            unavailable_path_status='impossible',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            yield ap
            continue
        if ap.get('type', None) != 'dataset':
            ap.update(status='impossible',
                      message='only datasets can be cleaned')
            yield ap
            continue
        d = ap['path']
        gitdir = get_git_dir(d)
        for dirpath, flag, msg, sing_pl in [
                (ARCHIVES_TEMP_DIR, "cached-archives",
                 "temporary archive", ("directory", "directories")),
                (ANNEX_TEMP_DIR, "annex-tmp",
                 "temporary annex", ("file", "files")),
                (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", ("file", "files")),
        ]:
            topdir = opj(d, dirpath)
            lgr.debug("Considering to clean %s:%s", d, dirpath)
            if not ((what is None) or (flag in what)):
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            paths = glob(opj(topdir, '*'))
            if not paths:
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            pl = len(paths) > 1
            message = ("Removed %d %s %s: %s",
                       len(paths), msg, sing_pl[int(pl)],
                       ", ".join(sorted([x[len(topdir) + 1:] for x in paths])))
            rmtree(topdir)
            yield get_status_dict(
                path=topdir, status='ok', type='dir', message=message,
                **res_kwargs)
def __call__(dataset=None, recursive=False, contains=None):
    ds = require_dataset(dataset, check_installed=True,
                         purpose='list containers')
    refds = ds.path
    if recursive:
        for sub in ds.subdatasets(
                contains=contains,
                on_failure='ignore',
                return_type='generator',
                result_renderer='disabled'):
            subds = Dataset(sub['path'])
            if subds.is_installed():
                for c in subds.containers_list(recursive=recursive,
                                               return_type='generator',
                                               on_failure='ignore',
                                               result_filter=None,
                                               result_renderer=None,
                                               result_xfm=None):
                    c['name'] = sub['gitmodule_name'] + '/' + c['name']
                    c['refds'] = refds
                    yield c
    # all info is in the dataset config!
    var_prefix = 'datalad.containers.'
    containers = {}
    for var, value in ds.config.items():
        if not var.startswith(var_prefix):
            # not an interesting variable
            continue
        var_comps = var[len(var_prefix):].split('.')
        cname = var_comps[0]
        ccfgname = '.'.join(var_comps[1:])
        if not ccfgname:
            continue
        cinfo = containers.get(cname, {})
        cinfo[ccfgname] = value
        containers[cname] = cinfo
    for k, v in containers.items():
        if 'image' not in v:
            # there is no container location configured
            continue
        res = get_status_dict(
            status='ok',
            action='containers',
            name=k,
            type='file',
            path=op.join(ds.path, v.pop('image')),
            refds=refds,
            parentds=ds.path,
            # TODO
            #state='absent' if ... else 'present'
            **v)
        yield res
def __call__(path=None,
             *,
             dataset=None,
             annex=None,
             untracked='normal',
             recursive=False,
             recursion_limit=None,
             eval_subdataset_state='full',
             report_filetype=None):
    if report_filetype is not None:
        warnings.warn(
            "status(report_filetype=) no longer supported, and will be "
            "removed in a future release",
            DeprecationWarning)
    # To the next white knight that comes in to re-implement `status` as a
    # special case of `diff`. There is one fundamental difference between
    # the two commands: `status` can always use the worktree as evident on
    # disk as a constraint (e.g. to figure out which subdataset a path is
    # in), while `diff` cannot do that (everything needs to be handled based
    # on a "virtual" representation of a dataset hierarchy).
    # MIH concludes that while `status` can be implemented as a special case
    # of `diff`, doing so would complicate and slow down both `diff` and
    # `status`. So while the apparent almost code-duplication between the
    # two commands feels wrong, the benefit is speed. Any future RF should
    # come with evidence that speed does not suffer, and complexity stays
    # on a manageable level.
    ds = require_dataset(dataset, check_installed=True,
                         purpose='report status')
    ds_path = ds.path
    queried = set()
    content_info_cache = {}
    for res in _yield_paths_by_ds(ds, dataset, ensure_list(path)):
        if 'status' in res:
            # this is an error
            yield res
            continue
        for r in yield_dataset_status(
                res['ds'],
                res['paths'],
                annex,
                untracked,
                recursion_limit
                if recursion_limit is not None
                else -1 if recursive else 0,
                queried,
                eval_subdataset_state,
                None,
                content_info_cache,
                reporting_order='depth-first'):
            if 'status' not in r:
                r['status'] = 'ok'
            yield dict(
                r,
                refds=ds_path,
                action='status',
            )
def __call__(extractor_name: str,
             path: Optional[str] = None,
             dataset: Optional[Union[Dataset, str]] = None,
             into: Optional[Union[Dataset, str]] = None):
    # Get basic arguments
    source_dataset = require_dataset(dataset or curdir,
                                     purpose="extract metadata",
                                     check_installed=path is not None)
    source_primary_data_version = source_dataset.repo.get_hexsha()
    if into:
        into_ds = require_dataset(into,
                                  purpose="extract metadata",
                                  check_installed=True)
        realm = into_ds.repo
        # TODO: check for adjusted/managed branch, use get_corresponding_branch
        root_primary_data_version = into_ds.repo.get_hexsha()
    else:
        realm = source_dataset.repo
        root_primary_data_version = source_primary_data_version
    extractor_class = get_extractor_class(extractor_name)
    dataset_tree_path, file_tree_path = get_path_info(
        source_dataset, path, into)
    extraction_parameters = ExtractionParameter(
        realm,
        source_dataset,
        UUID(source_dataset.id),
        extractor_class,
        extractor_name,
        dataset_tree_path,
        file_tree_path,
        root_primary_data_version,
        source_primary_data_version,
        source_dataset.config.get("user.name"),
        source_dataset.config.get("user.email"))
    # If a path is given, we assume file-level metadata extraction is
    # requested, and the extractor class is a subclass of
    # FileMetadataExtractor. If path is not given, we assume that
    # dataset-level extraction is requested and the extractor
    # class is a subclass of DatasetMetadataExtractor
    if path:
        yield from do_file_extraction(extraction_parameters)
    else:
        yield from do_dataset_extraction(extraction_parameters)
    return
def __call__(dataset=None, sensitive=None, clipboard=None):
    from datalad.distribution.dataset import require_dataset
    from datalad.support.exceptions import NoDatasetArgumentFound
    from datalad.interface.results import get_status_dict
    ds = None
    try:
        ds = require_dataset(dataset, check_installed=False,
                             purpose='reporting')
    except NoDatasetArgumentFound:
        # failure is already logged
        pass
    if ds and not ds.is_installed():
        # we don't deal with absent datasets
        ds = None
    if sensitive:
        if ds is None:
            from datalad import cfg
        else:
            cfg = ds.config
    else:
        cfg = None
    from datalad.ui import ui
    from datalad.support.external_versions import external_versions
    infos = {}
    res = get_status_dict(
        action='wtf',
        path=ds.path if ds else op.abspath(op.curdir),
        type='dataset' if ds else 'directory',
        status='ok',
        logger=lgr,
        infos=infos,
    )
    infos['datalad'] = _describe_datalad()
    infos['git-annex'] = _describe_annex()
    infos['system'] = _describe_system()
    infos['environment'] = _describe_environment()
    infos['configuration'] = _describe_configuration(cfg, sensitive)
    infos['extensions'] = _describe_extensions()
    infos['metadata_extractors'] = _describe_metadata_extractors()
    infos['dependencies'] = _describe_dependencies()
    if ds:
        infos['dataset'] = _describe_dataset(ds, sensitive)
    if clipboard:
        external_versions.check(
            'pyperclip', msg="It is needed to be able to use clipboard")
        import pyperclip
        report = _render_report(res)
        pyperclip.copy(report)
        ui.message("WTF information of length %s copied to clipboard"
                   % len(report))
    yield res
    return
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    ds = require_dataset(dataset, purpose='clean-up')
    res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
    for ap in AnnotatePaths.__call__(
            dataset=ds.path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='clean',
            unavailable_path_status='impossible',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            yield ap
            continue
        if ap.get('type', None) != 'dataset':
            ap.update(status='impossible',
                      message='only datasets can be cleaned')
            yield ap
            continue
        d = ap['path']
        gitdir = GitRepo.get_git_dir(d)
        DIRS_PLURAL = ("directory", "directories")
        FILES_PLURAL = ("file", "files")
        for dirpath, flag, msg, sing_pl in [
                (ARCHIVES_TEMP_DIR, "cached-archives",
                 "temporary archive", DIRS_PLURAL),
                (ANNEX_TEMP_DIR, "annex-tmp",
                 "temporary annex", FILES_PLURAL),
                (ANNEX_TRANSFER_DIR, "annex-transfer",
                 "annex temporary transfer", DIRS_PLURAL),
                (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", FILES_PLURAL),
        ]:
            topdir = opj(d, dirpath)
            lgr.debug("Considering to clean %s:%s", d, dirpath)
            if not ((what is None) or (flag in what)):
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            paths = glob(opj(topdir, '*'))
            if not paths:
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            pl = len(paths) > 1
            message = ("Removed %d %s %s: %s",
                       len(paths), msg, sing_pl[int(pl)],
                       ", ".join(sorted([x[len(topdir) + 1:] for x in paths])))
            rmtree(topdir)
            yield get_status_dict(
                path=topdir, status='ok', type='dir', message=message,
                **res_kwargs)
def __call__(path=None,
             dataset=None,
             fulfilled=None,
             recursive=False,
             recursion_limit=None,
             contains=None,
             bottomup=False,
             set_property=None,
             delete_property=None):
    # no constraints given -> query subdatasets under curdir
    if not path and dataset is None:
        path = os.curdir
    paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
        if path else None
    ds = require_dataset(
        dataset, check_installed=False,
        purpose='subdataset reporting/modification')
    lgr.debug('Query subdatasets of %s', dataset)
    if paths is not None:
        lgr.debug('Query subdatasets underneath paths: %s', paths)
    refds_path = ds.path
    # XXX this seems strange, but is tested to be the case -- I'd rather set
    # `check_installed` to true above and fail
    if not GitRepo.is_valid_repo(refds_path):
        return
    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return
    if set_property:
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, must "
                    "start with a letter)" % k)
    if contains:
        contains = [
            rev_resolve_path(c, dataset) for c in assure_list(contains)
        ]
    for r in _get_submodules(
            ds, paths, fulfilled, recursive, recursion_limit,
            contains, bottomup, set_property, delete_property,
            refds_path):
        # a boat-load of ancient code consumes this and is ignorant of
        # Path objects
        r['path'] = text_type(r['path'])
        # without the refds_path cannot be rendered/converted relative
        # in the eval_results decorator
        r['refds'] = refds_path
        yield r
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    ds = require_dataset(dataset, purpose='clean-up')
    res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
    for wds in itertools.chain(
            [ds],
            ds.subdatasets(
                fulfilled=True,
                recursive=recursive,
                recursion_limit=recursion_limit,
                return_type='generator',
                result_renderer='disabled',
                result_xfm='datasets') if recursive else []):
        d = wds.path
        gitdir = GitRepo.get_git_dir(d)
        DIRS_PLURAL = ("directory", "directories")
        FILES_PLURAL = ("file", "files")
        for dirpath, flag, msg, sing_pl in [
                (ARCHIVES_TEMP_DIR, "cached-archives",
                 "temporary archive", DIRS_PLURAL),
                (ANNEX_TEMP_DIR, "annex-tmp",
                 "temporary annex", FILES_PLURAL),
                (ANNEX_TRANSFER_DIR, "annex-transfer",
                 "annex temporary transfer", DIRS_PLURAL),
                (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", FILES_PLURAL),
        ]:
            topdir = opj(d, dirpath)
            lgr.debug("Considering to clean %s:%s", d, dirpath)
            if not ((what is None) or (flag in what)):
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            paths = glob(opj(topdir, '*'))
            if not paths:
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            pl = len(paths) > 1
            message = ("Removed %d %s %s: %s",
                       len(paths), msg, sing_pl[int(pl)],
                       ", ".join(sorted([x[len(topdir) + 1:] for x in paths])))
            rmtree(topdir)
            yield get_status_dict(
                path=topdir, status='ok', type='dir', message=message,
                **res_kwargs)
def __call__(
        path=None,
        dataset=None,
        fulfilled=None,
        recursive=False,
        recursion_limit=None,
        contains=None,
        bottomup=False,
        set_property=None,
        delete_property=None):
    # no constraints given -> query subdatasets under curdir
    if not path and dataset is None:
        path = os.curdir
    paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
        if path else None
    ds = require_dataset(
        dataset, check_installed=False,
        purpose='subdataset reporting/modification')
    lgr.debug('Query subdatasets of %s', dataset)
    if paths is not None:
        lgr.debug('Query subdatasets underneath paths: %s', paths)
    refds_path = ds.path
    # XXX this seems strange, but is tested to be the case -- I'd rather set
    # `check_installed` to true above and fail
    if not GitRepo.is_valid_repo(refds_path):
        return
    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return
    if set_property:
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, must "
                    "start with a letter)" % k)
    if contains:
        contains = [rev_resolve_path(c, dataset)
                    for c in assure_list(contains)]
    for r in _get_submodules(
            ds, paths, fulfilled, recursive, recursion_limit,
            contains, bottomup, set_property, delete_property,
            refds_path):
        # a boat-load of ancient code consumes this and is ignorant of
        # Path objects
        r['path'] = text_type(r['path'])
        # without the refds_path cannot be rendered/converted relative
        # in the eval_results decorator
        r['refds'] = refds_path
        yield r
def __call__(message=None, files=None, dataset=None,
             all_changes=False, version_tag=None,
             recursive=False, recursion_limit=None, super_datasets=False):
    if dataset:
        dataset = require_dataset(
            dataset, check_installed=True, purpose='saving')
    content_by_ds, unavailable_paths = Interface._prep(
        path=files,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit)
    if unavailable_paths:
        lgr.warning("ignoring non-existent path(s): %s", unavailable_paths)
    # here we know all datasets associated with any inputs
    # so we can expand "all_changes" right here to avoid confusion
    # wrt to "super" and "intermediate" datasets discovered later on
    if all_changes:
        # and we do this by replacing any given paths with the respective
        # datasets' base path
        for ds in content_by_ds:
            content_by_ds[ds] = [ds]
    if super_datasets:
        content_by_ds = amend_pathspec_with_superdatasets(
            content_by_ds,
            # save up to and including the base dataset (if one is given)
            # otherwise up to the very top
            topmost=dataset if dataset else True,
            limit_single=False)
    if dataset:
        # stuff all paths also into the base dataset slot to make sure
        # we get all links between relevant subdatasets
        bp = content_by_ds.get(dataset.path, [])
        for c in content_by_ds:
            bp.extend(content_by_ds[c])
        content_by_ds[dataset.path] = list(set(bp))
    saved_ds = save_dataset_hierarchy(
        content_by_ds,
        base=dataset.path if dataset and dataset.is_installed() else None,
        message=message,
        version_tag=version_tag)
    return saved_ds
def __call__(extractorname: str,
             path: Optional[str] = None,
             dataset: Optional[Union[Dataset, str]] = None,
             extractorargs: Optional[List[str]] = None):
    # Get basic arguments
    extractor_name = extractorname
    extractor_args = extractorargs
    path = None if path == "++" else path
    source_dataset = require_dataset(dataset or curdir,
                                     purpose="extract metadata",
                                     check_installed=path is not None)
    if not source_dataset.repo:
        raise ValueError(f"No dataset found in {dataset or curdir}.")
    source_dataset_version = source_dataset.repo.get_hexsha()
    extractor_class = get_extractor_class(extractor_name)
    dataset_tree_path, file_tree_path = get_path_info(
        source_dataset, Path(path) if path else None, None)
    extraction_parameters = ExtractionParameter(
        source_dataset=source_dataset,
        source_dataset_id=UUID(source_dataset.id),
        source_dataset_version=source_dataset_version,
        extractor_class=extractor_class,
        extractor_name=extractor_name,
        extractor_arguments=args_to_dict(extractor_args),
        file_tree_path=file_tree_path,
        agent_name=source_dataset.config.get("user.name"),
        agent_email=source_dataset.config.get("user.email"))
    # If a path is given, we assume file-level metadata extraction is
    # requested, and the extractor class should be a subclass of
    # FileMetadataExtractor (or a legacy extractor).
    # If path is not given, we assume that a dataset-level extraction is
    # requested and the extractor class is a subclass of
    # DatasetMetadataExtractor (or a legacy extractor class).
    if path:
        # Check whether the path points to a sub_dataset.
        ensure_path_validity(source_dataset, file_tree_path)
        yield from do_file_extraction(extraction_parameters)
    else:
        yield from do_dataset_extraction(extraction_parameters)
    return
def __call__(types, files=None, dataset=None):
    dataset = require_dataset(dataset or curdir,
                              purpose="extract metadata",
                              check_installed=not files)
    if not files:
        ds = require_dataset(dataset, check_installed=True)
        subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
        files = list(_get_metadatarelevant_paths(ds, subds))
    dsmeta, contentmeta, error = _get_metadata(
        dataset,
        types,
        global_meta=True,
        content_meta=bool(files),
        paths=files)
    if dataset is not None and dataset.is_installed():
        res = get_status_dict(
            action='metadata',
            ds=dataset,
            refds=dataset.path,
            metadata=dsmeta,
            status='error' if error else 'ok')
        yield res
    for p in contentmeta:
        res = get_status_dict(
            action='metadata',
            path=opj(dataset.path, p) if dataset else p,
            refds=dataset.path,
            metadata=contentmeta[p],
            type='file',
            status='error' if error else 'ok')
        if dataset:
            res['parentds'] = dataset.path
        yield res
def __call__(message=None, files=None, dataset=None,
             all_changes=False, version_tag=None,
             recursive=False, recursion_limit=None, super_datasets=False):
    if dataset:
        dataset = require_dataset(
            dataset, check_installed=True, purpose='saving')
    content_by_ds, unavailable_paths = Interface._prep(
        path=files,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit)
    if unavailable_paths:
        lgr.warning("ignoring non-existent path(s): %s", unavailable_paths)
    # here we know all datasets associated with any inputs
    # so we can expand "all_changes" right here to avoid confusion
    # wrt to "super" and "intermediate" datasets discovered later on
    if all_changes:
        # and we do this by replacing any given paths with the respective
        # datasets' base path
        for ds in content_by_ds:
            content_by_ds[ds] = [ds]
    if super_datasets:
        content_by_ds = amend_pathspec_with_superdatasets(
            content_by_ds,
            # save up to and including the base dataset (if one is given)
            # otherwise up to the very top
            topmost=dataset if dataset else True,
            limit_single=False)
    if dataset:
        # stuff all paths also into the base dataset slot to make sure
        # we get all links between relevant subdatasets
        bp = content_by_ds.get(dataset.path, [])
        for c in content_by_ds:
            bp.extend(content_by_ds[c])
        content_by_ds[dataset.path] = list(set(bp))
    saved_ds = save_dataset_hierarchy(
        content_by_ds,
        base=dataset.path if dataset and dataset.is_installed() else None,
        message=message,
        version_tag=version_tag)
    return saved_ds
def __call__(name, dataset=None, remove_image=False):
    ds = require_dataset(dataset, check_installed=True,
                         purpose='remove a container')
    res = get_status_dict(
        ds=ds,
        action='containers_remove',
        logger=lgr)
    section = 'datalad.containers.{}'.format(name)
    imagecfg = '{}.image'.format(section)
    to_save = []
    if remove_image and imagecfg in ds.config:
        imagepath = ds.config.get(imagecfg)
        if op.lexists(op.join(ds.path, imagepath)):
            for r in ds.remove(
                    path=imagepath,
                    # XXX shortcoming: this is the only way to say:
                    # don't drop
                    check=False,
                    # config setting might be outdated and image no longer
                    # there -> no reason to fail, just report
                    on_failure='ignore',
                    save=False):
                yield r
            to_save.append(imagepath)
    if section in ds.config.sections():
        ds.config.remove_section(
            section,
            where='dataset',
            reload=True)
        res['status'] = 'ok'
        to_save.append(op.join('.datalad', 'config'))
    else:
        res['status'] = 'notneeded'
    if to_save:
        for r in ds.save(
                path=to_save,
                message='[DATALAD] Remove container {}'.format(name)):
            yield r
    yield res
def __call__(dataset=None):
    ds = require_dataset(dataset, check_installed=True,
                         purpose='list containers')
    loc_cfg_var = "datalad.containers.location"
    # TODO: We should provide an entry point (or sth similar) for extensions
    # to get config definitions into the ConfigManager. In other words an
    # easy way to extend definitions in datalad's common_cfgs.py.
    container_loc = \
        ds.config.obtain(
            loc_cfg_var,
            where=definitions[loc_cfg_var]['destination'],
            store=True,
            default=definitions[loc_cfg_var]['default'],
            dialog_type=definitions[loc_cfg_var]['ui'][0],
            valtype=definitions[loc_cfg_var]['type'],
            **definitions[loc_cfg_var]['ui'][1]
        )
    from six import PY3
    try:
        location_content = listdir(op.join(ds.path, container_loc))
    except FileNotFoundError if PY3 else (OSError, IOError) as e:
        # TODO: Right now, just return nothing, since there is nothing.
        # But may also be an "impossible" result, since the configured
        # common mountpoint isn't existing (needs "e.errno == errno.ENOENT"
        # in addition in PY2)
        return
    for r in [n for n in location_content if not n.startswith(".")]:
        yield {
            'status': 'ok',
            'action': 'containers_list',
            'path': op.join(ds.path, container_loc, r),
            # TODO: Might be an image file or a dataset.
            # Use AnnotatePath with container_loc?
            'type': 'file',
            'name': r,
        }
def __call__(astype, dataset, getcmdhelp=False, output=None, **kwargs):
    # get a handle on the relevant plugin module
    import datalad.export as export_mod
    try:
        exmod = import_module('.%s' % (astype,),
                              package=export_mod.__package__)
    except ImportError as e:
        raise ValueError("cannot load exporter '{}': {}".format(
            astype, exc_str(e)))
    if getcmdhelp:
        # no result, but return the module to make the renderer do the rest
        return (exmod, None)
    ds = require_dataset(dataset, check_installed=True, purpose='exporting')
    # call the plugin, either with the argv array from the cmdline call
    # or directly with the kwargs
    if 'datalad_unparsed_args' in kwargs:
        result = exmod._datalad_export_plugin_call(
            ds, argv=kwargs['datalad_unparsed_args'], output=output)
    else:
        result = exmod._datalad_export_plugin_call(
            ds, output=output, **kwargs)
    return (exmod, result)
def __call__(
        dataset=None,
        fulfilled=None,
        recursive=False,
        recursion_limit=None,
        contains=None,
        bottomup=False,
        set_property=None,
        delete_property=None):
    dataset = require_dataset(
        dataset, check_installed=False,
        purpose='subdataset reporting/modification')
    refds_path = dataset.path
    # XXX this seems strange, but is tested to be the case -- I'd rather set
    # `check_installed` to true above and fail
    if not GitRepo.is_valid_repo(refds_path):
        return
    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return
    if set_property:
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, must "
                    "start with a letter)" % k)
    if contains:
        contains = resolve_path(contains, dataset)
    for r in _get_submodules(
            dataset.path, fulfilled, recursive, recursion_limit,
            contains, bottomup, set_property, delete_property,
            refds_path):
        # without the refds_path cannot be rendered/converted relative
        # in the eval_results decorator
        r['refds'] = refds_path
        yield r
def __call__(url, dataset=None, recursive=False):
    # shortcut
    ds = require_dataset(dataset, check_installed=True,
                         purpose='modifying subdataset URLs')
    assert(ds.repo is not None)
    repos_to_update = [ds.repo]
    if recursive:
        repos_to_update += [GitRepo(opj(ds.path, sub_path))
                            for sub_path in ds.get_subdatasets(recursive=True)]
    for dataset_repo in repos_to_update:
        parser = get_module_parser(dataset_repo)
        for submodule_section in parser.sections():
            submodule_name = submodule_section[11:-1]
            parser.set_value(
                submodule_section,
                "url",
                url.replace("%NAME", submodule_name.replace("/", "-")))
    return  # TODO: return value?
def __call__(dataset=None):
    ds = require_dataset(dataset, check_installed=True,
                         purpose='list containers')
    # all info is in the dataset config!
    var_prefix = 'datalad.containers.'
    containers = {}
    for var, value in ds.config.items():
        if not var.startswith(var_prefix):
            # not an interesting variable
            continue
        var_comps = var[len(var_prefix):].split('.')
        cname = var_comps[0]
        ccfgname = '.'.join(var_comps[1:])
        if not ccfgname:
            continue
        cinfo = containers.get(cname, {})
        cinfo[ccfgname] = value
        containers[cname] = cinfo
    for k, v in containers.items():
        if 'image' not in v:
            # there is no container location configured
            continue
        res = get_status_dict(
            status='ok',
            action='containers',
            name=k,
            type='file',
            path=op.join(ds.path, v.pop('image')),
            # TODO
            #state='absent' if ... else 'present'
            **v)
        yield res
def __call__(url, dataset=None, recursive=False):
    # shortcut
    ds = require_dataset(dataset, check_installed=True,
                         purpose='modifying subdataset URLs')
    assert (ds.repo is not None)
    repos_to_update = [ds.repo]
    if recursive:
        repos_to_update += [
            GitRepo(opj(ds.path, sub_path))
            for sub_path in ds.get_subdatasets(recursive=True)
        ]
    for dataset_repo in repos_to_update:
        parser = get_module_parser(dataset_repo)
        for submodule_section in parser.sections():
            submodule_name = submodule_section[11:-1]
            parser.set_value(
                submodule_section,
                "url",
                url.replace("%NAME", submodule_name.replace("/", "-")))
    return  # TODO: return value?
def __call__(
        action='query',
        dataset=None,
        name=None,
        url=None,
        pushurl=None,
        description=None,
        # TODO consider true, for now like add_sibling
        fetch=False,
        as_common_datasrc=None,
        publish_depends=None,
        publish_by_default=None,
        annex_wanted=None,
        annex_required=None,
        annex_group=None,
        annex_groupwanted=None,
        inherit=False,
        get_annex_info=True,
        recursive=False,
        recursion_limit=None):
    # TODO: Detect malformed URL and fail?
    # XXX possibly fail if fetch is False and as_common_datasrc
    if annex_groupwanted and not annex_group:
        raise InsufficientArgumentsError(
            "To set groupwanted, you need to provide annex_group option")
    # TODO catch invalid action specified
    action_worker_map = {
        'query': _query_remotes,
        'add': _add_remote,
        'configure': _configure_remote,
        'remove': _remove_remote,
        'enable': _enable_remote,
    }
    # all worker strictly operate on a single dataset
    # anything that deals with hierarchies and/or dataset
    # relationships in general should be dealt with in here
    # at the top-level and vice versa
    worker = action_worker_map[action]
    dataset = require_dataset(
        dataset, check_installed=False, purpose='sibling configuration')
    refds_path = dataset.path
    res_kwargs = dict(refds=refds_path, logger=lgr)
    ds_name = basename(dataset.path)
    # do not form single list of datasets (with recursion results) to
    # give fastest possible response, for the precise of a long-all
    # function call
    ds = dataset
    for r in worker(
            # always copy signature to below to avoid bugs!
            ds, name,
            ds.repo.get_remotes(),
            # for top-level dataset there is no layout questions
            _mangle_urls(url, ds_name),
            _mangle_urls(pushurl, ds_name),
            fetch, description,
            as_common_datasrc, publish_depends, publish_by_default,
            annex_wanted, annex_required, annex_group, annex_groupwanted,
            inherit, get_annex_info,
            **res_kwargs):
        yield r
    if not recursive:
        return
    # do we have instructions to register siblings with some alternative
    # layout?
    replicate_local_structure = url and "%NAME" not in url
    for subds in dataset.subdatasets(
            fulfilled=True,
            recursive=recursive,
            recursion_limit=recursion_limit,
            result_xfm='datasets'):
        subds_name = relpath(subds.path, start=dataset.path)
        if replicate_local_structure:
            subds_url = slash_join(url, subds_name)
            subds_pushurl = slash_join(pushurl, subds_name)
        else:
            subds_url = \
                _mangle_urls(url, '/'.join([ds_name, subds_name]))
            subds_pushurl = \
                _mangle_urls(pushurl, '/'.join([ds_name, subds_name]))
        for r in worker(
                # always copy signature from above to avoid bugs
                subds, name,
                subds.repo.get_remotes(),
                subds_url,
                subds_pushurl,
                fetch,
                description,
                as_common_datasrc, publish_depends, publish_by_default,
                annex_wanted, annex_required, annex_group, annex_groupwanted,
                inherit, get_annex_info,
                **res_kwargs):
            yield r
def __call__(
        path=None,
        initopts=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=False,
        fake_dates=False,
        cfg_proc=None):
    refds_path = dataset.path if hasattr(dataset, 'path') else dataset

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")

    if path:
        path = rev_resolve_path(path, dataset)

    path = path if path \
        else getpwd() if dataset is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # prep for yield
    res = dict(action='create', path=text_type(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != path:
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='creating a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    dataset, text_type(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = rev_get_dataset_root(
        op.normpath(op.join(text_type(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = ut.Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = ut.Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit query to target path for a potentially massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if any(
                check_path == p or check_path in p.parents
                for p in pstatus):
            # redo the check in a slower fashion, it is already broken
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    text_type(parentds_path),
                    [text_type(c) for c in conflict])})
            yield res
            return
        # another set of checks to see whether the target path is pointing
        # into a known subdataset that is not around ATM
        subds_status = {
            parentds_path / k.relative_to(prepo.path)
            for k, v in iteritems(pstatus)
            if v.get('type', None) == 'dataset'}
        check_paths = [check_path]
        check_paths.extend(check_path.parents)
        if any(p in subds_status for p in check_paths):
            conflict = [p for p in check_paths if p in subds_status]
            res.update({
                'status': 'error',
                'message': (
                    'collision with %s (dataset) in dataset %s',
                    text_type(conflict[0]),
                    text_type(parentds_path))})
            yield res
            return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if isinstance(dataset, Dataset) and \
        dataset.path == path else Dataset(text_type(path))

    # don't create in non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield res
        return

    # stuff that we create and want to have tracked with git (not annex)
    add_to_git = {}

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # create and configure desired repository
    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        tbrepo = GitRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            git_opts=initopts,
            fake_dates=fake_dates)
        # place a .noannex file to indicate annex to leave this repo alone
        stamp_path = ut.Path(tbrepo.path) / '.noannex'
        stamp_path.touch()
        add_to_git[stamp_path] = {
            'type': 'file',
            'state': 'untracked'}
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            # do not set backend here, to avoid a dedicated commit
            backend=None,
            # None causes version to be taken from config
            version=None,
            description=description,
            git_opts=initopts,
            fake_dates=fake_dates
        )
        # set the annex backend in .gitattributes as a staged change
        tbrepo.set_default_backend(
            cfg.obtain('datalad.repo.backend'),
            persistent=True, commit=False)
        add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'added'}
        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles',
             '({})'.format(cfg.obtain(
                 'datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbds.repo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            if not attrs.get(
                    op.join('.datalad', p), {}).get(k, None) == v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbds.repo.set_gitattributes(
                set_attrs,
                attrfile=op.join('.datalad', '.gitattributes'))

        # prevent git annex from ever annexing .git* stuff (gh-1597)
        attrs = tbds.repo.get_gitattributes('.git')
        if not attrs.get('.git', {}).get(
                'annex.largefiles', None) == 'nothing':
            tbds.repo.set_gitattributes([
                ('**/.git*', {'annex.largefiles': 'nothing'})])
            # must use the repo.pathobj as this will have resolved symlinks
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'untracked'}

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note, that Dataset property `id` will change when we unset the
    # respective config. Therefore store it before:
    tbds_id = tbds.id
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds.config.unset(id_var, where='dataset')

    if _seed is None:
        # just the standard way
        uuid_id = uuid.uuid1().urn.split(':')[-1]
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds.config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        where='dataset',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in iteritems(tbds.config.overrides):
        tbds.config.add(k, v, where='local', reload=False)

    # all config manipulation is done -> full reload
    tbds.config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbds.repo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.repo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(dataset, Dataset) and dataset.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.save(
                path=tbds.path,
        ):
            yield r

    res.update({'status': 'ok'})
    yield res

    for cfg_proc_ in cfg_proc or []:
        for r in tbds.run_procedure('cfg_' + cfg_proc_):
            yield r
def __call__(
        dataset,
        guess_native_type=False,
        recursive=False,
        recursion_limit=None,
        save=True,
        if_dirty='save-before'):
    ds = require_dataset(
        dataset, check_installed=True, purpose='meta data aggregation')
    modified_ds = []
    if ds.id is None:
        lgr.warning('%s has no configured ID, skipping.', dataset)
        return modified_ds
    # make sure we get to an expected state
    handle_dirty_dataset(ds, if_dirty)
    # if you want to modify the behavior of get_subdataset() make sure
    # there is a way to return the subdatasets DEPTH FIRST!
    ds_meta = {}
    for subds_path in ds.get_subdatasets(
            fulfilled=True,
            absolute=False,
            recursive=recursive,
            recursion_limit=recursion_limit):
        subds = Dataset(opj(ds.path, subds_path))
        if subds.id is None:
            # nothing to worry about, any meta data from below this will be
            # injected upstairs
            lgr.debug('skipping non-dataset at %s', subds.path)
            continue
        else:
            lgr.info('aggregating meta data for %s', subds)
        metapath = opj(subds.path, metadata_basepath)
        handle_dirty_dataset(subds, if_dirty)
        #
        # Phase 1: aggregate the within-dataset meta data, and store
        #          within the dataset
        #
        # pull out meta data from subds only (no subdatasets)
        _within_metadata_store(
            subds,
            guess_native_type,
            metapath)
        #
        # Phase 2: store everything that is in the look up and belongs into
        #          this dataset
        #
        _dump_submeta(subds, ds_meta, subds_path, save, modified_ds)
        # save state of modified dataset, all we modified has been staged
        # already
        # we need to save before extracting to full metadata for upstairs
        # consumption to get the versions right
        modified_ds = _save_helper(subds, save, modified_ds)
        #
        # Phase 3: obtain all aggregated meta data from this dataset, and
        #          keep in lookup to escalate it upstairs
        #
        ds_meta[subds_path] = get_metadata(
            subds,
            guess_type=False,
            ignore_subdatasets=False,
            ignore_cache=False)
    lgr.info('aggregating meta data for %s', ds)
    # pull out meta data from parent only (no subdatasets)
    _within_metadata_store(
        ds,
        guess_native_type,
        opj(ds.path, metadata_basepath))
    # and lastly the subdatasets of the parent
    _dump_submeta(ds, ds_meta, '', save, modified_ds)
    # everything should be stored somewhere by now
    assert not len(ds_meta)
    # save the parent
    modified_ds = _save_helper(ds, save, modified_ds)
"""Procedure to apply YODA-compatible default setup to a dataset This procedure assumes a clean dataset that was just created by `datalad create`. """ import sys import os.path as op from datalad.distribution.dataset import require_dataset from datalad.utils import create_tree ds = require_dataset( sys.argv[1], check_installed=True, purpose='YODA dataset setup') README_code = """\ All custom code goes into this directory. All scripts should be written such that they can be executed from the root of the dataset, and are only using relative paths for portability. """ README_top = """\ # Project <insert name> ## Dataset structure - All inputs (i.e. building blocks from other sources) are located in `inputs/`. - All custom code is located in `code/`.
"""Procedure to configure Git annex to add text files directly to Git""" import sys import os.path as op from datalad.distribution.dataset import require_dataset ds = require_dataset( sys.argv[1], check_installed=True, purpose='configuration') annex_largefiles = '(not(mimetype=text/*))' attrs = ds.repo.get_gitattributes('*') if not attrs.get('*', {}).get( 'annex.largefiles', None) == annex_largefiles: ds.repo.set_gitattributes([ ('*', {'annex.largefiles': annex_largefiles})]) git_attributes_file = op.join(ds.path, '.gitattributes') ds.save( git_attributes_file, message="Instruct annex to add text files to Git", )
def __call__(sshurl, target=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None, recursive=False,
             existing='error', shared=False, ui=False,
             as_common_datasrc=None,
             publish_by_default=None,
             publish_depends=None):

    if sshurl is None:
        raise ValueError("""insufficient information for target creation
        (needs at least a dataset and a SSH URL).""")

    if target is None and (target_url is not None or
                           target_pushurl is not None):
        raise ValueError("""insufficient information for adding the target
        as a sibling (needs at least a name)""")

    # shortcut
    ds = require_dataset(dataset, check_installed=True,
                         purpose='creating a sibling')

    assert(ds is not None and sshurl is not None and ds.repo is not None)

    # determine target parameters:
    sshri = RI(sshurl)

    if not isinstance(sshri, SSHRI) \
            and not (isinstance(sshri, URL) and sshri.scheme == 'ssh'):
        raise ValueError(
            "Unsupported SSH URL: '{0}', use ssh://host/path or host:path "
            "syntax".format(sshurl))

    if target_dir is None:
        if sshri.path:
            target_dir = sshri.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = False
    if "%NAME" not in target_dir:
        replicate_local_structure = True

    # collect datasets to use:
    datasets = dict()
    datasets[basename(ds.path)] = ds
    if recursive:
        for subds in ds.get_subdatasets(recursive=True):
            sub_path = opj(ds.path, subds)
            # TODO: when enhancing Dataset/*Repo classes and therefore
            # adapt to moved code, make proper distinction between name and
            # path of a submodule, which are technically different. This
            # probably will become important on windows as well as whenever
            # we want to allow for moved worktrees.
            datasets[basename(ds.path) + '/' + subds] = \
                Dataset(sub_path)

    # request ssh connection:
    not_supported_on_windows("TODO")
    lgr.info("Connecting ...")
    ssh = ssh_manager.get_connection(sshurl)
    ssh.open()

    # flag to check if at dataset_root
    at_root = True

    # loop over all datasets, ordered from top to bottom to make test
    # below valid (existing directories would cause the machinery to halt)
    # But we need to run post-update hook in depth-first fashion, so
    # would only collect first and then run (see gh #790)
    remote_repos_to_run_hook_for = []
    for current_dspath in \
            sorted(datasets.keys(), key=lambda x: x.count('/')):
        current_ds = datasets[current_dspath]
        if not current_ds.is_installed():
            lgr.info("Skipping %s since not installed locally",
                     current_dspath)
            continue

        if not replicate_local_structure:
            path = target_dir.replace("%NAME",
                                      current_dspath.replace("/", "-"))
        else:
            # TODO: opj depends on local platform, not the remote one.
            # check how to deal with it. Does windows ssh server accept
            # posix paths? vice versa? Should planned SSH class provide
            # tools for this issue?
            path = normpath(opj(target_dir,
                                relpath(datasets[current_dspath].path,
                                        start=ds.path)))

        lgr.info("Creating target dataset {0} at {1}".format(
            current_dspath, path))
        # Must be set to True only if exists and existing='reconfigure'
        # otherwise we might skip actions if we say existing='reconfigure'
        # but it did not even exist before
        only_reconfigure = False
        if path != '.':
            # check if target exists
            # TODO: Is this condition valid for != '.' only?
            path_exists = True
            try:
                out, err = ssh(["ls", path])
            except CommandError as e:
                if "No such file or directory" in e.stderr and \
                        path in e.stderr:
                    path_exists = False
                else:
                    raise  # It's an unexpected failure here

            if path_exists:
                if existing == 'error':
                    raise RuntimeError(
                        "Target directory %s already exists." % path)
                elif existing == 'skip':
                    continue
                elif existing == 'replace':
                    # enable write permissions to allow removing dir
                    ssh(["chmod", "+r+w", "-R", path])
                    # remove target at path
                    ssh(["rm", "-rf", path])
                    # if we succeeded in removing it
                    path_exists = False
                elif existing == 'reconfigure':
                    only_reconfigure = True
                else:
                    raise ValueError("Do not know how to handle existing=%s"
                                     % repr(existing))

            if not path_exists:
                try:
                    ssh(["mkdir", "-p", path])
                except CommandError as e:
                    lgr.error("Remotely creating target directory failed at "
                              "%s.\nError: %s" % (path, exc_str(e)))
                    continue

        # don't (re-)initialize dataset if existing == reconfigure
        if not only_reconfigure:
            # init git and possibly annex repo
            if not CreateSibling.init_remote_repo(
                    path, ssh, shared, datasets[current_dspath],
                    description=target_url):
                continue

        # check git version on remote end
        lgr.info("Adjusting remote git configuration")
        remote_git_version = CreateSibling.get_remote_git_version(ssh)
        if remote_git_version and remote_git_version >= "2.4":
            # allow for pushing to checked out branch
            try:
                ssh(["git", "-C", path] +
                    ["config", "receive.denyCurrentBranch", "updateInstead"])
            except CommandError as e:
                lgr.error("git config failed at remote location %s.\n"
                          "You will not be able to push to checked out "
                          "branch. Error: %s", path, exc_str(e))
        else:
            lgr.error("Git version >= 2.4 needed to configure remote."
                      " Version detected on server: %s\nSkipping configuration"
                      " of receive.denyCurrentBranch - you will not be able to"
                      " publish updates to this repository. Upgrade your git"
                      " and run with --existing=reconfigure"
                      % remote_git_version)

        # enable metadata refresh on dataset updates to publication server
        lgr.info("Enabling git post-update hook ...")
        try:
            CreateSibling.create_postupdate_hook(
                path, ssh, datasets[current_dspath])
        except CommandError as e:
            lgr.error("Failed to add json creation command to post update "
                      "hook.\nError: %s" % exc_str(e))

        # publish web-interface to root dataset on publication server
        if at_root and ui:
            lgr.info("Uploading web interface to %s" % path)
            at_root = False
            try:
                CreateSibling.upload_web_interface(path, ssh, shared, ui)
            except CommandError as e:
                lgr.error("Failed to push web interface to the remote "
                          "datalad repository.\nError: %s" % exc_str(e))

        remote_repos_to_run_hook_for.append(path)

    # in reverse order would be depth first
    lgr.debug("Running post-update hooks in all created siblings")
    for path in remote_repos_to_run_hook_for[::-1]:
        # Trigger the hook
        try:
            ssh(
                ["cd '" + _path_(path, ".git") + "' && hooks/post-update"],
                wrap_args=False  # we wrapped here manually
            )
        except CommandError as e:
            lgr.error("Failed to run post-update hook under path %s. "
                      "Error: %s" % (path, exc_str(e)))

    if target:
        # add the sibling(s):
        lgr.debug("Adding the siblings")
        if target_url is None:
            target_url = sshurl
        if target_pushurl is None and sshurl != target_url:
            target_pushurl = sshurl
        AddSibling()(dataset=ds, name=target, url=target_url,
                     pushurl=target_pushurl,
                     recursive=recursive,
                     fetch=True,
                     force=existing in {'replace'},
                     as_common_datasrc=as_common_datasrc,
                     publish_by_default=publish_by_default,
                     publish_depends=publish_depends)
def __call__( path=None, message=None, dataset=None, version_tag=None, recursive=False, recursion_limit=None, updated=False, message_file=None, to_git=None, ): if message and message_file: raise ValueError( "Both a message and message file were specified for save()") path = assure_list(path) if message_file: with open(message_file) as mfh: message = mfh.read() # we want 'normal' to achieve the most compact argument list # for git calls # untracked_mode = 'no' if updated else 'normal' # TODO however, Repo.add() would refuse to add any dotfiles # in a directory that is itself untracked, hence the only # choice is to go with potentially crazy long lists # until https://github.com/datalad/datalad/issues/1454 # has a resolution untracked_mode = 'no' if updated else 'all' # there are three basic scenarios: # 1. save modifications to any already tracked content # 2. save any content (including removal of deleted content) # to bring things to a clean state # 3. like (2), but only operate on a given subset of content # identified by paths # - all three have to work in conjunction with --recursive # - the difference between (1) and (2) should be no more # that a switch from --untracked=no to --untracked=all # in Repo.save() # we do not support # - simultaneous operations on multiple datasets from disjoint # dataset hierarchies, hence a single reference dataset must be # identifiable from the either # - curdir or # - the `dataset` argument. # This avoids complex annotation loops and hierarchy tracking. # - any modification upwards from the root dataset ds = require_dataset(dataset, check_installed=True, purpose='saving') # use status() to do all discovery and annotation of paths paths_by_ds = {} for s in Status()( # ATTN: it is vital to pass the `dataset` argument as it, # and not a dataset instance in order to maintain the path # semantics between here and the status() call dataset=dataset, path=path, untracked=untracked_mode, recursive=recursive, recursion_limit=recursion_limit, result_renderer='disabled'): # fish out status dict for this parent dataset ds_status = paths_by_ds.get(s['parentds'], {}) # reassemble path status info as repo.status() would have made it ds_status[ut.Path(s['path'])] = \ {k: v for k, v in iteritems(s) if k not in ( 'path', 'parentds', 'refds', 'status', 'action', 'logger')} paths_by_ds[s['parentds']] = ds_status lgr.debug('Determined %i datasets for saving from input arguments', len(paths_by_ds)) # figure out what datasets to process, start with the ones containing # the paths that were given as arguments discovered_datasets = list(paths_by_ds.keys()) if dataset: # if a reference dataset was given we want to save all the way up # to it, so let's throw it into the mix discovered_datasets.append(ds.path) # sort the datasets into (potentially) disjoint hierarchies, # or a single one, if a reference dataset was given dataset_hierarchies = get_tree_roots(discovered_datasets) for rootds, children in iteritems(dataset_hierarchies): edges = {} discover_dataset_trace_to_targets(rootds, children, [], edges, includeds=children) for superds, subdss in iteritems(edges): superds_status = paths_by_ds.get(superds, {}) for subds in subdss: # TODO actually start from an entry that may already # exist in the status record superds_status[ut.Path(subds)] = dict( # shot from the hip, some status config # to trigger this specific super/sub # relation to be saved state='untracked', type='dataset') paths_by_ds[superds] = superds_status # TODO parallelize, whenever we have multiple subdataset of a 
single # dataset they can all be processed simultaneously # sort list of dataset to handle, starting with the ones deep down for pdspath in sorted(paths_by_ds, reverse=True): pds = Dataset(pdspath) # pop status for this dataset, we are not coming back to it pds_status = { # for handing over to the low-level code, we recode any # path relative to the real repo location, this avoid # cumbersome symlink handling without context in the # lower levels pds.repo.pathobj / p.relative_to(pdspath): props for p, props in iteritems(paths_by_ds.pop(pdspath)) } start_commit = pds.repo.get_hexsha() if not all(p['state'] == 'clean' for p in pds_status.values()): for res in pds.repo.save_( message=message, # make sure to have the `path` arg be None, as we want # to prevent and bypass any additional repo.status() # calls paths=None, # prevent whining of GitRepo git=True if not hasattr(ds.repo, 'annexstatus') else to_git, # we are supplying the full status already, do not # detect anything else untracked='no', _status=pds_status): # TODO remove stringification when datalad-core can handle # path objects, or when PY3.6 is the lowest supported # version for k in ('path', 'refds'): if k in res: res[k] = str( # recode path back to dataset path anchor pds.pathobj / res[k].relative_to(pds.repo.pathobj)) yield res # report on the dataset itself dsres = dict( action='save', type='dataset', path=pds.path, refds=ds.path, status='ok' if start_commit != pds.repo.get_hexsha() else 'notneeded', logger=lgr, ) if not version_tag: yield dsres continue try: pds.repo.tag(version_tag) dsres.update(status='ok', version_tag=version_tag) yield dsres except CommandError as e: if dsres['status'] == 'ok': # first we yield the result for the actual save yield dsres.copy() # and now complain that tagging didn't work dsres.update(status='error', message=('cannot tag this version: %s', e.stderr.strip())) yield dsres
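# A sketch of the bottom-up ordering used above: saving children before their parents
# is achieved simply by reverse-sorting the dataset paths, so deeper paths within a
# hierarchy come first and the superdataset is handled last. Paths are illustrative.
paths_by_ds = {
    "/data/super": {},
    "/data/super/sub": {},
    "/data/super/sub/subsub": {},
}

for pdspath in sorted(paths_by_ds, reverse=True):
    # deepest dataset first, superdataset last
    print(pdspath)
# /data/super/sub/subsub
# /data/super/sub
# /data/super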
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None, message=None, rerun_info=None, rerun_outputs=None, sidecar=None): rel_pwd = rerun_info.get('pwd') if rerun_info else None if rel_pwd and dataset: # recording is relative to the dataset pwd = normpath(opj(dataset.path, rel_pwd)) rel_pwd = relpath(pwd, dataset.path) else: pwd, rel_pwd = get_command_pwds(dataset) ds = require_dataset( dataset, check_installed=True, purpose='tracking outcomes of a command') # not needed ATM #refds_path = ds.path # delayed imports from datalad.cmd import Runner lgr.debug('tracking command output underneath %s', ds) if not rerun_info and ds.repo.dirty: # Rerun already takes care of this. yield get_status_dict( 'run', ds=ds, status='impossible', message=('unsaved modifications present, ' 'cannot detect changes by command')) return cmd = normalize_command(cmd) inputs = GlobbedPaths(inputs, pwd=pwd, expand=expand in ["inputs", "both"]) if inputs: for res in ds.get(inputs.expand(full=True), on_failure="ignore"): yield res outputs = GlobbedPaths(outputs, pwd=pwd, expand=expand in ["outputs", "both"], warn=not rerun_info) if outputs: for res in _unlock_or_remove(ds, outputs.expand(full=True)): yield res if rerun_outputs is not None: # These are files we need to unlock/remove for a rerun that aren't # included in the explicit outputs. Unlike inputs/outputs, these are # full paths, so we can pass them directly to unlock. for res in _unlock_or_remove(ds, rerun_outputs): yield res sfmt = SequenceFormatter() cmd_expanded = sfmt.format(cmd, pwd=pwd, dspath=ds.path, inputs=inputs.expand(dot=False), outputs=outputs.expand(dot=False)) # we have a clean dataset, let's run things exc = None cmd_exitcode = None runner = Runner(cwd=pwd) try: lgr.info("== Command start (output follows) =====") runner.run( cmd_expanded, # immediate output log_online=True, # not yet sure what we should do with the command output # IMHO `run` itself should be very silent and let the command talk log_stdout=False, log_stderr=False, expect_stderr=True, expect_fail=True, # TODO stdin ) except CommandError as e: # strip our own info from the exception. The original command output # went to stdout/err -- we just have to exitcode in the same way exc = e cmd_exitcode = e.code if rerun_info and rerun_info.get("exit", 0) != cmd_exitcode: # we failed in a different way during a rerun. 
This can easily # happen if we try to alter a locked file # # TODO add the ability to `git reset --hard` the dataset tree on failure # we know that we started clean, so we could easily go back, needs gh-1424 # to be able to do it recursively raise exc lgr.info("== Command exit (modification check follows) =====") # amend commit message with `run` info: # - pwd if inside the dataset # - the command itself # - exit code of the command run_info = { 'cmd': cmd, 'exit': cmd_exitcode if cmd_exitcode is not None else 0, 'chain': rerun_info["chain"] if rerun_info else [], 'inputs': inputs.paths, 'outputs': outputs.paths, } if rel_pwd is not None: # only when inside the dataset to not leak information run_info['pwd'] = rel_pwd if ds.id: run_info["dsid"] = ds.id record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False) use_sidecar = sidecar or ( sidecar is None and ds.config.get('datalad.run.record-sidecar', default=False)) if use_sidecar: # record ID is hash of record itself from hashlib import md5 record_id = md5(record.encode('utf-8')).hexdigest() record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo')) record_path = op.join(ds.path, record_dir, record_id) if not op.lexists(record_path): # go for compression, even for minimal records not much difference, despite offset cost # wrap in list -- there is just one record dump2stream([run_info], record_path, compressed=True) # compose commit message msg = u"""\ [DATALAD RUNCMD] {} === Do not change lines below === {} ^^^ Do not change lines above ^^^ """ msg = msg.format( message if message is not None else _format_cmd_shorty(cmd), '"{}"'.format(record_id) if use_sidecar else record) msg = assure_bytes(msg) if not rerun_info and cmd_exitcode: msg_path = opj(relpath(ds.repo.repo.git_dir), "COMMIT_EDITMSG") with open(msg_path, "wb") as ofh: ofh.write(msg) lgr.info("The command had a non-zero exit code. " "If this is expected, you can save the changes with " "'datalad save -r -F%s .'", msg_path) raise exc else: for r in ds.add('.', recursive=True, message=msg): yield r
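# A sketch of the run-record handling above: the command metadata is serialized
# deterministically, and when a sidecar file is used the record ID is the MD5 hex
# digest of that serialization. The field values here are illustrative only.
import json
from hashlib import md5

run_info = {
    'cmd': 'ls -l',
    'exit': 0,
    'chain': [],
    'inputs': [],
    'outputs': [],
}
record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)
record_id = md5(record.encode('utf-8')).hexdigest()
print(record_id)  # stable ID for the sidecar file under .datalad/runinfo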
def diff_dataset(dataset, fr, to, constant_refs, path=None, annex=None, untracked='normal', recursive=False, recursion_limit=None, eval_file_type=True, reporting_order='depth-first'): """Internal helper to diff a dataset Parameters ---------- dataset : Dataset Dataset to perform the diff on. `fr` and `to` parameters are interpreted in the context of this dataset. fr : str Commit-ish to compare from. to : str Commit-ish to compare to. constant_refs : bool If True, `fr` and `to` will be passed on unmodified to diff operations on subdatasets. This can be useful with symbolic references like tags to report subdataset changes independent of superdataset changes. If False, `fr` and `to` will be translated to the subdataset commit-ish that match the given commit-ish in the superdataset. path : Path-like, optional Paths to constrain the diff to (see main diff() command). annex : str, optional Reporting mode for annex properties (see main diff() command). untracked : str, optional Reporting mode for untracked content (see main diff() command). recursive : bool, optional Flag to enable recursive operation (see main diff() command). recursion_limit : int, optional Recursion limit (see main diff() command). eval_file_type : bool, optional Whether to perform file type discrimination between real symlinks and symlinks representing annex'ed files. This can be expensive in datasets with many files. reporting_order : {'depth-first', 'breadth-first'}, optional By default, subdataset content records are reported after the record on the subdataset's submodule in a superdataset (depth-first). Alternatively, report all superdataset records first, before reporting any subdataset content records (breadth-first). Yields ------ dict DataLad result records. """ if reporting_order not in ('depth-first', 'breadth-first'): raise ValueError('Unknown reporting order: {}'.format(reporting_order)) ds = require_dataset(dataset, check_installed=True, purpose='difference reporting') # we cannot really perform any sorting of paths into subdatasets # or rejecting paths based on the state of the filesystem, as # we need to be able to compare with states that are not represented # in the worktree (anymore) if path: ps = [] # sort any path argument into the respective subdatasets for p in sorted(assure_list(path)): # it is important to capture the exact form of the # given path argument, before any normalization happens # distinguish rsync-link syntax to identify # a dataset as whole (e.g. 'ds') vs its # content (e.g. 'ds/') # special case is the root dataset, always report its content # changes orig_path = str(p) resolved_path = resolve_path(p, dataset) p = \ resolved_path, \ orig_path.endswith(op.sep) or resolved_path == ds.pathobj str_path = str(p[0]) root = get_dataset_root(str_path) if root is None: # no root, not possibly underneath the refds yield dict(action='status', path=str_path, refds=ds.path, status='error', message='path not underneath this dataset', logger=lgr) continue if path_under_rev_dataset(ds, str_path) is None: # nothing we support handling any further # there is only a single refds yield dict( path=str_path, refds=ds.path, action='diff', status='error', message=( "dataset containing given paths is not underneath " "the reference dataset %s: %s", ds, str_path), logger=lgr, ) continue ps.append(p) path = ps # TODO we might want to move away from the single-pass+immediate-yield # paradigm for this command. 
If we gather all information first, we # could do post-processing and detect when a file (same gitsha, or same # key) was copied/moved from another dataset. Another command (e.g. # save) could act on this information and also move/copy # availability information or at least enhance the respective commit # message with cross-dataset provenance info # cache to help avoid duplicate status queries content_info_cache = {} for res in _diff_ds( ds, fr, to, constant_refs, recursion_limit if recursion_limit is not None and recursive else -1 if recursive else 0, # TODO recode paths to repo path reference origpaths=None if not path else OrderedDict(path), untracked=untracked, annexinfo=annex, eval_file_type=eval_file_type, cache=content_info_cache, order=reporting_order): res.update( refds=ds.path, logger=lgr, action='diff', ) yield res
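# A sketch of the rsync-like path convention used when sorting the `path` arguments
# above: a trailing separator ('ds/') requests the dataset's content, while a bare
# name ('ds') addresses the dataset as a whole within its superdataset. This is a
# simplified stand-in using only the stdlib; paths in the asserts are illustrative.
import os.path as op


def wants_content(orig_path, resolved_path, root_path):
    """True if the argument addresses content rather than the dataset itself."""
    return orig_path.endswith(op.sep) or resolved_path == root_path


print(wants_content('ds/', '/tmp/ds', '/tmp/root'))   # True: trailing separator
print(wants_content('ds', '/tmp/ds', '/tmp/root'))    # False: dataset as a whole
print(wants_content('.', '/tmp/root', '/tmp/root'))   # True: the root reports content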
def __call__(*, dataset=None, what=None, dry_run=False, recursive=False, recursion_limit=None): ds = require_dataset(dataset, purpose="report on cleanable locations" if dry_run else "clean dataset") res_kwargs = dict(action='clean [dry-run]' if dry_run else 'clean', logger=lgr, refds=ds.path) for wds in itertools.chain( [ds], ds.subdatasets(state='present', recursive=recursive, recursion_limit=recursion_limit, return_type='generator', result_renderer='disabled', result_xfm='datasets') if recursive else []): d = wds.pathobj gitdir = wds.repo.dot_git DIRS_PLURAL = ("directory", "directories") FILES_PLURAL = ("file", "files") discover_or_remove = "Discovered" if dry_run else "Removed" for dirpath, flag, msg, sing_pl in [ (Path(ARCHIVES_TEMP_DIR), "cached-archives", "temporary archive", DIRS_PLURAL), (Path(ANNEX_TEMP_DIR), "annex-tmp", "temporary annex", FILES_PLURAL), (Path(ANNEX_TRANSFER_DIR), "annex-transfer", "annex temporary transfer", DIRS_PLURAL), (gitdir / Path(SEARCH_INDEX_DOTGITDIR), 'search-index', "metadata search index", FILES_PLURAL), ]: topdir = wds.pathobj / dirpath lgr.debug("Considering to clean %s:%s", d, dirpath) if not ((what is None) or (flag in what)): yield get_status_dict(path=str(topdir), status='notneeded', type='directory', **res_kwargs) continue paths = [p for p in topdir.glob('*')] if not paths: if not topdir.exists(): yield get_status_dict(path=str(topdir), status='notneeded', type='directory', **res_kwargs) continue else: # we empty topdir only message = ("%s empty %s directory", discover_or_remove, msg) else: pl = len(paths) > 1 message = ("%s %d %s %s: %s", discover_or_remove, len(paths), msg, sing_pl[int(pl)], ", ".join( sorted([ str(p.relative_to(topdir)) for p in paths if p != topdir ]))) if not dry_run: rmtree(str(topdir)) yield get_status_dict(path=str(topdir), status='ok', type='directory', message=message, **res_kwargs)
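# A sketch of the dry-run reporting pattern above: the same loop either reports or
# actually removes a cleanable location; only the verb and the side effect change.
# The directory handling and result dict below are simplified assumptions.
from pathlib import Path
import shutil


def clean_location(topdir: Path, msg: str, dry_run: bool):
    paths = sorted(p.name for p in topdir.glob('*'))
    verb = "Discovered" if dry_run else "Removed"
    if not paths and not topdir.exists():
        return dict(path=str(topdir), status='notneeded', type='directory')
    if not dry_run:
        shutil.rmtree(str(topdir))
    return dict(path=str(topdir), status='ok', type='directory',
                message=("%s %d %s: %s", verb, len(paths), msg, ", ".join(paths)))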
def __call__( reponame, dataset=None, recursive=False, recursion_limit=None, name='github', existing='error', github_login=None, github_passwd=None, github_organization=None, access_protocol='https', publish_depends=None, dryrun=False): try: # this is an absolute leaf package, import locally to avoid # unnecessary dependencies import github as gh except ImportError: raise MissingExternalDependency( 'PyGitHub', msg='GitHub-related functionality is unavailable without this package') # what to operate on ds = require_dataset( dataset, check_installed=True, purpose='create Github sibling') # gather datasets and essential info # dataset instance and mountpoint relative to the top toprocess = [(ds, '')] if recursive: for sub in ds.subdatasets( fulfilled=None, # we want to report on missing dataset in here recursive=recursive, recursion_limit=recursion_limit, result_xfm='datasets'): if not sub.is_installed(): lgr.info('Ignoring unavailable subdataset %s', sub) continue toprocess.append((sub, relpath(sub.path, start=ds.path))) # check for existing remote configuration filtered = [] for d, mp in toprocess: if name in d.repo.get_remotes(): if existing == 'error': msg = '{} already had a configured sibling "{}"'.format( d, name) if dryrun: lgr.error(msg) else: raise ValueError(msg) elif existing == 'skip': continue gh_reponame = '{}{}{}'.format( reponame, '-' if mp else '', template_fx(mp)) filtered.append((d, gh_reponame)) if not filtered: # all skipped return [] # actually make it happen on Github rinfo = _make_github_repos( gh, github_login, github_passwd, github_organization, filtered, existing, access_protocol, dryrun) # lastly configure the local datasets for d, url, existed in rinfo: if not dryrun: # first make sure that annex doesn't touch this one # but respect any existing config ignore_var = 'remote.{}.annex-ignore'.format(name) if not ignore_var in d.config: d.config.add(ignore_var, 'true', where='local') Siblings()( 'configure', dataset=d, name=name, url=url, recursive=False, # TODO fetch=True, maybe only if one existed already publish_depends=publish_depends) # TODO let submodule URLs point to Github (optional) return rinfo
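# A sketch of how per-subdataset repository names are derived above: the base name
# gets the subdataset's mount point appended, separated by a dash. The sanitizer
# stands in for the original's `template_fx` and is an assumption here.
def sanitize(mountpoint):
    # hypothetical sanitizer: flatten path separators into dashes
    return mountpoint.replace('/', '-')


def gh_reponame(reponame, mountpoint):
    return '{}{}{}'.format(reponame, '-' if mountpoint else '', sanitize(mountpoint))


assert gh_reponame('myds', '') == 'myds'
assert gh_reponame('myds', 'code/analysis') == 'myds-code-analysis'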
def __call__( path=None, dataset=None, annex=None, untracked='normal', recursive=False, recursion_limit=None, eval_subdataset_state='full'): # To the next white knight that comes in to re-implement `status` as a # special case of `diff`. There is one fundamental difference between # the two commands: `status` can always use the worktree as evident on # disk as a contraint (e.g. to figure out which subdataset a path is in) # `diff` cannot do that (everything need to be handled based on a # "virtual" representation of a dataset hierarchy). # MIH concludes that while `status` can be implemented as a special case # of `diff` doing so would complicate and slow down both `diff` and # `status`. So while the apparent almost code-duplication between the # two commands feels wrong, the benefit is speed. Any future RF should # come with evidence that speed does not suffer, and complexity stays # on a manageable level ds = require_dataset( dataset, check_installed=True, purpose='status reporting') paths_by_ds = OrderedDict() if path: # sort any path argument into the respective subdatasets for p in sorted(assure_list(path)): # it is important to capture the exact form of the # given path argument, before any normalization happens # for further decision logic below orig_path = text_type(p) p = rev_resolve_path(p, dataset) root = rev_get_dataset_root(text_type(p)) if root is None: # no root, not possibly underneath the refds yield dict( action='status', path=p, refds=ds.path, status='error', message='path not underneath this dataset', logger=lgr) continue else: if dataset and root == text_type(p) and \ not (orig_path.endswith(op.sep) or orig_path == "."): # the given path is pointing to a dataset # distinguish rsync-link syntax to identify # the dataset as whole (e.g. 'ds') vs its # content (e.g. 
'ds/') super_root = rev_get_dataset_root(op.dirname(root)) if super_root: # the dataset identified by the path argument # is contained in a superdataset, and no # trailing path separator was found in the # argument -> user wants to address the dataset # as a whole (in the superdataset) root = super_root root = ut.Path(root) ps = paths_by_ds.get(root, []) ps.append(p) paths_by_ds[root] = ps else: paths_by_ds[ds.pathobj] = None queried = set() content_info_cache = {} while paths_by_ds: qdspath, qpaths = paths_by_ds.popitem(last=False) if qpaths and qdspath in qpaths: # this is supposed to be a full query, save some # cycles sifting through the actual path arguments qpaths = [] # try to recode the dataset path wrt to the reference # dataset # the path that it might have been located by could # have been a resolved path or another funky thing qds_inrefds = path_under_rev_dataset(ds, qdspath) if qds_inrefds is None: # nothing we support handling any further # there is only a single refds yield dict( path=text_type(qdspath), refds=ds.path, action='status', status='error', message=( "dataset containing given paths is not underneath " "the reference dataset %s: %s", ds, qpaths), logger=lgr, ) continue elif qds_inrefds != qdspath: # the path this dataset was located by is not how it would # be referenced underneath the refds (possibly resolved # realpath) -> recode all paths to be underneath the refds qpaths = [qds_inrefds / p.relative_to(qdspath) for p in qpaths] qdspath = qds_inrefds if qdspath in queried: # do not report on a single dataset twice continue qds = Dataset(text_type(qdspath)) for r in _yield_status( qds, qpaths, annex, untracked, recursion_limit if recursion_limit is not None else -1 if recursive else 0, queried, eval_subdataset_state, content_info_cache): yield dict( r, refds=ds.path, action='status', status='ok', )
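# A sketch of the path-grouping step above: every requested path is binned under the
# root of the repository that contains it, so each dataset is queried once with only
# its own paths. `find_repo_root` is a stand-in for rev_get_dataset_root and must be
# supplied by the caller; error reporting for non-dataset paths is omitted here.
from collections import OrderedDict
from pathlib import Path


def group_paths_by_root(paths, find_repo_root):
    paths_by_ds = OrderedDict()
    for p in sorted(paths):
        root = find_repo_root(p)
        if root is None:
            # the real code yields an 'error' result record instead of skipping
            continue
        paths_by_ds.setdefault(Path(root), []).append(Path(p))
    return paths_by_ds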
def __call__(dataset=None, sensitive=None, sections=None, decor=None, clipboard=None): from datalad.distribution.dataset import require_dataset from datalad.support.exceptions import NoDatasetArgumentFound from datalad.interface.results import get_status_dict ds = None try: ds = require_dataset(dataset, check_installed=False, purpose='reporting') except NoDatasetArgumentFound: # failure is already logged pass if ds and not ds.is_installed(): # we don't deal with absent datasets ds = None if sensitive: if ds is None: from datalad import cfg else: cfg = ds.config else: cfg = None from datalad.ui import ui from datalad.support.external_versions import external_versions infos = OrderedDict() res = get_status_dict( action='wtf', path=ds.path if ds else op.abspath(op.curdir), type='dataset' if ds else 'directory', status='ok', logger=lgr, decor=decor, infos=infos, ) # Define section callables which require variables. # so there is no side-effect on module level original section_callables = SECTION_CALLABLES.copy() section_callables['location'] = partial(_describe_location, res) section_callables['configuration'] = \ partial(_describe_configuration, cfg, sensitive) if ds: section_callables['dataset'] = \ partial(_describe_dataset, ds, sensitive) else: section_callables.pop('dataset') assert all(section_callables.values()) # check if none was missed if sections is None: sections = sorted(list(section_callables)) for s in sections: infos[s] = section_callables[s]() if clipboard: external_versions.check( 'pyperclip', msg="It is needed to be able to use clipboard") import pyperclip report = _render_report(res) pyperclip.copy(assure_bytes(report)) ui.message("WTF information of length %s copied to clipboard" % len(report)) yield res return
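# A sketch of the section dispatch used above: each report section is a callable
# (with pre-bound arguments via functools.partial), and the requested sections are
# simply looked up and invoked in order. Section names and describers below are
# illustrative, not the shipped SECTION_CALLABLES.
from collections import OrderedDict
from functools import partial


def describe_python(verbose):
    return {'implementation': 'CPython', 'verbose': verbose}


section_callables = {
    'python': partial(describe_python, False),
}

infos = OrderedDict()
for s in sorted(section_callables):
    infos[s] = section_callables[s]()
print(infos)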
def __call__(dataset, filename=None, missing_content='error', no_annex=False, # TODO: support working with projects and articles within them # project_id=None, article_id=None): import os import logging lgr = logging.getLogger('datalad.plugin.export_to_figshare') from datalad.ui import ui from datalad.api import add_archive_content from datalad.api import export_archive from datalad.distribution.dataset import require_dataset from datalad.support.annexrepo import AnnexRepo dataset = require_dataset(dataset, check_installed=True, purpose='export to figshare') if not isinstance(dataset.repo, AnnexRepo): raise ValueError( "%s is not an annex repo, so annexification could be done" % dataset ) if dataset.repo.is_dirty(): raise RuntimeError( "Paranoid authors of DataLad refuse to proceed in a dirty repository" ) if filename is None: filename = dataset.path lgr.info( "Exporting current tree as an archive under %s since figshare " "does not support directories", filename ) archive_out = next( export_archive( dataset, filename=filename, archivetype='zip', missing_content=missing_content, return_type="generator" ) ) assert archive_out['status'] == 'ok' fname = archive_out['path'] lgr.info("Uploading %s to figshare", fname) figshare = FigshareRESTLaison() if not article_id: # TODO: ask if it should be an article within a project if ui.is_interactive: # or should we just upload to a new article? if ui.yesno( "Would you like to create a new article to upload to? " "If not - we will list existing articles", title="Article" ): article = figshare.create_article( title=os.path.basename(dataset.path) ) lgr.info( "Created a new (private) article %(id)s at %(url_private_html)s. " "Please visit it, enter additional meta-data and make public", article ) article_id = article['id'] else: article_id = int(ui.question( "Which of the articles should we upload to.", choices=list(map(str, figshare.get_article_ids())) )) if not article_id: raise ValueError("We need an article to upload to.") file_info = figshare.upload_file( fname, files_url='account/articles/%s/files' % article_id ) if no_annex: lgr.info("Removing generated tarball") unlink(fname) else: # I will leave all the complaining etc to the dataset add if path # is outside etc lgr.info("'Registering' %s within annex", fname) repo = dataset.repo repo.add(fname, git=False) key = repo.get_file_key(fname) lgr.info("Adding URL %(download_url)s for it", file_info) repo._annex_custom_command([], [ "git", "annex", "registerurl", '-c', 'annex.alwayscommit=false', key, file_info['download_url'] ] ) lgr.info("Registering links back for the content of the archive") add_archive_content( fname, annex=dataset.repo, delete_after=True, # just remove extracted into a temp dir allow_dirty=True, # since we have a tarball commit=False # we do not want to commit anything we have done here ) lgr.info("Removing generated and now registered in annex archive") repo.drop(key, key=True, options=['--force']) repo.remove(fname, force=True) # remove the tarball # if annex in {'delete'}: # dataset.repo.remove(fname) # else: # # kinda makes little sense I guess. 
# # Made more sense if export_archive could export an arbitrary treeish # # so we could create a branch where to dump and export to figshare # # (kinda closer to my idea) # dataset.save(fname, message="Added the entire dataset into a zip file") # TODO: add to downloader knowledge about figshare token so it could download-url # those zipballs before they go public yield dict( status='ok', # TODO: add article url (which needs to be queried if only ID is known message="Published archive {}".format( file_info['download_url']), file_info=file_info, path=dataset, action='export_to_figshare', logger=lgr )
def __call__( path=None, sibling=None, merge=False, dataset=None, recursive=False, recursion_limit=None, fetch_all=None, reobtain_data=False): """ """ if fetch_all is not None: lgr.warning('update(fetch_all=...) called. Option has no effect, and will be removed') if not dataset and not path: # try to find a dataset in PWD dataset = require_dataset( None, check_installed=True, purpose='updating') refds_path = Interface.get_refds_path(dataset) if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path save_paths = [] for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, recursive=recursive, recursion_limit=recursion_limit, action='update', unavailable_path_status='impossible', nondataset_path_status='error', return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if not ap.get('type', None) == 'dataset': ap.update( status='impossible', message="can only update datasets") yield ap continue # this is definitely as dataset from here on ds = Dataset(ap['path']) if not ds.is_installed(): lgr.debug("Skipping update since not installed %s", ds) continue repo = ds.repo # prepare return value # TODO reuse AP for return props res = get_status_dict('update', ds=ds, logger=lgr, refds=refds_path) # get all remotes which have references (would exclude # special remotes) remotes = repo.get_remotes( **({'exclude_special_remotes': True} if isinstance(repo, AnnexRepo) else {})) if not remotes and not sibling: res['message'] = ("No siblings known to dataset at %s\nSkipping", repo.path) res['status'] = 'notneeded' yield res continue if not sibling and len(remotes) == 1: # there is only one remote, must be this one sibling_ = remotes[0] elif not sibling: # nothing given, look for tracking branch sibling_ = repo.get_tracking_branch()[0] else: sibling_ = sibling if sibling_ and sibling_ not in remotes: res['message'] = ("'%s' not known to dataset %s\nSkipping", sibling_, repo.path) res['status'] = 'impossible' yield res continue if not sibling_ and len(remotes) > 1 and merge: lgr.debug("Found multiple siblings:\n%s" % remotes) res['status'] = 'impossible' res['message'] = "Multiple siblings, please specify from which to update." yield res continue lgr.info("Fetching updates for %s", ds) # fetch remote fetch_kwargs = dict( # test against user-provided value! remote=None if sibling is None else sibling_, all_=sibling is None, # required to not trip over submodules that # were removed in the origin clone recurse_submodules="no", prune=True) # prune to not accumulate a mess over time repo.fetch(**fetch_kwargs) # NOTE if any further acces to `repo` is needed, reevaluate # ds.repo again, as it might have be converted from an GitRepo # to an AnnexRepo if merge: for fr in _update_repo(ds, sibling_, reobtain_data): yield fr res['status'] = 'ok' yield res save_paths.append(ap['path']) if recursive: save_paths = [p for p in save_paths if p != refds_path] if not save_paths: return lgr.debug( 'Subdatasets where updated state may need to be ' 'saved in the parent dataset: %s', save_paths) for r in Dataset(refds_path).save( path=save_paths, recursive=False, message='[DATALAD] Save updated subdatasets'): yield r
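# A sketch of the sibling-selection heuristic above: an explicitly given sibling wins
# (but must be a known remote); with exactly one known remote that remote is used;
# otherwise the tracking branch's remote is tried. Stdlib-only stand-in; the real
# code yields 'impossible'/'notneeded' result records instead of returning None.
def pick_sibling(sibling, remotes, tracking_remote):
    if sibling:
        return sibling if sibling in remotes else None
    if len(remotes) == 1:
        return remotes[0]
    return tracking_remote  # may be None -> caller must ask the user


assert pick_sibling(None, ['origin'], None) == 'origin'
assert pick_sibling('backup', ['origin'], None) is None
assert pick_sibling(None, ['origin', 'backup'], 'origin') == 'origin'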
def __call__( path=None, dataset=None, get_aggregates=False, reporton='all', recursive=False): # prep results refds_path = Interface.get_refds_path(dataset) res_kwargs = dict(action='metadata', logger=lgr) if refds_path: res_kwargs['refds'] = refds_path if get_aggregates: # yield all datasets for which we have aggregated metadata as results # the get actual dataset results, so we can turn them into dataset # instances using generic top-level code if desired ds = require_dataset( refds_path, check_installed=True, purpose='aggregate metadata query') agginfos = load_ds_aggregate_db( ds, version=str(aggregate_layout_version), abspath=True ) if not agginfos: # if there has ever been an aggregation run, this file would # exist, hence there has not been and we need to tell this # to people yield get_status_dict( ds=ds, status='impossible', action='metadata', logger=lgr, message='metadata aggregation has never been performed in this dataset') return parentds = [] for dspath in sorted(agginfos): info = agginfos[dspath] if parentds and not path_is_subpath(dspath, parentds[-1]): parentds.pop() info.update( path=dspath, type='dataset', status='ok', ) if dspath == ds.path: info['layout_version'] = aggregate_layout_version if parentds: info['parentds'] = parentds[-1] yield dict( info, **res_kwargs ) parentds.append(dspath) return if not dataset and not path: # makes no sense to have no dataset, go with "here" # error generation happens during annotation path = op.curdir content_by_ds = OrderedDict() for ap in AnnotatePaths.__call__( dataset=refds_path, path=path, # MIH: we are querying the aggregated metadata anyways, and that # mechanism has its own, faster way to go down the hierarchy #recursive=recursive, #recursion_limit=recursion_limit, action='metadata', # uninstalled subdatasets could be queried via aggregated metadata # -> no 'error' unavailable_path_status='', nondataset_path_status='error', # we need to know when to look into aggregated data force_subds_discovery=True, force_parentds_discovery=True, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']): ap['process_content'] = True to_query = None if ap.get('state', None) == 'absent' or \ ap.get('type', 'dataset') != 'dataset': # this is a lonely absent dataset/file or content in a present dataset # -> query through parent # there must be a parent, otherwise this would be a non-dataset path # and would have errored during annotation to_query = ap['parentds'] else: to_query = ap['path'] if to_query: pcontent = content_by_ds.get(to_query, []) pcontent.append(ap) content_by_ds[to_query] = pcontent for ds_path in content_by_ds: ds = Dataset(ds_path) query_agg = [ap for ap in content_by_ds[ds_path] # this is an available subdataset, will be processed in another # iteration if ap.get('state', None) == 'absent' or not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)] if not query_agg: continue # report from aggregated metadata for r in query_aggregated_metadata( reporton, # by default query the reference dataset, only if there is none # try our luck in the dataset that contains the queried path # this is consistent with e.g. 
`get_aggregates` reporting the # situation in the reference dataset only Dataset(refds_path) if refds_path else ds, query_agg, # recursion above could only recurse into datasets # on the filesystem, but there might be any number of # uninstalled datasets underneath the last installed one # for which we might have metadata recursive=recursive, **res_kwargs): yield r return
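# A sketch of the parent tracking in the get_aggregates branch above: walking the
# aggregated dataset paths in sorted order while maintaining a stack of ancestors
# lets each record report its closest parent dataset. `is_subpath` is a simplified
# stand-in for path_is_subpath, and the paths are illustrative.
import os.path as op


def is_subpath(path, prefix):
    return path != prefix and path.startswith(prefix + op.sep)


def annotate_parents(dspaths):
    parents, out = [], []
    for dspath in sorted(dspaths):
        while parents and not is_subpath(dspath, parents[-1]):
            parents.pop()
        out.append((dspath, parents[-1] if parents else None))
        parents.append(dspath)
    return out


print(annotate_parents(['/ds', '/ds/sub', '/ds/sub/deep', '/other']))
# [('/ds', None), ('/ds/sub', '/ds'), ('/ds/sub/deep', '/ds/sub'), ('/other', None)]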
def __call__( spec=None, dataset=None, discover=False, help_proc=False): if not spec and not discover: raise InsufficientArgumentsError('requires at least a procedure name') if help_proc and not spec: raise InsufficientArgumentsError('requires a procedure name') try: ds = require_dataset( dataset, check_installed=False, purpose='run a procedure') except NoDatasetArgumentFound: ds = None if discover: reported = set() for m, cmd_name, cmd_tmpl, cmd_help in \ _get_procedure_implementation('*', ds=ds): if m in reported: continue ex = _guess_exec(m) # configured template (call-format string) takes precedence: if cmd_tmpl: ex['template'] = cmd_tmpl if ex['type'] is None and ex['template'] is None: # doesn't seem like a match lgr.debug("Neither type nor execution template found for " "%s. Ignored.", m) continue message = ex['type'] if ex['type'] else 'unknown type' message += ' (missing)' if ex['state'] == 'absent' else '' res = get_status_dict( action='discover_procedure', path=m, type='file', logger=lgr, refds=ds.path if ds else None, status='ok', state=ex['state'], procedure_name=cmd_name, procedure_type=ex['type'], procedure_callfmt=ex['template'], procedure_help=cmd_help, message=message) reported.add(m) yield res return if not isinstance(spec, (tuple, list)): # maybe coming from config import shlex spec = shlex.split(spec) name = spec[0] args = spec[1:] try: # get the first match an run with it procedure_file, cmd_name, cmd_tmpl, cmd_help = \ next(_get_procedure_implementation(name, ds=ds)) except StopIteration: res = get_status_dict( action='run_procedure', # TODO: Default renderer requires a key "path" to exist. # Doesn't make a lot of sense in this case path=name, logger=lgr, refds=ds.path if ds else None, status='impossible', message="Cannot find procedure with name '%s'" % name) yield res return ex = _guess_exec(procedure_file) # configured template (call-format string) takes precedence: if cmd_tmpl: ex['template'] = cmd_tmpl if help_proc: if cmd_help: res = get_status_dict( action='procedure_help', path=procedure_file, type='file', logger=lgr, refds=ds.path if ds else None, status='ok', state=ex['state'], procedure_name=cmd_name, procedure_type=ex['type'], procedure_callfmt=ex['template'], message=cmd_help) else: res = get_status_dict( action='procedure_help', path=procedure_file, type='file', logger=lgr, refds=ds.path if ds else None, status='impossible', state=ex['state'], procedure_name=cmd_name, procedure_type=ex['type'], procedure_callfmt=ex['template'], message="No help available for '%s'" % name) yield res return if not ex['template']: raise ValueError("No idea how to execute procedure %s. " "Missing 'execute' permissions?" % procedure_file) cmd = ex['template'].format( script=procedure_file, ds=ds.path if ds else '', args=u' '.join(u'"{}"'.format(a) for a in args) if args else '') lgr.info("Running procedure %s", name) lgr.debug('Full procedure command: %r', cmd) for r in Run.__call__( cmd=cmd, dataset=ds, explicit=True, inputs=None, outputs=None, # pass through here on_failure='ignore', return_type='generator' ): yield r
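# A sketch of the call-format expansion above: a procedure's template is a plain
# str.format string with {script}, {ds} and {args} placeholders, and positional
# arguments are double-quoted individually. The template and paths below are
# illustrative assumptions, not a shipped default.
template = 'python "{script}" "{ds}" {args}'
cmd = template.format(
    script='/path/to/cfg_myproc.py',
    ds='/path/to/dataset',
    args=u' '.join(u'"{}"'.format(a) for a in ['lenient', '2']))
print(cmd)
# python "/path/to/cfg_myproc.py" "/path/to/dataset" "lenient" "2"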
def __call__(sshurl, name=None, target_dir=None, target_url=None, target_pushurl=None, dataset=None, recursive=False, recursion_limit=None, existing='error', shared=None, group=None, ui=False, as_common_datasrc=None, publish_by_default=None, publish_depends=None, annex_wanted=None, annex_group=None, annex_groupwanted=None, inherit=False, since=None): # # nothing without a base dataset # ds = require_dataset(dataset, check_installed=True, purpose='creating a sibling') refds_path = ds.path # # all checks that are possible before we start parsing the dataset # # possibly use sshurl to get the name in case if not specified if not sshurl: if not inherit: raise InsufficientArgumentsError( "needs at least an SSH URL, if no inherit option" ) if name is None: raise ValueError( "Neither SSH URL, nor the name of sibling to inherit from " "was specified" ) # It might well be that we already have this remote setup try: sshurl = CreateSibling._get_remote_url(ds, name) except Exception as exc: lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc)) elif inherit: raise ValueError( "For now, for clarity not allowing specifying a custom sshurl " "while inheriting settings" ) # may be could be safely dropped -- still WiP if not sshurl: # TODO: may be more back up before _prep? super_ds = ds.get_superdataset() if not super_ds: raise ValueError( "Could not determine super dataset for %s to inherit URL" % ds ) super_url = CreateSibling._get_remote_url(super_ds, name) # for now assuming hierarchical setup # (TODO: to be able to destinguish between the two, probably # needs storing datalad.*.target_dir to have %RELNAME in there) sshurl = slash_join(super_url, relpath(ds.path, super_ds.path)) # check the login URL sshri = RI(sshurl) if not is_ssh(sshri): raise ValueError( "Unsupported SSH URL: '{0}', " "use ssh://host/path or host:path syntax".format(sshurl)) if not name: # use the hostname as default remote name name = sshri.hostname lgr.debug( "No sibling name given, use URL hostname '%s' as sibling name", name) if since == '': # consider creating siblings only since the point of # the last update # XXX here we assume one to one mapping of names from local branches # to the remote active_branch = ds.repo.get_active_branch() since = '%s/%s' % (name, active_branch) # # parse the base dataset to find all subdatasets that need processing # to_process = [] for ap in AnnotatePaths.__call__( dataset=refds_path, # only a single path! path=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='create_sibling', # both next should not happen anyways unavailable_path_status='impossible', nondataset_path_status='error', modified=since, return_type='generator', on_failure='ignore'): if ap.get('status', None): # this is done yield ap continue if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent': # this can happen when there is `since`, but we have no # use for anything but datasets here continue checkds_remotes = Dataset(ap['path']).repo.get_remotes() \ if ap.get('state', None) != 'absent' \ else [] if publish_depends: # make sure dependencies are valid # TODO: inherit -- we might want to automagically create # those dependents as well??? 
unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes) if unknown_deps: ap['status'] = 'error' ap['message'] = ( 'unknown sibling(s) specified as publication dependency: %s', unknown_deps) yield ap continue if name in checkds_remotes and existing in ('error', 'skip'): ap['status'] = 'error' if existing == 'error' else 'notneeded' ap['message'] = ( "sibling '%s' already configured (specify alternative name, or force " "reconfiguration via --existing", name) yield ap continue to_process.append(ap) if not to_process: # we ruled out all possibilities # TODO wait for gh-1218 and make better return values lgr.info("No datasets qualify for sibling creation. " "Consider different settings for --existing " "or --since if this is unexpected") return if target_dir is None: if sshri.path: target_dir = sshri.path else: target_dir = '.' # TODO: centralize and generalize template symbol handling replicate_local_structure = "%RELNAME" not in target_dir # request ssh connection: lgr.info("Connecting ...") assert(sshurl is not None) # delayed anal verification ssh = ssh_manager.get_connection(sshurl) if not ssh.get_annex_version(): raise MissingExternalDependency( 'git-annex', msg='on the remote system') # # all checks done and we have a connection, now do something # # loop over all datasets, ordered from top to bottom to make test # below valid (existing directories would cause the machinery to halt) # But we need to run post-update hook in depth-first fashion, so # would only collect first and then run (see gh #790) yielded = set() remote_repos_to_run_hook_for = [] for currentds_ap in \ sorted(to_process, key=lambda x: x['path'].count('/')): current_ds = Dataset(currentds_ap['path']) path = _create_dataset_sibling( name, current_ds, ds.path, ssh, replicate_local_structure, sshri, target_dir, target_url, target_pushurl, existing, shared, group, publish_depends, publish_by_default, ui, as_common_datasrc, annex_wanted, annex_group, annex_groupwanted, inherit ) if not path: # nothing new was created # TODO is 'notneeded' appropriate in this case? currentds_ap['status'] = 'notneeded' # TODO explain status in 'message' yield currentds_ap yielded.add(currentds_ap['path']) continue remote_repos_to_run_hook_for.append((path, currentds_ap)) # publish web-interface to root dataset on publication server if current_ds.path == ds.path and ui: lgr.info("Uploading web interface to %s" % path) try: CreateSibling.upload_web_interface(path, ssh, shared, ui) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to push web interface to the remote datalad repository (%s)", exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue # in reverse order would be depth first lgr.info("Running post-update hooks in all created siblings") # TODO: add progressbar for path, currentds_ap in remote_repos_to_run_hook_for[::-1]: # Trigger the hook lgr.debug("Running hook for %s (if exists and executable)", path) try: ssh("cd {} " "&& ( [ -x hooks/post-update ] && hooks/post-update || : )" "".format(sh_quote(_path_(path, ".git")))) except CommandError as e: currentds_ap['status'] = 'error' currentds_ap['message'] = ( "failed to run post-update hook under remote path %s (%s)", path, exc_str(e)) yield currentds_ap yielded.add(currentds_ap['path']) continue if not currentds_ap['path'] in yielded: # if we were silent until now everything is just splendid currentds_ap['status'] = 'ok' yield currentds_ap
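# A sketch of the `--since=""` shortcut above: an empty value is expanded to the
# sibling's remote tracking ref "<name>/<active branch>", so only datasets changed
# since the last publication are processed. Values here are illustrative.
def expand_since(since, name, active_branch):
    if since == '':
        return '%s/%s' % (name, active_branch)
    return since


assert expand_since('', 'server', 'master') == 'server/master'
assert expand_since('HEAD~3', 'server', 'master') == 'HEAD~3'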
def __call__(path=None, message=None, dataset=None, version_tag=None, recursive=False, recursion_limit=None, updated=False, message_file=None, to_git=None, ): if message and message_file: raise ValueError( "Both a message and message file were specified for save()") path = assure_list(path) if message_file: with open(message_file) as mfh: message = mfh.read() # we want 'normal' to achieve the most compact argument list # for git calls # untracked_mode = 'no' if updated else 'normal' # TODO however, Repo.add() would refuse to add any dotfiles # in a directory that is itself untracked, hence the only # choice is to go with potentially crazy long lists # until https://github.com/datalad/datalad/issues/1454 # has a resolution untracked_mode = 'no' if updated else 'all' # there are three basic scenarios: # 1. save modifications to any already tracked content # 2. save any content (including removal of deleted content) # to bring things to a clean state # 3. like (2), but only operate on a given subset of content # identified by paths # - all three have to work in conjunction with --recursive # - the difference between (1) and (2) should be no more # that a switch from --untracked=no to --untracked=all # in Repo.save() # we do not support # - simultaneous operations on multiple datasets from disjoint # dataset hierarchies, hence a single reference dataset must be # identifiable from the either # - curdir or # - the `dataset` argument. # This avoids complex annotation loops and hierarchy tracking. # - any modification upwards from the root dataset ds = require_dataset(dataset, check_installed=True, purpose='saving') # use status() to do all discovery and annotation of paths paths_by_ds = {} for s in Status()( # ATTN: it is vital to pass the `dataset` argument as it, # and not a dataset instance in order to maintain the path # semantics between here and the status() call dataset=dataset, path=path, untracked=untracked_mode, recursive=recursive, recursion_limit=recursion_limit, result_renderer='disabled'): # fish out status dict for this parent dataset ds_status = paths_by_ds.get(s['parentds'], {}) # reassemble path status info as repo.status() would have made it ds_status[ut.Path(s['path'])] = \ {k: v for k, v in iteritems(s) if k not in ( 'path', 'parentds', 'refds', 'status', 'action', 'logger')} paths_by_ds[s['parentds']] = ds_status lgr.debug('Determined %i datasets for saving from input arguments', len(paths_by_ds)) # figure out what datasets to process, start with the ones containing # the paths that were given as arguments discovered_datasets = list(paths_by_ds.keys()) if dataset: # if a reference dataset was given we want to save all the way up # to it, so let's throw it into the mix discovered_datasets.append(ds.path) # sort the datasets into (potentially) disjoint hierarchies, # or a single one, if a reference dataset was given dataset_hierarchies = get_tree_roots(discovered_datasets) for rootds, children in iteritems(dataset_hierarchies): edges = {} discover_dataset_trace_to_targets( rootds, children, [], edges, includeds=children) for superds, subdss in iteritems(edges): superds_status = paths_by_ds.get(superds, {}) for subds in subdss: # TODO actually start from an entry that may already # exist in the status record superds_status[ut.Path(subds)] = dict( # shot from the hip, some status config # to trigger this specific super/sub # relation to be saved state='untracked', type='dataset') paths_by_ds[superds] = superds_status # TODO parallelize, whenever we have multiple subdataset of a 
single # dataset they can all be processed simultaneously # sort list of dataset to handle, starting with the ones deep down for pdspath in sorted(paths_by_ds, reverse=True): pds = Dataset(pdspath) # pop status for this dataset, we are not coming back to it pds_status = { # for handing over to the low-level code, we recode any # path relative to the real repo location, this avoid # cumbersome symlink handling without context in the # lower levels pds.repo.pathobj / p.relative_to(pdspath): props for p, props in iteritems(paths_by_ds.pop(pdspath))} start_commit = pds.repo.get_hexsha() if not all(p['state'] == 'clean' for p in pds_status.values()): for res in pds.repo.save_( message=message, # make sure to have the `path` arg be None, as we want # to prevent and bypass any additional repo.status() # calls paths=None, # prevent whining of GitRepo git=True if not hasattr(ds.repo, 'annexstatus') else to_git, # we are supplying the full status already, do not # detect anything else untracked='no', _status=pds_status): # TODO remove stringification when datalad-core can handle # path objects, or when PY3.6 is the lowest supported # version for k in ('path', 'refds'): if k in res: res[k] = text_type( # recode path back to dataset path anchor pds.pathobj / res[k].relative_to( pds.repo.pathobj) ) yield res # report on the dataset itself dsres = dict( action='save', type='dataset', path=pds.path, refds=ds.path, status='ok' if start_commit != pds.repo.get_hexsha() else 'notneeded', logger=lgr, ) if not version_tag: yield dsres continue try: pds.repo.tag(version_tag) dsres.update( status='ok', version_tag=version_tag) yield dsres except CommandError as e: if dsres['status'] == 'ok': # first we yield the result for the actual save yield dsres.copy() # and now complain that tagging didn't work dsres.update( status='error', message=('cannot tag this version: %s', e.stderr.strip())) yield dsres
def __call__(match, dataset=None, search=None, report=None, report_matched=False, format='custom', regex=False): lgr.debug("Initiating search for match=%r and dataset %r", match, dataset) try: ds = require_dataset(dataset, check_installed=True, purpose='dataset search') if ds.id is None: raise NoDatasetArgumentFound( "This does not seem to be a dataset (no DataLad dataset ID " "found). 'datalad create --force %s' can initialize " "this repository as a DataLad dataset" % ds.path) except NoDatasetArgumentFound: exc_info = sys.exc_info() if dataset is None: if not ui.is_interactive: raise NoDatasetArgumentFound( "No DataLad dataset found. Specify a dataset to be " "searched, or run interactively to get assistance " "installing a queriable superdataset." ) # none was provided so we could ask user either he possibly wants # to install our beautiful mega-duper-super-dataset? # TODO: following logic could possibly benefit other actions. if os.path.exists(LOCAL_CENTRAL_PATH): central_ds = Dataset(LOCAL_CENTRAL_PATH) if central_ds.is_installed(): if ui.yesno( title="No DataLad dataset found at current location", text="Would you like to search the DataLad " "superdataset at %r?" % LOCAL_CENTRAL_PATH): pass else: reraise(*exc_info) else: raise NoDatasetArgumentFound( "No DataLad dataset found at current location. " "The DataLad superdataset location %r exists, " "but does not contain an dataset." % LOCAL_CENTRAL_PATH) elif ui.yesno( title="No DataLad dataset found at current location", text="Would you like to install the DataLad " "superdataset at %r?" % LOCAL_CENTRAL_PATH): from datalad.api import install central_ds = install(LOCAL_CENTRAL_PATH, source='///') ui.message( "From now on you can refer to this dataset using the " "label '///'" ) else: reraise(*exc_info) lgr.info( "Performing search using DataLad superdataset %r", central_ds.path ) for res in central_ds.search( match, search=search, report=report, report_matched=report_matched, format=format, regex=regex): yield res return else: raise cache_dir = opj(opj(ds.path, get_git_dir(ds.path)), 'datalad', 'cache') mcache_fname = opj(cache_dir, 'metadata.p%d' % pickle.HIGHEST_PROTOCOL) meta = None if os.path.exists(mcache_fname): lgr.debug("use cached metadata of '{}' from {}".format(ds, mcache_fname)) meta, checksum = pickle.load(open(mcache_fname, 'rb')) # TODO add more sophisticated tests to decide when the cache is no longer valid if checksum != ds.repo.get_hexsha(): # errrr, try again below meta = None # don't put in 'else', as yet to be written tests above might fail and require # regenerating meta data if meta is None: lgr.info("Loading and caching local meta-data... 
might take a few seconds") if not exists(cache_dir): os.makedirs(cache_dir) meta = get_metadata(ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False) # merge all info on datasets into a single dict per dataset meta = flatten_metadata_graph(meta) # extract graph, if any meta = meta.get('@graph', meta) # build simple queriable representation if not isinstance(meta, list): meta = [meta] # sort entries by location (if present) sort_keys = ('location', 'description', 'id') meta = sorted(meta, key=lambda m: tuple(m.get(x, "") for x in sort_keys)) # use pickle to store the optimized graph in the cache pickle.dump( # graph plus checksum from what it was built (meta, ds.repo.get_hexsha()), open(mcache_fname, 'wb')) lgr.debug("cached meta data graph of '{}' in {}".format(ds, mcache_fname)) if report in ('', ['']): report = [] elif report and not isinstance(report, list): report = [report] match = assure_list(match) search = assure_list(search) # convert all to lower case for case insensitive matching search = {x.lower() for x in search} def get_in_matcher(m): """Function generator to provide closure for a specific value of m""" mlower = m.lower() def matcher(s): return mlower in s.lower() return matcher matchers = [ re.compile(match_).search if regex else get_in_matcher(match_) for match_ in match ] # location should be reported relative to current location # We will assume that noone chpwd while we are yielding ds_path_prefix = get_path_prefix(ds.path) # So we could provide a useful message whenever there were not a single # dataset with specified `--search` properties observed_properties = set() # for every meta data set for mds in meta: hit = False hits = [False] * len(matchers) matched_fields = set() if not mds.get('type', mds.get('schema:type', None)) == 'Dataset': # we are presently only dealing with datasets continue # TODO consider the possibility of nested and context/graph dicts # but so far we were trying to build simple lists of dicts, as much # as possible if not isinstance(mds, dict): raise NotImplementedError("nested meta data is not yet supported") # manual loop for now for k, v in iteritems(mds): if search: k_lower = k.lower() if k_lower not in search: if observed_properties is not None: # record for providing a hint later observed_properties.add(k_lower) continue # so we have a hit, no need to track observed_properties = None if isinstance(v, dict) or isinstance(v, list): v = text_type(v) for imatcher, matcher in enumerate(matchers): if matcher(v): hits[imatcher] = True matched_fields.add(k) if all(hits): hit = True # no need to do it longer than necessary if not report_matched: break if hit: location = mds.get('location', '.') report_ = matched_fields.union(report if report else {}) \ if report_matched else report if report_ == ['*']: report_dict = mds elif report_: report_dict = {k: mds[k] for k in report_ if k in mds} if report_ and not report_dict: lgr.debug( 'meta data match for %s, but no to-be-reported ' 'properties (%s) found. Present properties: %s', location, ", ".join(report_), ", ".join(sorted(mds)) ) else: report_dict = {} # it was empty but not None -- asked to # not report any specific field if isinstance(location, (list, tuple)): # could be that the same dataset installed into multiple # locations. 
For now report them separately for l in location: yield opj(ds_path_prefix, l), report_dict else: yield opj(ds_path_prefix, location), report_dict if search and observed_properties is not None: import difflib suggestions = { s: difflib.get_close_matches(s, observed_properties) for s in search } suggestions_str = "\n ".join( "%s for %s" % (", ".join(choices), s) for s, choices in iteritems(suggestions) if choices ) lgr.warning( "Found no properties which matched one of the one you " "specified (%s). May be you meant one among: %s.\n" "Suggestions:\n" " %s", ", ".join(search), ", ".join(observed_properties), suggestions_str if suggestions_str.strip() else "none" )
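# A sketch of the matcher construction above: each query term becomes a callable,
# either a compiled regex search or a case-insensitive substring test built by a
# closure, and a metadata record is a hit only when every matcher fires on at least
# one of its fields. The query terms below are illustrative.
import re


def get_in_matcher(m):
    mlower = m.lower()

    def matcher(s):
        return mlower in s.lower()
    return matcher


def build_matchers(terms, regex=False):
    return [re.compile(t).search if regex else get_in_matcher(t)
            for t in terms]


matchers = build_matchers(['neuro', 'fmri'])
print(all(m('OpenNeuro fMRI dataset') for m in matchers))  # True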
def __call__( path=None, dataset=None, recursive=False, check=True, if_dirty='save-before'): if dataset: dataset = require_dataset( dataset, check_installed=False, purpose='removal') if not dataset.is_installed() and not path: # all done already return [] if not path: # act on the whole dataset if nothing else was specified path = dataset.path if isinstance(dataset, Dataset) else dataset content_by_ds, unavailable_paths = Interface._prep( path=path, dataset=dataset, recursive=recursive) nonexistent_paths = [] for p in unavailable_paths: # we need to check whether any of these correspond # to a known subdataset, and add those to the list of # things to be removed toppath = get_dataset_root(p) if not toppath: nonexistent_paths.append(p) continue if p in Dataset(toppath).get_subdatasets( recursive=False, absolute=True): # this is a known subdataset that needs to be removed pl = content_by_ds.get(p, []) pl.append(p) content_by_ds[p] = pl if nonexistent_paths: lgr.warning("ignoring non-existent path(s): %s", nonexistent_paths) if path_is_under(content_by_ds): # behave like `rm` and refuse to remove where we are raise ValueError( "refusing to uninstall current or parent directory") handle_dirty_datasets( content_by_ds, mode=if_dirty, base=dataset) ds2save = set() results = [] # iterate over all datasets, starting at the bottom # to make the removal of dataset content known upstairs for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) paths = content_by_ds[ds_path] if ds_path in paths: # entire dataset needs to go superds = ds.get_superdataset( datalad_only=False, topmost=False) res = _uninstall_dataset(ds, check=check, has_super=False) results.extend(res) if ds.path in ds2save: # we just uninstalled it, no need to save anything ds2save.discard(ds.path) if not superds: continue subds_relpath = relpath(ds_path, start=superds.path) # remove submodule reference submodule = [sm for sm in superds.repo.repo.submodules if sm.path == subds_relpath] # there can only be one! assert(len(submodule) == 1) submodule = submodule[0] submodule.remove() if exists(ds_path): # could be an empty dir in case an already uninstalled subdataset # got removed os.rmdir(ds_path) # need to save changes to .gitmodules later content_by_ds[superds.path] = \ content_by_ds.get(superds.path, []) \ + [opj(superds.path, '.gitmodules'), ds_path] ds2save.add(superds.path) else: if check and hasattr(ds.repo, 'drop'): _drop_files(ds, paths, check=True) results.extend(ds.repo.remove(paths, r=True)) ds2save.add(ds.path) if dataset and dataset.is_installed(): # forge chain from base dataset to any leaf dataset # in order to save state changes all the way up _discover_trace_to_known(dataset.path, [], content_by_ds) save_dataset_hierarchy( content_by_ds, base=dataset.path if dataset and dataset.is_installed() else None, message='[DATALAD] removed content') return results
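# A sketch of the `rm`-like safety check above: removal is refused when the working
# directory lies under (or is) any of the paths about to be removed. `is_under` is a
# simplified stand-in for the original's path_is_under helper.
import os
import os.path as op


def is_under(candidates, pwd=None):
    pwd = op.realpath(pwd or os.getcwd())
    for c in candidates:
        c = op.realpath(c)
        if pwd == c or pwd.startswith(c + op.sep):
            return True
    return False


print(is_under(['/tmp/ds'], pwd='/tmp/ds/sub'))   # True  -> refuse removal
print(is_under(['/tmp/ds'], pwd='/tmp/other'))    # False -> safe to proceed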
def __call__( action='query', dataset=None, name=None, url=None, pushurl=None, description=None, # TODO consider true, for now like add_sibling fetch=False, as_common_datasrc=None, publish_depends=None, publish_by_default=None, annex_wanted=None, annex_required=None, annex_group=None, annex_groupwanted=None, inherit=False, get_annex_info=True, recursive=False, recursion_limit=None): # TODO: Detect malformed URL and fail? # XXX possibly fail if fetch is False and as_common_datasrc if annex_groupwanted and not annex_group: raise InsufficientArgumentsError( "To set groupwanted, you need to provide annex_group option") # TODO catch invalid action specified action_worker_map = { 'query': _query_remotes, 'add': _add_remote, 'configure': _configure_remote, 'remove': _remove_remote, 'enable': _enable_remote, } # all worker strictly operate on a single dataset # anything that deals with hierarchies and/or dataset # relationships in general should be dealt with in here # at the top-level and vice versa worker = action_worker_map[action] dataset = require_dataset(dataset, check_installed=False, purpose='sibling configuration') refds_path = dataset.path res_kwargs = dict(refds=refds_path, logger=lgr) ds_name = basename(dataset.path) # do not form single list of datasets (with recursion results) to # give fastest possible response, for the precise of a long-all # function call ds = dataset for r in worker( # always copy signature to below to avoid bugs! ds, name, ds.repo.get_remotes(), # for top-level dataset there is no layout questions _mangle_urls(url, ds_name), _mangle_urls(pushurl, ds_name), fetch, description, as_common_datasrc, publish_depends, publish_by_default, annex_wanted, annex_required, annex_group, annex_groupwanted, inherit, get_annex_info, **res_kwargs): yield r if not recursive: return # do we have instructions to register siblings with some alternative # layout? replicate_local_structure = url and "%NAME" not in url for subds in dataset.subdatasets(fulfilled=True, recursive=recursive, recursion_limit=recursion_limit, result_xfm='datasets'): subds_name = relpath(subds.path, start=dataset.path) if replicate_local_structure: subds_url = slash_join(url, subds_name) subds_pushurl = slash_join(pushurl, subds_name) else: subds_url = \ _mangle_urls(url, '/'.join([ds_name, subds_name])) subds_pushurl = \ _mangle_urls(pushurl, '/'.join([ds_name, subds_name])) for r in worker( # always copy signature from above to avoid bugs subds, name, subds.repo.get_remotes(), subds_url, subds_pushurl, fetch, description, as_common_datasrc, publish_depends, publish_by_default, annex_wanted, annex_required, annex_group, annex_groupwanted, inherit, get_annex_info, **res_kwargs): yield r
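A hedged usage sketch for the sibling-configuration command above, via the Python API; the entry point name (datalad.api.siblings) and the URL are illustrative assumptions, not taken from the code:

from datalad.api import siblings

for res in siblings(action='configure', dataset='.', name='server',
                    url='ssh://example.com/repos/myds',
                    recursive=True, return_type='generator'):
    print(res.get('status'), res.get('name'))

Because this URL contains no "%NAME" placeholder, the recursion branch above would replicate the local layout and append each subdataset's relative path to the configured URL.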
def __call__( revision="HEAD", since=None, dataset=None, branch=None, message=None, onto=None, script=None, report=False): ds = require_dataset( dataset, check_installed=True, purpose='rerunning a command') lgr.debug('rerunning command output underneath %s', ds) if script is None and not report and ds.repo.dirty: yield get_status_dict( 'run', ds=ds, status='impossible', message=( 'clean dataset required to detect changes from command; ' 'use `datalad status` to inspect unsaved changes')) return if not ds.repo.get_hexsha(): yield get_status_dict( 'run', ds=ds, status='impossible', message='cannot rerun command, nothing recorded') return if branch and branch in ds.repo.get_branches(): yield get_status_dict( "run", ds=ds, status="error", message="branch '{}' already exists".format(branch)) return if not ds.repo.commit_exists(revision + "^"): # Only a single commit is reachable from `revision`. In # this case, --since has no effect on the range construction. revrange = revision elif since is None: revrange = "{rev}^..{rev}".format(rev=revision) elif since.strip() == "": revrange = revision else: revrange = "{}..{}".format(since, revision) if ds.repo.repo.git.rev_list("--merges", revrange, "--"): yield get_status_dict( "run", ds=ds, status="error", message="cannot rerun history with merge commits") return results = _rerun_as_results(ds, revrange, since, branch, onto, message) if script: handler = _get_script_handler(script, since, revision) elif report: handler = _report else: handler = _rerun for res in handler(ds, results): yield res
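The revision-range construction above condenses to a small rule; this restates that branch logic, with root_commit standing in for the `not ds.repo.commit_exists(revision + "^")` check:

def build_revrange(revision, since, root_commit=False):
    if root_commit:
        # only a single commit is reachable; --since has no effect
        return revision
    if since is None:
        return "{rev}^..{rev}".format(rev=revision)
    if since.strip() == "":
        return revision
    return "{}..{}".format(since, revision)

assert build_revrange("HEAD", None) == "HEAD^..HEAD"
assert build_revrange("HEAD", "") == "HEAD"
assert build_revrange("HEAD", "v1.0") == "v1.0..HEAD"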
def __call__(dataset, filename='README.md', existing='skip'): from os.path import lexists from os.path import join as opj from io import open import logging lgr = logging.getLogger('datalad.plugin.add_readme') from datalad.distribution.dataset import require_dataset from datalad.utils import assure_list dataset = require_dataset(dataset, check_installed=True, purpose='add README') filename = opj(dataset.path, filename) res_kwargs = dict(action='add_readme', path=filename) if lexists(filename) and existing == 'skip': yield dict( res_kwargs, status='notneeded', message='file already exists, and not appending content') return # unlock, file could be annexed if lexists(filename): dataset.unlock(filename) # get any metadata on the dataset itself dsinfo = dataset.metadata('.', reporton='datasets', return_type='item-or-list', on_failure='ignore') meta = {} if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok': lgr.warn("Could not obtain dataset metadata, proceeding without") dsinfo = {} else: # flatten possibly existing multiple metadata sources for src in dsinfo['metadata']: if src.startswith('@'): # not a source continue meta.update(dsinfo['metadata'][src]) metainfo = '' for label, content in ( ('', meta.get('description', meta.get('shortdescription', ''))), ('Author{}'.format( 's' if isinstance(meta.get('author', None), list) else ''), u'\n'.join([ u'- {}'.format(a) for a in assure_list(meta.get('author', [])) ])), ('Homepage', meta.get('homepage', '')), ('Reference', meta.get('citation', '')), ('License', meta.get('license', '')), ('Keywords', u', '.join([ u'`{}`'.format(k) for k in assure_list(meta.get('tag', [])) ])), ('Funding', meta.get('fundedby', '')), ): if label and content: metainfo += u'\n\n### {}\n\n{}'.format(label, content) elif content: metainfo += u'\n\n{}'.format(content) for key in 'title', 'name', 'shortdescription': if 'title' in meta: break if key in meta: meta['title'] = meta[key] default_content = u"""\ # {title}{metainfo} ## General information This is a DataLad dataset{id}. For more information on DataLad and on how to work with its datasets, see the DataLad documentation at: http://docs.datalad.org """.format( title='Dataset "{}"'.format(meta['title']) if 'title' in meta else 'About this dataset', metainfo=metainfo, id=u' (id: {})'.format(dataset.id) if dataset.id else '', ) with open(filename, 'a' if existing == 'append' else 'w', encoding='utf-8') as fp: fp.write(default_content) yield dict(status='ok', path=filename, type='file', action='add_readme') for r in dataset.rev_save(filename, message='[DATALAD] added README', result_filter=None, result_xfm=None): yield r
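The README title above falls back through several metadata keys; isolated from the command, the first of 'title', 'name', 'shortdescription' present in the metadata becomes the title (metadata values here are made up):

meta = {'name': 'example-dataset', 'shortdescription': 'An example'}
for key in ('title', 'name', 'shortdescription'):
    if 'title' in meta:
        break
    if key in meta:
        meta['title'] = meta[key]
print(meta['title'])  # -> 'example-dataset'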
def __call__(dataset, filename=None, archivetype='tar', compression='gz', missing_content='error'): import os import tarfile import zipfile from mock import patch from os.path import join as opj, dirname, normpath, isabs import os.path as op from datalad.distribution.dataset import require_dataset from datalad.utils import file_basename from datalad.support.annexrepo import AnnexRepo from datalad.dochelpers import exc_str import logging lgr = logging.getLogger('datalad.plugin.export_archive') dataset = require_dataset(dataset, check_installed=True, purpose='export archive') repo = dataset.repo committed_date = repo.get_commit_date() # could be used later on to filter files by some criterion def _filter_tarinfo(ti): # Reset the date to match the one of the last commit, not from the # filesystem since git doesn't track those at all # TODO: use the date of the last commit when any particular # file was changed -- would be the most kosher yoh thinks to the # degree of our abilities ti.mtime = committed_date return ti tar_args = dict(recursive=False, filter=_filter_tarinfo) file_extension = '.{}{}'.format( archivetype, '{}{}'.format( '.' if compression else '', compression) if archivetype == 'tar' else '') default_filename = "datalad_{.id}".format(dataset) if filename is None: filename = default_filename # in current directory elif op.exists(filename) and op.isdir(filename): filename = op.join(filename, default_filename) # under given directory if not filename.endswith(file_extension): filename += file_extension root = dataset.path # use dir inside matching the output filename # TODO: could be an option to the export plugin allowing empty value # for no leading dir leading_dir = file_basename(filename) # workaround for inability to pass down the time stamp with patch('time.time', return_value=committed_date), \ tarfile.open(filename, "w:{}".format(compression)) \ if archivetype == 'tar' \ else zipfile.ZipFile( filename, 'w', zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \ as archive: add_method = archive.add if archivetype == 'tar' else archive.write repo_files = sorted(repo.get_indexed_files()) if isinstance(repo, AnnexRepo): annexed = repo.is_under_annex( repo_files, allow_quick=True, batch=True) # remember: returns False for files in Git! has_content = repo.file_has_content( repo_files, allow_quick=True, batch=True) else: annexed = [False] * len(repo_files) has_content = [True] * len(repo_files) for i, rpath in enumerate(repo_files): fpath = opj(root, rpath) if annexed[i]: if not has_content[i]: if missing_content in ('ignore', 'continue'): (lgr.warning if missing_content == 'continue' else lgr.debug)( 'File %s has no content available, skipped', fpath) continue else: raise IOError('File %s has no content available' % fpath) # resolve to possible link target if op.islink(fpath): link_target = os.readlink(fpath) if not isabs(link_target): link_target = normpath(opj(dirname(fpath), link_target)) fpath = link_target # name in the archive aname = normpath(opj(leading_dir, rpath)) add_method( fpath, arcname=aname, **(tar_args if archivetype == 'tar' else {})) if not isabs(filename): filename = opj(os.getcwd(), filename) yield dict( status='ok', path=filename, type='file', action='export_archive', logger=lgr)
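The archive filename extension above is derived from the archive type and compression; restated as a small function (the function name is mine, the rule is copied from the expression above):

def archive_extension(archivetype, compression):
    return '.{}{}'.format(
        archivetype,
        '{}{}'.format('.' if compression else '', compression)
        if archivetype == 'tar' else '')

assert archive_extension('tar', 'gz') == '.tar.gz'
assert archive_extension('tar', '') == '.tar'
assert archive_extension('zip', 'gz') == '.zip'  # zip ignores the compression suffix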
def __call__( source, path=None, dataset=None, description=None, reckless=None): # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. ds = require_dataset( dataset, check_installed=True, purpose='cloning') \ if dataset is not None else dataset refds_path = ds.path if ds else None # legacy compatibility if reckless is True: # so that we can forget about how things used to be reckless = 'auto' if isinstance(source, Dataset): source = source.path if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination raise ValueError( "clone `source` and destination `path` are identical [{}]. " "If you are trying to add a subdataset simply use `save`".format( path)) if path is not None: path = resolve_path(path, dataset) # derive target from source: if path is None: # we got nothing but a source. do something similar to git clone # and derive the path from the source and continue # since this is a relative `path`, resolve it: # we are not going to reuse the decoded URL, as this is done for # all source candidates in clone_dataset(), we just use to determine # a destination path here in order to perform a bunch of additional # checks that shall not pollute the helper function source_ = decode_source_spec( source, cfg=None if ds is None else ds.config) path = resolve_path(source_['default_destpath'], dataset) lgr.debug("Determined clone target path from source") lgr.debug("Resolved clone target path to: '%s'", path) # there is no other way -- my intoxicated brain tells me assert(path is not None) result_props = dict( action='install', logger=lgr, refds=refds_path, source_url=source) try: # this will implicitly cause pathlib to run a bunch of checks # whether the present path makes any sense on the platform # we are running on -- we don't care if the path actually # exists at this point, but we want to abort early if the path # spec is determined to be useless path.exists() except OSError as e: yield get_status_dict( status='error', path=path, message=('cannot handle target path: %s', exc_str(e)), **result_props) return destination_dataset = Dataset(path) result_props['ds'] = destination_dataset if ds is not None and ds.pathobj not in path.parents: yield get_status_dict( status='error', message=("clone target path '%s' not in specified target dataset '%s'", path, ds), **result_props) return # perform the actual cloning operation yield from clone_dataset( [source], destination_dataset, reckless, description, result_props, cfg=None if ds is None else ds.config, ) # TODO handle any 'version' property handling and verification using a dedicated # public helper if ds is not None: # we created a dataset in another dataset # -> make submodule for r in ds.save( path, return_type='generator', result_filter=None, result_xfm=None, on_failure='ignore'): yield r
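The containment check above (`ds.pathobj not in path.parents`) is plain pathlib: a clone target is only accepted as a prospective subdataset if it lies somewhere beneath the given dataset. With illustrative paths:

from pathlib import Path

ds_path = Path('/tmp/super')
print(ds_path in Path('/tmp/super/code/clone').parents)  # True  -> acceptable target
print(ds_path in Path('/tmp/elsewhere/clone').parents)   # False -> error result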
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None, explicit=False, message=None, sidecar=None, extra_info=None, rerun_info=None, extra_inputs=None, rerun_outputs=None, inject=False, saver=_save_outputs): """Run `cmd` in `dataset` and record the results. `Run.__call__` is a simple wrapper over this function. Aside from backward compatibility kludges, the only difference is that `Run.__call__` doesn't expose all the parameters of this function. The unexposed parameters are listed below. Parameters ---------- extra_info : dict, optional Additional information to dump with the json run record. Any value given here will take precedence over the standard run key. Warning: To avoid collisions with future keys added by `run`, callers should try to use fairly specific key names and are encouraged to nest fields under a top-level "namespace" key (e.g., the project or extension name). rerun_info : dict, optional Record from a previous run. This is used internally by `rerun`. extra_inputs : list, optional Inputs to use in addition to those specified by `inputs`. Unlike `inputs`, these will not be injected into the {inputs} format field. rerun_outputs : list, optional Outputs, in addition to those in `outputs`, determined automatically from a previous run. This is used internally by `rerun`. inject : bool, optional Record results as if a command was run, skipping input and output preparation and command execution. In this mode, the caller is responsible for ensuring that the state of the working tree is appropriate for recording the command's results. saver : callable, optional Must take a dataset instance, a list of paths to save, and a message string as arguments and must record any changes done to any content matching an entry in the path list. Must yield result dictionaries as a generator. Yields ------ Result records for the run. """ if not cmd: lgr.warning("No command given") return rel_pwd = rerun_info.get('pwd') if rerun_info else None if rel_pwd and dataset: # recording is relative to the dataset pwd = normpath(opj(dataset.path, rel_pwd)) rel_pwd = relpath(pwd, dataset.path) else: pwd, rel_pwd = get_command_pwds(dataset) ds = require_dataset( dataset, check_installed=True, purpose='tracking outcomes of a command') # not needed ATM #refds_path = ds.path lgr.debug('tracking command output underneath %s', ds) if not (rerun_info or inject): # Rerun already takes care of this. # For explicit=True, we probably want to check whether any inputs have # modifications. However, we can't just do is_dirty(..., path=inputs) # because we need to consider subdatasets and untracked files. if not explicit and ds.repo.dirty: yield get_status_dict( 'run', ds=ds, status='impossible', message=('unsaved modifications present, ' 'cannot detect changes by command')) return cmd = normalize_command(cmd) inputs = GlobbedPaths(inputs, pwd=pwd, expand=expand in ["inputs", "both"]) extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd, # Follow same expansion rules as `inputs`. expand=expand in ["inputs", "both"]) outputs = GlobbedPaths(outputs, pwd=pwd, expand=expand in ["outputs", "both"]) if not inject: for res in prepare_inputs(ds, inputs, extra_inputs): yield res if outputs: for res in _install_and_reglob(ds, outputs): yield res for res in _unlock_or_remove(ds, outputs.expand(full=True)): yield res if rerun_outputs is not None: # These are files we need to unlock/remove for a rerun that aren't # included in the explicit outputs. 
Unlike inputs/outputs, these are # full paths, so we can pass them directly to unlock. for res in _unlock_or_remove(ds, rerun_outputs): yield res else: # If an inject=True caller wants to override the exit code, they can do # so in extra_info. cmd_exitcode = 0 exc = None try: cmd_expanded = format_command(ds, cmd, pwd=pwd, dspath=ds.path, inputs=inputs, outputs=outputs) except KeyError as exc: yield get_status_dict( 'run', ds=ds, status='impossible', message=('command has an unrecognized placeholder: %s', exc)) return if not inject: cmd_exitcode, exc = _execute_command( cmd_expanded, pwd, expected_exit=rerun_info.get("exit", 0) if rerun_info else None) # amend commit message with `run` info: # - pwd if inside the dataset # - the command itself # - exit code of the command run_info = { 'cmd': cmd, 'exit': cmd_exitcode, 'chain': rerun_info["chain"] if rerun_info else [], 'inputs': inputs.paths, 'extra_inputs': extra_inputs.paths, 'outputs': outputs.paths, } if rel_pwd is not None: # only when inside the dataset to not leak information run_info['pwd'] = rel_pwd if ds.id: run_info["dsid"] = ds.id if extra_info: run_info.update(extra_info) record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False) if sidecar is None: use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False) # If ConfigManager gets the ability to say "return single value", # update this code to use that. if isinstance(use_sidecar, tuple): # Use same precedence as 'git config'. use_sidecar = use_sidecar[-1] use_sidecar = anything2bool(use_sidecar) else: use_sidecar = sidecar if use_sidecar: # record ID is hash of record itself from hashlib import md5 record_id = md5(record.encode('utf-8')).hexdigest() record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo')) record_path = op.join(ds.path, record_dir, record_id) if not op.lexists(record_path): # go for compression, even for minimal records not much difference, despite offset cost # wrap in list -- there is just one record dump2stream([run_info], record_path, compressed=True) # compose commit message msg = u"""\ [DATALAD RUNCMD] {} === Do not change lines below === {} ^^^ Do not change lines above ^^^ """ msg = msg.format( message if message is not None else _format_cmd_shorty(cmd_expanded), '"{}"'.format(record_id) if use_sidecar else record) outputs_to_save = outputs.expand(full=True) if explicit else '.' if not rerun_info and cmd_exitcode: if outputs_to_save: msg_path = relpath(opj(ds.repo.path, ds.repo.get_git_dir(ds.repo), "COMMIT_EDITMSG")) with open(msg_path, "wb") as ofh: ofh.write(assure_bytes(msg)) lgr.info("The command had a non-zero exit code. " "If this is expected, you can save the changes with " "'datalad add -d . -r -F %s .'", msg_path) raise exc elif outputs_to_save: for r in saver(ds, outputs_to_save, msg): yield r
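The run record assembled above is pretty-printed as JSON into the commit message, or, when a sidecar record is used, only its md5 hexdigest is inlined between the "Do not change lines" markers. An illustrative record with made-up values, matching the keys built above:

import json

run_info = {
    'cmd': 'python analysis.py',
    'exit': 0,
    'chain': [],
    'inputs': ['data/raw.csv'],
    'extra_inputs': [],
    'outputs': ['results'],
    'pwd': '.',
    'dsid': '00000000-0000-0000-0000-000000000000',
}
print(json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False))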
def __call__(path=None, *, sibling=None, merge=False, how=None, how_subds=None, follow="sibling", dataset=None, recursive=False, recursion_limit=None, fetch_all=None, reobtain_data=False): if fetch_all is not None: lgr.warning( 'update(fetch_all=...) called. Option has no effect, and will be removed' ) if path and not recursive: lgr.warning('path constraints for subdataset updates ignored, ' 'because `recursive` option was not given') how, how_subds = _process_how_args(merge, how, how_subds) # `merge` should be considered through `how` and `how_subds` only. # Unbind `merge` to ensure that downstream code doesn't look at it. del merge refds = require_dataset(dataset, check_installed=True, purpose='update') save_paths = [] update_failures = set() saw_subds = False for ds, revision in itertools.chain( [(refds, None)], refds.subdatasets(path=path, state='present', recursive=recursive, recursion_limit=recursion_limit, return_type='generator', result_renderer='disabled', result_xfm=YieldDatasetAndRevision()) if recursive else []): if ds != refds: saw_subds = True repo = ds.repo is_annex = isinstance(repo, AnnexRepo) # prepare return value res = get_status_dict('update', ds=ds, logger=lgr, refds=refds.path) follow_parent = revision and follow.startswith("parentds") follow_parent_lazy = revision and follow == "parentds-lazy" if follow_parent_lazy and \ repo.get_hexsha(repo.get_corresponding_branch()) == revision: res["message"] = ( "Dataset already at commit registered in parent: %s", repo.path) res["status"] = "notneeded" yield res continue how_curr = how_subds if revision else how # get all remotes which have references (would exclude # special remotes) remotes = repo.get_remotes(**({ 'exclude_special_remotes': True } if is_annex else {})) if not remotes and not sibling: res['message'] = ( "No siblings known to dataset at %s\nSkipping", repo.path) res['status'] = 'notneeded' yield res continue curr_branch = repo.get_active_branch() tracking_remote = None if not sibling and len(remotes) == 1: # there is only one remote, must be this one sibling_ = remotes[0] elif not sibling: # nothing given, look for tracking branch tracking_remote = repo.get_tracking_branch(branch=curr_branch, remote_only=True)[0] sibling_ = tracking_remote else: sibling_ = sibling if sibling_ and sibling_ not in remotes: res['message'] = ("'%s' not known to dataset %s\nSkipping", sibling_, repo.path) res['status'] = 'impossible' yield res continue if not sibling_ and len(remotes) > 1 and how_curr: lgr.debug("Found multiple siblings:\n%s", remotes) res['status'] = 'impossible' res['message'] = "Multiple siblings, please specify from which to update." yield res continue lgr.info("Fetching updates for %s", ds) # fetch remote fetch_kwargs = dict( # test against user-provided value! 
remote=None if sibling is None else sibling_, all_=sibling is None, git_options=[ # required to not trip over submodules that were removed in # the origin clone "--no-recurse-submodules", # prune to not accumulate a mess over time "--prune" ]) if not (follow_parent_lazy and repo.commit_exists(revision)): try: repo.fetch(**fetch_kwargs) except CommandError as exc: ce = CapturedException(exc) yield get_status_dict( status="error", message=("Fetch failed: %s", ce), exception=ce, **res, ) continue # NOTE reevaluate ds.repo again, as it might have be converted from # a GitRepo to an AnnexRepo repo = ds.repo if follow_parent and not repo.commit_exists(revision): if sibling_: try: lgr.debug("Fetching revision %s directly for %s", revision, repo) repo.fetch(remote=sibling_, refspec=revision, git_options=["--recurse-submodules=no"]) except CommandError as exc: ce = CapturedException(exc) yield dict( res, status="impossible", message=("Attempt to fetch %s from %s failed: %s", revision, sibling_, ce), exception=ce) continue else: yield dict(res, status="impossible", message=("Need to fetch %s directly " "but single sibling not resolved", revision)) continue saw_update_failure = False if how_curr: if follow_parent: target = revision else: target = _choose_update_target(repo, curr_branch, sibling_, tracking_remote) adjusted = is_annex and repo.is_managed_branch(curr_branch) if adjusted: if follow_parent: yield dict( res, status="impossible", message=("follow='parentds' is incompatible " "with adjusted branches")) continue if how_curr != "merge": yield dict( res, status="impossible", message=("Updating via '%s' is incompatible " "with adjusted branches", how_curr)) continue update_fn = _choose_update_fn(repo, how_curr, is_annex=is_annex, adjusted=adjusted) fn_opts = ["--ff-only"] if how_curr == "ff-only" else None if update_fn is not _annex_sync: if target is None: yield dict(res, status="impossible", message="Could not determine update target") continue if is_annex and reobtain_data: update_fn = _reobtain(ds, update_fn) for ures in update_fn(repo, sibling_, target, opts=fn_opts): # NOTE: Ideally the "merge" action would also be prefixed # with "update.", but a plain "merge" is used for backward # compatibility. if ures["status"] != "ok" and ( ures["action"] == "merge" or ures["action"].startswith("update.")): saw_update_failure = True yield dict(res, **ures) if saw_update_failure: update_failures.add(ds) res['status'] = 'error' res['message'] = ("Update of %s failed", target) else: res['status'] = 'ok' save_paths.append(ds.path) yield res # we need to save updated states only if merge was requested -- otherwise # it was a pure fetch if how_curr and recursive: yield from _save_after_update(refds, save_paths, update_failures, path, saw_subds)
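The sibling resolution above reduces to a short rule: an explicitly given sibling wins, a single known remote is used as-is, and otherwise the tracking remote of the current branch (if any) is chosen. A restatement with illustrative values:

def resolve_sibling(sibling, remotes, tracking_remote):
    if sibling:
        return sibling
    if len(remotes) == 1:
        return remotes[0]
    # may be None; the command above then fetches all remotes or reports 'impossible'
    return tracking_remote

assert resolve_sibling(None, ['origin'], None) == 'origin'
assert resolve_sibling(None, ['origin', 'backup'], 'origin') == 'origin'
assert resolve_sibling('backup', ['origin', 'backup'], 'origin') == 'backup'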
def __call__(dataset, filename='README.md', existing='skip'): from os.path import lexists from os.path import join as opj from io import open import logging lgr = logging.getLogger('datalad.plugin.add_readme') from datalad.distribution.dataset import require_dataset from datalad.utils import assure_list dataset = require_dataset(dataset, check_installed=True, purpose='add README') filename = opj(dataset.path, filename) res_kwargs = dict(action='add_readme', path=filename) if lexists(filename) and existing == 'skip': yield dict( res_kwargs, status='notneeded', message='file already exists, and not appending content') return # unlock, file could be annexed if lexists(filename): dataset.unlock(filename) # get any metadata on the dataset itself dsinfo = dataset.metadata( '.', reporton='datasets', return_type='item-or-list', on_failure='ignore') meta = {} if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok': lgr.warn("Could not obtain dataset metadata, proceeding without") dsinfo = {} else: # flatten possibly existing multiple metadata sources for src in dsinfo['metadata']: if src.startswith('@'): # not a source continue meta.update(dsinfo['metadata'][src]) metainfo = '' for label, content in ( ('', meta.get('description', meta.get('shortdescription', ''))), ('Author{}'.format('s' if isinstance(meta.get('author', None), list) else ''), u'\n'.join([u'- {}'.format(a) for a in assure_list(meta.get('author', []))])), ('Homepage', meta.get('homepage', '')), ('Reference', meta.get('citation', '')), ('License', meta.get('license', '')), ('Keywords', u', '.join([u'`{}`'.format(k) for k in assure_list(meta.get('tag', []))])), ('Funding', meta.get('fundedby', '')), ): if label and content: metainfo += u'\n\n### {}\n\n{}'.format(label, content) elif content: metainfo += u'\n\n{}'.format(content) for key in 'title', 'name', 'shortdescription': if 'title' in meta: break if key in meta: meta['title'] = meta[key] default_content=u"""\ # {title}{metainfo} ## General information This is a DataLad dataset{id}. For more information on DataLad and on how to work with its datasets, see the DataLad documentation at: http://docs.datalad.org """.format( title='Dataset "{}"'.format(meta['title']) if 'title' in meta else 'About this dataset', metainfo=metainfo, id=u' (id: {})'.format(dataset.id) if dataset.id else '', ) with open(filename, 'a' if existing == 'append' else 'w', encoding='utf-8') as fp: fp.write(default_content) yield dict( status='ok', path=filename, type='file', action='add_readme') for r in dataset.save( filename, message='[DATALAD] added README', result_filter=None, result_xfm=None): yield r
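The README body above is assembled from labelled metadata fields: a labelled, non-empty field becomes a "### <label>" section, an unlabelled one is appended as plain text, and empty content is dropped. Isolated, with made-up content:

metainfo = ''
for label, content in (
        ('', 'A short description.'),
        ('License', 'PDDL'),
        ('Funding', '')):
    if label and content:
        metainfo += u'\n\n### {}\n\n{}'.format(label, content)
    elif content:
        metainfo += u'\n\n{}'.format(content)
print(metainfo)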