def __call__(archive, annex=None,
             add_archive_leading_dir=False,
             strip_leading_dirs=False, leading_dirs_depth=None,
             leading_dirs_consider=None,
             use_current_dir=False,
             delete=False, key=False, exclude=None, rename=None,
             existing='fail',
             annex_options=None, copy=False, commit=True,
             allow_dirty=False,
             stats=None, drop_after=False, delete_after=False):
    """Extract an annexed archive and register its content in the annex.

    Every file found in the (locally present) archive is added to the
    annex via ``annex.add_url_to_file`` with a URL pointing into the
    datalad-archives special remote, so the content remains retrievable
    from the archive itself.  Optional renaming, exclusion, leading-dir
    stripping and post-add dropping/removal are applied per file.

    Parameters are numerous; notable ones:
    - ``archive``: path to the archive file, or (with ``key=True``) an
      annex key.
    - ``annex``: an AnnexRepo instance; discovered from CWD when None.
    - ``existing``: 'fail' | 'overwrite' | 'archive-suffix' |
      'numeric-suffix' -- policy for name collisions.

    Raises
    ------
    RuntimeError
        If the archive is untracked, the repo is dirty (and not
        ``allow_dirty``), or archive content is not locally present.
    ValueError
        If the archive path does not exist, or ``existing`` has an
        unknown value.

    Returns
    -------
    annex
    """
    if exclude:
        exclude = assure_tuple_or_list(exclude)
    if rename:
        rename = assure_tuple_or_list(rename)

    # TODO: actually I see possibly us asking user either he wants to convert
    # his git repo into annex
    archive_path = archive
    pwd = getpwd()
    if annex is None:
        annex = get_repo_instance(pwd, class_=AnnexRepo)
        if not isabs(archive):
            # if not absolute -- relative to wd and thus
            archive_path = normpath(opj(realpath(pwd), archive))
            # abspath(archive) is not "good" since dereferences links in the path
            # archive_path = abspath(archive)
    elif not isabs(archive):
        # if we are given an annex, then assume that given path is within annex, not
        # relative to PWD
        archive_path = opj(annex.path, archive)
    annex_path = annex.path

    # _rpath below should depict paths relative to the top of the annex
    archive_rpath = relpath(
        archive_path,
        # Use `get_dataset_root` to avoid resolving the leading path. If no
        # repo is found, downstream code will raise FileNotInRepositoryError.
        get_dataset_root(archive_path) or ".")

    if archive in annex.untracked_files:
        raise RuntimeError(
            "The archive is not under annex yet. You should run 'datalad "
            "add {}' first".format(archive))

    if not allow_dirty and annex.dirty:
        # already saved me once ;)
        raise RuntimeError("You better commit all the changes and untracked files first")

    if not key:
        # we were given a file which must exist
        if not exists(archive_path):
            raise ValueError("Archive {} does not exist".format(archive))
        # TODO: support adding archives content from outside the annex/repo
        origin = 'archive'
        key = annex.get_file_key(archive_rpath)
        archive_dir = dirname(archive_path)
    else:
        origin = 'key'
        key = archive
        # We must not have anything to do with the location under .git/annex
        archive_dir = None

    archive_basename = file_basename(archive)

    if not key:
        # TODO: allow for it to be under git??? how to reference then?
        raise NotImplementedError(
            "Provided file %s is not under annex. We don't support yet adding everything "
            "straight to git" % archive
        )

    # are we in a subdirectory of the repository?
    # NOTE(review): commonprefix is character-based, not path-component
    # based, so e.g. /repo2 matches /repo -- confirm whether that matters
    # for the callers of this code.
    pwd_under_annex = commonprefix([pwd, annex_path]) == annex_path

    # then we should add content under that
    # subdirectory,
    # get the path relative to the repo top
    if use_current_dir:
        # if outside -- extract to the top of repo
        extract_rpath = relpath(pwd, annex_path) \
            if pwd_under_annex \
            else None
    else:
        extract_rpath = relpath(archive_dir, annex_path)

    # relpath might return '.' as the relative path to curdir, which then normalize_paths
    # would take as instructions to really go from cwd, so we need to sanitize
    if extract_rpath == curdir:
        extract_rpath = None  # no special relpath from top of the repo

    # and operate from now on the key or whereever content available "canonically"
    try:
        key_rpath = annex.get_contentlocation(key)  # , relative_to_top=True)
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
    # a narrower exception type would be safer here.
    except:
        raise RuntimeError("Content of %s seems to be N/A. Fetch it first" % key)

    # now we simply need to go through every file in that archive and
    lgr.info("Adding content of the archive %s into annex %s", archive, annex)

    from datalad.customremotes.archives import ArchiveAnnexCustomRemote
    # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
    # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
    annexarchive = ArchiveAnnexCustomRemote(path=annex_path, persistent_cache=True)
    # We will move extracted content so it must not exist prior running
    annexarchive.cache.allow_existing = True
    earchive = annexarchive.cache[key_rpath]

    # TODO: check if may be it was already added
    if ARCHIVES_SPECIAL_REMOTE not in annex.get_remotes():
        init_datalad_remote(annex, ARCHIVES_SPECIAL_REMOTE, autoenable=True)
    else:
        lgr.debug("Special remote {} already exists".format(ARCHIVES_SPECIAL_REMOTE))

    precommitted = False
    delete_after_rpath = None
    try:
        old_always_commit = annex.always_commit
        # When faking dates, batch mode is disabled, so we want to always
        # commit.
        annex.always_commit = annex.fake_dates_enabled

        if annex_options:
            if isinstance(annex_options, str):
                annex_options = split_cmdline(annex_options)

        leading_dir = earchive.get_leading_directory(
            depth=leading_dirs_depth, exclude=exclude,
            consider=leading_dirs_consider) \
            if strip_leading_dirs else None
        leading_dir_len = len(leading_dir) + len(opsep) if leading_dir else 0

        # we need to create a temporary directory at the top level which would later be
        # removed
        # NOTE(review): tempfile.mktemp only generates a name (race-prone);
        # mkdtemp (as used by the sibling implementations) actually creates
        # the directory -- confirm whether the name-only behavior is
        # intentional here.
        prefix_dir = basename(tempfile.mktemp(prefix=".datalad", dir=annex_path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        for extracted_file in earchive.get_extracted_files():
            stats.files += 1
            extracted_path = opj(earchive.path, extracted_file)

            if islink(extracted_path):
                link_path = realpath(extracted_path)
                if not exists(link_path):  # TODO: config addarchive.symlink-broken='skip'
                    lgr.warning("Path %s points to non-existing file %s"
                                % (extracted_path, link_path))
                    stats.skipped += 1
                    continue
                    # TODO: check if points outside of the archive -- warning and skip

            # preliminary target name which might get modified by renames
            target_file_orig = target_file = extracted_file

            # strip leading dirs
            target_file = target_file[leading_dir_len:]

            if add_archive_leading_dir:
                target_file = opj(archive_basename, target_file)

            if rename:
                target_file = apply_replacement_rules(rename, target_file)

            # continue to next iteration if extracted_file in excluded
            if exclude:
                try:  # since we need to skip outside loop from inside loop
                    for regexp in exclude:
                        if re.search(regexp, extracted_file):
                            lgr.debug(
                                "Skipping {extracted_file} since contains {regexp} pattern".format(**locals()))
                            stats.skipped += 1
                            raise StopIteration
                except StopIteration:
                    continue

            if prefix_dir:
                target_file = opj(prefix_dir, target_file)
                # but also allow for it in the orig
                target_file_orig = opj(prefix_dir, target_file_orig)

            target_file_path_orig = opj(annex.path, target_file_orig)

            url = annexarchive.get_file_url(
                archive_key=key,
                file=extracted_file,
                size=os.stat(extracted_path).st_size)

            # lgr.debug("mv {extracted_path} {target_file}. URL: {url}".format(**locals()))

            target_file_path = opj(extract_rpath, target_file) \
                if extract_rpath else target_file
            target_file_path = opj(annex.path, target_file_path)

            if lexists(target_file_path):
                handle_existing = True
                if md5sum(target_file_path) == md5sum(extracted_path):
                    if not annex.is_under_annex(extracted_path):
                        # if under annex -- must be having the same content,
                        # we should just add possibly a new extra URL
                        # but if under git -- we cannot/should not do
                        # anything about it ATM
                        if existing != 'overwrite':
                            continue
                    else:
                        handle_existing = False
                if not handle_existing:
                    pass  # nothing... just to avoid additional indentation
                elif existing == 'fail':
                    raise RuntimeError(
                        "File {} already exists, but new (?) file {} was instructed "
                        "to be placed there while overwrite=False".format
                        (target_file_path, extracted_file)
                    )
                elif existing == 'overwrite':
                    stats.overwritten += 1
                    # to make sure it doesn't conflict -- might have been a tree
                    rmtree(target_file_path)
                else:
                    # 'archive-suffix' / 'numeric-suffix': derive a fresh,
                    # non-colliding file name while keeping the extension
                    target_file_path_orig_ = target_file_path

                    # To keep extension intact -- operate on the base of the filename
                    p, fn = os.path.split(target_file_path)
                    ends_with_dot = fn.endswith('.')
                    fn_base, fn_ext = file_basename(fn, return_ext=True)

                    if existing == 'archive-suffix':
                        fn_base += '-%s' % archive_basename
                    elif existing == 'numeric-suffix':
                        pass  # archive-suffix will have the same logic
                    else:
                        raise ValueError(existing)

                    # keep incrementing index in the suffix until file doesn't collide
                    suf, i = '', 0
                    while True:
                        target_file_path_new = opj(
                            p,
                            fn_base + suf +
                            ('.' if (fn_ext or ends_with_dot) else '') + fn_ext)
                        if not lexists(target_file_path_new):
                            break
                        lgr.debug("File %s already exists" % target_file_path_new)
                        i += 1
                        suf = '.%d' % i
                    target_file_path = target_file_path_new
                    lgr.debug("Original file %s will be saved into %s"
                              % (target_file_path_orig_, target_file_path))
                    # TODO: should we reserve smth like
                    # stats.clobbed += 1

            if target_file_path != target_file_path_orig:
                stats.renamed += 1

            #target_path = opj(getpwd(), target_file)
            if copy:
                raise NotImplementedError("Not yet copying from 'persistent' cache")
            else:
                # os.renames(extracted_path, target_path)
                # addurl implementation relying on annex'es addurl below would actually copy
                pass

            lgr.debug("Adding %s to annex pointing to %s and with options %r",
                      target_file_path, url, annex_options)

            out_json = annex.add_url_to_file(
                target_file_path,
                url, options=annex_options,
                batch=True)

            if 'key' in out_json and out_json['key'] is not None:  # annex.is_under_annex(target_file, batch=True):
                # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                # we need to maintain a list of those to be dropped files
                if drop_after:
                    annex.drop_key(out_json['key'], batch=True)
                    stats.dropped += 1
                stats.add_annex += 1
            else:
                lgr.debug("File {} was added to git, not adding url".format(target_file_path))
                stats.add_git += 1

            if delete_after:
                # delayed removal so it doesn't interfer with batched processes since any pure
                # git action invokes precommit which closes batched processes. But we like to count
                stats.removed += 1

            # # chaining 3 annex commands, 2 of which not batched -- less efficient but more bullet proof etc
            # annex.add(target_path, options=annex_options)
            # # above action might add to git or to annex
            # if annex.file_has_content(target_path):
            #     # if not -- it was added to git, if in annex, it is present and output is True
            #     annex.add_url_to_file(target_file, url, options=['--relaxed'], batch=True)
            #     stats.add_annex += 1
            # else:
            #     lgr.debug("File {} was added to git, not adding url".format(target_file))
            #     stats.add_git += 1
            # # TODO: actually check if it is anyhow different from a previous version. If not
            # # then it wasn't really added

            del target_file  # Done with target_file -- just to have clear end of the loop

        if delete and archive and origin != 'key':
            lgr.debug("Removing the original archive {}".format(archive))
            # force=True since some times might still be staged and fail
            annex.remove(archive_rpath, force=True)

        lgr.info("Finished adding %s: %s" % (archive, stats.as_str(mode='line')))

        if outside_stats:
            outside_stats += stats
        if delete_after:
            # force since not committed. r=True for -r (passed into git call
            # to recurse)
            delete_after_rpath = opj(extract_rpath, prefix_dir) \
                if extract_rpath else prefix_dir
            lgr.debug(
                "Removing extracted and annexed files under %s",
                delete_after_rpath
            )
            annex.remove(delete_after_rpath, r=True, force=True)
        if commit:
            commit_stats = outside_stats if outside_stats else stats
            annex.precommit()  # so batched ones close and files become annex symlinks etc
            precommitted = True
            if any(r.get('state', None) != 'clean'
                   for p, r in annex.status(untracked='no').items()):
                annex.commit(
                    "Added content extracted from %s %s\n\n%s" %
                    (origin, archive_rpath, commit_stats.as_str(mode='full')),
                    _datalad_msg=True
                )
                commit_stats.reset()
    finally:
        # since we batched addurl, we should close those batched processes
        # if haven't done yet. explicitly checked to avoid any possible
        # "double-action"
        if not precommitted:
            annex.precommit()

        if delete_after_rpath:
            delete_after_path = opj(annex_path, delete_after_rpath)
            if exists(delete_after_path):  # should not be there
                # but for paranoid yoh
                lgr.warning(
                    "Removing temporary directory under which extracted "
                    "files were annexed and should have been removed: %s",
                    delete_after_path)
                rmtree(delete_after_path)

        annex.always_commit = old_always_commit
        # remove what is left and/or everything upon failure
        earchive.clean(force=True)

    return annex
def __call__(
        archive,
        *,
        dataset=None,
        annex=None,
        add_archive_leading_dir=False,
        strip_leading_dirs=False,
        leading_dirs_depth=None,
        leading_dirs_consider=None,
        use_current_dir=False,
        delete=False,
        key=False,
        exclude=None,
        rename=None,
        existing='fail',
        annex_options=None,
        copy=False,
        commit=True,
        allow_dirty=False,
        stats=None,
        drop_after=False,
        delete_after=False):
    """Extract an annexed archive and add its content to the dataset.

    Generator variant: yields datalad result records (``get_status_dict``)
    instead of raising, for precondition failures ('impossible'), name
    collisions under ``existing='fail'`` ('error'), and on successful
    completion ('ok').

    Notable parameters:
    - ``archive``: path to the archive (or, with ``key=True``, an annex
      key; extraction then happens relative to the current directory).
    - ``dataset``: dataset to operate on; resolved via
      ``require_dataset`` and must be an annex-backed repo.
    - ``annex``: deprecated -- only triggers a DeprecationWarning;
      ``ds.repo`` is used regardless.
    - ``existing``: 'fail' | 'overwrite' | 'archive-suffix' |
      'numeric-suffix' collision policy.
    """
    if exclude:
        exclude = ensure_tuple_or_list(exclude)
    if rename:
        rename = ensure_tuple_or_list(rename)

    ds = require_dataset(dataset, check_installed=True,
                         purpose='add-archive-content')

    # set up common params for result records
    res_kwargs = {
        'action': 'add-archive-content',
        'logger': lgr,
    }

    if not isinstance(ds.repo, AnnexRepo):
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message="Can't operate in a pure Git repository",
            **res_kwargs
        )
        return
    if annex:
        warnings.warn(
            "datalad add_archive_content's `annex` parameter is "
            "deprecated and will be removed in a future release. "
            "Use the 'dataset' parameter instead.",
            DeprecationWarning)
    annex = ds.repo

    # get the archive path relative from the ds root
    archive_path = resolve_path(archive, ds=dataset)
    # let Status decide whether we can act on the given file
    for s in ds.status(
            path=archive_path,
            on_failure='ignore',
            result_renderer='disabled'):
        if s['status'] == 'error':
            if 'path not underneath the reference dataset %s' in s['message']:
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    message='Can not add archive outside of the dataset',
                    **res_kwargs)
                return
            # status errored & we haven't anticipated the cause. Bubble up
            yield s
            return
        elif s['state'] == 'untracked':
            # we can't act on an untracked file
            message = (
                "Can not add an untracked archive. "
                "Run 'datalad save {}'".format(archive)
            )
            yield get_status_dict(
                ds=ds, status='impossible', message=message, **res_kwargs)
            return

    if not allow_dirty and annex.dirty:
        # error out here if the dataset contains untracked changes
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message=(
                'clean dataset required. '
                'Use `datalad status` to inspect unsaved changes'),
            **res_kwargs
        )
        return

    # ensure the archive exists, status doesn't error on a non-existing file
    if not key and not lexists(archive_path):
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message=(
                'No such file: {}'.format(archive_path),
            ),
            **res_kwargs
        )
        return

    if not key:
        check_path = archive_path.relative_to(ds.pathobj)
        # TODO: support adding archives content from outside the annex/repo
        origin = 'archive'
        # can become get_file_annexinfo once #6104 is merged
        key = annex.get_file_annexinfo(check_path)['key']
        if not key:
            raise RuntimeError(
                f"Archive must be an annexed file in {ds}")
        archive_dir = Path(archive_path).parent
    else:
        origin = 'key'
        key = archive
        # We must not have anything to do with the location under .git/annex
        archive_dir = None
        # instead, we will go from the current directory
        use_current_dir = True

    archive_basename = file_basename(archive)

    if not key:
        # if we didn't manage to get a key, the file must be in Git
        raise NotImplementedError(
            "Provided file %s does not seem to be under annex control. "
            "We don't support adding everything straight to Git" % archive
        )

    # figure out our location
    pwd = getpwd()
    # are we in a subdirectory of the repository?
    pwd_in_root = annex.path == archive_dir
    # then we should add content under that subdirectory,
    # get the path relative to the repo top
    if use_current_dir:
        # extract the archive under the current directory, not the directory
        # where the archive is located
        extract_rpath = Path(pwd).relative_to(ds.path) \
            if not pwd_in_root \
            else None
    else:
        extract_rpath = archive_dir.relative_to(ds.path)

    # relpath might return '.' as the relative path to curdir, which then normalize_paths
    # would take as instructions to really go from cwd, so we need to sanitize
    if extract_rpath == curdir:
        extract_rpath = None

    try:
        key_rpath = annex.get_contentlocation(key)
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
    # a narrower exception type would be safer.
    except:
        # the only probable reason for this to fail is that there is no
        # content present
        raise RuntimeError(
            "Content of %s seems to be N/A. Fetch it first" % key
        )

    # now we simply need to go through every file in that archive and
    lgr.info(
        "Adding content of the archive %s into annex %s", archive, annex
    )

    from datalad.customremotes.archives import ArchiveAnnexCustomRemote
    # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
    # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
    # OK, let's ignore that the following class is actually a special
    # remote implementation, and use it only to work with its cache
    annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                            path=annex.path,
                                            persistent_cache=True)
    # We will move extracted content so it must not exist prior running
    annexarchive.cache.allow_existing = True
    earchive = annexarchive.cache[key_rpath]
    # make sure there is an enabled datalad-archives special remote
    ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                          autoenable=True)

    precommitted = False
    old_always_commit = annex.always_commit
    # batch mode is disabled when faking dates, we want to always commit
    annex.always_commit = annex.fake_dates_enabled
    if annex_options:
        if isinstance(annex_options, str):
            annex_options = split_cmdline(annex_options)
    delete_after_rpath = None

    prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                           dir=annex.path)) \
        if delete_after \
        else None

    # dedicated stats which would be added to passed in (if any)
    outside_stats = stats
    stats = ActivityStats()

    try:
        # keep track of extracted files for progress bar logging
        file_counter = 0
        # iterative over all files in the archive
        extracted_files = list(earchive.get_extracted_files())
        # start a progress bar for extraction
        pbar_id = f'add-archive-{archive_path}'
        log_progress(
            lgr.info, pbar_id, 'Extracting archive',
            label="Extracting archive",
            unit=' Files',
            total=len(extracted_files),
            noninteractive_level=logging.INFO)
        for extracted_file in extracted_files:
            file_counter += 1
            files_left = len(extracted_files) - file_counter
            log_progress(
                lgr.info, pbar_id,
                "Files to extract %i ", files_left,
                update=1,
                increment=True,
                noninteractive_level=logging.DEBUG)
            stats.files += 1
            extracted_path = Path(earchive.path) / Path(extracted_file)

            if extracted_path.is_symlink():
                link_path = str(extracted_path.resolve())
                if not exists(link_path):
                    # TODO: config addarchive.symlink-broken='skip'
                    lgr.warning(
                        "Path %s points to non-existing file %s" %
                        (extracted_path, link_path)
                    )
                    stats.skipped += 1
                    continue
                    # TODO: check if points outside of archive - warn & skip

            url = annexarchive.get_file_url(
                archive_key=key,
                file=extracted_file,
                size=os.stat(extracted_path).st_size)

            # preliminary target name which might get modified by renames
            target_file_orig = target_file = Path(extracted_file)

            # stream archives would not have had the original filename
            # information in them, so would be extracted under a name
            # derived from their annex key.
            # Provide ad-hoc handling for such cases
            if (len(extracted_files) == 1 and
                    Path(archive).suffix in ('.xz', '.gz', '.lzma') and
                    Path(key_rpath).name.startswith(Path(
                        extracted_file).name)):
                # take archive's name without extension for filename & place
                # where it was originally extracted
                target_file = \
                    Path(extracted_file).parent / Path(archive).stem

            if strip_leading_dirs:
                leading_dir = earchive.get_leading_directory(
                    depth=leading_dirs_depth, exclude=exclude,
                    consider=leading_dirs_consider)
                leading_dir_len = \
                    len(leading_dir) + len(opsep) if leading_dir else 0
                target_file = str(target_file)[leading_dir_len:]

            if add_archive_leading_dir:
                # place extracted content under a directory corresponding to
                # the archive name with suffix stripped.
                target_file = Path(archive_basename) / target_file

            if rename:
                target_file = apply_replacement_rules(rename,
                                                      str(target_file))

            # continue to next iteration if extracted_file in excluded
            if exclude:
                try:  # since we need to skip outside loop from inside loop
                    for regexp in exclude:
                        if re.search(regexp, extracted_file):
                            lgr.debug(
                                "Skipping {extracted_file} since contains "
                                "{regexp} pattern".format(**locals()))
                            stats.skipped += 1
                            raise StopIteration
                except StopIteration:
                    continue

            if delete_after:
                # place target file in a temporary directory
                target_file = Path(prefix_dir) / Path(target_file)
                # but also allow for it in the orig
                target_file_orig = Path(prefix_dir) / Path(target_file_orig)

            target_file_path_orig = annex.pathobj / target_file_orig

            # If we were invoked in a subdirectory, patch together the
            # correct path
            target_file_path = extract_rpath / target_file \
                if extract_rpath else target_file
            target_file_path = annex.pathobj / target_file_path

            # when the file already exists...
            if lexists(target_file_path):
                handle_existing = True
                if md5sum(str(target_file_path)) == \
                        md5sum(str(extracted_path)):
                    if not annex.is_under_annex(str(extracted_path)):
                        # if under annex -- must be having the same content,
                        # we should just add possibly a new extra URL
                        # but if under git -- we cannot/should not do
                        # anything about it ATM
                        if existing != 'overwrite':
                            continue
                    else:
                        handle_existing = False
                if not handle_existing:
                    pass  # nothing... just to avoid additional indentation
                elif existing == 'fail':
                    message = \
                        "{} exists, but would be overwritten by new file " \
                        "{}. Consider adjusting --existing".format\
                        (target_file_path, extracted_file)
                    yield get_status_dict(
                        ds=ds,
                        status='error',
                        message=message,
                        **res_kwargs)
                    return
                elif existing == 'overwrite':
                    stats.overwritten += 1
                    # to make sure it doesn't conflict -- might have been a
                    # tree
                    rmtree(target_file_path)
                else:
                    # an elaborate dance to piece together new archive names
                    target_file_path_orig_ = target_file_path

                    # To keep extension intact -- operate on the base of the
                    # filename
                    p, fn = os.path.split(target_file_path)
                    ends_with_dot = fn.endswith('.')
                    fn_base, fn_ext = file_basename(fn, return_ext=True)

                    if existing == 'archive-suffix':
                        fn_base += '-%s' % archive_basename
                    elif existing == 'numeric-suffix':
                        pass  # archive-suffix will have the same logic
                    else:
                        # we shouldn't get here, argparse should catch a
                        # non-existing value for --existing right away
                        raise ValueError(existing)
                    # keep incrementing index in the suffix until file
                    # doesn't collide
                    suf, i = '', 0
                    while True:
                        connector = \
                            ('.' if (fn_ext or ends_with_dot) else '')
                        file = fn_base + suf + connector + fn_ext
                        target_file_path_new = \
                            Path(p) / Path(file)
                        if not lexists(target_file_path_new):
                            # we found a file name that is not yet taken
                            break
                        lgr.debug("Iteration %i of file name finding. "
                                  "File %s already exists", i,
                                  target_file_path_new)
                        i += 1
                        suf = '.%d' % i
                    target_file_path = target_file_path_new
                    lgr.debug("Original file %s will be saved into %s"
                              % (target_file_path_orig_, target_file_path))
                    # TODO: should we reserve smth like
                    # stats.clobbed += 1

            if target_file_path != target_file_path_orig:
                stats.renamed += 1

            if copy:
                raise NotImplementedError(
                    "Not yet copying from 'persistent' cache"
                )

            lgr.debug("Adding %s to annex pointing to %s and with options "
                      "%r", target_file_path, url, annex_options)

            out_json = annex.add_url_to_file(
                target_file_path,
                url, options=annex_options,
                batch=True)

            if 'key' in out_json and out_json['key'] is not None:
                # annex.is_under_annex(target_file, batch=True):
                # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                # we need to maintain a list of those to be dropped files
                if drop_after:
                    # drop extracted files after adding to annex
                    annex.drop_key(out_json['key'], batch=True)
                    stats.dropped += 1
                stats.add_annex += 1
            else:
                lgr.debug("File {} was added to git, not adding url".format(
                    target_file_path))
                stats.add_git += 1

            if delete_after:
                # we count the removal here, but don't yet perform it
                # to not interfer with batched processes - any pure Git
                # action invokes precommit which closes batched processes.
                stats.removed += 1

            # Done with target_file -- just to have clear end of the loop
            del target_file

        if delete and archive and origin != 'key':
            lgr.debug("Removing the original archive {}".format(archive))
            # force=True since some times might still be staged and fail
            annex.remove(str(archive_path), force=True)

        lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

        if outside_stats:
            outside_stats += stats
        if delete_after:
            # force since not committed. r=True for -r (passed into git call
            # to recurse)
            delete_after_rpath = opj(extract_rpath, prefix_dir) \
                if extract_rpath else prefix_dir
            delete_after_rpath = resolve_path(delete_after_rpath,
                                              ds=dataset)
            lgr.debug(
                "Removing extracted and annexed files under %s",
                delete_after_rpath
            )
            annex.remove(str(delete_after_rpath), r=True, force=True)
        if commit:
            archive_rpath = archive_path.relative_to(ds.path)
            commit_stats = outside_stats if outside_stats else stats
            # so batched ones close and files become annex symlinks etc
            annex.precommit()
            precommitted = True
            if any(r.get('state', None) != 'clean'
                   for p, r in annex.status(untracked='no').items()):
                annex.commit(
                    "Added content extracted from %s %s\n\n%s" %
                    (origin, archive_rpath,
                     commit_stats.as_str(mode='full')),
                    _datalad_msg=True
                )
                commit_stats.reset()
        else:
            # don't commit upon completion
            pass
    finally:
        # take down the progress bar
        # NOTE(review): pbar_id is assigned inside the try -- if an
        # exception fires before that assignment this raises NameError;
        # confirm intended.
        log_progress(
            lgr.info, pbar_id,
            'Finished extraction',
            noninteractive_level=logging.INFO)
        # since we batched addurl, we should close those batched processes
        # if haven't done yet. explicitly checked to avoid any possible
        # "double-action"
        if not precommitted:
            annex.precommit()

        if delete_after_rpath:
            delete_after_path = opj(annex.path, delete_after_rpath)
            delete_after_rpath = resolve_path(delete_after_rpath,
                                              ds=dataset)
            if exists(delete_after_path):  # should not be there
                # but for paranoid yoh
                lgr.warning(
                    "Removing temporary directory under which extracted "
                    "files were annexed and should have been removed: %s",
                    delete_after_path)
                rmtree(delete_after_path)

        annex.always_commit = old_always_commit
        # remove what is left and/or everything upon failure
        earchive.clean(force=True)
        # remove tempfile directories (not cleaned up automatically):
        if prefix_dir is not None and lexists(prefix_dir):
            os.rmdir(prefix_dir)
    yield get_status_dict(
        ds=ds,
        status='ok',
        **res_kwargs)
    return annex
def __call__(archive, annex=None,
             strip_leading_dirs=False, leading_dirs_depth=None,
             leading_dirs_consider=None,
             delete=False, key=False, exclude=None, rename=None,
             existing='fail',
             annex_options=None, copy=False, commit=True, allow_dirty=False,
             stats=None, drop_after=False, delete_after=False):
    """Extract an annexed archive and register its content in the annex.

    Legacy variant: uses the ``annex_*``-prefixed AnnexRepo API
    (``annex_addurl_to_file``, ``annex_initremote``, ``annex_drop``),
    collects keys in ``keys_to_drop`` and drops them in bulk at the end
    instead of per-file.  ``origin`` holds the archive path (or the key
    itself when ``key=True``) and is used verbatim in the commit message.

    Raises
    ------
    RuntimeError
        If the repo is dirty (and not ``allow_dirty``) or archive
        content is not locally present.
    ValueError
        If the archive does not exist, or ``existing`` has an unknown
        value.

    Returns
    -------
    annex
    """
    if exclude:
        exclude = assure_tuple_or_list(exclude)
    if rename:
        rename = assure_tuple_or_list(rename)

    # TODO: actually I see possibly us asking user either he wants to convert
    # his git repo into annex
    archive_path = archive
    if annex is None:
        annex = get_repo_instance(class_=AnnexRepo)
        if not isabs(archive):
            # if not absolute -- relative to curdir and thus
            archive_path = relpath(abspath(archive), annex.path)
    elif not isabs(archive):
        # if we are given an annex, then assume that given path is within annex, not
        # relative to PWD
        archive_path = opj(annex.path, archive)

    # TODO: somewhat too cruel -- may be an option or smth...
    if not allow_dirty and annex.dirty:
        # already saved me once ;)
        raise RuntimeError(
            "You better commit all the changes and untracked files first")

    # are we in a subdirectory of the repository? then we should add content under that
    # subdirectory,
    # get the path relative to the repo top
    # NOTE(review): commonprefix is character-based, not path-component
    # based -- /repo2 would match /repo.
    extract_relpath = relpath(getpwd(), annex.path) \
        if commonprefix([realpath(getpwd()), annex.path]) == annex.path \
        else None

    if not key:
        # we were given a file which must exist
        if not exists(opj(annex.path, archive_path)):
            raise ValueError("Archive {} does not exist".format(archive))
        # TODO: support adding archives content from outside the annex/repo
        origin = archive
        key = annex.get_file_key(archive_path)
    else:
        origin = key

    if not key:
        # TODO: allow for it to be under git??? how to reference then?
        raise NotImplementedError(
            "Provided file is not under annex. We don't support yet adding everything "
            "straight to git")

    # and operate from now on the key or whereever content available "canonically"
    try:
        key_path = annex.get_contentlocation(
            key)  # , relative_to_top=True)
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
    except:
        raise RuntimeError(
            "Content of %s seems to be N/A. Fetch it first" % key)

    #key_path = opj(reltop, key_path)
    # now we simply need to go through every file in that archive and
    lgr.info("Adding content of the archive %s into annex %s", archive, annex)

    from datalad.customremotes.archives import ArchiveAnnexCustomRemote
    # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
    # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
    annexarchive = ArchiveAnnexCustomRemote(path=annex.path,
                                            persistent_cache=True)
    # We will move extracted content so it must not exist prior running
    annexarchive.cache.allow_existing = True
    earchive = annexarchive.cache[key_path]

    # TODO: check if may be it was already added
    if ARCHIVES_SPECIAL_REMOTE not in annex.git_get_remotes():
        lgr.debug(
            "Adding new special remote {}".format(ARCHIVES_SPECIAL_REMOTE))
        annex.annex_initremote(ARCHIVES_SPECIAL_REMOTE, [
            'encryption=none', 'type=external',
            'externaltype=%s' % ARCHIVES_SPECIAL_REMOTE,
            'autoenable=true'
        ])
    else:
        lgr.debug("Special remote {} already exists".format(
            ARCHIVES_SPECIAL_REMOTE))

    try:
        old_always_commit = annex.always_commit
        annex.always_commit = False

        # keys added via batched addurl; dropped in bulk in the finally
        # block when drop_after was requested
        # NOTE(review): keys_to_drop and prefix_dir are assigned inside
        # this try but referenced in finally -- a very early failure
        # (before these lines) would raise NameError there; confirm.
        keys_to_drop = []

        if annex_options:
            if isinstance(annex_options, string_types):
                annex_options = shlex.split(annex_options)

        leading_dir = earchive.get_leading_directory(
            depth=leading_dirs_depth, exclude=exclude,
            consider=leading_dirs_consider) \
            if strip_leading_dirs else None
        leading_dir_len = len(leading_dir) + len(
            opsep) if leading_dir else 0

        # we need to create a temporary directory at the top level which would later be
        # removed
        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex.path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        for extracted_file in earchive.get_extracted_files():
            stats.files += 1
            extracted_path = opj(earchive.path, extracted_file)

            if islink(extracted_path):
                link_path = realpath(extracted_path)
                if not exists(
                        link_path
                ):  # TODO: config addarchive.symlink-broken='skip'
                    lgr.warning("Path %s points to non-existing file %s"
                                % (extracted_path, link_path))
                    stats.skipped += 1
                    continue
                    # TODO: check if points outside of the archive -- warning and skip

            # preliminary target name which might get modified by renames
            target_file_orig = target_file = extracted_file

            target_file = target_file[leading_dir_len:]

            if rename:
                target_file = apply_replacement_rules(rename, target_file)

            if exclude:
                try:  # since we need to skip outside loop from inside loop
                    for regexp in exclude:
                        if re.search(regexp, target_file):
                            lgr.debug(
                                "Skipping {target_file} since contains {regexp} pattern"
                                .format(**locals()))
                            stats.skipped += 1
                            raise StopIteration
                except StopIteration:
                    continue

            if prefix_dir:
                target_file = opj(prefix_dir, target_file)

            url = annexarchive.get_file_url(
                archive_key=key,
                file=extracted_file,
                size=os.stat(extracted_path).st_size)

            # lgr.debug("mv {extracted_path} {target_file}. URL: {url}".format(**locals()))

            if lexists(target_file):
                if md5sum(target_file) == md5sum(extracted_path):
                    # must be having the same content, we should just add possibly a new extra URL
                    pass
                elif existing == 'fail':
                    raise RuntimeError(
                        "File {} already exists, but new (?) file {} was instructed "
                        "to be placed there while overwrite=False".format(
                            target_file, extracted_file))
                elif existing == 'overwrite':
                    stats.overwritten += 1
                    # to make sure it doesn't conflict -- might have been a tree
                    rmtree(target_file)
                else:
                    # 'archive-suffix' / 'numeric-suffix': derive a fresh,
                    # non-colliding name while keeping the extension
                    target_file_orig_ = target_file
                    # To keep extension intact -- operate on the base of the filename
                    p, fn = os.path.split(target_file)
                    ends_with_dot = fn.endswith('.')
                    fn_base, fn_ext = file_basename(fn, return_ext=True)

                    if existing == 'archive-suffix':
                        fn_base += '-%s' % file_basename(origin)
                    elif existing == 'numeric-suffix':
                        pass  # archive-suffix will have the same logic
                    else:
                        raise ValueError(existing)

                    # keep incrementing index in the suffix until file doesn't collide
                    suf, i = '', 0
                    while True:
                        target_file_new = opj(
                            p,
                            fn_base + suf +
                            ('.' if (fn_ext or ends_with_dot) else '') +
                            fn_ext)
                        if not lexists(target_file_new):
                            break
                        lgr.debug("File %s already exists" % target_file_new)
                        i += 1
                        suf = '.%d' % i
                    target_file = target_file_new
                    lgr.debug("Original file %s will be saved into %s"
                              % (target_file_orig_, target_file))
                    # TODO: should we reserve smth like
                    # stats.clobbed += 1

            if target_file != target_file_orig:
                stats.renamed += 1

            #target_path = opj(getpwd(), target_file)
            if copy:
                raise NotImplementedError(
                    "Not yet copying from 'persistent' cache")
            else:
                # os.renames(extracted_path, target_path)
                # addurl implementation relying on annex'es addurl below would actually copy
                pass

            lgr.debug(
                "Adding %s to annex pointing to %s and with options %r",
                target_file, url, annex_options)

            target_file_gitpath = opj(
                extract_relpath, target_file) if extract_relpath else target_file
            out_json = annex.annex_addurl_to_file(target_file_gitpath,
                                                  url,
                                                  options=annex_options,
                                                  batch=True)

            if 'key' in out_json:  # annex.is_under_annex(target_file, batch=True):
                # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                # we need to maintain a list of those to be dropped files
                if drop_after:
                    keys_to_drop.append(out_json['key'])
                stats.add_annex += 1
            else:
                lgr.debug(
                    "File {} was added to git, not adding url".format(
                        target_file))
                stats.add_git += 1

            if delete_after:
                # forcing since it is only staged, not yet committed
                annex.remove(target_file_gitpath, force=True)  # TODO: batch!
                stats.removed += 1

            # # chaining 3 annex commands, 2 of which not batched -- less efficient but more bullet proof etc
            # annex.annex_add(target_path, options=annex_options)
            # # above action might add to git or to annex
            # if annex.file_has_content(target_path):
            #     # if not -- it was added to git, if in annex, it is present and output is True
            #     annex.annex_addurl_to_file(target_file, url, options=['--relaxed'], batch=True)
            #     stats.add_annex += 1
            # else:
            #     lgr.debug("File {} was added to git, not adding url".format(target_file))
            #     stats.add_git += 1
            # # TODO: actually check if it is anyhow different from a previous version. If not
            # # then it wasn't really added

            del target_file  # Done with target_file -- just to have clear end of the loop

        if delete and archive:
            lgr.debug("Removing the original archive {}".format(archive))
            # force=True since some times might still be staged and fail
            annex.remove(archive_path, force=True)

        lgr.info("Finished adding %s: %s" %
                 (archive, stats.as_str(mode='line')))

        if outside_stats:
            outside_stats += stats

        if commit:
            commit_stats = outside_stats if outside_stats else stats
            annex.commit("Added content extracted from %s\n\n%s" %
                         (origin, commit_stats.as_str(mode='full')))
            commit_stats.reset()
    finally:
        # since we batched addurl, we should close those batched processes
        if delete_after:
            prefix_path = opj(annex.path, prefix_dir)
            if exists(prefix_path):  # probably would always be there
                lgr.info(
                    "Removing temporary directory under which extracted files were annexed: %s",
                    prefix_path)
                rmtree(prefix_path)

        annex.precommit()

        if keys_to_drop:
            # since we know that keys should be retrievable, we --force since no batching
            # atm and it would be expensive
            annex.annex_drop(keys_to_drop, options=['--force'], key=True)
            stats.dropped += len(keys_to_drop)
            annex.precommit()  # might need clean up etc again

        annex.always_commit = old_always_commit
        # remove what is left and/or everything upon failure
        earchive.clean(force=True)

    return annex
def __call__(archive, annex=None,
             add_archive_leading_dir=False, strip_leading_dirs=False,
             leading_dirs_depth=None, leading_dirs_consider=None,
             use_current_dir=False,
             delete=False, key=False, exclude=None, rename=None,
             existing='fail', annex_options=None, copy=False, commit=True,
             allow_dirty=False, stats=None, drop_after=False,
             delete_after=False):
    """Extract an annexed archive and add its content to the annex.

    Each extracted file is registered via ``annex addurl`` with a URL pointing
    into the archives special remote, so content remains re-obtainable from
    the archive itself.

    Parameters
    ----------
    archive : str
      Path to the archive (absolute, relative to cwd, or relative to the
      annex top when `annex` is given), or an annex key if `key` is True.
    annex : AnnexRepo, optional
      Repository to operate on; detected from the current directory when None.
    add_archive_leading_dir : bool
      Prepend a directory named after the archive to every extracted path.
    strip_leading_dirs : bool
      Strip the archive's leading directory (if any) from extracted paths.
    leading_dirs_depth, leading_dirs_consider
      Passed to the extracted-archive helper when stripping leading dirs.
    use_current_dir : bool
      Extract under the current directory (when inside the repo) instead of
      next to the archive.
    delete : bool
      Remove the original archive from the annex after adding its content.
    key : bool
      Interpret `archive` as an annex key rather than a file path.
    exclude : str or sequence of str, optional
      Regular expressions; matching extracted files are skipped.
    rename : str or sequence, optional
      Replacement rules applied to target file names.
    existing : {'fail', 'overwrite', 'archive-suffix', 'numeric-suffix'}
      Behavior when a target file already exists with different content.
    annex_options : str or list, optional
      Extra options passed to git-annex (a string is shlex-split).
    copy : bool
      Copy from the persistent cache instead of relying on addurl --
      not implemented yet.
    commit : bool
      Commit the accumulated changes at the end.
    allow_dirty : bool
      Proceed even if the repository has uncommitted changes.
    stats : ActivityStats, optional
      External stats object to fold this run's stats into.
    drop_after : bool
      Drop each key right after adding its url.
    delete_after : bool
      Remove the extracted files after adding (leaving only the registered
      urls), staging them under a temporary prefix directory.

    Returns
    -------
    annex
    """
    if exclude:
        exclude = assure_tuple_or_list(exclude)
    if rename:
        rename = assure_tuple_or_list(rename)

    # TODO: actually I see possibly us asking user either he wants to convert
    # his git repo into annex
    archive_path = archive
    pwd = getpwd()
    if annex is None:
        annex = get_repo_instance(pwd, class_=AnnexRepo)
        if not isabs(archive):
            # if not absolute -- relative to wd and thus
            archive_path = normpath(opj(realpath(pwd), archive))
            # abspath(archive) is not "good" since dereferences links in the path
            # archive_path = abspath(archive)
    elif not isabs(archive):
        # if we are given an annex, then assume that given path is within
        # annex, not relative to PWD
        archive_path = opj(annex.path, archive)
    annex_path = annex.path

    # _rpath below should depict paths relative to the top of the annex
    archive_rpath = relpath(archive_path, annex_path)

    # TODO: somewhat too cruel -- may be an option or smth...
    if not allow_dirty and annex.dirty:
        # already saved me once ;)
        raise RuntimeError(
            "You better commit all the changes and untracked files first")

    if not key:
        # we were given a file which must exist
        if not exists(archive_path):
            raise ValueError("Archive {} does not exist".format(archive))
        # TODO: support adding archives content from outside the annex/repo
        origin = 'archive'
        key = annex.get_file_key(archive_rpath)
        archive_dir = dirname(archive_path)
    else:
        origin = 'key'
        key = archive
        # We must not have anything to do with the location under .git/annex
        archive_dir = None

    archive_basename = file_basename(archive)

    if not key:
        # TODO: allow for it to be under git??? how to reference then?
        raise NotImplementedError(
            "Provided file %s is not under annex. We don't support yet adding everything "
            "straight to git" % archive
        )

    # are we in a subdirectory of the repository?
    pwd_under_annex = commonprefix([pwd, annex_path]) == annex_path

    # then we should add content under that subdirectory,
    # get the path relative to the repo top
    if use_current_dir:
        # if outside -- extract to the top of repo
        extract_rpath = relpath(pwd, annex_path) \
            if pwd_under_annex \
            else None
    else:
        extract_rpath = relpath(archive_dir, annex_path)

    # relpath might return '.' as the relative path to curdir, which then
    # normalize_paths would take as instructions to really go from cwd, so we
    # need to sanitize
    if extract_rpath == curdir:
        extract_rpath = None  # no special relpath from top of the repo

    # and operate from now on the key or whereever content available
    # "canonically"
    try:
        key_rpath = annex.get_contentlocation(key)  # , relative_to_top=True)
    except Exception as exc:
        # FIX: was a bare `except:`, which would also intercept
        # KeyboardInterrupt/SystemExit; chain the cause (PEP 3134) so the
        # underlying failure stays visible in the traceback
        raise RuntimeError(
            "Content of %s seems to be N/A. Fetch it first" % key) from exc

    # now we simply need to go through every file in that archive and
    lgr.info("Adding content of the archive %s into annex %s", archive, annex)

    from datalad.customremotes.archives import ArchiveAnnexCustomRemote
    # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
    # TODO: we will use persistent cache so we could just (ab)use possibly
    # extracted archive
    annexarchive = ArchiveAnnexCustomRemote(path=annex_path,
                                            persistent_cache=True)
    # We will move extracted content so it must not exist prior running
    annexarchive.cache.allow_existing = True
    earchive = annexarchive.cache[key_rpath]

    # TODO: check if may be it was already added
    if ARCHIVES_SPECIAL_REMOTE not in annex.get_remotes():
        lgr.debug("Adding new special remote {}".format(
            ARCHIVES_SPECIAL_REMOTE))
        annex.init_remote(
            ARCHIVES_SPECIAL_REMOTE,
            ['encryption=none', 'type=external',
             'externaltype=%s' % ARCHIVES_SPECIAL_REMOTE,
             'autoenable=true'])
    else:
        lgr.debug("Special remote {} already exists".format(
            ARCHIVES_SPECIAL_REMOTE))

    try:
        # suspend git-annex's own committing; we commit once at the end
        old_always_commit = annex.always_commit
        annex.always_commit = False

        if annex_options:
            if isinstance(annex_options, string_types):
                annex_options = shlex.split(annex_options)

        leading_dir = earchive.get_leading_directory(
            depth=leading_dirs_depth, exclude=exclude,
            consider=leading_dirs_consider) \
            if strip_leading_dirs else None
        leading_dir_len = len(leading_dir) + len(opsep) if leading_dir else 0

        # we need to create a temporary directory at the top level which
        # would later be removed
        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex_path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        for extracted_file in earchive.get_extracted_files():
            stats.files += 1
            extracted_path = opj(earchive.path, extracted_file)

            if islink(extracted_path):
                link_path = realpath(extracted_path)
                if not exists(link_path):
                    # TODO: config addarchive.symlink-broken='skip'
                    lgr.warning("Path %s points to non-existing file %s"
                                % (extracted_path, link_path))
                    stats.skipped += 1
                    continue
                # TODO: check if points outside of the archive -- warning
                # and skip

            # preliminary target name which might get modified by renames
            target_file_orig = target_file = extracted_file

            # strip leading dirs
            target_file = target_file[leading_dir_len:]

            if add_archive_leading_dir:
                target_file = opj(archive_basename, target_file)

            if rename:
                target_file = apply_replacement_rules(rename, target_file)

            # continue to next iteration if extracted_file in excluded
            if exclude:
                # since we need to skip outside loop from inside loop
                try:
                    for regexp in exclude:
                        if re.search(regexp, extracted_file):
                            lgr.debug(
                                "Skipping {extracted_file} since contains "
                                "{regexp} pattern".format(**locals()))
                            stats.skipped += 1
                            raise StopIteration
                except StopIteration:
                    continue

            if prefix_dir:
                target_file = opj(prefix_dir, target_file)

            url = annexarchive.get_file_url(
                archive_key=key,
                file=extracted_file,
                size=os.stat(extracted_path).st_size)

            # lgr.debug("mv {extracted_path} {target_file}. URL: {url}"
            #           .format(**locals()))

            if lexists(target_file):
                if md5sum(target_file) == md5sum(extracted_path):
                    # must be having the same content, we should just add
                    # possibly a new extra URL
                    pass
                elif existing == 'fail':
                    raise RuntimeError(
                        "File {} already exists, but new (?) file {} was "
                        "instructed to be placed there while "
                        "overwrite=False".format(target_file, extracted_file))
                elif existing == 'overwrite':
                    stats.overwritten += 1
                    # to make sure it doesn't conflict -- might have been a
                    # tree
                    rmtree(target_file)
                else:
                    target_file_orig_ = target_file
                    # To keep extension intact -- operate on the base of the
                    # filename
                    p, fn = os.path.split(target_file)
                    ends_with_dot = fn.endswith('.')
                    fn_base, fn_ext = file_basename(fn, return_ext=True)

                    if existing == 'archive-suffix':
                        fn_base += '-%s' % archive_basename
                    elif existing == 'numeric-suffix':
                        pass  # archive-suffix will have the same logic
                    else:
                        raise ValueError(existing)

                    # keep incrementing index in the suffix until file
                    # doesn't collide
                    suf, i = '', 0
                    while True:
                        target_file_new = opj(
                            p,
                            fn_base + suf
                            + ('.' if (fn_ext or ends_with_dot) else '')
                            + fn_ext)
                        if not lexists(target_file_new):
                            break
                        lgr.debug("File %s already exists" % target_file_new)
                        i += 1
                        suf = '.%d' % i
                    target_file = target_file_new
                    lgr.debug("Original file %s will be saved into %s"
                              % (target_file_orig_, target_file))
                    # TODO: should we reserve smth like
                    # stats.clobbed += 1

            if target_file != target_file_orig:
                stats.renamed += 1

            # target_path = opj(getpwd(), target_file)
            if copy:
                raise NotImplementedError(
                    "Not yet copying from 'persistent' cache")
            else:
                # os.renames(extracted_path, target_path)
                # addurl implementation relying on annex'es addurl below
                # would actually copy
                pass

            lgr.debug("Adding %s to annex pointing to %s and with options %r",
                      target_file, url, annex_options)

            target_file_rpath = opj(extract_rpath, target_file) \
                if extract_rpath else target_file
            out_json = annex.add_url_to_file(
                target_file_rpath,
                url, options=annex_options,
                batch=True)

            if 'key' in out_json and out_json['key'] is not None:
                # i.e. annex.is_under_annex(target_file, batch=True)
                # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                # we need to maintain a list of those to be dropped files
                if drop_after:
                    annex.drop_key(out_json['key'], batch=True)
                    stats.dropped += 1
                stats.add_annex += 1
            else:
                lgr.debug("File {} was added to git, not adding url".format(
                    target_file))
                stats.add_git += 1

            if delete_after:
                # forcing since it is only staged, not yet committed
                annex.remove(target_file_rpath, force=True)  # TODO: batch!
                stats.removed += 1

            # # chaining 3 annex commands, 2 of which not batched -- less
            # # efficient but more bullet proof etc
            # annex.add(target_path, options=annex_options)
            # # above action might add to git or to annex
            # if annex.file_has_content(target_path):
            #     # if not -- it was added to git, if in annex, it is present
            #     # and output is True
            #     annex.add_url_to_file(target_file, url,
            #                           options=['--relaxed'], batch=True)
            #     stats.add_annex += 1
            # else:
            #     lgr.debug("File {} was added to git, not adding url"
            #               .format(target_file))
            #     stats.add_git += 1
            # # TODO: actually check if it is anyhow different from a
            # # previous version. If not then it wasn't really added

            # Done with target_file -- just to have clear end of the loop
            del target_file

        if delete and archive and origin != 'key':
            lgr.debug("Removing the original archive {}".format(archive))
            # force=True since some times might still be staged and fail
            annex.remove(archive_rpath, force=True)

        lgr.info("Finished adding %s: %s"
                 % (archive, stats.as_str(mode='line')))

        if outside_stats:
            outside_stats += stats
        if commit:
            commit_stats = outside_stats if outside_stats else stats
            annex.commit(
                "Added content extracted from %s %s\n\n%s"
                % (origin, archive, commit_stats.as_str(mode='full')),
                _datalad_msg=True
            )
            commit_stats.reset()
    finally:
        # since we batched addurl, we should close those batched processes
        annex.precommit()

        if delete_after:
            prefix_path = opj(annex_path, prefix_dir)
            if exists(prefix_path):  # probably would always be there
                lgr.info("Removing temporary directory under which extracted "
                         "files were annexed: %s", prefix_path)
                rmtree(prefix_path)

        annex.always_commit = old_always_commit

    # remove what is left and/or everything upon failure
    earchive.clean(force=True)
    return annex