def test_get_recurse_dirs(o_path, c_path):
    """`get` on a directory obtains exactly the files beneath it."""
    # prepare source:
    origin = Dataset(o_path).create(force=True)
    origin.add('.')

    ds = install(
        c_path, source=o_path,
        result_xfm='datasets', return_type='item-or-list')

    file_list = ['file1.txt',
                 opj('subdir', 'file2.txt'),
                 opj('subdir', 'subsubdir', 'file3.txt'),
                 opj('subdir', 'subsubdir', 'file4.txt')]
    files_in_sub = [f for f in file_list if f.startswith(with_pathsep('subdir'))]

    # no content present:
    ok_(not any(ds.repo.file_has_content(file_list)))

    result = ds.get('subdir')

    # check result:
    assert_status('ok', result)
    # strip the dataset-path prefix (+1 for the separator) to compare relpaths
    eq_(set([item.get('path')[len(ds.path) + 1:] for item in result
             if item['type'] == 'file']),
        set(files_in_sub))
    # we also get one report on the subdir
    eq_(len(result) - 1, len(files_in_sub))

    # got all files beneath subdir:
    ok_(all(ds.repo.file_has_content(files_in_sub)))

    # additionally got file1.txt silently, since it has the same content as
    # subdir/subsubdir/file4.txt:
    ok_(ds.repo.file_has_content('file1.txt') is True)
def module_manifest(path):
    """Return the path to a module manifest found directly under `path`.

    Candidate file names are tried in the order given by ``MANIFEST_NAMES``;
    the first one that exists as a regular file wins.

    :param path: directory to search; falsy values short-circuit to ``None``
    :return: joined path to the manifest file, or ``None`` if none is found
    """
    if not path:
        return None
    for manifest_name in MANIFEST_NAMES:
        # join once instead of twice (original computed opj() in both the
        # test and the return)
        candidate = opj(path, manifest_name)
        if os.path.isfile(candidate):
            return candidate
    # explicit instead of implicit fall-through None
    return None
def _check_ri(ri, cls, exact_str=True, localpath=None, **fields):
    """just a helper to carry out few checks on urls

    Parses `ri` (a URL/RI string), verifies it round-trips through `cls`
    (constructed from `fields`), checks field access, localpath handling,
    and that path modifications are reflected in the string form.
    """
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ri_ = cls(**fields)
        murl = RI(ri)
        eq_(murl.__class__, cls)  # not just a subclass
        eq_(murl, ri_)
        eq_(str(RI(ri)), ri)
        eq_(eval(repr(ri_)), ri)  # repr leads back to identical ri_
        eq_(ri, ri_)  # just in case ;)  above should fail first if smth is wrong
        if not exact_str:
            assert_in('Parsed version of', cml.out)
    # that we can reconstruct it EXACTLY on our examples
    (eq_ if exact_str else neq_)(ri, str(ri_))
    # and that we have access to all those fields
    nok_(set(fields).difference(set(cls._FIELDS)))
    for f, v in fields.items():
        eq_(getattr(ri_, f), v)

    if localpath:
        eq_(ri_.localpath, localpath)
        old_localpath = ri_.localpath  # for a test below
    else:
        # if not given -- must be a remote url, should raise exception
        with assert_raises(ValueError):
            ri_.localpath

    # do changes in the path persist?
    old_str = str(ri_)
    ri_.path = newpath = opj(ri_.path, 'sub')
    eq_(ri_.path, newpath)
    neq_(str(ri_), old_str)
    if localpath:
        eq_(ri_.localpath, opj(old_localpath, 'sub'))
def test_get_mixed_hierarchy(src, path):
    """Recursive `get` across a plain-git superdataset with an annex subdataset."""
    origin = Dataset(src).create(no_annex=True)
    origin_sub = origin.create('subds')
    with open(opj(origin.path, 'file_in_git.txt'), "w") as f:
        f.write('no idea')
    with open(opj(origin_sub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')
    origin.add('file_in_git.txt', to_git=True)
    origin_sub.add('file_in_annex.txt')
    origin.save()

    # now, install that thing:
    ds, subds = install(
        path, source=src, recursive=True,
        result_xfm='datasets', return_type='item-or-list',
        result_filter=None)
    ok_(subds.repo.file_has_content("file_in_annex.txt") is False)

    # and get:
    result = ds.get(curdir, recursive=True)
    # git repo and subds
    assert_status(['ok', 'notneeded'], result)
    assert_result_count(
        result, 1, path=opj(subds.path, "file_in_annex.txt"), status='ok')
    ok_(subds.repo.file_has_content("file_in_annex.txt") is True)
def fits_job1(model_names=None, model_cmps=None): from os.path import join as opj from itertools import product # obs_datfile = '/n/Users/fdu/now/tab_obs.dat' res_dir = '/n/Users/fdu/now/res/' out_dir = '/n/Users/fdu/' dir_lines = ['CO', '13CO', 'C18O', 'O', 'C', 'C+', 'oH2O_A', 'pH2O_A', 'HD'] # for model_name, model_cmp in product(model_names, model_cmps): model_dir = opj(res_dir, model_name) fname_html = opj(out_dir, 'tab_{0:s}.html'.format(model_name.replace('/', ''))) fname_overview = opj(out_dir, 'cmp_{0:s}.pdf'.format(model_name.replace('/', ''))) # obsd = parse_latex_txt_table(obs_datfile) # fnames = glob_fnames(model_dir, dir_lines) b = batch_proc(fnames, _obs_data = obsd) # cmp_dir = opj(res_dir, model_cmp) fnames_cmp = glob_fnames(cmp_dir, dir_lines) bcmp = batch_proc(fnames_cmp, _obs_data = obsd) # b.all_to_html_cmp(bcmp, fname_html) print 'Html file generated: ', fname_html # b.draw_overview(fname_overview, bcmp.specs) print 'Pdf file generated: ', fname_overview
def initiate(self):
    """Set up the on-disk protocolling script for this custom remote.

    Idempotent: a second call is a no-op. If the script already exists,
    previous recv/send entries are commented out; otherwise a fresh script
    with the class HEADER is created and made executable.
    """
    if self._initiated:
        return
    self._initiated = True
    d = opj(self.repopath, '.git', 'bin')
    if not exists(d):
        os.makedirs(d)
    # suffix derived from the remote name (trailing ':' stripped), if any
    suf = '-' + self.custom_remote_name.rstrip(':') if self.custom_remote_name else ''
    self._file = _file = opj(d, 'git-annex-remote-datalad' + suf)

    if exists(_file):
        lgr.debug("Commenting out previous entries")
        # comment out all the past entries
        with open(_file) as f:
            entries = f.readlines()
        # skip the header lines, only touch recorded protocol entries
        for i in range(len(self.HEADER.split(os.linesep)), len(entries)):
            e = entries[i]
            if e.startswith('recv ') or e.startswith('send '):
                entries[i] = '#' + e
        with open(_file, 'w') as f:
            f.write(''.join(entries))
        return  # nothing else to be done

    lgr.debug("Initiating protocoling."
              "cd %s; vim %s"
              % (realpath(self.repopath),
                 _file[len(self.repopath) + 1:]))
    with open(_file, 'a') as f:
        f.write(self.HEADER)
    os.chmod(_file, 0o755)
def import_zipfile(self, module_file, force=False):
    """Import all addon modules contained in an uploaded zip archive.

    Rejects non-zip input and over-sized members, extracts the archive to a
    temporary directory, and imports every top-level directory as a module.
    Per-module failures are collected in `errors` instead of aborting.
    NOTE: Python 2 syntax (`except Exception, e`).
    """
    if not module_file:
        raise Exception(_("No file sent."))
    if not zipfile.is_zipfile(module_file):
        raise UserError(_('File is not a zip file!'))

    success = []
    errors = dict()
    module_names = []
    with zipfile.ZipFile(module_file, "r") as z:
        # validate sizes before extracting anything
        for zf in z.filelist:
            if zf.file_size > MAX_FILE_SIZE:
                raise UserError(_("File '%s' exceed maximum allowed file size") % zf.filename)

        with tempdir() as module_dir:
            z.extractall(module_dir)
            dirs = [d for d in os.listdir(module_dir)
                    if os.path.isdir(opj(module_dir, d))]
            for mod_name in dirs:
                module_names.append(mod_name)
                try:
                    # assert mod_name.startswith('theme_')
                    path = opj(module_dir, mod_name)
                    self.import_module(mod_name, path, force=force)
                    success.append(mod_name)
                except Exception, e:
                    _logger.exception('Error while importing module')
                    errors[mod_name] = exception_to_unicode(e)
def test_remove_file_handle_only(path):
    """remove() drops one file handle without touching a co-keyed sibling."""
    ds = Dataset(path).create(force=True)
    ds.save()
    ok_clean_git(ds.path)
    # make sure there is any key
    ok_(len(ds.repo.get_file_key('one')))
    # both files link to the same key
    eq_(ds.repo.get_file_key('one'),
        ds.repo.get_file_key('two'))
    rpath_one = realpath(opj(ds.path, 'one'))
    eq_(rpath_one, realpath(opj(ds.path, 'two')))
    path_two = opj(ds.path, 'two')
    ok_(exists(path_two))
    # remove one handle, should not affect the other
    ds.remove('two', check=False, message="custom msg")
    eq_(ds.repo.repo.head.commit.message.rstrip(), "custom msg")
    eq_(rpath_one, realpath(opj(ds.path, 'one')))
    ok_(exists(rpath_one))
    ok_(not exists(path_two))
    # remove file without specifying the dataset -- shouldn't fail
    with chpwd(path):
        remove('one', check=False)
        ok_(not exists("one"))
    # and we should be able to remove without saving
    ds.remove('three', check=False, save=False)
    ok_(ds.repo.dirty)
def test_ExtractedArchive(path):
    """ExtractedArchive extracts lazily, enumerates content, and cleans up."""
    archive = opj(path, fn_archive_obscure_ext)
    earchive = ExtractedArchive(archive)
    assert_false(exists(earchive.path))
    # no longer the case -- just using hash for now
    # assert_in(os.path.basename(archive), earchive.path)

    fpath = opj(fn_archive_obscure,  # lead directory
                fn_in_archive_obscure)
    extracted = earchive.get_extracted_filename(fpath)
    eq_(extracted, opj(earchive.path, fpath))
    assert_false(exists(extracted))  # not yet

    # get_extracted_file triggers the actual extraction
    extracted_ = earchive.get_extracted_file(fpath)
    eq_(extracted, extracted_)
    assert_true(exists(extracted))  # now it should

    extracted_files = earchive.get_extracted_files()
    ok_generator(extracted_files)
    eq_(sorted(extracted_files),
        sorted([
            # ['bbc/3.txt', 'bbc/abc']
            opj(fn_archive_obscure, fn_in_archive_obscure),
            opj(fn_archive_obscure, '3.txt')
        ]))

    earchive.clean()
    # allow keeping temp extracts around for debugging via env var
    if not os.environ.get('DATALAD_TESTS_TEMP_KEEP'):
        assert_false(exists(earchive.path))
def test_add_subdataset(path, other):
    """add() registers an existing repo / installed clone as a subdataset."""
    subds = create(opj(path, 'dir'), force=True)
    ds = create(path, force=True)
    ok_(subds.repo.dirty)
    ok_(ds.repo.dirty)
    assert_not_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # without a base dataset the next is interpreted as "add everything
    # in subds to subds"
    add(subds.path)
    ok_clean_git(subds.path)
    assert_not_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # but with a base directory we add the dataset subds as a subdataset
    # to ds
    ds.add(subds.path)
    assert_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # create another one
    other = create(other)
    # install into superdataset, but don't add
    other_clone = install(source=other.path, path=opj(ds.path, 'other'))
    # FIX: is_installed is a method -- the original passed the (always
    # truthy) bound method to ok_(), turning these checks into no-ops
    ok_(other_clone.is_installed())
    assert_not_in('other', ds.subdatasets(result_xfm='relpaths'))
    # now add, it should pick up the source URL
    ds.add('other')
    # and that is why, we can reobtain it from origin
    ds.uninstall('other')
    # FIX: after uninstall the clone must be gone ...
    ok_(not other_clone.is_installed())
    ds.get('other')
    # ... and reobtainable via get()
    ok_(other_clone.is_installed())
def test_uninstall_git_file(path):
    """drop/uninstall/remove semantics for a file tracked in git (not annex)."""
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_(exists(opj(path, 'INFO.txt')))
    ok_file_under_git(ds.repo.path, 'INFO.txt')

    # drop file in Git in an annex repo
    # regardless of the type of repo this is 'notneeded'...
    # it is less about education that about "can we
    # we get the content back?", and for a file in Git we can
    assert_result_count(
        ds.drop(path='INFO.txt'), 1,
        status='notneeded',
        message="no annex'ed content")

    # uninstall only works on datasets, not individual files
    res = ds.uninstall(path="INFO.txt", on_failure='ignore')
    assert_result_count(
        res, 1,
        status='impossible',
        message='can only uninstall datasets (consider the `drop` command)')

    # remove the file:
    res = ds.remove(path='INFO.txt', result_xfm='paths',
                    result_filter=lambda x: x['action'] == 'remove')
    assert_raises(AssertionError, ok_file_under_git, ds.repo.path, 'INFO.txt')
    ok_(not exists(opj(path, 'INFO.txt')))
    eq_(res, ['INFO.txt'])
def import_zipfile(self, cr, uid, module_file, force=False, context=None):
    """Old-API (cr/uid) variant of the zip module importer.

    Validates the archive, extracts it to a temp dir, imports each
    top-level directory as a module; per-module failures are collected
    in `errors` instead of aborting the whole import.
    NOTE: Python 2 syntax (`except Exception, e`).
    """
    if not module_file:
        # NOTE(review): unlike the other user-facing strings here this one
        # is not wrapped in _() -- likely untranslated by accident; confirm
        raise Exception("No file sent.")
    if not zipfile.is_zipfile(module_file):
        raise UserError(_("File is not a zip file!"))

    success = []
    errors = dict()
    module_names = []
    with zipfile.ZipFile(module_file, "r") as z:
        # validate sizes before extracting anything
        for zf in z.filelist:
            if zf.file_size > MAX_FILE_SIZE:
                msg = _("File '%s' exceed maximum allowed file size")
                raise UserError(msg % zf.filename)

        with openerp.tools.osutil.tempdir() as module_dir:
            z.extractall(module_dir)
            dirs = [d for d in os.listdir(module_dir)
                    if os.path.isdir(opj(module_dir, d))]
            for mod_name in dirs:
                module_names.append(mod_name)
                try:
                    # assert mod_name.startswith('theme_')
                    path = opj(module_dir, mod_name)
                    self.import_module(cr, uid, mod_name, path, force=force,
                                       context=context)
                    success.append(mod_name)
                except Exception, e:
                    errors[mod_name] = str(e)
def test_add_recursive(path):
    """Recursive add touches only datasets that actually have changes."""
    # make simple hierarchy
    parent = Dataset(path).create()
    ok_clean_git(parent.path)
    sub1 = parent.create(opj('down', 'sub1'))
    ok_clean_git(parent.path)
    sub2 = parent.create('sub2')
    # next one make the parent dirty
    subsub = sub2.create('subsub')
    ok_clean_git(parent.path, index_modified=['sub2'])
    res = parent.save()
    ok_clean_git(parent.path)
    # now add content deep in the hierarchy
    create_tree(subsub.path, {'new': 'empty'})
    ok_clean_git(parent.path, index_modified=['sub2'])

    # recursive add should not even touch sub1, because
    # it knows that it is clean
    res = parent.add('.', recursive=True)
    # the key action is done
    assert_result_count(
        res, 1, path=opj(subsub.path, 'new'), action='add', status='ok')
    # sub1 is untouched, and not reported
    assert_result_count(res, 0, path=sub1.path)
    # saved all the way up
    assert_result_count(res, 3, action='save', status='ok')
    ok_clean_git(parent.path)
def test_publish_gh1691(origin, src_path, dst_path):
    """Regression test (gh-1691): recursive publish with uninstalled subdatasets."""
    # prepare src; no subdatasets installed, but mount points present
    source = install(src_path, source=origin, recursive=False)
    ok_(exists(opj(src_path, "subm 1")))
    assert_false(Dataset(opj(src_path, "subm 1")).is_installed())

    # some content modification of the superdataset
    create_tree(src_path, {'probe1': 'probe1'})
    source.add('probe1')
    ok_clean_git(src_path)

    # create the target(s):
    source.create_sibling(
        'ssh://localhost:' + dst_path,
        name='target', recursive=True)

    # publish recursively, which silently ignores non-installed datasets
    results = source.publish(to='target', recursive=True)
    assert_result_count(results, 1)
    assert_result_count(results, 1, status='ok', type='dataset',
                        path=source.path)

    # if however, a non-installed subdataset is requested explicitly, it'll fail
    results = source.publish(path='subm 1', to='target', on_failure='ignore')
    assert_result_count(results, 1, status='impossible', type='dataset',
                        action='publish')
def get_module_resource(module, *args):
    """Return the full path of a resource of the given module.

    :param module: module name
    :param list(str) args: resource path components within module
    :rtype: str
    :return: absolute path to the resource, False if not found

    TODO name it get_resource_path
    TODO make it available inside on osv object (self.get_resource_path)
    """
    mod_path = get_module_path(module)
    if not mod_path:
        return False
    resource_path = opj(mod_path, *args)
    if os.path.isdir(mod_path):
        # the module is a directory - ignore zip behavior
        if os.path.exists(resource_path):
            return resource_path
    elif zipfile.is_zipfile(mod_path + '.zip'):
        # FIX: close the archive deterministically (original leaked the
        # handle) and avoid shadowing the builtin `zip`
        with zipfile.ZipFile(mod_path + ".zip") as zfile:
            # strip the leading archive-internal directory component
            files = ['/'.join(f.split('/')[1:]) for f in zfile.namelist()]
        resource_path = '/'.join(args)
        if resource_path in files:
            return opj(mod_path, resource_path)
    return False
def test_basic_aggregate(path):
    """Aggregated metadata survives uninstalling the datasets it came from."""
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    # base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.uninstall('subsub', check=False)
    # now we should be able to reaggregate metadata, and lose nothing
    # because we can aggregate aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for aggregate query than for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.uninstall('sub', recursive=True, check=False)
    assert(not sub.is_installed())
    ok_clean_git(base.path)
    # same result for aggregate query than for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)
def create_postupdate_hook(path, ssh, dataset):
    """Install a git post-update hook on the remote that (re)generates the
    web-frontend JSON index and logs the run under WEB_META_LOG."""
    # location of post-update hook file, logs folder on remote target
    hooks_remote_dir = opj(path, '.git', 'hooks')
    hook_remote_target = opj(hooks_remote_dir, 'post-update')

    # post-update hook should create its log directory if doesn't exist
    logs_remote_dir = opj(path, WEB_META_LOG)
    make_log_dir = 'mkdir -p "{}"'.format(logs_remote_dir)

    # create json command for current dataset
    # NOTE(review): the last argument mixes %-interpolation with a shell
    # $(date +%s) -- verify that '%s' % TIMESTAMP_FMT yields the intended
    # log file name pattern
    json_command = r'''
    mkdir -p {};
    ( which datalad > /dev/null \
    && ( cd ..; GIT_DIR=$PWD/.git datalad ls -a --json file '{}'; ) \
    || echo "no datalad found - skipping generation of indexes for web frontend"; \
    ) &> "{}/{}"
    '''.format(logs_remote_dir, str(path), logs_remote_dir,
               'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT)

    # collate content for post_update hook
    hook_content = '\n'.join(['#!/bin/bash', 'git update-server-info',
                              make_log_dir, json_command])

    with make_tempfile(content=hook_content) as tempf:  # create post_update hook script
        ssh.copy(tempf, hook_remote_target)  # upload hook to dataset
    ssh(['chmod', '+x', hook_remote_target])  # and make it executable
def test_publish_aggregated(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n') base.create('sub', force=True) base.add('.', recursive=True) ok_clean_git(base.path) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) # create sibling and publish to it spath = opj(path, 'remote') base.create_sibling( name="local_target", sshurl="ssh://localhost", target_dir=spath) base.publish('.', to='local_target', transfer_data='all') remote = Dataset(spath) objpath = opj('.datalad', 'metadata', 'objects') objs = list(sorted(base.repo.find(objpath))) # all object files a present in both datasets eq_(all(base.repo.file_has_content(objs)), True) eq_(all(remote.repo.file_has_content(objs)), True) # and we can squeeze the same metadata out eq_( [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')} for i in base.metadata('sub')], [{k: v for k, v in i.items() if k not in ('path', 'refds', 'parentds')} for i in remote.metadata('sub')], )
def test_aggregate_removal(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n') sub = base.create('sub', force=True) subsub = sub.create(opj('subsub'), force=True) base.add('.', recursive=True) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) res = base.metadata(get_aggregates=True) assert_result_count(res, 3) assert_result_count(res, 1, path=subsub.path) # check that we only have object files that are listed in agginfo eq_(_get_contained_objs(base), _get_referenced_objs(base)) # now delete the deepest subdataset to test cleanup of aggregated objects # in the top-level ds base.remove(opj('sub', 'subsub'), check=False) # now aggregation has to detect that subsub is not simply missing, but gone # for good base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) # internally consistent state eq_(_get_contained_objs(base), _get_referenced_objs(base)) # info on subsub was removed at all levels res = base.metadata(get_aggregates=True) assert_result_count(res, 0, path=subsub.path) assert_result_count(res, 2) res = sub.metadata(get_aggregates=True) assert_result_count(res, 0, path=subsub.path) assert_result_count(res, 1)
def test_reaggregate_with_unavailable_objects(path): base = Dataset(opj(path, 'origin')).create(force=True) # force all metadata objects into the annex with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f: f.write( '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n') sub = base.create('sub', force=True) subsub = base.create(opj('sub', 'subsub'), force=True) base.add('.', recursive=True) ok_clean_git(base.path) base.aggregate_metadata(recursive=True, update_mode='all') ok_clean_git(base.path) objpath = opj('.datalad', 'metadata', 'objects') objs = list(sorted(base.repo.find(objpath))) # we have 3x2 metadata sets (dataset/files) under annex eq_(len(objs), 6) eq_(all(base.repo.file_has_content(objs)), True) # drop all object content base.drop(objs, check=False) eq_(all(base.repo.file_has_content(objs)), False) ok_clean_git(base.path) # now re-aggregate, the state hasn't changed, so the file names will # be the same base.aggregate_metadata(recursive=True, update_mode='all', force_extraction=True) eq_(all(base.repo.file_has_content(objs)), True) # and there are no new objects eq_( objs, list(sorted(base.repo.find(objpath))) )
def test_aggregate_with_unavailable_objects_from_subds(path, target):
    """Aggregation fetches missing metadata objects from a subdataset clone."""
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    # build the hierarchy; the returned Dataset objects were unused locals
    base.create('sub', force=True)
    base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)

    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first
    # (renamed from `super`, which shadowed the builtin)
    superds = Dataset(target).create()
    superds.install("base", source=base.path)
    ok_clean_git(superds.path)
    clone = Dataset(opj(superds.path, "base"))
    ok_clean_git(clone.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = [o for o in sorted(clone.repo.get_annexed_files(with_content_only=False))
            if o.startswith(objpath)]
    eq_(len(objs), 6)
    # fresh clone: object content is not yet available locally
    eq_(all(clone.repo.file_has_content(objs)), False)

    # now aggregate should get those metadata objects
    superds.aggregate_metadata(recursive=True, update_mode='all',
                               force_extraction=False)
    eq_(all(clone.repo.file_has_content(objs)), True)
def remove_handle(self, key): dir_ = self._key2filename(key) # remove handle from collection descriptor: uri = Graph().parse(opj(self.path, dir_, REPO_STD_META_FILE), format="turtle").value(predicate=RDF.type, object=DLNS.Handle) col_graph = Graph().parse(opj(self.path, REPO_STD_META_FILE), format="turtle") col_graph.remove((DLNS.this, DCTERMS.hasPart, uri)) col_graph.serialize(opj(self.path, REPO_STD_META_FILE), format="turtle") # remove handle's directory: # Note: Currently all files separatly due to issues with the # normalize_path decorator in gitrepo.py. It expects one output per # one input file. So, recursively removing the 'dir_' violates that # assertion. # Note2: Currently using "-f" option, since on ntfs/vfat, git somehow # reports the files (at least config.ttl) have staged changes. # TODO: Figure out, what the hell this is about. [self.git_remove(file_, f=True) for file_ in self.get_indexed_files() if file_.startswith(dir_)] self.git_add(REPO_STD_META_FILE) self.git_commit("Removed handle %s." % key)
def test_getpwd_symlink(tdir):
    """getpwd()/chpwd() preserve the logical (symlinked) path."""
    sdir = opj(tdir, 's1')
    pwd_orig = getpwd()
    os.symlink('.', sdir)
    s1dir = opj(sdir, 's1')
    s2dir = opj(sdir, 's2')
    try:
        chpwd(sdir)
        pwd = getpwd()
        eq_(pwd, sdir)
        chpwd('s1')
        eq_(getpwd(), s1dir)
        chpwd('.')
        eq_(getpwd(), s1dir)
        chpwd('..')
        eq_(getpwd(), sdir)
    finally:
        chpwd(pwd_orig)

    # test context handler way of use
    with chpwd(s1dir):
        eq_(getpwd(), s1dir)
    # restored on exit
    eq_(getpwd(), pwd_orig)

    assert_false(exists(s2dir))
    # entering a non-existent directory must fail ...
    with assert_raises(OSError):
        with chpwd(s2dir):
            pass
    # ... unless mkdir is requested
    with chpwd(s2dir, mkdir=True):
        ok_(exists(s2dir))
        eq_(getpwd(), s2dir)
def test_add_handle_by_names(hurl, hpath, cpath, lcpath):
    """add_handle() by name registers an installed handle in a collection."""
    class mocked_dirs:
        # redirect the user data dir into the test's temp location
        user_data_dir = lcpath

    with patch('datalad.cmdline.helpers.dirs', mocked_dirs), \
            swallow_logs() as cml:
        # get testrepos and make them known to datalad:
        handle = install_handle(hurl, hpath)
        collection = register_collection(cpath)
        assert_not_in(handle.name, collection)

        return_value = add_handle(handle.name, collection.name)

        # now handle is listed by collection:
        collection._reload()
        assert_in(handle.name, collection)

        # test collection repo:
        ok_clean_git(cpath, annex=False)
        ok_(isdir(opj(cpath, handle.name)))
        ok_(exists(opj(cpath, handle.name, REPO_CONFIG_FILE)))
        ok_(exists(opj(cpath, handle.name, REPO_STD_META_FILE)))

        # evaluate return value:
        assert_is_instance(return_value, Handle,
                           "install_handle() returns object of "
                           "incorrect class: %s" % type(return_value))
        eq_(return_value.name, handle.name)
        eq_(urlparse(return_value.url).path, urlparse(handle.url).path)
def _get_file_matches(bids_directory, glob_pattern): files = glob( opj(bids_directory, "sub-*", "*", "sub-{}".format(glob_pattern))) files += glob( opj(bids_directory, "sub-*", "ses-*", "*", "sub-*_ses-{}".format( glob_pattern))) return files
def test_dont_trip_over_missing_subds(path):
    """create_sibling_github must not crash on an uninstalled subdataset."""
    ds1 = Dataset(opj(path, 'ds1')).create()
    ds2 = Dataset(opj(path, 'ds2')).create()
    subds2 = ds1.install(
        source=ds2.path, path='subds2',
        result_xfm='datasets', return_type='item-or-list')
    assert_true(subds2.is_installed())
    assert_in('subds2', ds1.subdatasets(result_xfm='relpaths'))
    subds2.uninstall()
    # still registered, just not installed
    assert_in('subds2', ds1.subdatasets(result_xfm='relpaths'))
    assert_false(subds2.is_installed())
    # see if it wants to talk to github (and fail), or if it trips over something
    # before
    assert_raises(gh.BadCredentialsException,
                  ds1.create_sibling_github, 'bogus', recursive=True,
                  github_login='******')
    # inject remote config prior run
    assert_not_in('github', ds1.repo.get_remotes())
    # fail on existing
    ds1.repo.add_remote('github', 'http://nothere')
    assert_raises(ValueError,
                  ds1.create_sibling_github, 'bogus', recursive=True,
                  github_login='******')
    # talk to github when existing is OK
    assert_raises(gh.BadCredentialsException,
                  ds1.create_sibling_github, 'bogus', recursive=True,
                  github_login='******', existing='reconfigure')
    # return happy emptiness when all is skipped
    assert_equal(
        ds1.create_sibling_github(
            'bogus', recursive=True, github_login='******', existing='skip'),
        [])
def test_install_subdataset(src, path):
    """Installing individual subdatasets and files of a superdataset."""
    # get the superdataset:
    ds = install(path=path, source=src)

    # subdataset not installed:
    subds = Dataset(opj(path, 'sub1'))
    assert_false(subds.is_installed())

    # install it:
    ds.install('sub1')
    ok_(subds.is_installed())

    # Verify that it is the correct submodule installed and not
    # new repository initiated
    assert_equal(set(subds.repo.get_indexed_files()),
                 {'test.dat', 'INFO.txt', 'test-annex.dat'})

    # Now the obnoxious install an annex file within not yet
    # initialized repository!
    with swallow_outputs():  # progress bar
        ds.install(opj('sub2', 'test-annex.dat'))
    subds2 = Dataset(opj(path, 'sub2'))
    assert(subds2.is_installed())
    assert(subds2.repo.file_has_content('test-annex.dat'))
    # we shouldn't be able silently ignore attempt to provide source while
    # "installing" file under git
    assert_raises(FileInGitError,
                  ds.install, opj('sub2', 'INFO.txt'),
                  source="http://bogusbogus")
def call(self, runscript, workdir, out, err, runflags):
    """call(runscript, workdir, out, err, runflags)

    Execute the *runscript* in the folder *workdir*. Redirect output and
    error streams to *out* and *err*, respectively.

    Arguments *runscript*, *workdir*, *out* and *err* should be strings with
    paths to corresponding files or folders. *runflags* is a |Settings|
    instance containing the `run` branch of running job's settings. The
    basic job runner defined here ignores them, but they can be useful in
    |JobRunner| subclasses (see :meth:`GridRunner.call`).

    Returns an integer with the exit code returned by the *runscript*.

    This method can be safely overridden in |JobRunner| subclasses. For
    example, in |GridRunner| it submits the runscript to a queueing system
    instead of executing it locally.

    .. note::
        This method is used automatically during |run| and should never be
        explicitly called in your script.
    """
    log('Executing {}'.format(runscript), 5)
    # POSIX: execute the script directly; elsewhere run it through `sh`
    command = ['./'+runscript] if os.name == 'posix' else ['sh', runscript]
    if out is not None:
        with open(opj(workdir, err), 'w') as e, open(opj(workdir, out), 'w') as o:
            process = saferun(command, cwd=workdir, stderr=e, stdout=o)
    else:
        # no stdout file requested: only stderr is redirected
        with open(opj(workdir, err), 'w') as e:
            process = saferun(command, cwd=workdir, stderr=e)
    log('Execution of {} finished with returncode {}'.format(runscript, process.returncode), 5)
    return process.returncode
def test_ignore_nondatasets(path):
    """Repos placed inside a dataset must not change its reported metadata."""
    # we want to ignore the version/commits for this test
    def _kill_time(meta):
        # strip volatile keys so metadata dicts can be compared across commits
        for m in meta:
            for k in ('version', 'shasum'):
                if k in m:
                    del m[k]
        return meta

    ds = Dataset(path).create()
    meta = _kill_time(ds.metadata(reporton='datasets', on_failure='ignore'))
    n_subm = 0
    # placing another repo in the dataset has no effect on metadata
    for cls, subpath in ((GitRepo, 'subm'), (AnnexRepo, 'annex_subm')):
        subm_path = opj(ds.path, subpath)
        r = cls(subm_path, create=True)
        with open(opj(subm_path, 'test'), 'w') as f:
            f.write('test')
        r.add('test')
        r.commit('some')
        assert_true(Dataset(subm_path).is_installed())
        assert_equal(meta,
                     _kill_time(ds.metadata(reporton='datasets', on_failure='ignore')))
        # making it a submodule has no effect either
        ds.add(subpath)
        assert_equal(len(ds.subdatasets()), n_subm + 1)
        assert_equal(meta,
                     _kill_time(ds.metadata(reporton='datasets', on_failure='ignore')))
        n_subm += 1
def verify_files(self, data): files_path = opj(abspath(curdir), '_files') # list of files that exist from canonical tarball con_files = set(f_f('.*', topdir=curdir, exclude='./(_files|.datalad)')) assert con_files # list of files that are individually downloaded files = set(f_f('.*', topdir='_files')) assert files for item_from_con in con_files: item_compare = normpath(opj('_files', item_from_con)) if item_compare in files: key_item = self.repo.get_file_key(item_from_con) key_files_item = self.repo.get_file_key(item_compare) if key_item == key_files_item: pass else: lgr.warning("%s varies in content from the individually downloaded " "file with the same name, it is removed and file " "from canonical tarball is kept" % item_from_con) files.discard(item_compare) else: lgr.warning("%s does not exist in the individually listed files by name, " "but will be kept from canonical tarball" % item_compare) if files: lgr.warning("The following files do not exist in the canonical tarball, but are " "individually listed files and will not be kept: %s" % files) rmtree(files_path) yield data
def filter_unmodified(content_by_ds, refds, since):
    """Filter per-dataset path specifications based on modification history.

    This function takes a path specification dictionary, as produced by
    `Interface._prep()` and filters it such that only that subset of paths
    remains in the dictionary that corresponding to the set of changes in
    the given reference dataset since a given state.

    The change set is traced across all related subdatasets, i.e. if a submodule
    in the reference dataset is reported as modified then all paths for any given
    subdataset in the modified one are tested for changes too (based on the
    state at which the parent dataset reports a change in the subdataset), and
    so on.

    In doing so, not only unmodified given paths are removed, but also modified
    given paths are replaced by the set of actually modified paths within them.

    Only committed changes are considered!

    Parameters
    ----------
    content_by_ds : dict
      Per-dataset path specifications, as produced ,for example, by
      `Interface._prep()`
    refds : Dataset or *Repo or path
      Reference dataset for which to determine the initial change set
    since : state
      Any commit-ish/tree-ish supported by Git (tag, commit, branch, ...).
      Changes between this given state and the most recent commit are
      evaluated.

    Returns
    -------
    dict
      Filtered path spec dictionary. If `since` is not None, the output is
      guaranteed to only contain paths to modified, and presently existing
      components of subdatasets of the given reference dataset (and itself).
    """
    if since is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        return content_by_ds
    # turn refds argument into a usable repo instance
    if not hasattr(refds, 'path'):
        # not a Repo or Dataset
        refds_path = refds
        refds = GitRepo(refds, create=False)
    else:
        refds_path = refds.path
    repo = refds.repo
    if hasattr(repo, 'repo'):
        # TODO use GitRepo.diff() when available (gh-1217)
        repo = repo.repo

    dict_class = content_by_ds.__class__  # could be ordered dict

    # life is simple: we diff the base dataset, and kill anything that
    # does not start with something that is in the diff
    # we cannot really limit the diff paths easily because we might get
    # or miss content (e.g. subdatasets) if we don't figure out which ones
    # are known -- and we don't want that
    try:
        diff = repo.commit().diff(since)
    except GitCommandError as exc:
        # could fail because `since` points to non existing location.
        # Unfortunately there might be no meaningful message
        # e.g. "fatal: ambiguous argument 'HEAD^': unknown revision or path not in the working tree"
        # logged within this GitCommandError for some reason! So let's check
        # that value of since post-error for being correct:
        try:
            refds.repo._git_custom_command(
                [],
                ['git', 'show', '--stat', since, '--'],
                expect_stderr=True, expect_fail=True)
            raise  # re-raise since our idea was incorrect
        except CommandError as ce_exc:
            if ce_exc.stderr.startswith('fatal: bad revision'):
                raise ValueError(
                    "Value since=%r is not valid. Git reports: %s"
                    % (since, exc_str(ce_exc)))
            else:
                raise  # re-raise

    # get all modified paths (with original commit) that are still present
    modified = dict(
        (opj(refds_path, d.b_path),
         d.b_blob.hexsha if d.b_blob else None)
        for d in diff)
    if not modified:
        # nothing modified nothing to report
        return dict_class()
    # determine the subset that is a directory and hence is relevant for possible
    # subdatasets
    modified_dirs = {_with_sep(d) for d in modified if isdir(d)}
    # find the subdatasets matching modified paths, this will also kick out
    # any paths that are not in the dataset sub-hierarchy
    mod_subs = dict_class(
        (candds, paths)
        for candds, paths in content_by_ds.items()
        if candds != refds_path and
        any(_with_sep(candds).startswith(md) for md in modified_dirs))
    # now query the next level down
    keep_subs = \
        [filter_unmodified(mod_subs, subds_path, modified[subds_path])
         for subds_path in mod_subs
         if subds_path in modified]
    # merge result list into a single dict
    keep = dict_class((k, v) for d in keep_subs for k, v in d.items())

    paths_refds = content_by_ds[refds_path]
    keep[refds_path] = [
        m for m in modified
        if lexists(m)  # still around
        and (m in paths_refds  # listed file, or subds
             # or a modified path under a given directory
             or any(m.startswith(_with_sep(p)) for p in paths_refds))
    ]
    return keep
import sys from os.path import lexists, dirname, join as opj, curdir # Hard coded version, to be done by release process, # it is also "parsed" (not imported) by setup.py, that is why assigned as # __hardcoded_version__ later and not vise versa __version__ = '0.13.4' __hardcoded_version__ = __version__ __full_version__ = __version__ # NOTE: might cause problems with "python setup.py develop" deployments # so I have even changed buildbot to use pip install -e . moddir = dirname(__file__) projdir = curdir if moddir == 'datalad' else dirname(moddir) if lexists(opj(projdir, '.git')): # If under git -- attempt to deduce a better "dynamic" version following git try: from subprocess import Popen, PIPE # Note: Popen does not support `with` way correctly in 2.7 # git = Popen([ 'git', 'describe', '--abbrev=4', '--dirty', '--match', r'[0-9]*\.*' ], stdout=PIPE, stderr=PIPE, cwd=projdir) if git.wait() != 0: raise OSError("Could not run git describe") line = git.stdout.readlines()[0] _ = git.stderr.readlines()
def test_fs_traverse(topdir):
    """Exercise ``fs_traverse`` in display and json-file modes.

    Builds an annex at `topdir` containing a plain git repo, a sibling annex
    and git repo (which must be ignored by the traverser), drops one file to
    create a broken link, then traverses recursively and non-recursively and
    checks the reported node tree and sizes.

    BUGFIX: the original assertions used ``('a' or 'b') == x`` /
    ``('a' and 'b')`` which evaluate to a single operand, so only one of the
    two names was ever checked; replaced with explicit membership tests.
    """
    # setup temp directory tree for testing
    annex = AnnexRepo(topdir)
    AnnexRepo(opj(topdir, 'annexdir'), create=True)
    GitRepo(opj(topdir, 'gitdir'), create=True)
    GitRepo(opj(topdir, 'dir', 'subgit'), create=True)
    annex.add(opj(topdir, 'dir'))
    annex.commit()
    # drop content -> broken link for file2.txt
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])

    # traverse file system in recursive and non-recursive modes
    for recursive in [True, False]:
        # test fs_traverse in display mode
        with swallow_logs(new_level=logging.INFO) as log, swallow_outputs() as cmo:
            repo = AnnexRepo(topdir)
            fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='display')
            if recursive:
                # fs_traverse logs should contain all not ignored subdirectories
                for subdir in [opj(topdir, 'dir'), opj(topdir, 'dir', 'subdir')]:
                    assert_in('Directory: ' + subdir, log.out)
                # fs_traverse stdout contains the subdirectory and its file
                # (was: assert_in(('file2.txt' and 'dir'), cmo.out), which
                # only ever checked 'dir')
                assert_in('file2.txt', cmo.out)
                assert_in('dir', cmo.out)

            # extract info of the top-level child directory
            child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
            # size of dir type child in non-recursive modes should be 0 Bytes(default) as
            # dir type child's size currently has no metadata file for traverser to pick its size from
            # and would require a recursive traversal w/ write to child metadata file mode
            assert_equal(child['size']['total'], {True: '6 Bytes', False: '0 Bytes'}[recursive])

        repo.precommit()  # to possibly stop batch process occupying the stdout

    for recursive in [True, False]:
        # run fs_traverse in write to json 'file' mode
        repo = AnnexRepo(topdir)
        fs = fs_traverse(topdir, repo, recurse_directories=recursive, json='file')
        # fs_traverse should return a dictionary
        assert_equal(isinstance(fs, dict), True)
        # not including git and annex folders
        # (was: ('gitdir' or 'annexdir') == item['name'], which only ever
        # tested 'gitdir')
        assert_equal(
            [item for item in fs['nodes']
             if item['name'] in ('gitdir', 'annexdir')],
            [])
        # extract info of the top-level child directory
        child = [item for item in fs['nodes'] if item['name'] == 'dir'][0]
        # verify node type
        assert_equal(child['type'], 'dir')
        # same node size on running fs_traversal in recursive followed by non-recursive mode
        # verifies child's metadata file being used to find its size
        # running in reverse order (non-recursive followed by recursive mode) will give (0, actual size)
        assert_equal(child['size']['total'], '6 Bytes')

        # verify subdirectory traversal if run in recursive mode
        # In current RF 'nodes' are stripped away during recursive traversal
        # for now... later we might reincarnate them "differently"
        # TODO!
        if False:  # recursive:
            # sub-dictionary should not include git and hidden directory info
            # (was: ('subgit' or '.fgit') == item['name'], which only ever
            # tested 'subgit')
            assert_equal(
                [item for item in child['nodes']
                 if item['name'] in ('subgit', '.fgit')],
                [])
            # extract subdirectory dictionary, else fail
            subchild = [subitem for subitem in child["nodes"] if subitem['name'] == 'subdir'][0]
            # extract info of file1.txts, else fail
            link = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file1.txt'][0]
            # verify node's sizes and type
            assert_equal(link['size']['total'], '3 Bytes')
            assert_equal(link['size']['ondisk'], link['size']['total'])
            assert_equal(link['type'], 'link')
            # extract info of file2.txt, else fail
            brokenlink = [subnode for subnode in subchild["nodes"] if subnode['name'] == 'file2.txt'][0]
            # verify node's sizes and type
            assert_equal(brokenlink['type'], 'link-broken')
            assert_equal(brokenlink['size']['ondisk'], '0 Bytes')
            assert_equal(brokenlink['size']['total'], '3 Bytes')
def get_metahash(*path): if not path: path = ['/'] return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()
def test_ls_json(topdir, topurl):
    """Exercise ``_ls_json`` metadata generation and cleanup over a nested tree.

    Builds a hierarchy containing an annex, a registered subdataset, a plain
    git repo embedded in the annex, an "external" (installed-then-uninstalled)
    submodule and a relaxed-URL file without a known size, then runs
    ``_ls_json`` for every combination of `all_`, `recursive` and json state
    ('file'/'delete'), checking which per-directory metadata JSON files
    appear/vanish and that reported sizes are sane.
    """
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    with open(opj(ds.path, 'subdsfile.txt'), 'w') as f:
        f.write('123')
    ds.save(path='subdsfile.txt', message="Hello!", version_tag=1)
    # add a subdataset
    ds.install('subds', source=topdir)
    subdirds = ds.create(_path_('dir/subds2'), force=True)
    subdirds.save('file')
    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'))  # commit to git to init git repo
    git.commit()
    annex.add(opj(topdir, 'dir', 'subgit'))  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'))  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'), options=['--force'])  # broken-link
    annex.commit()
    git.add('fgit.txt')  # commit to git to init git repo
    git.commit()
    # annex.add doesn't add submodule, so using ds.add
    ds.save(opj('dir', 'subgit'))  # add the non-dataset git repo to annex
    ds.save('dir')  # add to annex (links)
    ds.drop(opj('dir', 'subdir', 'file2.txt'), check=False)  # broken-link
    # register "external" submodule by installing and uninstalling it
    ext_url = topurl + '/dir/subgit/.git'
    # need to make it installable via http
    Runner()('git update-server-info', cwd=opj(topdir, 'dir', 'subgit'))
    ds.install(opj('dir', 'subgit_ext'), source=ext_url)
    ds.uninstall(opj('dir', 'subgit_ext'))
    meta_dir = opj('.git', 'datalad', 'metadata')

    def get_metahash(*path):
        # metadata filenames are the md5 of the relative path ('/' for root)
        if not path:
            path = ['/']
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    def get_metapath(dspath, *path):
        # absolute path of the metadata JSON for `path` within dataset `dspath`
        return _path_(dspath, meta_dir, get_metahash(*path))

    def get_meta(dspath, *path):
        # load the metadata JSON for `path` within dataset `dspath`
        with open(get_metapath(dspath, *path)) as f:
            return js.load(f)

    # Let's see that there is no crash if one of the files is available only
    # in relaxed URL mode, so no size could be picked up
    ds.repo.add_url_to_file(
        'fromweb', topurl + '/noteventhere', options=['--relaxed'])

    for all_ in [True, False]:
        # recurse directories
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                # subdataset should have its json created and deleted when
                # all=True else not
                subds_metapath = get_metapath(opj(topdir, 'subds'))
                exists_prior = exists(subds_metapath)
                #with swallow_logs(), swallow_outputs():
                dsj = _ls_json(
                    topdir,
                    json=state,
                    all_=all_,
                    recursive=recursive
                )
                ok_startswith(dsj['tags'], '1-')
                exists_post = exists(subds_metapath)
                # print("%s %s -> %s" % (state, exists_prior, exists_post))
                assert_equal(exists_post, (state == 'file' and recursive))
                # root should have its json file created and deleted in all cases
                ds_metapath = get_metapath(topdir)
                assert_equal(exists(ds_metapath), state == 'file')
                # children should have their metadata json's created and deleted only when recursive=True
                child_metapath = get_metapath(topdir, 'dir', 'subdir')
                assert_equal(exists(child_metapath), (state == 'file' and all_))
                # ignored directories should not have json files created in any case
                for subdir in [('.hidden',), ('dir', 'subgit')]:
                    assert_false(exists(get_metapath(topdir, *subdir)))
                # check if its updated in its nodes sublist too. used by web-ui json.
                # regression test
                assert_equal(dsj['nodes'][0]['size']['total'], dsj['size']['total'])
                # check size of subdataset
                # NOTE(review): ('subdsfile.txt' or 'subds') evaluates to just
                # 'subdsfile.txt' -- presumably a membership test over both
                # names was intended; confirm intent before changing
                subds = [item for item in dsj['nodes']
                         if item['name'] == ('subdsfile.txt' or 'subds')][0]
                assert_equal(subds['size']['total'], '3 Bytes')
                # dir/subds2 must not be listed among nodes of the top dataset:
                topds_nodes = {x['name']: x for x in dsj['nodes']}
                assert_in('subds', topds_nodes)
                # XXX
                # # condition here is a bit a guesswork by yoh later on
                # # TODO: here and below clear destiny/interaction of all_ and recursive
                # assert_equal(dsj['size']['total'],
                #              '15 Bytes' if (recursive and all_) else
                #              ('9 Bytes' if (recursive or all_) else '3 Bytes')
                # )
                # https://github.com/datalad/datalad/issues/1674
                if state == 'file' and all_:
                    dirj = get_meta(topdir, 'dir')
                    dir_nodes = {x['name']: x for x in dirj['nodes']}
                    # it should be present in the subdir meta
                    assert_in('subds2', dir_nodes)
                    assert_not_in('url_external', dir_nodes['subds2'])
                    assert_in('subgit_ext', dir_nodes)
                    assert_equal(dir_nodes['subgit_ext']['url'], ext_url)
                    # and not in topds
                    assert_not_in('subds2', topds_nodes)
                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    dsj = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in dsj['nodes']
                        if item['name'] == ('subdsfile.txt' or 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
                # the relaxed-URL file has no size information available
                assert_equal(
                    topds_nodes['fromweb']['size']['total'], UNKNOWN_SIZE
                )
def make_studyforrest_mockup(path):
    """Generate a dataset structure mimicking aspects of studyforrest.org

    Under the given path there are two directories:

    public - to be published datasets
    private - never to be published datasets

    The 'public' directory itself is a superdataset, the 'private' directory
    is just a directory that contains standalone datasets in subdirectories.
    """
    public = create(opj(path, 'public'), description="umbrella dataset")
    # the following tries to capture the evolution of the project
    phase1 = public.create('phase1', description='old-style, no connection to RAW')
    structural = public.create('structural', description='anatomy')
    tnt = public.create('tnt', description='image templates')
    # derivative datasets link their inputs under src/ via cheap clones
    tnt.clone(source=phase1.path, path=opj('src', 'phase1'), reckless='auto')
    tnt.clone(source=structural.path, path=opj('src', 'structural'), reckless='auto')
    aligned = public.create('aligned', description='aligned image data')
    aligned.clone(source=phase1.path, path=opj('src', 'phase1'), reckless='auto')
    aligned.clone(source=tnt.path, path=opj('src', 'tnt'), reckless='auto')
    # new acquisition
    labet = create(opj(path, 'private', 'labet'), description="raw data ET")
    phase2_dicoms = create(opj(path, 'private', 'p2dicoms'), description="raw data P2MRI")
    phase2 = public.create('phase2', description='new-style, RAW connection')
    phase2.clone(source=labet.path, path=opj('src', 'labet'), reckless='auto')
    phase2.clone(source=phase2_dicoms.path, path=opj('src', 'dicoms'), reckless='auto')
    # add to derivatives
    tnt.clone(source=phase2.path, path=opj('src', 'phase2'), reckless='auto')
    aligned.clone(source=phase2.path, path=opj('src', 'phase2'), reckless='auto')
    # never to be published media files
    # NOTE(review): description "raw data ET" duplicates the one used for
    # `labet` above -- looks like a copy-paste; confirm intended wording
    media = create(opj(path, 'private', 'media'), description="raw data ET")
    # assuming all annotations are in one dataset (in reality this is also
    # a superdataset with about 10 subdatasets)
    annot = public.create('annotations', description='stimulus annotation')
    annot.clone(source=media.path, path=opj('src', 'media'), reckless='auto')

    # a few typical analysis datasets
    # (just doing 3, actual status quo is just shy of 10)
    # and also the real goal -> meta analysis
    metaanalysis = public.create('metaanalysis', description="analysis of analyses")
    for i in range(1, 3):
        ana = public.create('analysis{}'.format(i), description='analysis{}'.format(i))
        ana.clone(source=annot.path, path=opj('src', 'annot'), reckless='auto')
        ana.clone(source=aligned.path, path=opj('src', 'aligned'), reckless='auto')
        ana.clone(source=tnt.path, path=opj('src', 'tnt'), reckless='auto')
        # link to metaanalysis
        metaanalysis.clone(source=ana.path, path=opj('src', 'ana{}'.format(i)),
                           reckless='auto')
        # simulate change in an input (but not raw) dataset
        create_tree(aligned.path,
                    {'modification{}.txt'.format(i): 'unique{}'.format(i)})
        aligned.save()

    # finally aggregate data
    aggregate = public.create('aggregate', description='aggregate data')
    aggregate.clone(source=aligned.path, path=opj('src', 'aligned'), reckless='auto')
def _ls_dataset(loc, fast=False, recursive=False, all_=False, long_=False): isabs_loc = isabs(loc) topdir = '' if isabs_loc else abspath(curdir) topds = Dataset(loc) dss = [topds] + ( [Dataset(opj(loc, sm)) for sm in topds.subdatasets(recursive=recursive, result_xfm='relpaths')] if recursive else []) dsms = [] for ds in dss: if not ds.is_installed(): dsm = AbsentRepoModel(ds.path) elif isinstance(ds.repo, AnnexRepo): dsm = AnnexModel(ds.repo) elif isinstance(ds.repo, GitRepo): dsm = GitModel(ds.repo) else: raise RuntimeError("Got some dataset which don't know how to handle %s" % ds) dsms.append(dsm) # adjust path strings for ds_model in dsms: #path = ds_model.path[len(topdir) + 1 if topdir else 0:] path = relpath(ds_model.path, topdir) if topdir else ds_model.path if not path: path = '.' ds_model.path = path dsms = sorted(dsms, key=lambda m: m.path) maxpath = max(len(ds_model.path) for ds_model in dsms) path_fmt = u"{ds.path!U:<%d}" % (maxpath + (11 if is_interactive() else 0)) # + to accommodate ansi codes pathtype_fmt = path_fmt + u" [{ds.type}]" full_fmt = pathtype_fmt + u" {ds.branch!N} {ds.describe!N} {ds.date!D}" if (not fast) or long_: full_fmt += u" {ds.clean!X}" fmts = { AbsentRepoModel: pathtype_fmt, GitModel: full_fmt, AnnexModel: full_fmt } if long_: fmts[AnnexModel] += u" {ds.annex_local_size!S}/{ds.annex_worktree_size!S}" formatter = LsFormatter() # weird problems happen in the parallel run -- TODO - figure it out # for out in Parallel(n_jobs=1)( # delayed(format_ds_model)(formatter, dsm, full_fmt, format_exc=path_fmt + " {msg!R}") # for dsm in dss): # print(out) for dsm in dsms: fmt = fmts[dsm.__class__] ds_str = format_ds_model(formatter, dsm, fmt, format_exc=path_fmt + u" {msg!R}") safe_print(ds_str) # workaround for explosion of git cat-file --batch processes # https://github.com/datalad/datalad/issues/1888 if dsm.repo is not None: dsm.repo.repo.close() del dsm.repo dsm.repo = None
def test_save(path):
    """Test `save` on files, staged changes, PWD-based calls and subdatasets.

    The sequence of repo mutations is order-dependent: each save is verified
    to leave the working tree clean via ``ok_clean_git``.
    """
    ds = Dataset(path)
    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("something")
    # stage directly in git, then save the staged change
    ds.repo.add("new_file.tst", git=True)
    ok_(ds.repo.dirty)
    ds.save("add a new file")
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("modify")
    ok_(ds.repo.dirty)
    ds.save("modified new_file.tst")
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    # save works without ds and files given in the PWD
    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("rapunzel")
    with chpwd(path):
        save("love rapunzel")
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    # and also without `-a` when things are staged
    with open(opj(path, "new_file.tst"), "w") as f:
        f.write("exotic")
    ds.repo.add("new_file.tst", git=True)
    with chpwd(path):
        save("love marsians")
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    files = ['one.txt', 'two.txt']
    for fn in files:
        with open(opj(path, fn), "w") as f:
            f.write(fn)
    ds.add([opj(path, f) for f in files])
    # superfluous call to save (add saved it already), should not fail
    # but report that nothing was saved
    assert_status('notneeded', ds.save("set of new files"))
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    # create subdataset
    subds = ds.rev_create('subds')
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    # modify subds
    with open(opj(subds.path, "some_file.tst"), "w") as f:
        f.write("something")
    subds.add('.')
    ok_clean_git(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    # Note/TODO: ok_clean_git is failing in direct mode, due to staged but
    # uncommited .datalad (probably caused within create)
    ok_(ds.repo.dirty)
    # ensure modified subds is committed
    ds.save()
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
    # now introduce a change downstairs
    subds.rev_create('someotherds')
    ok_clean_git(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    ok_(ds.repo.dirty)
    # and save via subdataset path
    ds.save('subds')
    ok_clean_git(path, annex=isinstance(ds.repo, AnnexRepo))
def test_recursive_save(path):
    """Test recursive `save` through a three-level dataset hierarchy.

    Covers: no-op saves reporting 'notneeded', picking up changes deep in
    sub-subdatasets with ``recursive=True``, explicit-path saves, and saving
    "upwards" to superdatasets via ``super_datasets=True``.
    """
    ds = Dataset(path).rev_create()
    # nothing to save
    assert_status('notneeded', ds.save())
    subds = ds.rev_create('sub')
    # subdataset presence already saved
    ok_clean_git(ds.path)
    subsubds = subds.rev_create('subsub')
    assert_equal(
        ds.subdatasets(recursive=True, fulfilled=True, result_xfm='paths'),
        [subds.path, subsubds.path])
    newfile_name = opj(subsubds.path, 'test')
    with open(newfile_name, 'w') as f:
        f.write('some')
    # saves the status change of the subdataset due to the subsubdataset addition
    assert_result_values_equal(
        ds.save(result_filter=is_ok_dataset),
        'path', [ds.path])
    # make the new file known to its dataset
    ds.add(newfile_name, save=False)
    # but remains dirty because of the uncommited file down below
    assert ds.repo.dirty
    # auto-add will save nothing deep down without recursive
    assert_status('notneeded', ds.save())
    assert ds.repo.dirty
    # with recursive pick up the change in subsubds
    assert_result_values_equal(
        ds.save(recursive=True, result_filter=is_ok_dataset),
        'path', [subsubds.path, subds.path, ds.path])
    # at this point the entire tree is clean
    ok_clean_git(ds.path)
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    # now we save recursively, nothing should happen
    res = ds.save(recursive=True)
    # we do not get any report from a subdataset, because we detect at the
    # very top that the entire tree is clean
    assert_result_count(res, 1)
    assert_result_count(res, 1, status='notneeded', action='save', path=ds.path)
    # now we introduce new files all the way down
    create_tree(subsubds.path, {"mike1": 'mike1'})
    # because we cannot say from the top if there is anything to do down below,
    # we have to traverse and we will get reports for all dataset, but there is
    # nothing actually saved
    res = ds.save(recursive=True)
    assert_result_count(res, 3)
    assert_status('notneeded', res)
    subsubds_indexed = subsubds.repo.get_indexed_files()
    assert_not_in('mike1', subsubds_indexed)
    # commits unchanged by any of the above no-op saves
    assert_equal(states, [d.repo.get_hexsha() for d in (ds, subds, subsubds)])
    unlink(opj(subsubds.path, 'mike1'))
    ok_clean_git(ds.path)
    # modify content in subsub and try saving
    testfname = newfile_name
    subsubds.unlock(testfname)
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    # the following should all do nothing
    # no auto_add
    assert_status('notneeded', ds.save())
    # no recursive
    assert_status('notneeded', ds.save())
    # an explicit target saves only the corresponding dataset
    assert_result_values_equal(
        save(path=[testfname]),
        'path', [subsubds.path])
    # plain recursive without any files given will save the beast
    assert_result_values_equal(
        ds.save(recursive=True, result_filter=is_ok_dataset),
        'path', [subds.path, ds.path])
    # there is nothing else to save
    assert_status('notneeded', ds.save(recursive=True))
    ok_clean_git(ds.path)
    # one more time and check that all datasets in the hierarchy are not
    # contaminated with untracked files
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    testfname = opj('sub', 'subsub', 'saveme2')
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    assert_status('notneeded', ds.save(recursive=True))
    newstates = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    for old, new in zip(states, newstates):
        assert_equal(old, new)
    assert ds.repo.dirty
    unlink(opj(ds.path, testfname))
    ok_clean_git(ds.path)
    # now let's check saving "upwards"
    create_tree(subds.path, {"testnew": 'smth', "testadded": "added"})
    subds.repo.add("testadded")
    indexed_files = subds.repo.get_indexed_files()
    assert subds.repo.dirty
    assert ds.repo.dirty
    assert not subsubds.repo.dirty
    create_tree(subsubds.path, {"testnew2": 'smth'})
    assert subsubds.repo.dirty
    # and indexed files didn't change
    assert_equal(indexed_files, subds.repo.get_indexed_files())
    ok_clean_git(subds.repo, untracked=['testnew'],
                 index_modified=['subsub'], head_modified=['testadded'])
    old_states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    subsubds.save(message="savingtestmessage", super_datasets=True)
    # this save actually didn't save anything in subsub (or anywhere),
    # because there were only untracked bits pending
    for old, new in zip(old_states,
                        [d.repo.get_hexsha() for d in (ds, subds, subsubds)]):
        assert_equal(old, new)
    # but now we are saving this untracked bit specifically
    subsubds.save(message="savingtestmessage", path=['testnew2'],
                  super_datasets=True)
    ok_clean_git(subsubds.repo)
    # but its super should have got only the subsub saved
    # not the file we created
    ok_clean_git(subds.repo, untracked=['testnew'], head_modified=['testadded'])
    # check commits to have correct messages
    # there are no more dedicated superdataset-save commits anymore, because
    # superdatasets get saved as part of the processed hierarchy and can contain
    # other parts in the commit (if so instructed)
    assert_equal(
        next(subsubds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')
    assert_equal(
        next(subds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')
    assert_equal(
        next(ds.repo.get_branch_commits('master')).message.rstrip(),
        'savingtestmessage')
    # and if we try to save while being within that subsubds path
    subsubds.unlock('testnew2')
    create_tree(subsubds.path, {"testnew2": 'smth2'})
    # trying to replicate https://github.com/datalad/datalad/issues/1540
    subsubds.save(message="saving new changes", all_updated=True)  # no super
    with chpwd(subds.path):
        # no explicit dataset is provided, but a path is provided
        save(path=['subsub'], message='saving sub', super_datasets=True)
    # super should get it saved too
    assert_equal(
        next(ds.repo.get_branch_commits('master')).message.rstrip(),
        'saving sub')
def test_add_source(path, url, ds_dir):
    """Test `add` with remote URL sources (currently disabled).

    NOTE: the SkipTest on the first line makes everything below unreachable;
    the body is preserved for when the functionality returns.
    """
    raise SkipTest('functionality is not supported ATM')
    from os import listdir
    from datalad.support.network import RI

    urls = [RI(url + f) for f in listdir(path)]
    ds = Dataset(ds_dir).create()
    eq_(len(ds.repo.get_annexed_files()), 0)
    # add a remote source to git => fail:
    assert_raises(NotImplementedError, ds.add, source=urls[0], to_git=True)
    # annex add a remote source:
    ds.add(source=urls[0])
    eq_(len(ds.repo.get_annexed_files()), 1)
    # add two remote source an give local names:
    ds.add(path=['local1.dat', 'local2.dat'], source=urls[1:3])
    annexed = ds.repo.get_annexed_files()
    eq_(len(annexed), 3)
    assert_in('local1.dat', annexed)
    assert_in('local2.dat', annexed)
    # add a second source for one of them
    ds.add(path='local1.dat', source=urls[3])
    eq_(len(annexed), 3)
    whereis_dict = ds.repo.whereis('local1.dat', output='full')
    # urls registered at remotes other than the local annex
    reg_urls = [
        whereis_dict[uuid]['urls'] for uuid in whereis_dict
        if not whereis_dict[uuid]['here']
    ]
    eq_(len(reg_urls), 1)  # one remote for 'local1.dat', that is not "here"
    eq_({str(urls[1]), str(urls[3])}, set(reg_urls[0]))
    # just to be sure compare to 'local2.dat':
    whereis_dict = ds.repo.whereis('local2.dat', output='full')
    reg_urls = [
        whereis_dict[uuid]['urls'] for uuid in whereis_dict
        if not whereis_dict[uuid]['here']
    ]
    eq_(len(reg_urls), 1)  # one remote for 'local2.dat', that is not "here"
    eq_([urls[2]], reg_urls[0])
    # provide more paths than sources:
    # report failure on non-existing 'local4.dat':
    result = ds.add(path=['local3.dat', 'local4.dat'], source=urls[4])
    ok_(
        all([
            r['success'] is False and r['note'] == 'not found'
            for r in result if r['file'] == 'local4.dat'
        ]))
    with open(opj(ds.path, 'local4.dat'), 'w') as f:
        f.write('local4 content')
    ds.add(path=['local3.dat', 'local4.dat'], source=urls[4])
    annexed = ds.repo.get_annexed_files()
    eq_(len(annexed), 5)
    assert_in('local3.dat', annexed)
    assert_in('local4.dat', annexed)
    # 'local3.dat' has a remote source
    whereis_dict = ds.repo.whereis('local3.dat', output='full')
    reg_urls = [
        whereis_dict[uuid]['urls'] for uuid in whereis_dict
        if not whereis_dict[uuid]['here']
    ]
    eq_(len(reg_urls), 1)  # one remote for 'local3.dat', that is not "here"
    eq_([urls[4]], reg_urls[0])
    # 'local4.dat' has no remote source
    whereis_dict = ds.repo.whereis('local4.dat', output='full')
    reg_urls = [
        whereis_dict[uuid]['urls'] for uuid in whereis_dict
        if not whereis_dict[uuid]['here']
    ]
    eq_(len(reg_urls), 0)
    # provide more sources than paths:
    ds.add('local5.dat', source=urls[5:])
    annexed = ds.repo.get_annexed_files()
    assert_in('local5.dat', annexed)
    eq_(len(annexed), 5 + len(urls[5:]))
    # Note: local4.dat didn't come from an url,
    # but 'local1.dat' consumes two urls
    eq_(len(annexed), len(urls))
    # all files annexed (-2 for '.git' and '.datalad'):
    eq_(len(annexed), len(listdir(ds.path)) - 2)
'062', '063', '066', '126', '127', '146' ] session_list = ['run001', 'run002', 'run003'] # subject_list = ['003'] # session_list = ['run001', 'run002'] frequency_list = ['10Hz', '20Hz', '40Hz'] output_dir = 'Stimulation_Preproc_OutputDir' working_dir = 'Stimulation_Preproc_WorkingDir' stimulation_preproc = Workflow(name='stimulation_preproc') stimulation_preproc.base_dir = opj(experiment_dir, working_dir) # ===================================================================================================== # In[3]: # to prevent nipype from iterating over the anat image with each func run, you need seperate # nodes to select the files # and this will solve the problem I have for almost 6 months # but notice that in the sessions, you have to iterate also over subject_id to get the {subject_id} var # Infosource - a function free node to iterate over the list of subject names infosource_anat = Node(IdentityInterface(fields=['subject_id']), name="infosource_anat") infosource_anat.iterables = [('subject_id', subject_list)] infosource_func = Node( IdentityInterface(fields=['subject_id', 'session_id', 'frequency_id']),
def test_bf1886(path): parent = Dataset(path).rev_create() sub = parent.rev_create('sub') ok_clean_git(parent.path) # create a symlink pointing down to the subdataset, and add it os.symlink('sub', opj(parent.path, 'down')) parent.add('down') ok_clean_git(parent.path) # now symlink pointing up os.makedirs(opj(parent.path, 'subdir', 'subsubdir')) os.symlink(opj(pardir, 'sub'), opj(parent.path, 'subdir', 'up')) parent.add(opj('subdir', 'up')) ok_clean_git(parent.path) # now symlink pointing 2xup, as in #1886 os.symlink(opj(pardir, pardir, 'sub'), opj(parent.path, 'subdir', 'subsubdir', 'upup')) parent.add(opj('subdir', 'subsubdir', 'upup')) ok_clean_git(parent.path) # simulatenously add a subds and a symlink pointing to it # create subds, but don't register it sub2 = rev_create(opj(parent.path, 'sub2')) os.symlink(opj(pardir, pardir, 'sub2'), opj(parent.path, 'subdir', 'subsubdir', 'upup2')) parent.add(['sub2', opj('subdir', 'subsubdir', 'upup2')]) ok_clean_git(parent.path) # full replication of #1886: the above but be in subdir of symlink # with no reference dataset sub3 = rev_create(opj(parent.path, 'sub3')) os.symlink(opj(pardir, pardir, 'sub3'), opj(parent.path, 'subdir', 'subsubdir', 'upup3')) # need to use absolute paths with chpwd(opj(parent.path, 'subdir', 'subsubdir')): rev_save([ opj(parent.path, 'sub3'), opj(parent.path, 'subdir', 'subsubdir', 'upup3') ]) # in contrast to `add` only operates on a single top-level dataset # although it is not specified, it get's discovered based on the PWD # the logic behind that feels a bit shaky # consult discussion in https://github.com/datalad/datalad/issues/3230 # if this comes up as an issue at some point ok_clean_git(parent.path)
def initialize_options(self): self.manpath = opj('build', 'man') self.rstpath = opj('docs', 'source', 'generated', 'man') self.parser = 'datalad.cmdline.main:setup_parser'
for f in findall(opj('datalad_neuroimaging', subdir)) if splitext(f)[-1].lstrip('.') in extensions ] # extension version version = get_version() cmdclass = { 'build_manpage': BuildManPage, 'build_examples': BuildRSTExamplesFromScripts, } # PyPI doesn't render markdown yet. Workaround for a sane appearance # https://github.com/pypa/pypi-legacy/issues/148#issuecomment-227757822 README = opj(dirname(__file__), 'README.md') try: import pypandoc long_description = pypandoc.convert(README, 'rst') except (ImportError, OSError) as exc: # attempting to install pandoc via brew on OSX currently hangs and # pypandoc imports but throws OSError demanding pandoc print( "WARNING: pypandoc failed to import or thrown an error while converting" " README.md to RST: %r .md version will be used as is" % exc) long_description = open(README).read() requires = { 'devel-docs': [ # used for converting README.md -> .rst for long_description 'pypandoc',
def initialize_options(self): self.expath = opj('docs', 'examples') self.rstpath = opj('docs', 'source', 'generated', 'examples')
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about
      the discovered datasets. Specifically, for each discovered dataset
      there will be in item with its path under the key (path) of the
      respective superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will
      appear under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accummulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths)  # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = assure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds
                else includeds.intersection(downward_targets))
    # keep only undiscovered targets that were explicitly declared as
    # (possibly non-existing) subdatasets beneath the current trace tip
    undiscovered_ds = [
        t for t in undiscovered_ds
        if includeds and
        path_is_subpath(t, current_trace[-1]) and
        t in includeds
    ]
    if filematch or basepath in targetpaths or undiscovered_ds:
        # commit the trace: each visited dataset gets its successor recorded
        # as a subdataset
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            # declared-but-unreached subdatasets hang off the deepest dataset
            spec[current_trace[-1]] = spec.get(
                current_trace[-1], set()).union(undiscovered_ds)
def test_within_ds_file_search(path):
    """End-to-end test of metadata search within a single dataset.

    Builds a BIDS-like dataset with two NIfTI files, aggregates metadata,
    then checks the 'textblob' and 'autofield' search modes, including the
    exact set of index keys reported by ``show_keys='name'``.
    """
    try:
        import nibabel
    except ImportError:
        # nifti1 extractor needs nibabel; skip the whole test without it
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('nifti1.nii.gz', opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz', opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', 'files', src),
            opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.key
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.datatype
bids.description
"""
    # the 'extension' key only appears with newer pybids versions
    if external_versions['bids'] >= '0.9':
        target_out += "bids.extension\n"
    target_out += """\
bids.fundedby
bids.license
bids.name
bids.subject.age(years)
bids.subject.gender
bids.subject.handedness
bids.subject.hearing_problems_current
bids.subject.id
bids.subject.language
bids.suffix
bids.task
datalad_core.id
datalad_core.refcommit
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        # so we will use diff
        diff = list(unified_diff(target_out.splitlines(), cmo.out.splitlines()))
        assert_in(target_out, cmo.out, msg="Diff: %s" % os.linesep.join(diff))

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            # multi word query implies AND
            ('textblob',
             ['bold', 'female'],
             opj('sub-03', 'func', 'sub-03_task-some_bold.nii.gz'),
             'meta', 'female'),
            # report which field matched with auto-field
            ('autofield',
             'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.subject.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield',
             ['bids.suffix:bold', 'bids.subject.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.suffix', 'bold'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (be could be configured otherwise
            assert_result_count(res, 1)
        else:
            # the rest has always a file and the dataset, because they carry metadata in
            # the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from, critical for
                # discovering related content
                dsid=ds.id)
        assert_result_count(res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)
def _path_rel2file(p): return opj(dirname(__file__), p)
def pipeline(bucket,
             prefix=None,
             no_annex=False,
             tag=True,
             skip_problematic=False,
             to_http=False,
             rename=None,
             directory=None,
             archives=False,
             allow_dirty=False,
             backend='MD5E',
             drop=False,
             drop_force=False,
             strategy='commit-versions',
             exclude=None,
             **kwargs):
    """Pipeline to crawl/annex an arbitrary bucket

    Parameters
    ----------
    bucket : str
    prefix : str, optional
      prefix within the bucket
    no_annex : bool, optional
      do not annex files, add them directly to git
    tag : bool, optional
      tag "versions"
    skip_problematic : bool, optional
      pass to Annexificator
    to_http : bool, optional
      Convert s3:// urls to corresponding generic http:// . So to be used
      for resources which are publicly available via http
    rename : optional
      renaming specification passed to `get_replacement_dict`
    directory : {subdataset}, optional
      What to do when encountering a directory.  'subdataset' would
      initiate a new sub-dataset at that directory
    archives : bool, optional
      use the simple_with_archives pipeline to extract archives
    allow_dirty : bool, optional
      pass to Annexificator
    backend : str, optional
      git-annex key backend
    strategy : {'commit-versions', 'naive'}, optional
      What strategy to use whenever processing "delete" event,
      See `crawl_s3` node for more information.
    drop : bool, optional
      Drop all the files whenever done crawling
    drop_force : bool, optional
      force the drop (passed to `annex.drop`)
    exclude : str, optional
      Regular expression to be passed to s3_crawl to exclude some files
    **kwargs:
      passed into simple_with_archives.pipeline
    """
    lgr.info("Creating a pipeline for the %s bucket", bucket)
    # TODO: see if we could make it more generic!!!
    # drop = assure_bool(drop)
    # drop_force = assure_bool(drop_force)
    annex_kw = {}

    # pipeline options may arrive as strings (e.g. from a crawl config),
    # so every boolean flag is coerced explicitly
    to_http = assure_bool(to_http)
    tag = assure_bool(tag)
    archives = assure_bool(archives)
    no_annex = assure_bool(no_annex)
    allow_dirty = assure_bool(allow_dirty)
    drop = assure_bool(drop)
    drop_force = assure_bool(drop_force)

    if not to_http:
        annex_kw['special_remotes'] = [DATALAD_SPECIAL_REMOTE]

    annex = Annexificator(
        create=False,  # must be already initialized etc
        backend=backend,
        no_annex=no_annex,
        skip_problematic=skip_problematic,
        allow_dirty=allow_dirty,
        # Primary purpose of this one is registration of all URLs with our
        # upcoming "ultimate DB" so we don't get to git anything
        # options=["-c", "annex.largefiles=exclude=CHANGES* and exclude=changelog.txt and exclude=dataset_description.json and exclude=README* and exclude=*.[mc]"]
        **annex_kw
    )

    s3_actions = {'commit': annex.finalize(tag=tag), 'annex': annex}
    s3_switch_kw = {}
    recursive = True
    if directory:
        if directory == 'subdataset':
            # do not recurse into directories ourselves; each directory
            # becomes its own sub-dataset with its own crawl config
            new_prefix = '%(filename)s/'
            if prefix:
                new_prefix = opj(prefix, new_prefix)
            s3_actions['directory'] = [
                # for initiate_dataset we should replicate filename as handle_name, prefix
                assign({
                    'prefix': new_prefix,
                    'dataset_name': '%(filename)s'
                }, interpolate=True),
                annex.initiate_dataset(
                    template='simple_s3',
                    data_fields=['prefix'],
                    add_fields={
                        'bucket': bucket,
                        'to_http': to_http,
                        'skip_problematic': skip_problematic,
                    })
            ]
            s3_switch_kw['missing'] = 'skip'  # ok to not remove
            recursive = False
        else:
            raise ValueError("Do not know how to treat %s" % directory)
    else:
        s3_actions['remove'] = annex.remove

    incoming_pipeline = [
        crawl_s3(bucket, prefix=prefix, strategy=strategy, repo=annex.repo,
                 recursive=recursive, exclude=exclude),
    ]

    from ..nodes.misc import debug

    if to_http:
        incoming_pipeline.append(sub_s3_to_http)

    if rename:
        incoming_pipeline += [
            sub({'filename': get_replacement_dict(rename)}, ok_missing=True)
        ]

    # dispatch each crawled record to the matching annex action
    incoming_pipeline.append(switch('datalad_action', s3_actions, **s3_switch_kw))

    if archives:
        pipeline = swa_pipeline(incoming_pipeline=incoming_pipeline,
                                annex=annex, **kwargs)
    else:
        pipeline = incoming_pipeline

    if drop:
        pipeline.append(annex.drop(all=True, force=drop_force))

    return pipeline
def _dataset_auto_get(self, filepath):
    """Verify that filepath is under annex, and if so and not present - get it

    No-op unless auto-get is enabled (`self._autoget`).  Locates the repo
    containing `filepath` (optionally via `self._repos_cache`) and triggers
    `annex get` when the file's content is not locally present.
    """
    if not self._autoget:
        return
    # if filepath is not there at all (program just "checked" if it could access it
    if not lexists(filepath):
        lgr.log(2, " skipping %s since it is not there", filepath)
        return
    # deduce directory for filepath
    filedir = dirname(filepath)
    annex = None
    if self._repos_cache is not None:
        # NOTE(review): `pathsep` is used here to split/join path
        # *components* — confirm at the import site that it is bound to the
        # path separator (os.sep), not os.pathsep
        filedir_parts = filedir.split(pathsep)
        # ATM we do not expect subdatasets under .datalad, so we could take the top
        # level dataset for that
        try:
            filedir = pathsep.join(
                filedir_parts[:filedir_parts.index(DATALAD_DOTDIR)])
        except ValueError:
            # would happen if no .datalad
            pass
        try:
            annex = self._repos_cache[filedir]
        except KeyError:
            pass
    if annex is None:
        try:
            # TODO: verify logic for create -- we shouldn't 'annexify' non-annexified
            # see https://github.com/datalad/datalad/issues/204
            annex = get_repo_instance(filedir)
            lgr.log(2, "Got the repository %s id:%s containing %s",
                    annex, id(annex), filedir)
        except (RuntimeError, InvalidGitRepositoryError) as e:
            # must be not under annex etc
            return
        if self._repos_cache is not None:
            self._repos_cache[filedir] = annex
    if not isinstance(annex, AnnexRepo):
        # not an annex -- can do nothing
        lgr.log(2, " skipping %s since the repo is not annex", filepath)
        return
    # since Git/AnnexRepo functionality treats relative paths relative to the
    # top of the repository and might be outside, get a full path
    if not isabs(filepath):
        filepath = opj(getpwd(), filepath)

    # "quick" check first if under annex at all
    try:
        # might fail. TODO: troubleshoot when it does e.g.
        # datalad/tests/test_auto.py:test_proxying_open_testrepobased
        under_annex = annex.is_under_annex(filepath, batch=True)
    except Exception as exc:  # MIH: really? what if MemoryError
        lgr.log(5, " cannot determine if %s under annex: %s",
                filepath, exc_str(exc))
        # unknown -- proceed optimistically and attempt the get below
        under_annex = None
    # either it has content
    if (under_annex or under_annex is None) and not annex.file_has_content(filepath):
        lgr.info("AutomagicIO: retrieving file content of %s", filepath)
        out = annex.get(filepath)
        if out and not out.get('success', False):
            # to assure that it is present and without trailing/leading new lines
            out['note'] = out.get('note', '').strip()
            lgr.error("Failed to retrieve %(file)s: %(note)s", out)
def sorted_files(dout):
    """Return a (sorted) list of files under `dout`

    Paths are returned relative to `dout`; any directory whose path
    contains '.git' is skipped.
    """
    # hoist the prefix length out of the loop; feeding a generator to
    # sorted() avoids the quadratic sum(list_of_lists, []) flattening
    strip = len(dout) + 1
    return sorted(
        opj(r, f)[strip:]
        for r, _dirs, files in os.walk(dout)
        if '.git' not in r
        for f in files)
def call(self, runscript, workdir, out, err, runflags):
    """call(runscript, workdir, out, err, runflags)
    Submit *runscript* to the queueing system with *workdir* as the working
    directory. Redirect output and error streams to *out* and *err*,
    respectively. *runflags* stores various submit command options.

    The submit command has the following structure::

        <commands.submit>_<workdir>_{workdir}_<error>_{err}[_<output>_{out}][FLAGS]_{runscript}

    Underscores denote spaces, parts in pointy brackets correspond to
    ``settings`` entries, parts in curly brackets to :meth:`call` arguments,
    square brackets contain optional parts. Output part is added if *out* is
    not ``None``. This is handled automatically based on
    ``runscript.stdout_redirect`` value in job's ``settings``.

    ``FLAGS`` part is built based on *runflags* argument, which is a
    |Settings| instance storing |run| keyword arguments. For every
    *(key,value)* pair in *runflags* the string ``_-key_value`` is appended
    to ``FLAGS`` **unless** the *key* is a special key occurring in
    ``commands.special``. In that case ``_<commands.special.key>value`` is
    used (mind the lack of space in between).

    For example, a |Settings| instance defining interaction with SLURM has
    the following entries::

        workdir = '-D'
        output = '-o'
        error = '-e'
        special.nodes = '-N '
        special.walltime = '-t '
        special.memory = '--mem='
        special.queue = '-p '
        commands.submit = 'sbatch'
        commands.check = 'squeue'

    The submit command produced by::

        gr = GridRunner(parallel=True, maxjobs=4, grid='slurm')
        j.run(jobrunner=gr, queue='short', nodes=2, J='something', O='')

    will be:

    .. code-block:: none

        sbatch -D {workdir} -e {err} -o {out} -p short -N 2 -J something -O {runscript}

    In certain queueing systems some flags don't have a short form with
    semantics ``-key value``. For example, in SLURM the flag
    ``--nodefile=value`` has a short form ``-F value``, but the flag
    ``--export=value`` does not. One can still use such a flag using the
    special keys logic::

        gr = GridRunner(parallel=True, maxjobs=4, grid='slurm')
        gr.settings.special.export = '--export='
        j.run(jobrunner=gr, queue='short', export='value')

    That results in the command::

        sbatch -D {workdir} -e {err} -o {out} -p short --export=value {runscript}

    The submit command is then executed and the output returned by it is
    used to determine the submitted job's ID. The value stored in
    ``commands.getid`` is used for that purpose. It should be a function
    taking a single string (the whole output of the submit command) and
    returning a string with job's ID.

    The submitted job's ID is then added to ``_active_jobs`` dictionary,
    with the key being job's ID and the value being an instance of
    :class:`threading.Lock`. This lock is used to signal the fact that the
    job is finished and the thread handling it can continue. Then the
    :meth:`_check_queue` method starts the thread querying the queue and
    unlocking finished jobs.

    Since it is difficult to automatically obtain job's exit code, the
    returned value is 0 (or 1, if the submit command failed). From |run|
    perspective it means that a job executed with |GridRunner| is *crashed*
    only if it never entered the queue (usually due to improper submit
    command).

    .. note::

        This method is used automatically during |run| and should never be
        explicitly called in your script.
    """
    s = self.settings
    # mandatory parts: submit command, working dir, stderr redirection
    cmd = ' '.join([s.commands.submit, s.workdir, workdir, s.error, err])
    if out is not None:
        cmd += ' '+s.output+' '+out
    for k,v in runflags.items():
        if k in s.special:
            # special flags carry their own separator (possibly none)
            cmd += ' '+s.special[k]+str(v)
        else:
            cmd += ' -'+k+' '+str(v)
    cmd += ' ' + opj(workdir,runscript)

    log('Submitting {} with command {}'.format(runscript, cmd), 5)
    process = saferun(cmd.split(' '), stdout=PIPE, stderr=PIPE)
    subout = process.stdout.decode()
    log('Output of {} submit command: {}'.format(runscript, subout), 5)

    # commands.getid extracts the job ID from the submit command's output
    jobid = s.commands.getid(subout)
    if jobid is None:
        log('Submitting of {} failed. Stderr of submit command:\n{}'.format(runscript, process.stderr.decode()), 1)
        return 1
    log('{} submitted successfully as job {}'.format(runscript, jobid), 3)

    # register the job and block until the queue-watcher thread flags it done
    event = threading.Event()
    with self._active_lock:
        self._active_jobs[jobid] = event
    self._check_queue()
    event.wait()
    log('Execution of {} finished'.format(runscript), 5)
    return 0
def __call__(dataset, pattern, ref_dir='.', makedirs=False):
    """Exclude paths matching `pattern` from annex'ing in `dataset`.

    Appends ``<pattern> annex.largefiles=nothing`` rules to the
    ``.gitattributes`` file in `ref_dir` (relative to the dataset root)
    and commits that file to git.  Yields standard datalad result records.
    """
    # could be extended to accept actual largefile expressions
    from os.path import join as opj
    from os.path import isabs
    from os.path import exists
    from os import makedirs as makedirsfx
    from datalad.distribution.dataset import require_dataset
    from datalad.support.annexrepo import AnnexRepo
    from datalad.utils import assure_list

    pattern = assure_list(pattern)
    ds = require_dataset(dataset, check_installed=True,
                         purpose='no_annex configuration')

    res_kwargs = dict(
        path=ds.path,
        type='dataset',
        action='no_annex',
    )

    # all the ways we refused to cooperate
    if not isinstance(ds.repo, AnnexRepo):
        yield dict(
            res_kwargs,
            status='notneeded',
            message='dataset has no annex')
        return
    if any(isabs(p) for p in pattern):
        yield dict(
            res_kwargs,
            status='error',
            message=('path pattern for `no_annex` configuration must be relative paths: %s',
                     pattern))
        return
    if isabs(ref_dir):
        yield dict(
            res_kwargs,
            status='error',
            message=('`ref_dir` for `no_annex` configuration must be a relative path: %s',
                     ref_dir))
        return

    gitattr_dir = opj(ds.path, ref_dir)
    if not exists(gitattr_dir):
        if makedirs:
            makedirsfx(gitattr_dir)
        else:
            yield dict(
                res_kwargs,
                status='error',
                message='target directory for `no_annex` does not exist (consider makedirs=True)')
            return

    gitattr_file = opj(gitattr_dir, '.gitattributes')
    with open(gitattr_file, 'a') as fp:
        for p in pattern:
            fp.write('{} annex.largefiles=nothing\n'.format(p))
    yield dict(res_kwargs, status='ok')

    # commit via the validated Dataset instance `ds`, NOT the raw `dataset`
    # argument: `dataset` may be a plain path string with no `add` method
    for r in ds.add(
            gitattr_file,
            to_git=True,
            message="[DATALAD] exclude paths from annex'ing",
            result_filter=None,
            result_xfm=None):
        yield r
def job_runner(email, dir_cache, seqs=None, run_id='', parallel_run_count=1,
               max_title_a_len=0, max_run_id_len=0):
    """Run InterProScan 5.

    Submits the sequences in `seqs` to the remote service (throttled so all
    parallel runners together stay within the job limit), polls their
    status, retries transient failures, and persists progress to a pickle
    cache file under `dir_cache` so an interrupted run can resume.

    Returns the `jobs` bookkeeping dict (queue/running/finished/...).
    """
    # the remote service limits concurrent jobs; split the budget between
    # all parallel runner instances
    max_jobs = int(30 / parallel_run_count)
    delay = 0.333
    cfp = opj(dir_cache, 'ips5_cache_running_' + run_id)

    def load():
        # restore bookkeeping state from the on-disk cache
        with open(cfp, 'rb') as f:
            c = pickle.load(f)
        return c

    def dump(c):
        # persist bookkeeping state so an interrupted run can resume
        with open(cfp, 'wb') as f:
            pickle.dump(c, f, protocol=PICKLE_PROTOCOL)

    seqs_orig = deepcopy(seqs)
    seqs = deepcopy(seqs)

    jobs = None

    if ope(cfp):
        # resume a previous (interrupted) run
        jobs = load()
    else:
        if seqs is not None:
            jobs = {'queue': seqs, 'running': OrderedDict(),
                    'finished': OrderedDict(), 'error': OrderedDict(),
                    'failure': OrderedDict(), 'not_found': OrderedDict(),
                    'other': OrderedDict()}
            dump(jobs)
        else:
            print('No sequences provided.')
            sys.exit(0)

    ##########################################################################

    queue = jobs['queue']
    running = jobs['running']
    finished = jobs['finished']
    # error = jobs['error']
    failure = jobs['failure']
    # not_found = jobs['not_found']
    other = jobs['other']

    retry_list = list()

    queue_size = len(seqs_orig)

    busy = True
    submit_message = True
    while busy:
        dump(jobs)
        sleep(delay)

        if len(queue) > 0:
            if len(running) < max_jobs:
                if submit_message is True:
                    pass
                    # print()
                    # print('Submitting jobs: ' + run_id)
                    # print()
                job_status = 'SUBMITTED '
                title = list(queue.keys()).pop()
                sequence = queue.pop(title)
                job_id = submit_job(email, title, sequence)
                if job_id is None:
                    # submission failed; put the sequence back for retry
                    job_status = 'WILL_RETRY'
                    queue[title] = sequence
                    job_id = ''
                else:
                    running[title] = job_id

                titles_ab = split_seq_defn(title)
                title_a = titles_ab[0]

                msg = (job_status + ' ' +
                       title_a.ljust(max_title_a_len) +
                       run_id.ljust(max_run_id_len) +
                       ' ' * 5 + job_id)

                Log.msg(msg)

        if len(running) < max_jobs and len(queue) > 0:
            # capacity left and work pending: keep submitting before polling
            submit_message = False
            continue

        if len(running) > 0:
            submit_message = True
            # print()
            # print('Looking for finished jobs: ' + run_id)
            # print()
            finished_jobs = False
            sleep(delay + 7)
            job_statuses = {}
            for title in running:
                sleep(delay)
                job_id = running[title]
                job_status = status(job_id)
                job_statuses[title] = {'job_id': job_id,
                                       'job_status': job_status}
                titles_ab = split_seq_defn(title)
                title_a = titles_ab[0]

                # ToDo: Refactor
                if job_status == 'RUNNING':
                    Log.msg(' ' * 10 + '- ' +
                            title_a.ljust(max_title_a_len) +
                            run_id.ljust(max_run_id_len) +
                            ' ' * 5 + job_id)
                else:
                    Log.msg(' ' * 10 + '+ ' +
                            title_a.ljust(max_title_a_len) +
                            run_id.ljust(max_run_id_len) +
                            ' ' * 5 + job_id)
                    finished_jobs = True

            if finished_jobs is True:
                # print()
                pass
            else:
                continue

            for title in job_statuses:
                job_id = job_statuses[title]['job_id']
                job_status = job_statuses[title]['job_status']

                if job_status == 'RUNNING':
                    pass
                elif job_status == 'FINISHED':
                    job_id = running.pop(title)
                    if 'error' in result_types(job_id):
                        # finished but produced an error report; retry up
                        # to 3 times before recording as failure
                        job_status = 'FAILURE'
                        failure[title] = job_id
                        if retry_list.count(title) < 3:
                            job_status = 'WILL_RETRY'
                            queue[title] = seqs_orig[title]
                            retry_list.append(title)
                    else:
                        finished[title] = job_id
                elif job_status == 'FAILURE':
                    job_id = running.pop(title)
                    failure[title] = job_id
                elif job_status == 'ERROR':
                    continue
                    # job_id = running.pop(title)
                    # error[title] = job_id
                elif job_status == 'NOT_FOUND':
                    # the service lost track of the job; resubmit it
                    job_status = 'WILL_RETRY'
                    job_id = running.pop(title)
                    queue[title] = seqs_orig[title]
                    # not_found[title] = job_id
                else:
                    job_id = running.pop(title)
                    other[title] = job_id

                if job_status != 'RUNNING':
                    progress = round((len(finished) / queue_size) * 100)
                    progress_str = '{:3d}'.format(progress) + '%'
                    titles_ab = split_seq_defn(title)
                    title_a = titles_ab[0]
                    job_status_msg = job_status.ljust(10)
                    msg = (job_status_msg + ' ' +
                           title_a.ljust(max_title_a_len) +
                           run_id.ljust(max_run_id_len) +
                           progress_str.rjust(4) + ' ' + job_id)
                    Log.msg(msg)

        if len(running) == 0 and len(queue) == 0:
            busy = False

    if ope(cfp):
        # all done: the resume cache is no longer needed
        remove(cfp)

    return jobs
def load_information_from_description_file(module, mod_path=None):
    """Load and normalize a module's manifest descriptor.

    :param module: The name of the module (sale, purchase, ...)
    :param mod_path: Physical path of the module; if not provided it is
        resolved from the module name
    :return: dict with the manifest contents merged over default values,
        or an empty dict if no manifest file was found
    """
    if not mod_path:
        mod_path = get_module_path(module, downloaded=True)
    manifest_file = module_manifest(mod_path)
    if manifest_file:
        # default values for descriptor
        info = {
            'application': False,
            'author': 'Odoo S.A.',
            'auto_install': False,
            'category': 'Uncategorized',
            'depends': [],
            'description': '',
            'icon': get_module_icon(module),
            'installable': True,
            'license': 'LGPL-3',
            'post_load': None,
            'version': '1.0',
            'web': False,
            'sequence': 100,
            'summary': '',
            'website': '',
        }
        # iter(list, None) yields a fresh empty list on every next() call,
        # so each of these keys gets its own independent [] default
        info.update(zip(
            'depends data demo test init_xml update_xml demo_xml'.split(),
            iter(list, None)))

        f = tools.file_open(manifest_file, mode='rb')
        try:
            # the manifest is a Python literal (dict); literal_eval keeps
            # this safe against arbitrary code execution
            info.update(ast.literal_eval(pycompat.to_text(f.read())))
        finally:
            f.close()

        if not info.get('description'):
            # fall back to a README file for the description
            readme_path = [opj(mod_path, x) for x in README
                           if os.path.isfile(opj(mod_path, x))]
            if readme_path:
                with tools.file_open(readme_path[0]) as fd:
                    info['description'] = fd.read()

        # auto_install is set to `False` if disabled, and a set of
        # auto_install dependencies otherwise. That way, we can set
        # auto_install: [] to always auto_install a module regardless of its
        # dependencies
        auto_install = info.get('auto_install', info.get('active', False))
        if isinstance(auto_install, collections.abc.Iterable):
            info['auto_install'] = set(auto_install)
            non_dependencies = info['auto_install'].difference(info['depends'])
            assert not non_dependencies,\
                "auto_install triggers must be dependencies, found " \
                "non-dependencies [%s] for module %s" % (
                    ', '.join(non_dependencies), module
                )
        elif auto_install:
            info['auto_install'] = set(info['depends'])
        else:
            info['auto_install'] = False

        info['version'] = adapt_version(info['version'])
        return info

    _logger.debug('module %s: no manifest file found %s', module, MANIFEST_NAMES)
    return {}
def is_really_module(name):
    """Return True if directory `name` (under the enclosing scope's `dir`)
    contains a module manifest file, False otherwise.

    Previously the function fell through and returned None when no manifest
    was found; it now always returns an explicit bool (callers relying on
    truthiness are unaffected).
    """
    # NOTE: `dir` here is the addons path from the enclosing scope,
    # not the builtin
    return any(
        os.path.isfile(opj(dir, name, mname))
        for mname in MANIFEST_NAMES)
def _import_module(self, module, path, force=False):
    """Register and load an already-downloaded module located at `path`.

    Validates the module's manifest and dependencies, records it as
    installed, loads its data files, and mirrors its static assets as
    attachments.  Returns False if no manifest was found, True on success.
    Raises UserError on unmet dependencies.
    """
    known_mods = self.search([])
    known_mods_names = {m.name: m for m in known_mods}
    installed_mods = [m.name for m in known_mods if m.state == 'installed']

    terp = load_information_from_description_file(module, mod_path=path)
    if not terp:
        return False
    values = self.get_values_from_terp(terp)
    if 'version' in terp:
        values['latest_version'] = terp['version']

    unmet_dependencies = set(terp['depends']).difference(installed_mods)

    if unmet_dependencies:
        if (unmet_dependencies == set(['web_studio']) and
                _is_studio_custom(path)):
            err = _("Studio customizations require Studio")
        else:
            err = _("Unmet module dependencies: \n\n - %s") % '\n - '.join(
                known_mods.filtered(lambda mod: mod.name in unmet_dependencies).mapped('shortdesc')
            )
        raise UserError(err)
    elif 'web_studio' not in installed_mods and _is_studio_custom(path):
        raise UserError(_("Studio customizations require the Eagle Studio app."))

    mod = known_mods_names.get(module)
    if mod:
        # module already known: mark installed and update its record
        mod.write(dict(state='installed', **values))
        mode = 'update' if not force else 'init'
    else:
        assert terp.get('installable', True), "Module not installable"
        self.create(dict(name=module, state='installed', imported=True, **values))
        mode = 'init'

    # load the module's declared data files
    for kind in ['data', 'init_xml', 'update_xml']:
        for filename in terp[kind]:
            ext = os.path.splitext(filename)[1].lower()
            if ext not in ('.xml', '.csv', '.sql'):
                _logger.info("module %s: skip unsupported file %s", module, filename)
                continue
            _logger.info("module %s: loading %s", module, filename)
            noupdate = False
            if ext == '.csv' and kind in ('init', 'init_xml'):
                noupdate = True
            pathname = opj(path, filename)
            idref = {}
            convert_file(self.env.cr, module, filename, idref,
                         mode=mode, noupdate=noupdate, kind=kind, pathname=pathname)

    # mirror the module's static/ tree as binary attachments so the
    # assets are served like those of regular addons
    path_static = opj(path, 'static')
    IrAttachment = self.env['ir.attachment']
    if os.path.isdir(path_static):
        for root, dirs, files in os.walk(path_static):
            for static_file in files:
                full_path = opj(root, static_file)
                with open(full_path, 'rb') as fp:
                    data = base64.b64encode(fp.read())
                url_path = '/{}{}'.format(module,
                                          full_path.split(path)[1].replace(os.path.sep, '/'))
                if not isinstance(url_path, str):
                    url_path = url_path.decode(sys.getfilesystemencoding())
                filename = os.path.split(url_path)[1]
                values = dict(
                    name=filename,
                    url=url_path,
                    res_model='ir.ui.view',
                    type='binary',
                    datas=data,
                )
                # update an existing attachment for this URL, else create
                attachment = IrAttachment.search([('url', '=', url_path),
                                                  ('type', '=', 'binary'),
                                                  ('res_model', '=', 'ir.ui.view')])
                if attachment:
                    attachment.write(values)
                else:
                    IrAttachment.create(values)
    return True
else: import pickle from six import string_types from os.path import join as opj, exists from os.path import dirname from importlib import import_module from datalad.utils import swallow_logs from datalad.utils import assure_dir from datalad.support.json_py import load as jsonload from datalad.dochelpers import exc_str from datalad.log import lgr # common format metadata_filename = 'meta.json' metadata_basepath = opj('.datalad', 'meta') # XXX Could become dataset method def get_metadata_type(ds, guess=False): """Return the metadata type(s)/scheme(s) of a dataset Parameters ---------- ds : Dataset Dataset instance to be inspected guess : bool Whether to try to auto-detect the type if no metadata type setting is found. All supported metadata schemes are tested in alphanumeric order. Returns
def check_resource_path(mod_path, *args):
    """Join `mod_path` with `args` and return the result if it exists.

    Returns False when the assembled path does not exist on disk.
    """
    candidate = opj(mod_path, *args)
    return candidate if os.path.exists(candidate) else False
"""HTCrystalBall - A crystal ball that lets you peek into the future.""" import logging from os.path import expanduser, join as opj from htcrystalball._version import __version__ SLOTS_CONFIGURATION = opj(expanduser('~'), '.htcrystalball') # External (root level) logging level logging.basicConfig(level=logging.ERROR, format='WARNING: %(message)s') # Internal logging level LOGGER = logging.getLogger('crystal_balls') LOGGER.setLevel(level=logging.DEBUG) __all__ = [ '__version__', 'SLOTS_CONFIGURATION', 'LOGGER' ]