def __call__(self, fn):
    "decorate fn: collect per-class / per-method CSS, then render the matching template"
    k = inspect.stack()[1][3]
    klass = rx.split(k)[1]  # we assume consistent capitalization of class / object names
    fname = fn.func_name
    css = []
    csspath = '../htdocs/site/%s_%s.css' % (klass, fname)
    if lexists(csspath):
        css.append('/site/%s_%s.css' % (klass, fname))
    csspath = '../htdocs/site/%s.css' % klass
    if lexists(csspath):
        css.append('/site/%s.css' % klass)
    name = '%s_%s.xml' % (klass, fname)
    template = Render(name, *self.a, **self.k)

    def function(inner_self, req):
        "a typical template"
        req['css'] = css
        req['meta'] = '%s_%d_%s' % (klass, getattr(inner_self, 'uid', 0), fname)
        fn(inner_self, req)
        return template(inner_self, req)

    return function
def test_soft_link(self):
    path1_real_file = join(self.test_dir, 'path1_real_file')
    path2_symlink = join(self.test_dir, 'path2_symlink')
    touch(path1_real_file)
    assert isfile(path1_real_file)
    assert not islink(path1_real_file)

    symlink(path1_real_file, path2_symlink)
    assert exists(path2_symlink)
    assert lexists(path2_symlink)
    assert islink(path2_symlink)
    assert readlink(path2_symlink).endswith(path1_real_file)
    # for win py27, readlink actually gives something that starts with \??\
    #   \??\c:\users\appveyor\appdata\local\temp\1\c571cb0c\path1_real_file
    assert stat_nlink(path1_real_file) == stat_nlink(path2_symlink) == 1

    os.unlink(path1_real_file)
    assert not isfile(path1_real_file)
    assert not lexists(path1_real_file)
    assert not exists(path1_real_file)
    assert lexists(path2_symlink)
    if not (on_win and PY2):
        # I guess I'm not surprised this exists vs lexists is different for win py2
        # consider adding a fix in the future
        assert not exists(path2_symlink)

    os.unlink(path2_symlink)
    assert not lexists(path2_symlink)
    assert not exists(path2_symlink)
def create_payloads(self):
    '''
    Create all missing data payloads in the current directory.
    Doesn't compute md5 during creation because the tarball can be created manually.
    Also creates a symlink to the versioned payload.
    '''
    arrow("Creating payloads")
    for payload_name in self.select_payloads():
        paydesc = self.describe_payload(payload_name)
        if exists(paydesc["link_path"]):
            continue
        arrow(payload_name, 1)
        try:
            # create non versioned payload file
            if not exists(paydesc["dest_path"]):
                if paydesc["isdir"]:
                    self.create_payload_tarball(paydesc["dest_path"],
                                                paydesc["source_path"],
                                                paydesc["compressor"])
                else:
                    self.create_payload_file(paydesc["dest_path"],
                                             paydesc["source_path"],
                                             paydesc["compressor"])
            # create versioned payload file
            if lexists(paydesc["link_path"]):
                unlink(paydesc["link_path"])
            symlink(paydesc["dest_path"], paydesc["link_path"])
        except Exception as e:
            # clean up files in case of error
            if exists(paydesc["dest_path"]):
                unlink(paydesc["dest_path"])
            if lexists(paydesc["link_path"]):
                unlink(paydesc["link_path"])
            raise ISError(u"Unable to create payload %s" % payload_name, e)
def create_link(src, dst, link_type=LinkType.hardlink, force=False):
    if link_type == LinkType.directory:
        # A directory is technically not a link. So link_type is a misnomer.
        # Naming is hard.
        mkdir_p(dst)
        return

    if not lexists(src):
        raise CondaError("Cannot link a source that does not exist. %s" % src)

    if lexists(dst):
        if not force:
            maybe_raise(BasicClobberError(src, dst, context), context)
        log.info("file exists, but clobbering: %r" % dst)
        rm_rf(dst)

    if link_type == LinkType.hardlink:
        if isdir(src):
            raise CondaError("Cannot hard link a directory. %s" % src)
        link(src, dst)
    elif link_type == LinkType.softlink:
        _do_softlink(src, dst)
    elif link_type == LinkType.copy:
        _do_copy(src, dst)
    else:
        raise CondaError("Did not expect linktype=%r" % link_type)
def main(argv):
    EXCLUDES.append(path.join('.', path.basename(argv[0])))
    target = os.environ['HOME']
    target_prefix = '.'
    opts = argv[1:]
    extras = '--extras' in opts

    dolink('.', target, target_prefix, excludes=[path.join('.', 'bin')])
    mkdir(path.join(target, 'bin'))
    dolink('bin', path.join(target, 'bin'))

    # pull in hgexts
    hgexts = path.join(target, '.hgexts')
    mkdir(hgexts)
    if not path.lexists(path.join(hgexts, 'hg-git')):
        system('hg clone ssh://[email protected]/durin42/hg-git', hgexts)
    if not path.lexists(path.join(hgexts, 'hg-remotebranches')):
        system('hg clone ssh://[email protected]/durin42/hg-remotebranches', hgexts)

    # pull in sandboxes
    sandbox = path.join(target, 'sandbox')
    mkdir(sandbox)
    if not path.lexists(path.join(sandbox, 'mercurial-cli-templates')):
        system('hg clone ssh://[email protected]/sjl/mercurial-cli-templates/', sandbox)
    return 0
def dolink(dirpath, target, target_prefix='', excludes=None):
    for fn in sorted(os.listdir(dirpath)):
        localfn = path.join(dirpath, fn)
        if localfn in EXCLUDES:
            continue
        if excludes and localfn in excludes:
            continue
        targetfn = path.join(target, target_prefix + fn)
        localfnabs = path.abspath(localfn)
        if path.isdir(localfn):
            if localfn in MKDIR_INSTEADOF_LINK:
                mkdir(targetfn)
                dolink(localfn, targetfn)
            else:
                if path.lexists(targetfn):
                    if not (path.islink(targetfn)
                            and os.readlink(targetfn) == localfnabs):
                        warn('exists: diff -u %s %s' % (targetfn, localfn))
                else:
                    os.symlink(localfnabs, targetfn)
        else:
            if path.lexists(targetfn):
                if not (path.islink(targetfn)
                        and os.readlink(targetfn) == localfnabs):
                    warn('exists: diff -u %s %s' % (targetfn, localfn))
            else:
                os.symlink(localfnabs, targetfn)
def trash(self):
    if islink(self.path):
        self.remove()
        return True
    elif exists(self.path):
        base = basename(self.path)
        target = base
        ftarget = join(expanduser("~/.Trash"), target)
        index = 1
        while lexists(ftarget):
            target = "%s-%d" % (base, index)
            index += 1
            ftarget = join(expanduser("~/.Trash"), target)
        try:
            l.debug("Calling: os.rename('%s', '%s')" % (self.path, ftarget))
            if not self.dryrun:
                os.rename(self.path, ftarget)
        except:
            if self.sudo:
                try:
                    run('sudo /bin/mv %%s "%s"' % ftarget, self.path, self.dryrun)
                except:
                    l.error("Error moving file with sudo: %s" % self)
        if self.dryrun or not lexists(self.path):
            return True
        else:
            l.error("Could not trash file: %s\n" % self)
            return False
def create_link(src, dst, link_type=LinkType.hardlink, force=False):
    if link_type == LinkType.directory:
        # A directory is technically not a link. So link_type is a misnomer.
        # Naming is hard.
        mkdir_p(dst)
        return

    if not lexists(src):
        raise CondaError("Cannot link a source that does not exist. %s" % src)

    if lexists(dst):
        if not force:
            maybe_raise(BasicClobberError(src, dst, context), context)
        log.info("file exists, but clobbering: %r" % dst)
        rm_rf(dst)

    if link_type == LinkType.hardlink:
        if isdir(src):
            raise CondaError("Cannot hard link a directory. %s" % src)
        link(src, dst)
    elif link_type == LinkType.softlink:
        symlink(src, dst)
    elif link_type == LinkType.copy:
        # on unix, make sure relative symlinks stay symlinks
        if not on_win and islink(src):
            src_points_to = readlink(src)
            if not src_points_to.startswith('/'):
                # copy relative symlinks as symlinks
                symlink(src_points_to, dst)
                return
        shutil.copy2(src, dst)
    else:
        raise CondaError("Did not expect linktype=%r" % link_type)
def _truncate_spaces(
        nshell, n1, n2, dirpaths, scalefactor, remove_protons, force=False):
    """For multiple directories, perform the operation of truncate_space

    :param nshell: major oscillator shell (0=s, 1=p, ...)
    :param n1: max allowed one-particle state
    :param n2: max allowed two-particle state
    :param dirpaths: Paths to the destination directories
    """
    dirpaths = list(dirpaths)
    d0 = dirpaths.pop()
    # truncate interaction once
    fpath0, lpath0 = _truncate_space(
        nshell=nshell, n1=n1, n2=n2, dpath_elt=d0,
        scalefactor=scalefactor, remove_protons=remove_protons, force=force,
    )
    fname_tbme = path.split(fpath0)[1]
    lname_tbme = path.split(lpath0)[1]
    # link truncated interaction file to the rest of the directories
    for d in dirpaths:
        dst_path = path.join(d, fname_tbme)
        sl_path = path.join(d, lname_tbme)
        try:
            if path.exists(sl_path) or path.lexists(sl_path):  # symlink exists
                remove(sl_path)
            if not (path.exists(dst_path) or path.lexists(dst_path)):
                link(fpath0, dst_path)
            elif force:
                remove(dst_path)
                link(fpath0, dst_path)
        except OSError:
            print 'Could not link %s to %s.' % (fpath0, dst_path)
            raise
        symlink(dst_path, sl_path)
    return fname_tbme, lname_tbme
def reload(self, force=False):
    "reload the template, if it is not in the cache, or the timestamp doesn't match"
    cob = self.cache.get(self.key, CacheObject())
    if cob.path and not force:  # is it in the cache?
        if cob.timestamp == os.stat(cob.path).st_mtime:  # does the timestamp match?
            self.path = cob.path
            self.wrapperpath = cob.wrapperpath
            if hasattr(cob, 'template'):
                self.template = cob.template
                self.saxevents = cob.saxevents
            return False  # the correct template objects are now copied from cache
    else:
        # ie do this first time only.. get the paths (we could not do this before now,
        # because we didn't have the base and app filepaths)
        cob.path = self.app_filepath + self.filename  # is there a local template?
        if not lexists(cob.path):
            cob.path = self.base_filepath + self.filename  # use the base template
        cob.wrapperpath = self.app_filepath + self.wrapper  # is there a local wrapper?
        if not lexists(cob.wrapperpath):
            cob.wrapperpath = self.base_filepath + self.wrapper  # use the base wrapper
    self.path = cob.path
    self.wrapperpath = cob.wrapperpath
    self.template = cob.template = self.wrapTemplate(self.path, self.wrapperpath)
    self.include(self.template.childNodes[0])
    self.timestamp = cob.timestamp = os.stat(cob.path).st_mtime
    self.cache[self.key] = cob  # cache the template
    self.cob = cob  # we need this in render.py for sax data
    # return True to indicate reload has happened
    return True
def _installFile(self, source, target, removeSource=True, symlink=False):
    """Copy, move or create symlink source file to target. Save old."""
    def getLinkForTarget(target):
        """Get symlinks from target dirname which point to target"""
        baseDir = path.dirname(path.normpath(target))
        linkFiles = filter(path.islink,
                           map(lambda x: path.join(baseDir, x), os.listdir(baseDir)))
        return filter(lambda x: path.join(baseDir, os.readlink(x)) == target,
                      linkFiles)

    # raise IOError if source does not exist
    open(source, 'r').close()
    targetLinkFiles = getLinkForTarget(target)
    oldtarget = ""
    if path.lexists(target):
        oldtarget = "%s.old" % target
        if path.lexists(oldtarget):
            oldTargetLinkFiles = getLinkForTarget(oldtarget)
            map(os.unlink, oldTargetLinkFiles)
            os.unlink(oldtarget)
        os.rename(target, oldtarget)
    if symlink:
        if path.dirname(source) == path.dirname(target):
            os.symlink(path.basename(source), target)
        else:
            os.symlink(source, target)
    elif removeSource:
        os.rename(source, target)
    else:
        copy_with_perm(source, target)
    if oldtarget:
        map(os.unlink, targetLinkFiles)
        map(lambda x: os.symlink(path.basename(oldtarget), x), targetLinkFiles)
def copyfilesfromsvn(filelist, svnbasepath, localpath, revision, samefold=False):
    shutil.rmtree(localpath, True)
    if not path.lexists(localpath):
        os.makedirs(localpath)
    os.chdir(localpath)
    prefix = r'/IMClient-RV/'
    prefix2 = r'/revolution_min/'
    prefix3 = r'/Private/'
    haserr = False
    for onefile in filelist:
        pos = onefile.find(prefix)
        if pos == -1:
            pos = onefile.find(prefix2)
            if pos == -1:
                pos = onefile.find(prefix3)
                if pos == -1:
                    continue
        if samefold:
            localfilepath = localpath + path.basename(onefile)
            if path.lexists(localfilepath):
                localfilepath = localfilepath + '-dup'
            cmdstr = ('svn export --force -r ' + str(revision) + ' '
                      + svnbasepath + onefile + ' ' + localfilepath)
        else:
            localfilepath = localpath + onefile.replace(onefile[:pos], '', 1)
            if not path.lexists(path.dirname(localfilepath)):
                os.makedirs(path.dirname(localfilepath))
            cmdstr = ('svn export --force -r ' + str(revision) + ' '
                      + svnbasepath + onefile + ' ' + localfilepath)
        try:
            retcode = subprocess.call(cmdstr, shell=True)
            if retcode != 0:
                haserr = True
        except OSError as e:
            haserr = True
def prep_symlink(outdir, workdir, filename=None):
    """ Creates a symlink between outdir and workdir.

        If outdir and workdir are the same directory, then bails out.
        Both directories should exist prior to call.
        If filename is None, then creates a symlink to workdir in outdir called
        ``workdir``. Otherwise, creates a symlink in workdir called filename.
        If a link ``filename`` already exists, deletes it first.
    """
    from os import remove, symlink
    from os.path import samefile, lexists, abspath, join
    from ..misc import Changedir

    if samefile(outdir, workdir):
        return
    if filename is None:
        with Changedir(workdir) as cwd:
            if lexists('workdir'):
                try:
                    remove('workdir')
                except OSError:
                    pass
            try:
                symlink(abspath(workdir), abspath(join(outdir, 'workdir')))
            except OSError:
                pass
        return

    with Changedir(workdir) as cwd:
        if lexists(filename):
            try:
                remove(filename)
            except OSError:
                pass
        try:
            symlink(abspath(join(outdir, filename)),
                    abspath(join(workdir, filename)))
        except OSError:
            pass
def test_sh_rm_broken_symlink():
    with sh.tmpdir():
        os.symlink('afile-notexist', 'alink')
        assert not path.exists('alink')
        assert path.lexists('alink')
        sh.rm('alink')
        assert not path.lexists('alink')
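# Illustrative aside (not part of any project above): the broken-symlink test relies on
# standard library behavior -- os.path.exists() follows the link and reports False when
# the target is missing, while os.path.lexists() checks the link entry itself and
# reports True. A minimal self-contained sketch; the file names here are made up:
import os
import tempfile
from os.path import exists, lexists

with tempfile.TemporaryDirectory() as tmp:
    link = os.path.join(tmp, 'dangling')
    os.symlink(os.path.join(tmp, 'missing-target'), link)  # target is never created
    assert not exists(link)   # follows the link, target absent -> False
    assert lexists(link)      # looks at the link entry itself -> True
    os.unlink(link)
    assert not lexists(link)  # link entry gone -> False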
def copyfilesfromlocal(filelist, localsrcpath, localdestpath, samefold=False):
    shutil.rmtree(localdestpath, True)
    if not path.lexists(localdestpath):
        os.makedirs(localdestpath)
    os.chdir(localdestpath)
    prefix = r'/IMClient-RV/'
    prefix2 = r'/revolution_min/'
    prefix3 = r'/Private/'
    #prefix4 = '/modules/'
    #prefix = '/IMClient/Branches_tb/20110325_Base6.6002C_security'
    #prefix2 = '/IMClient/Branches_tb/20110420_Base6.6003C_security2'
    for onefile in filelist:
        pos = onefile.find(prefix)
        if pos == -1:
            pos = onefile.find(prefix2)
            if pos == -1:
                pos = onefile.find(prefix3)
                if pos == -1:
                    continue
        localfilepath = onefile.replace(onefile[:pos], '', 1)
        try:
            if samefold:
                copydest = localdestpath + path.basename(localfilepath)
                if path.lexists(copydest):
                    copydest = copydest + '-dup'
                shutil.copy2(localsrcpath + localfilepath, copydest)
            else:
                copydest = localdestpath + localfilepath
                if not path.lexists(path.dirname(copydest)):
                    os.makedirs(path.dirname(copydest))
                shutil.copy2(localsrcpath + localfilepath, copydest)
        except IOError as e:
            pass
def test_simple_LinkPathAction_softlink(self):
    source_full_path = make_test_file(self.pkgs_dir)
    target_short_path = source_short_path = basename(source_full_path)

    correct_sha256 = compute_sha256sum(source_full_path)
    correct_size_in_bytes = getsize(source_full_path)
    path_type = PathType.hardlink
    source_path_data = PathDataV1(
        _path=source_short_path,
        path_type=path_type,
        sha256=correct_sha256,
        size_in_bytes=correct_size_in_bytes,
    )

    axn = LinkPathAction({}, None, self.pkgs_dir, source_short_path, self.prefix,
                         target_short_path, LinkType.softlink, source_path_data)

    assert axn.target_full_path == join(self.prefix, target_short_path)
    axn.verify()
    axn.execute()
    assert isfile(axn.target_full_path)
    assert islink(axn.target_full_path)
    assert stat_nlink(axn.target_full_path) == 1

    axn.reverse()
    assert not lexists(axn.target_full_path)
    assert lexists(source_full_path)
def mount(self, mkfs):
    self.state = S_STARTING
    devnull_fd = open(devnull, 'w')

    # waiting for our block device to be available
    dev_found = False
    dev_prefix = self.dev_name.split('/')[2][:-1]
    for attempt in range(1, 11):
        sql_logger.info("Galera node waiting for block device %s" % self.dev_name)
        if lexists(self.dev_name):
            dev_found = True
            break
        else:
            # On EC2 the device name gets changed
            # from /dev/sd[a-z] to /dev/xvd[a-z]
            if lexists(self.dev_name.replace(dev_prefix, 'xvd')):
                dev_found = True
                self.dev_name = self.dev_name.replace(dev_prefix, 'xvd')
                break
        time.sleep(10)

    # create mount point
    run_cmd(self.mkdir_cmd)

    if dev_found:
        sql_logger.info("Galera node has now access to %s" % self.dev_name)

        # prepare block device
        if mkfs:
            sql_logger.info("Creating new file system on %s" % self.dev_name)
            self.prepare_args = ['mkfs.ext4', '-q', '-m0', self.dev_name]
            proc = Popen(self.prepare_args, stdin=PIPE, stdout=devnull_fd,
                         stderr=devnull_fd, close_fds=True)
            proc.communicate(input="y")  # answer interactive question with y
            if proc.wait() != 0:
                sql_logger.critical('Failed to prepare storage device: (code=%d)'
                                    % proc.returncode)
            else:
                sql_logger.info('File system created successfully')
        else:
            sql_logger.info("Not creating a new file system on %s" % self.dev_name)
            time.sleep(10)

        # mount
        self.mount_args = ['mount', self.dev_name, self.mount_point]
        mount_cmd = ' '.join(self.mount_args)
        sql_logger.debug("Running command '%s'" % mount_cmd)
        _, err = run_cmd(mount_cmd)
        if err:
            sql_logger.critical('Failed to mount storage device: %s' % err)
        else:
            sql_logger.info("OSD node has prepared and mounted %s" % self.dev_name)
    else:
        sql_logger.critical("Block device %s unavailable, falling back to image space"
                            % self.dev_name)
def test_sh_rm_symlink():
    with sh.tmpdir():
        with open('afile', 'w') as f:
            f.close()
        assert path.exists('afile')
        os.symlink('afile', 'alink')
        assert path.lexists('alink')
        sh.rm('alink')
        assert not path.lexists('alink')
def is_saned(self, environment):
    """ Return true if the addon is saned. """
    path = self.environment_path(environment)
    # sane means: either there is no link at all, or the link exists and its target exists
    return (lexists(path) and os.path.exists(realpath(path))
            or not lexists(path))
def rename(source_path, destination_path, force=False):
    if lexists(destination_path) and force:
        rm_rf(destination_path)
    if lexists(source_path):
        log.trace("renaming %s => %s", source_path, destination_path)
        os_rename(source_path, destination_path)
    else:
        log.trace("cannot rename; source path does not exist '%s'", source_path)
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contain the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    p = OptionParser(batch.__doc__)

    ahrd_weights = {"blastp": [0.5, 0.3, 0.2],
                    "blastx": [0.6, 0.4, 0.0]}
    blast_progs = tuple(ahrd_weights.keys())

    p.add_option("--path", default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    p.add_option("--blastprog", default="blastp", choices=blast_progs,
                 help="Specify the blast program being run. Based on this option,"
                      " the AHRD parameters (score_weights) will be modified."
                      " [default: %default]")
    p.add_option("--iprscan", default=None,
                 help="Specify path to InterProScan results file if available."
                      " If specified, the yml conf file will be modified"
                      " appropriately. [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)
    bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog]

    for f in glob("{0}/*.fasta".format(splits)):
        fb = op.basename(f).rsplit(".", 1)[0]
        fw = open(op.join(output, fb + ".yml"), "w")

        path = op.expanduser(opts.path)
        dir = op.join(path, "test/resources")
        outfile = op.join(output, fb + ".csv")
        interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else ""

        print >> fw, Template.format(dir, fb, f, outfile, bit_score, db_score,
                                     ovl_score, interpro)

    if opts.iprscan:
        if not op.lexists("interpro.xml"):
            symlink(op.join(iprscan_datadir, "interpro.xml"), "interpro.xml")
        if not op.lexists("interpro.dtd"):
            symlink(op.join(iprscan_datadir, "interpro.dtd"), "interpro.dtd")
def run_command(self, args):
    # rm venv 'project' ONLY
    if args.brew:
        vname = args.project
        print "Deleting given pythonbrew virtualenv '{}' ONLY...".format(vname)
        # check if target venv is linked in a pybrat project
        proj_d = get_project_list({'pybrat': True,})
        for proj in proj_d.keys():
            if vname in proj_d[proj]['venv'].keys():
                pv_proj_dir = proj_d[proj]['srcpath']
                print "Venv '{0}' is being used by project '{1}'.".format(vname, proj)
                print "Deleting it will break a link unless you delete the link too."
                if get_input_bool("Cleanup the pybrat project as well? [y/N] ",
                                  default_answer=False):
                    # rm venv and cleanup project
                    if not pb_rmvenv(vname, pv_proj_dir):
                        print "Remove Error: {} was not deleted".format(args.project)
                        return False
                    return True
        # venv not in any pybrat project so no worries...
        if not pb_rmvenv(vname):
            print "Remove Error: {} was not deleted".format(args.project)
            return False
        return True

    # still here? then delete the rest of the pybrat project...
    pv_projd = join(PYBRAT_PROJD, args.project)
    if not lexists(pv_projd):
        print "Warning: {} does not exist".format(pv_projd)
        return False
    print "==> Removing project '{}'...".format(args.project)

    # if 'pythonbrew venv delete project'...?
    if args.venv:
        print "Deleting linked virtualenv(s)..."
        if not pv_rmvenv(pv_projd):
            print "Remove Error: {} was not deleted".format(pv_projd)
            return False

    # remove .pybrat dir in user's project dir
    pv_subd = join(readlink(pv_projd), ".{}".format(PYBRAT_PROG))
    if not lexists(pv_subd):
        print "Warning: {} does not exist.".format(pv_subd)
    else:
        rmtree(pv_subd, ignore_errors=True)
        print "Removed .pybrat subdirectory {}".format(pv_subd)

    # remove pybrat project link in .pybrat_projects/
    remove(pv_projd)
    print "Removed pybrat project {}".format(pv_projd)

    # all done deleting? ok...
    return True
def checkRequiredFiles(self):
    '''makes sure textgrid files exist for each wav file, and that a config file exists'''
    # check for config file, and load it. Make it part of options.
    # load config file in the 2 required formats
    self.options.config = {}
    self.loadConfig()
    self.options.praatConfig = []
    self.loadPraatConfig()

    # for truth and each other directory:
    #   make sure the directories exist
    #   check for txtgrid files

    # TRUTH
    if not path.lexists(self.options.directory + self.options.truth + "/"):
        print "error, the", self.options.truth, "directory doesn't exist (it's where your 'truth' files should be)! Exiting."
        exit()

    toExtract = self.options.source
    if toExtract:
        for folder in toExtract:
            if not path.isdir(self.options.directory + folder + "/"):
                print "error, the", self.options.directory + folder + "/", "directory doesn't exist! You listed it as a folder to process. Exiting."
                exit()
            for fname in self.fileList:
                if not path.isfile(self.options.directory + folder + "/" + fname + ".txtgrid"):
                    print "error, the required", fname, ".txtgrid file doesn't exist in ./", folder, "/"
                    print "exiting."
                    exit()
                if not self.options.extractPraat and not path.isdir(self.options.directory + 'formatted/' + fname + '_formatted/'):
                    print "You first need to extract prosodic information with Praat. Re-run with the -x flag."
                    exit()

    # check to make sure directories to be cleaned exist
    # if self.options.cleanOldResults:
    #     for folder in self.options.cleanOldResults:
    #         if not path.isdir(self.options.directory + "extracted/" + folder + "/"):
    #             print "error, the", folder, "directory doesn't exist! Exiting."
    #             exit()

    praatscriptpath = os.path.abspath(__file__)
    s = praatscriptpath.split('/')
    praatscriptpath = '/' + '/'.join(s[:-1]) + "/extractInfoSingle.praat"
    self.options.praatscriptpath = praatscriptpath

    # make sure praat script is there
    if not path.isfile(praatscriptpath):
        print "error, the praat script isn't in the same directory as the rest of the package files! Printing file list and exiting."
        print os.system("ls")
        print path.isfile(praatscriptpath)
        print self.options.praatscriptpath
        exit()

    # folders for db and arff files
    if not path.lexists(self.options.directory + "db/"):
        mkdir(self.options.directory + "db/")
    if not path.lexists(self.options.directory + "arff/"):
        mkdir(self.options.directory + "arff/")
def __call__(self, ob, req, wrapper='', gettext=lambda x: x):
    """ generate HTML
    - allow for multiple apps having different template versions within each Evo
      instance - we only get the app data at call time
    - wrapper can be passed as req.wrapper or as wrapper: wrapper=None means no wrapper
    """
    self.gettext = gettext
    # get the template path for this app
    self.key = ob.Config.app + '.' + self.filename
    self.path = self.pathcache.get(self.key, "")
    if not self.path:
        # first time only.. get the paths
        # note: we couldn't do this earlier, as we lacked the evoke and app filepaths
        # firstly: use the local class template, if there is one
        klass = self.filename.split("_", 1)[0]
        self.path = '%s%s/evo/%s' % (ob.Config.app_filepath, klass, self.filename)
        # otherwise, is there a local template?
        if not lexists(self.path):
            self.path = ob.Config.app_filepath + 'evo/' + self.filename
        # otherwise, is there a class template in evoke (base)?
        if not lexists(self.path):
            self.path = '%s%s/evo/%s' % (ob.Config.evoke_filepath, klass, self.filename)
        # otherwise, use the evoke template
        if not lexists(self.path):
            self.path = ob.Config.evoke_filepath + 'evo/' + self.filename
        # if that doesn't exist, raise an error
        if not lexists(self.path):
            raise EvoTemplateNotFound(self.filename)
        self.pathcache[self.key] = self.path
    # get the CacheObject for this path, containing the python code and timestamp
    cob = self.pycache.get(self.path, EvoCacheObject())
    # and parse the template to a python expression
    # ... unless we already have python code and the timestamp matches
    if (not cob.pyc) or (cob.timestamp != stat(self.path).st_mtime):
        # set the timestamp
        cob.timestamp = stat(self.path).st_mtime
        # parse the template into python code
        cob.pyc = self.parse(self.path, ob, req)
        # compile the python code to the cache
        if not debug:
            try:
                cob.pyc = compile(cob.pyc, '<string>', 'eval')
            except SyntaxError as inst:
                p = inst.offset
                t = inst.text
                raise EvoSyntaxError(
                    "char %s" % p,
                    "evo pycode for %s" % self.filename,
                    t[max(0, p - 40):p],
                    t[p:min(p + 40, len(t) - 1)])
    # sort out the wrapper
    if wrapper is not None:
        wrapper = wrapper or req.get('wrapper', 'wrapper.evo')
    # and run the python code from the cache
    res = self.wrap(cob.pyc, ob, req, wrapper) if wrapper else self.evaluate(
        cob.pyc, ob, req)
    return res
def tearDown(self):
    rm_rf(self.prefix)
    if not (on_win and PY2):
        # this assertion fails for the Softlink action windows tests
        #   line 141 in backoff_rmdir
        #     exp_backoff_fn(rmtree, path, onerror=retry, max_tries=max_tries)
        #   leaves a directory self.prefix\\Scripts that cannot be accessed or removed
        assert not lexists(self.prefix)
    rm_rf(self.pkgs_dir)
    assert not lexists(self.pkgs_dir)
def make_allegro_build_directory():
    os.chdir(root_path)
    os.chdir('deps')
    os.chdir('builds')
    if path.lexists('allegro_build'):
        i = 1
        while path.lexists('allegro_build ({0})'.format(i)):
            i += 1
        shutil.move('allegro_build', 'allegro_build ({0})'.format(i))
    os.mkdir('allegro_build')
def _perform(src, dst, action, actionname):
    if not op.lexists(src):
        print("Copying %s failed: it doesn't exist." % src)
        return
    if op.lexists(dst):
        if op.isdir(dst):
            shutil.rmtree(dst)
        else:
            os.remove(dst)
    print('%s %s --> %s' % (actionname, src, dst))
    action(src, dst)
def undoRename(self):
    if not self.renamed or not path.lexists(self.new_path):
        return False
    if path.lexists(self.path):
        if self.path not in _renamed or not _renamed[self.path].undoRename():
            return False
    try:
        os.rename(self.new_path, self.path)
        PrintError(self.arg + " not renamed", shortPath(self.new_path) + " exists")
        self.renamed = False
        return True
    except OSError as e:
        return False
def test_simple_LinkPathAction_directory(self):
    target_short_path = join('a', 'nested', 'directory')
    axn = LinkPathAction({}, None, None, None, self.prefix,
                         target_short_path, LinkType.directory, None)
    axn.verify()
    axn.execute()

    assert isdir(join(self.prefix, target_short_path))

    axn.reverse()
    assert not lexists(axn.target_full_path)
    assert not lexists(dirname(axn.target_full_path))
    assert not lexists(dirname(dirname(axn.target_full_path)))
def bringup(self, outdir, workdir): """ Sets up call to program. """ from os.path import join, abspath, samefile, lexists from os import symlink, remove from ...misc import copyfile, Changedir from ... import CRYSTAL_propnames as filenames with Changedir(workdir) as cwd: # first copies file from current working directory for key, value in filenames.iteritems(): copyfile( join(workdir, value.format('prop')), key, nocopyempty=True, symlink=False, nothrow="never" ) for key, value in filenames.iteritems(): copyfile( join(self.input.directory, value.format('prop')), key, nocopyempty=True, symlink=False, nothrow="never" ) # then creates input file. string = self.print_input(workdir=workdir, outdir=outdir, filework=True) string = string.rstrip() + '\n' with open('prop.d12', 'w') as file: file.write(string) header = ''.join(['#']*20) with open('prop.out', 'w') as file: file.write('{0} {1} {0}\n'.format(header, 'INPUT FILE')) file.write(string) file.write('{0} END {1} {0}\n'.format(header, 'INPUT FILE')) file.write('\n{0} {1} {0}\n'.format(header, 'FUNCTIONAL')) file.write(self.__repr__(defaults=False)) file.write('\n{0} END {1} {0}\n'.format(header, 'FUNCTIONAL')) with Changedir(outdir) as cwd: pass if not samefile(outdir, workdir): # Creates symlink to make sure we keep working directory. with Changedir(outdir) as cwd: with open('prop.d12', 'w') as file: file.write(string) with open('prop.out', 'w') as file: pass with open('prop.err', 'w') as file: pass # creates symlink files. for filename in ['prop.err', 'prop.out']: if lexists(join(workdir, filename)): try: remove( join(workdir, filename) ) except: pass symlink(abspath(filename), abspath(join(workdir, filename))) if lexists('workdir'): try: remove('workdir') except: pass try: symlink(workdir, 'workdir') except: pass # creates a file in the directory, to say we are going to work here with open(join(outdir, '.pylada_is_running'), 'w') as file: pass
def test_add_delete(self):
    # To test that .tar gets removed
    add_archive_content('1.tar', annex=self.annex, strip_leading_dirs=True,
                        delete=True)
    assert_false(lexists(opj(self.annex.path, '1.tar')))
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None, explicit=False, message=None, sidecar=None, extra_info=None, rerun_info=None, extra_inputs=None, rerun_outputs=None, inject=False): """Run `cmd` in `dataset` and record the results. `Run.__call__` is a simple wrapper over this function. Aside from backward compatibility kludges, the only difference is that `Run.__call__` doesn't expose all the parameters of this function. The unexposed parameters are listed below. Parameters ---------- extra_info : dict, optional Additional information to dump with the json run record. Any value given here will take precedence over the standard run key. Warning: To avoid collisions with future keys added by `run`, callers should try to use fairly specific key names and are encouraged to nest fields under a top-level "namespace" key (e.g., the project or extension name). rerun_info : dict, optional Record from a previous run. This is used internally by `rerun`. extra_inputs : list, optional Inputs to use in addition to those specified by `inputs`. Unlike `inputs`, these will not be injected into the {inputs} format field. rerun_outputs : list, optional Outputs, in addition to those in `outputs`, determined automatically from a previous run. This is used internally by `rerun`. inject : bool, optional Record results as if a command was run, skipping input and output preparation and command execution. In this mode, the caller is responsible for ensuring that the state of the working tree is appropriate for recording the command's results. Yields ------ Result records for the run. """ if not cmd: lgr.warning("No command given") return rel_pwd = rerun_info.get('pwd') if rerun_info else None if rel_pwd and dataset: # recording is relative to the dataset pwd = normpath(opj(dataset.path, rel_pwd)) rel_pwd = relpath(pwd, dataset.path) else: pwd, rel_pwd = get_command_pwds(dataset) ds = require_dataset( dataset, check_installed=True, purpose='tracking outcomes of a command') ds_path = ds.path lgr.debug('tracking command output underneath %s', ds) if not (rerun_info or inject): # Rerun already takes care of this. # For explicit=True, we probably want to check whether any inputs have # modifications. However, we can't just do is_dirty(..., path=inputs) # because we need to consider subdatasets and untracked files. # MIH: is_dirty() is gone, but status() can do all of the above! if not explicit and ds.repo.dirty: yield get_status_dict( 'run', ds=ds, status='impossible', message=( 'clean dataset required to detect changes from command; ' 'use `datalad status` to inspect unsaved changes')) return cmd = normalize_command(cmd) inputs = GlobbedPaths(inputs, pwd=pwd, expand=expand in ["inputs", "both"]) extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd, # Follow same expansion rules as `inputs`. expand=expand in ["inputs", "both"]) outputs = GlobbedPaths(outputs, pwd=pwd, expand=expand in ["outputs", "both"]) # ATTN: For correct path handling, all dataset commands call should be # unbound. They should (1) receive a string dataset argument, (2) receive # relative paths, and (3) happen within a chpwd(pwd) context. 
if not inject: with chpwd(pwd): for res in prepare_inputs(ds_path, inputs, extra_inputs): yield res if outputs: for res in _install_and_reglob(ds_path, outputs): yield res for res in _unlock_or_remove(ds_path, outputs.expand()): yield res if rerun_outputs is not None: for res in _unlock_or_remove(ds_path, rerun_outputs): yield res else: # If an inject=True caller wants to override the exit code, they can do # so in extra_info. cmd_exitcode = 0 exc = None try: cmd_expanded = format_command( ds, cmd, pwd=pwd, dspath=ds_path, # Check if the command contains "{tmpdir}" to avoid creating an # unnecessary temporary directory in most but not all cases. tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "", inputs=inputs, outputs=outputs) except KeyError as exc: yield get_status_dict( 'run', ds=ds, status='impossible', message=('command has an unrecognized placeholder: %s', exc)) return if not inject: cmd_exitcode, exc = _execute_command( cmd_expanded, pwd, expected_exit=rerun_info.get("exit", 0) if rerun_info else None) # amend commit message with `run` info: # - pwd if inside the dataset # - the command itself # - exit code of the command run_info = { 'cmd': cmd, 'exit': cmd_exitcode, 'chain': rerun_info["chain"] if rerun_info else [], 'inputs': inputs.paths, 'extra_inputs': extra_inputs.paths, 'outputs': outputs.paths, } if rel_pwd is not None: # only when inside the dataset to not leak information run_info['pwd'] = rel_pwd if ds.id: run_info["dsid"] = ds.id if extra_info: run_info.update(extra_info) record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False) if sidecar is None: use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False) use_sidecar = anything2bool(use_sidecar) else: use_sidecar = sidecar if use_sidecar: # record ID is hash of record itself from hashlib import md5 record_id = md5(record.encode('utf-8')).hexdigest() record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo')) record_path = op.join(ds_path, record_dir, record_id) if not op.lexists(record_path): # go for compression, even for minimal records not much difference, despite offset cost # wrap in list -- there is just one record dump2stream([run_info], record_path, compressed=True) # compose commit message msg = u"""\ [DATALAD RUNCMD] {} === Do not change lines below === {} ^^^ Do not change lines above ^^^ """ msg = msg.format( message if message is not None else _format_cmd_shorty(cmd_expanded), '"{}"'.format(record_id) if use_sidecar else record) outputs_to_save = outputs.expand() if explicit else None if outputs_to_save is not None and use_sidecar: outputs_to_save.append(record_path) do_save = outputs_to_save is None or outputs_to_save if not rerun_info and cmd_exitcode: if do_save: repo = ds.repo msg_path = relpath(opj(str(repo.dot_git), "COMMIT_EDITMSG")) with open(msg_path, "wb") as ofh: ofh.write(ensure_bytes(msg)) lgr.info("The command had a non-zero exit code. " "If this is expected, you can save the changes with " "'datalad save -d . -r -F %s'", msg_path) raise exc elif do_save: with chpwd(pwd): for r in Save.__call__( dataset=ds_path, path=outputs_to_save, recursive=True, message=msg, return_type='generator'): yield r
def find_first_existing(*globs):
    for g in globs:
        for path in glob(g):
            if lexists(path):
                return path
    return None
def pid(self):
    pid_filename = self.pid_filename()
    if lexists(pid_filename):
        return int(''.join(open(pid_filename).readlines()))
    else:
        return None
def restore(self, view, where='unknow'): if view is None or not view.file_name() or view.settings().get( 'is_widget'): return if view.is_loading(): sublime.set_timeout(lambda: self.restore(view, where), 100) else: id, index = self.view_id(view) if debug: print '-----------------------------------' print 'restoring from ' + where print view.file_name() print 'id ' + id print 'position in tabbar ' + index if id in db: # if the view changed outside of the application, don't restore folds etc if db[id]['id'] == long(view.size()): # fold rs = [] for r in db[id]['f']: rs.append(sublime.Region(int(r[0]), int(r[1]))) if len(rs): view.fold(rs) # selection if len(db[id]['s']) > 0: view.sel().clear() for r in db[id]['s']: view.sel().add(sublime.Region( int(r[0]), int(r[1]))) # marks rs = [] for r in db[id]['m']: rs.append(sublime.Region(int(r[0]), int(r[1]))) if len(rs): view.add_regions("mark", rs, "mark", "dot", sublime.HIDDEN | sublime.PERSISTENT) # bookmarks rs = [] for r in db[id]['b']: rs.append(sublime.Region(int(r[0]), int(r[1]))) if len(rs): view.add_regions("bookmarks", rs, "bookmarks", "bookmark", sublime.HIDDEN | sublime.PERSISTENT) # color scheme if Pref.remember_color_scheme and 'c' in db[ id] and view.settings().get( 'color_scheme') != db[id]['c']: view.settings().set('color_scheme', db[id]['c']) # syntax if view.settings().get('syntax') != db[id]['x'] and lexists( sublime.packages_path() + '/../' + db[id]['x']): view.settings().set('syntax', db[id]['x']) # scroll if int(sublime.version()) >= 2151: if index in db[id]['l']: view.set_viewport_position(tuple(db[id]['l'][index]), False) else: view.set_viewport_position(tuple(db[id]['l']['0']), False)
def add_to_datalad(topdir, studydir, msg, bids): """Do all necessary preparations (if were not done before) and save """ import datalad.api as dl from datalad.api import Dataset from datalad.support.annexrepo import AnnexRepo from datalad.support.external_versions import external_versions assert external_versions['datalad'] >= MIN_VERSION, ( "Need datalad >= {}".format(MIN_VERSION)) # add to reqs studyrelpath = op.relpath(studydir, topdir) assert not studyrelpath.startswith(op.pardir) # so we are under # now we need to test and initiate a DataLad dataset all along the path curdir_ = topdir superds = None subdirs = [''] + [d for d in studyrelpath.split(op.sep) if d != os.curdir] for isubdir, subdir in enumerate(subdirs): curdir_ = op.join(curdir_, subdir) ds = Dataset(curdir_) if not ds.is_installed(): lgr.info("Initiating %s", ds) # would require annex > 20161018 for correct operation on annex v6 # need to add .gitattributes first anyways ds_ = dl.create( curdir_, dataset=superds, force=True, # initiate annex only at the bottom repository no_annex=isubdir < (len(subdirs) - 1), fake_dates=True, # shared_access='all', ) assert ds == ds_ assert ds.is_installed() superds = ds # TODO: we need a helper (in DataLad ideally) to ease adding such # specifications gitattributes_path = op.join(studydir, '.gitattributes') # We will just make sure that all our desired rules are present in it desired_attrs = """\ * annex.largefiles=(largerthan=100kb) *.json annex.largefiles=nothing *.txt annex.largefiles=nothing *.tsv annex.largefiles=nothing *.nii.gz annex.largefiles=anything *.tgz annex.largefiles=anything *_scans.tsv annex.largefiles=anything """ if op.exists(gitattributes_path): with open(gitattributes_path, 'rb') as f: known_attrs = [ line.decode('utf-8').rstrip() for line in f.readlines() ] else: known_attrs = [] for attr in desired_attrs.split('\n'): if attr not in known_attrs: known_attrs.append(attr) with open(gitattributes_path, 'wb') as f: f.write('\n'.join(known_attrs).encode('utf-8')) # ds might have memories of having ds.repo GitRepo superds = Dataset(topdir) assert op.realpath(ds.path) == op.realpath(studydir) assert isinstance(ds.repo, AnnexRepo) # Add doesn't have all the options of save such as msg and supers ds.save(path=['.gitattributes'], message="Custom .gitattributes", to_git=True) dsh = dsh_path = None if op.lexists(op.join(ds.path, '.heudiconv')): dsh_path = op.join(ds.path, '.heudiconv') dsh = Dataset(dsh_path) if not dsh.is_installed(): # Previously we did not have it as a submodule, and since no # automagic migration is implemented, we just need to check first # if any path under .heudiconv is already under git control if any(x.startswith('.heudiconv/') for x in ds.repo.get_files()): lgr.warning( "%s has .heudiconv not as a submodule from previous" " versions of heudiconv. No automagic migration is " "yet provided", ds) else: dsh = ds.create( path='.heudiconv', force=True, # shared_access='all' ) # Since .heudiconv could contain sensitive information # we place all files under annex and then add if create_file_if_missing(op.join(dsh_path, '.gitattributes'), """* annex.largefiles=anything"""): ds.save( '.heudiconv/.gitattributes', to_git=True, message="Added gitattributes to place all .heudiconv content" " under annex") ds.save('.', recursive=True # not in effect! ? #annex_add_opts=['--include-dotfiles'] ) # TODO: filter for only changed files? 
# Provide metadata for sensitive information mark_sensitive(ds, 'sourcedata') mark_sensitive(ds, '*_scans.tsv') # top level mark_sensitive(ds, '*/*_scans.tsv') # within subj mark_sensitive(ds, '*/*/*_scans.tsv') # within sess/subj mark_sensitive(ds, '*/anat') # within subj mark_sensitive(ds, '*/*/anat') # within ses/subj if dsh_path: mark_sensitive(ds, '.heudiconv') # entire .heudiconv! superds.save(path=ds.path, message=msg, recursive=True) assert not ds.repo.dirty # TODO: they are still appearing as native annex symlinked beasts """
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Defines version to be imported in the module and obtained from setup.py
"""

from os.path import lexists, dirname, join as opj, curdir

# Hard coded version, to be done by release process
__version__ = '0.2.1'

# NOTE: might cause problems with "python setup.py develop" deployments
#  so I have even changed buildbot to use pip install -e .
moddir = dirname(__file__)
projdir = curdir if moddir == 'reproman' else dirname(moddir)
if lexists(opj(projdir, '.git')):
    # If under git -- attempt to deduce a better "dynamic" version following git
    try:
        import sys
        from subprocess import Popen, PIPE
        from os.path import dirname
        git = Popen(
            ['git', 'describe', '--abbrev=4', '--dirty', '--match', '[0-9]*\.*'],
            stdout=PIPE, stderr=PIPE, cwd=projdir)
        if git.wait() != 0:
            raise OSError("Could not run git describe")
        line = git.stdout.readlines()[0]
        _ = git.stderr.readlines()
import os
from os.path import join, expanduser, lexists

CLIENT_DIR = join(expanduser("~"), ".gcal-quickeradd")
if not lexists(CLIENT_DIR):
    os.makedirs(CLIENT_DIR)

CREDENTIALS_FILE = join(CLIENT_DIR, "gcal-credentials")
CLIENT_SECRET_FILE = join(CLIENT_DIR, 'client_secret.json')
def save_dataset(ds, paths, message=None): """Save changes in a single dataset. Parameters ---------- ds : Dataset The dataset to be saved. paths : list Annotated paths to dataset components to be saved. message: str, optional (Commit) message to be attached to the saved state. Returns ------- bool Whether a new state was saved. If all to be saved content was unmodified no new state will be saved. """ # XXX paths must be in the given ds, no further sanity checks! # make sure that all pending changes (batched annex operations, etc.) # are actually reflected in Git ds.repo.precommit() # track what is to be committed, so it becomes # possible to decide when/what to save further down # and one level up orig_hexsha = ds.repo.get_hexsha() # check whether we want to commit anything staged, or individual pieces # this is independent of actually staging individual bits save_entire_ds = False for ap in paths: if ap['path'] == ds.path: save_entire_ds = True break # asking yourself why we need to `add` at all? For example, freshly # unlocked files in a v5 repo are listed as "typechange" and commit # refuses to touch them without an explicit `add` to_gitadd = [ ap['path'] for ap in paths # if not flagged as staged if not ap.get('staged', False) and # must exist, anything else needs no staging, can be committed directly lexists(ap['path']) and # not an annex repo, hence no choice other than git ( not isinstance(ds.repo, AnnexRepo) or # even in an annex repo we want to use `git add` for submodules (ap.get('type', None) == 'dataset' and not ap['path'] == ds.path)) ] to_annexadd = [ ap['path'] for ap in paths # not passed to git add if ap['path'] not in to_gitadd and # if not flagged as staged not ap.get('staged', False) and # prevent `git annex add .` in a subdataset, if not desired not ap.get('process_updated_only', False) and # must exist, anything else needs no staging, can be committed directly lexists(ap['path']) ] if to_gitadd or save_entire_ds: ds.repo.add( to_gitadd, git=True, commit=False, # this makes sure that pending submodule updates are added too update=save_entire_ds) if to_annexadd: ds.repo.add(to_annexadd, commit=False) _datalad_msg = False if not message: message = 'Recorded existing changes' _datalad_msg = True # we will blindly call commit not knowing if there is anything to # commit -- this is cheaper than to anticipate all possible ways # a repo in whatever mode is dirty paths_to_commit = None if not save_entire_ds: paths_to_commit = [] for ap in paths: paths_to_commit.append(ap['path']) # was file renamed? path_src = ap.get('path_src') if path_src and path_src != ap['path']: paths_to_commit.append(path_src) ds.repo.commit(message, files=paths_to_commit, _datalad_msg=_datalad_msg, careless=True) current_hexsha = ds.repo.get_hexsha() _was_modified = current_hexsha != orig_hexsha return current_hexsha if _was_modified else None
def populate_aggregated_jsons(path): """Aggregate across the entire BIDS dataset .json's into top level .json's Top level .json files would contain only the fields which are common to all subject[/session]/type/*_modality.json's. ATM aggregating only for *_task*_bold.json files. Only the task- and OPTIONAL _acq- field is retained within the aggregated filename. The other BIDS _key-value pairs are "aggregated over". Parameters ---------- path: str Path to the top of the BIDS dataset """ # TODO: collect all task- .json files for func files to tasks = {} # way too many -- let's just collect all which are the same! # FIELDS_TO_TRACK = {'RepetitionTime', 'FlipAngle', 'EchoTime', # 'Manufacturer', 'SliceTiming', ''} for fpath in find_files('.*_task-.*\_bold\.json', topdir=path, exclude_vcs=True, exclude="/\.(datalad|heudiconv)/"): # # According to BIDS spec I think both _task AND _acq (may be more? # _rec, _dir, ...?) should be retained? # TODO: if we are to fix it, then old ones (without _acq) should be # removed first task = re.sub('.*_(task-[^_\.]*(_acq-[^_\.]*)?)_.*', r'\1', fpath) json_ = load_json(fpath) if task not in tasks: tasks[task] = json_ else: rec = tasks[task] # let's retain only those fields which have the same value for field in sorted(rec): if field not in json_ or json_[field] != rec[field]: del rec[field] # create a stub onsets file for each one of those suf = '_bold.json' assert fpath.endswith(suf) # specify the name of the '_events.tsv' file: if '_echo-' in fpath: # multi-echo sequence: bids (1.1.0) specifies just one '_events.tsv' # file, common for all echoes. The name will not include _echo-. # TODO: RF to use re.match for better readability/robustness # So, find out the echo number: fpath_split = fpath.split('_echo-', 1) # split fpath using '_echo-' fpath_split_2 = fpath_split[1].split( '_', 1) # split the second part of fpath_split using '_' echoNo = fpath_split_2[0] # get echo number if echoNo == '1': if len(fpath_split_2) != 2: raise ValueError("Found no trailer after _echo-") # we modify fpath to exclude '_echo-' + echoNo: fpath = fpath_split[0] + '_' + fpath_split_2[1] else: # for echoNo greater than 1, don't create the events file, so go to # the next for loop iteration: continue events_file = fpath[:-len(suf)] + '_events.tsv' # do not touch any existing thing, it may be precious if not op.lexists(events_file): lgr.debug("Generating %s", events_file) with open(events_file, 'w') as f: f.write("onset\tduration\ttrial_type\tresponse_time\tstim_file" "\tTODO -- fill in rows and add more tab-separated " "columns if desired") # extract tasks files stubs for task_acq, fields in tasks.items(): task_file = op.join(path, task_acq + '_bold.json') # Since we are pulling all unique fields we have to possibly # rewrite this file to guarantee consistency. # See https://github.com/nipy/heudiconv/issues/277 for a usecase/bug # when we didn't touch existing one. # But the fields we enter (TaskName and CogAtlasID) might need need # to be populated from the file if it already exists placeholders = { "TaskName": ("TODO: full task name for %s" % task_acq.split('_')[0].split('-')[1]), "CogAtlasID": "TODO", } if op.lexists(task_file): j = load_json(task_file) # Retain possibly modified placeholder fields for f in placeholders: if f in j: placeholders[f] = j[f] act = "Regenerating" else: act = "Generating" lgr.debug("%s %s", act, task_file) fields.update(placeholders) save_json(task_file, fields, sort_keys=True, pretty=True)
def _paths_from_path_patterns(path_patterns, files=True, dirs="never", recursive=True, includes=None, excludes=None, skip_dupe_dirs=False, follow_symlinks=False, on_error=_NOT_SPECIFIED): """_paths_from_path_patterns([<path-patterns>, ...]) -> file paths Generate a list of paths (files and/or dirs) represented by the given path patterns. "path_patterns" is a list of paths optionally using the '*', '?' and '[seq]' glob patterns. "files" is boolean (default True) indicating if file paths should be yielded "dirs" is string indicating under what conditions dirs are yielded. It must be one of: never (default) never yield dirs always yield all dirs matching given patterns if-not-recursive only yield dirs for invocations when recursive=False See use cases below for more details. "recursive" is boolean (default True) indicating if paths should be recursively yielded under given dirs. "includes" is a list of file patterns to include in recursive searches. "excludes" is a list of file and dir patterns to exclude. (Note: This is slightly different than GNU grep's --exclude option which only excludes *files*. I.e. you cannot exclude a ".svn" dir.) "skip_dupe_dirs" can be set True to watch for and skip descending into a dir that has already been yielded. Note that this currently does not dereference symlinks. "follow_symlinks" is a boolean indicating whether to follow symlinks (default False). To guard against infinite loops with circular dir symlinks, only dir symlinks to *deeper* dirs are followed. "on_error" is an error callback called when a given path pattern matches nothing: on_error(PATH_PATTERN) If not specified, the default is look for a "log" global and call: log.error("`%s': No such file or directory") Specify None to do nothing. Typically this is useful for a command-line tool that takes a list of paths as arguments. (For Unix-heads: the shell on Windows does NOT expand glob chars, that is left to the app.) 
Use case #1: like `grep -r` {files=True, dirs='never', recursive=(if '-r' in opts)} script FILE # yield FILE, else call on_error(FILE) script DIR # yield nothing script PATH* # yield all files matching PATH*; if none, # call on_error(PATH*) callback script -r DIR # yield files (not dirs) recursively under DIR script -r PATH* # yield files matching PATH* and files recursively # under dirs matching PATH*; if none, call # on_error(PATH*) callback Use case #2: like `file -r` (if it had a recursive option) {files=True, dirs='if-not-recursive', recursive=(if '-r' in opts)} script FILE # yield FILE, else call on_error(FILE) script DIR # yield DIR, else call on_error(DIR) script PATH* # yield all files and dirs matching PATH*; if none, # call on_error(PATH*) callback script -r DIR # yield files (not dirs) recursively under DIR script -r PATH* # yield files matching PATH* and files recursively # under dirs matching PATH*; if none, call # on_error(PATH*) callback Use case #3: kind of like `find .` {files=True, dirs='always', recursive=(if '-r' in opts)} script FILE # yield FILE, else call on_error(FILE) script DIR # yield DIR, else call on_error(DIR) script PATH* # yield all files and dirs matching PATH*; if none, # call on_error(PATH*) callback script -r DIR # yield files and dirs recursively under DIR # (including DIR) script -r PATH* # yield files and dirs matching PATH* and recursively # under dirs; if none, call on_error(PATH*) # callback """ from os.path import exists, isdir, join, normpath, abspath, lexists, islink, realpath from glob import glob assert not isinstance(path_patterns, _BASESTRING), \ "'path_patterns' must be a sequence, not a string: %r" % path_patterns if includes is None: includes = [] if excludes is None: excludes = [] GLOB_CHARS = '*?[' if skip_dupe_dirs: searched_dirs = set() for path_pattern in path_patterns: # Determine the set of paths matching this path_pattern. for glob_char in GLOB_CHARS: if glob_char in path_pattern: paths = glob(path_pattern) break else: if follow_symlinks: paths = exists(path_pattern) and [path_pattern] or [] else: paths = lexists(path_pattern) and [path_pattern] or [] if not paths: if on_error is None: pass elif on_error is _NOT_SPECIFIED: try: log.error("`%s': No such file or directory", path_pattern) except (NameError, AttributeError): pass else: on_error(path_pattern) for path in paths: if (follow_symlinks or not islink(path)) and isdir(path): if skip_dupe_dirs: canon_path = normpath(abspath(path)) if follow_symlinks: canon_path = realpath(canon_path) if canon_path in searched_dirs: continue else: searched_dirs.add(canon_path) # 'includes' SHOULD affect whether a dir is yielded. if (dirs == "always" or (dirs == "if-not-recursive" and not recursive)) and _should_include_path( path, includes, excludes): yield path # However, if recursive, 'includes' should NOT affect # whether a dir is recursed into. 
Otherwise you could # not: # script -r --include="*.py" DIR if recursive and _should_include_path(path, [], excludes): for dirpath, dirnames, filenames in _walk( path, follow_symlinks=follow_symlinks): dir_indeces_to_remove = [] for i, dirname in enumerate(dirnames): d = join(dirpath, dirname) if skip_dupe_dirs: canon_d = normpath(abspath(d)) if follow_symlinks: canon_d = realpath(canon_d) if canon_d in searched_dirs: dir_indeces_to_remove.append(i) continue else: searched_dirs.add(canon_d) if dirs == "always" \ and _should_include_path(d, includes, excludes): yield d if not _should_include_path(d, [], excludes): dir_indeces_to_remove.append(i) for i in reversed(dir_indeces_to_remove): del dirnames[i] if files: for filename in sorted(filenames): f = join(dirpath, filename) if _should_include_path(f, includes, excludes): yield f elif files and _should_include_path(path, includes, excludes): yield path
def compress_dicoms(dicom_list, out_prefix, tempdirs, overwrite): """Archives DICOMs into a tarball Also tries to do it reproducibly, so takes the date for files and target tarball based on the series time (within the first file) Parameters ---------- dicom_list : list of str list of dicom files out_prefix : str output path prefix, including the portion of the output file name before .dicom.tgz suffix tempdirs : object TempDirs object to handle multiple tmpdirs overwrite : bool Overwrite existing tarfiles Returns ------- filename : str Result tarball """ tmpdir = tempdirs(prefix='dicomtar') outtar = out_prefix + '.dicom.tgz' if op.exists(outtar) and not overwrite: lgr.info("File {} already exists, will not overwrite".format(outtar)) return # tarfile encodes current time.time inside making those non-reproducible # so we should choose which date to use. # Solution from DataLad although ugly enough: dicom_list = sorted(dicom_list) dcm_time = get_dicom_series_time(dicom_list) def _assign_dicom_time(ti): # Reset the date to match the one of the last commit, not from the # filesystem since git doesn't track those at all ti.mtime = dcm_time return ti # poor man mocking since can't rely on having mock try: import time _old_time = time.time time.time = lambda: dcm_time if op.lexists(outtar): os.unlink(outtar) with tarfile.open(outtar, 'w:gz', dereference=True) as tar: for filename in dicom_list: outfile = op.join(tmpdir, op.basename(filename)) if not op.islink(outfile): os.symlink(op.realpath(filename), outfile) # place into archive stripping any lead directories and # adding the one corresponding to prefix tar.add(outfile, arcname=op.join(op.basename(out_prefix), op.basename(outfile)), recursive=False, filter=_assign_dicom_time) finally: time.time = _old_time tempdirs.rmtree(tmpdir) return outtar
def populate_bids_templates(path, defaults={}): """Premake BIDS text files with templates""" lgr.info("Populating template files under %s", path) descriptor = op.join(path, 'dataset_description.json') if not op.lexists(descriptor): save_json( descriptor, OrderedDict([ ('Name', "TODO: name of the dataset"), ('BIDSVersion', "1.0.1"), ('License', defaults.get( 'License', "TODO: choose a license, e.g. PDDL " "(http://opendatacommons.org/licenses/pddl/)")), ('Authors', defaults.get( 'Authors', ["TODO:", "First1 Last1", "First2 Last2", "..."])), ('Acknowledgements', defaults.get('Acknowledgements', 'TODO: whom you want to acknowledge')), ('HowToAcknowledge', "TODO: describe how to acknowledge -- either cite a " "corresponding paper, or just in acknowledgement " "section"), ('Funding', ["TODO", "GRANT #1", "GRANT #2"]), ('ReferencesAndLinks', ["TODO", "List of papers or websites"]), ('DatasetDOI', 'TODO: eventually a DOI for the dataset') ])) sourcedata_README = op.join(path, 'sourcedata', 'README') if op.exists(op.dirname(sourcedata_README)): create_file_if_missing(sourcedata_README, ( "TODO: Provide description about source data, e.g. \n" "Directory below contains DICOMS compressed into tarballs per " "each sequence, replicating directory hierarchy of the BIDS dataset" " itself.")) create_file_if_missing( op.join(path, 'CHANGES'), "0.0.1 Initial data acquired\n" "TODOs:\n\t- verify and possibly extend information in participants.tsv" " (see for example http://datasets.datalad.org/?dir=/openfmri/ds000208)" "\n\t- fill out dataset_description.json, README, sourcedata/README" " (if present)\n\t- provide _events.tsv file for each _bold.nii.gz with" " onsets of events (see '8.5 Task events' of BIDS specification)") create_file_if_missing( op.join(path, 'README'), "TODO: Provide description for the dataset -- basic details about the " "study, possibly pointing to pre-registration (if public or embargoed)" ) # TODO: collect all task- .json files for func files to tasks = {} # way too many -- let's just collect all which are the same! # FIELDS_TO_TRACK = {'RepetitionTime', 'FlipAngle', 'EchoTime', # 'Manufacturer', 'SliceTiming', ''} for fpath in find_files('.*_task-.*\_bold\.json', topdir=path, exclude_vcs=True, exclude="/\.(datalad|heudiconv)/"): task = re.sub('.*_(task-[^_\.]*(_acq-[^_\.]*)?)_.*', r'\1', fpath) json_ = load_json(fpath) if task not in tasks: tasks[task] = json_ else: rec = tasks[task] # let's retain only those fields which have the same value for field in sorted(rec): if field not in json_ or json_[field] != rec[field]: del rec[field] # create a stub onsets file for each one of those suf = '_bold.json' assert fpath.endswith(suf) events_file = fpath[:-len(suf)] + '_events.tsv' # do not touch any existing thing, it may be precious if not op.lexists(events_file): lgr.debug("Generating %s", events_file) with open(events_file, 'w') as f: f.write( "onset\tduration\ttrial_type\tresponse_time\tstim_file\tTODO -- fill in rows and add more tab-separated columns if desired" ) # extract tasks files stubs for task_acq, fields in tasks.items(): task_file = op.join(path, task_acq + '_bold.json') # do not touch any existing thing, it may be precious if not op.lexists(task_file): lgr.debug("Generating %s", task_file) fields["TaskName"] = ("TODO: full task name for %s" % task_acq.split('_')[0].split('-')[1]) fields["CogAtlasID"] = "TODO" with open(task_file, 'w') as f: f.write(json_dumps_pretty(fields, indent=2, sort_keys=True))
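# Hedged sketch of the create_file_if_missing() helper relied upon above (the
# real heudiconv implementation may differ): write the stub only if nothing --
# not even a dangling symlink -- already occupies the path.
import os.path as op

def create_file_if_missing(filename, content):
    if op.lexists(filename):
        return False
    with open(filename, 'w') as f:
        f.write(content)
    return True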
def copyfile(originalfile, newfile, copy=False, create_new=False, hashmethod=None, use_hardlink=False, copy_related_files=True): """Copy or link ``originalfile`` to ``newfile``. If ``use_hardlink`` is True, and the file can be hard-linked, then a link is created, instead of copying the file. If a hard link is not created and ``copy`` is False, then a symbolic link is created. Parameters ---------- originalfile : str full path to original file newfile : str full path to new file copy : Bool specifies whether to copy or symlink files (default=False) but only for POSIX systems use_hardlink : Bool specifies whether to hard-link files, when able (Default=False), taking precedence over copy copy_related_files : Bool specifies whether to also operate on related files, as defined in ``related_filetype_sets`` Returns ------- None """ newhash = None orighash = None fmlogger.debug(newfile) if create_new: while op.exists(newfile): base, fname, ext = split_filename(newfile) s = re.search('_c[0-9]{4,4}$', fname) i = 0 if s: i = int(s.group()[2:]) + 1 fname = fname[:-6] + "_c%04d" % i else: fname += "_c%04d" % i newfile = base + os.sep + fname + ext if hashmethod is None: hashmethod = config.get('execution', 'hash_method').lower() # Don't try creating symlinks on CIFS if copy is False and on_cifs(newfile): copy = True # Existing file # ------------- # Options: # symlink # to regular file originalfile (keep if symlinking) # to same dest as symlink originalfile (keep if symlinking) # to other file (unlink) # regular file # hard link to originalfile (keep) # copy of file (same hash) (keep) # different file (diff hash) (unlink) keep = False if op.lexists(newfile): if op.islink(newfile): if all((os.readlink(newfile) == op.realpath(originalfile), not use_hardlink, not copy)): keep = True elif posixpath.samefile(newfile, originalfile): keep = True else: if hashmethod == 'timestamp': hashfn = hash_timestamp elif hashmethod == 'content': hashfn = hash_infile else: raise AttributeError("Unknown hash method found:", hashmethod) newhash = hashfn(newfile) fmlogger.debug('File: %s already exists,%s, copy:%d', newfile, newhash, copy) orighash = hashfn(originalfile) keep = newhash == orighash if keep: fmlogger.debug('File: %s already exists, not overwriting, copy:%d', newfile, copy) else: os.unlink(newfile) # New file # -------- # use_hardlink & can_hardlink => hardlink # ~hardlink & ~copy & can_symlink => symlink # ~hardlink & ~symlink => copy if not keep and use_hardlink: try: fmlogger.debug('Linking File: %s->%s', newfile, originalfile) # Use realpath to avoid hardlinking symlinks os.link(op.realpath(originalfile), newfile) except OSError: use_hardlink = False # Disable hardlink for associated files else: keep = True if not keep and not copy and os.name == 'posix': try: fmlogger.debug('Symlinking File: %s->%s', newfile, originalfile) os.symlink(originalfile, newfile) except OSError: copy = True # Disable symlink for associated files else: keep = True if not keep: try: fmlogger.debug('Copying File: %s->%s', newfile, originalfile) shutil.copyfile(originalfile, newfile) except shutil.Error as e: fmlogger.warn(e.message) # Associated files if copy_related_files: related_file_pairs = (get_related_files(f, include_this_file=False) for f in (originalfile, newfile)) for alt_ofile, alt_nfile in zip(*related_file_pairs): if op.exists(alt_ofile): copyfile(alt_ofile, alt_nfile, copy, hashmethod=hashmethod, use_hardlink=use_hardlink, copy_related_files=False) return newfile
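# Condensed, stand-alone sketch of the new-file precedence encoded above:
# hard link when possible, otherwise a symlink (POSIX only, and only if
# copy=False), otherwise a plain copy. Function name is illustrative.
import os
import shutil
import os.path as op

def place_file(src, dst, copy=False, use_hardlink=True):
    if use_hardlink:
        try:
            os.link(op.realpath(src), dst)   # realpath: never hard-link a symlink itself
            return 'hardlink'
        except OSError:
            pass
    if not copy and os.name == 'posix':
        try:
            os.symlink(src, dst)
            return 'symlink'
        except OSError:
            pass
    shutil.copyfile(src, dst)
    return 'copy'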
import sublime, sublime_plugin from os.path import lexists, normpath from hashlib import sha1 from gzip import GzipFile import thread from cPickle import load, dump import time debug = False # open db = {} database = sublime.packages_path() + '/User/BufferScroll.bin.gz' if lexists(database): try: gz = GzipFile(database, 'rb') db = load(gz) gz.close() except: db = {} else: # upgrade from os import remove, rename # from version 6 to 7 if lexists(sublime.packages_path() + '/User/BufferScroll.bin'): try: db = load(
def _proxy_exists(self, path): # TODO: decide whether it should maybe be retrieved right away. # For now, count a symlink pointing under .git/annex as existing if exists(path): return True return lexists(path) and 'annex/objects' in str(realpath(path))
def __call__(path=None, dataset=None, recursive=False, recursion_limit=None, action=None, unavailable_path_status='', unavailable_path_msg=None, nondataset_path_status='error', force_parentds_discovery=True, force_subds_discovery=True, force_no_revision_change_discovery=True, force_untracked_discovery=True, modified=None): # upfront check for the fastest possible response if not path and dataset is None: # nothing given, try "here", but do not use `require_dataset`, as # it will determine the root dataset of `curdir` and further down # lead to path annotation of upstairs directories dataset = curdir if force_subds_discovery and not force_parentds_discovery: raise ValueError( 'subdataset discovery requires parent dataset discovery') # CONCEPT: yield with no status to indicate further processing # everything in one big loop to be able too yield as fast a possible # without any precomputing for all paths refds_path = Interface.get_refds_path(dataset) if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)): raise ValueError( "modification detection only works with a base dataset (non-given or found)" ) # prep common result props res_kwargs = dict(action=action if action else 'annotate_path', refds=refds_path, logger=lgr) # handle the case of recursion into a single dataset without any # extra fancy processing first -- full recursion can be done # faster than manual recursion, hence we gain quite some speed # from these few lines of extra code if not modified and not path and refds_path: if not GitRepo.is_valid_repo(refds_path): yield get_status_dict( # doesn't matter if the path is in another dataset # it was given as reference dataset status=nondataset_path_status, message='given reference dataset is not a dataset', path=refds_path, **res_kwargs) return refds = Dataset(refds_path) path = [] # yield the dataset itself r = get_status_dict(ds=refds, status='', **res_kwargs) yield r if recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path itself for r in yield_recursive(refds, refds_path, action, recursion_limit): r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] yield r return # goal: structure in a way that makes most information on any path # available in a single pass, at the cheapest possible cost reported_paths = {} requested_paths = ensure_list(path) if modified is not None: # modification detection would silently kill all nondataset paths # but we have to complain about them, hence doing it here if requested_paths and refds_path: for r in requested_paths: p = r['path'] if isinstance(r, dict) else r p = _resolve_path(p, ds=refds_path) if path_startswith(p, refds_path): # all good continue # not the refds path_props = r if isinstance(r, dict) else {} res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with reference dataset' reported_paths[r] = res yield res # preserve non-existing paths to be silently killed by modification # detection and append them to requested_paths again after detection. # TODO: This might be melted in with treatment of non dataset paths # above. Re-appending those paths seems to be better than yielding # directly to avoid code duplication, since both cases later on are # dealt with again. 
preserved_paths = [] if requested_paths: [ preserved_paths.append(r) for r in requested_paths if not lexists(r['path'] if isinstance(r, dict) else r) ] # replace the requested paths by those paths that were actually # modified underneath or at a requested location requested_paths = get_modified_subpaths( # either the request, or the base dataset, if there was no request requested_paths if requested_paths else [refds_path], refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit) from itertools import chain # re-append the preserved paths: requested_paths = chain(requested_paths, iter(preserved_paths)) # Possibly to be used "cache" of known subdatasets per each parent # to avoid re-querying subdatasets per each path. The assumption here # is that the list of sub-datasets for a given parent should not change # through the execution of this loop, which (hypothetically) could be # incorrect while annotating paths for some commands. # TODO: verify this assumption and possibly add an argument to turn # caching off if/when needed, or provide some other way to invalidate # it subdss_cache = {} # do not loop over unique(), this could be a list of dicts # we avoid duplicates manually below via `reported_paths` for path in requested_paths: if not isinstance(path, dict): path = rawpath2ap(path, refds_path) # this is now an annotated path! path_props = path path = path['path'] # we need to mark our territory, who knows where this has been path_props.update(res_kwargs) if path in reported_paths: # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue # the path exists in some shape or form # TODO if we have path_props already we could skip this test if isdir(path): # keep any existing type info, previously a more expensive run # could have discovered an uninstalled 'dataset', and we don't # want it to be relabeled to a directory path_props['type'] = \ path_props.get( 'type', 'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory') # this could contain all types of additional content containing_dir = path if not islink(path) else normpath( opj(path, pardir)) else: if lexists(path): path_props['type'] = 'file' else: path_props['state'] = 'absent' # for everything else we are interested in the container containing_dir = dirname(path) if not containing_dir: containing_dir = curdir dspath = parent = get_dataset_root(containing_dir) if dspath: if path_props.get('type', None) == 'dataset': # for a dataset the root is not the parent, for anything else # it is parent = path_props.get('parentds', None) oneupdir = normpath(opj(containing_dir, pardir)) if parent is None and (force_parentds_discovery or (refds_path and _with_sep(oneupdir).startswith( _with_sep(refds_path)))): # either forced, or only if we have a reference dataset, and # only if we stay within this refds when searching for the # parent parent = get_dataset_root( normpath(opj(containing_dir, pardir))) # NOTE the `and refds_path` is critical, as it will determine # whether a top-level dataset that was discovered gets the # parent property or not, it won't get it without a common # base dataset, and that is how we always rolled if parent and refds_path: path_props['parentds'] = parent # don't check whether this is actually a true subdataset of the # parent, done 
further down else: # set parent, but prefer existing property path_props['parentds'] = path_props.get('parentds', dspath) # test for `dspath` not `parent`, we only need to know whether there is # ANY dataset, not which one is the true parent, logic below relies on # the fact that we end here, if there is no dataset at all if not dspath: # not in any dataset res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with any dataset' reported_paths[path] = res yield res continue # check that we only got SUBdatasets if refds_path and not path_startswith(dspath, refds_path): res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = \ ('path not part of the reference dataset at %s', refds_path) reported_paths[path] = res yield res continue if path_props.get('type', None) == 'file': # nothing else we can learn about this res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res continue containing_ds = None path_type = path_props.get('type', None) if parent and force_subds_discovery and ( (path_type == 'dataset' and 'registered_subds' not in path_props) or path_type == 'directory' or not lexists(path)): # if the path doesn't exist, or is labeled a directory, or a dataset even # a dataset (without this info) -> record whether this is a known subdataset # to its parent containing_ds = Dataset(parent) # Possibly "cache" the list of known subdss for parents we # have encountered so far if parent in subdss_cache: subdss = subdss_cache[parent] else: subdss = containing_ds.subdatasets(fulfilled=None, recursive=False, result_xfm=None, result_filter=None, return_type='list') subdss_cache[parent] = subdss if path in [s['path'] for s in subdss]: if path_type == 'directory' or not lexists(path): # first record that it isn't here, if just a dir or not here at all path_props['state'] = 'absent' # this must be a directory, and it is not installed path_props['type'] = 'dataset' path_props['registered_subds'] = True if not lexists(path) or \ (path_props.get('type', None) == 'dataset' and path_props.get('state', None) == 'absent'): # not there (yet) message = unavailable_path_msg if unavailable_path_msg else None if message and '%s' in message: message = (message, path) path_props['message'] = message res = get_status_dict(**dict(res_kwargs, **path_props)) # assign given status, but only if the props don't indicate a status # already res['status'] = path_props.get('status', unavailable_path_status) reported_paths[path] = res yield res continue # we know everything we can, report res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res rec_paths = [] if recursive: # here we need to consider the special case that `path` is # a dataset itself, if a recursion_limit is given (e.g. 
# `remove` will do that by default), we need to recurse # from the dataset itself, and not its parent to get things # right -- this will also avoid needless discovery of # unrelated subdatasets if path_props.get('type', None) == 'dataset': containing_ds = Dataset(path) else: # regular parent, we might have a dataset already containing_ds = Dataset( parent) if containing_ds is None else containing_ds for r in yield_recursive(containing_ds, path, action, recursion_limit): # capture reported paths r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] reported_paths[r['path']] = r if modified is not None: # we cannot yield right away, maybe it wasn't modified rec_paths.append(r) else: yield r if modified is not None and rec_paths: # replace the recursively discovered paths by those paths that # were actually modified underneath or at a requested location for r in get_modified_subpaths( rec_paths, refds=Dataset(refds_path), revision=modified, report_no_revision_change= force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit): res = get_status_dict(**dict(r, **res_kwargs)) reported_paths[res['path']] = res yield res return
def _dataset_auto_get(self, filepath): """Verify that filepath is under annex, and if so and not present - get it""" if not self._autoget: return # if filepath is not there at all (program just "checked" if it could access it if not lexists(filepath): lgr.log(2, " skipping %s since it is not there", filepath) return # deduce directory for filepath filedir = dirname(filepath) annex = None if self._repos_cache is not None: filedir_parts = filedir.split(pathsep) # ATM we do not expect subdatasets under .datalad, so we could take the top # level dataset for that try: filedir = pathsep.join( filedir_parts[:filedir_parts.index(HANDLE_META_DIR)]) except ValueError: # would happen if no .datalad pass try: annex = self._repos_cache[filedir] except KeyError: pass if annex is None: try: # TODO: verify logic for create -- we shouldn't 'annexify' non-annexified # see https://github.com/datalad/datalad/issues/204 annex = get_repo_instance(filedir) lgr.log(2, "Got the repository %s id:%s containing %s", annex, id(annex), filedir) except (RuntimeError, InvalidGitRepositoryError) as e: # must be not under annex etc return if self._repos_cache is not None: self._repos_cache[filedir] = annex if not isinstance(annex, AnnexRepo): # not an annex -- can do nothing lgr.log(2, " skipping %s since the repo is not annex", filepath) return # since Git/AnnexRepo functionality treats relative paths relative to the # top of the repository and might be outside, get a full path if not isabs(filepath): filepath = opj(getpwd(), filepath) # "quick" check first if under annex at all try: # might fail. TODO: troubleshoot when it does e.g. # datalad/tests/test_auto.py:test_proxying_open_testrepobased under_annex = annex.is_under_annex(filepath, batch=True) except: # MIH: really? what if MemoryError under_annex = None # either it has content if (under_annex or under_annex is None) and not annex.file_has_content(filepath): lgr.info("AutomagicIO: retrieving file content of %s", filepath) out = annex.get(filepath) if not out.get('success', False): # to assure that it is present and without trailing/leading new lines out['note'] = out.get('note', '').strip() lgr.error("Failed to retrieve %(file)s: %(note)s", out)
def copyfile( originalfile, newfile, copy=False, create_new=False, use_hardlink=True, copy_related_files=True, ): """ Copy or link files. If ``use_hardlink`` is True, and the file can be hard-linked, then a link is created, instead of copying the file. If a hard link is not created and ``copy`` is False, then a symbolic link is created. .. admonition:: Copy options for existing files * symlink * to regular file originalfile (keep if symlinking) * to same dest as symlink originalfile (keep if symlinking) * to other file (unlink) * regular file * hard link to originalfile (keep) * copy of file (same hash) (keep) * different file (diff hash) (unlink) .. admonition:: Copy options for new files * ``use_hardlink`` & ``can_hardlink`` => hardlink * ``~hardlink`` & ``~copy`` & ``can_symlink`` => symlink * ``~hardlink`` & ``~symlink`` => copy Parameters ---------- originalfile : :obj:`str` full path to original file newfile : :obj:`str` full path to new file copy : Bool specifies whether to copy or symlink files (default=False) but only for POSIX systems use_hardlink : Bool specifies whether to hard-link files, when able (Default=False), taking precedence over copy copy_related_files : Bool specifies whether to also operate on related files, as defined in ``related_filetype_sets`` Returns ------- None """ newhash = None orighash = None logger.debug(newfile) if create_new: while op.exists(newfile): base, fname, ext = split_filename(newfile) s = re.search("_c[0-9]{4,4}$", fname) i = 0 if s: i = int(s.group()[2:]) + 1 fname = fname[:-6] + "_c%04d" % i else: fname += "_c%04d" % i newfile = base + os.sep + fname + ext # Don't try creating symlinks on CIFS if copy is False and on_cifs(newfile): copy = True keep = False if op.lexists(newfile): if op.islink(newfile): if all( ( os.readlink(newfile) == op.realpath(originalfile), not use_hardlink, not copy, ) ): keep = True elif posixpath.samefile(newfile, originalfile): keep = True else: newhash = hash_file(newfile) logger.debug("File: %s already exists,%s, copy:%d", newfile, newhash, copy) orighash = hash_file(originalfile) keep = newhash == orighash if keep: logger.debug( "File: %s already exists, not overwriting, copy:%d", newfile, copy ) else: os.unlink(newfile) if not keep and use_hardlink: try: logger.debug("Linking File: %s->%s", newfile, originalfile) # Use realpath to avoid hardlinking symlinks os.link(op.realpath(originalfile), newfile) except OSError: use_hardlink = False # Disable hardlink for associated files else: keep = True if not keep and not copy and os.name == "posix": try: logger.debug("Symlinking File: %s->%s", newfile, originalfile) os.symlink(originalfile, newfile) except OSError: copy = True # Disable symlink for associated files else: keep = True if not keep: try: logger.debug("Copying File: %s->%s", newfile, originalfile) shutil.copyfile(originalfile, newfile) except shutil.Error as e: logger.warning(e.message) # Associated files if copy_related_files: related_file_pairs = ( get_related_files(f, include_this_file=False) for f in (originalfile, newfile) ) for alt_ofile, alt_nfile in zip(*related_file_pairs): if op.exists(alt_ofile): copyfile( alt_ofile, alt_nfile, copy, use_hardlink=use_hardlink, copy_related_files=False, ) return newfile
def __call__(loc, recursive=False, fast=False, all_=False, long_=False, config_file=None, list_content=False, json=None): if json: from datalad.interface.ls_webui import _ls_json if isinstance(loc, list) and not len(loc): # nothing given, CWD assumed -- just like regular ls loc = '.' kw = dict(fast=fast, recursive=recursive, all_=all_, long_=long_) if isinstance(loc, list): return [ Ls.__call__(loc_, config_file=config_file, list_content=list_content, json=json, **kw) for loc_ in loc ] # TODO: do some clever handling of kwargs as to remember what were defaults # and what any particular implementation actually needs, and then issuing # warning if some custom value/option was specified which doesn't apply to the # given url # rename to not angry Python gods who took all_ good words kw['long_'] = kw.pop('long_') loc_type = "unknown" if loc.startswith('s3://'): return _ls_s3(loc, config_file=config_file, list_content=list_content, **kw) elif lexists(loc): if isdir(loc): ds = Dataset(loc) if ds.is_installed(): return _ls_json(loc, json=json, ** kw) if json else _ls_dataset(loc, **kw) loc_type = False else: loc_type = "dir" # we know that so far for sure # it might have been an uninstalled dataset within super-dataset superds = ds.get_superdataset() if superds: try: subdatasets = Ls._cached_subdatasets[superds.path] except KeyError: subdatasets = Ls._cached_subdatasets[superds.path] \ = superds.subdatasets(result_xfm='relpaths') if relpath(ds.path, superds.path) in subdatasets: loc_type = "not installed" else: loc_type = "file" # could list properties -- under annex or git, either clean/dirty # etc # repo = get_repo_instance(dirname(loc)) if loc_type: #raise ValueError("ATM supporting only s3:// URLs and paths to local datasets") # TODO: unify all_ the output here -- _ls functions should just return something # to be displayed ui.message("{} {}".format( ansi_colors.color_word(loc, ansi_colors.DATASET), ansi_colors.color_word( loc_type, ansi_colors.RED if loc_type in {'unknown', 'not installed'} else ansi_colors.BLUE)))
def __call__(cmd=None, dataset=None, inputs=None, outputs=None, expand=None, explicit=False, message=None, sidecar=None, jobcfg='default', submit=False): # TODO makes sure a different rel_pwd is handled properly on the remote end pwd, rel_pwd = get_command_pwds(dataset) ds = require_dataset(dataset, check_installed=True, purpose='preparing a remote command execution') try: cmd_expanded = format_command(ds, cmd, pwd=pwd, dspath=ds.path, inputs=inputs, outputs=outputs) except KeyError as exc: yield get_status_dict( 'htcprepare', ds=ds, status='impossible', message=('command has an unrecognized placeholder: %s', exc)) return transfer_files_list = ['pre.sh', 'post.sh'] # where all the submission packs live subroot_dir = get_submissions_dir(ds) subroot_dir.mkdir(parents=True, exist_ok=True) # location of to-be-created submission submission_dir = ut.Path( tempfile.mkdtemp(prefix='submit_', dir=text_type(subroot_dir))) submission = submission_dir.name[7:] split_cmd = shlex.split(cmd_expanded) # is this a singularity job? singularity_job = get_singularity_jobspec(split_cmd) if not singularity_job: with (submission_dir / 'runner.sh').open('wb') as f: f.write( resource_string('datalad_htcondor', 'resources/scripts/runner_direct.sh')) job_args = split_cmd else: # link the container into the submission dir (submission_dir / 'singularity.simg').symlink_to( ut.Path(singularity_job[0]).resolve()) transfer_files_list.append('singularity.simg') # arguments of the job job_args = singularity_job[1] job_args.insert(0, 'singularity.simg') # TODO conditional on run_as_user=false with (submission_dir / 'runner.sh').open('wb') as f: f.write( resource_string( 'datalad_htcondor', 'resources/scripts/runner_singularity_anon.sh')) make_executable(submission_dir / 'runner.sh') # htcondor wants the log dir to exist at submit time # TODO ATM we only support a single job per cluster submission (submission_dir / 'job_0' / 'logs').mkdir(parents=True) # TODO make job pre/post script selection configurable with (submission_dir / 'pre.sh').open('wb') as f: f.write( resource_string('datalad_htcondor', 'resources/scripts/pre_posix_chirp.sh')) make_executable(submission_dir / 'pre.sh') with (submission_dir / 'post.sh').open('wb') as f: f.write( resource_string('datalad_htcondor', 'resources/scripts/post_posix.sh')) make_executable(submission_dir / 'post.sh') # API support selection (bound dataset methods and such) # internal import to avoid circularities from datalad.api import ( rev_status as status, ) inputs = GlobbedPaths(inputs, pwd=pwd) prepare_inputs(ds, inputs) # it could be that an input expression does not expand, # because it doesn't match anything. In such a case # we need to filter out such globs to not confuse # the status() call below that only takes real paths inputs = [p for p in inputs.expand(full=True) if op.lexists(p)] # now figure out what matches the remaining paths in the # entire repo and dump a list of files to transfer if inputs: with (submission_dir / 'input_files').open('w') as f: # TODO disable output renderer for p in ds.rev_status( path=inputs, # TODO do we really want that True? I doubt it # this might pull in the world recursive=False, # we would have otherwise no idea untracked='no', result_renderer=None): f.write(text_type(p['path'])) f.write(u'\0') transfer_files_list.append('input_files') if outputs: # write the output globs to a file for eval on the execute # side # XXX we may not want to eval them on the remote side # at all, however. 
This would make things different # than with local execute, where we also just write to # a dataset and do not have an additional filter (submission_dir / 'output_globs').write_text( # we need a final trailing delimiter as a terminator u'\0'.join(outputs) + u'\0') transfer_files_list.append('output_globs') (submission_dir / 'source_dataset_location').write_text(text_type(ds.pathobj) + op.sep) transfer_files_list.append('source_dataset_location') with (submission_dir / 'cluster.submit').open('w') as f: f.write( submission_template.format( executable='runner.sh', # TODO if singularity_job else 'job.sh', transfer_files_list=','.join( op.join(op.pardir, f) for f in transfer_files_list), **submission_defaults)) f.write(u'\narguments = "{}"\nqueue\n'.format( # TODO deal with single quotes in the args ' '.join("'{}'".format(a) for a in job_args))) # dump the run command args into a file for re-use # when the result is merged # include even args that are already evaluated and # acted upon, to be able to convince `run` to create # a full run record that maybe could be re-run # locally json_py.dump( dict( cmd=cmd, inputs=inputs, outputs=outputs, expand=expand, explicit=explicit, message=message, sidecar=sidecar, # report the PWD to, to given `run` a chance # to be correct after the fact pwd=pwd, ), text_type(submission_dir / 'runargs.json')) # we use this file to inspect what state this submission is in (submission_dir / 'status').write_text(u'prepared') yield get_status_dict(action='htc_prepare', status='ok', refds=text_type(ds.pathobj), submission=submission, path=text_type(submission_dir), logger=lgr) if submit: try: Runner(cwd=text_type(submission_dir)).run( ['condor_submit', 'cluster.submit'], log_stdout=False, log_stderr=False, expect_stderr=True, expect_fail=True, ) (submission_dir / 'status').write_text(u'submitted') yield get_status_dict(action='htc_submit', status='ok', submission=submission, refds=text_type(ds.pathobj), path=text_type(submission_dir), logger=lgr) except CommandError as e: yield get_status_dict(action='htc_submit', status='error', submission=submission, message=('condor_submit failed: %s', exc_str(e)), refds=text_type(ds.pathobj), path=text_type(submission_dir), logger=lgr)
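# Small stand-alone sketch of the NUL-delimited list convention used above for
# input_files and output_globs: file names may contain spaces or even newlines,
# so '\0' is the safe separator, with a trailing '\0' as terminator.
# The file name and paths here are invented for illustration.
paths = ['sub-01/anat/T1w.nii.gz', 'odd name with spaces.txt']
with open('input_files.sketch', 'w') as f:
    for p in paths:
        f.write(p)
        f.write('\0')
with open('input_files.sketch') as f:
    assert [p for p in f.read().split('\0') if p] == paths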
def download_file(self, file_id, path, existing="error", attrs=None, digests=None): """ Parameters ---------- digests: dict, optional possible checksums or other digests provided for the file. Only one will be used to verify download """ if op.lexists(path): msg = f"File {path!r} already exists" if existing == "error": raise FileExistsError(msg) elif existing == "skip": lgr.info(msg + " skipping") return elif existing == "overwrite": pass elif existing == "refresh": remote_file_mtime = self._get_file_mtime(attrs) if remote_file_mtime is None: lgr.warning( f"{path!r} - no mtime or ctime in the record, redownloading" ) else: stat = os.stat(op.realpath(path)) same = [] if is_same_time(stat.st_mtime, remote_file_mtime): same.append("mtime") if "size" in attrs and stat.st_size == attrs["size"]: same.append("size") if same == ["mtime", "size"]: # TODO: add recording and handling of .nwb object_id lgr.info(f"{path!r} - same time and size, skipping") return lgr.debug( f"{path!r} - same attributes: {same}. Redownloading") destdir = op.dirname(path) os.makedirs(destdir, exist_ok=True) # suboptimal since # 1. downloads into TMPDIR which might lack space etc. If anything, we # might tune up setting/TMPDIR at the # level of download so it goes alongside with the target path # (e.g. under .FILENAME.dandi-download). That would speed things up # when finalizing the download, possibly avoiding `mv` across partitions # 2. unlike upload it doesn't use a callback but relies on a context # manager to be called with an .update. also it uses only filename # in the progressbar label # For starters we would do this implementation but later RF # when RF - do not forget to remove progressReporterCls in __init__ # Will do 3 attempts to avoid some problems due to flaky/overloaded # connections, see https://github.com/dandi/dandi-cli/issues/87 for attempt in range(3): try: self.downloadFile(file_id, path) break except gcl.HttpError as exc: if is_access_denied(exc) or attempt >= 2: raise # sleep a little and retry lgr.debug( "Failed to download on attempt#%d, will sleep a bit and retry", attempt, ) time.sleep(random.random() * 5) # It seems that above call does not care about setting either mtime if attrs: mtime = self._get_file_mtime(attrs) if mtime: os.utime(path, (time.time(), mtime.timestamp())) if digests: # Pick the first one (ordered according to speed of computation) for algo in metadata_digests: if algo in digests: break else: algo = list(digests)[:1] # first available digest = Digester([algo])(path)[algo] if digests[algo] != digest: lgr.warning( "%s %s is different: downloaded %s, should have been %s.", path, algo, digest, digests[algo], ) else: lgr.debug("Verified that %s has correct %s %s", path, algo, digest)
def _download_file(downloader, path, size=None, mtime=None, existing="error", digests=None): """Common logic for downloading a single file Generator downloader: TODO: describe expected records it should yield - progress - error - completion Parameters ---------- downloader: callable returning a generator A backend-specific fixture for downloading some file into path. It should be a generator yielding downloaded blocks. size: int, optional Target size if known digests: dict, optional possible checksums or other digests provided for the file. Only one will be used to verify download """ if op.lexists(path): block = f"File {path!r} already exists" if existing == "error": raise FileExistsError(block) elif existing == "skip": yield _skip_file("already exists") return elif existing == "overwrite": pass elif existing == "refresh": if mtime is None: lgr.warning( f"{path!r} - no mtime or ctime in the record, redownloading" ) else: stat = os.stat(op.realpath(path)) same = [] if is_same_time(stat.st_mtime, mtime): same.append("mtime") if size is not None and stat.st_size == size: same.append("size") # TODO: use digests if available? or if e.g. size is identical # but mtime is different if same == ["mtime", "size"]: # TODO: add recording and handling of .nwb object_id yield _skip_file("same time and size") return lgr.debug( f"{path!r} - same attributes: {same}. Redownloading") if size is not None: yield {"size": size} destdir = op.dirname(path) os.makedirs(destdir, exist_ok=True) yield {"status": "downloading"} algo, digester, digest, downloaded_digest = None, None, None, None if digests: # choose first available for now. # TODO: reuse that sorting based on speed for algo, digest in digests.items(): if algo == "dandi-etag": from .core.digests.dandietag import ETagHashlike digester = lambda: ETagHashlike(size) # noqa: E731 else: digester = getattr(hashlib, algo, None) if digester: break if not digester: lgr.warning("Found no digests in hashlib for any of %s", str(digests)) # TODO: how do we discover the total size???? # TODO: do not do it in-place, but rather into some "hidden" file for attempt in range(3): try: if digester: downloaded_digest = digester() # start empty warned = False # I wonder if we could make writing async with downloader with DownloadDirectory(path, digests) as dldir: downloaded = dldir.offset if size is not None and downloaded == size: # Exit early when downloaded == size, as making a Range # request in such a case results in a 416 error from S3. # Problems will result if `size` is None but we've already # downloaded everything. break for block in downloader(start_at=dldir.offset): if digester: downloaded_digest.update(block) downloaded += len(block) # TODO: yield progress etc msg = {"done": downloaded} if size: if downloaded > size and not warned: warned = True # Yield ERROR? 
lgr.warning( "Downloaded %d bytes although size was told to be just %d", downloaded, size, ) msg["done%"] = 100 * downloaded / size if size else "100" # TODO: ETA etc yield msg dldir.append(block) break except requests.exceptions.HTTPError as exc: # TODO: actually we should probably retry only on selected codes, and also # respect Retry-After if attempt >= 2 or exc.response.status_code not in ( 400, # Bad Request, but happened with gider: # https://github.com/dandi/dandi-cli/issues/87 503, # Service Unavailable ): lgr.debug("Download failed: %s", exc) yield {"status": "error", "message": str(exc)} return # if is_access_denied(exc) or attempt >= 2: # raise # sleep a little and retry lgr.debug( "Failed to download on attempt#%d: %s, will sleep a bit and retry", attempt, exc, ) time.sleep(random.random() * 5) if downloaded_digest: downloaded_digest = downloaded_digest.hexdigest( ) # we care only about hex if digest != downloaded_digest: msg = f"{algo}: downloaded {downloaded_digest} != {digest}" yield {"checksum": "differs", "status": "error", "message": msg} lgr.debug("%s is different: %s.", path, msg) return else: yield {"checksum": "ok"} lgr.debug("Verified that %s has correct %s %s", path, algo, digest) else: # shouldn't happen with more recent metadata etc yield { "checksum": "-", # "message": "no digests were provided" } # TODO: dissolve attrs and pass specific mtime? if mtime: yield {"status": "setting mtime"} os.utime(path, (time.time(), ensure_datetime(mtime).timestamp())) yield {"status": "done"}
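# Condensed sketch (names invented) of the `existing` policy shared by the two
# download helpers above: error / skip / overwrite / refresh, where "refresh"
# re-downloads unless both mtime and size match the remote record.
import os
import os.path as op

def existing_policy(path, existing, remote_mtime=None, remote_size=None):
    if not op.lexists(path):
        return 'download'
    if existing == 'error':
        raise FileExistsError(path)
    if existing == 'skip':
        return 'skip'
    if existing == 'overwrite':
        return 'download'
    if existing == 'refresh':
        st = os.stat(op.realpath(path))
        if remote_mtime is not None and int(st.st_mtime) == int(remote_mtime) \
                and st.st_size == remote_size:
            return 'skip'
        return 'download'
    raise ValueError(f"unknown value for existing: {existing!r}")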
def save_dataset( ds, paths=None, message=None, version_tag=None): """Save changes in a single dataset. Parameters ---------- ds : Dataset The dataset to be saved. paths : list, optional Paths to dataset components to be saved. message: str, optional (Commit) message to be attached to the saved state. version_tag : str, optional Tag to be assigned to the saved state. Returns ------- bool Whether a new state was saved. If all to be saved content was unmodified no new state will be saved. """ # XXX paths must be in the given ds, no further sanity checks! # make sure that all pending changes (batched annex operations, etc.) # are actually reflected in Git ds.repo.precommit() # track what is to be committed, so it becomes # possible to decide when/what to save further down # and one level up orig_hexsha = ds.repo.get_hexsha() # always yields list; empty if None files = list( set( [opj(ds.path, f) if not isabs(f) else f for f in assure_list(paths)])) # try to consider existing and changed files, and prevent untracked # files from being added # XXX not acting upon untracked files would be very expensive, because # I see no way to avoid using `add` below and git annex has no equivalent # to git add's --update -- so for now don't bother # XXX alternatively we could consider --no-ignore-removal to also # have it delete any already vanished files # asking yourself why we need to `add` at all? For example, freshly # unlocked files in a v5 repo are listed as "typechange" and commit # refuses to touch them without an explicit `add` tostage = [f for f in files if lexists(f)] if tostage: lgr.debug('staging files for commit: %s', tostage) if isinstance(ds.repo, AnnexRepo): # to make this work without calling `git add` in addition, # this needs git-annex v6.20161210 (see #1027) ds.repo.add(tostage, commit=False) else: # --update will ignore any untracked files, sadly git-annex add # above does not # will complain about vanished files though, filter them here, but # keep them for a later commit call ds.repo.add(tostage, git_options=['--update'], commit=False) _datalad_msg = False if not message: message = 'Recorded existing changes' _datalad_msg = True if files or ds.repo.repo.is_dirty( index=True, working_tree=False, untracked_files=False, submodules=True): # either we have an explicit list of files, or we have something # stages otherwise do not attempt to commit, as the underlying # repo will happily commit any non-change # not checking the working tree or untracked files should make this # relavtively cheap # TODO: commit() should rather report a dedicated ValueError # waiting for #1170 from datalad.support.exceptions import CommandError try: # we will blindly call commit not knowing if there is anything to # commit -- this is cheaper than to anticipate all possible ways # a repo in whatever mode is dirty # however, if nothing is dirty the whining wil start # --> sucking it up right here with swallow_logs(new_level=logging.ERROR) as cml: ds.repo.commit(message, options=files, _datalad_msg=_datalad_msg) except CommandError as e: # TODO until #1171 is resolved, test here for "normal" failure # to commit if 'nothing to commit' in str(e): lgr.debug( "Was instructed to commit %s files but repository is not dirty", files) elif 'no changes added to commit': lgr.info( 'Nothing to save') else: # relay any prior whining in the exception raise ValueError('{} [error log follows] {}; {}'.format( e, cml.out, cml.err)) # MIH: let's tag even if there was nothing commit. I'd forget this # option too often... 
if version_tag: ds.repo.tag(version_tag) _was_modified = ds.repo.get_hexsha() != orig_hexsha return ds.repo.repo.head.commit if _was_modified else None
def _replace_file(str_src, dest, str_dest, follow_symlinks): if op.lexists(str_dest): dest.unlink() else: dest.parent.mkdir(exist_ok=True, parents=True) copyfile(str_src, str_dest, follow_symlinks=follow_symlinks)
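# Hedged usage sketch of _replace_file() above, assuming it is importable from
# the same module and that copyfile here is shutil.copyfile: the destination
# parent is created on demand, and any pre-existing entry (even a dangling
# symlink) is unlinked first. Paths are throwaway temp paths.
import tempfile
from pathlib import Path

_src = Path(tempfile.mkdtemp()) / 'src.txt'
_src.write_text('payload')
_dst = Path(tempfile.mkdtemp()) / 'nested' / 'dst.txt'
_replace_file(str(_src), _dst, str(_dst), follow_symlinks=True)
assert _dst.read_text() == 'payload'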
def get_paths_by_dataset(paths, recursive=False, recursion_limit=None, out=None, dir_lookup=None): """Sort a list of paths per dataset they are contained in. Any paths that are not part of a dataset, or presently unavailable are reported. Parameter --------- paths : sequence A sequence of path specifications to sort. recursive : bool Flag whether to report subdatasets under any of the given paths recursion_limit : Depth constraint for recursion. See `Dataset.get_subdatasets()` for more information. out : dict or None By default a new output dictionary is created, however an existing one can be provided via this argument to enable incremental processing. dir_lookup : dict or None Optional lookup cache that maps paths to previously determined datasets. This can speed up repeated processing. Returns ------- Tuple(dict, list, list) Dict of `existing dataset path`: `path` mappings, the list of currently non-existing paths (possibly matching currently uninstalled datasets), and any paths that are not part of any dataset. """ # sort paths into the respective datasets if dir_lookup is None: dir_lookup = {} if out is None: out = {} # paths that don't exist (yet) unavailable_paths = [] nondataset_paths = [] for path in paths: if not lexists(path): # not there yet, impossible to say which ds it will actually # be in, if any unavailable_paths.append(path) continue # the path exists in some shape or form if isdir(path): # this could contain all types of additional content d = path else: # for everything else we are interested in the container d = dirname(path) if not d: d = curdir # this could be `None` if there is no git repo dspath = dir_lookup.get(d, get_dataset_root(d)) dir_lookup[d] = dspath if not dspath: nondataset_paths.append(path) continue if isdir(path): ds = Dataset(dspath) # we need to doublecheck that this is not a subdataset mount # point, in which case get_toppath() would point to the parent smpath = ds.get_containing_subdataset( path, recursion_limit=1).path if smpath != dspath: # fix entry dir_lookup[d] = smpath # submodule still needs to be obtained unavailable_paths.append(path) continue if recursive: # make sure we get everything relevant in all _checked out_ # subdatasets, obtaining of previously unavailable subdataset # else done elsewhere subs = ds.get_subdatasets(fulfilled=True, recursive=recursive, recursion_limit=recursion_limit) for sub in subs: subdspath = opj(dspath, sub) if subdspath.startswith(_with_sep(path)): # this subdatasets is underneath the search path # we want it all # be careful to not overwrite anything, in case # this subdataset has been processed before out[subdspath] = out.get( subdspath, [subdspath]) out[dspath] = out.get(dspath, []) + [path] return out, unavailable_paths, nondataset_paths
def __call__(archive, annex=None, add_archive_leading_dir=False, strip_leading_dirs=False, leading_dirs_depth=None, leading_dirs_consider=None, use_current_dir=False, delete=False, key=False, exclude=None, rename=None, existing='fail', annex_options=None, copy=False, commit=True, allow_dirty=False, stats=None, drop_after=False, delete_after=False): """ Returns ------- annex """ if exclude: exclude = assure_tuple_or_list(exclude) if rename: rename = assure_tuple_or_list(rename) # TODO: actually I see possibly us asking user either he wants to convert # his git repo into annex archive_path = archive pwd = getpwd() if annex is None: annex = get_repo_instance(pwd, class_=AnnexRepo) if not isabs(archive): # if not absolute -- relative to wd and thus archive_path = normpath(opj(realpath(pwd), archive)) # abspath(archive) is not "good" since dereferences links in the path # archive_path = abspath(archive) elif not isabs(archive): # if we are given an annex, then assume that given path is within annex, not # relative to PWD archive_path = opj(annex.path, archive) annex_path = annex.path # _rpath below should depict paths relative to the top of the annex archive_rpath = relpath(archive_path, annex_path) # TODO: somewhat too cruel -- may be an option or smth... if not allow_dirty and annex.dirty: # already saved me once ;) raise RuntimeError( "You better commit all the changes and untracked files first") if not key: # we were given a file which must exist if not exists(archive_path): raise ValueError("Archive {} does not exist".format(archive)) # TODO: support adding archives content from outside the annex/repo origin = 'archive' key = annex.get_file_key(archive_rpath) archive_dir = dirname(archive_path) else: origin = 'key' key = archive archive_dir = None # We must not have anything to do with the location under .git/annex archive_basename = file_basename(archive) if not key: # TODO: allow for it to be under git??? how to reference then? raise NotImplementedError( "Provided file %s is not under annex. We don't support yet adding everything " "straight to git" % archive) # are we in a subdirectory of the repository? pwd_under_annex = commonprefix([pwd, annex_path]) == annex_path # then we should add content under that # subdirectory, # get the path relative to the repo top if use_current_dir: # if outside -- extract to the top of repo extract_rpath = relpath(pwd, annex_path) \ if pwd_under_annex \ else None else: extract_rpath = relpath(archive_dir, annex_path) # relpath might return '.' as the relative path to curdir, which then normalize_paths # would take as instructions to really go from cwd, so we need to sanitize if extract_rpath == curdir: extract_rpath = None # no special relpath from top of the repo # and operate from now on the key or whereever content available "canonically" try: key_rpath = annex.get_contentlocation( key) # , relative_to_top=True) except: raise RuntimeError( "Content of %s seems to be N/A. Fetch it first" % key) # now we simply need to go through every file in that archive and lgr.info("Adding content of the archive %s into annex %s", archive, annex) from datalad.customremotes.archives import ArchiveAnnexCustomRemote # TODO: shouldn't we be able just to pass existing AnnexRepo instance? 
# TODO: we will use persistent cache so we could just (ab)use possibly extracted archive annexarchive = ArchiveAnnexCustomRemote(path=annex_path, persistent_cache=True) # We will move extracted content so it must not exist prior running annexarchive.cache.allow_existing = True earchive = annexarchive.cache[key_rpath] # TODO: check if may be it was already added if ARCHIVES_SPECIAL_REMOTE not in annex.get_remotes(): init_datalad_remote(annex, ARCHIVES_SPECIAL_REMOTE, autoenable=True) else: lgr.debug("Special remote {} already exists".format( ARCHIVES_SPECIAL_REMOTE)) precommitted = False delete_after_rpath = None try: old_always_commit = annex.always_commit annex.always_commit = False if annex_options: if isinstance(annex_options, string_types): annex_options = shlex.split(annex_options) leading_dir = earchive.get_leading_directory( depth=leading_dirs_depth, exclude=exclude, consider=leading_dirs_consider) \ if strip_leading_dirs else None leading_dir_len = len(leading_dir) + len( opsep) if leading_dir else 0 # we need to create a temporary directory at the top level which would later be # removed prefix_dir = basename(tempfile.mktemp(prefix=".datalad", dir=annex_path)) \ if delete_after \ else None # dedicated stats which would be added to passed in (if any) outside_stats = stats stats = ActivityStats() for extracted_file in earchive.get_extracted_files(): stats.files += 1 extracted_path = opj(earchive.path, extracted_file) if islink(extracted_path): link_path = realpath(extracted_path) if not exists( link_path ): # TODO: config addarchive.symlink-broken='skip' lgr.warning("Path %s points to non-existing file %s" % (extracted_path, link_path)) stats.skipped += 1 continue # TODO: check if points outside of the archive -- warning and skip # preliminary target name which might get modified by renames target_file_orig = target_file = extracted_file # strip leading dirs target_file = target_file[leading_dir_len:] if add_archive_leading_dir: target_file = opj(archive_basename, target_file) if rename: target_file = apply_replacement_rules(rename, target_file) # continue to next iteration if extracted_file in excluded if exclude: try: # since we need to skip outside loop from inside loop for regexp in exclude: if re.search(regexp, extracted_file): lgr.debug( "Skipping {extracted_file} since contains {regexp} pattern" .format(**locals())) stats.skipped += 1 raise StopIteration except StopIteration: continue if prefix_dir: target_file = opj(prefix_dir, target_file) # but also allow for it in the orig target_file_orig = opj(prefix_dir, target_file_orig) target_file_path_orig = opj(annex.path, target_file_orig) url = annexarchive.get_file_url( archive_key=key, file=extracted_file, size=os.stat(extracted_path).st_size) # lgr.debug("mv {extracted_path} {target_file}. URL: {url}".format(**locals())) target_file_path = opj(extract_rpath, target_file) \ if extract_rpath else target_file target_file_path = opj(annex.path, target_file_path) if lexists(target_file_path): handle_existing = True if md5sum(target_file_path) == md5sum(extracted_path): if not annex.is_under_annex(extracted_path): # if under annex -- must be having the same content, # we should just add possibly a new extra URL # but if under git -- we cannot/should not do # anything about it ATM if existing != 'overwrite': continue else: handle_existing = False if not handle_existing: pass # nothing... just to avoid additional indentation elif existing == 'fail': raise RuntimeError( "File {} already exists, but new (?) 
file {} was instructed " "to be placed there while overwrite=False".format( target_file_path, extracted_file)) elif existing == 'overwrite': stats.overwritten += 1 # to make sure it doesn't conflict -- might have been a tree rmtree(target_file_path) else: target_file_path_orig_ = target_file_path # To keep extension intact -- operate on the base of the filename p, fn = os.path.split(target_file_path) ends_with_dot = fn.endswith('.') fn_base, fn_ext = file_basename(fn, return_ext=True) if existing == 'archive-suffix': fn_base += '-%s' % archive_basename elif existing == 'numeric-suffix': pass # archive-suffix will have the same logic else: raise ValueError(existing) # keep incrementing index in the suffix until file doesn't collide suf, i = '', 0 while True: target_file_path_new = opj( p, fn_base + suf + ('.' if (fn_ext or ends_with_dot) else '') + fn_ext) if not lexists(target_file_path_new): break lgr.debug("File %s already exists" % target_file_path_new) i += 1 suf = '.%d' % i target_file_path = target_file_path_new lgr.debug("Original file %s will be saved into %s" % (target_file_path_orig_, target_file_path)) # TODO: should we reserve smth like # stats.clobbed += 1 if target_file_path != target_file_path_orig: stats.renamed += 1 #target_path = opj(getpwd(), target_file) if copy: raise NotImplementedError( "Not yet copying from 'persistent' cache") else: # os.renames(extracted_path, target_path) # addurl implementation relying on annex'es addurl below would actually copy pass lgr.debug( "Adding %s to annex pointing to %s and with options %r", target_file_path, url, annex_options) out_json = annex.add_url_to_file(target_file_path, url, options=annex_options, batch=True) if 'key' in out_json and out_json[ 'key'] is not None: # annex.is_under_annex(target_file, batch=True): # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated # we need to maintain a list of those to be dropped files if drop_after: annex.drop_key(out_json['key'], batch=True) stats.dropped += 1 stats.add_annex += 1 else: lgr.debug( "File {} was added to git, not adding url".format( target_file_path)) stats.add_git += 1 if delete_after: # delayed removal so it doesn't interfer with batched processes since any pure # git action invokes precommit which closes batched processes. But we like to count stats.removed += 1 # # chaining 3 annex commands, 2 of which not batched -- less efficient but more bullet proof etc # annex.add(target_path, options=annex_options) # # above action might add to git or to annex # if annex.file_has_content(target_path): # # if not -- it was added to git, if in annex, it is present and output is True # annex.add_url_to_file(target_file, url, options=['--relaxed'], batch=True) # stats.add_annex += 1 # else: # lgr.debug("File {} was added to git, not adding url".format(target_file)) # stats.add_git += 1 # # TODO: actually check if it is anyhow different from a previous version. If not # # then it wasn't really added del target_file # Done with target_file -- just to have clear end of the loop if delete and archive and origin != 'key': lgr.debug("Removing the original archive {}".format(archive)) # force=True since some times might still be staged and fail annex.remove(archive_rpath, force=True) lgr.info("Finished adding %s: %s" % (archive, stats.as_str(mode='line'))) if outside_stats: outside_stats += stats if delete_after: # force since not committed. 
r=True for -r (passed into git call # to recurse) delete_after_rpath = opj( extract_rpath, prefix_dir) if extract_rpath else prefix_dir lgr.debug("Removing extracted and annexed files under %s", delete_after_rpath) annex.remove(delete_after_rpath, r=True, force=True) if commit: commit_stats = outside_stats if outside_stats else stats annex.precommit( ) # so batched ones close and files become annex symlinks etc precommitted = True if annex.is_dirty(untracked_files=False): annex.commit( "Added content extracted from %s %s\n\n%s" % (origin, archive, commit_stats.as_str(mode='full')), _datalad_msg=True) commit_stats.reset() finally: # since we batched addurl, we should close those batched processes # if haven't done yet. explicitly checked to avoid any possible # "double-action" if not precommitted: annex.precommit() if delete_after_rpath: delete_after_path = opj(annex_path, delete_after_rpath) if exists(delete_after_path): # should not be there # but for paranoid yoh lgr.warning( "Removing temporary directory under which extracted " "files were annexed and should have been removed: %s", delete_after_path) rmtree(delete_after_path) annex.always_commit = old_always_commit # remove what is left and/or everything upon failure earchive.clean(force=True) return annex
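# Stand-alone, simplified sketch of the name-collision handling above: keep
# appending a numeric suffix before the extension until no file (or dangling
# link) sits at the candidate path. The real code also supports an
# archive-derived suffix; this version only does the numeric one.
from os.path import lexists, splitext

def free_name(path):
    if not lexists(path):
        return path
    base, ext = splitext(path)
    i = 1
    while lexists('%s.%d%s' % (base, i, ext)):
        i += 1
    return '%s.%d%s' % (base, i, ext)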
def tearDown(self):
    rm_rf(self.prefix)
    assert not lexists(self.prefix)
def makeLinks(self):
    full = self.options.get("full")
    short = self.options.get("short")
    addfold = self.options.get("addfold")
    fold = self.options.get("fold")
    useServiceNameLinks = self.options.get("useServiceNameLinks")
    useHardLinks = self.options.get("useHardLinks")

    piconLinks = {}
    linksMade = 0

    commentRe = re.compile('#.*')
    for line in self.servrefFile:
        line = commentRe.sub('', line).rstrip()
        if not line:
            continue
        F = line.split()
        if len(F) > 3:
            print >> stderr, "Too many fields in server reference file:", line
            continue
        if len(F) < 3:
            print >> stderr, "Too few fields in server reference file:", line
            continue
        servRef, serviceName, picon = F
        servRefName = servRef
        servRefParts = servRefName.split(':')[0:10]
        servRefs = []
        if useServiceNameLinks:
            servRefs.append([serviceName])
        if full or addfold:
            servRefs.append(servRefParts)
        if short:
            servRefs.append(servRefParts[0:1] + servRefParts[3:7])
        if addfold and (int(servRefParts[0]) & ~0x0100) == 1:
            stype = int(servRefParts[2], 16)
            if stype not in (0x1, 0x2, 0xA):
                servRefPartsFold = servRefParts[:]
                servRefPartsFold[2] = "1"
                servRefs.append(servRefPartsFold)
            # Fake up servicref 0x2 & 0xA for ABC news Radio
            if stype in (0x2, 0xA) and int(servRefParts[5], 16) in (0x1010, 0x3201) and int(servRefParts[3], 16) & 0xF == 0xF:
                servRefPartsFold = servRefParts[:]
                servRefPartsFold[2] = "2" if stype == 0xA else "A"
                servRefs.append(servRefPartsFold)  # Fake up servicref 0x2 & 0xA for ABC news Radio
        if fold and (int(servRefParts[0]) & ~0x0100) == 1:
            stype = int(servRefParts[2], 16)
            if stype not in (0x1, 0x2, 0xA):
                servRefPartsFold = servRefParts[:]
                servRefPartsFold[2] = "1"
                servRefs.append(servRefPartsFold)
        for srp in servRefs:
            servRefName = '_'.join(srp) + '.png'
            if piconLinks.get(servRefName) == picon:
                continue
            if servRefName not in piconLinks:
                linked = False
                servRefPath = path.join(self.piconPath, servRefName)
                exists = path.exists(servRefPath)
                alreadyOverridden = servRefPath in self.overrides
                if exists and self.isOverride(servRefPath):
                    if not alreadyOverridden:
                        print >> stderr, "Picon", picon, "over-ridden by specific servref icon", servRefName
                    continue
                lexists = exists or path.lexists(servRefPath)
                if (not exists or lexists) and picon in self.piconFiles:
                    piconName, piconRef = self.piconFiles[picon]
                    piconPath = path.join(self.CHAN_PICON_DIR, piconName)
                    if useHardLinks:
                        piconPath = path.join(self.piconPath, piconPath)
                    if servRefName in self.origPiconLinks:
                        if self.origPiconLinks[servRefName] == piconRef:
                            linked = True
                        del self.origPiconLinks[servRefName]
                    if not linked:
                        try:
                            if lexists:
                                remove(servRefPath)
                            linksMade += 1
                            if useHardLinks:
                                link(piconPath, servRefPath)
                            else:
                                symlink(piconPath, servRefPath)
                            linked = True
                        except Exception as err:
                            print >> stderr, ("Link" if useHardLinks else "Symlink"), piconName, "->", servRefName, "failed -", str(err)
                    if linked:
                        self.linkedPiconNames.add(piconName)
                        piconLinks[servRefName] = picon
                else:
                    if picon not in ("tba", "tobeadvised"):
                        print >> stderr, "No picon", picon, "for", servRef
            else:
                print >> stderr, "Servref link", servRef, "->", piconLinks[servRefName], "exists; new link requested for", picon
    self.servrefFile.close()
    print >> stderr, "linksMade:", linksMade
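
# A minimal sketch (not part of the original) of the replace-then-link pattern
# makeLinks() relies on: drop any stale entry first (lexists also catches
# dangling symlinks), then create either a hard link or a symlink.  The helper
# name `relink` is hypothetical.
import os
from os import path

def relink(src, dst, use_hardlink=False):
    if path.lexists(dst):
        os.remove(dst)
    if use_hardlink:
        os.link(src, dst)      # hard link: src must be an existing file
    else:
        os.symlink(src, dst)   # symlink: src may even be a not-yet-existing target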
def __call__(
        path=None,
        *,
        dataset=None,
        drop='datasets',
        reckless=None,
        message=None,
        jobs=None,
        # deprecated below
        recursive=None,
        check=None,
        save=None,
        if_dirty=None):
    # deprecation checks
    if if_dirty is not None:
        warnings.warn(
            "The `if_dirty` argument of `datalad remove` is ignored, "
            "it can be removed for a safe-by-default behavior. For "
            "other cases consider the `reckless` argument.",
            DeprecationWarning)
    if save is not None:
        warnings.warn(
            "The `save` argument of `datalad remove` is ignored. "
            "A dataset modification is always saved. Consider "
            "`save --amend` if post-remove fix-ups are needed.",
            DeprecationWarning)
    if recursive is not None:
        warnings.warn(
            "The `recursive` argument of `datalad remove` is ignored. "
            "Removal operations are always recursive, and the parameter "
            "can be stripped from calls for a safe-by-default behavior. ",
            DeprecationWarning)
    if check is not None:
        warnings.warn(
            "The `check` argument of `datalad remove` is deprecated, "
            "use the `reckless` argument instead.",
            DeprecationWarning)
        if check is False:
            if reckless is not None:
                raise ValueError(
                    'Must not use deprecated `check` argument, and new '
                    '`reckless` argument together with `datalad remove`.')
            reckless = 'availability'

    refds = require_dataset(dataset, check_installed=True,
                            purpose='remove')
    # same path resolution that drop will do
    paths_by_ds, errors = get_paths_by_ds(
        refds, dataset, ensure_list(path),
        # super-mode will readily tell us which datasets to
        # save at the end
        subdsroot_mode='super')

    drop_success = True
    for res in Drop.__call__(
            dataset=dataset,
            path=path,
            what=drop,
            reckless=reckless,
            recursive=True,
            recursion_limit=None,
            jobs=jobs,
            result_xfm=None,
            return_type='generator',
            result_renderer='disabled',
            # delegate error handling here
            on_failure='ignore'):
        if res.get('status') not in ('ok', 'notneeded'):
            drop_success = False
        yield res

    if not drop_success:
        # there will be 'rm -rf' below, so play safe
        lgr.debug('Observed drop failure, will not attempt remove')
        return

    for dpath, paths in paths_by_ds.items():
        for delpath in ([dpath] if paths is None else paths):
            if lexists(str(delpath)):
                # here we still have something around on the
                # filesystem. There is no need to fiddle with
                # Git, just wipe it out. A later save() will
                # act on it properly
                if delpath.is_dir():
                    lgr.debug('Remove directory: %s', delpath)
                    rmtree(delpath)
                # cannot use .exists(), must foresee dead symlinks
                else:
                    lgr.debug('Remove file: %s', delpath)
                    delpath.unlink()
                continue
            # if we get here, there is nothing on the file system
            # anymore at this path. Either because the parent
            # dataset vanished already, or because we dropped a
            # dataset, and it still needs to be unregistered
            # from its parent -> `git rm`
            if dpath.exists():
                GitRepo(dpath).call_git(
                    # no need for recursion, we know that even the root
                    # path no longer exists
                    ['rm', '-q'],
                    files=[str(delpath.relative_to(dpath))])
            # this path was already being removed by drop
            # so it must belong to a dropped dataset
            # save won't report about this, let's do it
            yield dict(
                action='remove',
                status='ok',
                path=str(delpath),
                type='dataset',
            )

    if not refds.is_installed():
        # we already dropped the whole thing
        return

    for res in Save.__call__(
            dataset=dataset,
            path=path,
            # we might have removed the reference dataset by now, recheck
            message=message if message else '[DATALAD] removed content',
            return_type='generator',
            result_renderer='disabled',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        if res.get('action') == 'delete':
            # normalize to previous remove results
            res['action'] = 'remove'
        yield res
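
# A minimal sketch (not part of the original) of the filesystem wipe step
# above: `lexists` is checked first because a dangling symlink makes
# `exists()`/`Path.exists()` return False while something is still present
# on disk.  The helper name `wipe_path` is hypothetical and uses
# `shutil.rmtree` instead of the project's own rmtree wrapper.
from os.path import lexists
from pathlib import Path
from shutil import rmtree

def wipe_path(p):
    p = Path(p)
    if not lexists(str(p)):
        return False            # nothing on disk, not even a dead symlink
    if p.is_dir() and not p.is_symlink():
        rmtree(str(p))          # real directory tree
    else:
        p.unlink()              # file, or symlink (dead or alive)
    return True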