def downloadRun(run, myAPI, dryRun, files=None, force=False):
    """Download the files of a run, or only total their sizes on a dry run.

    The API hands out at most 1024 items per request, so the file listing is
    walked page by page, advancing the offset by one page each time, until an
    empty page comes back (or every requested file has been found).

    Args:
        run: run object; must provide ``getFiles`` and ``ExperimentName``.
        myAPI: API handle forwarded to every request and download.
        dryRun: when true, sizes are accumulated but nothing is written.
        files: optional collection of file names to restrict the download to.
            Each requested name is matched at most once. The caller's
            collection is left untouched.
        force: when true, download even if a file with the same name is
            already present in the destination directory.

    Returns:
        The sum of the ``Size`` attribute of every selected file, including
        files that were skipped because they already exist locally.
    """
    # the limit can be adjusted as long as the offset advances by the same amount
    pageSize = 1024

    # Work on a private copy: matched names are removed as they are found, and
    # that bookkeeping must not leak into the caller's list.
    wanted = list(files) if files else []
    fileSel = bool(wanted)  # did the user select files?

    totalSize = 0
    page = 0
    # todo: insert regex matching to pull down only those required for demultiplex
    while True:
        pageFiles = run.getFiles(
            myAPI,
            QueryParameters.QueryParameters({
                'Limit': pageSize,
                'Offset': pageSize * page
            }))
        if len(pageFiles) == 0:
            break

        for fn in pageFiles:
            if fileSel:
                if fn.Name not in wanted:
                    # user selected some particular files, but this isn't one of them
                    continue
                # we found it! cut it from the list
                wanted.remove(fn.Name)

            totalSize += fn.__dict__['Size']
            if dryRun:
                continue

            savePath = str(run) + "/" + pathFromFile(fn, myAPI)
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            if not force and os.path.exists(os.path.join(savePath, fn.Name)):
                print("already have " + savePath + fn.Name + ". Skipping...")
                continue
            fn.downloadFile(myAPI, savePath)

        if fileSel and not wanted:
            # user selected some file(s) and we found them all; stop paging
            break
        page += 1

    if wanted:
        # something was selected and never found
        print("warning: could not find these selected files")
        for name in wanted:
            print('\t' + name)

    print(
        humanFormat(totalSize) + '\t' + str(run) + '\t' +
        str(run.ExperimentName))
    return totalSize
def main():
    """Command-line entry point: download BAM or FASTQ files of the named projects."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-p', '--profile',
        default="DEFAULT",
        help="the .basespacepy.cfg profile to load")
    cli.add_argument(
        '-d', '--dry',
        action='store_true',
        default=False,
        help="dry run; return size of selected items")
    cli.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help="force overwrite; otherwise cat counters on new filenames")
    cli.add_argument(
        '-j', '--project',
        required=True,
        nargs="+",
        help="project to download; can accept multiple values")
    cli.add_argument(
        '-t', '--type',
        choices=['b','f','bam','fastq'],
        default='f',
        help='type of file to download')
    opts = cli.parse_args()

    myAPI = BaseSpaceAPI(profile=opts.profile, timeout=500)
    currentUser = myAPI.getUserById('current')
    pageQuery = QueryParameters.QueryParameters({'Limit':1024})
    allProjects = currentUser.getProjects(myAPI, pageQuery)

    # argparse 'choices' restricts --type to the four spellings, so anything
    # that is not a BAM spelling is a FASTQ spelling.
    wantsBam = opts.type in ('b', 'bam')
    fetch = downloadProjectBam if wantsBam else downloadProjectFastq

    matched = stringsToBSObj(allProjects, opts.project)

    # report every requested name that did not resolve to a project
    matchedNames = set(str(proj) for proj in matched)
    for missing in set(opts.project) - matchedNames:
        warning("cannot find " + str(missing))

    grandTotal = 0
    for proj in matched:
        grandTotal = grandTotal + fetch(proj, myAPI, opts.dry, force=opts.force)

    # a grand total only adds information when several projects were fetched
    if len(matched) > 1:
        print(humanFormat(grandTotal) + "\tTotal")
def downloadProjectBam(project, myAPI, dryRun, samples=[], force=False, qp=QueryParameters.QueryParameters({'Limit':1024})):
    """Download the BAM files found in a project's app results.

    For every app result the files whose string form contains "bam" are
    selected. Each file is first downloaded into a "partial" sub-directory and
    then moved to its final name, because files are fetched by ID and can only
    be renamed once they are on disk.

    Args:
        project: project object; must provide ``getAppResults`` and
            ``getSamples``.
        myAPI: API handle forwarded to every request and download.
        dryRun: when true, only print and total the file sizes; nothing is
            created on disk and no metadata is written.
        samples: optional sample selection. Selecting samples is not
            implemented yet: a non-empty value prints a notice and calls
            ``stop()``.
        force: when false, files for which ``fileExists`` returns true are
            skipped.
        qp: query parameters used for every listing request.

    Returns:
        On a dry run, the summed ``Size`` of all BAM files; otherwise the
        summed ``Size`` of the files that were actually downloaded.
    """
    totalSize = 0
    savePath = None  # stays None when no app result yields a BAM file
    projectDir = str(project).replace(" ","_")

    for result in project.getAppResults(myAPI, qp):
        bams = [x for x in result.getFiles(myAPI, qp) if "bam" in str(x)]

        if samples:
            if isinstance(samples[0], str):
                samples = stringsToBSObj(project.getSamples(myAPI, qp), samples)
            # user picked particular samples
            # subset the list of bams accordingly -- WIP, not implemented
            print("\n\nuser picked particular samples, but this isn't coded in yet\n")
            # NOTE(review): stop is not defined in this block -- presumably it
            # aborts the run; confirm it exists.
            stop()

        if not bams:
            # nothing to fetch for this result; indexing bams[0] below would fail
            continue

        savePath = projectDir + "/" + pathFromFile(bams[0], myAPI)
        tmpPath = savePath + "/partial/"
        if not dryRun:
            # a dry run only reports sizes, so it must not leave directories behind
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            if not os.path.exists(tmpPath):
                os.makedirs(tmpPath)

        for fn in bams:
            thisSize = fn.__dict__['Size']
            if dryRun:
                totalSize += thisSize
                print(humanFormat(thisSize) + '\t' + fn.Name)
                continue

            pathToFn = os.path.join(savePath, fn.Name)
            if not force and fileExists(pathToFn, fn):
                print("already have " + savePath + "/" + fn.Name + ". Skipping...")
                continue

            # If the target name is taken, append ".<n>" to avoid overwriting.
            # The counter has to live outside the loop: resetting it on every
            # pass would retry ".1" forever once that name exists.
            remoteName = os.path.basename(fn.Path)
            counter = 1
            while os.path.exists(os.path.join(savePath, fn.Name)):
                fn.Name = remoteName + "." + str(counter)
                counter += 1

            print(os.path.join(savePath, fn.Name))
            # only count the size once we know the file is being downloaded
            totalSize += thisSize
            fn.downloadFile(myAPI, tmpPath)
            shutil.move(os.path.join(tmpPath, remoteName),
                        os.path.join(savePath, fn.Name))

        # delete the temp directory if it is empty
        if os.path.exists(tmpPath) and not os.listdir(tmpPath):
            os.rmdir(tmpPath)

    if not dryRun and savePath is not None:
        # NOTE(review): this relies on downloadProjectMetadata accepting an
        # 'outdir' keyword -- confirm against its signature.
        downloadProjectMetadata(project, myAPI, samples=samples, outdir=savePath)

    print( humanFormat(totalSize) + '\t' + str(project) )
    return totalSize
def downloadProjectFastq(project, myAPI, dryRun, samples=[], force=False, qp=QueryParameters.QueryParameters({'Limit':1024})):
    """Download the files attached to a project's samples.

    Files are downloaded by ID, so there is no control over their name until
    they are on disk: each one is fetched into a "partial" sub-directory and
    moved to its final name afterwards.

    Args:
        project: project object; must provide ``getSamples``.
        myAPI: API handle forwarded to every request and download.
        dryRun: when true, only print and total the file sizes; nothing is
            created on disk and no metadata is written.
        samples: optional selection, either sample objects or sample names
            (names are resolved through ``stringsToBSObj``). When empty, all
            samples of the project are used.
        force: when false, files for which ``fileExists`` returns true are
            skipped.
        qp: query parameters used for every listing request.

    Returns:
        On a dry run, the summed ``Size`` of all files; otherwise the summed
        ``Size`` of the files that were actually downloaded.
    """
    totalSize = 0
    projectDir = str(project).replace(" ","_")

    if not samples:
        samples = project.getSamples(myAPI, qp)
    elif isinstance(samples[0], str):
        # convert sample strings to sample objects
        samples = stringsToBSObj(project.getSamples(myAPI, qp), samples)

    for sample in samples:
        fns = sample.getFiles(myAPI, qp)
        if not fns:
            # a sample without files has nothing to fetch; fns[0] would fail
            continue

        savePath = projectDir + "/" + pathFromFile(fns[0], myAPI)
        tmpPath = savePath + "/partial/"
        if not dryRun:
            # a dry run only reports sizes, so it must not leave directories behind
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            if not os.path.exists(tmpPath):
                os.makedirs(tmpPath)

        for fn in fns:
            thisSize = fn.__dict__['Size']
            if dryRun:
                totalSize += thisSize
                print(humanFormat(thisSize) + '\t' + fn.Name)
                continue

            pathToFn = os.path.join(savePath, fn.Name)
            if not force and fileExists(pathToFn, fn):
                print("already have " + savePath + fn.Name + ". Skipping...")
                continue

            # If the target name is taken, append ".<n>" to avoid overwriting.
            # The counter has to live outside the loop: resetting it on every
            # pass would retry ".1" forever once that name exists.
            remoteName = os.path.basename(fn.Path)
            counter = 1
            while os.path.exists(os.path.join(savePath, fn.Name)):
                fn.Name = remoteName + "." + str(counter)
                counter += 1

            # skip addition until we know this will be downloaded
            totalSize += thisSize
            print(os.path.join(savePath, fn.Name))
            fn.downloadFile(myAPI, tmpPath)
            shutil.move(os.path.join(tmpPath, remoteName),
                        os.path.join(savePath, fn.Name))

        # delete the temp directory if it is empty
        if os.path.exists(tmpPath) and not os.listdir(tmpPath):
            os.rmdir(tmpPath)

    if not dryRun:
        downloadProjectMetadata(project, myAPI, samples=samples)

    print( humanFormat(totalSize) + '\t' + str(project) )
    return totalSize
def main():
    """Command-line entry point: download the files of the named runs (all runs when none is named)."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-p', '--profile',
        default="DEFAULT",
        help="the .basespacepy.cfg profile to load")
    cli.add_argument(
        '-d', '--dry',
        action='store_true',
        default=False,
        help="dry run; return size of selected items")
    cli.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help="force overwrite; otherwise cat counters on new filenames")
    cli.add_argument(
        '-r', '--run',
        default=[],
        nargs="+",
        help="run name to download; can accept multiple values")
    cli.add_argument(
        '--file',
        default=[],
        nargs="+",
        help="specific file(s) to pull from each run; can accept multiple values")
    opts = cli.parse_args()

    myAPI = BaseSpaceAPI(profile=opts.profile, timeout=500)
    currentUser = myAPI.getUserById('current')
    pageQuery = QueryParameters.QueryParameters({'Limit': 1024})
    allRuns = currentUser.getRuns(myAPI, pageQuery)

    # with no --run given, every run of the user is selected
    matched = stringsToBSObj(allRuns, opts.run)
    selectedRuns = matched if opts.run else allRuns

    # report every requested name that did not resolve to a run
    selectedNames = set(str(item) for item in selectedRuns)
    for missing in set(opts.run) - selectedNames:
        warning("cannot find " + str(missing))

    grandTotal = 0
    for run in selectedRuns:
        # each call gets its own copy of the file selection, because
        # downloadRun may strip entries from the list it is given
        grandTotal = grandTotal + downloadRun(
            run, myAPI, opts.dry, files=list(opts.file), force=opts.force)
def downloadProjectMetadata(project, myAPI, samples=[], qp=QueryParameters.QueryParameters({'Limit': 1024}), outdir=None):
    """Collect sample and file metadata of a project and write both as TSV.

    One row is built per sample and one per file (through ``pullMetadata``);
    file rows additionally carry the owning sample in an ``SID`` column. Two
    tab-separated files, stamped with the current date and time, are written:
    ``<project>_SampleMetadata.<timestamp>.txt`` and
    ``<project>_FileMetadata.<timestamp>.txt``.

    Args:
        project: project object; must provide ``getSamples``.
        myAPI: API handle forwarded to every request.
        samples: optional selection, either sample objects or sample names
            (names are resolved through ``stringsToBSObj``). When empty, all
            samples of the project are used.
        qp: query parameters used for every listing request.
        outdir: directory to write the two files into. When omitted, the
            directory is derived from the project name and the first file of
            the last sample that has files; if no sample has any file, the
            project directory itself is used.

    Returns:
        Tuple ``(sampleMetadata, fileMetadata)`` of pandas DataFrames.
    """
    if not samples:
        samples = project.getSamples(myAPI, qp)
    elif isinstance(samples[0], str):
        # convert sample strings to sample objects
        samples = stringsToBSObj(project.getSamples(myAPI, qp), samples)

    # Rows are gathered in lists and concatenated once at the end;
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    sampleRows = []
    fileRows = []
    pathSource = None  # file whose location decides the default output directory
    for sample in samples:
        sampleRows.append(
            pd.DataFrame(pullMetadata(sample), index=[len(sampleRows)]))
        fns = sample.getFiles(myAPI, qp)
        if len(fns) > 0:
            pathSource = fns[0]
        for fn in fns:
            thisFileMeta = pd.DataFrame(pullMetadata(fn), index=[len(fileRows)])
            thisFileMeta['SID'] = str(sample)
            fileRows.append(thisFileMeta)

    # pd.concat refuses an empty list, so fall back to an empty frame
    sampleMetadata = pd.concat(sampleRows) if sampleRows else pd.DataFrame()
    fileMetadata = pd.concat(fileRows) if fileRows else pd.DataFrame()

    timestamp = str(datetime.datetime.today()).replace(' ', '_')
    if outdir is not None:
        savePath = outdir
    else:
        savePath = str(project).replace(" ", "_")
        if pathSource is not None:
            savePath = savePath + "/" + pathFromFile(pathSource, myAPI)
    if not os.path.exists(savePath):
        os.makedirs(savePath)

    sampleMetadata.to_csv(
        os.path.join(savePath,
                     str(project) + '_SampleMetadata.' + timestamp + '.txt'),
        sep='\t', header=True, index=False)
    fileMetadata.to_csv(
        os.path.join(savePath,
                     str(project) + '_FileMetadata.' + timestamp + '.txt'),
        sep='\t', header=True, index=False)
    return sampleMetadata, fileMetadata
def main():
    """Command-line entry point: write per-project and combined metadata tables.

    For every named project that can be resolved, ``downloadProjectMetadata``
    is called; the returned sample and file tables are then stacked across
    projects and written to ``fullSampleMetadata.<timestamp>.txt`` and
    ``fullFileMetadata.<timestamp>.txt`` in the current directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--profile', default="DEFAULT",
                        help="the .basespacepy.cfg profile to load")
    parser.add_argument('-j', '--project', required=True, nargs="+",
                        help="project to download; can accept multiple values")
    args = parser.parse_args()

    myAPI = BaseSpaceAPI(profile=args.profile, timeout=500)
    user = myAPI.getUserById('current')
    qp = QueryParameters.QueryParameters({'Limit': 1024})
    projects = user.getProjects(myAPI, qp)
    userProjs = stringsToBSObj(projects, args.project)

    # report every requested name that did not resolve to a project
    foundNames = {str(x) for x in userProjs}
    for lostProj in set(args.project) - foundNames:
        warning("cannot find " + str(lostProj))

    # Gather the per-project tables and concatenate once; DataFrame.append no
    # longer exists in pandas 2.x.
    sampleTables = []
    fileTables = []
    for project in userProjs:
        smout, fmout = downloadProjectMetadata(project, myAPI)
        sampleTables.append(smout)
        fileTables.append(fmout)

    # pd.concat refuses an empty list (no project resolved), so fall back to
    # an empty frame in that case
    fullSampleMetadata = pd.concat(sampleTables) if sampleTables else pd.DataFrame()
    fullFileMetadata = pd.concat(fileTables) if fileTables else pd.DataFrame()

    thisInstant = str(datetime.datetime.today()).replace(' ', ';')
    fullSampleMetadata.to_csv('fullSampleMetadata.' + thisInstant + '.txt',
                              sep='\t', header=True, index=False)
    fullFileMetadata.to_csv('fullFileMetadata.' + thisInstant + '.txt',
                            sep='\t', header=True, index=False)