def updateAllSampleSet(workspace, Allsample_setname='all'):
    """
    updates the given sample set so that it contains every sample currently in the workspace.

    It is especially useful for the aggregate task.

    Args:
    ----
        workspace: str namespace/workspace from url typically
        Allsample_setname: str name of the sample set that should hold all samples
    """
    dm.WorkspaceManager(workspace).update_sample_set(
        Allsample_setname,
        dm.WorkspaceManager(workspace).get_samples().index.tolist())
def updateAllSampleSet(workspace, newsample_setname, Allsample_setname='All_samples'):
    """
    updates the previous All_samples sample set with the new samples that have been added.

    It is especially useful for the aggregate task. Can more generally merge two sample sets together.

    Args:
    ----
        workspace: str namespace/workspace from url typically
        newsample_setname: str name of the sample set to add to All_samples
        Allsample_setname: str name of the aggregate sample set to update
    """
    prevsamples = list(dm.WorkspaceManager(workspace).get_sample_sets().loc[Allsample_setname]['samples'])
    newsamples = list(dm.WorkspaceManager(workspace).get_sample_sets().loc[newsample_setname]['samples'])
    prevsamples.extend(newsamples)
    dm.WorkspaceManager(workspace).update_sample_set(Allsample_setname, list(set(prevsamples)))
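# Example usage (sketch only; the workspace and sample set names below are hypothetical):
# updateAllSampleSet("my-namespace/my-workspace", "batch_2021_06",
#                    Allsample_setname="All_samples")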
def getQC(workspace, only=[], qcname=[], match=""):
    """
    gets the QC data for each sample in a workspace.

    Args:
    -----
        workspace: the workspace name
        only: do it only for this set of samples
        qcname: col name(s) where the QC is in the workspace samples
        match: for example '.Log.final.out', to get only that QC if you have a list of QCs in your qcname col

    Returns:
    --------
        a dict(sample_id: list[QC_filepaths])
    """
    if type(qcname) is str:
        qcname = [qcname]
    res = {}
    wm = dm.WorkspaceManager(workspace)
    sam = wm.get_samples()
    if len(only) > 0:
        sam = sam[sam.index.isin(only)]
    for k, val in sam.iterrows():
        res[k] = []
        for i in val[qcname]:
            if type(i) is list:
                if match:
                    res[k].extend([e for e in i if match in e])
                else:
                    res[k].extend(i)
            else:
                res[k].append(i)
    return res
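# Example usage (sketch only; workspace and column names are hypothetical):
# qcs = getQC("my-namespace/my-workspace", qcname=["star_logs", "fastqc_reports"],
#             match=".Log.final.out")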
def removeFromFailedWorkflows(workspaceid, maxtime='2020-06-10', everythingFor=[], dryrun=False):
    """
    Lists all files from all jobs that have failed and deletes them.

    Can take a long time to run.

    Args:
    -----
        workspaceid: str the workspace name
        maxtime: str date format (e.g. 2020-06-10); does not delete files generated past this date
        everythingFor: list[str] removes from these workflows even if not failed
        dryrun: bool whether to execute the commands or just print them
    """
    wm = dm.WorkspaceManager(workspaceid)
    for k, val in wm.get_submission_status(filter_active=False).iterrows():
        if (val.Failed > 0 or val.configuration in everythingFor
                ) and val.date.date() > pd.to_datetime(maxtime):
            for w in wm.get_submission(val.submission_id)['workflows']:
                if w['status'] == 'Failed' or val.configuration in everythingFor:
                    try:
                        a = w['workflowId']
                    except KeyError:
                        # the workflow was never even run
                        continue
                    deleteJob(workspaceid, val.submission_id, a, dryrun=dryrun)
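# Example usage (sketch only; workspace and workflow names are hypothetical):
# removeFromFailedWorkflows("my-namespace/my-workspace", maxtime="2020-06-10",
#                           everythingFor=["my_debug_workflow"], dryrun=True)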
def test_simple_interactions(self):
    ws = dalmatian.WorkspaceManager('broad-firecloud-gtex/unit_testing')
    ws._LegacyWorkspaceManager__hound = unittest.mock.Mock(
        dalmatian.base.HoundClient)
    ws.hound.configure_mock(**{
        'with_reason.side_effect': no_op_ctx,
        'batch.side_effect': no_op_ctx
    })
    self.assertTrue(ws.create_workspace())
    self.assertEqual(ws.bucket_id, 'fc-3ce4e797-82a8-46fe-8eee-66422dd92ed0')
    ws.upload_samples(
        pd.read_csv(os.path.join(relpath, 'test_samples.tsv'),
                    sep='\t',
                    index_col=0))
    ws.update_participant_samples()
    with open(os.path.join(relpath, 'rnaseqc_counts.config.json')) as r:
        ws.update_config(json.load(r))
    ws.update_sample_set('all_samples', ws.samples.index)
    self.assertEqual(
        ws.create_submission('rnaseqc_v2_cfg', 'all_samples', 'sample_set',
                             'this.samples'),
        '88c08eea-c10d-4f8c-b757-504ae59f86c5')
    ws.delete_workspace()
    # Check that the request buffer has been played out
    for buf in requests.values():
        self.assertEqual(len(buf), 0)
def createManySubmissions(workspace, workflow, references, entity=None, expression=None, use_callcache=True):
    """
    wrapper to create many submissions for a workflow

    Args:
    ----
        workspace: str namespace/workspace from url typically
        workflow: str name of the workflow configuration to run
        references: list(str) a list of names of the rows in this entity
        entity: str terra entity type (sample_id...)
        expression: str to use if you want to compute on the direct value of the entity
            or on values of values e.g. this.samples
        use_callcache: bool set to False if you want to recompute everything even with the same input

    Returns:
    ------
        submission_ids: list(str) the submission ids
    """
    wm = dm.WorkspaceManager(workspace)
    submission_ids = []
    for ref in references:
        submission_ids += [
            wm.create_submission(workflow, ref, entity, expression,
                                 use_callcache)
        ]
    return submission_ids
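# Example usage (sketch only; workspace, workflow and sample set names are hypothetical):
# subids = createManySubmissions("my-namespace/my-workspace", "star_workflow",
#                                ["batch1", "batch2"], entity="sample_set",
#                                expression="this.samples")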
def get_pairs_from_wsname(www):
    # builds a wide table of bam paths per participant (one column per sample type)
    # from a list of (namespace, workspace) tuples
    paired_path = pd.DataFrame()
    for nspair in www:
        print(paired_path.shape)
        S = dalmatian.WorkspaceManager(*nspair).get_samples()
        sample_df = S.reset_index()["sample_id"].str.extract(
            r"([A-Z]+)-(..-....)-(..)$")
        path_df = S.reset_index()[["sample_id", "WXS_bam_path"]]
        concat_df = pd.concat([sample_df, path_df], axis=1)
        concat_df.columns = ["cohort", "id", "type", "sample_id", "bam_path"]
        concat_df['pid'] = concat_df['cohort'].str.cat(concat_df['id'], sep="-")
        reset_df = pd.pivot_table(concat_df,
                                  index=["pid"],
                                  columns=["type"],
                                  values=["bam_path"],
                                  aggfunc=np.sum).reset_index()
        new_paired_path = reset_df.rename_axis(None)["bam_path"]
        new_paired_path["pid"] = reset_df["pid"]
        paired_path = paired_path.append(pd.DataFrame(new_paired_path),
                                         ignore_index=True,
                                         sort=True)
        # columns = ["pid","NB","NT","TR","TP"]
    return paired_path
async def waitForSubmission(workspace, submissions, raise_errors=True):
    """
    waits for a set of submissions to finish and reports the ones that failed

    Args:
    -----
        workspace: str namespace/workspace from url typically
        submissions: list[str] of submission ids
        raise_errors: bool set to True if errors should stop your code

    Returns:
    -------
        list of entity names whose workflows failed
    """
    failed_submission = []
    timing = 0
    wm = dm.WorkspaceManager(workspace).disable_hound()
    assert submissions is not None
    if type(submissions) is type(""):
        submissions = [submissions]
    for scount, submission_id in enumerate(submissions):
        finished = False
        while not finished:
            done = 0
            failed = 0
            finished = True
            submission = wm.get_submission(submission_id)["workflows"]
            for _, i in enumerate(submission):
                if i['status'] not in {'Done', 'Aborted', 'Failed', 'Succeeded'}:
                    finished = False
                elif i["status"] in {'Failed', 'Aborted'}:
                    failed += 1
                    if i["workflowEntity"]["entityName"] not in failed_submission:
                        print(i["workflowEntity"]["entityName"])
                        failed_submission.append(i["workflowEntity"]["entityName"])
                elif i["status"] in {'Done', 'Succeeded'}:
                    done += 1
            if not finished:
                time.sleep(40)
                print("status is: Done for " + str(done) + " jobs in submission " +
                      str(scount) + ". " + str(timing) + ",5 mn elapsed.",
                      end="\r")
                timing += 1
                time.sleep(20)
                print("status is: Failed for " + str(failed) + " jobs in submission " +
                      str(scount) + ". " + str(timing) + " mn elapsed.",
                      end="\r")
            else:
                print(str(done / (done + failed)) +
                      " of jobs Succeeded in submission " + str(scount) + ".")
    if len(failed_submission) > 0 and raise_errors:
        raise RuntimeError(str(len(failed_submission)) + " failed submission")
    return failed_submission
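# Example usage (sketch only; the workspace and submission ids are hypothetical):
# failed = await waitForSubmission("my-namespace/my-workspace",
#                                  ["0b2b7c2a-...", "5f1d3e9c-..."],
#                                  raise_errors=False)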
async def cleanWorkspace(workspaceid, only=[], toleave=[], defaulttoleave=[
    'workspace', 'scripts', 'notebooks', 'files', 'data', 'hound',
    'references', 'name', 'folder'
]):
    """
    removes all processing folders in a terra workspace's bucket

    Args:
    -----
        workspaceid: str, the workspace
        only: list of strings; if provided, only folders matching these names get removed
        toleave: a list of first order folders in the bucket that you don't want to be deleted
        defaulttoleave: it should contain the non-processing folders that contain metadata
            and files for the workspace
    """
    toleave.extend(defaulttoleave)
    bucket = dm.WorkspaceManager(workspaceid).get_bucket_id()
    res = subprocess.run('gsutil -m ls gs://' + bucket,
                         shell=True,
                         capture_output=True)
    if res.returncode != 0:
        raise ValueError(str(res.stderr))
    res = str(res.stdout)[2:-1].split('\\n')[:-1]
    toremove = [val for val in res if val.split('/')[-2] not in toleave]
    if only:
        toremove = [val for val in res if val.split('/')[-2] in only]
    if h.askif('we are going to remove ' + str(len(toremove)) +
               " files/folders:\n" + str(toremove) + "\nare you sure?"):
        gcp.rmFiles(toremove, add='-r')
    else:
        print("aborting")
def deleteHeavyFiles(workspaceid, unusedOnly=True):
    """
    lists all files above a certain size in a workspace so they can be deleted

    Args:
    ----
        workspaceid: str the name of the workspace
        unusedOnly: bool if True, only return files that do not appear in one of the
            sample/sample_set/pair data tables

    Returns:
    -------
        the set of heavy files that can be removed
    """
    wm = dm.WorkspaceManager(workspaceid)
    bucket = wm.get_bucket_id()
    sizes = gcp.get_all_sizes('gs://' + bucket + '/')
    print('we got ' + str(len(sizes)) + ' files')
    a = list(sizes.keys())
    a.sort()
    ma = 100
    torm = []
    tot = 0
    for i in a[::-1]:
        if i > 1000000 * ma:
            tot += i
            for val in sizes[i]:
                torm.append(val)
    print('we might remove more than ' + str(tot / 1000000000) + 'GB')
    if unusedOnly:
        sam = pd.concat([wm.get_samples(), wm.get_pairs(), wm.get_sample_sets()])
        tokeep = set([val for val in sam.values.ravel()
                      if type(val) is str and val[:5] == 'gs://'])
        torm = set(torm) - tokeep
    return torm
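# Example usage (sketch only; the workspace name is hypothetical):
# torm = deleteHeavyFiles("my-namespace/my-workspace", unusedOnly=True)
# the returned candidates can then be removed, e.g. with the gcp helper:
# gcp.rmFiles(torm)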
def df_from_workspace_set(workspace, set_type, set_name, column_name):
    """Get the dataframe specified by set_name (of set_type) in your Terra workspace.

    The dataframe is filtered by indices that have non-null values in the column
    specified by column_name. The whole dataframe is returned so that further
    filtering can be performed beyond what is in the given set_name.

    :param workspace: full name of Terra workspace
    :param set_type: one of [pair, sample, participant]
    :param set_name: name of desired set
    :param column_name: column name holding desired file/data
    :return: pandas.DataFrame of the desired set
    """
    if not set_type or not set_name or not column_name:
        raise ValueError("If calling from Terra workspace, the set_type, "
                         "set_name, and column_name must be specified.")
    # Import Workspace from Terra/Firecloud
    wm = dalmatian.WorkspaceManager(workspace)
    if set_type == 'pair':
        set_df = wm.get_pairs_in_pair_set(set_name)
    elif set_type == 'sample':
        set_df = wm.get_samples()
        set_df = set_df[np.in1d(set_df.index.values,
                                wm.get_sample_sets().loc[set_name]['samples'])]
    elif set_type == 'participant':
        set_df = wm.get_participants()
        set_df = set_df[np.in1d(
            set_df.index.values,
            wm.get_participant_sets().loc[set_name]['participants'])]
    else:
        raise ValueError(
            f"set_type must be one of pair, sample, participant, not {set_type}."
        )
    return set_df[set_df[column_name].notnull()]
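# Example usage (sketch only; workspace, set and column names are hypothetical):
# pairs_df = df_from_workspace_set("my-namespace/my-workspace", "pair",
#                                  "all_pairs", "mutation_validator_validated_maf")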
def extract_config_summary(workspace_name, workflows=None):
    # summarizes, for each workflow configuration in the workspace, which entity
    # attributes it reads (inputs) and writes (outputs)
    wm = dalmatian.WorkspaceManager(workspace_name)
    configs = wm.get_configs()
    config_summaries = []
    for rec in configs.to_records():
        cfgname = rec['namespace'] + "/" + rec['name']
        if workflows is not None:
            if cfgname not in workflows:
                continue
        config = wm.get_config(cfgname)
        config['inputs'] = {k: v.strip() for k, v in config['inputs'].items()}
        config['outputs'] = {
            k: v.strip()
            for k, v in config['outputs'].items()
        }
        inputs = [
            resolve_dot_path(config['rootEntityType'], x)
            for x in config['inputs'].values() if x.startswith("this.")
        ]
        outputs = [
            resolve_dot_path(config['rootEntityType'], x)
            for x in config['outputs'].values() if x.startswith("this.")
        ]
        config_summaries.append(
            dict(inputs=inputs,
                 outputs=outputs,
                 entity_type=rec['rootEntityType'],
                 name=cfgname))
    return config_summaries
def saveOmicsOutput(workspace,
                    pathto_cnvpng='segmented_copy_ratio_img',
                    pathto_stats='sample_statistics',
                    specific_cohorts=[],
                    specific_celllines=[],
                    is_from_pairs=True,
                    pathto_snv='filtered_variants',
                    pathto_seg='cnv_calls',
                    datadir='gs://cclf_results/targeted/kim_sept/',
                    specific_samples=[]):
    """
    *WIP* For a workspace containing all omics workflows (CNV/SNV) (like CCLF's),
    copies all interesting output to a data bucket

    Args:
    -----
        workspace: the workspace name
        pathto_cnvpng: sample col of the CNV plot results
        pathto_stats: sample col of the bam QC results
        specific_cohorts: if provided, will only look for these specific cohorts
        specific_celllines: if you need to rerun on specific cell lines
        is_from_pairs: if we process on pairs or samples data
        pathto_snv: sample col of the snv files
        pathto_seg: sample col of the segment files
        datadir: gs bucket path where to copy the resulting files
        specific_samples: if provided will only look for these samples
    """
    if specific_cohorts:
        samples = dm.WorkspaceManager(workspace).get_samples()
        samples = samples[samples.index.isin(specific_cohorts)]
    if is_from_pairs:
        pairs = dm.WorkspaceManager(workspace).get_pairs()
        pairs = pairs[pairs['case_sample'].isin(specific_cohorts)]
    for i, val in samples.iterrows():
        os.system('gsutil cp ' + val[pathto_seg] + ' ' + datadir + i + '/')
        os.system('gsutil cp ' + val[pathto_cnvpng] + ' ' + datadir + i + '/')
        os.system('gsutil cp ' + val[pathto_stats] + ' ' + datadir + i + '/')
        if is_from_pairs:
            snvs = pairs[pairs["case_sample"] == i][pathto_snv]
            for snv in snvs:
                if snv is not np.nan:
                    os.system('gsutil cp ' + snv + ' ' + datadir + i + '/')
                    break
        else:
            os.system('gsutil cp ' + val[pathto_snv] + ' ' + datadir + i + '/')
def addToSampleSet(workspace, samplesetid, samples):
    """
    add samples to a sample set

    will create a new set if it doesn't already exist, else adds to the existing one

    Args:
    ----
        workspace: the workspace name
        samplesetid: the sample set name
        samples: a list of samples
    """
    try:
        prevsamples = dm.WorkspaceManager(workspace).get_sample_sets()['samples'][samplesetid]
        samples.extend(prevsamples)
    except KeyError:
        print('The sample set ' + str(samplesetid) +
              ' did not exist in the workspace. Will be created now...')
    dm.WorkspaceManager(workspace).update_sample_set(samplesetid, list(set(samples)))
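# Example usage (sketch only; workspace, set and sample ids are hypothetical):
# addToSampleSet("my-namespace/my-workspace", "batch_2021_06",
#                ["SAMPLE-0001", "SAMPLE-0002"])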
def get_current_samples():
    """Get current samples from FC
    """
    namespace = "nci-mimoun-bi-org"
    workspace = "CCLF_TSCA_2_0"
    wm = dalmatian.WorkspaceManager(namespace, workspace)
    # Current samples
    curr_samples = wm.get_samples()
    return curr_samples
def findBackErasedDuplicaBamteFromTerraBucket(workspace, gsfolder, bamcol="WES_bam", baicol="WES_bai"):
    """
    If you have erased bam files in gcp, with bai files still present, and the bam files
    are stored elsewhere with their location recorded in a terra workspace, this will
    find them back by matching bai sizes and copy them back to their original locations.

    Args:
    ----
        workspace: str namespace/workspace from url typically
        gsfolder: str the gsfolder where the bam files are
        bamcol: str colname of the bam
        baicol: str colname of the bai
    """
    # get ls of all files folder
    samples = os.popen('gsutil -m ls -al ' + gsfolder + '**.bai').read().split('\n')
    # compute size filepath
    sizes = {'gs://' + val.split('gs://')[1].split('#')[0]: int(val.split("2019-")[0])
             for val in samples[:-2]}
    names = {}
    for k, val in sizes.items():
        if val in names:
            names[val].append(k)
        else:
            names[val] = [k]
    # get all bai in tsv
    samp = dm.WorkspaceManager(workspace).get_samples()
    for k, val in samp.iterrows():
        if val[bamcol] != 'NA' and val[baicol] != 'NA':
            # if bai has duplicate size
            code = os.system('gsutil ls ' + val[bamcol])
            if code == 256:
                if val[bamcol] is None:
                    print('we dont have bam value for ' + str(k))
                    continue
                else:
                    print('no match values for ' + str(val[bamcol]))
                for va in names[sizes[val[baicol]]]:
                    # for all duplicate size
                    # if ls bam of bai duplicate size works
                    # mv bam to bampath in gsfolder
                    if '.bam' in va:
                        if os.system('gsutil ls ' + va.split('.bam.bai')[0] + ".bam") == 0:
                            print('gsutil mv ' + va.split('.bam.bai')[0] + ".bam " + val[bamcol])
                            os.system('gsutil mv ' + va.split('.bam.bai')[0] + ".bam " + val[bamcol])
                            break
                    elif os.system('gsutil ls ' + va.split('.bai')[0] + ".bam") == 0:
                        print('gsutil mv ' + va.split('.bai')[0] + ".bam " + val[bamcol])
                        os.system('gsutil mv ' + va.split('.bai')[0] + ".bam " + val[bamcol])
                        break
            elif code == signal.SIGINT:
                print('Awakened')
                break
        else:
            print("no data for " + str(k))
def addToPairSet(workspace, pairsetid, pairs):
    """
    add pairs to a pair set

    will create a new set if it doesn't already exist, else adds to the existing one

    Args:
    ----
        workspace: the workspace name
        pairsetid: the pair set name
        pairs: a list of pairs
    """
    try:
        prevpairs = dm.WorkspaceManager(workspace).get_pair_sets().loc[[pairsetid]].pairs[0]
        pairs.extend(prevpairs)
    except KeyError:
        print('The pair set ' + str(pairsetid) +
              ' did not exist in the workspace. Will be created now...')
    dm.WorkspaceManager(workspace).update_pair_set(pairsetid, list(set(pairs)))
def getRNAQC(workspace, only=[], qcname="star_logs"):
    """gets, for each sample in a workspace, the path to its STAR 'Log.final.out' QC file"""
    res = {}
    wm = dm.WorkspaceManager(workspace)
    sam = wm.get_samples()
    if len(only) > 0:
        sam = sam[sam.index.isin(only)]
    for k, val in sam[qcname].iteritems():
        for i in val:
            if '.Log.final.out' in i:
                res[k] = i
    return res
def copyToWorkspace(workspaceID, tracker, columns=[
    "arxspan_id", "version", "sm_id", "datatype", "size", "ccle_name",
    "stripped_cell_line_name", "participant_id", "cellosaurus_id",
    "bam_public_sra_path", "internal_bam_filepath", "internal_bai_filepath",
    "parent_cell_line", "sex", "matched_normal", "age", "primary_site",
    "primary_disease", "subtype", "subsubtype", "origin", "mediatype",
    "condition", "sequencing_type", "baits", "source", "legacy_bam_filepath",
    "legacy_bai_filepath"
], rename={}, deleteUnmatched=False, addMissing=False):
    """
    will use the current sample tracker to update samples in the workspace

    it can remove samples that are not in the tracker.

    Args:
    ----
        workspaceID: str the workspace id
        tracker: dataframe the sample tracker
        columns: list[str] the columns to sync
        rename: dict(str:str) columns to rename from sample tracker to workspace
        deleteUnmatched: bool whether or not to delete samples that are in the workspace but not in the sample tracker
        addMissing: bool whether or not to upload samples that are in the tracker but not in the workspace
    """
    wm = dm.WorkspaceManager(workspaceID).disable_hound()
    sam = wm.get_samples()
    track = tracker[tracker.index.isin(sam.index)][columns].rename(columns=rename)
    track.index.name = "sample_id"
    miss = set(columns) - set(sam.columns)
    if len(track) == 0 and not addMissing:
        raise ValueError('wrong tracker or index non matching')
    unmatched = set(sam.index) - (set(tracker.index) | set(['nan']))
    if not addMissing:
        print("found these to be unmatched in the tracker: " + str(unmatched))
        if deleteUnmatched and len(unmatched) > 0:
            terra.removeSamples(workspaceID, unmatched)
    unmatched = (set(tracker.index) - set(sam.index))
    if len(track) != 0:
        wm.update_sample_attributes(track)
    if addMissing and len(unmatched) > 0:
        print('found these samples to be missing in the workspace: ' + str(unmatched))
        track = tracker[tracker.index.isin(unmatched)][columns].rename(columns=rename)
        track.index.name = "sample_id"
        wm.upload_samples(track)
def update(
        samples,
        samplesetname,
        stype,
        bucket,
        refworkspace,
        name_col="index",
        values=['legacy_bam_filepath', 'legacy_bai_filepath'],
        filetypes=['bam', 'bai'],
        my_id='~/.client_secret.json',
        mystorage_id="~/.storage.json",
        creds='../.credentials.json',
        sampletrackername='ccle sample tracker',
        refsheet_url="https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY",
):
    # uploading to our bucket (now a new function)
    terra.changeToBucket(samples,
                         bucket,
                         name_col=name_col,
                         values=values,
                         filetypes=filetypes,
                         catchdup=True,
                         test=False)
    sheets = Sheets.from_files(my_id, mystorage_id)
    ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)
    names = []
    subccle_refsamples = ccle_refsamples[ccle_refsamples['datatype'] == stype]
    for k, val in samples.iterrows():
        val = val["arxspan_id"]
        names.append(val)
        samples.loc[k, 'version'] = len(
            subccle_refsamples[subccle_refsamples['arxspan_id'] == val]) + names.count(val)
    samples['version'] = samples['version'].astype(int)
    ccle_refsamples = ccle_refsamples.append(samples, sort=False)
    dfToSheet(ccle_refsamples, sampletrackername, secret=creds)
    # uploading the new samples to the reference workspace
    refwm = dm.WorkspaceManager(refworkspace).disable_hound()
    refwm.upload_samples(samples)
    sam = refwm.get_samples()
    # creating a sample set
    refwm.update_sample_set(sample_set_id=samplesetname, sample_ids=samples.index)
    refwm.update_sample_set(
        sample_set_id='all',
        sample_ids=[i for i in sam.index.tolist() if i != 'nan'])
def ExtractStarQualityInfo(samplesetname, workspace, release='temp'):
    """
    puts all of the Star Quality results from the Terra Star Workflow into one txt file

    Args:
    -----
        samplesetname: the sample set name for which to grab the samples processed by star
        workspace: the terra workspace
        release: the name of the folder where it will be stored
    """
    a = dm.WorkspaceManager(workspace).get_samples().loc[dm.WorkspaceManager(
        workspace).get_sample_sets().loc[samplesetname].samples].star_logs
    for i, sample in enumerate(a):
        if sample is None:
            print("no log file found for: " + a.index[i])
            continue
        for log in sample:
            if 'final.out' in log:
                print("copying " + a.index[i])
                os.system('gsutil cp ' + log + ' temp/')
    os.system("cat data/" + release + "/*.Log.final.out > temp/" +
              samplesetname + ".txt")
    os.system("rm data/" + release + "/*.Log.final.out")
def getWESQC(workspace, only=[], qcname=[]):
    """gets, for each sample in a workspace, the list of QC file paths stored in the qcname columns"""
    res = {}
    wm = dm.WorkspaceManager(workspace)
    sam = wm.get_samples()
    if len(only) > 0:
        sam = sam[sam.index.isin(only)]
    for k, val in sam.iterrows():
        res[k] = []
        for i in val[qcname]:
            if type(i) is list:
                res[k].extend(i)
            else:
                res[k].append(i)
    return res
def deleteJob(workspaceid, subid, taskid, deleteCurrent=False, dryrun=True):
    """
    removes files generated by a job on Terra

    Args:
    -----
        workspaceid: str workspace name
        subid: str the id of the submission
        taskid: str the name of the task in this job
        deleteCurrent: bool whether or not to delete files even if they appear in one of the
            sample/sample_set/pair data tables
        dryrun: bool just print the commands but don't execute them
    """
    wm = dm.WorkspaceManager(workspaceid)
    bucket = wm.get_bucket_id()
    data = []
    if deleteCurrent:
        if dryrun:
            print('gsutil -m rm gs://' + bucket + '/' + subid + '/*/' + taskid + '/**')
        else:
            res = subprocess.run('gsutil -m rm gs://' + bucket + '/' + subid +
                                 '/*/' + taskid + '/**',
                                 shell=True,
                                 capture_output=True)
            if res.returncode != 0:
                raise ValueError(str(res.stderr))
    else:
        res = subprocess.run('gsutil -m ls gs://' + bucket + '/' + subid +
                             '/*/' + taskid + '/**',
                             shell=True,
                             capture_output=True)
        if res.returncode != 0 or len(str(res.stdout)) < 4:
            raise ValueError(str(res.stderr))
        data += str(res.stdout)[2:-1].split('\\n')[:-1]
        if "TOTAL:" in data[-1]:
            data = data[:-1]
        sam = pd.concat(
            [wm.get_samples(), wm.get_pairs(), wm.get_sample_sets()])
        tokeep = set([
            val for val in sam.values.ravel()
            if type(val) is str and val[:5] == 'gs://'
        ])
        torm = set(data) - tokeep
        if dryrun:
            print(torm)
        else:
            h.parrun(['gsutil rm ' + i for i in torm], cores=12)
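# Example usage (sketch only; workspace, submission id and task name are hypothetical):
# deleteJob("my-namespace/my-workspace", "0b2b7c2a-...", "star_workflow",
#           deleteCurrent=False, dryrun=True)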
def get_filenames(workspace, output_file='maf_filenames.txt'):
    # Import Workspace from Firecloud
    wm = dalmatian.WorkspaceManager(workspace)
    pairs = wm.get_pairs()
    # pairs = wm.get_pairs_in_pair_set('DESIRED_PAIR_SET')
    pairs = pairs[pairs['mutation_validator_validated_maf'].notnull()]
    ###################
    # Perform Filtering
    ###################
    desired_files = pairs['mutation_validator_validated_maf'].tolist()
    with open(output_file, 'w') as o_file:
        o_file.writelines(f'{f_name}\n' for f_name in desired_files)
async def shareTerraBams(
        samples, users, workspace,
        bamcols=["internal_bam_filepath", "internal_bai_filepath"],
        unshare=False):
    """
    will share some files from gcp with a set of users using terra as the metadata repo.

    only works with files that are listed on a terra workspace tsv but that actually
    point to a regular google bucket and not a terra bucket.

    Args:
    ----
        users: list[str] of users' google accounts
        workspace: str namespace/workspace from url typically
        samples: list[str] of sample ids for which you want to share data
        bamcols: list[str] list of column names of gs files to share
        unshare: bool set to True to revoke access instead of granting it

    Returns:
    --------
        a list of the gs paths we have been giving access to
    """
    if type(users) is str:
        users = [users]
    wm = dm.WorkspaceManager(workspace)
    togiveaccess = np.ravel(wm.get_samples()[bamcols].loc[samples].values)
    key = "-rd " if unshare else "-ru "
    for user in users:
        files = ''
        for i in togiveaccess:
            files += ' ' + i
        code = os.system("gsutil -m acl ch " + key + user +
                         (" " if unshare else ":R ") + files)
        if code == signal.SIGINT:
            print('Awakened')
            break
    print('the files are stored here:\n\n')
    print(togiveaccess)
    print('\n\njust install and use gsutil to copy them')
    print('https://cloud.google.com/storage/docs/gsutil_install')
    print('https://cloud.google.com/storage/docs/gsutil/commands/cp')
    return togiveaccess
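# Example usage (sketch only; the user, workspace and sample ids are hypothetical):
# shared = await shareTerraBams(["SAMPLE-0001"], "collaborator@example.com",
#                               "my-namespace/my-workspace")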
def removeSamples(workspace, samples):
    """
    removes a set of samples from a workspace

    (very useful when we have linked pairs and pair sets)

    Args:
    -----
        workspace: str workspace name
        samples: list of samples
    """
    wm = dm.WorkspaceManager(workspace).disable_hound()
    try:
        wm.delete_sample(samples)
    except:
        # the samples are part of pairs: remove the pairs first, then the samples
        print('we had pairs.')
        pairs = wm.get_pairs()
        pairid = pairs[pairs.case_sample.isin(samples)].index.tolist()
        for k, val in wm.get_pair_sets().iterrows():
            wm.update_pair_set(k, set(val.tolist()[0]) - set(pairid))
        wm.delete_pair(pairid)
        wm.delete_sample(samples)
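# Example usage (sketch only; workspace and sample ids are hypothetical):
# removeSamples("my-namespace/my-workspace", ["SAMPLE-0001", "SAMPLE-0002"])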
def saveConfigs(workspace, filepath):
    """
    will save every workflow configuration of a workspace into a csv and a json file

    Args:
    -----
        workspace: str namespace/workspace from url typically
        filepath: str path (without extension) where to save the files
    """
    wm = dm.WorkspaceManager(workspace)
    h.createFoldersFor(filepath)
    conf = wm.get_configs()
    conf.to_csv(filepath + '.csv')
    params = {}
    params['GENERAL'] = wm.get_workspace_metadata()
    for k, val in conf.iterrows():
        params[k] = wm.get_config(val['name'])
    h.dictToFile(params, filepath + '.json')
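# Example usage (sketch only; workspace and output path are hypothetical):
# saveConfigs("my-namespace/my-workspace", "backups/my_workspace_configs")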
def saveWorkspace(workspace, folderpath):
    """
    will save everything about a workspace (configs, WDLs, metadata) into csv and json files

    Args:
    -----
        workspace: str namespace/workspace from url typically
        folderpath: str path to the folder where to save the files
    """
    wm = dm.WorkspaceManager(workspace)
    h.createFoldersFor(folderpath)
    conf = wm.get_configs()
    for k, val in conf.iterrows():
        with open(folderpath + val['name'] + ".wdl", "w") as f:
            if val.sourceRepo == 'dockstore':
                name = "dockstore.org/" + '/'.join(val['methodPath'].split(
                    '/')[2:4]) + '/' + val['methodVersion']
            else:
                name = '/'.join(
                    val[['methodNamespace', 'methodName', 'methodVersion']].astype(str).tolist())
            try:
                f.write(dm.get_wdl(name))
            except MethodNotFound:
                print(name + " could not be found")
    conf.to_csv(folderpath + 'workflow_list.csv')
    params = {}
    params['GENERAL'] = wm.get_workspace_metadata()
    for k, val in conf.iterrows():
        params[k] = wm.get_config(val['name'])
        h.dictToFile(params[k]['inputs'], folderpath + "inputs_" + val['name'] + '.json')
        h.dictToFile(params[k], folderpath + "conf_" + val['name'] + '.json')
        h.dictToFile(params[k]['outputs'], folderpath + "outputs_" + val['name'] + '.json')
    h.dictToFile(params, folderpath + 'all_configs.json')
def __init__(
        self,
        workspace: str,
        entityType: str,
        entityName: str,
        entityExpression: typing.Optional[str] = None,
        write_to_workspace: bool = True,
        alias: typing.Union[None, str, typing.List[str]] = None,
):
    """
    Initializes the adapter.
    Must provide workspace and entity information.
    If no expression is provided:
    * Assume a single job, and resolve all input expressions in the context of the one entity
    If an expression is provided:
    * Assume multiple entities (entity type will be auto-detected)
    * Launch one job per entity, resolving input expressions for each one
    If alias is provided, it is used to specify custom job aliases.
    alias may be a list of strings (an alias for each job)
    or a single string (the input variable to use as the alias)
    """
    super().__init__(alias=alias)
    self.workspace = dalmatian.WorkspaceManager(workspace)
    if entityName not in self.workspace._get_entities_internal(entityType).index:
        raise NameError('No such {} "{}" in workspace {}'.format(
            entityType,
            entityName,
            workspace
        ))
    self._entityType = entityType
    self._entityName = entityName
    self._entityExpression = entityExpression
    self.evaluator = self.workspace.get_evaluator(False)
    if entityExpression is not None:
        self.entities = self.evaluator(entityType, entityName, entityExpression)
        self.etype = self.evaluator.determine_reference_type(entityType, self.entities, '')
    else:
        self.entities = [entityName]
        self.etype = entityType
    self.write_to_workspace = write_to_workspace
    self.__spec = None
def renametsvs(workspace, wmto=None, index_func=None):
    """
    ################## WIP ############
    only works for one use case
    """
    data = {}
    wmfrom = dm.WorkspaceManager(workspace)
    try:
        a = wmfrom.get_participants()
        data.update({'participants': a})
    except:
        print('no participants')
    try:
        a = wmfrom.get_samples()
        data.update({'samples': a})
    except:
        print('no samples')
    try:
        a = wmfrom.get_pair_sets()
        data.update({'pair_sets': a})
    except:
        print('no pair_sets')
    try:
        a = wmfrom.get_pairs()
        data.update({'pairs': a})
    except:
        print('no pairs')
    try:
        a = wmfrom.get_sample_sets()
        data.update({'sample_sets': a})
    except:
        print('no sample_sets')
    # currently works only for sample, sample
    for k, entity in data.items():
        ind = []
        for val in entity.index:
            pos = val.find('-SM')
            if pos != -1:
                val = val[pos + 1:]
                pos = val.find('-SM')
                if pos != -1:
                    val = val[:9] + val[pos + 1:]
            ind.append(val)
        entity.index = ind
        # for all columns of the tsv
        for k, val in entity.iterrows():
            for i, v in enumerate(val):
                if type(v) is list or type(v) is str:
                    ind = []
                    for j in v:
                        pos = j.find('-SM')
                        if pos != -1:
                            j = j[pos + 1:]
                            pos = j.find('-SM')
                            if pos != -1:
                                j = j[:9] + j[pos + 1:]
                        ind.append(j)
                    val[i] = ind
            entity.loc[k] = val
    if wmto is None:
        wmto = wmfrom
    if "participants" in data:
        wmto.upload_participants(data['participants'].index.tolist())
    if "samples" in data:
        wmto.upload_samples(data['samples'])
    if "pairs" in data:
        wmto.upload_entities('pair', data['pairs'])
    if "pair_sets" in data:
        pairset = data['pair_sets'].drop('pairs', 1)
        wmto.upload_entities('pair_set', pairset)
        for i, val in data['pair_sets'].iterrows():
            wmto.update_pair_set(i, val.pairs)
    if "sample_sets" in data:
        sampleset = data['sample_sets'].drop('samples', 1)
        wmto.upload_entities('sample_set', sampleset)
        for i, val in data['sample_sets'].iterrows():
            wmto.update_sample_set(i, val.samples)