Example #1
def updateAllSampleSet(workspace, Allsample_setname='all'):
    """
    Update the previous "all samples" sample_set so that it contains every sample currently in the workspace.

    It is especially useful for the aggregate task.

    Args:
    ----
      workspace: str namespace/workspace from url typically
      Allsample_setname: str name of the sample set to update (default 'all')
    """
    dm.WorkspaceManager(workspace).update_sample_set(
        Allsample_setname,
        dm.WorkspaceManager(workspace).get_samples().index.tolist())
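A minimal usage sketch for this variant, assuming `dm` is the dalmatian module imported elsewhere and using a hypothetical workspace name:

# refresh the 'all' sample set so it lists every sample currently in the workspace
updateAllSampleSet('my-namespace/my-workspace', Allsample_setname='all')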
Example #2
File: __init__.py  Project: jkobject/JKBio
def updateAllSampleSet(workspace, newsample_setname, Allsample_setname='All_samples'):
  """
  Update the previous "All_samples" sample_set with the new samples that have been added.

  It is especially useful for the aggregate task. More generally, it can merge two sample sets together.

  Args:
  ----
    workspace: str namespace/workspace from url typically
    newsample_setname: str name of the sample set to add to All_samples
    Allsample_setname: str name of the sample set to update (default 'All_samples')
  """
  prevsamples = list(dm.WorkspaceManager(workspace).get_sample_sets().loc[Allsample_setname]['samples'])
  newsamples = list(dm.WorkspaceManager(workspace).get_sample_sets().loc[newsample_setname]['samples'])
  prevsamples.extend(newsamples)
  dm.WorkspaceManager(workspace).update_sample_set(Allsample_setname, list(set(prevsamples)))
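A usage sketch, with a hypothetical workspace name and a hypothetical newly-uploaded sample set:

updateAllSampleSet('my-namespace/my-workspace', 'batch_2020_06', Allsample_setname='All_samples')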
Example #3
def getQC(workspace, only=[], qcname=[], match=""):
    """
    Get the QC data for each sample in a workspace.

    Args:
    -----
      workspace: the workspace name
      only: do it only for this set of samples
      qcname: column name(s) where the QC is stored in the workspace samples
      match: e.g. '.Log.final.out'; keeps only the QC files matching this string if the qcname column holds a list of QCs

    Returns:
    --------
      a dict(sample_id: list[QC_filepaths])
    """
    if type(qcname) is str:
        qcname = [qcname]
    res = {}
    wm = dm.WorkspaceManager(workspace)
    sam = wm.get_samples()
    if len(only) > 0:
        sam = sam[sam.index.isin(only)]
    for k, val in sam.iterrows():
        res[k] = []
        for i in val[qcname]:
            if type(i) is list:
                if match:
                    res[k].extend([e for e in i if match in e])
                else:
                    res[k].extend(i)
            else:
                res[k].append(i)
    return res
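A usage sketch, assuming a hypothetical RNA-seq workspace whose samples table has a 'star_logs' column:

qcs = getQC('my-namespace/my-rnaseq-workspace', qcname='star_logs', match='.Log.final.out')
for sample_id, files in qcs.items():
    print(sample_id, files)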
Example #4
def removeFromFailedWorkflows(workspaceid,
                              maxtime='2020-06-10',
                              everythingFor=[],
                              dryrun=False):
    """
    Lists all files from all failed jobs and deletes them.

    This can take a long time.

    Args:
    -----
      workspaceid: str the workspace name
      maxtime: str date (e.g. 2020-06-10); files generated before this date are not deleted
      everythingFor: list[str] also removes files from these workflows even if they did not fail
      dryrun: bool whether to only print the commands instead of executing them
    """
    wm = dm.WorkspaceManager(workspaceid)
    for k, val in wm.get_submission_status(filter_active=False).iterrows():
        if (val.Failed > 0 or val.configuration in everythingFor
            ) and val.date.date() > pd.to_datetime(maxtime):
            for w in wm.get_submission(val.submission_id)['workflows']:
                if w['status'] == 'Failed' or val.configuration in everythingFor:
                    try:
                        a = w['workflowId']
                    except KeyError:
                        # no workflowId means it was not even run
                        continue
                    deleteJob(workspaceid, val.submission_id, a, dryrun=dryrun)
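A usage sketch; the workspace name is hypothetical and dryrun=True only prints what would be removed:

removeFromFailedWorkflows('my-namespace/my-workspace', maxtime='2020-06-10', dryrun=True)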
Example #5
def test_simple_interactions(self):
    ws = dalmatian.WorkspaceManager('broad-firecloud-gtex/unit_testing')
    ws._LegacyWorkspaceManager__hound = unittest.mock.Mock(
        dalmatian.base.HoundClient)
    ws.hound.configure_mock(**{
        'with_reason.side_effect': no_op_ctx,
        'batch.side_effect': no_op_ctx
    })
    self.assertTrue(ws.create_workspace())
    self.assertEqual(ws.bucket_id,
                     'fc-3ce4e797-82a8-46fe-8eee-66422dd92ed0')
    ws.upload_samples(
        pd.read_csv(os.path.join(relpath, 'test_samples.tsv'),
                    sep='\t',
                    index_col=0))
    ws.update_participant_samples()
    with open(os.path.join(relpath, 'rnaseqc_counts.config.json')) as r:
        ws.update_config(json.load(r))
    ws.update_sample_set('all_samples', ws.samples.index)
    self.assertEqual(
        ws.create_submission('rnaseqc_v2_cfg', 'all_samples', 'sample_set',
                             'this.samples'),
        '88c08eea-c10d-4f8c-b757-504ae59f86c5')
    ws.delete_workspace()
    # Check that the request buffer has been played out
    for buf in requests.values():
        self.assertEqual(len(buf), 0)
Example #6
def createManySubmissions(workspace,
                          workflow,
                          references,
                          entity=None,
                          expression=None,
                          use_callcache=True):
    """
    Wrapper to create many submissions for a workflow.

    Args:
    ----
      workspace: str namespace/workspace from url typically
      workflow: str name of the workflow (method configuration) to run
      references: list(str) names of the rows of this entity to submit
      entity: str terra entity type (sample_id...)
      expression: str to use if you want to compute on the direct value of the entity or on values of values
                  e.g. this.samples
      use_callcache: bool set to False to recompute everything even with the same inputs

    Returns:
    ------
      submission_ids: list(str) the submission ids
    """
    wm = dm.WorkspaceManager(workspace)
    submission_ids = []
    for ref in references:
        submission_ids += [
            wm.create_submission(workflow, ref, entity, expression,
                                 use_callcache)
        ]
    return submission_ids
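A usage sketch with hypothetical workflow configuration and sample set names:

ids = createManySubmissions('my-namespace/my-workspace', 'rnaseq_pipeline_cfg',
                            references=['batch1', 'batch2'],
                            entity='sample_set', expression='this.samples')
print(ids)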
Example #7
def get_pairs_from_wsname(www):
    """Builds, from a list of (namespace, workspace) pairs, a per-participant table of bam paths with one column per sample type."""
    paired_path = pd.DataFrame()
    for nspair in www:
        print(paired_path.shape)
        S = dalmatian.WorkspaceManager(*nspair).get_samples()

        sample_df = S.reset_index()["sample_id"].str.extract(
            r"([A-Z]+)-(..-....)-(..)$")
        path_df = S.reset_index()[["sample_id", "WXS_bam_path"]]
        concat_df = pd.concat([sample_df, path_df], axis=1)
        concat_df.columns = ["cohort", "id", "type", "sample_id", "bam_path"]
        concat_df['pid'] = concat_df['cohort'].str.cat(concat_df['id'],
                                                       sep="-")
        reset_df = pd.pivot_table(concat_df,
                                  index=["pid"],
                                  columns=["type"],
                                  values=["bam_path"],
                                  aggfunc=np.sum).reset_index()
        new_paired_path = reset_df.rename_axis(None)["bam_path"]
        new_paired_path["pid"] = reset_df["pid"]
        paired_path = paired_path.append(pd.DataFrame(new_paired_path),
                                         ignore_index=True,
                                         sort=True)
        #columns = ["pid","NB","NT","TR","TP"])
    return paired_path
Example #8
async def waitForSubmission(workspace, submissions, raise_errors=True):
    """
    Waits for a set of Terra submissions to finish.

    Args:
    -----
      workspace: str namespace/workspace from url typically
      submissions: list[str] of submission ids
      raise_errors: bool set to True if failures should raise an exception

    Returns:
    -------
      list of entity names of the failed workflows
    """
    failed_submission = []
    timing = 0
    wm = dm.WorkspaceManager(workspace).disable_hound()
    assert submissions is not None
    if type(submissions) is type(""):
        submissions = [submissions]
    for scount, submission_id in enumerate(submissions):
        finished = False
        while not finished:
            done = 0
            failed = 0
            finished = True
            submission = wm.get_submission(submission_id)["workflows"]
            for _, i in enumerate(submission):
                if i['status'] not in {
                        'Done', 'Aborted', 'Failed', 'Succeeded'
                }:
                    finished = False
                elif i["status"] in {'Failed', 'Aborted'}:
                    failed += 1
                    if i["workflowEntity"][
                            "entityName"] not in failed_submission:
                        print(i["workflowEntity"]["entityName"])
                        failed_submission.append(
                            i["workflowEntity"]["entityName"])
                elif i["status"] in {'Done', 'Succeeded'}:
                    done += 1
            if not finished:
                time.sleep(40)
                print("status is: Done for " + str(done) +
                      " jobs in submission " + str(scount) + ". " +
                      str(timing) + ",5 mn elapsed.",
                      end="\r")
                timing += 1
                time.sleep(20)
                print("status is: Failed for " + str(failed) +
                      " jobs in submission " + str(scount) + ". " +
                      str(timing) + " mn elapsed.",
                      end="\r")
            else:
                print(
                    str(done / (done + failed)) +
                    " of jobs Succeeded in submission " + str(scount) + ".")
    if len(failed_submission) > 0 and raise_errors:
        raise RuntimeError(str(len(failed_submission)) + " failed submission")
    return failed_submission
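A usage sketch; since the function is declared async it has to be awaited (or run through asyncio), and the submission ids below are placeholders:

import asyncio

failed = asyncio.run(waitForSubmission('my-namespace/my-workspace',
                                       ['<submission-id-1>', '<submission-id-2>'],
                                       raise_errors=False))
print(failed)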
Example #9
async def cleanWorkspace(workspaceid,
                         only=[],
                         toleave=[],
                         defaulttoleave=[
                             'workspace', 'scripts', 'notebooks', 'files',
                             'data', 'hound', 'references', 'name', 'folder'
                         ]):
    """
    Removes all processing folders in a Terra workspace.

    Args:
    -----
      workspaceid: str, the workspace
      only: list[str]; if given, only these first-level folders are removed
      toleave: a list of first-level folders in the bucket that you don't want to be deleted
      defaulttoleave: non-processing folders that contain metadata and files for the workspace; they are never removed
    """
    toleave.extend(defaulttoleave)
    bucket = dm.WorkspaceManager(workspaceid).get_bucket_id()
    res = subprocess.run('gsutil -m ls gs://' + bucket,
                         shell=True,
                         capture_output=True)
    if res.returncode != 0:
        raise ValueError(str(res.stderr))
    res = str(res.stdout)[2:-1].split('\\n')[:-1]
    toremove = [val for val in res if val.split('/')[-2] not in toleave]
    if only:
        toremove = [val for val in res if val.split('/')[-2] in only]
    if h.askif('we are going to remove ' + str(len(toremove)) +
               " files/folders:\n" + str(toremove) + "\nare you sure?"):
        gcp.rmFiles(toremove, add='-r')
    else:
        print("aborting")
Example #10
File: __init__.py  Project: jkobject/JKBio
def deleteHeavyFiles(workspaceid, unusedOnly=True):
    """
    Lists all files above a certain size in a workspace (used or unused) so they can be removed.

    Args:
    ----
      workspaceid: str the name of the workspace
      unusedOnly: bool if True, only return files that do not appear in any of the sample/sample_set/pair data tables

    Returns:
      the set of gs:// paths that are candidates for removal
    """
    wm = dm.WorkspaceManager(workspaceid)
    bucket = wm.get_bucket_id()
    sizes = gcp.get_all_sizes('gs://'+bucket+'/')
    print('we got '+str(len(sizes))+' files')
    a = list(sizes.keys())
    a.sort()
    ma = 100
    torm = []
    tot = 0
    for i in a[::-1]:
        if i > 1000000 * ma:
            tot += i
            for val in sizes[i]:
                torm.append(val)
    print('we might remove more than ' + str(tot / 1000000000) + 'GB')
    if unusedOnly:
        sam = pd.concat([wm.get_samples(), wm.get_pairs(), wm.get_sample_sets()])
        tokeep = set([val for val in sam.values.ravel() if type(val) is str and val[:5] == 'gs://'])
        torm = set(torm) - tokeep
    return torm
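A usage sketch with a hypothetical workspace name; the function only returns the candidate paths, so they can be reviewed before anything is removed:

torm = deleteHeavyFiles('my-namespace/my-workspace', unusedOnly=True)
print(str(len(torm)) + ' candidate files to remove')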
Example #11
def df_from_workspace_set(workspace, set_type, set_name, column_name):
    """Get the dataframe specified by set_name (of set_type) in your Terra workspace.

    The dataframe is filtered by indices that have non-null values in the column specified by column_name. The whole
    dataframe is returned so that further filtering can be performed beyond what is in the given set_name.

    :param workspace: full name of Terra workspace
    :param set_type: one of [pair, sample, participant]
    :param set_name: name of desired set
    :param column_name: column name holding desired file/data
    :return: pandas.DataFrame of the desired set
    """
    if not set_type or not set_name or not column_name:
        raise ValueError("If calling from Terra workspace, the set_type, "
                         "set_name, and column_name must be specified.")

    # Import Workspace from Terra/Firecloud
    wm = dalmatian.WorkspaceManager(workspace)
    if set_type == 'pair':
        set_df = wm.get_pairs_in_pair_set(set_name)
    elif set_type == 'sample':
        set_df = wm.get_samples()
        set_df = set_df[np.in1d(set_df.index.values,
                                wm.get_sample_sets().loc[set_name]['samples'])]
    elif set_type == 'participant':
        set_df = wm.get_participants()
        set_df = set_df[np.in1d(
            set_df.index.values,
            wm.get_participant_sets().loc[set_name]['participants'])]
    else:
        raise ValueError(
            f"set_type must be one of pair, sample, participant, not {set_type}."
        )

    return set_df[set_df[column_name].notnull()]
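A usage sketch with hypothetical workspace, pair set and column names:

df = df_from_workspace_set('my-namespace/my-workspace',
                           set_type='pair', set_name='my_pair_set',
                           column_name='mutation_validator_validated_maf')
print(df.shape)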
Example #12
def extract_config_summary(workspace_name, workflows=None):
    """Summarizes each method configuration in a workspace: its entity-level inputs, outputs, root entity type and name."""
    wm = dalmatian.WorkspaceManager(workspace_name)
    configs = wm.get_configs()

    config_summaries = []
    for rec in configs.to_records():
        cfgname = rec['namespace'] + "/" + rec['name']
        if workflows is not None:
            if cfgname not in workflows:
                continue
        config = wm.get_config(cfgname)
        config['inputs'] = {k: v.strip() for k, v in config['inputs'].items()}
        config['outputs'] = {
            k: v.strip()
            for k, v in config['outputs'].items()
        }
        inputs = [
            resolve_dot_path(config['rootEntityType'], x)
            for x in config['inputs'].values() if x.startswith("this.")
        ]
        outputs = [
            resolve_dot_path(config['rootEntityType'], x)
            for x in config['outputs'].values() if x.startswith("this.")
        ]
        config_summaries.append(
            dict(inputs=inputs,
                 outputs=outputs,
                 entity_type=rec['rootEntityType'],
                 name=cfgname))
    return config_summaries
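A usage sketch with a hypothetical workspace name:

summaries = extract_config_summary('my-namespace/my-workspace')
for s in summaries:
    print(s['name'], s['entity_type'], s['inputs'], s['outputs'])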
Example #13
def saveOmicsOutput(workspace,
                    pathto_cnvpng='segmented_copy_ratio_img',
                    pathto_stats='sample_statistics',
                    specific_cohorts=[],
                    specific_celllines=[],
                    is_from_pairs=True,
                    pathto_snv='filtered_variants',
                    pathto_seg='cnv_calls',
                    datadir='gs://cclf_results/targeted/kim_sept/',
                    specific_samples=[]):
    """
    *WIP* For a workspace containing all omics workflows (CNV/SNV) (like CCLF's), copies all interesting outputs to a data bucket.

    Args:
    -----
      workspace: the workspace name
      pathto_cnvpng: sample column of the CNV plot results
      pathto_stats: sample column of the bam QC results
      specific_cohorts: if provided, will only look at these specific cohorts
      specific_celllines: if you need to run on specific cell lines only
      is_from_pairs: whether we process pairs or samples data
      pathto_snv: sample column of the snv files
      pathto_seg: sample column of the segment files
      datadir: gs bucket path where to copy the resulting files
      specific_samples: if provided, will only look at these samples
    """
    # NOTE: the original used an undefined `specificlist`; here we assume it is the union of the specific_* filters
    specificlist = specific_cohorts + specific_celllines + specific_samples
    samples = dm.WorkspaceManager(workspace).get_samples()
    if specificlist:
        samples = samples[samples.index.isin(specificlist)]
    if is_from_pairs:
        pairs = dm.WorkspaceManager(workspace).get_pairs()
        if specificlist:
            pairs = pairs[pairs['case_sample'].isin(specificlist)]
    for i, val in samples.iterrows():
        os.system('gsutil cp ' + val[pathto_seg] + ' ' + datadir + i + '/')
        os.system('gsutil cp ' + val[pathto_cnvpng] + ' ' + datadir + i + '/')
        os.system('gsutil cp ' + val[pathto_stats] + ' ' + datadir + i + '/')
        if is_from_pairs:
            snvs = pairs[pairs["case_sample"] == i][pathto_snv]
            for snv in snvs:
                if snv is not np.nan:
                    os.system('gsutil cp ' + snv + ' ' + datadir + i + '/')
                    break
        else:
            os.system('gsutil cp ' + val[pathto_snv] + ' ' + datadir + i + '/')
Example #14
File: __init__.py  Project: jkobject/JKBio
def addToSampleSet(workspace, samplesetid, samples):
  """
  add samples to a sample set

  will create new if doesn't already exist, else adds to existing

  Args:
  ----
    workspace: the workspace name
    samplesetid: the sample set name
    samples: a list of samples
  """
  try:
    prevsamples = dm.WorkspaceManager(workspace).get_sample_sets()['samples'][samplesetid]
    samples.extend(prevsamples)
  except KeyError:
    print('The sample set ' + str(samplesetid) + ' did not exist in the workspace. Will be created now...')
  dm.WorkspaceManager(workspace).update_sample_set(samplesetid, list(set(samples)))
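A usage sketch with hypothetical workspace, sample set and sample ids:

addToSampleSet('my-namespace/my-workspace', 'my_batch', ['sample-001', 'sample-002'])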
Example #15
def get_current_samples():
    """Get current samples from FC
    """
    namespace = "nci-mimoun-bi-org"
    workspace = "CCLF_TSCA_2_0"
    wm = dalmatian.WorkspaceManager(namespace, workspace)
    # Current samples
    curr_samples = wm.get_samples()
    return curr_samples
Example #16
File: __init__.py  Project: jkobject/JKBio
def findBackErasedDuplicaBamteFromTerraBucket(workspace, gsfolder, bamcol="WES_bam", baicol="WES_bai"):
  """
  If you have erased bam files in gcp with bai files still present and the bam files are stored elsewhere
  and their location is in a terra workspace.

  Will find them back by matching bai sizes and copy them back to their original locations

  Args:
  ----
    workspace: str namespace/workspace from url typically
    gsfolder: str the gsfolder where the bam files are
    bamcol: str colname of the bam
    baicol: str colname of the bai
  """
  # get ls of all files folder
  samples = os.popen('gsutil -m ls -al ' + gsfolder + '**.bai').read().split('\n')
  # compute size filepath

  sizes = {'gs://' + val.split('gs://')[1].split('#')[0]: int(val.split("2019-")[0]) for val in samples[:-2]}
  names = {}
  for k, val in sizes.items():
    if val in names:
      names[val].append(k)
    else:
      names[val] = [k]
  # get all bai in tsv
  samp = dm.WorkspaceManager(workspace).get_samples()
  for k, val in samp.iterrows():
    if val[bamcol] != 'NA' and val[baicol] != 'NA':
      # if bai has duplicate size
      code = os.system('gsutil ls ' + val[bamcol])
      if code == 256:
        if val[bamcol] is None:
          print('we dont have bam value for ' + str(k))
          continue
        else:
          print('no match values for ' + str(val[bamcol]))

        for va in names[sizes[val[baicol]]]:
          # for all duplicate size
          # if ls bam of bai duplicate size work
          # mv bam to bampath in gsfolder
          if '.bam' in va:
            if os.system('gsutil ls ' + va.split('.bam.bai')[0] + ".bam") == 0:
              print('gsutil mv ' + va.split('.bam.bai')[0] + ".bam " + val[bamcol])
              os.system('gsutil mv ' + va.split('.bam.bai')[0] + ".bam " + val[bamcol])
              break
          elif os.system('gsutil ls ' + va.split('.bai')[0] + ".bam") == 0:
            print('gsutil mv ' + va.split('.bai')[0] + ".bam " + val[bamcol])
            os.system('gsutil mv ' + va.split('.bai')[0] + ".bam " + val[bamcol])
            break
      elif code == signal.SIGINT:
        print('Awakened')
        break
    else:
      print("no data for " + str(k))
Example #17
File: __init__.py  Project: jkobject/JKBio
def addToPairSet(workspace, pairsetid, pairs):
  """
  add pairs to a pair set

  will create new if doesn't already exist, else adds to existing

  Args:
  ----
    workspace: the workspace name
    pairsetid: the pair set name
    pairs: a list of pairs
  """

  try:
    prevpairs = dm.WorkspaceManager(workspace).get_pair_sets().loc[[pairsetid]].pairs[0]
    pairs.extend(prevpairs)
  except KeyError:
    print('The pair set ' + str(pairsetid) + ' did not exist in the workspace. Will be created now...')
  dm.WorkspaceManager(workspace).update_pair_set(pairsetid, list(set(pairs)))
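A usage sketch with hypothetical workspace, pair set and pair ids:

addToPairSet('my-namespace/my-workspace', 'my_pairset', ['pair-001', 'pair-002'])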
Example #18
def getRNAQC(workspace, only=[], qcname="star_logs"):
    """Gets, for each sample in a workspace, the STAR '.Log.final.out' QC file path stored in the qcname column."""
    res = {}
    wm = dm.WorkspaceManager(workspace)
    sam = wm.get_samples()
    if len(only) > 0:
        sam = sam[sam.index.isin(only)]
    for k, val in sam[qcname].iteritems():
        for i in val:
            if '.Log.final.out' in i:
                res[k] = i
    return res
Example #19
def copyToWorkspace(workspaceID,
                    tracker,
                    columns=[
                        "arxspan_id", "version", "sm_id", "datatype", "size",
                        "ccle_name", "stripped_cell_line_name",
                        "participant_id", "cellosaurus_id",
                        "bam_public_sra_path", "internal_bam_filepath",
                        "internal_bai_filepath", "parent_cell_line", "sex",
                        "matched_normal", "age", "primary_site",
                        "primary_disease", "subtype", "subsubtype", "origin",
                        "mediatype", "condition", "sequencing_type", "baits",
                        "source", "legacy_bam_filepath", "legacy_bai_filepath"
                    ],
                    rename={},
                    deleteUnmatched=False,
                    addMissing=False):
    """
    Uses the current sample tracker to update samples in the workspace.

    It can also remove samples that are in the workspace but not in the tracker.

    Args:
    ----
      workspaceID: str the workspace id
      tracker: dataframe the sample tracker
      columns: list[str] the columns to sync
      rename: dict(str:str) columns to rename from sample tracker to workspace
      deleteUnmatched: bool whether or not to delete samples that are in the workspace but not in the sample tracker
      addMissing: bool whether to upload samples that are in the tracker but not yet in the workspace
    """
    wm = dm.WorkspaceManager(workspaceID).disable_hound()
    sam = wm.get_samples()
    track = tracker[tracker.index.isin(
        sam.index)][columns].rename(columns=rename)
    track.index.name = "sample_id"
    miss = set(columns) - set(sam.columns)
    if len(track) == 0 and not addMissing:
        raise ValueError('wrong tracker or index non matching')
    unmatched = set(sam.index) - (set(tracker.index) | set(['nan']))
    if not addMissing:
        print("found these to be unmatched in the tracker: " + str(unmatched))
        if deleteUnmatched and len(unmatched) > 0:
            terra.removeSamples(workspaceID, unmatched)
    unmatched = (set(tracker.index) - set(sam.index))
    if len(track) != 0:
        wm.update_sample_attributes(track)
    if addMissing and len(unmatched) > 0:
        print('found these samples to be missing in the workspace: ' +
              str(unmatched))
        track = tracker[tracker.index.isin(unmatched)][columns].rename(
            columns=rename)
        track.index.name = "sample_id"
        wm.upload_samples(track)
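A usage sketch, assuming a hypothetical local csv copy of the sample tracker indexed by sample id:

import pandas as pd

tracker = pd.read_csv('sample_tracker.csv', index_col=0)
copyToWorkspace('my-namespace/my-workspace', tracker,
                columns=['arxspan_id', 'version', 'internal_bam_filepath', 'internal_bai_filepath'],
                deleteUnmatched=False, addMissing=True)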
Example #20
def update(
    samples,
    samplesetname,
    stype,
    bucket,
    refworkspace,
    name_col="index",
    values=['legacy_bam_filepath', 'legacy_bai_filepath'],
    filetypes=['bam', 'bai'],
    my_id='~/.client_secret.json',
    mystorage_id="~/.storage.json",
    creds='../.credentials.json',
    sampletrackername='ccle sample tracker',
    refsheet_url="https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY",
):

    # uploading to our bucket (now a new function)
    terra.changeToBucket(samples,
                         bucket,
                         name_col=name_col,
                         values=values,
                         filetypes=filetypes,
                         catchdup=True,
                         test=False)

    sheets = Sheets.from_files(my_id, mystorage_id)
    ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

    names = []
    subccle_refsamples = ccle_refsamples[ccle_refsamples['datatype'] == stype]
    for k, val in samples.iterrows():
        val = val["arxspan_id"]
        names.append(val)
        samples.loc[k, 'version'] = len(subccle_refsamples[
            subccle_refsamples['arxspan_id'] == val]) + names.count(val)
    samples['version'] = samples['version'].astype(int)

    ccle_refsamples = ccle_refsamples.append(samples, sort=False)
    dfToSheet(ccle_refsamples, sampletrackername, secret=creds)

    #uploading new samples to mut
    refwm = dm.WorkspaceManager(refworkspace).disable_hound()
    refwm.upload_samples(samples)
    sam = refwm.get_samples()

    #creating a sample set
    refwm.update_sample_set(sample_set_id=samplesetname,
                            sample_ids=samples.index)
    refwm.update_sample_set(
        sample_set_id='all',
        sample_ids=[i for i in sam.index.tolist() if i != 'nan'])
Example #21
def ExtractStarQualityInfo(samplesetname, workspace, release='temp'):
    """
    Puts all of the STAR quality results from the Terra STAR workflow into one txt file.

    Args:
    -----
      samplesetname: the sample set name for which to grab the samples processed by STAR.
      workspace: the terra workspace
      release: the name of the folder where it will be stored
    """
    a = dm.WorkspaceManager(workspace).get_samples().loc[dm.WorkspaceManager(
        workspace).get_sample_sets().loc[samplesetname].samples].star_logs
    for i, sample in enumerate(a):
        if sample is None:
            print("no log file found for: " + a.index[i])
            continue
        for log in sample:
            if 'final.out' in log:
                print("copying " + a.index[i])
                os.system('gsutil cp ' + log + ' temp/')
    os.system("cat data/" + release + "/*.Log.final.out > temp/" +
              samplesetname + ".txt")
    os.system("rm data/" + release + "/*.Log.final.out")
Example #22
def getWESQC(workspace, only=[], qcname=[]):
    """Gets, for each sample in a workspace, the QC file paths stored in the qcname column(s)."""
    res = {}
    wm = dm.WorkspaceManager(workspace)
    sam = wm.get_samples()
    if len(only) > 0:
        sam = sam[sam.index.isin(only)]
    for k, val in sam.iterrows():
        res[k] = []
        for i in val[qcname]:
            if type(i) is list:
                res[k].extend(i)
            else:
                res[k].append(i)
    return res
Example #23
def deleteJob(workspaceid, subid, taskid, deleteCurrent=False, dryrun=True):
    """
    Removes files generated by a job on Terra.

    Args:
    -----
      workspaceid: str workspace name
      subid: str the submission id of the job
      taskid: str the id of the task in this job
      deleteCurrent: bool whether to delete files even if they appear in one of the sample/sample_set/pair data tables
      dryrun: bool just print the commands but don't execute them
    """
    wm = dm.WorkspaceManager(workspaceid)
    bucket = wm.get_bucket_id()
    data = []
    if deleteCurrent:
        if dryrun:
            print('gsutil -m rm gs://' + bucket + '/' + subid + '/*/' +
                  taskid + '/**')
        else:
            res = subprocess.run('gsutil -m rm gs://' + bucket + '/' + subid +
                                 '/*/' + taskid + '/**',
                                 shell=True,
                                 capture_output=True)
            if res.returncode != 0:
                raise ValueError(str(res.stderr))
    else:
        res = subprocess.run('gsutil -m ls gs://' + bucket + '/' + subid +
                             '/*/' + taskid + '/**',
                             shell=True,
                             capture_output=True)
        if res.returncode != 0 or len(str(res.stdout)) < 4:
            raise ValueError(str(res.stderr))
        data += str(res.stdout)[2:-1].split('\\n')[:-1]
        if "TOTAL:" in data[-1]:
            data = data[:-1]
        sam = pd.concat(
            [wm.get_samples(),
             wm.get_pairs(),
             wm.get_sample_sets()])
        tokeep = set([
            val for val in sam.values.ravel()
            if type(val) is str and val[:5] == 'gs://'
        ])
        torm = set(data) - tokeep
        if dryrun:
            print(torm)
        else:
            h.parrun(['gsutil rm ' + i for i in torm], cores=12)
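A usage sketch; the submission and task ids are placeholders, and dryrun=True only prints what would be removed:

deleteJob('my-namespace/my-workspace', '<submission-id>', '<task-name>', dryrun=True)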
Example #24
def get_filenames(workspace, output_file='maf_filenames.txt'):
    # Import Workspace from Firecloud
    wm = dalmatian.WorkspaceManager(workspace)
    pairs = wm.get_pairs()
    # pairs = wm.get_pairs_in_pair_set('DESIRED_PAIR_SET')

    pairs = pairs[pairs['mutation_validator_validated_maf'].notnull()]

    ###################
    # Perform Filtering

    ###################

    desired_files = pairs['mutation_validator_validated_maf'].tolist()

    with open(output_file, 'w') as o_file:
        o_file.writelines(f'{f_name}\n' for f_name in desired_files)
Example #25
async def shareTerraBams(
        samples,
        users,
        workspace,
        bamcols=["internal_bam_filepath", "internal_bai_filepath"],
        unshare=False):
    """
    Shares some files from gcp with a set of users, using terra as metadata repo.

    Only works with files that are listed on a terra workspace tsv but actually
    point to a regular google bucket and not a terra bucket.

    Args:
    ----
      users: list[str] of users' google accounts
      workspace: str namespace/workspace from url typically
      samples: list[str] of sample ids for which you want to share data
      bamcols: list[str] of column names of gsfiles to share
      unshare: bool set to True to revoke access instead of granting it

    Returns:
    --------
      a list of the gs paths we have given access to
    """
    if type(users) is str:
        users = [users]
    wm = dm.WorkspaceManager(workspace)
    togiveaccess = np.ravel(wm.get_samples()[bamcols].loc[samples].values)
    key = "-rd " if unshare else "-ru "
    for user in users:
        files = ''
        for i in togiveaccess:
            files += ' ' + i
        code = os.system("gsutil -m acl ch " + key + user +
                         (" " if unshare else ":R ") + files)
        if code == signal.SIGINT:
            print('Awakened')
            break
    print('the files are stored here:\n\n')
    print(togiveaccess)
    print('\n\njust install and use gsutil to copy them')
    print('https://cloud.google.com/storage/docs/gsutil_install')
    print('https://cloud.google.com/storage/docs/gsutil/commands/cp')
    return togiveaccess
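A usage sketch; since the function is declared async it has to be awaited, and the sample id, user account and workspace below are hypothetical:

import asyncio

shared = asyncio.run(shareTerraBams(['sample-001'], ['someone@example.com'],
                                    'my-namespace/my-workspace'))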
Example #26
File: __init__.py  Project: jkobject/JKBio
def removeSamples(workspace, samples):
  """
  Removes a set of samples from a workspace (very useful when we have linked pairs and pair sets).

  Args:
  -----
    workspace: str workspace name
    samples: list of samples
  """
  wm = dm.WorkspaceManager(workspace).disable_hound()
  try:
    wm.delete_sample(samples)
  except Exception:
    # deletion failed, most likely because some samples belong to pairs
    print('we had pairs.')
    pairs = wm.get_pairs()
    pairid = pairs[pairs.case_sample.isin(samples)].index.tolist()
    for k, val in wm.get_pair_sets().iterrows():
      wm.update_pair_set(k, set(val.tolist()[0]) - set(pairid))
    wm.delete_pair(pairid)
    wm.delete_sample(samples)
Example #27
File: __init__.py  Project: jkobject/JKBio
def saveConfigs(workspace, filepath):
  """
  Saves everything about a workspace's configurations into a csv and a json file.

  Args:
  -----
    workspace: str namespace/workspace from url typically
      namespace (str): project to which workspace belongs
      workspace (str): Workspace name
    filepath: str path prefix where to save the files
  """
  wm = dm.WorkspaceManager(workspace)
  h.createFoldersFor(filepath)

  conf = wm.get_configs()
  conf.to_csv(filepath + '.csv')
  params = {}
  params['GENERAL'] = wm.get_workspace_metadata()
  for k, val in conf.iterrows():
    params[k] = wm.get_config(val['name'])
  h.dictToFile(params, filepath + '.json')
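A usage sketch with a hypothetical workspace and file path prefix; it writes a .csv and a .json file side by side:

saveConfigs('my-namespace/my-workspace', 'backups/my_workspace_configs')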
Example #28
def saveWorkspace(workspace, folderpath):
    """
    Saves everything about a workspace (configs, WDLs, inputs/outputs) into csv and json files.

    Args:
    -----
      workspace: str namespace/workspace from url typically
        namespace (str): project to which workspace belongs
        workspace (str): Workspace name
      folderpath: str path to the folder where to save files
    """
    wm = dm.WorkspaceManager(workspace)
    h.createFoldersFor(folderpath)

    conf = wm.get_configs()
    for k, val in conf.iterrows():
        with open(folderpath + val['name'] + ".wdl", "w") as f:
            if val.sourceRepo == 'dockstore':
                name = "dockstore.org/" + '/'.join(val['methodPath'].split(
                    '/')[2:4]) + '/' + val['methodVersion']
            else:
                name = '/'.join(
                    val[['methodNamespace', 'methodName',
                         'methodVersion']].astype(str).tolist())
            try:
                f.write(dm.get_wdl(name))
            except MethodNotFound:
                print(name + " could not be found")
    conf.to_csv(folderpath + 'worflow_list.csv')
    params = {}
    params['GENERAL'] = wm.get_workspace_metadata()
    for k, val in conf.iterrows():
        params[k] = wm.get_config(val['name'])
        h.dictToFile(params[k]['inputs'],
                     folderpath + "inputs_" + val['name'] + '.json')
        h.dictToFile(params[k], folderpath + "conf_" + val['name'] + '.json')
        h.dictToFile(params[k]['outputs'],
                     folderpath + "outputs_" + val['name'] + '.json')
    h.dictToFile(params, folderpath + 'all_configs.json')
Example #29
def __init__(
    self, workspace: str, entityType: str, entityName: str,
    entityExpression: typing.Optional[str] = None, write_to_workspace: bool = True,
    alias: typing.Union[None, str, typing.List[str]] = None,
):
    """
    Initializes the adapter
    Must provide workspace and entity information
    If no expression is provided:
    * Assume a single job, and resolve all input expressions in the context of the one entity
    If an expression is provided:
    * Assume multiple entities (entity type will be auto-detected)
    * Launch one job per entity, resolving input expressions for each one
    If alias is provided, it is used to specify custom job aliases.
    alias may be a list of strings (an alias for each job) or a single string
    (the input variable to use as the alias)
    """
    super().__init__(alias=alias)
    self.workspace = dalmatian.WorkspaceManager(workspace)
    if entityName not in self.workspace._get_entities_internal(entityType).index:
        raise NameError('No such {} "{}" in workspace {}'.format(
            entityType,
            entityName,
            workspace
        ))
    self._entityType = entityType
    self._entityName = entityName
    self._entityExpression = entityExpression
    self.evaluator = self.workspace.get_evaluator(False)
    if entityExpression is not None:
        self.entities = self.evaluator(entityType, entityName, entityExpression)
        self.etype = self.evaluator.determine_reference_type(entityType, self.entities, '')
    else:
        self.entities = [entityName]
        self.etype = entityType
    self.write_to_workspace = write_to_workspace
    self.__spec = None
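A usage sketch; the class this __init__ belongs to is not shown in the excerpt, so the name FirecloudAdapter below is hypothetical, as are the workspace and entity names:

adapter = FirecloudAdapter('my-namespace/my-workspace',
                           entityType='sample_set', entityName='all_samples',
                           entityExpression='this.samples')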
Example #30
File: __init__.py  Project: jkobject/JKBio
def renametsvs(workspace, wmto=None, index_func=None):
  """
  ################## WIP ############
  only works for one use case
  """
  data = {}
  wmfrom = dm.WorkspaceManager(workspace)
  try:
    a = wmfrom.get_participants()
    data.update({'participants': a})
  except:
    print('no participants')
  try:
    a = wmfrom.get_samples()
    data.update({'samples': a})
  except:
    print('no samples')
  try:
    a = wmfrom.get_pair_sets()
    data.update({'pair_sets': a})
  except:
    print('no pair_sets')
  try:
    a = wmfrom.get_pairs()
    data.update({'pairs': a})
  except:
    print('no pairs')
  try:
    a = wmfrom.get_sample_sets()
    data.update({'sample_sets': a})
  except:
    print('no sample_sets')
  # currently works only for sample, sample
  for k, entity in data.items():
    ind = []
    for val in entity.index:
      pos = val.find('-SM')
      if pos != -1:
        val = val[pos + 1:]
        pos = val.find('-SM')
        if pos != -1:
          val = val[:9] + val[pos + 1:]
      ind.append(val)
    entity.index = ind
    # for all columns of the tsv
    for k, val in entity.iterrows():
      for i, v in enumerate(val):
        if type(v) is list or type(v) is str:
          ind = []
          for j in v:
            pos = j.find('-SM')
            if pos != -1:
              j = j[pos + 1:]
              pos = j.find('-SM')
              if pos != -1:
                j = j[:9] + j[pos + 1:]
            ind.append(j)
          val[i] = ind
        entity.loc[k] = val
    if wmto is None:
      wmto = wmfrom
    if "participants" in data:
      wmto.upload_participants(data['participants'].index.tolist())
    if "samples" in data:
      wmto.upload_samples(data['samples'])
    if "pairs" in data:
      wmto.upload_entities('pair', data['pairs'])
    if "pair_set" in data:
      pairset = data['pair_set'].drop('pairs', 1)
      wmto.upload_entities('pair_set', pairset)
      for i, val in data['pair_set'].iterrows():
        wmto.update_pair_set(i, val.pairs)
    if "sample_set" in data:
      sampleset = data['sample_set'].drop('samples', 1)
      wmto.upload_entities('sample_set', sampleset)
      for i, val in data['sample_set'].iterrows():
        wmto.update_sample_set(i, val.samples)