Example #1
async def cleanWorkspace(workspaceid,
                         only=[],
                         toleave=[],
                         defaulttoleave=[
                             'workspace', 'scripts', 'notebooks', 'files',
                             'data', 'hound', 'references', 'name', 'folder'
                         ]):
    """
  removes all processing folder in a terra workspace easily

  args:
    only: list of strings to keep
    workspaceid: str, the workspace
    toleave: a list of first order folder in the bucket that you don't want to be deleted
    defaulttoleave: it should contain non processing folders that contain metadata and files for the workspace
  """
    toleave = list(toleave) + list(defaulttoleave)  # copy to avoid mutating the default lists
    bucket = dm.WorkspaceManager(workspaceid).get_bucket_id()
    res = subprocess.run('gsutil -m ls gs://' + bucket,
                         shell=True,
                         capture_output=True)
    if res.returncode != 0:
        raise ValueError(str(res.stderr))
    res = res.stdout.decode('utf-8').split('\n')[:-1]
    # gsutil folder listings end with '/', so the folder name is the second-to-last element
    toremove = [val for val in res if val.split('/')[-2] not in toleave]
    if only:
        # restrict removal to the folders explicitly listed in `only`
        toremove = [val for val in res if val.split('/')[-2] in only]
    if h.askif('we are going to remove ' + str(len(toremove)) +
               " files/folders:\n" + str(toremove) + "\nare you sure?"):
        gcp.rmFiles(toremove, add='-r')
    else:
        print("aborting")
Example #2
def mergeReplicatePeaks(peaks,
                        bigwigfolder,
                        markedasbad=None,
                        window=100,
                        sampling=3000,
                        mincov=4,
                        doPlot=True,
                        cov={},
                        minKL=8,
                        use='max',
                        MINOVERLAP=0.3,
                        lookeverywhere=True,
                        only='',
                        saveloc=''):
    """
    /!/ should only be passed peaks with at least one good replicate
    for each TFpeaksets,
    1. find the replicate that have the most peaks
    2. correlate peaks and get in highest correlation order with the replicate found in 1
    3. find overlap of both and get size of second replicate
    4. if small(er)-> use only to increase statistics
        1. if a lot of uncalled peaks in replicate 2 at replicate 1 peaks (flag for mergebam)
    5. if similar size -> get only intersect
        2. add to intersect, find uncalled peaks in both replicates which are called in the other
    6. repeat for all replicates
    -------------------------
    if full overlap of one of the peak replicate, only use the overlapped one to increase confidence on peak
    if >80% average non overlap,
        print warning and percentage of overlap

    if <20% average non overlap,
        take the overlap and increase confidence and avg logfold

    if one is <20%:
        if other <40% average non overlap,
        take the overlap and increase confidence and avg logfold
        else
        take

    gets the max cov at the genomic window and if above some threshold, accepts the peak.

    extend peak by X bp if no TSS
    remove TSS from peaks


        create a new data frame containing merged peak size, reassembled peak data (p value etc..) and
        a the value for presence of each TF listed in previous df
        ------------------------------------

        args:
        ----
        peaks: df[bed-like] all the peaks into the sameBam with a column containing the 'name'
        being the id of the sample, the 'replicate' number of this sample, the 'tf' chiped here
        bamfolder: str, foldername
        avgCov: dict(filename:int) a dict where for each bam filename is given an averageCoverage
        if use=='max':
                window:
                mincov:

        if use=='max':


        returns:
        -------
        mergedpeaks: dict{df-peakslike}
        bamtomerge: [[bam1,bam2]]
    """
    def col_nan_scatter(x, y, **kwargs):
        df = pd.DataFrame({'x': x[:], 'y': y[:]})
        df = df[df.sum(1) != 0]  # drop rows where both coordinates are 0
        plt.gca()
        plt.scatter(df['x'], df['y'])

    def col_nan_kde_histo(x, **kwargs):
        df = pd.DataFrame({'x': x[:]})
        df = df[df['x'] != 0]
        x = df['x']
        plt.gca()
        sns.kdeplot(x)

    print("/!/ should only be passed peaks with at least one good replicate")
    # for a df containing a set of peaks in bed format and an additional column of different TF
    tfs = list(set(peaks['tf']))
    totpeaknumber = 0
    mergedpeaksdict = {}
    remove = []
    tomergebam = []
    ratiosofunique = {}
    h.createFoldersFor(saveloc)
    f = open(saveloc + 'results.txt', 'w')
    warnings.simplefilter("ignore")
    for tf in tfs:
        if only and tf != only:
            continue
        cpeaks = peaks[peaks.tf == tf]
        print('_____________________________________________________')
        f.write('_____________________________________________________' + '\n')
        if len(set(cpeaks['replicate'])) == 1:
            if cpeaks.name.tolist()[0] in markedasbad:
                print('the only replicate is considered bad!')
                f.write('the only replicate is considered bad!' + "\n")
                print('wrong TF: ' + tf)
                f.write('wrong TF: ' + tf + "\n")
                mergedpeaksdict.update({tf: cpeaks})
                remove.append(tf)
                continue
            print("we only have one replicate for " + tf + " .. pass")
            f.write("we only have one replicate for " + tf + " .. pass" + "\n")
            mergedpeaksdict.update({tf: cpeaks})
            continue
        print("merging " + tf + " peaks")
        f.write("merging " + tf + " peaks" + "\n")
        merged = simpleMergePeaks(cpeaks, window=window, maxp=False)
        merged_bed = merged[merged.columns[8:]]
        finalpeaks = merged[merged.columns[:8]]
        print('--> finish first overlaps lookup')
        f.write('--> finish first overlaps lookup' + "\n")
        # flag when the biggest replicate has fewer than 1000 peaks
        if len(finalpeaks) < 1000:
            print('!TF has less than 1000 PEAKS!')
            f.write('!TF has less than 1000 PEAKS!' + "\n")
        # for each TF (replicates), compute number of peaks
        peakmatrix = merged_bed.values.astype(bool)

        presence = []
        for peakpres in peakmatrix.T:  # https://github.com/tctianchi/pyvenn
            presence.append(
                set([i for i, val in enumerate(peakpres) if val == 1]))
        # compute overlap matrix (venn?)
        if peakmatrix.shape[1] < 7 and doPlot:
            plot.venn(presence, [
                i + '_BAD' if i.split('-')[0] in markedasbad else i
                for i in merged_bed.columns
            ],
                      title=tf + "_before_venn",
                      folder=saveloc)
            plt.show()
        elif doPlot:
            print('too many replicates for Venn: ' + str(peakmatrix.shape[1]))
            f.write('too many replicates for Venn: ' +
                    str(peakmatrix.shape[1]) + "\n")
        if doPlot:
            fig = sns.pairplot(merged_bed,
                               corner=True,
                               diag_kind="kde",
                               kind="reg",
                               plot_kws={"scatter_kws": {
                                   "alpha": .05
                               }})
            #fig = fig.map_upper(col_nan_scatter)
            #fig = fig.map_upper(col_nan_kde_histo)
            plt.suptitle("correlation of peaks in each replicate", y=1.08)
            if saveloc:
                fig.savefig(saveloc + tf + "_before_pairplot.pdf")
            plt.show()
            for i, val in enumerate(merged_bed):
                unique_inval = np.logical_and(
                    np.delete(peakmatrix, i, axis=1).sum(1).astype(bool) == 0,
                    peakmatrix[:, i])
                sns.kdeplot(merged_bed[val][unique_inval],
                            legend=True).set(xlim=(0, None))
            plt.title("distribution of unique peaks in each replicate")
            if saveloc:
                plt.savefig(saveloc + tf + "_before_unique_kdeplot.pdf")
            plt.show()

        bigwigs = os.listdir(bigwigfolder)

        foundgood = False
        sort = findBestPeak(presence)
        for ib, sb in enumerate(sort):
            if merged_bed.columns[sb].split('-')[0] not in markedasbad:
                foundgood = True
                break
        if not foundgood:
            print('no peaks were good enough quality')
            f.write('no peaks were good enough quality' + "\n")
            print('bad TF: ' + tf)
            f.write('bad TF: ' + tf + "\n")
            remove.append(tf)
            ib = 0
        # distplot
        # correlation plot

        biggest_ind = sort[ib]
        peakmatrix = peakmatrix.T
        biggest = merged_bed.columns[biggest_ind]
        print('-> main rep is: ' + str(biggest))
        f.write('-> main rep is: ' + str(biggest) + '\n')
        tot = peakmatrix[biggest_ind].copy().astype(int)
        # starts with highest similarity and go descending
        j = 0
        recovered = 0
        additionalpeaksinbig = np.array([])
        for i, val in enumerate(sort):
            if i == ib:
                continue
            j += 1
            # if average non-overlap > 60% for the first (most-correlated) replicate and neither set is small, flag the TF as unreliable
            overlap = len(presence[biggest_ind] & presence[val]) / len(
                presence[biggest_ind])
            peakname = merged_bed.columns[val]
            print('- ' + peakname)
            f.write('- ' + peakname + '\n')
            print('  overlap: ' + str(overlap * 100) + "%")
            f.write('  overlap: ' + str(overlap * 100) + "%" + '\n')
            if overlap < MINOVERLAP:
                smallsupport = len(presence[biggest_ind]
                                   & presence[val]) / len(presence[val])
                print(' --> not enough overlap')
                f.write(' --> not enough overlap' + '\n')
                if smallsupport < MINOVERLAP:
                    # if the secondary does not have itself the required support
                    if j == 1 and merged_bed.columns[val].split(
                            '-')[0] not in markedasbad:
                        print("  Wrong TF: " + tf)
                        f.write("  Wrong TF: " + tf + '\n')
                        remove.append(tf)
                        break
                    # if not first, throw the other replicate and continue
                    print("  not using this replicate from the peakmatrix")
                    f.write("  not using this replicate from the peakmatrix" +
                            '\n')
                    continue
            if lookeverywhere:
                tolookfor = peakmatrix[val] == 0
            else:
                tolookfor = np.logical_and(peakmatrix[biggest_ind],
                                           peakmatrix[val] == 0)
            # windows to scan: wherever the secondary lacks a peak (everywhere if lookeverywhere, else only under primary peaks)
            additionalpeaksinsec = findAdditionalPeaks(
                finalpeaks,
                tolookfor,
                bigwigfolder + findpeakpath(bigwigfolder, peakname),
                sampling=sampling,
                mincov=mincov,
                window=window,
                minKL=minKL,
                use=use)
            if len(additionalpeaksinsec[additionalpeaksinsec > 0]) > 0:
                sns.kdeplot(additionalpeaksinsec[additionalpeaksinsec > 0],
                            label=peakname,
                            legend=True).set(xlim=(0, None))
                print('  min,max from newly found peaks: ' + str(
                    (additionalpeaksinsec[additionalpeaksinsec > 0].min(),
                     additionalpeaksinsec[additionalpeaksinsec > 0].max())))
                f.write('  min,max from newly found peaks: ' + str(
                    (additionalpeaksinsec[additionalpeaksinsec > 0].min(),
                     additionalpeaksinsec[additionalpeaksinsec > 0].max())) +
                        '\n')
            # for testing purposes mainly
            finalpeaks[additionalpeaksinsec.astype(bool)].to_csv(
                'additionalpeaksinsec_mp' + merged_bed.columns[val] + '.bed',
                sep='\t',
                index=False,
                header=False)
            peakmatrix[val] = np.logical_or(peakmatrix[val],
                                            additionalpeaksinsec.astype(bool))
            overlap = np.sum(
                np.logical_and(peakmatrix[val],
                               peakmatrix[biggest_ind])) / np.sum(
                                   peakmatrix[biggest_ind])
            if overlap < MINOVERLAP:
                newsmalloverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[val])
                print("  we did not had enough initial overlap.")
                f.write("  we did not had enough initial overlap." + '\n')
                if newsmalloverlap < MINOVERLAP:
                    if merged_bed.columns[val].split('-')[0] in markedasbad:
                        print('  replicate ' + merged_bed.columns[val] +
                              ' was too bad and had not enough overlap')
                        f.write('  replicate ' + merged_bed.columns[val] +
                                ' was too bad and had not enough overlap' +
                                '\n')
                        continue
                    elif h.askif(
                            "we have two good quality peaks that don't merge well at all: "
                            + merged_bed.columns[val] + " and " +
                            merged_bed.columns[biggest_ind] +
                            " can the first one be removed?:\n  \
                            overlap: " + str(overlap * 100) +
                            '%\n  new small overlap: ' +
                            str(newsmalloverlap * 100) + "%"):
                        continue
                    else:
                        print("  enough from small overlaps")
                        f.write("  enough from small overlaps" + '\n')
            print(' --> enough overlap')
            f.write(' --> enough overlap' + '\n')
            recovered += np.sum(additionalpeaksinsec.astype(bool))
            if merged_bed.columns[val].split('-')[0] not in markedasbad:
                tot += peakmatrix[val].astype(int)
            # now search the primary replicate for peaks it is missing
            if not lookeverywhere or len(additionalpeaksinbig) == 0:
                tolookfor = peakmatrix[
                    biggest_ind] == 0 if lookeverywhere else np.logical_and(
                        peakmatrix[biggest_ind] == 0, peakmatrix[val])
                additionalpeaksinbig = findAdditionalPeaks(
                    finalpeaks,
                    tolookfor,
                    bigwigfolder + findpeakpath(bigwigfolder, biggest),
                    sampling=sampling,
                    mincov=mincov,
                    window=window,
                    minKL=minKL,
                    use=use)
                if len(additionalpeaksinbig[additionalpeaksinbig > 0]) > 0:
                    sns.kdeplot(additionalpeaksinbig[additionalpeaksinbig > 0],
                                label=biggest,
                                legend=True).set(xlim=(0, None))
                    print('  min,max from newly found peaks: ' + str((
                        additionalpeaksinbig[additionalpeaksinbig > 0].min(),
                        additionalpeaksinbig[additionalpeaksinbig > 0].max())))
                    f.write('  min,max from newly found peaks: ' + str(
                        (additionalpeaksinbig[additionalpeaksinbig > 0].min(),
                         additionalpeaksinbig[additionalpeaksinbig > 0].max()
                         )) + '\n')

                peakmatrix[biggest_ind] = np.logical_or(
                    peakmatrix[biggest_ind], additionalpeaksinbig)
                tot += additionalpeaksinbig.astype(bool).astype(int)
                recovered += np.sum(additionalpeaksinbig.astype(bool))
            print('  we have recovered ' + str(recovered) +
                  ' peaks, equal to ' +
                  str(100 * recovered / np.sum(peakmatrix[biggest_ind])) +
                  '% of the peaks in main replicate')
            f.write('  we have recovered ' + str(recovered) +
                    ' peaks, equal to ' +
                    str(100 * recovered / np.sum(peakmatrix[biggest_ind])) +
                    '% of the peaks in main replicate' + '\n')
            if overlap < (MINOVERLAP + 0.2) / 1.2:
                # we recompute to see if the overlap changed
                newoverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[biggest_ind])
                smalloverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[val])
                if newoverlap < (MINOVERLAP + 0.2) / 1.2:
                    if smalloverlap < (2 + MINOVERLAP) / 3:
                        print(
                            "  not enough overlap to advise merging the bams.\n  old overlap: "
                            + str(overlap * 100) + '%\n  \
                            new overlap: ' + str(newoverlap * 100) + "%")
                        f.write(
                            "  not enough overlap to advise merging the bams.\n  old overlap: "
                            + str(overlap * 100) + '%\n  \
                            new overlap: ' + str(newoverlap * 100) + "%" +
                            '\n')
                        continue
                    else:
                        print(
                            '  enough from the small overlap to advise merging the peaks'
                        )
                        f.write(
                            '  enough from the small overlap to advise merging the peaks'
                            + '\n')
            tomergebam.append([biggest, peakname])
            # the quality is good enough in the end; un-flag the TF if it was marked for removal
            if tf in remove:
                remove.remove(tf)
        plt.title('distribution of newly found peaks')
        if saveloc:
            plt.savefig(saveloc + tf + "_new_found_peaks_kdeplot.pdf")
        plt.show()
        # new distplot
        # new correlation plot
        ratiosofunique[tf] = len(
            np.argwhere(peakmatrix.sum(0) == 1)) / peakmatrix.shape[1]
        if doPlot:
            sns.pairplot(merged_bed,
                         corner=True,
                         diag_kind="kde",
                         kind="reg",
                         plot_kws={"scatter_kws": {
                             "alpha": .05
                         }})
            #fig = fig.map_upper(col_nan_scatter)
            #fig = fig.map_upper(col_nan_kde_histo)
            plt.suptitle(
                "correlation and distribution of peaks after recovery", y=1.08)
            if saveloc:
                plt.savefig(saveloc + tf + "_after_pairplot.pdf")
            plt.show()
            for i, val in enumerate(merged_bed):
                unique_inval = np.logical_and(
                    np.delete(peakmatrix, i, axis=0).sum(0).astype(bool) == 0,
                    peakmatrix[i])
                sns.kdeplot(merged_bed[val][unique_inval],
                            legend=True).set(xlim=(0, None))
            plt.title(
                "distribution of unique peaks in each replicate after recovery"
            )
            if saveloc:
                plt.savefig(saveloc + tf + "_after_unique_kdeplot.pdf")
            plt.show()
        if len(peakmatrix.shape) > 1 and doPlot:
            if peakmatrix.shape[0] < 7:
                presence = []
                for peakpres in peakmatrix:  # https://github.com/tctianchi/pyvenn
                    presence.append(
                        set([i for i, val in enumerate(peakpres) if val == 1]))
                title = tf + '_recovered (TOREMOVE)' if tf in remove else tf + '_recovered'
                plot.venn(presence, [
                    i + '_BAD' if i.split('-')[0] in markedasbad else i
                    for i in merged_bed.columns
                ],
                          title=title,
                          folder=saveloc)
                plt.show()
            else:
                print('too many replicates for Venn')
                f.write('(too many replicates for Venn)' + '\n')
        # keep peaks supported by at least two replicates or present in the main one
        # (regardless of doPlot)
        if len(peakmatrix.shape) > 1:
            finalpeaks = finalpeaks[np.logical_or(tot > 1,
                                                  peakmatrix[biggest_ind])]
        finalpeaks['name'] = biggest
        finalpeaks['tf'] = tf
        mergedpeaksdict.update({tf: finalpeaks})
        print(str((tf, len(finalpeaks))))
        f.write(str((tf, len(finalpeaks))) + '\n')
    mergedpeak = pd.concat(list(mergedpeaksdict.values())).reset_index(drop=True)
    if doPlot:
        df = pd.DataFrame({
            'proteins': list(ratiosofunique.keys()),
            'percentage of unique': list(ratiosofunique.values())
        })
        fig = sns.barplot(x='proteins', y='percentage of unique', data=df)
        plt.xticks(rotation=60, ha='right')
        plt.title("ratios of unique in replicates across experiments")
        if saveloc:
            plt.savefig(saveloc + "All_ratios_unique.pdf")
        plt.show()
    f.close()
    mergedpeak['name'] = mergedpeak.tf
    return mergedpeak, tomergebam, remove, ratiosofunique
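
A hedged call sketch for mergeReplicatePeaks, assuming the bed-like layout described in the docstring; the coordinates, sample names and folders are illustrative only, and the module-level helpers it relies on (simpleMergePeaks, findBestPeak, findAdditionalPeaks, plot, h) must be available:

import pandas as pd

# two replicates of a hypothetical MYC ChIP; 'name' is the sample id,
# 'replicate' its replicate number, 'tf' the ChIPed factor
peaks = pd.DataFrame({
    'chrom': ['chr1', 'chr1', 'chr1'],
    'start': [100, 150, 5000],
    'end': [400, 420, 5300],
    'name': ['MYC-rep1', 'MYC-rep1', 'MYC-rep2'],
    'replicate': [1, 1, 2],
    'tf': ['MYC', 'MYC', 'MYC'],
})
mergedpeaks, tomergebam, removed, ratios = mergeReplicatePeaks(
    peaks,
    bigwigfolder='bigwigs/',  # hypothetical folder of coverage tracks
    markedasbad=[],           # no replicate flagged as bad here
    saveloc='results/myc/')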
Example #3
def load(
        samplesetname,
        workspaces,
        sources,
        maxage,
        baits,
        stype,
        toupdate={
            'primary_disease': ['Primary Disease'],
            'sex': ['CCLF Gender'],
            'primary_site': ['Sample Collection Site'],
            'subtype': ['lineage_subtype'],
            'subsubtype': ['lineage_sub_subtype'],
            'origin': ['lineage'],
            'source': ['Program'],
            'parent_cell_line': ["Parental ID"],
            'comments': ['Comments'],
            'mediatype': ['Culture Medium', 'Culture Type'],
            'stripped_cell_line_name': ['Stripped Cell Line Name'],
            "cellosaurus_id": ["RRID"],
            "age": ["CCLF Gender"]
        },
        pv_index="DepMap_ID",
        master_index="arxspan_id",
        my_id='~/.client_secret.json',
        creds='../.credentials.json',
        mystorage_id="~/.storage.json",
        refsheet_url="https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY",
        depmappvlink="https://docs.google.com/spreadsheets/d/1uqCOos-T9EMQU7y2ZUw4Nm84opU5fIT1y7jet1vnScE",
        extract_to_change={'from_arxspan_id': 'participant'},
        # version 102
        match=['ACH-', 'CDS-'],
        pv_tokeep=['Culture Type', 'Culture Medium'],
        masterfilename="ACH",
        nanslist=['None', 'nan', 'Unknown'],
        depmap_taiga="arxspan-cell-line-export-f808",
        toraise=["ACH-001195"],
        participantslicepos=10,
        accept_unknowntypes=True,
        recomputehash=True):
    """Loads new cell line samples from a set of Terra workspaces, annotates them
    from the sample tracker and DepMap master sheets, and asks for manual review
    of anything that cannot be resolved automatically."""

    release = samplesetname
    sheets = Sheets.from_files(my_id, mystorage_id)
    ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

    ## Adding new data

    # we will be missing "primary_disease", "sm_id", "cellosaurus_id", "sex",
    # "age", "primary_site", "subtype", "subsubtype", "origin", "comments"
    samples, _, noarxspan = GetNewCellLinesFromWorkspaces(
        stype=stype,
        maxage=maxage,
        refurl=refsheet_url,
        wmfroms=workspaces,
        sources=sources,
        match=match,
        participantslicepos=participantslicepos,
        accept_unknowntypes=accept_unknowntypes,
        extract=extract_to_change,
        recomputehash=recomputehash)

    ### recovering missing arxspan ids from the cell line names
    noarxspan = tracker.retrieveFromCellLineName(noarxspan,
                                                 ccle_refsamples,
                                                 datatype=stype,
                                                 depmappvlink=depmappvlink,
                                                 extract=extract_to_change)

    # merge back the samples with a recovered arxspan id and assess any potential issues
    samples = pd.concat([samples, noarxspan[noarxspan.arxspan_id != '0']],
                        sort=False)
    noarxspan = noarxspan[noarxspan.arxspan_id == '0']

    extract_defaults.update(extract_to_change)
    samples = assessAllSamples(samples,
                               ccle_refsamples,
                               stype=stype,
                               rename={},
                               extract=extract_defaults)

    if len(noarxspan) > 0:
        print("we found " + str(len(noarxspan)) +
              " samples without arxspan_ids!!")
        noarxspan = noarxspan.sort_values(by='stripped_cell_line_name')
        dfToSheet(noarxspan, "depmap samples not found", creds)
        noarxspan.to_csv('temp/noarxspan_' + stype + '_' + release + '.csv')
        if h.askif(
                "Please review the samples (on 'depmap samples not found') and write yes once \
      finished, else write no to quit and they will not be added"):
            updated_samples = sheets.get(
                "https://docs.google.com/spreadsheets/d/1yC3brpov3JELvzNoQe3eh0W196tfXzvpa0jUezMAxIg"
            ).sheets[0].to_frame().set_index('sample_id')
            samples = pd.concat([samples, updated_samples], sort=False)

    samples, notfound = tracker.updateFromTracker(samples, ccle_refsamples)

    for val in toraise:
        if val in samples['arxspan_id'].tolist():
            raise ValueError('sample ' + val +
                             ' is among the known wrong samples')

    samples['baits'] = baits
    if len(samples.loc[notfound]) > 0:
        print("we found some samples where we could not get annotations. \
      trying to infer it from depmap master sheet and arxspan export")
        samples, unk = completeFromMasterSheet(samples,
                                               notfound,
                                               toupdate=toupdate,
                                               my_id=my_id,
                                               pv_index=pv_index,
                                               master_index=master_index,
                                               mystorage_id=mystorage_id,
                                               pv_tokeep=pv_tokeep,
                                               masterfilename=masterfilename,
                                               nanslist=nanslist,
                                               depmap_pv=depmappvlink,
                                               depmap_taiga=depmap_taiga)
        if len(unk) > 0:
            print("some samples could still not be inferred")
            dfToSheet(samples.loc[notfound], "depmap samples not found", creds)
            samples.loc[notfound].to_csv('temp/notfound_' + stype + '_' +
                                         release + '.csv')
            if h.askif(
                    "Please review the samples (on 'depmap samples not found') and write yes once \
        finished, else write no to quit and they will not be added"):
                updated_samples = sheets.get(
                    "https://docs.google.com/spreadsheets/d/1yC3brpov3JELvzNoQe3eh0W196tfXzvpa0jUezMAxIg"
                ).sheets[0].to_frame().set_index('sample_id')
                samples.loc[updated_samples.index,
                            updated_samples.columns] = updated_samples.values

    dfToSheet(samples, 'depmap ALL samples found', secret=creds)
    samples.to_csv('temp/new_' + stype + '_' + release + '.csv')
    return samples
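
A hedged call sketch for load; every value below (release name, workspace, sources mapping, bait set, date) is illustrative, the argument shapes are guesses from how they are forwarded to GetNewCellLinesFromWorkspaces, and the Google Sheets credentials must exist at the default paths in the signature:

newsamples = load(samplesetname='21Q4',
                  workspaces=['broad-firecloud-ccle/CCLF_WES'],
                  sources={'CCLF': 'cclf'},
                  maxage='2021-01-01',
                  baits='AGILENT_SURESELECT',
                  stype='wes')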