Пример #1
0
def makeProfiles(matx=[],
                 folder='',
                 matnames=[],
                 title='',
                 name='temp/peaksat.pdf',
                 refpoint="TSS",
                 scale=None,
                 sort=False,
                 withDeeptools=True,
                 cluster=1,
                 vmax=None,
                 vmin=None,
                 overlap=False,
                 legendLoc=None):
    """
    Relabel two deepTools matrices, stack them and draw a profile plot.

    Builds one shell pipeline (computeMatrixOperations relabel x2,
    computeMatrixOperations rbind, plotProfile), runs it and prints the
    resulting `subprocess.CompletedProcess`. Nothing is done when
    `withDeeptools` is False.

    Args:
    -----
      matx: list of exactly two matrix file paths; each one is relabelled
        IN PLACE (it is both the input and the output of `relabel`)
      matnames: list of exactly two group labels, one per matrix
      title: str plot title (omitted from the command when empty)
      name: str output plot path; the stacked matrix is written next to it,
        with the last extension replaced by ".gz"
      refpoint: str label passed as --refPointLabel
      withDeeptools: bool, run the pipeline only when True
      cluster: int, when > 1 adds "--perGroup --kmeans <cluster>"
      vmax, vmin: optional values passed as -max / -min
      legendLoc: optional value passed as --legendLocation
      folder, scale, sort, overlap: accepted for signature compatibility,
        not used by this function

    Raises:
    ------
      ValueError: if `matx` and `matnames` do not both hold two entries
    """
    # Local import: keeps this block self-contained whatever the file imports.
    import shlex

    if not withDeeptools:
        return
    if not (len(matnames) == 2 and len(matx) == 2):
        raise ValueError('you need two mat.gz files and two names')
    h.createFoldersFor(name)

    # Every user-supplied value is shell-quoted: the command runs with
    # shell=True, so a space or metacharacter in a path, label or title would
    # otherwise split or alter the command. Values made only of safe
    # characters are returned unchanged by shlex.quote.
    quote = shlex.quote
    first, second = quote(matx[0]), quote(matx[1])
    stacked = quote('.'.join(name.split('.')[:-1]) + ".gz")

    relabel = 'computeMatrixOperations relabel -m {0} -o {0} --groupLabels {1}'
    steps = [
        relabel.format(first, quote(matnames[0])),
        relabel.format(second, quote(matnames[1])),
        'computeMatrixOperations rbind -m ' + first + ' ' + second + ' -o ' +
        stacked,
    ]

    plotcmd = 'plotProfile'
    plotcmd += " --matrixFile " + stacked
    plotcmd += " --outFileName " + quote(name)
    plotcmd += " --refPointLabel " + quote(refpoint)
    # NOTE(review): confirm plotProfile accepts -max/-min; its documented
    # y-axis options appear to be --yMax/--yMin. Flags left as they were.
    if vmax is not None:
        plotcmd += " -max " + quote(str(vmax))
    if vmin is not None:
        plotcmd += " -min " + quote(str(vmin))
    if cluster > 1:
        plotcmd += " --perGroup --kmeans " + quote(str(cluster))
    if legendLoc:
        plotcmd += " --legendLocation " + quote(legendLoc)
    if title:
        plotcmd += " --plotTitle " + quote(title)
    steps.append(plotcmd)

    # '&&' stops the pipeline at the first failing step.
    cmd = ' && '.join(steps)
    data = subprocess.run(cmd, shell=True, capture_output=True)
    print(data)
Пример #2
0
def saveConfigs(workspace, filepath):
  """
  Export the configurations of a workspace to `<filepath>.csv` and
  `<filepath>.json`.

  The csv holds the table returned by the workspace manager's
  `get_configs()`. The json holds the workspace metadata under the key
  'GENERAL' plus one entry per row of that table (keyed by the row index)
  with the configuration fetched by the row's 'name'.

  Args:
  -----
    workspace: value handed to dm.WorkspaceManager (the original notes
      describe it as a "namespace/workspace" string, typically from the url)
    filepath: str path prefix of the two output files (no extension)
  """
  manager = dm.WorkspaceManager(workspace)
  h.createFoldersFor(filepath)

  configs = manager.get_configs()
  configs.to_csv(filepath + '.csv')

  # 'GENERAL' comes first, then one entry per configuration row.
  params = {'GENERAL': manager.get_workspace_metadata()}
  params.update({
      idx: manager.get_config(row['name'])
      for idx, row in configs.iterrows()
  })
  h.dictToFile(params, filepath + '.json')
Пример #3
0
def saveWorkspace(workspace, folderpath):
    """
    Dump the workflows and configurations of a workspace into a folder.

    For every row of the workspace's configuration table this writes:
      - `<name>.wdl`: the WDL fetched with dm.get_wdl (the file is created
        first and stays empty when the method cannot be found)
      - `inputs_<name>.json`, `conf_<name>.json`, `outputs_<name>.json`
    plus `worflow_list.csv` (the configuration table) and
    `all_configs.json` (workspace metadata under 'GENERAL' and every
    configuration, keyed by row index).

    Args:
    -----
      workspace: value handed to dm.WorkspaceManager (the original notes
        describe it as a "namespace/workspace" string, typically from the url)
      folderpath: str prefix prepended as-is to every file name, so it should
        end with a path separator
    """
    manager = dm.WorkspaceManager(workspace)
    h.createFoldersFor(folderpath)

    configs = manager.get_configs()
    for _, row in configs.iterrows():
        wdl_path = folderpath + row['name'] + ".wdl"
        with open(wdl_path, "w") as wdl_file:
            if row.sourceRepo != 'dockstore':
                # identifier is namespace/name/version
                fields = row[['methodNamespace', 'methodName',
                              'methodVersion']]
                name = '/'.join(fields.astype(str).tolist())
            else:
                # dockstore identifier: 3rd and 4th components of the method
                # path, followed by the version
                path_parts = row['methodPath'].split('/')[2:4]
                name = ("dockstore.org/" + '/'.join(path_parts) + '/' +
                        row['methodVersion'])
            try:
                wdl_file.write(dm.get_wdl(name))
            except MethodNotFound:
                print(name + " could not be found")

    configs.to_csv(folderpath + 'worflow_list.csv')

    params = {'GENERAL': manager.get_workspace_metadata()}
    for idx, row in configs.iterrows():
        config = manager.get_config(row['name'])
        params[idx] = config
        label = row['name']
        h.dictToFile(config['inputs'],
                     folderpath + "inputs_" + label + '.json')
        h.dictToFile(config, folderpath + "conf_" + label + '.json')
        h.dictToFile(config['outputs'],
                     folderpath + "outputs_" + label + '.json')
    h.dictToFile(params, folderpath + 'all_configs.json')
Пример #4
0
def mergeReplicatePeaks(peaks,
                        bigwigfolder,
                        markedasbad=None,
                        window=100,
                        sampling=3000,
                        mincov=4,
                        doPlot=True,
                        cov={},
                        minKL=8,
                        use='max',
                        MINOVERLAP=0.3,
                        lookeverywhere=True,
                        only='',
                        saveloc=''):
    """
    Merge the replicate peak sets of each TF into one peak set per TF.

    /!/ should only be passed peaks with at least one good replicate

    For each TF found in `peaks['tf']`:
      1. TFs with a single replicate are kept as they are (and listed in
         `remove` when that replicate is in `markedasbad`).
      2. Otherwise replicates are merged with `simpleMergePeaks` and ordered
         with `findBestPeak`; the first replicate of that order that is not
         marked as bad becomes the "main" replicate.
      3. Every other replicate is compared to the main one. A replicate whose
         overlap and own support are both below MINOVERLAP is skipped (or the
         TF is flagged and the comparison stops if it is the first compared
         replicate and is not marked as bad).
      4. `findAdditionalPeaks` is run on the replicate's bigwig to recover
         peaks that were not called, and likewise on the main replicate.
      5. Replicate pairs that overlap enough after recovery are appended to
         the list of bams advised for merging.
    Progress is printed and mirrored to `saveloc + 'results.txt'`.

    Side effects: sets `warnings.simplefilter("ignore")`, writes pdf plots
    under `saveloc`, writes 'additionalpeaksinsec_mp<replicate>.bed' files in
    the current directory, and may prompt the user through `h.askif`.

    NOTE(review): the log file is only closed on the normal return path.
    NOTE(review): the final filtering of the merged peaks (kept if present in
    the main replicate or supported by more than one replicate) only runs
    when `doPlot` is True; that coupling looks unintended, confirm.

    Args:
    -----
      peaks: df[bed-like] with at least the columns 'tf', 'replicate' and
        'name' (presumably the sample id; verify against caller)
      bigwigfolder: str folder holding the bigwig files; must end with a path
        separator since file names are appended to it as-is
      markedasbad: collection of replicate ids considered of bad quality.
        Compared to `name` for single-replicate TFs and to the part before
        the first '-' of the merged replicate column names. None means no
        replicate is marked as bad.
      window: forwarded to simpleMergePeaks and findAdditionalPeaks
      sampling, mincov, minKL, use: forwarded to findAdditionalPeaks
      doPlot: bool whether to draw the venn / pairplot / kde plots
      cov: unused, kept for signature compatibility
      MINOVERLAP: float minimal fraction of overlap required
      lookeverywhere: bool, if True look for uncalled peaks at every merged
        location missing from the replicate, else only at locations that are
        present in the other replicate
      only: str, when set only this TF is processed
      saveloc: str prefix of every saved file

    Returns:
    -------
      mergedpeak: df, concatenation of the per-TF peak sets ('name' is set to
        the TF)
      tomergebam: list of [main replicate, replicate] pairs advised to merge
      remove: list of TFs flagged as bad / wrong
      ratiosofunique: dict(tf: ratio of locations present in one replicate)
    """
    # Bug fix: the default used to be used directly with `in`, raising
    # TypeError when no replicate was marked as bad.
    if markedasbad is None:
        markedasbad = []

    # Plot helpers, currently only referenced from commented-out code below.
    def col_nan_scatter(x, y, **kwargs):
        df = pd.DataFrame({'x': x[:], 'y': y[:]})
        df = df[df.sum(0) != 0]
        x = df['x']
        y = df['y']
        plt.gca()
        plt.scatter(x, y)

    def col_nan_kde_histo(x, **kwargs):
        df = pd.DataFrame({'x': x[:]})
        df = df[df['x'] != 0]
        x = df['x']
        plt.gca()
        sns.kdeplot(x)

    print("/!/ should only be passed peaks with at least one good replicate")
    # for a df containing a set of peaks in bed format and an additional column of different TF
    tfs = list(set(peaks['tf']))
    totpeaknumber = 0
    mergedpeaksdict = {}
    remove = []
    tomergebam = []
    ratiosofunique = {}
    h.createFoldersFor(saveloc)
    f = open(saveloc + 'results.txt', 'w')
    warnings.simplefilter("ignore")
    for tf in tfs:
        if only and tf != only:
            continue
        cpeaks = peaks[peaks.tf == tf]
        print('_____________________________________________________')
        f.write('_____________________________________________________' + '\n')
        if len(set(cpeaks['replicate'])) == 1:
            if cpeaks.name.tolist()[0] in markedasbad:
                print('the only replicate is considered bad!')
                f.write('the only replicate is considered bad!' + "\n")
                print('wrong TF: ' + tf)
                f.write('wrong TF: ' + tf + "\n")
                mergedpeaksdict.update({tf: cpeaks})
                remove.append(tf)
                continue
            print("we only have one replicate for " + tf + " .. pass")
            f.write("we only have one replicate for " + tf + " .. pass" + "\n")
            mergedpeaksdict.update({tf: cpeaks})
            continue
        print("merging " + tf + " peaks")
        f.write("merging " + tf + " peaks" + "\n")
        merged = simpleMergePeaks(cpeaks, window=window, maxp=False)
        # first 8 columns: merged peak description; remaining columns: one
        # column per replicate
        merged_bed = merged[merged.columns[8:]]
        finalpeaks = merged[merged.columns[:8]]
        print('--> finish first overlaps lookup')
        f.write('--> finish first overlaps lookup' + "\n")
        # flag when  biggest is <1000 peaks
        if len(finalpeaks) < 1000:
            print('!TF has less than 1000 PEAKS!')
            f.write('!TF has less than 1000 PEAKS!' + "\n")
        # for each TF (replicates), compute number of peaks
        peakmatrix = merged_bed.values.astype(bool)

        # one set of merged-peak row numbers per replicate
        presence = []
        for peakpres in peakmatrix.T:  # https://github.com/tctianchi/pyvenn
            presence.append(
                set([i for i, val in enumerate(peakpres) if val == 1]))
        # compute overlap matrix (venn?)
        if peakmatrix.shape[1] < 7 and doPlot:
            plot.venn(presence, [
                i + '_BAD' if i.split('-')[0] in markedasbad else i
                for i in merged_bed.columns
            ],
                      title=tf + "_before_venn",
                      folder=saveloc)
            plt.show()
        else:
            print('too many replicates for Venn: ' + str(peakmatrix.shape[1]))
            f.write('too many replicates for Venn: ' +
                    str(peakmatrix.shape[1]) + "\n")
        if doPlot:
            fig = sns.pairplot(merged_bed,
                               corner=True,
                               diag_kind="kde",
                               kind="reg",
                               plot_kws={"scatter_kws": {
                                   "alpha": .05
                               }})
            #fig = fig.map_upper(col_nan_scatter)
            #fig = fig.map_upper(col_nan_kde_histo)
            plt.suptitle("correlation of peaks in each replicate", y=1.08)
            if saveloc:
                fig.savefig(saveloc + tf + "_before_pairplot.pdf")
            plt.show()
            for i, val in enumerate(merged_bed):
                # peaks present in this replicate and in no other one
                unique_inval = np.logical_and(
                    np.delete(peakmatrix, i, axis=1).sum(1).astype(bool) == 0,
                    peakmatrix[:, i])
                sns.kdeplot(merged_bed[val][unique_inval],
                            legend=True).set(xlim=(0, None))
            plt.title("distribution of unique peaks in each replicate")
            if saveloc:
                plt.savefig(saveloc + tf + "_before_unique_kdeplot.pdf")
            plt.show()

        # result unused; the call still fails early if the folder is missing
        bigwigs = os.listdir(bigwigfolder)

        # the main replicate is the first one, in findBestPeak order, that is
        # not marked as bad
        foundgood = False
        sort = findBestPeak(presence)
        for ib, sb in enumerate(sort):
            if merged_bed.columns[sb].split('-')[0] not in markedasbad:
                foundgood = True
                break
        if not foundgood:
            print('no peaks were good enough quality')
            f.write('no peaks were good enough quality' + "\n")
            print('bad TF: ' + tf)
            f.write('bad TF: ' + tf + "\n")
            remove.append(tf)
            ib = 0
        # distplot
        # correlation plot

        biggest_ind = sort[ib]
        # from here on peakmatrix is replicates x peaks
        peakmatrix = peakmatrix.T
        biggest = merged_bed.columns[biggest_ind]
        print('-> main rep is: ' + str(biggest))
        f.write('-> main rep is: ' + str(biggest) + '\n')
        # per-peak count of (good) replicates supporting it
        tot = peakmatrix[biggest_ind].copy().astype(int)
        # starts with highest similarity and go descending
        j = 0
        recovered = 0
        additionalpeaksinbig = np.array([])
        for i, val in enumerate(sort):
            if i == ib:
                continue
            j += 1
            # if avg non overlap > 60%, and first, and none small flag TF as unreliable.
            overlap = len(presence[biggest_ind] & presence[val]) / len(
                presence[biggest_ind])
            peakname = merged_bed.columns[val]
            print('- ' + peakname)
            f.write('- ' + peakname + '\n')
            print('  overlap: ' + str(overlap * 100) + "%")
            f.write('  overlap: ' + str(overlap * 100) + "%" + '\n')
            if overlap < MINOVERLAP:
                smallsupport = len(presence[biggest_ind]
                                   & presence[val]) / len(presence[val])
                print(' --> not enough overlap')
                f.write(' --> not enough overlap' + '\n')
                if smallsupport < MINOVERLAP:
                    # if the secondary does not have itself the required support
                    if j == 1 and merged_bed.columns[val].split(
                            '-')[0] not in markedasbad:
                        print("  Wrong TF: " + tf)
                        f.write("  Wrong TF: " + tf + '\n')
                        remove.append(tf)
                        break
                    # if not first, throw the other replicate and continue
                    print("  not using this replicate from the peakmatrix")
                    f.write("  not using this replicate from the peakmatrix" +
                            '\n')
                    continue
            if lookeverywhere:
                tolookfor = peakmatrix[val] == 0
            else:
                tolookfor = np.logical_and(peakmatrix[biggest_ind],
                                           peakmatrix[val] == 0)
            # ones that we have in the Primary but not in the secondary
            additionalpeaksinsec = findAdditionalPeaks(
                finalpeaks,
                tolookfor,
                bigwigfolder + findpeakpath(bigwigfolder, peakname),
                sampling=sampling,
                mincov=mincov,
                window=window,
                minKL=minKL,
                use=use)
            if len(additionalpeaksinsec[additionalpeaksinsec > 0]) > 0:
                sns.kdeplot(additionalpeaksinsec[additionalpeaksinsec > 0],
                            label=peakname,
                            legend=True).set(xlim=(0, None))
                print('  min,max from newly found peaks: ' + str(
                    (additionalpeaksinsec[additionalpeaksinsec > 0].min(),
                     additionalpeaksinsec[additionalpeaksinsec > 0].max())))
                f.write('  min,max from newly found peaks: ' + str(
                    (additionalpeaksinsec[additionalpeaksinsec > 0].min(),
                     additionalpeaksinsec[additionalpeaksinsec > 0].max())) +
                        '\n')
            # for testing purposes mainly
            finalpeaks[additionalpeaksinsec.astype(bool)].to_csv(
                'additionalpeaksinsec_mp' + merged_bed.columns[val] + '.bed',
                sep='\t',
                index=None,
                header=False)
            peakmatrix[val] = np.logical_or(peakmatrix[val],
                                            additionalpeaksinsec.astype(bool))
            overlap = np.sum(
                np.logical_and(peakmatrix[val],
                               peakmatrix[biggest_ind])) / np.sum(
                                   peakmatrix[biggest_ind])
            if overlap < MINOVERLAP:
                newsmalloverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[val])
                print("  we did not had enough initial overlap.")
                f.write("  we did not had enough initial overlap." + '\n')
                if newsmalloverlap < MINOVERLAP:
                    # Bug fix: the prompt used to read `smalloverlap`, which
                    # is only assigned further down (NameError on first use,
                    # stale value from another replicate afterwards). Report
                    # the share of this replicate's originally called peaks
                    # that are also in the main replicate instead.
                    initialsmalloverlap = (
                        len(presence[biggest_ind] & presence[val]) /
                        len(presence[val])) if presence[val] else 0.0
                    if merged_bed.columns[val].split('-')[0] in markedasbad:
                        print('  replicate ' + merged_bed.columns[val] +
                              ' was too bad and had not enough overlap')
                        f.write('  replicate ' + merged_bed.columns[val] +
                                ' was too bad and had not enough overlap' +
                                '\n')
                        continue
                    elif h.askif(
                            "we have two good quality peaks that don't merge well at all: "
                            + merged_bed.columns[val] + " and " +
                            merged_bed.columns[biggest_ind] +
                            " can the first one be removed?:\n  \
                            overlap: " + str(overlap * 100) +
                            '%\n  smalloverlap: ' +
                            str(initialsmalloverlap * 100) +
                            '%\n  new smalloverlap: ' +
                            str(newsmalloverlap * 100) + "%"):
                        continue
                    else:
                        print("  enough from small overlaps")
                        f.write("  enough from small overlaps" + '\n')
            print(' --> enough overlap')
            f.write(' --> enough overlap' + '\n')
            recovered += np.sum(additionalpeaksinsec.astype(bool))
            if merged_bed.columns[val].split('-')[0] not in markedasbad:
                tot += peakmatrix[val].astype(int)
            # ones that we have in the Primary but not in the secondary
            if not lookeverywhere or len(additionalpeaksinbig) == 0:
                tolookfor = peakmatrix[
                    biggest_ind] == 0 if lookeverywhere else np.logical_and(
                        peakmatrix[biggest_ind] == 0, peakmatrix[val])
                additionalpeaksinbig = findAdditionalPeaks(
                    finalpeaks,
                    tolookfor,
                    bigwigfolder + findpeakpath(bigwigfolder, biggest),
                    sampling=sampling,
                    mincov=mincov,
                    window=window,
                    minKL=minKL,
                    use=use)
                if len(additionalpeaksinbig[additionalpeaksinbig > 0]) > 0:
                    sns.kdeplot(additionalpeaksinbig[additionalpeaksinbig > 0],
                                label=biggest,
                                legend=True).set(xlim=(0, None))
                    print('  min,max from newly found peaks: ' + str((
                        additionalpeaksinbig[additionalpeaksinbig > 0].min(),
                        additionalpeaksinbig[additionalpeaksinbig > 0].max())))
                    f.write('  min,max from newly found peaks: ' + str(
                        (additionalpeaksinbig[additionalpeaksinbig > 0].min(),
                         additionalpeaksinbig[additionalpeaksinbig > 0].max()
                         )) + '\n')

                peakmatrix[biggest_ind] = np.logical_or(
                    peakmatrix[biggest_ind], additionalpeaksinbig)
                tot += additionalpeaksinbig.astype(bool).astype(int)
                recovered += np.sum(additionalpeaksinbig.astype(bool))
            print('  we have recovered ' + str(recovered) +
                  ' peaks, equal to ' +
                  str(100 * recovered / np.sum(peakmatrix[biggest_ind])) +
                  '% of the peaks in main replicate')
            f.write('  we have recovered ' + str(recovered) +
                    ' peaks, equal to ' +
                    str(100 * recovered / np.sum(peakmatrix[biggest_ind])) +
                    '% of the peaks in main replicate' + '\n')
            if overlap < (MINOVERLAP + 0.2) / 1.2:
                # we recompute to see if the overlap changed
                newoverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[biggest_ind])
                smalloverlap = np.sum(
                    np.logical_and(peakmatrix[val],
                                   peakmatrix[biggest_ind])) / np.sum(
                                       peakmatrix[val])
                if newoverlap < (MINOVERLAP + 0.2) / 1.2:
                    if smalloverlap < (2 + MINOVERLAP) / 3:
                        print(
                            "  not enough overlap to advice to merge the bams.\n  oldnew overlap: "
                            + str(overlap * 100) + '%\n  \
                            new overlap: ' + str(newoverlap * 100) + "%")
                        f.write(
                            "  not enough overlap to advice to merge the bams.\n  oldnew overlap: "
                            + str(overlap * 100) + '%\n  \
                            new overlap: ' + str(newoverlap * 100) + "%" +
                            '\n')
                        continue
                    else:
                        print(
                            '  enough from small overlap to advice to merge the peaks'
                        )
                        f.write(
                            '  enough from small overlap to advice to merge the peaks'
                            + '\n')
            tomergebam.append([biggest, peakname])
            #the quality is good enough in the end we can pop from the list if it exists
            if tf in remove:
                remove.remove(tf)
        plt.title('distribution of new found peaks')
        if saveloc:
            plt.savefig(saveloc + tf + "_new_found_peaks_kdeplot.pdf")
        plt.show()
        # new distplot
        # new correlation plot
        ratiosofunique[tf] = len(
            np.argwhere(peakmatrix.sum(0) == 1)) / peakmatrix.shape[1]
        if doPlot:
            sns.pairplot(merged_bed,
                         corner=True,
                         diag_kind="kde",
                         kind="reg",
                         plot_kws={"scatter_kws": {
                             "alpha": .05
                         }})
            #fig = fig.map_upper(col_nan_scatter)
            #fig = fig.map_upper(col_nan_kde_histo)
            plt.suptitle(
                "correlation and distribution of peaks after recovery", y=1.08)
            if saveloc:
                plt.savefig(saveloc + tf + "_after_pairplot.pdf")
            plt.show()
            for i, val in enumerate(merged_bed):
                unique_inval = np.logical_and(
                    np.delete(peakmatrix, i, axis=0).sum(0).astype(bool) == 0,
                    peakmatrix[i])
                sns.kdeplot(merged_bed[val][unique_inval],
                            legend=True).set(xlim=(0, None))
            plt.title(
                "distribution of unique peaks in each replicate after recovery"
            )
            if saveloc:
                plt.savefig(saveloc + tf + "_after_unique_kdeplot.pdf")
            plt.show()
        if len(peakmatrix.shape) > 1 and doPlot:
            if peakmatrix.shape[0] < 7:
                presence = []
                for peakpres in peakmatrix:  # https://github.com/tctianchi/pyvenn
                    presence.append(
                        set([i for i, val in enumerate(peakpres) if val == 1]))
                title = tf + '_recovered (TOREMOVE)' if tf in remove else tf + '_recovered'
                plot.venn(presence, [
                    i + '_BAD' if i.split('-')[0] in markedasbad else i
                    for i in merged_bed.columns
                ],
                          title=title,
                          folder=saveloc)
                plt.show()
            else:
                print('too many replicates for Venn')
                f.write('(too many replicates for Venn)' + '\n')
            # keep peaks of the main replicate or supported by >1 replicate
            # NOTE(review): only reached when doPlot is True, see docstring
            finalpeaks = finalpeaks[np.logical_or(tot > 1,
                                                  peakmatrix[biggest_ind])]
        finalpeaks['name'] = biggest
        finalpeaks['tf'] = tf
        mergedpeaksdict.update({tf: finalpeaks})
        print(str((tf, len(finalpeaks))))
        f.write(str((tf, len(finalpeaks))) + '\n')
    mergedpeak = pd.concat([peaks for _, peaks in mergedpeaksdict.items()
                            ]).reset_index(drop=True)
    if doPlot:
        df = pd.DataFrame(data=ratiosofunique, index=['percentage of unique'])
        df['proteins'] = df.index
        fig = sns.barplot(data=df)
        plt.xticks(rotation=60, ha='right')
        plt.title("ratios of unique in replicates across experiments")
        if saveloc:
            plt.savefig(saveloc + "All_ratios_unique.pdf")
        plt.show()
    f.close()
    mergedpeak['name'] = mergedpeak.tf
    return mergedpeak, tomergebam, remove, ratiosofunique
Пример #5
0
def filterRNAfromQC(rnaqc, folder='tempRNAQCplot/', plot=True, qant1=0.07, qant3=0.93, thresholds={},
    num_cols = 10, figsize=(10, 0.2)):
  """
  Flag the samples of an RNA QC table that fail at least one QC threshold.

  Each sample is tested against the thresholds below; samples failing at
  least one test are printed and returned with one boolean column per test.
  When some samples fail, the result is saved to `folder + '_qc_results.csv'`
  and, if `plot` is set, a heatmap of the failures (`failed_qc.pdf`) and one
  violin plot per metric (`qc_metrics.pdf`) are drawn and saved; outlier
  samples are labelled on the violins, in red when they failed QC.

  Args:
  -----
    rnaqc: df with QC metrics as rows and samples as columns
    folder: str prefix of the saved files
    plot: bool whether to draw and save the plots
    qant1, qant3: float lower / upper quantiles used to compute the outlier
      limits (q - / + 1.5 * (q3 - q1)) of the violin plots
    thresholds: dict overriding any of the default thresholds
    num_cols: int number of violin plots per row
    figsize: (width, height per failing sample) of the heatmap

  Returns:
  -------
    df of booleans, failing samples x tests, True where the test failed
  """
  thresh = {'minmapping': 0.8,  # Mapping Rate
            'minendmapping': 0.75,
            'minefficiency': 0.6,  # Expression Profiling Efficiency
            'maxendmismatch': 0.025,  # Base Mismatch end wise
            'maxmismatch': 0.02,  # Base Mismatch
            'minhighqual': 0.6,  # High Quality Rate
            'minexon': 0.6,  # Exonic Rate
            "maxambiguous": 0.2,  # Ambiguous Alignment Rate
            "maxsplits": 0.1,  # Avg. Splits per Read
            "maxalt": 0.65,  # Alternative Alignments rate
            "maxchim": 0.3,  # Chimeric Alignment Rate
            "minreads": 20000000,
            "minlength": 80,  # Read Length
            "maxgenes": 35000,
            "mingenes": 10000,
            }
  thresh.update(thresholds)

  # samples as rows, metrics as columns
  qcs = rnaqc.T

  # (result column, mask of the samples failing that test), in output order.
  # Each mask is computed once and used both to select the failing samples
  # and to fill the result table.
  checks = [
      ("Mapping Rate", qcs["Mapping Rate"] < thresh['minmapping']),
      ("Base Mismatch", qcs["Base Mismatch"] > thresh['maxmismatch']),
      ("End 1 Mapping Rate",
       qcs["End 1 Mapping Rate"] < thresh['minendmapping']),
      ("End 2 Mapping Rate",
       qcs["End 2 Mapping Rate"] < thresh['minendmapping']),
      ("End 1 Mismatch Rate",
       qcs["End 1 Mismatch Rate"] > thresh['maxendmismatch']),
      ("End 2 Mismatch Rate",
       qcs["End 2 Mismatch Rate"] > thresh['maxendmismatch']),
      ("Expression Profiling Efficiency",
       qcs["Expression Profiling Efficiency"] < thresh['minefficiency']),
      ("High Quality Rate", qcs["High Quality Rate"] < thresh['minhighqual']),
      ("Exonic Rate", qcs["Exonic Rate"] < thresh['minexon']),
      # column label kept as-is for compatibility, although the metric tested
      # is "Ambiguous Alignment Rate"
      ("Ambiguous Alignment Efficiency",
       qcs["Ambiguous Alignment Rate"] > thresh['maxambiguous']),
      # NOTE(review): a threshold named "max" compared with '<' looks
      # inverted; kept as found, confirm the intended direction
      ("Avg. Splits per Read",
       qcs["Avg. Splits per Read"] < thresh['maxsplits']),
      ("Alternative Alignments",
       qcs["Alternative Alignments"] > thresh['maxalt']*qcs["Total Reads"]),
      ("Chimeric Alignment Rate",
       qcs["Chimeric Alignment Rate"] > thresh['maxchim']),
      ("Total Reads", qcs["Total Reads"] < thresh['minreads']),
      ("Read Length", qcs["Read Length"] < thresh['minlength']),
      # Bug fix: these two columns used to hold each other's test.
      ("Min Genes Detected", qcs["Genes Detected"] < thresh['mingenes']),
      ("Max Genes Detected", thresh['maxgenes'] < qcs["Genes Detected"]),
  ]
  labels = [label for label, _ in checks]
  # samples x tests boolean matrix
  failures = np.column_stack(
      [np.asarray(mask, dtype=bool) for _, mask in checks])
  failed = failures.any(axis=1)
  a = qcs.index[failed].tolist()

  res = pd.DataFrame(index=a, columns=labels, data=failures[failed])

  print(a)
  if len(res) > 0:
    h.createFoldersFor(folder)
    res.to_csv(folder+'_qc_results.csv')
    if plot:
      # opens the figure the heatmap is drawn on
      plt.subplots(figsize=(figsize[0], math.ceil(len(res)*figsize[1])))
      heatmap = sns.heatmap(res, xticklabels=True, yticklabels=True, cbar=False)
      plt.yticks(rotation = 0)
      # save before show: a shown figure may be empty afterwards
      heatmap.get_figure().savefig(folder+'failed_qc.pdf')
      plt.show()

      num_rows = math.ceil(len(rnaqc)/num_cols)
      # squeeze=False: always a 2D array of axes, even for a 1x1 grid
      _, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows*2),
                             squeeze=False)
      failedset = set(a)
      for ax, val in zip(axes.flatten(), rnaqc.index):
        qc = rnaqc.loc[val]
        sns.violinplot(y=qc, ax=ax)
        q1 = qc.quantile(qant1)
        q3 = qc.quantile(qant3)
        outlier_top_lim = q3 + 1.5 * (q3 - q1)
        outlier_bottom_lim = q1 - 1.5 * (q3 - q1)
        outliers = qc[(qc < outlier_bottom_lim) | (qc > outlier_top_lim)]
        # .items(): Series.iteritems() was removed in pandas 2.0
        for k, v in outliers.items():
          ax.text(0.05, v, k, ha='left', va='center',
                  color='red' if k in failedset else 'black')
      plt.tight_layout()
      # save before show, otherwise an empty figure may be written
      plt.savefig('{}/qc_metrics.pdf'.format(folder), bbox_inches='tight')
      plt.show()
  return res
Пример #6
0
def getPeaksAt(peaks,
               bigwigs,
               folder='',
               bigwignames=[],
               peaknames=[],
               window=1000,
               title='',
               numpeaks=4000,
               numthreads=8,
               width=5,
               length=10,
               torecompute=False,
               name='temp/peaksat.pdf',
               refpoint="TSS",
               scale=None,
               sort=False,
               withDeeptools=True,
               onlyProfile=False,
               cluster=1,
               vmax=None,
               vmin=None,
               overlap=False,
               legendLoc=None):
    """
    Plot bigwig signal in a window around a set of peaks.

    Two backends are available:

    - ``withDeeptools=True``: builds a ``computeMatrix reference-point``
      command followed by ``plotHeatmap`` (or ``plotProfile`` when
      ``onlyProfile`` is set) and runs it through the shell. The completed
      process is printed and nothing is returned.
    - ``withDeeptools=False``: reads the signal with pyBigWig and draws one
      seaborn heatmap per bigwig.

    Args:
    -----
      peaks: deeptools mode: a DataFrame (written headerless to 'peaks.bed'),
        a list of region file paths (files with 20 lines or fewer are
        dropped), or a single path as str. pyBigWig mode: a DataFrame with
        'chrom', 'start', 'end' columns (optionally 'relative_summit_pos',
        and 'foldchange' when sort is set); its 'start'/'end' columns are
        overwritten in place with the plotting window.
      bigwigs: str or list[str] bigwig file name(s), each prefixed by folder
      folder: str prefix prepended to every bigwig name
      bigwignames: list[str] sample labels (deeptools mode, used when bigwigs
        is a list)
      peaknames: list[str] region labels (deeptools mode); the labels of
        dropped peak files are removed from a copy of this list
      window: int number of bases kept on each side of the reference point
      title: str plot title
      numpeaks: int max number of peaks displayed (pyBigWig mode)
      numthreads: int passed to --numberOfProcessors
      width: figure width (pyBigWig mode)
      length: figure height (pyBigWig mode)
      torecompute: bool recompute the matrix even if the .gz file exists
      name: str output file; the matrix is stored next to it with the
        extension replaced by '.gz'
      refpoint: str passed to --referencePoint and --refPointLabel
      scale: dict(bigwig name: factor) multiplied with each track's signal
        (pyBigWig mode); anything else means no scaling
      sort: bool sort peaks by decreasing 'foldchange' (pyBigWig mode)
      withDeeptools: bool selects the backend
      onlyProfile: bool use plotProfile instead of plotHeatmap
      cluster: int when > 1, adds '--perGroup --kmeans <cluster>'
      vmax: passed to -max when not None (deeptools mode)
      vmin: passed to -min when not None (deeptools mode)
      overlap: bool adds '--plotType overlapped_lines'; requires onlyProfile
      legendLoc: str passed to --legendLocation

    Returns:
    -------
      None in deeptools mode; in pyBigWig mode a tuple (cov, fig) where cov is
      a dict(bigwig name: array of shape (numpeaks, 2*window)) of the unscaled
      signal and fig is the matplotlib figure.

    Raises:
    ------
      ValueError: unsupported type for peaks or name, or overlap requested
        without onlyProfile.
    """
    if withDeeptools:
        if not isinstance(name, str):
            # A list of names can neither be turned into a matrix path nor
            # into a single --outFileName, so fail with a clear message.
            raise ValueError('name needs to be a single output file path')
        # Copy so that dropping labels below never mutates the caller's list
        # (or the shared default argument).
        peaknames = list(peaknames)
        if isinstance(peaks, pd.DataFrame):
            # Write the regions first, THEN swap the variable for the path.
            peaks.to_csv('peaks.bed', sep='\t', index=False, header=False)
            regions = 'peaks.bed '
        elif isinstance(peaks, list):
            regions = ''
            dropped = 0
            for n, path in enumerate(peaks):
                with open(path, 'rb') as regionfile:
                    numlines = sum(1 for _ in regionfile)
                if numlines > 20:
                    regions += path + ' '
                elif len(peaknames) > 0:
                    # keep labels aligned with the surviving region files
                    peaknames.pop(n - dropped)
                    dropped += 1
        elif isinstance(peaks, str):
            regions = peaks + ' '
        else:
            raise ValueError(' we dont know this filetype')

        # Remember the original shape: sample labels are only added when
        # several tracks were given.
        severalTracks = isinstance(bigwigs, list)
        if severalTracks:
            tracks = ''.join(folder + val + ' ' for val in bigwigs)
        else:
            tracks = folder + bigwigs + ' '

        h.createFoldersFor(name)
        matrix = '.'.join(name.split('.')[:-1]) + ".gz"
        cmd = ''
        if not os.path.exists(matrix) or torecompute:
            cmd += "computeMatrix reference-point -S "
            cmd += tracks
            cmd += " --referencePoint " + refpoint
            cmd += " --regionsFileName " + regions
            cmd += " --missingDataAsZero"
            cmd += " --outFileName " + matrix
            cmd += " --upstream " + str(window) + " --downstream " + str(
                window)
            cmd += " --numberOfProcessors " + str(numthreads) + ' && '
        cmd += "plotHeatmap" if not onlyProfile else 'plotProfile'
        cmd += " --matrixFile " + matrix
        cmd += " --outFileName " + name
        cmd += " --refPointLabel " + refpoint
        if vmax is not None:
            cmd += " -max " + str(vmax)
        if vmin is not None:
            cmd += " -min " + str(vmin)
        if cluster > 1:
            cmd += " --perGroup --kmeans " + str(cluster)
        if overlap:
            if not onlyProfile:
                raise ValueError("overlap only works when onlyProfile is set")
            cmd += " --plotType overlapped_lines"
        if legendLoc:
            cmd += " --legendLocation " + legendLoc
        if len(peaknames) > 0:
            cmd += " --regionsLabel" + ''.join(' ' + val for val in peaknames)
        if severalTracks and len(bigwignames) > 0:
            cmd += " --samplesLabel" + ''.join(
                ' "' + val + '"' for val in bigwignames)
        if title:
            cmd += " --plotTitle '" + title + "'"
        # shell=True is required because the two tools are chained with '&&'
        data = subprocess.run(cmd, shell=True, capture_output=True)
        print(data)
    else:
        if isinstance(bigwigs, str):
            bigwigs = [bigwigs]
        if 'relative_summit_pos' in peaks.columns:
            center = [
                int(val['start'] + val['relative_summit_pos'])
                for _, val in peaks.iterrows()
            ]
        else:
            center = [
                int((val['start'] + val['end']) / 2)
                for _, val in peaks.iterrows()
            ]
        # peaks may be a slice of another frame; silence the pandas warning
        pd.set_option('mode.chained_assignment', None)
        peaks['start'] = [c - window for c in center]
        peaks['end'] = [c + window for c in center]
        # squeeze=False keeps a 2D axes array even for a single bigwig
        fig, axes = plt.subplots(1,
                                 len(bigwigs),
                                 figsize=[width, length],
                                 squeeze=False)
        axes = axes[0]
        fig.suptitle(title if title else 'Chip Heatmap')
        if sort:
            peaks = peaks.sort_values(by=["foldchange"], ascending=False)
        numpeaks = min(numpeaks, len(peaks))
        cov = {}
        factors = {}
        maxs = []
        for bigwig in bigwigs:
            factor = scale.get(bigwig, 1) if isinstance(scale, dict) else 1
            co = np.zeros((numpeaks, window * 2), dtype=int)
            bw = pyBigWig.open(folder + bigwig)
            try:
                for i, (_, val) in enumerate(
                        peaks.iloc[:numpeaks].iterrows()):
                    try:
                        co[i] = np.nan_to_num(
                            bw.values(str(val.chrom), int(val.start),
                                      int(val.end)))
                    except RuntimeError:
                        # interval not available in this bigwig: report it
                        # and leave the row at zero
                        print(str(val.chrom), val.start, val.end)
            finally:
                bw.close()
            cov[bigwig] = co
            factors[bigwig] = factor
            # the shared colour ceiling is computed on what is plotted
            maxs.append((co * factor).max())
        for num, bigwig in enumerate(bigwigs):
            sns.heatmap(cov[bigwig] * factors[bigwig],
                        ax=axes[num],
                        vmax=max(maxs),
                        yticklabels=[],
                        cmap=cmaps[num],
                        cbar=True)
            axes[num].set_title(bigwig.split('.')[0])
        fig.subplots_adjust(wspace=0.1)
        fig.show()
        fig.savefig(name)
        return cov, fig
Пример #7
0
def runChromHMM(outdir,
                data,
                numstates=15,
                datatype='bed',
                folderPath=".",
                chromHMMFolderpath="~/ChromHMM/",
                assembly="hg38",
                control_bam_dir=None):
    """
    Run ChromHMM (binarization, then LearnModel) and load the dense state beds.

    Args:
    -----
      outdir: str an existing dir where the results should be saved
      data: df[cellname,markname,markbed|bam|bigwig, ?controlbed|bam|bigwig]
        written without header nor index to '<outdir>input_data.tsv'
      numstates: int number of states to learn
      datatype: str one of 'bed', 'bigwig', 'bam'; selects the Binarize* tool
      folderPath: str input directory given to the Binarize* tool
      chromHMMFolderpath: str folder containing ChromHMM.jar, CHROMSIZES/ and
        model_15_coreMarks.txt
      assembly: str genome assembly (e.g. hg38), used for the chromosome sizes
        file and for LearnModel
      control_bam_dir: str directory where the control would be stored (if not
        given in the df), passed with -c

    Returns:
    -------
      A dict(cellname: DataFrame) of bed-like dataframes with columns
      chrom, start, end, state, color read from the '*_dense.bed' outputs.

    Raises:
    ------
      ValueError: unknown datatype, or one of the two ChromHMM commands exits
        with a non-zero return code (the message is its stderr).
    """
    binarizers = {
        "bed": "BinarizeBed ",
        "bigwig": "BinarizeSignal ",
        "bam": "BinarizeBam ",
    }
    # Validate before touching the filesystem.
    if datatype not in binarizers:
        raise ValueError('you need to provide one of bam, bigwig, bed')
    # Paths are built by concatenation below: make sure both folders end with
    # a separator (a no-op when they already do).
    outdir = os.path.join(outdir, '')
    chromHMMFolderpath = os.path.join(chromHMMFolderpath, '')

    print("you need to have ChromHMM")
    chromHMM = "java -mx8000M -jar " + chromHMMFolderpath + "ChromHMM.jar "
    h.createFoldersFor(outdir + 'binarized/')
    data.to_csv(outdir + "input_data.tsv", sep='\t', index=False, header=False)

    # Step 1: binarize the input signal.
    cmd = chromHMM + binarizers[datatype]
    cmd += chromHMMFolderpath + "CHROMSIZES/" + assembly + ".txt " + folderPath + " " + outdir + "input_data.tsv " + outdir + "binarized"
    if control_bam_dir:
        cmd += " -c " + control_bam_dir
    res1 = subprocess.run(cmd, capture_output=True, shell=True)
    print(res1)
    if res1.returncode != 0:
        raise ValueError(res1.stderr.decode(errors='replace'))

    # Step 2: learn the model.
    cmd = chromHMM + "LearnModel -printposterior -noautoopen "
    if len(data) < 10:
        # With fewer than 10 input rows, start from the model file shipped in
        # the ChromHMM folder instead of the default initialisation.
        cmd += '-init load -m ' + chromHMMFolderpath + 'model_15_coreMarks.txt '
    cmd += outdir + "binarized " + outdir + " " + str(
        numstates) + " " + assembly
    res2 = subprocess.run(cmd, capture_output=True, shell=True)
    print(res2)
    if res2.returncode != 0:
        raise ValueError(res2.stderr.decode(errors='replace'))

    # Step 3: one dense bed per cell name (first column of data, by position
    # so that it works whatever the column labels are).
    ret = {}
    for cell in data.iloc[:, 0].unique():
        dense = pd.read_csv(
            outdir + str(cell) + '_' + str(numstates) + '_dense.bed',
            sep='\t',
            header=None,
            skiprows=1)
        ret[cell] = dense.drop(columns=[4, 5, 6, 7]).rename(columns={
            0: 'chrom',
            1: 'start',
            2: 'end',
            3: 'state',
            8: "color"
        })
    return ret