Exemplo n.º 1
0
def makePlots(truthFile, predFile, method, outBase, measure):
    """Draw the correlation, relative-difference and relative-error plots
    comparing one method's predictions against the simulated ground truth.

    Parameters
    ----------
    truthFile : path of the ground-truth file, read with
        ``ParsingUtils.readProFile``.
    predFile : path of the quantification output of ``method``.
    method : one of ``"salmon"``, ``"kallisto"`` or ``"express"``; selects
        the parser and is used as the column suffix (``TPM_<method>``).
    outBase : forwarded unchanged to the three plotting helpers.
    measure : ``"tpm"`` or ``"num_reads"``; selects which pair of columns
        is compared.

    Raises
    ------
    ValueError
        If ``measure`` or ``method`` is not one of the supported values.
        (Previously an unsupported value surfaced late, as an unbound-name
        or ``None``-related error after the input files had been parsed.)
    """
    # Validate the selectors before doing any file I/O so that a typo fails
    # immediately and with an explicit message.
    columnPrefixes = {"tpm": "TPM", "num_reads": "NumReads"}
    if measure not in columnPrefixes:
        raise ValueError(
            "unsupported measure {!r}; expected one of {}".format(
                measure, sorted(columnPrefixes)))

    supportedMethods = ("salmon", "kallisto", "express")
    if method not in supportedMethods:
        raise ValueError(
            "unsupported method {!r}; expected one of {}".format(
                method, list(supportedMethods)))

    prefix = columnPrefixes[measure]
    trueColName = "{}_truth".format(prefix)
    predColName = "{}_{}".format(prefix, method)

    # Load the data; first the predictions
    suffix = '_{}'.format(method)
    if method == "salmon":
        p = ParsingUtils.readSalmon(predFile, suffix)
    elif method == "kallisto":
        p = ParsingUtils.readKallisto(predFile, suffix)
    else:  # "express" -- guaranteed by the validation above
        p = ParsingUtils.readExpress(predFile, suffix)

    # Now the ground truth
    g = ParsingUtils.readProFile(truthFile, '_truth')

    # Convert to TPM
    g["TPM_truth"] = 1000000.0 * (g["ExpFrac_truth"] / g["ExpFrac_truth"].sum())
    # Flux sim thinks paired-end = 2 reads . . . sigh
    g["NumReads_truth"] = g["SeqNum_truth"] * 0.5

    # Filter out low TPM / low read counts in both truth and prediction
    AnalysisUtils.filterValues("TPM_truth", g, 0.01)
    AnalysisUtils.filterValues("TPM_{}".format(method), p, 0.01)
    AnalysisUtils.filterValues("NumReads_truth", g, 1.0)
    AnalysisUtils.filterValues("NumReads_{}".format(method), p, 1.0)

    # merge dataframes (joined on their shared index)
    m = g.join(p)

    setPlotProperties()

    makeCorrPlot(trueColName, predColName, m, outBase, method, measure)
    makeRelDiffPlot(trueColName, predColName, m, outBase, method, measure)
    makeRelErrorPlot(trueColName, predColName, m, outBase, method, measure)
Exemplo n.º 2
0
def qpcrplots(outpath):
    """Compare SEQC replicate-1 quantifications against qPCR measurements.

    Loads the Salmon, eXpress, Sailfish and Kallisto results from the
    hard-coded ``data/quant_seqc/...`` paths, inner-joins them with the
    transcript-to-gene map, sums per gene name, and inner-joins the result
    with the qPCR table (columns ``A``-``D``).  It then prints the
    proportionality and Spearman correlation of every method's ``TPM_*``
    column against qPCR column ``A`` and writes, under ``outpath``:

    * ``qpcrbarplot.pdf`` and ``qpcrbarplot_supp.pdf`` -- bar plots of the
      Spearman correlations;
    * ``qpcr_<method>.pdf`` -- rank-vs-rank scatter plots for rows
      7000..13999 of the merged table.

    Parameters
    ----------
    outpath : directory (without trailing slash) the PDF files are written to.
    """
    import matplotlib
    from AnalysisUtils import proportionalityCorrelation

    font = {
            'weight' : 'bold',
            'size'   : 50}
    matplotlib.rc('font', **font)
    sns.set_style("white")

    # Transcript -> gene map; 'Name' (transcript id) is the join key.
    tgmap = pd.read_csv('/shared/SalmonManuscript/data/HumanGenomeAnnotations/tgmap.all.txt', sep='\t', names=['gene.id', 'Name', 'gene.name'])
    tgmap.set_index('Name', inplace=True)
    print( "length tgmap={}".format(len(tgmap)) )

    # Inner-join each method in turn, reporting how many rows survive.
    salmondf = ParsingUtils.readSalmon('data/quant_seqc/rep1_salmon/quant.sf', suffix="_salmon")
    print( "length salmondf={}".format( len(salmondf)) )
    merged = salmondf.join(tgmap, how='inner')
    print( "length tgmap&salmondf={}".format( len(merged) ) )

    expressDF = ParsingUtils.readExpress('data/quant_seqc/rep1_express/results.xprs', suffix="_express")
    merged = merged.join(expressDF, how='inner')
    print( "Length tgmap&salmonDF&expressDF={}".format( len(merged) ) )

    sailfishDF = ParsingUtils.readSailfish('data/quant_seqc/rep1_sailfish/quant_bias_corrected.sf', suffix="_sailfish")
    merged = merged.join(sailfishDF, how='inner')
    print( "Length tgmap&salmonDF&expressDF&sailifishDF={}".format( len(merged) ) )

    kallistoDF = ParsingUtils.readKallisto('data/quant_seqc/rep1_kallisto/abundance.txt', suffix="_kallisto")
    merged = merged.join(kallistoDF, how='inner')
    print( "Length tgmap&salmonDF&expressDF&sailifishDF&kallistoDF={}".format( len(merged) ) )

    # Collapse transcripts to genes, then attach the qPCR measurements.
    merged = merged.groupby('gene.name').sum()
    print( "Length after summing by gene={}".format(len(merged)) )
    qpcrDF = pd.read_csv('data/seqc/qpcr/PrimePCR.txt', sep='\t', names=['gene.name', 'A','B','C','D'], skiprows=1)
    qpcrDF.set_index("gene.name", inplace=True)
    print( "Length qpcrDF={}".format(len(qpcrDF)) )
    merged = merged.join(qpcrDF, how='inner')
    print( "Length everything&qpcrDF={}".format(len(merged)) )

    # (display label, TPM column) pairs, in the order they are reported.
    reported = [("Salmon", "TPM_salmon"),
                ("eXpress", "TPM_express"),
                ("Sailfish", "TPM_sailfish"),
                ("Kallisto", "TPM_kallisto")]

    proportionality = {}
    for label, column in reported:
        proportionality[label] = proportionalityCorrelation(column, 'A', merged)
    for label, _ in reported:
        print( "Proportionality {}: {}".format(label, proportionality[label]) )

    spearman = {}
    for label, column in reported:
        spearman[label] = merged[column].corr( merged['A'], "spearman")
    for label, _ in reported:
        print( "Spearman {}: {}".format(label, spearman[label]) )

    def plot_table(data, methods, suffix=""):
        """Save a bar plot of ``data`` (one Spearman value per method)."""
        plt.clf()
        plt.figure(figsize=(8,12))
        corrtable = pd.DataFrame(data=data, columns = ['Spearman corr.'], index=methods)
        corrtable = corrtable.reset_index()
        corrtable.columns = ['Method', 'Spearman corr.']
        print (corrtable)
        # x/y must be passed by keyword: recent seaborn releases reject
        # positional x/y, and the keyword form works on old releases too.
        sns.barplot(x="Method", y="Spearman corr.", data=corrtable)
        plt.savefig("{}/qpcrbarplot{}.pdf".format(outpath,suffix))

    mainMethods = ["Sailfish", "Salmon", "eXpress"]
    plot_table([spearman[m] for m in mainMethods], mainMethods)
    suppMethods = ["Kallisto", "Salmon"]
    plot_table([spearman[m] for m in suppMethods], suppMethods, "_supp")

    # Rank-vs-rank scatter plots, restricted to a fixed positional window of
    # the merged table.  They are drawn on the figure left current by the
    # last plot_table() call, so they share its 8x12 size.
    plt.cla()
    window = slice(7000, 14000)
    qpcrRanks = merged['A'].rank().iloc[window]
    for key in ("salmon", "express", "sailfish", "kallisto"):
        plt.clf()
        plt.plot(merged['TPM_{}'.format(key)].rank().iloc[window], qpcrRanks, 'o', alpha=1, ms=2)
        plt.savefig("{}/qpcr_{}.pdf".format(outpath, key))
Exemplo n.º 3
0
def makeTable(methodDict, outpath, outfile, measure, annotPath):
    """Build the per-method accuracy table and write it as CSV and LaTeX.

    Every entry of ``methodDict`` is parsed into a dataframe whose columns
    carry the suffix ``_<key>``; all frames are joined on their index.  For
    each non-truth method the table holds the Spearman correlation, mean
    absolute relative difference, proportionality correlation, fraction of
    truly-expressed transcripts whose percent error exceeds 10, and the
    median relative error on truly-expressed transcripts, all computed
    against the ``<measure>_Truth`` column.

    Parameters
    ----------
    methodDict : mapping of method name -> result file path.  The parser is
        chosen from the upper-cased name (``SALMON*``, ``KALLISTO*``,
        ``EXPRESS*``, ``SAILFISH``, ``SAILFISH (QUASI)``, ``TRUTH*``).  The
        comparisons read the ``<measure>_Truth`` column, so the ground-truth
        entry is expected to be keyed exactly ``"Truth"``.
    outpath : forwarded to ``plotStratifiedDiffs`` (human data only).
    outfile : path of the LaTeX table; the CSV is written to
        ``outfile + ".csv"``.  If the file name starts with ``human_`` the
        stratified-difference plots are generated as well.
    measure : column prefix to evaluate, combined as ``<measure>_<method>``
        (presumably ``"TPM"`` or ``"NumReads"`` -- confirm against callers).
    annotPath : forwarded to ``plotStratifiedDiffs``.

    Raises
    ------
    ValueError
        If a key of ``methodDict`` does not match any known parser.
    """
    import pandas as pd
    # Unused in this function; kept because importing seaborn may have
    # import-time side effects on matplotlib styling in old releases
    # (NOTE(review): confirm, then drop).
    import seaborn as sns
    import ParsingUtils
    import AnalysisUtils
    import scipy as sp
    import scipy.stats

    dframes = []
    for k, v in methodDict.items():
        upperName = k.upper()
        suffix = '_{}'.format(k)
        if upperName.startswith('SALMON'):
            d = ParsingUtils.readSalmon(v, suffix)
        elif upperName.startswith('KALLISTO'):
            d = ParsingUtils.readKallisto(v, suffix)
        elif upperName.startswith('EXPRESS'):
            d = ParsingUtils.readExpress(v, suffix)
        elif upperName == 'SAILFISH':
            d = ParsingUtils.readSailfish(v, suffix)
        elif upperName == 'SAILFISH (QUASI)':
            # Parsed with the salmon reader, as in the original code.
            d = ParsingUtils.readSalmon(v, suffix)
        elif upperName.startswith('TRUTH'):
            d = ParsingUtils.readProFile(v, suffix)
            d["TPM{}".format(suffix)] = 1000000.0 * (d["ExpFrac{}".format(suffix)] / d["ExpFrac{}".format(suffix)].sum())
            # Flux sim thinks paired-end = 2 reads . . . sigh
            d["NumReads{}".format(suffix)] = d["SeqNum{}".format(suffix)] * 0.5
        else:
            # Without this branch an unknown key either raised an
            # unbound-name error or silently re-appended the previous frame.
            raise ValueError("makeTable: no parser known for method {!r}".format(k))

        # Add this dataframe to the list
        dframes.append(d)

    M = dframes[0].join(dframes[1:])

    # Filter eXpress results: values below the smallest positive value
    # reported by any other method are handed to filterValues.  Skipped when
    # eXpress is not among the methods (its column would not exist).
    if "eXpress" in methodDict:
        minVal = np.inf
        for mn in set(methodDict.keys()) - set(["Truth", "eXpress"]):
            col = "{}_{}".format(measure, mn)
            newMin = M.loc[M[col] > 0, col].min()
            minVal = min(minVal, newMin)
        print("filtering eXpress results < {} {}".format(minVal, measure))
        AnalysisUtils.filterValues("{}_{}".format(measure, "eXpress"), M, minVal)

    # The organism is encoded as the first '_'-separated token of the file name.
    org = outfile.split('/')[-1].split('_')[0]
    print("org = {}".format(org))
    if org == 'human':
        plotStratifiedDiffs(M, methodDict, annotPath, outpath, measure)

    mrdName = 'abs. mean rel. diff.'
    corrName = 'Spearman corr.'
    propName = 'Proportionality corr.'
    tpefName = 'TP error fraction'
    tpMedErrorName = 'TP median per. error'
    metricNames = [tpMedErrorName, tpefName, mrdName, corrName, propName]

    # One column per evaluated method, in methodDict order.  The original
    # subtracted set('Truth') -- a set of *characters* -- so the truth column
    # was never excluded and had to be dropped afterwards.
    methods = [k for k in methodDict if k.upper() != "TRUTH"]
    res = pd.DataFrame(data={m: dict.fromkeys(metricNames, np.nan) for m in methods})

    truthCol = "{}_Truth".format(measure)
    errorThreshold = 10.0  # percent error above which a transcript counts as wrong

    for k in methods:
        predCol = "{}_{}".format(measure, k)

        # .loc[row, col] writes into `res` itself; the chained form
        # res[k][row] = ... does not update it under pandas Copy-on-Write.
        res.loc[corrName, k] = sp.stats.spearmanr(M[truthCol], M[predCol])[0]

        mrd, _ = AnalysisUtils.relDiff(truthCol, predCol, M)
        res.loc[mrdName, k] = mrd["relDiff"].abs().mean()

        res.loc[propName, k] = AnalysisUtils.proportionalityCorrelation(truthCol, predCol, M)

        # Error statistics restricted to transcripts whose true value is >= 1.
        truePositives = M[M[truthCol] >= 1]
        y = truePositives[predCol]
        x = truePositives[truthCol]
        relErr = (y - x) / x
        absPctErr = 100.0 * (y - x).abs() / x
        if len(absPctErr) > 0:
            tpef = len(absPctErr[absPctErr > errorThreshold]) / float(len(absPctErr))
        else:
            tpef = np.nan  # no truly-expressed transcripts: fraction undefined
        res.loc[tpefName, k] = tpef
        res.loc[tpMedErrorName, k] = relErr.median()

    print(res)
    res.to_csv(outfile+".csv")

    with open(outfile, 'w') as ofile:
        ofile.write(res.to_latex(float_format=lambda x: "{0:.2f}".format(x)))
    # Report the file that was actually written (previously printed outpath).
    print("wrote {}".format(outfile))
Exemplo n.º 4
0
def rsemComparisonPlots(topdir, measure, simIDs, outdir):
    """Collect per-simulation results of every method and plot Spearman correlations.

    For each id ``i`` in ``simIDs`` the ground truth is read from
    ``<topdir>/<i>.sim.isoforms.results`` and each method's output from
    ``<topdir>/<i>/<method directory>/<quant file>``.  Every column is
    suffixed with the method label followed by ``i`` (e.g. ``_Salmon3``),
    all frames are joined on the transcript name into one dataframe, the
    eXpress values are filtered per simulation, and
    ``spearmanCorrelationPlots`` is called for four groups of methods.

    Parameters
    ----------
    topdir : root directory holding the simulation results.
    measure : column prefix used as ``<measure>_<method><i>``
        (presumably "TPM" or "NumReads" -- confirm against callers).
    simIDs : iterable of simulation identifiers.
    outdir : forwarded to ``spearmanCorrelationPlots``.
    """
    import os
    # File names (relative to each method's directory) of the quantification output.
    kallistoQuant = "abundance.txt"
    salmonQuant = "quant.sf"
    sailfishQuant = "quant.sf/quant.sf"
    sailfishQuasiQuant = "quant.sf/quant.sf"

    print("parsing results")

    # Gather true results
    GroundTruths = []
    for i in simIDs:
        DF = pd.read_csv('{}/{}.sim.isoforms.results'.format(topdir, i), sep='\t')
        # Keep only the five columns of interest, then rename them to the
        # '<Field>_truth<i>' convention used by the other parsers.
        DF.drop(set(DF.columns) - set(['transcript_id', 'length', 'count', 
                'TPM', 'effective_length']), axis=1, inplace=True)
        DF.rename(columns={'transcript_id' : 'Name', 
                'length' : 'Length_truth{}'.format(i), 
                'count' : 'NumReads_truth{}'.format(i), 
                'TPM' : 'TPM_truth{}'.format(i), 
                'effective_length' : 'EffLen_truth{}'.format(i)}, inplace=True)
        DF.set_index('Name', inplace=True)
        # NOTE(review): the return value is discarded here, so unless
        # convert_objects mutates DF in place this line has no effect; the
        # method is also deprecated/removed in newer pandas -- confirm and
        # replace (e.g. with pd.to_numeric) if conversion is really needed.
        DF.convert_objects(convert_numeric=True)
        GroundTruths.append(DF)

    SalmonRes = []
    # Gather salmon results
    for i in simIDs:
        fn = os.path.sep.join([topdir, str(i), 'salmon', salmonQuant]) 
        DF = ParsingUtils.readSalmon(fn, '_{}{}'.format('Salmon', i))
        SalmonRes.append(DF)

    # Gather sailfish results
    SailfishRes = []
    for i in simIDs:
        fn = os.path.sep.join([topdir, str(i), 'sailfish', sailfishQuant]) 
        DF = ParsingUtils.readSailfish(fn, '_{}{}'.format('Sailfish', i))
        SailfishRes.append(DF)

    # Gather 'Salmon (VB)' results (salmonVB directory)
    SalmonVBRes = []
    for i in simIDs:
        fn = os.path.sep.join([topdir, str(i), 'salmonVB', salmonQuant])
        DF = ParsingUtils.readSalmon(fn, '_{}{}'.format('Salmon (VB)', i))
        SalmonVBRes.append(DF)

    # Gather 'SalmonAln (VB)' results (salmon_alnVB directory)
    SalmonAlnVBRes = []
    for i in simIDs:
        fn = os.path.sep.join([topdir, str(i), 'salmon_alnVB', salmonQuant])
        DF = ParsingUtils.readSalmon(fn, '_{}{}'.format('SalmonAln (VB)', i))
        SalmonAlnVBRes.append(DF)

    SalmonAlnRes = []
    # Gather 'SalmonAln' results (salmon_aln directory)
    for i in simIDs:
        fn = os.path.sep.join([topdir, str(i), 'salmon_aln', salmonQuant]) 
        DF = ParsingUtils.readSalmon(fn, '_{}{}'.format('SalmonAln', i))
        SalmonAlnRes.append(DF)
   
    # Gather kallisto results
    KallistoRes = []
    for i in simIDs:
        fn = os.path.sep.join([topdir, str(i), 'kallisto', kallistoQuant]) 
        DF = ParsingUtils.readKallisto(fn, '_{}{}'.format('Kallisto', i))
        KallistoRes.append(DF)

    # Gather sailfish quasi results (parsed with the salmon reader)
    SailfishQuasiRes = []
    for i in simIDs:
        fn = os.path.sep.join([topdir, str(i), 'sailfish_quasi', sailfishQuasiQuant]) 
        DF = ParsingUtils.readSalmon(fn, '_{}{}'.format('Sailfish (Quasi)', i))
        SailfishQuasiRes.append(DF)


    # Gather eXpress results
    expressQuant = "results.xprs"
    ExpressRes = []
    for i in simIDs:
        fn = os.path.sep.join([topdir, str(i), 'express', expressQuant])
        DF = ParsingUtils.readExpress(fn, '_{}{}'.format('eXpress', i))
        ExpressRes.append(DF)

    # Gather StringTie results
    #stringtieQuant = "t_data.ctab"
    #StringtieRes = []
    #for i in simIDs:
    #    fn = os.path.sep.join([topdir, str(i), 'stringtie', stringtieQuant])
    #    DF = ParsingUtils.readExpress(fn, '_{}{}'.format('stringtie', i))
    #    StringtieRes.append(DF)

    # Per method: join the frames of all simulations side by side (the
    # per-simulation suffix keeps the column names distinct).
    K = KallistoRes[0].join(KallistoRes[1:])
    S = SalmonRes[0].join(SalmonRes[1:])
    SVB = SalmonVBRes[0].join(SalmonVBRes[1:])
    SA = SalmonAlnRes[0].join(SalmonAlnRes[1:])
    SAVB = SalmonAlnVBRes[0].join(SalmonAlnVBRes[1:])
    E = ExpressRes[0].join(ExpressRes[1:])
    G = GroundTruths[0].join(GroundTruths[1:])
    SF = SailfishRes[0].join(SailfishRes[1:])
    SFQ = SailfishQuasiRes[0].join(SailfishQuasiRes[1:])
    #ST = StringtieRes[0].join(StringtieRes[1:])

    # Gather *all* results into a single dataframe
    M = G.join(K).join(S).join(SA).join(SVB).join(SAVB).join(E).join(SF).join(SFQ)

    methods = ["Salmon", "Salmon (VB)", "SalmonAln", "SalmonAln (VB)", "eXpress", "Kallisto", "Sailfish", "Sailfish (Quasi)"]

    # Filter eXpress results: for each simulation, find the smallest
    # positive value reported by any other method and pass it to
    # filterValues as the threshold for the eXpress column.
    for i in simIDs:
        minVal = np.inf
        for mn in set(methods) - set(["Truth", "eXpress"]):
            print("Method name = {}".format(mn))
            newMin = M.loc[M["{}_{}{}".format(measure, mn, i)]>0, "{}_{}{}".format(measure, mn, i)].min()
            minVal = min(minVal, newMin) 
        print("filtering eXpress results < {} {}".format(minVal, measure))
        AnalysisUtils.filterValues("{}_{}{}".format(measure, "eXpress", i), M, minVal)

    #print("generating relDiff plot")
    #rsemRelDiffPlots(M, measure, methods, simIDs, outdir, "MeanAbsRelDiffs{}".format(measure))
    #print("generating proportionality plot")
    #proportionalityCorrelationPlots(M, measure, methods, simIDs, outdir, "ProportionalityCorrelation{}".format(measure))
    # Four Spearman-correlation figures, one per group of methods; each call
    # gets its own output name and x-axis limits.
    print("generating spearman correlation plot")
    methods_spearman = ["Sailfish", "eXpress", "Salmon",  "SalmonAln"]
    spearmanCorrelationPlots(M, measure, methods_spearman, simIDs, outdir, "SpearmanCorrelation{}".format(measure), setxlim=(0.87, 0.93))
    methods_spearman_kallisto = ["Kallisto", "Salmon"]
    spearmanCorrelationPlots(M, measure, methods_spearman_kallisto, simIDs, outdir, "SpearmanCorrelationKallisto{}".format(measure), setxlim=(0.87, 0.93))
    methods_spearman_salmon = ["Salmon (VB)","Salmon", "SalmonAln (VB)", "SalmonAln" ]
    spearmanCorrelationPlots(M, measure, methods_spearman_salmon, simIDs, outdir, "SpearmanCorrelationSalmon{}".format(measure), setxlim=(0.91, 0.93), lines=True)
    methods_spearman_sailfishquasi = ["Sailfish", "Sailfish (Quasi)", "eXpress", "Salmon",  "SalmonAln"]
    spearmanCorrelationPlots(M, measure, methods_spearman_sailfishquasi, simIDs, outdir, "SpearmanCorrelationSailfishQuasi{}".format(measure), setxlim=(0.87, 0.93), lines=True)
   #print("generating tp relative error plots")
    #errorFracPlots(M, measure, methods, simIDs, outdir, "TPErrorFrac{}".format(measure))
 
    #for i in simIDs:
    #    for m in methods + ['truth']:
    #        AnalysisUtils.filterValues("NumReads_{}{}".format(m, i), M, 1.0)
    # NOTE: the plotting calls below are commented out, so only the progress
    # messages are printed.
    print("generating filtered relDiff plot")
    #rsemRelDiffPlots(M, measure, methods, simIDs, outdir, "MeanAbsRelDiffs{}Filtered".format(measure))
    print("generating filtered proportionality plot")
    #proportionalityCorrelationPlots(M, measure, methods, simIDs, outdir, "ProportionalityCorrelation{}Filtered".format(measure))
    print("generating spearman correlation plot")