def makePlots(truthFile, predFile, method, outBase, measure):
    """Plot one quantifier's estimates against the simulated ground truth.

    The predictions are loaded with the ParsingUtils reader that matches
    ``method``, the ground truth with ``ParsingUtils.readProFile``.  Truth
    TPM and read counts are derived, both tables are passed through
    ``AnalysisUtils.filterValues``, joined on their index, and handed to the
    correlation, relative-difference and relative-error plotting helpers.

    Parameters
    ----------
    truthFile : str
        Path of the ground-truth profile.
    predFile : str
        Path of the quantifier's output file.
    method : str
        One of ``"salmon"``, ``"kallisto"`` or ``"express"``; also used as the
        column suffix of the prediction table.
    outBase : str
        Forwarded unchanged to the plotting helpers.
    measure : str
        ``"tpm"`` or ``"num_reads"``; selects which column pair is plotted.

    Raises
    ------
    ValueError
        If ``measure`` or ``method`` is not one of the supported values.
        (Previously this surfaced late as a NameError / failure on ``None``.)
    """
    columnPrefix = {"tpm": "TPM", "num_reads": "NumReads"}
    readerNames = {
        "salmon": "readSalmon",
        "kallisto": "readKallisto",
        "express": "readExpress",
    }

    # Validate before touching any file so a typo fails immediately.
    if measure not in columnPrefix:
        raise ValueError(
            "unsupported measure {!r}; expected one of {}".format(
                measure, sorted(columnPrefix)))
    if method not in readerNames:
        raise ValueError(
            "unsupported method {!r}; expected one of {}".format(
                method, sorted(readerNames)))

    trueColName = "{}_truth".format(columnPrefix[measure])
    predColName = "{}_{}".format(columnPrefix[measure], method)

    # Load the data; first the predictions
    readPredictions = getattr(ParsingUtils, readerNames[method])
    p = readPredictions(predFile, '_{}'.format(method))

    # Now the ground truth
    g = ParsingUtils.readProFile(truthFile, '_truth')

    # Convert to TPM
    g["TPM_truth"] = 1000000.0 * (g["ExpFrac_truth"] / g["ExpFrac_truth"].sum())
    # Flux sim thinks paired-end = 2 reads . . . sigh
    g["NumReads_truth"] = g["SeqNum_truth"] * 0.5

    # Filter out low TPM / read counts in both tables
    AnalysisUtils.filterValues("TPM_truth", g, 0.01)
    AnalysisUtils.filterValues("TPM_{}".format(method), p, 0.01)
    AnalysisUtils.filterValues("NumReads_truth", g, 1.0)
    AnalysisUtils.filterValues("NumReads_{}".format(method), p, 1.0)

    # merge dataframes
    m = g.join(p)

    setPlotProperties()
    makeCorrPlot(trueColName, predColName, m, outBase, method, measure)
    makeRelDiffPlot(trueColName, predColName, m, outBase, method, measure)
    makeRelErrorPlot(trueColName, predColName, m, outBase, method, measure)
def qpcrplots(outpath):
    """Compare SEQC replicate-1 quantifications with qPCR and write the plots.

    Reads the Salmon, eXpress, Sailfish and Kallisto results from their
    hard-coded locations, inner-joins them with the transcript-to-gene map,
    sums the values per gene name and inner-joins the qPCR table.  For every
    quantifier the proportionality and Spearman correlation between its TPM
    column and qPCR column ``A`` are printed.  The Spearman values are drawn
    as bar plots and a rank-vs-rank scatter plot is written per quantifier.

    Parameters
    ----------
    outpath : str
        Directory that receives ``qpcrbarplot.pdf``, ``qpcrbarplot_supp.pdf``
        and ``qpcr_<method>.pdf``.
    """
    import matplotlib
    from AnalysisUtils import proportionalityCorrelation

    font = {'weight': 'bold', 'size': 50}
    matplotlib.rc('font', **font)
    sns.set_style("white")

    tgmap = pd.read_csv(
        '/shared/SalmonManuscript/data/HumanGenomeAnnotations/tgmap.all.txt',
        sep='\t', names=['gene.id', 'Name', 'gene.name'])
    tgmap.set_index('Name', inplace=True)
    print("length tgmap={}".format(len(tgmap)))

    salmondf = ParsingUtils.readSalmon(
        'data/quant_seqc/rep1_salmon/quant.sf', suffix="_salmon")
    print("length salmondf={}".format(len(salmondf)))
    merged = salmondf.join(tgmap, how='inner')
    print("length tgmap&salmondf={}".format(len(merged)))

    expressDF = ParsingUtils.readExpress(
        'data/quant_seqc/rep1_express/results.xprs', suffix="_express")
    merged = merged.join(expressDF, how='inner')
    print("Length tgmap&salmonDF&expressDF={}".format(len(merged)))

    sailfishDF = ParsingUtils.readSailfish(
        'data/quant_seqc/rep1_sailfish/quant_bias_corrected.sf',
        suffix="_sailfish")
    merged = merged.join(sailfishDF, how='inner')
    print("Length tgmap&salmonDF&expressDF&sailifishDF={}".format(len(merged)))

    kallistoDF = ParsingUtils.readKallisto(
        'data/quant_seqc/rep1_kallisto/abundance.txt', suffix="_kallisto")
    merged = merged.join(kallistoDF, how='inner')
    print("Length tgmap&salmonDF&expressDF&sailifishDF&kallistoDF={}".format(
        len(merged)))

    # Collapse transcript-level rows to one row per gene.
    merged = merged.groupby('gene.name').sum()
    print("Length after summing by gene={}".format(len(merged)))

    qpcrDF = pd.read_csv('data/seqc/qpcr/PrimePCR.txt', sep='\t',
                         names=['gene.name', 'A', 'B', 'C', 'D'], skiprows=1)
    qpcrDF.set_index("gene.name", inplace=True)
    print("Length qpcrDF={}".format(len(qpcrDF)))
    merged = merged.join(qpcrDF, how='inner')
    print("Length everything&qpcrDF={}".format(len(merged)))

    # (display label, column/file key), in the order the report is printed.
    quantifiers = [("Salmon", "salmon"), ("eXpress", "express"),
                   ("Sailfish", "sailfish"), ("Kallisto", "kallisto")]

    # Compute everything first, then print, so the report stays contiguous.
    proportionality = [
        (label, proportionalityCorrelation('TPM_{}'.format(key), 'A', merged))
        for label, key in quantifiers
    ]
    for label, value in proportionality:
        print("Proportionality {}: {}".format(label, value))

    spearman = {}
    for _, key in quantifiers:
        spearman[key] = merged['TPM_{}'.format(key)].corr(
            merged['A'], "spearman")
    for label, key in quantifiers:
        print("Spearman {}: {}".format(label, spearman[key]))

    def plot_table(data, methods, suffix=""):
        """Draw the Spearman values in ``data`` as one bar per method."""
        plt.clf()
        plt.figure(figsize=(8, 12))
        corrtable = pd.DataFrame(data=data, columns=['Spearman corr.'],
                                 index=methods)
        corrtable = corrtable.reset_index()
        corrtable.columns = ['Method', 'Spearman corr.']
        print(corrtable)
        # x/y must be keywords: recent seaborn takes `data` as the first
        # positional parameter, so positional x/y no longer work.
        sns.barplot(x="Method", y="Spearman corr.", data=corrtable)
        plt.savefig("{}/qpcrbarplot{}.pdf".format(outpath, suffix))

    plot_table(
        [spearman["sailfish"], spearman["salmon"], spearman["express"]],
        ["Sailfish", "Salmon", "eXpress"])
    plot_table(
        [spearman["kallisto"], spearman["salmon"]],
        ["Kallisto", "Salmon"], "_supp")

    # Rank-vs-rank scatter plots.  The figure left open by plot_table is
    # reused on purpose so the output size matches the bar plots.
    plt.cla()
    qpcrRanks = merged['A'].rank()[7000:14000]
    for _, key in quantifiers:
        plt.clf()
        # Only positions 7000-14000 of the rank series are drawn.
        plt.plot(merged['TPM_{}'.format(key)].rank()[7000:14000], qpcrRanks,
                 'o', alpha=1, ms=2)
        plt.savefig("{}/qpcr_{}.pdf".format(outpath, key))
def makeTable(methodDict, outpath, outfile, measure, annotPath):
    """Write a CSV and a LaTeX table of accuracy metrics for each method.

    Every entry of ``methodDict`` is loaded with the ParsingUtils reader that
    matches its key, the tables are joined, and each non-truth method is
    compared with the ``"<measure>_Truth"`` column.

    Parameters
    ----------
    methodDict : dict
        Maps a method label to the path of its result file.  Labels are
        matched case-insensitively: prefixes ``salmon``, ``kallisto``,
        ``express`` and ``truth``, or the exact names ``sailfish`` and
        ``sailfish (quasi)``.  The label is also the column suffix, so the
        ground truth must be keyed exactly ``"Truth"`` for the comparisons.
    outpath : str
        Forwarded to ``plotStratifiedDiffs``.
    outfile : str
        Path of the LaTeX table; the CSV goes to ``outfile + ".csv"``.  When
        the file name starts with ``human_`` the stratified-difference plots
        are produced as well.
    measure : str
        Column prefix of the compared quantity (e.g. ``"TPM"``).
    annotPath : str
        Forwarded to ``plotStratifiedDiffs``.

    Raises
    ------
    ValueError
        If a key of ``methodDict`` matches none of the known readers.
    """
    import pandas as pd
    import ParsingUtils
    import AnalysisUtils
    import scipy as sp
    import scipy.stats

    dframes = []
    for k, v in methodDict.items():
        suffix = '_{}'.format(k)
        key = k.upper()
        if key.startswith('SALMON') or key == 'SAILFISH (QUASI)':
            # Quasi-mapping Sailfish output is read with the Salmon reader.
            d = ParsingUtils.readSalmon(v, suffix)
        elif key.startswith('KALLISTO'):
            d = ParsingUtils.readKallisto(v, suffix)
        elif key.startswith('EXPRESS'):
            d = ParsingUtils.readExpress(v, suffix)
        elif key == 'SAILFISH':
            d = ParsingUtils.readSailfish(v, suffix)
        elif key.startswith('TRUTH'):
            d = ParsingUtils.readProFile(v, suffix)
            expFrac = d["ExpFrac{}".format(suffix)]
            d["TPM{}".format(suffix)] = 1000000.0 * (expFrac / expFrac.sum())
            # Flux sim thinks paired-end = 2 reads . . . sigh
            d["NumReads{}".format(suffix)] = d["SeqNum{}".format(suffix)] * 0.5
        else:
            # Previously an unknown key silently re-appended the last frame.
            raise ValueError("unrecognised method name {!r}".format(k))
        # Add this dataframe to the list
        dframes.append(d)

    M = dframes[0].join(dframes[1:])

    # Filter eXpress results below the smallest positive value reported by
    # any other method.  Skipped when eXpress was not supplied.
    if "eXpress" in methodDict:
        minVal = np.inf
        for mn in methodDict:
            if mn in ("Truth", "eXpress"):
                continue
            col = "{}_{}".format(measure, mn)
            newMin = M.loc[M[col] > 0, col].min()
            minVal = min(minVal, newMin)
        print("filtering eXpress results < {} {}".format(minVal, measure))
        AnalysisUtils.filterValues("{}_{}".format(measure, "eXpress"), M, minVal)

    org = outfile.split('/')[-1].split('_')[0]
    print("org = {}".format(org))
    if org == 'human':
        plotStratifiedDiffs(M, methodDict, annotPath, outpath, measure)

    mrdName = 'abs. mean rel. diff.'
    corrName = 'Spearman corr.'
    propName = 'Proportionality corr.'
    tpefName = 'TP error fraction'
    tpMedErrorName = 'TP median per. error'

    # One column per method, in methodDict order, truth excluded.
    methods = [k for k in methodDict if k.upper() != "TRUTH"]
    res = pd.DataFrame(data={
        m: {tpMedErrorName: np.nan,
            tpefName: np.nan,
            mrdName: np.nan,
            corrName: np.nan,
            propName: np.nan}
        for m in methods})

    truthCol = "{}_Truth".format(measure)
    errorThreshold = 10.0  # percent
    for k in methods:
        predCol = "{}_{}".format(measure, k)

        # .loc writes into `res` itself; chained `res[k][name] = ...` does not
        # under pandas copy-on-write.
        res.loc[corrName, k] = sp.stats.spearmanr(M[truthCol], M[predCol])[0]

        mrd, _ = AnalysisUtils.relDiff(truthCol, predCol, M)
        res.loc[mrdName, k] = mrd["relDiff"].abs().mean()

        res.loc[propName, k] = AnalysisUtils.proportionalityCorrelation(
            truthCol, predCol, M)

        # "True positives": rows whose true value is at least 1.
        tpind = M[M[truthCol] >= 1]
        y = tpind[predCol]
        x = tpind[truthCol]
        relErr = (y - x) / x
        absPercentErr = 100.0 * (y - x).abs() / x
        if len(absPercentErr) > 0:
            tpef = (absPercentErr > errorThreshold).sum() / float(len(absPercentErr))
        else:
            tpef = np.nan  # no true positives: fraction undefined
        res.loc[tpefName, k] = tpef
        res.loc[tpMedErrorName, k] = relErr.median()

    print(res)
    res.to_csv(outfile + ".csv")
    with open(outfile, 'w') as ofile:
        ofile.write(res.to_latex(float_format="{0:.2f}".format))
    # Report the file actually written (was: outpath).
    print("wrote {}".format(outfile))
def rsemComparisonPlots(topdir, measure, simIDs, outdir):
    """Plot Spearman correlations for the RSEM-simulated replicates.

    For every simulation ID the ground truth
    (``<topdir>/<id>.sim.isoforms.results``) and the results of each
    quantifier (``<topdir>/<id>/<tool>/<file>``) are loaded, with columns
    suffixed ``_<Method><id>``.  Everything is joined into one frame, eXpress
    values are filtered per replicate, and four Spearman-correlation figures
    are produced through ``spearmanCorrelationPlots``.

    Parameters
    ----------
    topdir : str
        Root directory of the simulation results.
    measure : str
        Column prefix of the compared quantity (e.g. ``"TPM"``).
    simIDs : iterable
        Simulation identifiers; used in file paths and column suffixes.
    outdir : str
        Forwarded to ``spearmanCorrelationPlots``.
    """
    kallistoQuant = "abundance.txt"
    salmonQuant = "quant.sf"
    sailfishQuant = "quant.sf/quant.sf"
    sailfishQuasiQuant = "quant.sf/quant.sf"
    expressQuant = "results.xprs"

    print("parsing results")

    # Gather true results
    truthColumns = ['transcript_id', 'length', 'count', 'TPM', 'effective_length']
    GroundTruths = []
    for i in simIDs:
        DF = pd.read_csv('{}/{}.sim.isoforms.results'.format(topdir, i), sep='\t')
        unwanted = [c for c in DF.columns if c not in truthColumns]
        DF.drop(unwanted, axis=1, inplace=True)
        DF.rename(columns={'transcript_id': 'Name',
                           'length': 'Length_truth{}'.format(i),
                           'count': 'NumReads_truth{}'.format(i),
                           'TPM': 'TPM_truth{}'.format(i),
                           'effective_length': 'EffLen_truth{}'.format(i)},
                  inplace=True)
        DF.set_index('Name', inplace=True)
        # The original convert_objects(convert_numeric=True) call discarded
        # its result and no longer exists in pandas; coerce explicitly.
        DF = DF.apply(pd.to_numeric, errors='coerce')
        GroundTruths.append(DF)
    G = GroundTruths[0].join(GroundTruths[1:])

    # (reader, sub-directory, result file, column label), listed in the order
    # in which the methods are joined onto the ground truth.
    quantSources = [
        (ParsingUtils.readKallisto, 'kallisto', kallistoQuant, 'Kallisto'),
        (ParsingUtils.readSalmon, 'salmon', salmonQuant, 'Salmon'),
        (ParsingUtils.readSalmon, 'salmon_aln', salmonQuant, 'SalmonAln'),
        (ParsingUtils.readSalmon, 'salmonVB', salmonQuant, 'Salmon (VB)'),
        (ParsingUtils.readSalmon, 'salmon_alnVB', salmonQuant, 'SalmonAln (VB)'),
        (ParsingUtils.readExpress, 'express', expressQuant, 'eXpress'),
        (ParsingUtils.readSailfish, 'sailfish', sailfishQuant, 'Sailfish'),
        # Quasi-mapping Sailfish output is read with the Salmon reader.
        (ParsingUtils.readSalmon, 'sailfish_quasi', sailfishQuasiQuant,
         'Sailfish (Quasi)'),
    ]

    # Gather *all* results into a single dataframe
    M = G
    for reader, subdir, quantFile, label in quantSources:
        M = M.join(_joinReplicates(reader, topdir, simIDs, subdir, quantFile, label))

    methods = ["Salmon", "Salmon (VB)", "SalmonAln", "SalmonAln (VB)",
               "eXpress", "Kallisto", "Sailfish", "Sailfish (Quasi)"]

    # Filter eXpress results: per replicate, drop values below the smallest
    # positive value reported by any other method.
    referenceMethods = [mn for mn in methods if mn not in ("Truth", "eXpress")]
    for i in simIDs:
        minVal = np.inf
        for mn in referenceMethods:
            print("Method name = {}".format(mn))
            col = "{}_{}{}".format(measure, mn, i)
            newMin = M.loc[M[col] > 0, col].min()
            minVal = min(minVal, newMin)
        print("filtering eXpress results < {} {}".format(minVal, measure))
        AnalysisUtils.filterValues("{}_{}{}".format(measure, "eXpress", i), M, minVal)

    print("generating spearman correlation plot")
    methods_spearman = ["Sailfish", "eXpress", "Salmon", "SalmonAln"]
    spearmanCorrelationPlots(M, measure, methods_spearman, simIDs, outdir,
                             "SpearmanCorrelation{}".format(measure),
                             setxlim=(0.87, 0.93))

    methods_spearman_kallisto = ["Kallisto", "Salmon"]
    spearmanCorrelationPlots(M, measure, methods_spearman_kallisto, simIDs, outdir,
                             "SpearmanCorrelationKallisto{}".format(measure),
                             setxlim=(0.87, 0.93))

    methods_spearman_salmon = ["Salmon (VB)", "Salmon", "SalmonAln (VB)", "SalmonAln"]
    spearmanCorrelationPlots(M, measure, methods_spearman_salmon, simIDs, outdir,
                             "SpearmanCorrelationSalmon{}".format(measure),
                             setxlim=(0.91, 0.93), lines=True)

    methods_spearman_sailfishquasi = ["Sailfish", "Sailfish (Quasi)", "eXpress",
                                      "Salmon", "SalmonAln"]
    spearmanCorrelationPlots(M, measure, methods_spearman_sailfishquasi, simIDs, outdir,
                             "SpearmanCorrelationSailfishQuasi{}".format(measure),
                             setxlim=(0.87, 0.93), lines=True)

    # NOTE: the relDiff, proportionality, TP-error and "filtered" plots were
    # already disabled (commented out) in the original; only their progress
    # messages remain, kept here so the console output is unchanged.
    print("generating filtered relDiff plot")
    print("generating filtered proportionality plot")
    print("generating spearman correlation plot")


def _joinReplicates(reader, topdir, simIDs, subdir, quantFile, label):
    """Read one method's result for every simulation ID and join them by index."""
    import os
    frames = [
        reader(os.path.sep.join([topdir, str(i), subdir, quantFile]),
               '_{}{}'.format(label, i))
        for i in simIDs
    ]
    return frames[0].join(frames[1:])