Example #1
def makePlots(truthFile, predFile, method, outBase, measure):
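    """Plot truth-vs-estimate comparisons for a single quantification method.

    truthFile : FluxSimulator .pro ground-truth profile
    predFile  : the quantifier's output file
    method    : one of "salmon", "kallisto", "express"
    outBase   : prefix for the output plot files
    measure   : "tpm" or "num_reads"
    """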

    import ParsingUtils
    import AnalysisUtils

    if measure == "tpm":
        trueColName = "TPM_truth"
        predColName = "TPM_{}".format(method)
    elif measure == "num_reads":
        trueColName = "NumReads_truth"
        predColName = "NumReads_{}".format(method)
    else:
        raise ValueError("unknown measure: {}".format(measure))

    # Load the data; first the predictions
    p = None
    if method == "salmon":
        p = ParsingUtils.readSalmon(predFile, '_{}'.format(method))
    elif method == "kallisto":
        p = ParsingUtils.readKallisto(predFile, '_{}'.format(method))
    elif method == "express":
        p = ParsingUtils.readExpress(predFile, '_{}'.format(method))
    else:
        raise ValueError("unknown method: {}".format(method))
    
    # Now the ground truth
    g = ParsingUtils.readProFile(truthFile, '_truth')

    # Convert to TPM
    g["TPM_truth"] = 1000000.0 * (g["ExpFrac_truth"] / g["ExpFrac_truth"].sum())
    # Flux sim thinks paired-end = 2 reads . . . sigh
    g["NumReads_truth"] = g["SeqNum_truth"] * 0.5

    # Filter out low TPM
    AnalysisUtils.filterValues("TPM_truth", g, 0.01)
    AnalysisUtils.filterValues("TPM_{}".format(method), p, 0.01)
    AnalysisUtils.filterValues("NumReads_truth", g, 1.0)
    AnalysisUtils.filterValues("NumReads_{}".format(method), p, 1.0)

    # merge dataframes
    m = g.join(p)

    setPlotProperties()

    makeCorrPlot(trueColName, predColName, m, outBase, method, measure)
    makeRelDiffPlot(trueColName, predColName, m, outBase, method, measure)
    makeRelErrorPlot(trueColName, predColName, m, outBase, method, measure)
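A minimal sketch of how makePlots might be invoked. The file paths and output prefix below are hypothetical, and ParsingUtils, AnalysisUtils, and the plotting helpers (setPlotProperties, makeCorrPlot, makeRelDiffPlot, makeRelErrorPlot) are assumed to be defined elsewhere in the same module:

# Hypothetical invocation: compare salmon's TPM estimates against a
# FluxSimulator ground-truth profile; plots are written under plots/salmon_*.
makePlots(truthFile="sim/flux_simulation.pro",
          predFile="salmon_out/quant.sf",
          method="salmon",
          outBase="plots/salmon",
          measure="tpm")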
Example #2
def makeTable(methodDict, outpath, outfile, measure, annotPath):
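    """Build a summary table of accuracy metrics for several quantifiers.

    methodDict : maps a method name to the path of its results file; the
                 keys "Truth" and "eXpress" are referenced by name below
    outpath    : directory that receives the stratified plots
    outfile    : path of the LaTeX table (a .csv copy is written alongside)
    measure    : column prefix to compare, e.g. "TPM" or "NumReads"
    annotPath  : annotation file used for the stratified (human) plots
    """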
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import ParsingUtils
    import AnalysisUtils
    dframes = []
    for k, v in methodDict.items():
        if k.upper().startswith('SALMON'):
            d = ParsingUtils.readSalmon(v, '_{}'.format(k))
        elif k.upper().startswith('KALLISTO'):
            d = ParsingUtils.readKallisto(v, '_{}'.format(k))
        elif k.upper().startswith('EXPRESS'):
            d = ParsingUtils.readExpress(v, '_{}'.format(k))
        elif k.upper() == 'SAILFISH':
            d = ParsingUtils.readSailfish(v, '_{}'.format(k))
        elif k.upper() == 'SAILFISH (QUASI)':
            d = ParsingUtils.readSalmon(v, '_{}'.format(k))
        elif k.upper().startswith('TRUTH'):
            suffix = '_{}'.format(k)
            d = ParsingUtils.readProFile(v, suffix) 
            d["TPM{}".format(suffix)] = 1000000.0 * (d["ExpFrac{}".format(suffix)] / d["ExpFrac{}".format(suffix)].sum())
            # Flux sim thinks paired-end = 2 reads . . . sigh
            d["NumReads{}".format(suffix)] = d["SeqNum{}".format(suffix)] * 0.5
        else:
            raise ValueError("unrecognized method key: {}".format(k))

        # Add this dataframe to the list
        dframes.append(d)

    M = dframes[0].join(dframes[1:])
    
    # Filter eXpress results: drop eXpress estimates smaller than the
    # smallest positive value reported by any other (non-truth) method.
    minVal = np.inf
    for mn in set(methodDict.keys()) - set(["Truth", "eXpress"]):
        newMin = M.loc[M["{}_{}".format(measure, mn)] > 0, "{}_{}".format(measure, mn)].min()
        minVal = min(minVal, newMin)
    print("filtering eXpress results < {} {}".format(minVal, measure))
    AnalysisUtils.filterValues("{}_{}".format(measure, "eXpress"), M, minVal)

    # The organism is encoded as the leading token of the output file name
    # (e.g. "human_..."), which decides whether to make stratified plots.
    org = outfile.split('/')[-1].split('_')[0]
    print("org = {}".format(org))
    if org == 'human':
        plotStratifiedDiffs(M, methodDict, annotPath, outpath, measure)

    mrdName = 'abs. mean rel. diff.'
    corrName = 'Spearman corr.'
    propName = 'Proportionality corr.'
    tpefName = 'TP error fraction'
    tpMedErrorName = 'TP median per. error'
    # Note: set('Truth') would be the set of characters {'T','r','u','t','h'},
    # not {'Truth'}, so build the exclusion set explicitly.
    res = pd.DataFrame(data={m: {tpMedErrorName: np.nan, tpefName: np.nan,
                                 mrdName: np.nan, corrName: np.nan, propName: np.nan}
                             for m in (set(methodDict.keys()) - {'Truth'})})

    import scipy as sp
    import scipy.stats

    for k in methodDict:
        if k.upper() != "TRUTH":
            # Spearman correlation between the truth and this method's estimates
            c = sp.stats.spearmanr(M["{}_Truth".format(measure)], M["{}_{}".format(measure, k)])[0]
            res.loc[corrName, k] = c
            # Mean of the absolute relative differences
            mrd, _ = AnalysisUtils.relDiff("{}_Truth".format(measure), "{}_{}".format(measure, k), M)
            res.loc[mrdName, k] = mrd["relDiff"].abs().mean()

            pc = AnalysisUtils.proportionalityCorrelation("{}_Truth".format(measure), "{}_{}".format(measure, k), M)
            res.loc[propName, k] = pc

            # Restrict to "true positives": entries truly expressed at >= 1 unit
            tpind = M[M["{}_Truth".format(measure)] >= 1]
            y = tpind["{}_{}".format(measure, k)]
            x = tpind["{}_Truth".format(measure)]
            ef = 10.0  # percent error above which a prediction counts as erroneous
            re = (y - x) / x
            are = 100.0 * (y - x).abs() / x
            tpef = len(are[are > ef]) / float(len(are))
            res.loc[tpefName, k] = tpef
            res.loc[tpMedErrorName, k] = re.median()

    # 'Truth' is already excluded from res above; keep the drop as a harmless guard.
    res.drop('Truth', axis=1, inplace=True, errors='ignore')
    print(res)
    res.to_csv(outfile+".csv")

    with open(outfile, 'w') as ofile:
        ofile.write(res.to_latex(float_format=lambda x: "{0:.2f}".format(x)))
    print("wrote {}".format(outpath))