Example No. 1
def plot_mean_and_variance(unit, event, ests, model):
    """ests - dict(major: major alleles, mean: means, var: variances
    model   - a tuple(mean model, variance model)
    TODO: add error bars"""
    x_axis = range(min(ests['major']), max(ests['major']) + 1)
    mean_model, var_model = model
    rpy.r.plot(y=[ na(x) for x in ests['mean'] ],
        x=ests['major'], xlab="Major allele", ylab="Mean error",
        main="%s, %s: mean vs. major allele" % (unit, event))
    rpy.r.lines(predict_lm(mean_model, x_axis, log), x=x_axis, col="blue")

    rpy.r.plot(y=[ na(var) for var in ests['var'] ],
        x=ests['major'], xlab="Major allele", ylab="Variance of error",
        main="%s, %s: variance vs. major allele" % (unit, event))
    rpy.r.lines(predict_lm(var_model, x_axis, log), x=x_axis, col="blue")
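
Example No. 1 leans on two helpers that are not shown here: na(), which apparently maps Python None to a value R will treat as missing, and predict_lm(), which evaluates a fitted linear model over new x values after applying a transform (a log function, presumably math.log, is passed in above). A minimal sketch of what they might look like; the coefficient handling is an assumption about how rpy exposes the fitted lm, not the original helpers:

import math

def na(value, missing=float('nan')):
    # Hypothetical helper: substitute NaN for None so the series can be handed
    # to R's plot()/lines(); R skips NaN points the same way it skips NA.
    return missing if value is None else value

def predict_lm(model, xs, transform=math.log):
    # Hypothetical helper: evaluate y = b0 + b1 * transform(x) over xs.  We
    # assume the rpy-converted lm result carries a 'coefficients' entry with
    # the intercept first and the slope on the transformed predictor second;
    # adjust the extraction to match your rpy conversion mode.
    coefs = list(model['coefficients'])
    b0, b1 = coefs[0], coefs[1]
    return [b0 + b1 * transform(x) for x in xs]
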
Example No. 2
def model_mean_and_variance(meanvar_ests):
    """Regression models of mean and var as functions of major allele len.
    NOTE: since alleles are already normalized to the major allele (e.g.,
    allele len=0 is the major allele), we're modeling error of the means
    and variances.
    LATER: Use several regression formulae to see how things look and choose
    the best fit?  For now, the log regression seems (simply by eye) to be
    the better fit."""
    # Weights are just the number of observed sites for each major allele
    weights = meanvar_ests['count']
    adjmean = [ na(mean) for mean in meanvar_ests['mean'] ]
    adjvar = [ na(var) for var in meanvar_ests['var'] ]
    lmdata = rpy.r.data_frame(major=meanvar_ests['major'], mean=adjmean, var=adjvar)
    meanmodel = rpy.r.lm(rpy.r("mean ~ log(major)"), data=lmdata,
        weights=weights)
    varmodel = rpy.r.lm(rpy.r("var ~ log(major)"), data=lmdata,
        weights=weights)
    return meanmodel, varmodel
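
If the fitted models are kept as R objects rather than converted to Python structures, they can be handed straight back to R's predict() to get the modeled mean and variance at new major-allele lengths. A hedged usage sketch, assuming classic rpy's NO_CONVERSION mode and the function above; the input and variable names are illustrative only:

# Sketch only: query the fitted log-regressions at new major-allele lengths.
# Assumes classic rpy, where NO_CONVERSION keeps the lm results as R objects
# that can be passed back into R functions such as predict().
rpy.set_default_mode(rpy.NO_CONVERSION)
meanmodel, varmodel = model_mean_and_variance(meanvar_ests)
newdata = rpy.r.data_frame(major=range(2, 21))
predicted_means = rpy.r.predict(meanmodel, newdata=newdata)  # R object; convert as needed
predicted_vars = rpy.r.predict(varmodel, newdata=newdata)
rpy.set_default_mode(rpy.BASIC_CONVERSION)
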
Example No. 3
def model_mean_and_variance(binned_data):
    """Given a histogram of observed data (binned or not), estimate the mean
    and variance as functions of major allele length, then use linear
    regression to model the mean and variance.  These fitted curves will be
    used to construct PMFs for each major allele length."""

    emp_values = []  # tuple(major_allele, mean, var, gmean, gvar, lmean, lvar)
    for major_allele, data in binned_data.iteritems():
        N_tot = sum(freq for _, freq in data.iteritems())
        mean_tot = sum(freq*float(allele) for allele, freq in data.items()) / N_tot
        var_tot = sum(freq*(float(allele) - mean_tot)**2 for allele, freq in data.items()) / (N_tot - 1)

        gains = [ (allele, freq) for allele, freq in data.items() if allele > 0 ]
        N_gain = sum(freq for _, freq in gains)
        if N_gain > 1:
            mean_gain = sum(freq*float(allele) for allele, freq in gains) / N_gain
            var_gain = sum(freq*(float(allele) - mean_gain)**2 for allele, freq in gains) / (N_gain - 1)
        else:
            mean_gain = None
            var_gain = None

        losses = [ (allele, freq) for allele, freq in data.items() if allele < 0 ]
        N_loss = sum(freq for _, freq in losses)
        if N_loss > 1:
            mean_loss = sum(freq*float(allele) for allele, freq in losses) / N_loss
            var_loss = sum(freq*(float(allele) - mean_loss)**2 for allele, freq in losses) / (N_loss - 1)
        else:
            mean_loss = None
            var_loss = None

        emp_values.append((int(major_allele), mean_tot, var_tot, mean_gain, var_gain, mean_loss, var_loss))

    # These are just for getting plot boundaries
    all_means = sum([ [ tup[1], tup[3], tup[5] ] for tup in emp_values ], [])
    all_means = [ x for x in all_means if x is not None ]  # drop Nones
    all_vars = sum([ [ tup[2], tup[4], tup[6] ] for tup in emp_values ], [])
    all_vars = [ x for x in all_vars if x is not None ]  # drop Nones

    rpy.r.layout(rpy.r.matrix([1,2], ncol=1))
    # Plots for visual inspection, move elsewhere
    ylim_mean = [ min(all_means), max(all_means) ]
    x_axis = [ tup[0] for tup in emp_values ]
    # Total mean
    rpy.r.plot([ na(tup[1]) for tup in emp_values ],
               x=x_axis, ylim=ylim_mean,
               main="Empirical mean error vs. major allele length",
               xlab="Major allele length (units)",
               ylab="Empirical mean error")
    # Mean for gain alleles
    rpy.r.lines([ na(tup[3]) for tup in emp_values ], x=x_axis, col="green")
    # Mean for loss alleles
    rpy.r.lines([ na(tup[5]) for tup in emp_values ], x=x_axis, col="red")
    rpy.r.legend("bottomleft", horiz=True, legend=["Total", "Gain", "Loss"],
        fill=["black", "green", "red"])

    # Total variance
    ylim_var = [ min(all_vars), max(all_vars) ]
    rpy.r.plot([ na(tup[2]) for tup in emp_values ],
               x=x_axis, ylim=ylim_var, log="y",
               main="Empirical variance of error vs. major allele length",
               xlab="Major allele length (units)",
               ylab="Empirical variance of error")
    # Gain variance
    rpy.r.lines([ na(tup[4]) for tup in emp_values ], x=x_axis, col="green")
    # Loss variance
    rpy.r.lines([ na(tup[6]) for tup in emp_values ], x=x_axis, col="red")
    rpy.r.legend("topleft", horiz=True, legend=["Total", "Gain", "Loss"],
        fill=["black", "green", "red"])