def plot_mean_and_variance(unit, event, ests, model):
    """Plot empirical means and variances against major allele length,
    overlaying each with its fitted regression curve.

    ests  - dict with keys 'major' (major alleles), 'mean' (means),
            'var' (variances)
    model - a tuple (mean model, variance model)

    TODO: add error bars
    """
    fit_xs = range(min(ests['major']), max(ests['major']) + 1)
    mean_model, var_model = model

    def _panel(values, fitted, title):
        # One scatter panel with the model's predicted curve drawn over it.
        rpy.r.plot(y=[na(v) for v in values], x=ests['major'],
                   xlab="Major allele", ylab="Freq", main=title)
        rpy.r.lines(predict_lm(fitted, fit_xs, log), x=fit_xs, col="blue")

    _panel(ests['mean'], mean_model,
           "%s, %s: mean vs. major allele" % (unit, event))
    _panel(ests['var'], var_model,
           "%s, %s: variance vs. major allele" % (unit, event))
def model_mean_and_variance(meanvar_ests):
    """Fit weighted regression models of mean and variance as functions of
    log(major allele length).

    NOTE: since alleles are already normalized to the major allele (e.g.,
    allele len=0 is the major allele), we're modeling error of the means
    and variances.

    LATER: Use several regression formulae to see how things look and
    choose the best fit?  For now, the log regression seems (simply by
    eye) to be the better fit.

    NOTE(review): a later function in this file reuses this exact name and
    shadows this definition — confirm which one callers should get.
    """
    # Weight each data point by the number of observed sites for its
    # major allele.
    site_counts = meanvar_ests['count']
    frame = rpy.r.data_frame(major=meanvar_ests['major'],
                             mean=[na(m) for m in meanvar_ests['mean']],
                             var=[na(v) for v in meanvar_ests['var']])
    fit_mean = rpy.r.lm(rpy.r("mean ~ log(major)"),
                        data=frame, weights=site_counts)
    fit_var = rpy.r.lm(rpy.r("var ~ log(major)"),
                       data=frame, weights=site_counts)
    return fit_mean, fit_var
def _weighted_mean_var(pairs):
    """Frequency-weighted sample mean and variance of (allele, freq) pairs.

    Returns (mean, var) using the unbiased (n-1) variance denominator, or
    (None, None) when there are one or zero observations (variance is
    undefined there; the original code crashed with ZeroDivisionError on
    the "total" path in that case).
    """
    n = sum(freq for _, freq in pairs)
    if n <= 1:
        return None, None
    mean = sum(freq * float(allele) for allele, freq in pairs) / n
    var = sum(freq * (float(allele) - mean) ** 2
              for allele, freq in pairs) / (n - 1)
    return mean, var

def model_mean_and_variance(binned_data):
    """Given a histogram of observed data (binned or not), estimate the mean
    and variance as functions of major allele length, then use linear
    regression to model the mean and variance.  These fitted curves will be
    used to construct PMFs for each major allele length.

    binned_data - dict mapping major allele length -> dict(allele -> freq)

    NOTE(review): no regression is actually fitted in this body — it only
    computes empirical estimates and plots them; confirm against the
    earlier, same-named function that this redefines (and shadows).
    """
    emp_values = []  # tuple(major_allele, mean, var, gmean, gvar, lmean, lvar)
    for major_allele, data in binned_data.items():
        # Total, gain-only (allele > 0) and loss-only (allele < 0) estimates
        # all share one guarded helper; previously the total path had no
        # guard and the formula was duplicated three times.
        mean_tot, var_tot = _weighted_mean_var(list(data.items()))
        mean_gain, var_gain = _weighted_mean_var(
            [(allele, freq) for allele, freq in data.items() if allele > 0])
        mean_loss, var_loss = _weighted_mean_var(
            [(allele, freq) for allele, freq in data.items() if allele < 0])
        emp_values.append((int(major_allele), mean_tot, var_tot,
                           mean_gain, var_gain, mean_loss, var_loss))

    # These are just for getting plot boundaries.  Use `is not None`, not
    # truthiness: a legitimate estimate of 0.0 must survive the filter or
    # the ylim range comes out wrong.
    all_means = [x for tup in emp_values
                 for x in (tup[1], tup[3], tup[5]) if x is not None]
    all_vars = [x for tup in emp_values
                for x in (tup[2], tup[4], tup[6]) if x is not None]

    rpy.r.layout(rpy.r.matrix([1, 2], ncol=1))
    # Plots for visual inspection, move elsewhere
    ylim_mean = [min(all_means), max(all_means)]
    x_axis = [tup[0] for tup in emp_values]
    # Total mean
    rpy.r.plot([na(tup[1]) for tup in emp_values], x=x_axis, ylim=ylim_mean,
               main="Empirical mean error vs. major allele length",
               xlab="Major allele length (units)",
               ylab="Empirical mean error")
    # Mean for gain alleles
    rpy.r.lines([na(tup[3]) for tup in emp_values], x=x_axis, col="green")
    # Mean for loss alleles
    rpy.r.lines([na(tup[5]) for tup in emp_values], x=x_axis, col="red")
    rpy.r.legend("bottomleft", horiz=True, legend=["Total", "Gain", "Loss"],
                 fill=["black", "green", "red"])
    # Total variance (log-scale y; variances are expected positive here)
    ylim_var = [min(all_vars), max(all_vars)]
    rpy.r.plot([na(tup[2]) for tup in emp_values], x=x_axis, ylim=ylim_var,
               log="y",
               main="Empirical variance of error vs. major allele length",
               xlab="Major allele length (units)",
               ylab="Empirical variance of error")
    # Gain variance
    rpy.r.lines([na(tup[4]) for tup in emp_values], x=x_axis, col="green")
    # Loss variance
    rpy.r.lines([na(tup[6]) for tup in emp_values], x=x_axis, col="red")
    rpy.r.legend("topleft", horiz=True, legend=["Total", "Gain", "Loss"],
                 fill=["black", "green", "red"])