def model_error(model_counts, bin=1):
    """Given the error profile of the X and Y chromosome, derive an error
    model for stutter introduced into the library during MDA.

    Arguments:
      model_counts -- mapping of unit -> {major allele -> counts}.  The major
                      allele keys must be convertible with int().
      bin          -- width, in repeat units, of the neighbourhood of major
                      alleles whose histograms bin_data() combines
                      (major_allele - bin ... major_allele + bin).

    Side effects:
      * For every (unit, major allele) pair the binned histogram is assigned
        into the R session as "<unit>.<allele>.freq" and
        "<unit>.<allele>.allele".
      * The R workspace is saved to "hists.RData".
      * The interpreter is then terminated with exit status 1, so this
        function never returns normally.
    """
    # Step 1. since many major alleles do not have enough observations in
    # the data to make a good model, combine histograms from nearby major
    # alleles to increase the amount of data available while also retaining
    # some of the unique properties of the major allele.
    # Combine histograms for:
    #   | major_allele - bin | ... | major_allele + bin |
    # IMPORTANT NOTE: the bin parameter is in repeat units.  Therefore, if
    # we observe a major allele at 39 repeat units but NO alleles at 38 or
    # 40, for bin=1 the 39 histogram will get no extra data.  That is, there
    # is a distinction between the n closest alleles and alleles within n
    # repeat units.
    binned_model = defaultdict(dict)
    for unit, counts_by_major_allele in model_counts.iteritems():
        # bin_data() is handed the whole per-unit mapping, so only the keys
        # are needed from this iteration.
        for major_allele in counts_by_major_allele:
            major_allele = int(major_allele)
            binned_model[unit][major_allele] = \
                bin_data(major_allele, bin, counts_by_major_allele)

    # Plotting code (an 8-column PNG grid of bar plots per unit) was
    # commented out here; only the export of the histograms to R remains.
    for unit, binned_data_by_major_allele in binned_model.iteritems():
        print("opening layout for %s, %d fits" %
              (unit, len(binned_data_by_major_allele)))
        for major_allele, binned_data in \
                binned_data_by_major_allele.iteritems():
            # Frequencies and the absolute allele sizes they belong to are
            # exported as two parallel R vectors; the keys of binned_data are
            # added to the major allele to obtain the sizes.
            rpy.r.assign("%s.%d.freq" % (unit, major_allele),
                         binned_data.values())
            rpy.r.assign("%s.%d.allele" % (unit, major_allele),
                         [major_allele + a for a in binned_data])

    rpy.r.save_image(file="hists.RData")
    # sys.exit() rather than the site-provided exit(): the latter is only
    # meant for the interactive interpreter and may be absent (e.g. python -S).
    # NOTE(review): exiting with status 1 after a successful save looks like
    # a development stop -- confirm whether Step 2 below is meant to run.
    sys.exit(1)

    # Step 2. with our binned profiles, we can determine the empirical mean
    # and variance for errors for each major allele size.  This needs to be
    # done separately for gains and losses since the final piecewise PMF will
    # be constructed with two different negbin models--one for gain and one
    # for loss.
    # NOTE(review): unreachable while the sys.exit(1) above is in place.
    for unit, binned_data in binned_model.iteritems():
        model_mean_and_variance(binned_data)
        # Blocks until one character is available on stdin.
        sys.stdin.read(1)
def model_error(model_counts, bin=2, save_rdata=None, interactive=False):
    """Derive an error model for the stutter introduced into the library
    during MDA from the error profile of the X and Y chromosome.

    Arguments:
      model_counts -- mapping of unit -> {major allele -> counts}.  The major
                      allele keys must be convertible with int().
      bin          -- width, in repeat units, of the neighbourhood of major
                      alleles whose histograms bin_data() combines.
      save_rdata   -- when truthy, the R workspace is saved to this file
                      name once all units have been processed.
      interactive  -- forwarded unchanged to PlotContext and to nothing else.
                      NOTE(review): presumably selects on-screen plotting
                      over the PNG file; confirm against PlotContext.

    All intermediate results (raw and binned histograms, estimates and fitted
    models) are assigned into the R session regardless of save_rdata.  The
    fitted models are not returned; the function returns None.
    """
    # Step 1.  Many major alleles have too few observations for a good
    # model, so histograms of nearby major alleles are pooled:
    #   | major_allele - bin | ... | major_allele + bin |
    # IMPORTANT: bin is measured in repeat units.  A major allele at 39
    # repeat units with no alleles observed at 38 or 40 gains nothing from
    # bin=1; "within n repeat units" is not "the n closest alleles".
    raw_hists = defaultdict(dict)
    pooled_hists = defaultdict(dict)
    for unit, per_allele_counts in model_counts.iteritems():
        for allele_key, _unused in per_allele_counts.iteritems():
            allele = int(allele_key)
            # A bin extension of 0 yields the unbinned data for that allele.
            raw_hists[unit][allele] = bin_data(allele, 0, per_allele_counts)
            # NB. bin_data() needs the whole per-unit mapping, not just this
            # allele's own counts.
            pooled_hists[unit][allele] = \
                bin_data(allele, bin, per_allele_counts)

        # Per-unit histogram plotting (plot_binned_data) is currently
        # disabled; the histograms are only exported to R.
        rpy.r.assign('%s.raw.data' % unit, raw_hists[unit])
        rpy.r.assign('%s.binned.data' % unit, pooled_hists[unit])

    # Step 2.  From the binned profiles, estimate the empirical mean and
    # variance of the errors for each major allele size.  Gains and losses
    # are handled separately because the final piecewise PMF is built from
    # two different negbin models--one for gain and one for loss.
    model = defaultdict(dict)
    for unit, unit_hists in pooled_hists.iteritems():
        tot_ests, gain_ests, loss_ests = estimate_mean_and_variance(unit_hists)
        exported_estimates = (
            ("%s.tot.ests", tot_ests),
            ("%s.gain.ests", gain_ests),
            ("%s.loss.ests", loss_ests),
        )
        for name_template, estimates in exported_estimates:
            rpy.r.assign(name_template % unit, estimates)

        print("unit=" + unit)

        gain_model = model_mean_and_variance(gain_ests)
        model[unit]['gain'] = gain_model
        loss_model = model_mean_and_variance(loss_ests)
        model[unit]['loss'] = loss_model
        rpy.r.assign("%s.gain.model" % unit, gain_model)
        rpy.r.assign("%s.loss.model" % unit, loss_model)

        # One 2x2 figure per unit holding the gain and loss plots.
        png_name = "%s_mean_and_var.png" % unit
        with PlotContext(interactive, rpy.r.png, filename=png_name,
                         width=1000, height=1000):
            rpy.r.layout(rpy.r.matrix([1, 2, 3, 4], ncol=2))
            rpy.r.par(mar=[3, 2, 2, 1])
            plot_mean_and_variance(unit, "gain", gain_ests, gain_model)
            plot_mean_and_variance(unit, "loss", loss_ests, loss_model)

    if not save_rdata:
        return
    rpy.r.save_image(file=save_rdata)