def sample_distribution(self):
    ## pilot sample
    read_lengths = []
    # max_tlen = 0
    #bam_filtered = ifilter(lambda r: is_proper_aligned_unique_innie(r), self.bamfile)
    isize_list = []
    mcmc_dict = {}
    #nr_reads = 0
    #nr_mapped = 0
    # nr_proper_mapped = 0

    for sample_nr, read in enumerate(self.bamfile):
        ## add to insert size distribution calculation if proper pair
        if is_proper_aligned_unique_innie(read) and not read.is_reverse:
            self.param.nr_proper_mapped += 2  # add the read plus its mate since the mate does not enter here
            assert read.tlen > 0
            read_lengths.append(read.rlen)
            isize_list.append(read.tlen)
            if read.tid in mcmc_dict:
                mcmc_dict[read.tid].append(read.tlen)
            else:
                mcmc_dict[read.tid] = [read.tlen]
            # if abs(read.tlen) > max_tlen:
            #     max_tlen = abs(read.tlen)
        if sample_nr >= SAMPLE_SIZE:
            break

    # for sample_nr, read in enumerate(bam_filtered):
    #     ## add to insert size distribution calculation if proper pair
    #     if is_proper_aligned_unique_innie(read) and not read.is_reverse:
    #         assert read.tlen > 0
    #         read_lengths.append(read.rlen)
    #         isize_list.append(read.tlen)
    #         # if abs(read.tlen) > max_tlen:
    #         #     max_tlen = abs(read.tlen)
    #     if sample_nr >= SAMPLE_SIZE:
    #         break

    # for read, mate_pos in fb.proper_read_isize(self.bamfile, self.param.lib_min, self.param.lib_max):
    #     sample_nr += 1
    #     ## add to insert size distribution calculation if proper pair
    #     if read.tlen >= 0:
    #         #if is_proper_aligned_unique_innie(read) and read.is_read1:
    #         read_lengths.append(read.rlen)
    #         isize_list.append(read.tlen)
    #         # if abs(read.tlen) > max_tlen:
    #         #     max_tlen = abs(read.tlen)
    #     if sample_nr >= SAMPLE_SIZE:
    #         break

    self.bamfile.reset()
    #max_tlen = max_tlen + 1000
    self.read_length = sum(read_lengths) / float(len(read_lengths))

    ## sample proper reads
    # isize_list = []
    # for sample_nr, read in enumerate(proper_read_isize_iter(self.bampath, self.read_length, max_tlen)):
    #     isize_list.append(read)
    #     if sample_nr > SAMPLE_SIZE:
    #         break

    params = dict()
    params["sample-nr"] = sample_nr

    # keep only pairs with a positive inner span (isize > 2 * read length)
    isize_list = filter(lambda x: 0 < x - 2 * self.read_length, isize_list)
    n_isize = float(len(isize_list))
    mean_isize = sum(isize_list) / n_isize
    std_dev_isize = (sum(list(map((lambda x: x ** 2 - 2 * x * mean_isize + mean_isize ** 2), isize_list))) / (n_isize - 1)) ** 0.5
    params["mu-raw"] = mean_isize
    params["sd-raw"] = std_dev_isize

    # iteratively trim extreme insert size observations until none are removed
    extreme_obs_occur = True
    while extreme_obs_occur:
        #print 'HERE!!'
        extreme_obs_occur, filtered_list = AdjustInsertsizeDist(mean_isize, std_dev_isize, isize_list)
        n_isize = float(len(filtered_list))
        mean_isize = sum(filtered_list) / n_isize
        std_dev_isize = (sum(list(map((lambda x: x ** 2 - 2 * x * mean_isize + mean_isize ** 2), filtered_list))) / (n_isize - 1)) ** 0.5
        isize_list = filtered_list

    self.min_isize, self.max_isize = min(isize_list), max(isize_list)

    # filter outliers
    for ref in mcmc_dict.keys():
        ref_isizes = mcmc_dict[ref]
        mcmc_dict[ref] = list(filter(lambda x: self.min_isize <= x <= self.max_isize, ref_isizes))

    params["mu-filtered"] = mean_isize
    params["sd-filtered"] = std_dev_isize
    params["min-isize"] = self.min_isize
    params["max-isize"] = self.max_isize
    params["read-length"] = self.read_length

    self.nobs = n_isize
    self.mean = mean_isize
    self.stddev = std_dev_isize
    self.full_ECDF = ECDF(isize_list)
    self.adjustedECDF_no_gap = None
    self.adjustedECDF_no_gap = self.get_correct_ECDF()
    params["mu-adjusted"] = self.adjusted_mean
    params["sd-adjusted"] = self.adjusted_stddev

    samples = min(SAMPLE_SIZE, len(isize_list))
    # ess = self.effectiveSampleSize(mcmc_dict)  #isize_list[:samples]) # mcmc_dict )
    # self.ess_ratio = ess / float(sum(map(lambda x: len(mcmc_dict[x]), mcmc_dict)))
    params["ess"] = 1  #self.ess_ratio

    reference_lengths = map(lambda x: int(x), self.bamfile.lengths)
    ref_list = zip(self.bamfile.references, reference_lengths)
    total_basepairs = sum(reference_lengths)
    self.param.total_basepairs = total_basepairs
    params["genome-length"] = total_basepairs
    params["contigs"] = []
    for ref, length in ref_list:
        params["contigs"].append({"name": ref, "length": length})

    json.dump(params, self.lib_file, sort_keys=True, indent=4, separators=(',', ': '))

    params = dict()
    if self.param.nr_reads:
        reads = dict()
        reads["total"] = self.param.nr_reads
        reads["mapped"] = self.param.nr_mapped
        reads["properly-mapped"] = self.param.nr_proper_mapped
        reads["mapped-percentage"] = self.param.nr_mapped / float(self.param.nr_reads)
        reads["properly-mapped-percentage"] = self.param.nr_proper_mapped / float(self.param.nr_reads)
        reads["coverage"] = self.param.nr_reads / float(total_basepairs)
        reads["coverage-mapped"] = self.param.nr_mapped / float(total_basepairs)
        reads["coverage-properly-mapped"] = self.param.nr_proper_mapped / float(total_basepairs)
        params["reads"] = reads

    info = dict()
    info["proper-samples"] = samples
    info["ess-proper-samples"] = 1  #ess
    info["ess-ratio"] = 1  #self.ess_ratio
    coverage = self.read_length * samples * 2 / float(total_basepairs)
    info["mean-coverage-proper"] = coverage
    inner_span_coverage = coverage * (self.mean - 2 * self.read_length) / (2 * self.read_length)
    info["average-theoretical-inner-span-coverage"] = inner_span_coverage
    info["mu-full-lib"] = self.mean
    info["sd-full-lib"] = self.stddev
    info["mu-empirical"] = self.adjusted_mean
    info["sd-empirical"] = self.adjusted_stddev

    mu_naive = self.mean + self.stddev ** 2 / float(self.mean - 2 * self.read_length + 1)
    sigma_naive = math.sqrt(self.stddev ** 2 - self.stddev ** 4 / (self.mean - 2 * self.read_length + 1) ** 2)
    info["mu-naive"] = mu_naive
    info["sd-naive"] = sigma_naive

    mu_sophisticated = param_est.mean_given_d(self.mean, self.stddev, self.read_length, total_basepairs, total_basepairs, 0)
    sigma_sophisticated = param_est.stddev_given_d(self.mean, self.stddev, self.read_length, total_basepairs, total_basepairs, 0)
    info["mu-sophisticated"] = mu_sophisticated
    info["sd-sophisticated"] = sigma_sophisticated

    theoretical_margin_of_error = NORMAL_QUANTILE_TWO_SIDED_95 * self.stddev / math.sqrt(inner_span_coverage)
    info["theoretical-error-margin-two-sided-95"] = theoretical_margin_of_error
    params["extra-info"] = info

    json.dump(params, self.stats_file, sort_keys=True, indent=4, separators=(',', ': '))
    self.stats_file.close()

    if self.param.plots:
        outfile = os.path.join(self.param.plotfolder, 'isize.eps')
        plot_isize(isize_list, outfile)
        outfile = os.path.join(self.param.plotfolder, 'fitted_params_isize.eps')
        fit.main(isize_list, outfile)
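# Worked numeric sketch of the mu_naive / sigma_naive formulas used in
# sample_distribution, with assumed library values (mean 300, stddev 50,
# 100 bp reads); purely illustrative, not taken from any real data set.
def _naive_span_bias_sketch(mean=300.0, stddev=50.0, read_length=100.0):
    """With the assumed values this returns roughly (324.8, 43.4),
    i.e. mu_naive = 300 + 50**2 / 101 and sigma_naive = sqrt(50**2 - 50**4 / 101**2)."""
    inner = mean - 2 * read_length + 1
    mu_naive = mean + stddev ** 2 / inner
    sigma_naive = math.sqrt(stddev ** 2 - stddev ** 4 / inner ** 2)
    return mu_naive, sigma_naive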
def plot_bp_specific_distr(infile, param):
    means = {}
    stddevs = {}
    for i in [2, 51, 201, 501]:
        means[i] = []
        stddevs[i] = []

    avg_mean = 0
    avg_stddev = 0
    avg_spancov = 0
    tot_pos = 0
    for line in infile:
        [ref, pos, n_obs, mean, sigma] = line.strip().split()
        n_obs = int(float(n_obs))
        mean = float(mean)
        sigma = float(sigma)
        if n_obs > 2:
            avg_mean += mean
            avg_stddev += sigma
            avg_spancov += n_obs
            tot_pos += 1

        # bin positions by span coverage: (2, 50], (50, 200], (200, 500], (500, inf)
        if 2 < n_obs <= 50:
            means[2].append(mean)
            stddevs[2].append(sigma)
        elif 50 < n_obs <= 200:
            means[51].append(mean)
            stddevs[51].append(sigma)
        elif 200 < n_obs <= 500:
            means[201].append(mean)
            stddevs[201].append(sigma)
        elif 500 < n_obs:
            means[501].append(mean)
            stddevs[501].append(sigma)

    # print len(m_1), len(m_2), len(m_3), len(m_4)
    avg_mean = avg_mean / float(tot_pos)
    avg_stddev = avg_stddev / float(tot_pos)
    avg_spancov = avg_spancov / float(tot_pos)
    print avg_mean, avg_stddev, avg_spancov

    nr_obs, mu = zip(*filter(lambda x: means[x[0]], means.iteritems()))
    nr_obs, sigma = zip(*filter(lambda x: stddevs[x[0]], stddevs.iteritems()))
    #nr_obs = list(nr_obs)
    #nr_obs.sort()
    labels = []
    for low in nr_obs:
        labels.append(">{0} obs".format(low))
    #labels.append(">{0} obs".format(high))

    plt.hist(mu, stacked=True, bins=100, log=True, label=labels)
    plt.ylabel('Frequency (log scale)')
    plt.xlabel('isize mean of mates spanning over position')
    title = "Bp specific mean insert size (avg. over genome = %.2f)" % (avg_mean)
    plt.title(title)
    plt.legend()
    out = os.path.join(param.plotfolder, 'bp_specific_mean.eps')
    plt.savefig(out)
    plt.close()

    plt.hist(sigma, stacked=True, bins=100, log=True, label=labels)
    plt.ylabel('Frequency (log scale)')
    plt.xlabel('isize standard deviation of mates spanning over position')
    title = "Bp specific stddev of insert size (avg. over genome = %.2f)" % (avg_stddev)
    plt.title(title)
    plt.legend()
    out = os.path.join(param.plotfolder, 'bp_specific_stddev.eps')
    plt.savefig(out)
    plt.close()

    stddevs = {}
    out = os.path.join(param.plotfolder, 'fitted_params_avg_span.eps')
    bp_list = []
    for key in means:
        for mean in means[key]:
            bp_list.append(mean)
    fit.main(bp_list, out)
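# Usage sketch (illustrative, not part of the original pipeline): the stats
# file parsed above is assumed to be whitespace separated with one row per
# position, "ref pos n_obs mean sigma". The class and helper below are
# hypothetical stand-ins for however the real pipeline builds its param object.
class _PlotParamsSketch(object):
    """Hypothetical minimal container exposing only the plotfolder attribute
    that plot_bp_specific_distr reads."""
    def __init__(self, plotfolder):
        self.plotfolder = plotfolder


def _plot_bp_specific_distr_example(stats_path, plotfolder):
    """Open a bp-specific stats file and write the .eps plots into plotfolder."""
    with open(stats_path) as infile:
        plot_bp_specific_distr(infile, _PlotParamsSketch(plotfolder))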