# Plot the genomic coverage survival curve (fraction of sites with coverage
# >= D) for this species, then load pangenome data and build unnormalized
# survival curves for the marker-coverage summaries used downstream.

#x0 = marker_coverages[i]
xs, CDFs = stats_utils.calculate_unnormalized_CDF_from_histogram(sample_coverage_histogram)

pylab.plot(xs, CDFs[-1] - CDFs, '-')
pylab.semilogx([1], [1])
pylab.xlabel('Coverage, D')
pylab.ylabel('Fraction sites with coverage >= D')
#pylab.xlim([1e-01,1e01])
pylab.savefig('%s/%s_genomic_coverage_distribution.pdf' % (parse_midas_data.analysis_directory, species_name), bbox_inches='tight', transparent=True)

pylab.figure(2)

median_coverages.sort()
median_coverage_xs, median_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(median_coverages, min_x=0.1, max_x=10000)

# Load gene coverage information for species_name
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name)
sys.stderr.write("Done!\n")

# Clip so zero coverages still appear on a log-scaled survival plot.
marker_coverages = numpy.clip(marker_coverages, 2e-01, 1e04)
marker_coverages.sort()

marker_coverage_xs, marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(marker_coverages, min_x=0.1, max_x=10000)
median_marker_coverage_xs, median_marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(median_marker_coverages, min_x=0.1, max_x=10000)
mean_marker_coverage_xs, mean_marker_coverage_survivals = stats_utils.calculate_unnormalized_survival_from_vector(mean_marker_coverages, min_x=0.1, max_x=10000)
# Fragment (torn at BOTH ends): collects per-pair SNP substitution rates and,
# per sample, the minimum ("closest") rate; pools both across the outer loop
# (not visible here) and plots their unnormalized survival curves, with the
# closest-rate curve labelled 'Between-host'.
# NOTE(review): opens with the tail of an indexing expression
# (`snp_samples[j]]:`) and closes inside an unfinished pylab.step(...) call,
# so the code is left byte-identical; Python 2 `print` statements.
# presumably min_substitution_rate is initialized above this chunk — TODO confirm
snp_samples[j]]: pair_snp_substitution_rates.append( snp_substitution_matrix[i, j]) if snp_substitution_matrix[i, j] < min_substitution_rate: min_substitution_rate = snp_substitution_matrix[i, j] closest_snp_substitution_rates.append(min_substitution_rate) all_closest_rates.extend(closest_snp_substitution_rates) all_pair_rates.extend(pair_snp_substitution_rates) print numpy.sort(all_closest_rates) print numpy.sort(all_pair_rates) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( all_closest_rates, min_x=1e-06, max_x=1e09) pylab.step(xs, ns / ns[0], '-', color='r', linewidth=0.5, alpha=0.5, label='Between-host', where='mid', zorder=2) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( all_pair_rates, min_x=1e-06, max_x=1e09) pylab.step(xs, ns / ns[0], '-',
# Fragment (torn at end): prints summary stats for low-divergence between-host
# gene prevalences, histograms them (plot calls commented out), then draws
# CDF-style curves (1 - survival) for within-host (blue) and between-host
# (red) gene prevalences on prevalence_axis.
# NOTE(review): chunk ends inside an unfinished
# stats_utils.calculate_unnormalized_survival_from_vector( call, so the code
# is left byte-identical; Python 2 `print` statements.
print low_divergence_between_host_gene_prevalences.mean() print len(low_divergence_between_host_gene_prevalences), len( between_host_gene_prevalences) h = numpy.histogram(low_divergence_between_host_gene_prevalences, bins=prevalence_bins)[0] #prevalence_axis.plot(prevalence_locations, h*1.0/h.sum(),'r.-',label=('d<%g' % modification_divergence_threshold), alpha=0.5,markersize=3) h = numpy.histogram(within_host_gene_prevalences, bins=prevalence_bins)[0] #prevalence_axis.plot(prevalence_locations, h*1.0/h.sum(),'b.-',label='Within-host',markersize=3) print len(within_host_gene_prevalences), "within-host changes" # CDF version xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( within_host_gene_prevalences) prevalence_axis.step(xs, 1 - ns * 1.0 / ns[0], 'b-', label='Within-host', zorder=2) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( between_host_gene_prevalences) prevalence_axis.step(xs, 1 - ns * 1.0 / ns[0], 'r-', label='Between-host', zorder=1) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
###################
#
# Prevalence
#
###################

# Set up the prevalence panel: survival curves of gene prevalence for the
# full pan genome (black), between-host differences (red), and within-host
# differences (green).
prevalence_axis = plt.Subplot(fig, outer_grid[0])
fig.add_subplot(prevalence_axis)

fig.suptitle(species_name)

prevalence_axis.set_ylabel('Fraction genes $\geq p$')
prevalence_axis.set_xlabel('Prevalence of gene, $p$')
prevalence_axis.set_xlim([0, 1])
prevalence_axis.set_ylim([0, 1])

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pangenome_prevalences)
prevalence_axis.step(xs, ns * 1.0 / ns[0], 'k-', label='Total pan genome')

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(between_host_gene_prevalences)
prevalence_axis.step(xs, ns * 1.0 / ns[0], 'r-', label='Between host differences')

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(within_host_gene_prevalences)
prevalence_axis.step(xs, ns * 1.0 / ns[0], 'g-', label='Within host differences')
# Fragment (torn at end): loads HMP subject/sample metadata, counts timepoints
# per subject, plots the survival curve "Num subjects with >= T timepoints",
# saves the figure, and begins loading global marker gene coverages.
# NOTE(review): chunk ends inside an unfinished
# parse_midas_data.parse_global_marker_gene_coverages( call, so the code is
# left byte-identical; Python 2 `print` statement near the end.
combination_type = "sample" # Load subject and sample metadata sys.stderr.write("Loading HMP metadata...\n") subject_sample_map = parse_midas_data.parse_subject_sample_map() sys.stderr.write("Done!\n") # Calculate num timepoints per sample num_timepoints_per_subject = [] for subject in subject_sample_map.keys(): num_timepoints_per_subject.append(len(subject_sample_map[subject].keys())) num_timepoints_per_subject.sort() num_timepoints_per_subject = numpy.array(num_timepoints_per_subject) num_timepoints, num_subjects = stats_utils.calculate_unnormalized_survival_from_vector( num_timepoints_per_subject) pylab.figure(1, figsize=(5, 3)) pylab.step(num_timepoints + 0.25, num_subjects, where='pre') pylab.semilogy([0], [1]) pylab.xlim([0.5, 9]) pylab.ylim([0.3, 300]) pylab.xlabel('Num timepoints, $T$') pylab.ylabel('Num subjects with $\geq T$') print len(num_timepoints_per_subject), max(num_timepoints_per_subject) pylab.savefig('%s/num_timepoints_per_subject.pdf' % parse_midas_data.analysis_directory, bbox_inches='tight') # Load marker gene coverages species_coverage_matrix, sample_list, species_list = parse_midas_data.parse_global_marker_gene_coverages(
min_copynum_distribution = gene_copynum_matrix[desired_gene_idxs, :].min( axis=0) for gene_name in desired_gene_names: gene_idx = numpy.nonzero(gene_names == gene_name)[0][0] gene_copynum_distribution = gene_copynum_matrix[gene_idx, :] print gene_copynum_matrix.shape, gene_copynum_distribution.shape #print gene_copynum_distribution xvalues, ns = stats_utils.calculate_unnormalized_survival_from_vector( gene_copynum_distribution, min_x=0, max_x=gene_copynum_distribution.max()) pylab.step(xvalues, ns, label=gene_name) #pylab.semilogy([4],[4]) xvalues, ns = stats_utils.calculate_unnormalized_survival_from_vector( min_copynum_distribution, min_x=0, max_x=min_copynum_distribution.max()) pylab.step(xvalues, ns, label='Both') pylab.legend(loc='upper right', frameon=False) pylab.savefig( '../morteza_collaboration/ben_figures/Alistipes_onderdonkii_gene_gain_HMP_prevalence.pdf', bbox_inches='tight')
print "Mean within host snps =", pooled_snp_change_distribution.mean() print "Median withon host snps =", numpy.median(pooled_snp_change_distribution) pooled_snp_change_distribution = numpy.clip(pooled_snp_change_distribution, 1e-01,1e08) pooled_twin_snp_change_distribution = numpy.clip(pooled_twin_snp_change_distribution, 1e-01,1e08) pooled_between_snp_change_distribution = numpy.clip(pooled_between_snp_change_distribution, 1e-01,1e08) pooled_min_between_snp_change_distribution = numpy.clip(pooled_min_between_snp_change_distribution, 1e-01,1e08) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_between_snp_change_distribution, min_x=1e-02, max_x=1e09) #pooled_snp_axis.step(xs,ns,'-',color='r',linewidth=0.5, alpha=0.5, label='Between-host', where='mid') xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_min_between_snp_change_distribution, min_x=1e-02, max_x=1e09) ymin = 1.0/ns[0] ymax = 1.3 pooled_snp_axis.loglog([1e-01,1e05],[ymin,ymin],'k:') pooled_snp_axis.set_ylim([1.0/ns[0],1.3]) pooled_snp_axis.fill_between([1e-01,modification_difference_threshold],[ymin,ymin],[ymax,ymax],color='#deebf7') pooled_snp_axis.fill_between([replacement_difference_threshold,1e05],[ymin,ymin],[ymax,ymax],color='#fee0d2') pooled_snp_axis.text(exp((log(1e05)+log(replacement_difference_threshold))/2), ymax*1.2, 'putative\nreplacement',fontsize=6,fontstyle='italic',ha='center',color='#fc9272') pooled_snp_axis.text(exp((log(1)+log(modification_difference_threshold))/2), ymax*1.2, 'putative\nmodification',fontsize=6,fontstyle='italic',ha='center',color='#9ecae1') #pooled_snp_axis.text(exp((log(modification_difference_threshold)+log(replacement_difference_threshold))/2), ymax*1.2, 'unclassified',fontsize=6,fontstyle='italic',ha='center')
# Fragment (torn at end): extends the binomial "fake" bootstrap pools, then
# plots survival curves of real doubleton-sharing probabilities restricted to
# pairs with more than min_opportunities opportunities — all pairs in black,
# closely related pairs in red.
# NOTE(review): chunk ends inside an unfinished sharing_axis.step(...) call,
# so the code is left byte-identical. `binomial` is presumably
# numpy.random.binomial imported elsewhere — TODO confirm.
bootstrapped_fake_low_ps.extend( binomial(sample_sizes, low_p) * 1.0 / sample_sizes) bootstrapped_fake_all_ps.extend( binomial(sample_sizes, all_p) * 1.0 / sample_sizes) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_low_ps, min_x=0,max_x=2) #sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (matched)',zorder=3) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_all_ps, min_x=0,max_x=2) #sharing_axis.step(xs,ns*1.0/ns[0],'k-',label='All (matched)',zorder=2) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_fake_low_ps, min_x=0,max_x=1) #sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (pooled)',zorder=1,alpha=0.5) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( real_all_ps[all_doubleton_opportunities > min_opportunities], min_x=0, max_x=2) sharing_axis.step(xs, ns * 1.0 / ns[0], 'k-', label='Between hosts (all)', zorder=1) #,alpha=0.5) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( real_low_ps[low_doubleton_opportunities > min_opportunities], min_x=0, max_x=2) sharing_axis.step(xs, ns * 1.0 / ns[0], 'r-', label='Between hosts\n(closely related)',
# Null distribution for per-sample haploid fractions: bootstrap haploid
# counts from a binomial with pooled probability pavg, then compare the
# bootstrapped fractions ('Null') against the observed fractions ('Obs')
# as normalized survival curves.
bootstrapped_haploid_countss = []
for bootstrap_idx in xrange(0, num_bootstraps):
    bootstrapped_haploid_countss.append(binomial(sample_highcoverage_counts, pavg))

pooled_bootstrapped_haploid_fractions = []
for bootstrap_idx in xrange(0, num_bootstraps):
    # Only samples with at least one high-coverage species contribute.
    covered = (sample_highcoverage_counts >= 1)
    draws = bootstrapped_haploid_countss[bootstrap_idx]
    pooled_bootstrapped_haploid_fractions.extend(draws[covered] * 1.0 / sample_highcoverage_counts[covered])
pooled_bootstrapped_haploid_fractions = numpy.array(pooled_bootstrapped_haploid_fractions)

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(pooled_bootstrapped_haploid_fractions)
haploid_cdf_axis.step(xs, ns * 1.0 / ns[0], '-', color='0.7', label='Null')

xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(sample_haploid_fractions[sample_highcoverage_counts >= 1])
haploid_cdf_axis.step(xs, ns * 1.0 / ns[0], '-', color=haploid_color, label='Obs')

haploid_cdf_axis.set_xlim([0, 1])
haploid_cdf_axis.legend(loc='upper right', frameon=False, numpoints=1)

#########
#
# Fragment (torn at end): styles the gene-difference axis (hide right spine,
# log x from 1 to 1e04), converts collected difference lists to numpy arrays,
# then plots CDF-style curves (1 - survival) for closely related pairs.
# NOTE(review): `low_divergence_gene_differences` is passed to BOTH
# snp_difference_axis and gene_difference_axis; the first call was presumably
# meant to use `low_divergence_snp_differences` (copy-paste suspect) —
# confirm before changing. Chunk ends inside an unfinished call, so the code
# is left byte-identical.
gene_difference_axis.spines['right'].set_visible(False) gene_difference_axis.get_xaxis().tick_bottom() gene_difference_axis.get_yaxis().tick_left() gene_difference_axis.semilogx([1, 1]) gene_difference_axis.set_xlim([1, 1e04]) gene_difference_axis.set_ylim([0, 1.174]) low_divergence_snp_differences = numpy.array(low_divergence_snp_differences) low_divergence_gene_differences = numpy.array(low_divergence_gene_differences) low_divergence_clock_null_gene_differences = numpy.array( low_divergence_clock_null_gene_differences) normal_divergence_gene_differences = numpy.array( normal_divergence_gene_differences) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( low_divergence_gene_differences, min_x=0.1, max_x=1e04) snp_difference_axis.step(xs, 1 - ns * 1.0 / ns[0], 'r-', label='Closely\nrelated', zorder=1) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( low_divergence_gene_differences, min_x=0.1, max_x=1e04) gene_difference_axis.step(xs, 1 - ns * 1.0 / ns[0], 'r-', label='Closely\nrelated', zorder=1) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(
# Save the pooled SFS figure in both formats, then plot survival curves of
# inconsistent allele frequencies against the unlinked (null) expectation.
pylab.savefig('%s/%s_pooled_sfs.pdf' % (parse_midas_data.analysis_directory, species_name), bbox_inches='tight')
pylab.savefig('%s/%s_pooled_sfs.png' % (parse_midas_data.analysis_directory, species_name), bbox_inches='tight', dpi=300)

pylab.figure(2, figsize=(3.42, 2))
pylab.suptitle(species_name)

#xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(polymorphic_freqs)
#pylab.step(xs,ns*1.0/ns[0],'b-',label='All polymorphisms')

# Null expectation from unlinked sites (grey, thin).
if len(null_inconsistent_freqs) > 0:
    xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(null_inconsistent_freqs)
    pylab.step(xs, ns * 1.0 / ns[0], '-', color='0.7', linewidth=0.5, label=('Unlinked expectation'))

# Observed inconsistent frequencies at divergence threshold max_clade_d (red).
if len(inconsistent_freqs) > 0:
    xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(inconsistent_freqs)
    pylab.step(xs, ns * 1.0 / ns[0], 'r-', label=('Inconsistent ($d=%g$)' % max_clade_d))
# Fragment (torn at BOTH ends): opens mid-loop-body (appending to ds/vs from
# a loop whose header is not visible), converts the singleton divergence (ds)
# and variant (vs) lists to arrays, prints 1D/4D counts, plots CDF-style
# curves (observed in blue vs. random in grey), then begins scanning dstar
# thresholds to compute fraction nonsynonymous.
# NOTE(review): ends with a bare `if less_idxs.sum() > 1:` whose body is in
# the next chunk, so the code is left byte-identical. Python 2 `print`
# statements; `print(vs > 0.5).sum(), "1D"` is the Py2 form
# `print (vs > 0.5).sum(), "1D"`.
ds.append(d) vs.append(v) random_ds = numpy.array(random_ds) ds = numpy.array(ds) vs = numpy.array(vs) sys.stderr.write("Done!\n") print len(ds), "total singletons" print(vs > 0.5).sum(), "1D" print(vs < 0.5).sum(), "4D" # Now plot them. xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(random_ds) d_axis.step(xs, 1 - ns * 1.0 / ns[0], '-', color='0.7') xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(ds) d_axis.step(xs, 1 - ns * 1.0 / ns[0], 'b-') d_axis.semilogx([1e-05, 2e-05], [1, 1]) dstars = numpy.logspace(-4, -2, 20) fraction_nonsynonymous = [] for dstar in dstars: less_idxs = (ds <= dstar) if less_idxs.sum() > 1:
# Fragment (torn at BOTH ends): inside a bootstrap loop (the `idxs` /
# `sample_sizes` bindings come from missing context), draws matched
# hypergeometric resamples of low-dS and all doubleton sharing probabilities
# plus binomial "fake" pools, then plots the matched bootstrap survival
# curves (red = low dS, black = all).
# NOTE(review): chunk ends inside an unfinished
# stats_utils.calculate_unnormalized_survival_from_vector(real_low_ps, ...)
# call, so the code is left byte-identical. `hypergeometric`/`binomial` are
# presumably numpy.random functions imported elsewhere — TODO confirm.
all_ngood = all_doubletons[idxs].astype(numpy.int32) all_nbad = (all_doubleton_opportunities[idxs] - all_ngood).astype( numpy.int32) all_p = all_doubletons.sum() * 1.0 / all_doubleton_opportunities.sum() bootstrapped_low_ps.extend( hypergeometric(low_ngood, low_nbad, sample_sizes) * 1.0 / sample_sizes) bootstrapped_all_ps.extend( hypergeometric(all_ngood, all_nbad, sample_sizes) * 1.0 / sample_sizes) bootstrapped_fake_low_ps.extend( binomial(sample_sizes, low_p) * 1.0 / sample_sizes) bootstrapped_fake_all_ps.extend( binomial(sample_sizes, all_p) * 1.0 / sample_sizes) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( bootstrapped_low_ps, min_x=0, max_x=2) sharing_axis.step(xs, ns * 1.0 / ns[0], 'r-', label='Low $d_S$ (matched)', zorder=3) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( bootstrapped_all_ps, min_x=0, max_x=2) sharing_axis.step(xs, ns * 1.0 / ns[0], 'k-', label='All (matched)', zorder=2) #xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(bootstrapped_fake_low_ps, min_x=0,max_x=1) #sharing_axis.step(xs,ns*1.0/ns[0],'r-',label='Low $d_S$ (pooled)',zorder=1,alpha=0.5) xs, ns = stats_utils.calculate_unnormalized_survival_from_vector(real_low_ps, min_x=0,
print marker_genes print marker_gene_idxs.sum() sample_idxs = (parse_midas_data.calculate_unique_samples(subject_sample_map, gene_samples)) * (marker_coverages >= min_coverage) prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences(gene_depth_matrix[:, sample_idxs], marker_coverages[sample_idxs], min_copynum=0.3) reference_prevalences = prevalences[reference_gene_idxs] metaphlan2_prevalences = prevalences[metaphlan2_gene_idxs] marker_prevalences = prevalences[marker_gene_idxs] print marker_prevalences pangenome_xs, pangenome_survivals = stats_utils.calculate_unnormalized_survival_from_vector(prevalences, min_x=0, max_x=1) reference_xs, reference_survivals = stats_utils.calculate_unnormalized_survival_from_vector(reference_prevalences, min_x=0, max_x=1) metaphlan2_xs, metaphlan2_survivals = stats_utils.calculate_unnormalized_survival_from_vector(metaphlan2_prevalences, min_x=0, max_x=1) marker_xs, marker_survivals = stats_utils.calculate_unnormalized_survival_from_vector(marker_prevalences, min_x=0, max_x=1) pylab.figure(1,figsize=(3.42,4)) pylab.title(species_name) #pylab.step(pangenome_xs, pangenome_survivals/pangenome_survivals[0],label='Pan-genome') #pylab.step(reference_xs, reference_survivals/reference_survivals[0],label='Reference') #pylab.step(metaphlan2_xs, metaphlan2_survivals/metaphlan2_survivals[0],label='Metaphlan2') #pylab.step(marker_xs, marker_survivals/marker_survivals[0],label='MIDAS Marker') #pylab.ylim([1e-02,1])
# Fragment (torn at start): opens mid-loop-body (`i` is bound by a loop above
# this chunk), printing a per-threshold table of polymorphic vs inconsistent
# variant-type counts, then plots survival curves of polymorphic frequencies
# for each divergence threshold plus inconsistent frequencies at max_ds[1].
# NOTE(review): the bare tuple `variant_type, polymorphic_variant_types[i][
# variant_type], inconsistent_variant_types[i][variant_type]` is a no-op
# expression statement — a leading `print` was presumably lost; confirm
# against the original script. The hard-coded index in
# `inconsistent_freqs[1]` / `max_ds[1]` also looks deliberate but is worth
# verifying. Code left byte-identical.
print "d=", max_ds[i] print "Site", "Polymorphic", "Inconsistent" for variant_type in sorted(polymorphic_variant_types[i].keys()): variant_type, polymorphic_variant_types[i][ variant_type], inconsistent_variant_types[i][variant_type] print "" pylab.figure(4, figsize=(3.42, 2)) pylab.suptitle(species_name) for i in xrange(0, len(polymorphic_freqs)): if len(polymorphic_freqs[i]) == 0: continue xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( polymorphic_freqs[i]) pylab.step(xs, ns * 1.0 / ns[0], '-', label='Polymorphic ($d=%g$)' % max_ds[i]) print 'Polymorphic (d=%g), n=%g' % (max_ds[i], ns[0]) if len(inconsistent_freqs[1]) > 0: xs, ns = stats_utils.calculate_unnormalized_survival_from_vector( inconsistent_freqs[1]) pylab.step(xs, ns * 1.0 / ns[0], 'r-', linewidth=2, label=('Inconsistent ($d=%g$)' % max_ds[1]))