# Re-order the per-sample median coverages so they line up with `samples`.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples])

# Calculate full matrix of synonymous pairwise differences
sys.stderr.write("Calculate synonymous pi matrix...\n")
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D',
    allowed_genes=metaphlan2_genes)

# Calculate fixation matrix (same site class and gene set as the pi matrix)
fixation_matrix_syn = diversity_utils.calculate_fixation_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D',
    min_change=min_change,
    allowed_genes=metaphlan2_genes)
sys.stderr.write("Done!\n")

# Calculate full matrix of nonsynonymous pairwise differences
sys.stderr.write("Calculate nonsynonymous pi matrix...\n")

# Calculate allele count matrices
pi_matrix_non, avg_pi_matrix_non = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='1D',
    allowed_genes=metaphlan2_genes)

# Calculate fixation matrix
# NOTE(review): the statement below is cut off at the edge of this excerpt;
# its arguments are not visible, so it is left as found instead of guessed.
# fixation_matrix_non = diversity_utils.calculate_fixation_matrix(
# Running totals filled in while the SNP file is streamed chunk by chunk.
nonsynonymous_count_sfs = []
synonymous_pi_weighted_counts = 0
nonsynonymous_pi_weighted_counts = 0

# parse_snps hands back the line number to resume from; the loop keeps going
# until that value comes back negative.
final_line_number = 0
while final_line_number >= 0:

    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    snp_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
        species_name,
        debug=debug,
        allowed_variant_types=allowed_variant_types,
        allowed_samples=largest_clade_samples,
        allowed_genes=core_genes,
        chunk_size=chunk_size,
        initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

    # Calculate fixation matrix
    sys.stderr.write("Calculating matrix of snp differences...\n")
    chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(
        allele_counts_map,
        passed_sites_map,
        allowed_genes=core_genes,
        min_change=min_change)
    sys.stderr.write("Done!\n")

    # The accumulator is still empty (presumably only before the first chunk
    # has been added): size every running matrix after this chunk's result.
    if snp_difference_matrix.shape[0] == 0:
        # Multiplying by 1.0 turns an integer template into a float matrix.
        snp_difference_matrix = numpy.zeros_like(chunk_snp_difference_matrix) * 1.0
        snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix) * 1.0

        # Each matrix gets its own zeros_like call so none of them alias.
        synonymous_difference_matrix = numpy.zeros_like(snp_difference_matrix)
        synonymous_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)
        nonsynonymous_difference_matrix = numpy.zeros_like(snp_difference_matrix)
        nonsynonymous_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)

        # NOTE(review): the excerpt lost its indentation; the bin setup below
        # is assumed to belong to this one-time initialisation -- confirm.
        n = len(snp_samples)

        # Bin edges at (k - 1/2)/n for k = 1..n, with the lowest edge pushed
        # below zero.
        maf_bins = numpy.arange(1, n + 1) * 1.0 / n
        maf_bins -= (maf_bins[1] - maf_bins[0]) / 2
        maf_bins[0] = -0.1
else: sys.stderr.write("Analyzing %d haploid samples...\n" % len(desired_samples)) species_idx += 1 # Load SNP information for species_name sys.stderr.write("Loading %s...\n" % species_name) dummy_samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(species_name, debug=debug, allowed_samples=desired_samples) sys.stderr.write("Done!\n") # Calculate fixation matrices sys.stderr.write("Calculating 4D fixation matrix...\n") fixation_matrix_syn, fixation_opportunities_syn = diversity_utils.calculate_fixation_matrix(allele_counts_map, passed_sites_map, allowed_variant_types=set(['4D']), min_change=min_change) sys.stderr.write("Calculating 1D fixation matrix...\n") fixation_matrix_non, fixation_opportunities_non = diversity_utils.calculate_fixation_matrix(allele_counts_map, passed_sites_map, allowed_variant_types=set(['1D']), min_change=min_change) sys.stderr.write("Calculating total fixation matrix...\n") fixation_matrix_all, fixation_opportunities_all = diversity_utils.calculate_fixation_matrix(allele_counts_map, passed_sites_map, min_change=min_change) sys.stderr.write("Done!\n") # Calculate fraction nonsynonymous dN = fixation_matrix_non/fixation_opportunities_non dS = fixation_matrix_syn/fixation_opportunities_syn dNplusdS = (dN+dS) fraction_nonsynonymous = dN/(dNplusdS+(dNplusdS==0)) # Calculate total divergence dtot = fixation_matrix_all/fixation_opportunities_all
# Stream the SNP file in chunks; parse_snps returns the line number to resume
# from, and the loop ends once that value comes back negative.
final_line_number = 0
while final_line_number >= 0:

    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    dummy_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
        species_name,
        debug=debug,
        allowed_samples=snp_samples,
        chunk_size=chunk_size,
        initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

    # Calculate fixation matrix
    sys.stderr.write("Calculating matrix of snp differences...\n")
    chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(
        allele_counts_map,
        passed_sites_map,
        min_change=min_change)
    sys.stderr.write("Done!\n")

    # While the accumulator is still empty, size it after this chunk's result
    # (the `* 1.0` turns an integer template into a float matrix).
    if snp_difference_matrix.shape[0] == 0:
        snp_difference_matrix = numpy.zeros_like(chunk_snp_difference_matrix) * 1.0
        snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix) * 1.0

    # Add this chunk's differences and opportunities to the running totals.
    snp_difference_matrix += chunk_snp_difference_matrix
    snp_opportunity_matrix += chunk_snp_opportunity_matrix

# NOTE(review): indentation was lost in this excerpt; the two statements below
# are assumed to run once, after the chunk loop has finished -- confirm.
# Differences per opportunity for every sample pair.
substitution_rate = snp_difference_matrix * 1.0 / snp_opportunity_matrix

cluster_idxss = diversity_utils.cluster_samples(
    substitution_rate,
    min_d=0,
    max_d=1e09)
dummy_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps( species_name, debug=debug, allowed_samples=snp_samples, allowed_genes=core_genes, chunk_size=chunk_size, initial_line_number=final_line_number) sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys())) print len(dummy_samples), "dummy samples!" # Calculate fixation matrix sys.stderr.write("Calculating matrix of snp differences...\n") chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix( allele_counts_map, passed_sites_map, min_change=min_change, allowed_genes=core_genes, allowed_variant_types=allowed_variant_types) # sys.stderr.write("Done!\n") if snp_difference_matrix.shape[0] == 0: snp_difference_matrix = numpy.zeros_like( chunk_snp_difference_matrix) * 1.0 snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix) * 1.0 snp_difference_matrix += chunk_snp_difference_matrix snp_opportunity_matrix += chunk_snp_opportunity_matrix sys.stderr.write("Calculating singletons...\n") chunk_singletons = diversity_utils.calculate_singletons( allele_counts_map, passed_sites_map, allowed_genes=core_genes)
# Remember each sample's median coverage by name, so the values can be put
# back in order once parse_snps has rebound `samples` below.
sample_coverage_map = {
    sample_name: median_coverages[idx]
    for idx, sample_name in enumerate(samples)}

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map, last_line = parse_midas_data.parse_snps(
    species_name, debug)
sys.stderr.write("Done!\n")

# Rebuild the coverage vector in the order of the freshly loaded samples.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples])

# Calculate full matrix of synonymous pairwise differences
sys.stderr.write("Calculate synonymous pi matrix...\n")
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D')

# The diagonal of the pi matrix, one value per sample.
pis = numpy.diag(pi_matrix_syn)

# Calculate fixation matrix
fixation_matrix_syn, persite_fixation_matrix_syn = diversity_utils.calculate_fixation_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='4D',
    min_change=min_change)
sys.stderr.write("Done!\n")

# Calculate full matrix of nonsynonymous pairwise differences
sys.stderr.write("Calculate nonsynonymous pi matrix...\n")

# Calculate allele count matrices
pi_matrix_non, avg_pi_matrix_non = diversity_utils.calculate_pi_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='1D')

# Calculate fixation matrix
fixation_matrix_non, persite_fixation_matrix_non = diversity_utils.calculate_fixation_matrix(
    allele_counts_map,
    passed_sites_map,
    variant_type='1D',
    min_change=min_change)
sys.stderr.write("Done!\n")

# Only plot samples above a certain depth threshold
high_coverage_mask = median_coverages >= min_coverage
high_coverage_samples = samples[high_coverage_mask]

# Multiplying two boolean masks is an element-wise AND: keep samples that are
# deep enough and whose pi value is at most 1e-03.
high_coverage_low_pi_samples = samples[high_coverage_mask * (pis <= 1e-03)]