# Per-pair SNP substitution rate = differences / opportunities.
# The boolean mask in the denominator adds 1 wherever opportunities are 0,
# so zero-opportunity pairs divide by 1 and yield a rate of 0 instead of NaN.
snp_substitution_rate = snp_difference_matrix * 1.0 / ( snp_opportunity_matrix + (snp_opportunity_matrix == 0))
sys.stderr.write("Done!\n")

# Load gene coverage information for species_name, restricted to the
# SNP samples selected above.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data( species_name, allowed_samples=snp_samples)
sys.stderr.write("Done!\n")
sys.stderr.write("Loaded gene info for %d samples\n" % len(gene_samples))

# Gene copy number = gene depth normalized by marker-gene coverage.
# Same zero-guard trick as above: samples with zero marker coverage
# divide by 1, giving copy number 0 rather than NaN.
gene_copynum_matrix = gene_depth_matrix * 1.0 / (marker_coverages + (marker_coverages == 0))

# Samples used for prevalence estimation: one sample per subject AND
# marker coverage at or above the minimum threshold
# (elementwise * on boolean arrays acts as logical AND).
prevalence_idxs = (parse_midas_data.calculate_unique_samples( subject_sample_map, gene_samples)) * (marker_coverages >= min_coverage)

# Fraction of qualifying samples in which each gene is present.
prevalences = gene_diversity_utils.calculate_fractional_gene_prevalences( gene_depth_matrix[:, prevalence_idxs], marker_coverages[prevalence_idxs])

# Sorted copy kept separately (e.g. for plotting a prevalence distribution);
# the unsorted `prevalences` stays aligned with `gene_names`.
pangenome_prevalences = numpy.array(prevalences, copy=True)
pangenome_prevalences.sort()

# Calculate matrix of number of genes that differ between sample pairs,
# split into gains and losses, using coverage-based presence calls.
sys.stderr.write("Calculating matrix of gene differences...\n")
gene_gain_matrix, gene_loss_matrix, gene_opportunity_matrix = gene_diversity_utils.calculate_coverage_based_gene_hamming_matrix_gain_loss( gene_depth_matrix, marker_coverages, min_log2_fold_change=4, include_high_copynum=include_high_copynum)
inconsistency_axis = inconsistency_axes[example_idx]

#
# Only plot samples above a certain depth threshold that are "haploids"
snp_samples = diversity_utils.calculate_haploid_samples(species_name, debug=debug)

# Only consider samples from isolates.
# (Comprehension replaces the original manual append loop; behavior identical.
# NOTE(review): membership test is O(len(isolates)) per sample if `isolates`
# is a list — fine for small collections, confirm if it grows.)
snp_samples = numpy.asarray(
    [sample for sample in snp_samples if sample in isolates])

# Only consider one sample per person (drop repeated timepoints).
snp_samples = snp_samples[parse_midas_data.calculate_unique_samples(
    subject_sample_map, sample_list=snp_samples)]

sys.stderr.write("Proceeding with %d haploid samples!\n" % len(snp_samples))

#
# Load the pre-computed substitution-rate map and turn it into pairwise
# difference / opportunity matrices over the selected samples.
sys.stderr.write("Loading pre-computed substitution rates for %s...\n" % species_name)
substitution_rate_map = calculate_substitution_rates.load_substitution_rate_map(
    species_name)

sys.stderr.write("Calculating matrix...\n")
dummy_samples, snp_difference_matrix, snp_opportunity_matrix = calculate_substitution_rates.calculate_matrices_from_substitution_rate_map(
    substitution_rate_map, 'core', allowed_samples=snp_samples)
# The matrix calculation may reorder/drop samples; adopt its sample list.
snp_samples = dummy_samples
sys.stderr.write("Done!\n")

#
# Per-pair substitution rate; the `(snp_opportunity_matrix == 0)` term adds 1
# to zero-opportunity entries so they divide by 1 and come out 0, not NaN.
snp_substitution_rate = snp_difference_matrix * 1.0 / ( snp_opportunity_matrix + (snp_opportunity_matrix == 0))
}

# Load pi (within-sample nucleotide diversity) information for species_name.
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi( species_name, debug)
sys.stderr.write("Done!\n")

# Per-sample pi estimate = observed pairwise differences / opportunities.
pis = total_pis / total_pi_opportunities

# Median coverage per sample, in the same order as `samples`.
median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

# Only plot samples above a certain depth threshold that are "haploids"
# (low within-sample diversity, pi <= 1e-03; elementwise * on booleans = AND).
snp_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]

# Restrict to a single timepoint per person.
unique_subject_idxs = parse_midas_data.calculate_unique_samples( subject_sample_map, snp_samples)
snp_samples = snp_samples[unique_subject_idxs]

# Analyze SNPs, looping over chunk sizes.
# Clunky, but necessary to limit memory usage on cluster

# Load SNP information for species_name.
# Accumulators start empty and are (presumably) filled/extended inside the
# chunked loop below — body continues past this excerpt.
sys.stderr.write("Loading %s...\n" % species_name)

genotype_matrix = numpy.array([])
passed_sites_matrix = numpy.array([])
snp_difference_matrix = numpy.array([])
snp_opportunity_matrix = numpy.array([])
# Sentinel: the chunked parser returns a negative line number when the
# file is exhausted, ending this loop.
final_line_number = 0

while final_line_number >= 0:
# Median coverage per sample, aligned with `samples`.
median_coverages = numpy.array(
    [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

###############################################################
# Indexes for SNP samples that have high coverage             #
###############################################################

# Only plot samples above a certain depth threshold that are "haploids"
# Split by within-sample diversity: low-pi ("haploid-like") vs high-pi.
low_pi_snp_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]
high_pi_snp_samples = samples[(median_coverages >= min_coverage) * (pis > 1e-03)]

# Calculate which pairs of idxs belong to unique samples. Remove any samples
# that are duplicates (i.e. multiple time pts).
unique_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, low_pi_snp_samples)
low_pi_snp_samples = low_pi_snp_samples[unique_idxs]

unique_idxs = parse_midas_data.calculate_unique_samples(
    subject_sample_map, high_pi_snp_samples)
# BUG FIX: original indexed low_pi_snp_samples here, applying high-pi
# uniqueness indexes to the wrong array (copy-paste error).
high_pi_snp_samples = high_pi_snp_samples[unique_idxs]

####################################################
# Load gene coverage information for species_name  #
####################################################
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
low_pi_gene_samples, low_pi_gene_names, low_pi_gene_presence_matrix, low_pi_gene_depth_matrix, low_pi_marker_coverages, low_pi_gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=low_pi_snp_samples)
high_pi_gene_samples, high_pi_gene_names, high_pi_gene_presence_matrix, high_pi_gene_depth_matrix, high_pi_marker_coverages, high_pi_gene_reads_matrix = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=high_pi_snp_samples)