# Median sequencing depth of every sample, computed from its coverage
# histogram (one histogram per entry of `samples`).
median_coverages = numpy.array([
    stats_utils.calculate_median_from_histogram(histogram)
    for histogram in sample_coverage_histograms
])

# Sample name -> median coverage.  The lookup table is needed because the
# pi parser below rebinds `samples`, possibly in a different order.
sample_coverage_map = {
    sample_name: median_coverages[idx]
    for idx, sample_name in enumerate(samples)
}

###############################################################
# Compute Pi within patients to figure out which are haploid  #
###############################################################

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
# `debug` is passed by keyword, as the other call sites of this parser do:
# a positional argument would bind to whatever optional parameter happens to
# come second in the parser's signature.
samples, total_pis, total_pi_opportunities = (
    parse_midas_data.parse_within_sample_pi(species_name, debug=debug))
sys.stderr.write("Done!\n")

# Raw per-sample ratio (kept for any later use; not used by the filter below).
pis = total_pis / total_pi_opportunities
# Add-one version of the same ratio; the +1 keeps the denominator non-zero
# for a sample that reports zero opportunities.
clipped_pis = (total_pis + 1) / (total_pi_opportunities + 1)

# Re-align the coverages with the sample order returned by the pi parser.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples])

###############################################################
# Select the samples with high coverage                       #
###############################################################

# Only keep samples above the depth threshold.
# NOTE(review): the original comments also mention a low-pi ("haploid")
# criterion, but only the coverage cut is applied here -- confirm whether a
# pi threshold is applied further down.
has_enough_coverage = median_coverages >= min_coverage
high_cov_samples = samples[has_enough_coverage]
high_cov_pis = clipped_pis[has_enough_coverage]
for i in xrange(0, len(samples)) }
# NOTE(review): the line above is the tail of a dict comprehension whose
# opening is outside this excerpt; it is left untouched.

# Prune the time metadata so that, for subjects with more than one sample per
# time point, the highest-coverage sample is the one retained.
subject_sample_time_map = parse_midas_data.prune_subject_sample_time_map( subject_sample_time_map_all_samples, sample_coverage_map)

###############################################################
# Compute Pi within patients to figure out which are haploid  #
###############################################################

# Load pi information for species_name, restricted to the genes in
# `core_genes` and to variant type '4D'
# (presumably fourfold-degenerate sites -- TODO confirm).
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi( species_name, allowed_variant_types=set(['4D']), allowed_genes=core_genes, debug=debug)
sys.stderr.write("Done!\n")

# Per-sample ratio of the pi total to its number of opportunities.
pis = total_pis / total_pi_opportunities

######################
# compute median cov #
######################

# Look up each sample's median coverage in the order of the `samples`
# returned by the pi parser above.
median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

##########################################################
# load SNP info
##########################################################
# Read the per-sample genomic coverage histograms and reduce each one to its
# median over non-zero coverage values.
sample_coverage_histograms, samples = (
    parse_midas_data.parse_coverage_distribution(species_name))
median_coverages = numpy.array([
    stats_utils.calculate_nonzero_median_from_histogram(histogram)
    for histogram in sample_coverage_histograms
])

# Sample name -> median coverage, so the values can be re-aligned after
# `samples` is rebound by the pi parser below.
sample_coverage_map = {
    sample_name: median_coverages[idx]
    for idx, sample_name in enumerate(samples)
}

# Within-sample diversity (pi) for species_name, restricted to core genes.
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, total_pis, total_pi_opportunities = (
    parse_midas_data.parse_within_sample_pi(
        species_name, allowed_genes=core_genes, debug=debug))
sys.stderr.write("Done!\n")

pis = total_pis / total_pi_opportunities

# Median coverages in the order of the samples returned by the pi parser.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples])

# Keep only the "haploid" samples: deep enough coverage AND low diversity.
deep_enough = median_coverages >= min_coverage
low_diversity = pis <= 1e-03
snp_samples = samples[deep_enough & low_diversity]

# Pangenome (gene-level) data for exactly those samples.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
(gene_samples,
 gene_names,
 gene_presence_matrix,
 gene_depth_matrix,
 marker_coverages,
 gene_reads_matrix) = parse_midas_data.parse_pangenome_data(
    species_name, allowed_samples=snp_samples)
sys.stderr.write("Done!\n")