for sample_coverage_histogram in sample_coverage_histograms ])
# Map each sample name to its median genomic coverage
# (parallel arrays `samples` / `median_coverages` -> dict).
sample_coverage_map = { samples[i]: median_coverages[i] for i in xrange(0, len(samples)) }

###############################################################
# Compute Pi within patients to figure out which are haploid  #
###############################################################

# Load pi information for species_name, restricted to fourfold-degenerate
# (4D) sites within the core genes.
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, allowed_variant_types=set(['4D']), allowed_genes=core_genes, debug=debug)
sys.stderr.write("Done!\n")
# Per-sample nucleotide diversity: observed differences / opportunities.
# NOTE(review): numpy division — a sample with zero opportunities yields nan/inf.
pis = total_pis / total_pi_opportunities

######################
# compute median cov #
######################
# Re-index the median coverages to match the (possibly reordered) sample
# list returned by parse_within_sample_pi above.
median_coverages = numpy.array(
    [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

###############################################################
# Indexes for SNP samples that have high coverage             #
###############################################################
# Load the per-sample genomic coverage histograms for this species.
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name)

# Median (nonzero) coverage per sample, in the same order as `samples`.
median_coverages = numpy.array(
    [stats_utils.calculate_nonzero_median_from_histogram(histogram)
     for histogram in sample_coverage_histograms])

# Look a sample's median coverage up by name.
sample_coverage_map = dict(zip(samples, median_coverages))

# Load within-sample diversity (pi) information for species_name.
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, debug)
sys.stderr.write("Done!\n")

# Raw pi estimates, plus pseudocount-regularized ("clipped") estimates that
# stay finite when a sample has zero opportunities.
pis = total_pis / total_pi_opportunities
clipped_pis = (total_pis + 1) / (total_pi_opportunities + 1)

# Re-align median coverages with the sample ordering returned by the pi parser.
median_coverages = numpy.array(
    [sample_coverage_map[sample] for sample in samples])

# Calculate which pairs of idxs belong to the same sample, which to the same
# subject and which to different subjects.
same_sample_idxs, same_subject_idxs, diff_subject_idxs = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, samples)

# Calculate the smaller and larger of the two pi estimates so we can look at
# correlation over time.
lower_pis = numpy.fmin(clipped_pis[same_subject_idxs[0]],
                       clipped_pis[same_subject_idxs[1]])
# Load the per-sample genomic coverage histograms for this species.
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name)

# Median (nonzero) coverage per sample, in the same order as `samples`.
median_coverages = numpy.array(
    [stats_utils.calculate_nonzero_median_from_histogram(histogram)
     for histogram in sample_coverage_histograms])

# Look a sample's median coverage up by name.
sample_coverage_map = dict(zip(samples, median_coverages))

# Load within-sample diversity (pi), restricted to the core genes.
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, allowed_genes=core_genes, debug=debug)
sys.stderr.write("Done!\n")
pis = total_pis / total_pi_opportunities

# Re-align median coverages with the sample ordering returned by the pi parser.
median_coverages = numpy.array(
    [sample_coverage_map[sample] for sample in samples])

# Only plot samples above a certain depth threshold that are "haploids"
# (i.e. whose within-sample diversity is very low).
deep_enough = (median_coverages >= min_coverage)
low_diversity = (pis <= 1e-03)
snp_samples = samples[deep_enough & low_diversity]

# Analyze SNPs, looping over chunk sizes.
# Clunky, but necessary to limit memory usage on cluster.

# Load SNP information for species_name
sys.stderr.write("Loading SNPs for %s...\n" % species_name)
sys.stderr.write("(not just core genes...)\n")