for sample_coverage_histogram in sample_coverage_histograms
])
# Look-up table from sample to its median coverage. `samples` and
# `median_coverages` are paired position by position.
sample_coverage_map = dict(zip(samples, median_coverages))

###############################################################
# Compute Pi within patients to figure out which are haploid  #
###############################################################

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
# Only the '4D' variant type and the genes in core_genes are passed as allowed.
# `samples` is rebound by this call, so median_coverages is rebuilt below to
# match the new sample list.
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name,
    allowed_variant_types={'4D'},
    allowed_genes=core_genes,
    debug=debug,
)
sys.stderr.write("Done!\n")
# Per-sample pi: accumulated pi divided by the number of opportunities.
# NOTE(review): under Python 2 this is floor division if both arrays are
# integer-typed -- confirm they are floats.
pis = total_pis / total_pi_opportunities

######################
# compute median cov #
######################

# One coverage value per entry of `samples`, in the same order, looked up
# from sample_coverage_map.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples])

###############################################################
# Indexes for SNP samples that have high coverage             #
###############################################################
# Example no. 2
# 0
# Load genomic coverage distributions
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name)
# One value per histogram, as computed by
# stats_utils.calculate_nonzero_median_from_histogram.
median_coverages = numpy.array([
    stats_utils.calculate_nonzero_median_from_histogram(
        sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])
# Look-up table from sample to its median coverage; the two sequences are
# paired position by position.
sample_coverage_map = dict(zip(samples, median_coverages))

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
# debug is passed by keyword so that it cannot be bound to some other
# optional parameter of parse_within_sample_pi.
# NOTE(review): assumes the parameter is named `debug` -- confirm against
# parse_midas_data.
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, debug=debug)
sys.stderr.write("Done!\n")
# Per-sample pi: accumulated pi divided by the number of opportunities.
pis = total_pis / total_pi_opportunities
# Same ratio with 1 added to both numerator and denominator.
clipped_pis = (total_pis + 1) / (total_pi_opportunities + 1)

# `samples` was rebound above; rebuild median_coverages in the new order by
# looking each sample up in sample_coverage_map.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples])

# Split index pairs into three groups: pairs from the same sample, pairs from
# the same subject, and pairs from different subjects
same_sample_idxs, same_subject_idxs, diff_subject_idxs = (
    parse_midas_data.calculate_subject_pairs(subject_sample_map, samples))

# For every same-subject pair take the element-wise minimum of the two clipped
# pi estimates, so we can look at correlation over time
first_idxs, second_idxs = same_subject_idxs[0], same_subject_idxs[1]
lower_pis = numpy.fmin(clipped_pis[first_idxs], clipped_pis[second_idxs])
# Load genomic coverage distributions
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name)
# One value per histogram, as computed by
# stats_utils.calculate_nonzero_median_from_histogram.
median_coverages = numpy.array([
    stats_utils.calculate_nonzero_median_from_histogram(
        sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])
# Look-up table from sample to its median coverage; the two sequences are
# paired position by position.
sample_coverage_map = dict(zip(samples, median_coverages))

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
# Only the genes in core_genes are passed as allowed. `samples` is rebound by
# this call, so median_coverages is rebuilt below to match the new list.
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, allowed_genes=core_genes, debug=debug)
sys.stderr.write("Done!\n")
# Per-sample pi: accumulated pi divided by the number of opportunities.
# NOTE(review): under Python 2 this is floor division if both arrays are
# integer-typed -- confirm they are floats.
pis = total_pis / total_pi_opportunities

# One coverage value per entry of `samples`, in the same order, looked up
# from sample_coverage_map.
median_coverages = numpy.array(
    [sample_coverage_map[sample_name] for sample_name in samples])

# Keep a sample only if it passes both filters: median coverage of at least
# min_coverage, and pi no greater than 1e-03 (the "haploid" criterion)
snp_samples = samples[numpy.logical_and(
    median_coverages >= min_coverage,
    pis <= 1e-03,
)]

# Analyze SNPs, looping over chunk sizes.
# Clunky, but necessary to limit memory usage on cluster

# Announce on stderr that SNP loading for species_name is starting
sys.stderr.writelines((
    "Loading SNPs for %s...\n" % species_name,
    "(not just core genes...)\n",
))