# 示例#1 (Example #1 — stitched-chunk marker, not executable code)
# 0
def main(between_host):
    """Parse and pickle SNP data for every species in ``desired_species``.

    For each species, loads its core genes and qualified samples, parses
    4D (synonymous) SNPs restricted to core genes, and pickles the results
    into a per-species checkpoint directory. Species whose checkpoint
    directory already exists are skipped.

    Parameters
    ----------
    between_host : bool
        If True, use the between-host checkpoint directory and sample set;
        otherwise the within-host ones.
    """
    # Parse and save all the snps between QP hosts
    t0 = time.time()
    if between_host:
        intermediate_file_path = os.path.join(config.analysis_directory,
                                              'between_hosts_checkpoints')
    else:
        intermediate_file_path = os.path.join(config.analysis_directory,
                                              'within_hosts_checkpoints')

    for species_name in desired_species:
        print("Start processing {}".format(species_name))
        core_genes = core_gene_utils.parse_core_genes(species_name)
        desired_samples = get_desired_samples(species_name,
                                              between_host=between_host)
        if desired_samples is None or len(desired_samples) == 0:
            print("{} has no qualified samples".format(species_name))
            continue
        pickle_path = os.path.join(intermediate_file_path, species_name)
        # Existing checkpoint dir doubles as a "done" marker.
        if not os.path.exists(pickle_path):
            print('{} has not been processed'.format(species_name))
            # makedirs (vs mkdir) also creates the checkpoint parent dir
            # on a fresh analysis directory.
            os.makedirs(pickle_path)
        else:
            print('{} already processed'.format(species_name))
            continue
        found_samples, allele_counts_map, passed_sites_map, final_line_number = parse_snps(
            species_name,
            allowed_samples=desired_samples,
            allowed_genes=core_genes,
            allowed_variant_types=['4D'])
        # Fix: the original passed open(...) directly to pickle.dump and
        # never closed the handles; `with` guarantees deterministic close.
        with open(os.path.join(pickle_path, 'allele_counts_map.pickle'),
                  'wb') as f:
            pickle.dump(allele_counts_map, f)
        with open(os.path.join(pickle_path, 'found_samples.pickle'),
                  'wb') as f:
            pickle.dump(found_samples, f)
        with open(os.path.join(pickle_path, 'passed_sites_map.pickle'),
                  'wb') as f:
            pickle.dump(passed_sites_map, f)
        print("Done processing {} at {} min".format(species_name,
                                                    (time.time() - t0) / 60))
# NOTE(review): `species_name`, `debug`, `snp_samples`, `chunk_size` and
# `min_change` must be defined earlier in the file — confirm.
sys.stderr.write("Loading SNPs for %s...\n" % species_name)
sys.stderr.write("(not just core genes...)\n")
# Empty accumulators; sized on the first loaded chunk below.
pi_matrix_syn = numpy.array([])
avg_pi_matrix_syn = numpy.array([])
snp_difference_matrix = numpy.array([])
snp_difference_matrix_mutation = numpy.array([])
snp_difference_matrix_reversion = numpy.array([])
snp_opportunity_matrix = numpy.array([])
final_line_number = 0

# Walk the SNP file chunk by chunk; presumably parse_snps returns a
# negative final_line_number once the file is exhausted — confirm.
while final_line_number >= 0:

    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    dummy_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
        species_name,
        debug=debug,
        allowed_samples=snp_samples,
        chunk_size=chunk_size,
        initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

    # Python 2 print statement.
    print len(dummy_samples), "dummy samples!"

    # Calculate fixation matrix
    sys.stderr.write("Calculating matrix of snp differences...\n")
    # Mutation/reversion differences and opportunity counts for this chunk.
    chunk_snp_difference_matrix_mutation, chunk_snp_difference_matrix_reversion, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix_mutation_reversion(
        allele_counts_map, passed_sites_map, min_change=min_change)  #
    sys.stderr.write("Done!\n")

    # First chunk: size the accumulator to match, as float zeros (* 1.0).
    if snp_difference_matrix.shape[0] == 0:
        snp_difference_matrix = numpy.zeros_like(
            chunk_snp_difference_matrix_mutation) * 1.0
# NOTE(review): the rest of this while-loop body appears truncated here.
# Per-sample synonymous diversity: observed pi over pi opportunities.
pis = total_pis / total_pi_opportunities

######################
# compute median cov #
######################

# Look up each sample's precomputed median coverage (Python 2 xrange).
median_coverages = numpy.array(
    [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

##########################################################
# load SNP info
##########################################################

# note that this loads info for all samples. Later the desired samples are selected out.
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
    species_name, debug)
sys.stderr.write("Done!\n")

###################################################################
# compute 1D SFSs
###################################################################

sys.stderr.write("Calculate within person SFS...\n")
# Folded 4D (synonymous) site-frequency spectra, one per sample.
sample_freqs, passed_sites = diversity_utils.calculate_sample_freqs(
    allele_counts_map, passed_sites_map, variant_type='4D', fold=True)

sfss = []
# 21 edges -> 20 frequency bins over [0.04, 0.95]; bin_locations = midpoints.
bins = numpy.linspace(0.04, 0.95, 21)
bin_locations = bins[1:] - (bins[1] - bins[0]) / 2

# NOTE(review): loop body missing — this chunk appears truncated here.
for j in xrange(0, len(samples)):
import pylab
import sys
import numpy
from utils import diversity_utils
from parsers import parse_midas_data
from numpy.random import choice
# Species to analyze, taken from the command line.
species_name = sys.argv[1]

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
# NOTE(review): this call site unpacks three values; other call sites in the
# file unpack four — confirm which parse_snps signature applies here.
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, combination_type="sample", debug=False)
sys.stderr.write("Done!\n")

# Pairwise synonymous (4D) pi matrix and its per-pair average.
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')

# Diagonal of the averaged pi matrix = within-sample diversity.
low_diversity_samples = (numpy.diag(avg_pi_matrix_syn) < 1e-03)

unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples)

# Boolean AND via elementwise multiplication of the two masks.
desired_samples = unique_samples * low_diversity_samples

# initialize distance bins for LD computations
# NOTE(review): the logspace call below is unterminated — chunk truncated.
distance_bins = numpy.logspace(
    0, 4, 20
# Within-sample pi: used to identify low-diversity ("haploid") hosts.
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi(
    species_name, debug)
sys.stderr.write("Done!\n")
pis = total_pis / total_pi_opportunities

# Look up each sample's median genomic coverage from the precomputed map.
median_coverages = numpy.array([sample_coverage_map[s] for s in samples])

# Keep only well-covered samples whose within-sample diversity is
# essentially zero ("haploids"); masks combine via elementwise product.
well_covered = median_coverages >= min_coverage
low_diversity = pis <= 1e-03
snp_samples = samples[well_covered * low_diversity]

# Pull SNP data restricted to just those samples.
sys.stderr.write("Loading %s...\n" % species_name)
dummy_samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, debug=debug, allowed_samples=snp_samples)
sys.stderr.write("Done!\n")

# Pairwise counts of fixed differences plus the opportunities for them.
sys.stderr.write("Calculating matrix of snp differences...\n")
snp_difference_matrix, snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(
    allele_counts_map, passed_sites_map, min_change=min_change)
sys.stderr.write("Done!\n")

# Per-pair substitution rate; the 1.0 factor forces float division.
substitution_rate = snp_difference_matrix * 1.0 / snp_opportunity_matrix

# Classify index pairs: same sample, same subject, or different subjects.
same_sample_idxs, same_subject_idxs, diff_subject_idxs = parse_midas_data.calculate_subject_pairs(
    subject_sample_map, snp_samples)
# Accumulators for site-frequency spectra and pi-weighted counts.
nonsynonymous_sfs = []

synonymous_count_sfs = []
nonsynonymous_count_sfs = []

synonymous_pi_weighted_counts = 0
nonsynonymous_pi_weighted_counts = 0

# Walk the SNP file chunk by chunk; presumably parse_snps returns a
# negative final_line_number once the file is exhausted — confirm.
final_line_number = 0
while final_line_number >= 0:

    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    # Restricted to the largest clade's samples and to core genes.
    snp_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
        species_name,
        debug=debug,
        allowed_variant_types=allowed_variant_types,
        allowed_samples=largest_clade_samples,
        allowed_genes=core_genes,
        chunk_size=chunk_size,
        initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

    # Calculate fixation matrix
    sys.stderr.write("Calculating matrix of snp differences...\n")
    chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(
        allele_counts_map,
        passed_sites_map,
        allowed_genes=core_genes,
        min_change=min_change)
    sys.stderr.write("Done!\n")

    # NOTE(review): body of this if-statement missing — chunk truncated.
    if snp_difference_matrix.shape[0] == 0:
import pylab
import sys
import numpy
from calculate_pi_matrix import calculate_self_pis
import os
# Species to analyze, taken from the command line (Python 2 script).
species = sys.argv[1]

data_directory = os.path.expanduser("~/ben_nandita_hmp_data/")
analysis_directory = os.path.expanduser("~/ben_nandita_hmp_analysis/")

default_directory_prefix = data_directory

# Python 2 print statement.
print species

sys.stderr.write("Loading %s...\n" % species)

# NOTE(review): parse_midas_data is not imported in this chunk — confirm it
# is brought into scope elsewhere in the file.
samples, allele_counts_syn, locations_syn, genes_syn, passed_sites_syn, allele_counts_non, locations_non, genes_non, passed_sites_non = parse_midas_data.parse_snps(
    species,
    site_depth_threshold=15,
    directory_prefix=default_directory_prefix)

sys.stderr.write("Done!\n")

sys.stderr.write("Calculating pis...\n")
piS = calculate_self_pis(allele_counts_syn)
# The (passed_sites_syn == 0) term avoids division by zero: entries with no
# passed sites divide by 1 instead (numerator presumably 0 there — confirm).
piS /= (passed_sites_syn + (passed_sites_syn == 0))
sys.stderr.write("Done!\n")

# Emit one "sample pi" line per sample (Python 2 print).
for sample, pi in zip(samples, piS):
    print sample, pi
# 示例#8 (Example #8 — stitched-chunk marker, not executable code)
# 0
# Load genomic coverage distributions
sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution(
    species_name)
# Median of each sample's nonzero coverage histogram.
median_coverages = numpy.array([
    stats_utils.calculate_nonzero_median_from_histogram(
        sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])
# sample id -> median coverage (Python 2 xrange).
sample_coverage_map = {
    samples[i]: median_coverages[i]
    for i in xrange(0, len(samples))
}

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map, last_line = parse_midas_data.parse_snps(
    species_name, debug)
sys.stderr.write("Done!\n")

# Re-derive coverages for the (possibly reordered/filtered) samples
# returned by parse_snps, using the map built above.
median_coverages = numpy.array(
    [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

# Calculate full matrix of synonymous pairwise differences
sys.stderr.write("Calculate synonymous pi matrix...\n")
pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')
# Diagonal = within-sample pi.
pis = numpy.diag(pi_matrix_syn)

# Calculate fixation matrix
# NOTE(review): this call is unterminated — chunk truncated here.
fixation_matrix_syn, persite_fixation_matrix_syn = diversity_utils.calculate_fixation_matrix(
    allele_counts_map,
    passed_sites_map,