def main(between_host):
    """Parse and save all the SNPs between QP hosts.

    For each species in the file-level ``desired_species`` list, load its core
    genes and qualifying samples, parse the 4D (synonymous) SNPs restricted to
    those samples/genes, and pickle the results under a per-species checkpoint
    directory.

    Args:
        between_host: if truthy, checkpoints are written under
            'between_hosts_checkpoints'; otherwise 'within_hosts_checkpoints'.
    """
    t0 = time.time()
    if between_host:
        intermediate_file_path = os.path.join(config.analysis_directory,
                                              'between_hosts_checkpoints')
    else:
        intermediate_file_path = os.path.join(config.analysis_directory,
                                              'within_hosts_checkpoints')
    for species_name in desired_species:
        print("Start processing {}".format(species_name))
        core_genes = core_gene_utils.parse_core_genes(species_name)
        desired_samples = get_desired_samples(species_name,
                                              between_host=between_host)
        if desired_samples is None or len(desired_samples) == 0:
            print("{} has no qualified samples".format(species_name))
            continue
        # The existence of the per-species directory doubles as a "done"
        # marker, so a finished species is skipped on re-runs.
        pickle_path = os.path.join(intermediate_file_path, species_name)
        if not os.path.exists(pickle_path):
            print('{} has not been processed'.format(species_name))
            os.mkdir(pickle_path)
        else:
            print('{} already processed'.format(species_name))
            continue
        found_samples, allele_counts_map, passed_sites_map, final_line_number = parse_snps(
            species_name, allowed_samples=desired_samples,
            allowed_genes=core_genes, allowed_variant_types=['4D'])
        # Use context managers so each pickle file is flushed and closed even
        # if a later dump raises (the original left the handles open), and
        # build paths with os.path.join instead of string concatenation.
        with open(os.path.join(pickle_path, 'allele_counts_map.pickle'), 'wb') as fp:
            pickle.dump(allele_counts_map, fp)
        with open(os.path.join(pickle_path, 'found_samples.pickle'), 'wb') as fp:
            pickle.dump(found_samples, fp)
        with open(os.path.join(pickle_path, 'passed_sites_map.pickle'), 'wb') as fp:
            pickle.dump(passed_sites_map, fp)
        print("Done processing {} at {} min".format(species_name,
                                                    (time.time() - t0) / 60))
# Load SNPs in chunks and accumulate per-pair SNP difference / opportunity
# matrices. NOTE(review): this excerpt is the top of a longer while-loop body;
# the accumulation steps after the first `if` continue beyond what is shown.
sys.stderr.write("Loading SNPs for %s...\n" % species_name)
sys.stderr.write("(not just core genes...)\n")

# Empty arrays act as "not yet initialized" sentinels; they are replaced with
# correctly shaped zero matrices once the first chunk's shape is known.
pi_matrix_syn = numpy.array([])
avg_pi_matrix_syn = numpy.array([])
snp_difference_matrix = numpy.array([])
snp_difference_matrix_mutation = numpy.array([])
snp_difference_matrix_reversion = numpy.array([])
snp_opportunity_matrix = numpy.array([])

# parse_snps returns the line number to resume from; a negative value
# signals that the whole SNP file has been consumed.
final_line_number = 0
while final_line_number >= 0:
    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    dummy_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
        species_name, debug=debug, allowed_samples=snp_samples,
        chunk_size=chunk_size, initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))
    print len(dummy_samples), "dummy samples!"

    # Calculate fixation matrix
    sys.stderr.write("Calculating matrix of snp differences...\n")
    chunk_snp_difference_matrix_mutation, chunk_snp_difference_matrix_reversion, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix_mutation_reversion(
        allele_counts_map, passed_sites_map, min_change=min_change)
    # sys.stderr.write("Done!\n")

    # First chunk: allocate the accumulator as a float matrix of the right
    # shape (multiplying by 1.0 forces a float dtype).
    if snp_difference_matrix.shape[0] == 0:
        snp_difference_matrix = numpy.zeros_like(
            chunk_snp_difference_matrix_mutation) * 1.0
pis = total_pis / total_pi_opportunities ###################### # compute median cov # ###################### median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))]) ########################################################## # load SNP info ########################################################## # note that this loads info for all samples. Later the desired samples are selected out. sys.stderr.write("Loading %s...\n" % species_name) samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps( species_name, debug) sys.stderr.write("Done!\n") ################################################################### # compute 1D SFSs ################################################################### sys.stderr.write("Calculate within person SFS...\n") sample_freqs, passed_sites = diversity_utils.calculate_sample_freqs( allele_counts_map, passed_sites_map, variant_type='4D', fold=True) sfss = [] bins = numpy.linspace(0.04, 0.95, 21) bin_locations = bins[1:] - (bins[1] - bins[0]) / 2 for j in xrange(0, len(samples)):
# Script setup: select low-diversity ("haploid-like"), subject-unique samples
# for a species, then prepare distance bins for LD computations.
import pylab
import sys
import numpy

from utils import diversity_utils
from parsers import parse_midas_data
from numpy.random import choice

# Species to analyze is the first command-line argument.
species_name = sys.argv[1]

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load SNP information for species_name
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, combination_type="sample", debug=False)
sys.stderr.write("Done!\n")

pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix(
    allele_counts_map, passed_sites_map, variant_type='4D')
# Samples whose within-sample synonymous diversity (diagonal of the avg pi
# matrix) is below 1e-3 are treated as low diversity.
low_diversity_samples = (numpy.diag(avg_pi_matrix_syn) < 1e-03)
unique_samples = parse_midas_data.calculate_unique_samples(
    subject_sample_map, samples)
# Elementwise product of the two boolean masks = logical AND.
desired_samples = unique_samples * low_diversity_samples

# initialize distance bins for LD computations
# NOTE(review): this call is truncated in this excerpt -- the closing
# arguments/parenthesis continue beyond what is shown.
distance_bins = numpy.logspace(
    0, 4, 20
# Load pi information for species_name sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name) samples, total_pis, total_pi_opportunities = parse_midas_data.parse_within_sample_pi( species_name, debug) sys.stderr.write("Done!\n") pis = total_pis / total_pi_opportunities median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))]) # Only plot samples above a certain depth threshold that are "haploids" snp_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)] # Load SNP information for species_name sys.stderr.write("Loading %s...\n" % species_name) dummy_samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps( species_name, debug=debug, allowed_samples=snp_samples) sys.stderr.write("Done!\n") # Calculate fixation matrix sys.stderr.write("Calculating matrix of snp differences...\n") snp_difference_matrix, snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix( allele_counts_map, passed_sites_map, min_change=min_change) sys.stderr.write("Done!\n") substitution_rate = snp_difference_matrix * 1.0 / snp_opportunity_matrix # Calculate which pairs of idxs belong to the same sample, which to the same subject # and which to different subjects same_sample_idxs, same_subject_idxs, diff_subject_idxs = parse_midas_data.calculate_subject_pairs( subject_sample_map, snp_samples)
# Accumulators for clade-restricted SFS / pi-weighted counts.
nonsynonymous_sfs = []
synonymous_count_sfs = []
nonsynonymous_count_sfs = []
synonymous_pi_weighted_counts = 0
nonsynonymous_pi_weighted_counts = 0

# parse_snps returns the line number to resume from; a negative value signals
# that the whole SNP file has been consumed.
final_line_number = 0
while final_line_number >= 0:
    sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
    # Restrict to the largest clade's samples and the species' core genes.
    snp_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(
        species_name, debug=debug, allowed_variant_types=allowed_variant_types,
        allowed_samples=largest_clade_samples, allowed_genes=core_genes,
        chunk_size=chunk_size, initial_line_number=final_line_number)
    sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

    # Calculate fixation matrix
    sys.stderr.write("Calculating matrix of snp differences...\n")
    chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(
        allele_counts_map, passed_sites_map, allowed_genes=core_genes,
        min_change=min_change)
    sys.stderr.write("Done!\n")

    # First chunk: the accumulator matrix is presumably allocated here.
    # NOTE(review): the body of this `if` continues beyond this excerpt.
    if snp_difference_matrix.shape[0] == 0:
# Compute per-sample synonymous pi (piS) for one species and print one
# "<sample> <pi>" line per sample to stdout. Progress goes to stderr.
import pylab
import sys
import numpy
import os

from calculate_pi_matrix import calculate_self_pis
# Bug fix: the original used parse_midas_data.parse_snps below without ever
# importing parse_midas_data, which raised a NameError at runtime.
import parse_midas_data

# Species to analyze is the first command-line argument.
species = sys.argv[1]

data_directory = os.path.expanduser("~/ben_nandita_hmp_data/")
analysis_directory = os.path.expanduser("~/ben_nandita_hmp_analysis/")
default_directory_prefix = data_directory

print(species)
sys.stderr.write("Loading %s...\n" % species)
samples, allele_counts_syn, locations_syn, genes_syn, passed_sites_syn, allele_counts_non, locations_non, genes_non, passed_sites_non = parse_midas_data.parse_snps(
    species, site_depth_threshold=15,
    directory_prefix=default_directory_prefix)
sys.stderr.write("Done!\n")

sys.stderr.write("Calculating pis...\n")
piS = calculate_self_pis(allele_counts_syn)
# Guard against division by zero: where passed_sites_syn == 0 the denominator
# becomes 1 (and the numerator is necessarily 0 there).
piS /= (passed_sites_syn + (passed_sites_syn == 0))
sys.stderr.write("Done!\n")

# Emit one "<sample> <pi>" line per sample. %-formatting keeps the output
# identical to the original Python 2 `print sample, pi` statement.
for sample, pi in zip(samples, piS):
    print("%s %s" % (sample, pi))
# Load genomic coverage distributions sample_coverage_histograms, samples = parse_midas_data.parse_coverage_distribution( species_name) median_coverages = numpy.array([ stats_utils.calculate_nonzero_median_from_histogram( sample_coverage_histogram) for sample_coverage_histogram in sample_coverage_histograms ]) sample_coverage_map = { samples[i]: median_coverages[i] for i in xrange(0, len(samples)) } # Load SNP information for species_name sys.stderr.write("Loading %s...\n" % species_name) samples, allele_counts_map, passed_sites_map, last_line = parse_midas_data.parse_snps( species_name, debug) sys.stderr.write("Done!\n") median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))]) # Calculate full matrix of synonymous pairwise differences sys.stderr.write("Calculate synonymous pi matrix...\n") pi_matrix_syn, avg_pi_matrix_syn = diversity_utils.calculate_pi_matrix( allele_counts_map, passed_sites_map, variant_type='4D') pis = numpy.diag(pi_matrix_syn) # Calculate fixation matrix fixation_matrix_syn, persite_fixation_matrix_syn = diversity_utils.calculate_fixation_matrix( allele_counts_map, passed_sites_map,