# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load time metadata (covers all samples; pruned further below)
subject_sample_time_map_all_samples = (
    parse_midas_data.parse_subject_sample_time_map())

######################
# Load coverage data #
######################

# Load genomic coverage distributions
sample_coverage_histograms, samples = (
    parse_midas_data.parse_coverage_distribution(species_name))

# One median per coverage histogram, in the order the histograms came back.
median_coverages = numpy.array([
    stats_utils.calculate_median_from_histogram(sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])

# Lookup from sample to its median coverage; entries are paired by position.
# NOTE(review): presumes `samples` lines up one-to-one with the histograms
# returned alongside it -- confirm against parse_coverage_distribution.
sample_coverage_map = {
    sample: median_coverages[idx] for idx, sample in enumerate(samples)
}

# Prune the time metadata so that the highest-coverage sample is retained for
# those subjects with more than one sample per time point.
subject_sample_time_map = parse_midas_data.prune_subject_sample_time_map(
    subject_sample_time_map_all_samples, sample_coverage_map)

###############################################################
# Compute Pi within patients to figure out which are haploid #
from parsers import parse_midas_data

# The species to analyse is the first command-line argument.
species_name = sys.argv[1]
min_change = 0.8

# Load list of metaphlan2 genes
metaphlan2_genes = parse_midas_data.load_metaphlan2_genes(species_name)

# Load subject and sample metadata
sys.stderr.write("Loading HMP metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Load genomic coverage distributions
sample_coverage_histograms, samples = (
    parse_midas_data.parse_coverage_distribution(
        species_name, combination_type="sample"))

# One median per coverage histogram, in the order the histograms came back.
median_coverages = numpy.array([
    stats_utils.calculate_median_from_histogram(sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])

# Lookup from sample to its median coverage; entries are paired by position.
sample_coverage_map = {
    sample: median_coverages[idx] for idx, sample in enumerate(samples)
}

# Load SNP information for species_name.
# Note that `samples` is rebound here to the list returned by parse_snps;
# sample_coverage_map above was built from the coverage-distribution list.
sys.stderr.write("Loading %s...\n" % species_name)
samples, allele_counts_map, passed_sites_map = parse_midas_data.parse_snps(
    species_name, combination_type="sample", debug=False)
sys.stderr.write("Done!\n")