# Remember the substitution-rate matrix for this species.
divergence_matrices[species_name] = snp_substitution_matrix

# Flatten the strict upper triangle (i < j) of the matrix into a 1-D array,
# keeping only non-negative entries.
# NOTE(review): negative entries look like "no data" sentinels -- confirm.
between_divergences[species_name] = numpy.array([
    divergence_matrices[species_name][i, j]
    for i in xrange(0, divergence_matrices[species_name].shape[0])
    for j in xrange(i + 1, divergence_matrices[species_name].shape[0])
    if divergence_matrices[species_name][i, j] >= 0
])

# Load the within-sample site frequency spectra for species_name.
sys.stderr.write("Loading SFSs for %s...\t" % species_name)
sfs_samples, sfs_map = parse_midas_data.parse_within_sample_sfs(species_name)
sys.stderr.write("Done!\n")

# NOTE(review): highcoverage_samples is not used within this excerpt; the
# polymorphism rates below are computed for snp_samples instead.
highcoverage_samples = diversity_utils.calculate_highcoverage_samples(
    species_name)
desired_samples = snp_samples

# Fraction of sites that are polymorphic within each desired sample.
within_polymorphisms[species_name] = []
for sample in desired_samples:
    within_sites, between_sites, total_sites = (
        sfs_utils.calculate_polymorphism_rates_from_sfs_map(sfs_map[sample]))
    # Multiplying by 1.0 forces true division under Python 2.
    # NOTE(review): nothing here guards against total_sites == 0.
    within_polymorphisms[species_name].append(
        within_sites * 1.0 / total_sites)

species_names = []
sample_sizes = []
subject_sample_map = parse_HMP_data.parse_subject_sample_map()
sample_order_map = parse_HMP_data.parse_sample_order_map()
sys.stderr.write("Done!\n")

# With no explicit species string, process every "good" species; otherwise
# process only the single species held in species_name.
good_species_list = (parse_midas_data.parse_good_species_list()
                     if other_species_str == "" else [species_name])

# Store all the species' data in a dictionary:
#   key = species
#   value = {} with key = gene, value = number of times the gene shows up
all_data = {}

for species_name in good_species_list:

    dummy_samples, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types={'1D', '2D', '3D', '4D'})

    # Data structures for storing information for pickling later on.
    # NOTE(review): as laid out here these containers are re-created for
    # every species -- confirm they are not meant to live outside the loop.
    all_species_gene_changes = {}
    #all_species_gene_changes_category={}
    all_species_null = {}
    all_data[species_name] = {}

    ####################
    # Analyze the data
    ####################

    # Only plot samples above a certain depth threshold that are "haploids"
    haploid_samples = diversity_utils.calculate_haploid_samples(
        species_name, debug=debug)
    #
species_name) median_coverages = numpy.array([ stats_utils.calculate_nonzero_median_from_histogram( sample_coverage_histogram) for sample_coverage_histogram in sample_coverage_histograms ]) sample_coverage_map = { samples[i]: median_coverages[i] for i in xrange(0, len(samples)) } # Load pi information for species_name sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name) samples, site_map = parse_midas_data.parse_within_sample_sfs( species_name, allowed_variant_types=set(['4D']), allowed_genes=core_genes, debug=debug) sys.stderr.write("Done!\n") median_coverages = numpy.array( [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))]) print[len(site_map[i].keys()) for i in xrange(0, len(site_map))] sys.exit(0) # Only plot samples above a certain depth threshold that are "haploids" snp_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)] num_haploids = len(snp_samples)