# Per-cohort accumulators: every cohort starts with its own empty list.
replacement_shared_snps = dict((cohort, []) for cohort in cohorts)
replacement_opportunities = dict((cohort, []) for cohort in cohorts)

for species_name in good_species_list:

    sys.stderr.write("\nProcessing %s...\n" % species_name)

    # Step one: enumerate the QP pairs within each cohort.
    sys.stderr.write("Enumerating QP pairs...\n")

    # Every sample known to sample_order_map.
    all_samples = sample_order_map.keys()

    # Samples passing the coverage criteria for this species.
    highcoverage_samples = set(
        diversity_utils.calculate_highcoverage_samples(species_name)
    )

    # Samples passing the QP criteria for this species.
    haploid_samples = set(
        diversity_utils.calculate_haploid_samples(species_name)
    )

    # Debugging aid: compare len(all_samples), len(highcoverage_samples)
    # and len(haploid_samples) here.

    # Skip species with fewer haploid samples than the configured minimum.
    if len(haploid_samples) < config.within_host_min_haploid_sample_size:
        continue

    # Index sets for same-sample, same-subject and different-subject pairs,
    # as returned by sample_utils.calculate_ordered_subject_pairs.
    (same_sample_idxs,
     same_subject_idxs,
     diff_subject_idxs) = sample_utils.calculate_ordered_subject_pairs(
        sample_order_map, all_samples)

    # Reset the counter for this species.
    hmp_sample_size = 0
# Decide which species to process: a single requested species, or the full
# parsed list (cut down to the first three entries when debugging).
good_species_list = parse_midas_data.parse_good_species_list()
if species != 'all':
    good_species_list = [species]
elif debug:
    good_species_list = good_species_list[:3]

# Accumulator for output records (the original note here reads
# "header for the output file").
record_strs = []

for species_name in good_species_list:

    sys.stderr.write("Loading samples...\n")

    # Keep only samples returned by the high-coverage filter at min_coverage
    # (original note: samples above a depth threshold that are confidently
    # phaseable).
    snp_samples = diversity_utils.calculate_highcoverage_samples(
        species_name,
        min_coverage=min_coverage,
    )

    # Fewer than two samples: nothing to do for this species.
    if len(snp_samples) < 2:
        continue

    sys.stderr.write("found %d samples\n" % len(snp_samples))

    # SNPs are analyzed by looping over chunk sizes; clunky, but needed to
    # keep memory usage down on the cluster.

    # Load SNP information for species_name.
    sys.stderr.write("Loading SNPs for %s...\n" % species_name)

    snps = []
    # Maps contig -> list of locations.
    snp_map = {}