def summarize_population_sizes(dict_of_sizes):
    """Summarize per-SNP sample counts for each population.

    Parameters
    ----------
    dict_of_sizes : dict
        Maps a population name to a list of per-SNP sample counts.

    Returns
    -------
    dict
        Flat dict keyed '<pop>.sample_count.mean' and
        '<pop>.sample_count.stdev' with the corresponding statistics
        computed by ``fstats._mean_`` / ``fstats._stdev_``.
    """
    results = {}
    # .items() works on both Python 2 and 3; the original used the
    # Py2-only iteritems() (and had a stray trailing comma in the target).
    for pop, sizes in dict_of_sizes.items():
        results[pop + '.sample_count.mean'] = fstats._mean_(sizes)
        results[pop + '.sample_count.stdev'] = fstats._stdev_(sizes)
    return results
def calc_slice_stats(data):
    """Calculate summary statistics for one tabix slice of VCF lines.

    Main function for calculating statistics; structured so more
    statistics are easy to add.

    Parameters
    ----------
    data : tuple
        ``(tabix_slice, chrm, start, stop, populations, header,
        min_samples)``.  ``min_samples`` is currently unused.

    Returns
    -------
    None when the slice is empty or the multilocus F-statistics could
    not be computed (too many NaNs); otherwise a tuple of::

        ([chrm, start, stop, snp_count, mean_depth, stdev_depth],
         pop_size_statistics, multilocus_f_statistics)
    """
    tabix_slice, chrm, start, stop, populations, header, min_samples = data
    #progress_meter(starting_time, chrm, stop, bp_processed, total_bp_in_dataset)

    # Skip empty alignments.  (Use 'is None', not '== None'.)
    if tabix_slice is None or len(tabix_slice) == 0:
        return None

    # Accumulators for the final result.
    total_depth = []
    snp_wise_f_statistics = []
    population_sizes = defaultdict(list)
    Hs_est_dict = {}
    Ht_est_dict = {}
    snp_count = 0

    for count, line in enumerate(tabix_slice):
        vcf_line_dict = parse_vcf_line(line, header)

        # CREATE FILTERS HERE: only consider sites that passed VCF filters.
        if vcf_line_dict["FILTER"] != 'PASS':
            continue

        # COUNT SAMPLES IN EACH POPULATION
        # (.items() is Py2/Py3-compatible; original used iteritems().)
        for pop, size in get_population_sizes(vcf_line_dict,
                                              populations).items():
            population_sizes[pop].append(size)

        # CALCULATE SNP-WISE F-STATISTICS
        allele_counts = calc_allele_counts(populations, vcf_line_dict)
        f_statistics = calc_fstats(allele_counts)

        # UPDATE Hs AND Ht DICTIONARIES
        Hs_est_dict, Ht_est_dict = update_Hs_and_Ht_dicts(
            f_statistics, Hs_est_dict, Ht_est_dict)

        f_statistics['LOCATION'] = (chrm, start, stop)
        snp_wise_f_statistics.append(f_statistics)
        total_depth.append(int(vcf_line_dict['INFO']["DP"]))

        # NOTE(review): this records the 0-based enumerate index of the
        # last PASS line, not the number of PASS SNPs — confirm intent
        # before changing (kept as-is to preserve output).
        snp_count = count

    # SUMMARIZE POPULATION-WIDE STATISTICS
    pop_size_statistics = summarize_population_sizes(population_sizes)
    multilocus_f_statistics = calc_multilocus_f_statistics(
        Hs_est_dict, Ht_est_dict)

    # SKIP SAMPLES WITH TOO MANY NANs.
    # list() is required on Py3, where dict views aren't indexable.
    f_values = list(multilocus_f_statistics.values())
    if not f_values or f_values[0] is None:
        return None

    # The original also built an 'output_line' list that was never
    # returned (dead code); the returned list below is the real output.
    return ([chrm, start, stop, snp_count,
             fstats._mean_(total_depth), fstats._stdev_(total_depth)],
            pop_size_statistics, multilocus_f_statistics)
# NOTE(review): duplicate definition — calc_slice_stats is already defined
# earlier in this file with essentially identical code; this later copy
# shadows the earlier one at import time.  Consider deleting one copy.
def calc_slice_stats(data):
    """Calculate summary statistics for one tabix slice of VCF lines.

    Main function for calculating statistics; structured so more
    statistics are easy to add.

    Parameters
    ----------
    data : tuple
        ``(tabix_slice, chrm, start, stop, populations, header,
        min_samples)``.  ``min_samples`` is currently unused.

    Returns
    -------
    None when the slice is empty or the multilocus F-statistics could
    not be computed (too many NaNs); otherwise a tuple of::

        ([chrm, start, stop, snp_count, mean_depth, stdev_depth],
         pop_size_statistics, multilocus_f_statistics)
    """
    tabix_slice, chrm, start, stop, populations, header, min_samples = data
    #progress_meter(starting_time, chrm, stop, bp_processed, total_bp_in_dataset)

    # Skip empty alignments.  (Use 'is None', not '== None'.)
    if tabix_slice is None or len(tabix_slice) == 0:
        return None

    # Accumulators for the final result.
    total_depth = []
    snp_wise_f_statistics = []
    population_sizes = defaultdict(list)
    Hs_est_dict = {}
    Ht_est_dict = {}
    snp_count = 0

    for count, line in enumerate(tabix_slice):
        vcf_line_dict = parse_vcf_line(line, header)

        # CREATE FILTERS HERE: only consider sites that passed VCF filters.
        if vcf_line_dict["FILTER"] != 'PASS':
            continue

        # COUNT SAMPLES IN EACH POPULATION
        # (.items() is Py2/Py3-compatible; original used iteritems().)
        for pop, size in get_population_sizes(vcf_line_dict,
                                              populations).items():
            population_sizes[pop].append(size)

        # CALCULATE SNP-WISE F-STATISTICS
        allele_counts = calc_allele_counts(populations, vcf_line_dict)
        f_statistics = calc_fstats(allele_counts)

        # UPDATE Hs AND Ht DICTIONARIES
        Hs_est_dict, Ht_est_dict = update_Hs_and_Ht_dicts(
            f_statistics, Hs_est_dict, Ht_est_dict)

        f_statistics['LOCATION'] = (chrm, start, stop)
        snp_wise_f_statistics.append(f_statistics)
        total_depth.append(int(vcf_line_dict['INFO']["DP"]))

        # NOTE(review): this records the 0-based enumerate index of the
        # last PASS line, not the number of PASS SNPs — confirm intent
        # before changing (kept as-is to preserve output).
        snp_count = count

    # SUMMARIZE POPULATION-WIDE STATISTICS
    pop_size_statistics = summarize_population_sizes(population_sizes)
    multilocus_f_statistics = calc_multilocus_f_statistics(
        Hs_est_dict, Ht_est_dict)

    # SKIP SAMPLES WITH TOO MANY NANs.
    # list() is required on Py3, where dict views aren't indexable.
    f_values = list(multilocus_f_statistics.values())
    if not f_values or f_values[0] is None:
        return None

    # The original also built an 'output_line' list that was never
    # returned (dead code); the returned list below is the real output.
    return ([chrm, start, stop, snp_count,
             fstats._mean_(total_depth), fstats._stdev_(total_depth)],
            pop_size_statistics, multilocus_f_statistics)