Пример #1
0
Файл: VCF.py Проект: mebh/pypgen
def summarize_population_sizes(dict_of_sizes):
    """Summarize per-population sample counts across a slice of SNPs.

    :param dict_of_sizes: mapping of population name -> list of per-SNP
        sample counts (e.g. a ``defaultdict(list)`` filled per slice).
    :return: flat dict keyed ``'<pop>.sample_count.mean'`` and
        ``'<pop>.sample_count.stdev'`` holding the summary statistics.
    """
    results = {}
    # .items() works on both Python 2 and 3; the original's Py2-only
    # .iteritems() (and a stray trailing comma in the loop target) removed.
    for pop, sizes in dict_of_sizes.items():
        results[pop + '.sample_count.mean'] = fstats._mean_(sizes)
        results[pop + '.sample_count.stdev'] = fstats._stdev_(sizes)

    return results
Пример #2
0
def summarize_population_sizes(dict_of_sizes):
    """Reduce lists of per-SNP sample counts to mean/stdev per population.

    :param dict_of_sizes: mapping of population name -> list of sample
        counts observed at each SNP.
    :return: dict with keys ``'<pop>.sample_count.mean'`` and
        ``'<pop>.sample_count.stdev'`` for every population.
    """
    results = {}
    # Use .items() rather than the Python-2-only .iteritems(); also drop
    # the spurious trailing comma the original had in the loop target.
    for pop, sizes in dict_of_sizes.items():
        results[pop + '.sample_count.mean'] = fstats._mean_(sizes)
        results[pop + '.sample_count.stdev'] = fstats._stdev_(sizes)

    return results
Пример #3
0
Файл: VCF.py Проект: mebh/pypgen
def calc_slice_stats(data):
    """Main function for calculating statistics over one tabix slice.

       Make it easy to add more statistics.

    :param data: 7-tuple ``(tabix_slice, chrm, start, stop, populations,
        header, min_samples)``; ``min_samples`` is unpacked but not used
        here (kept for tuple-shape compatibility with the caller).
    :return: ``None`` for empty or unusable slices, otherwise a 3-tuple
        ``([chrm, start, stop, snp_count, mean_depth, stdev_depth],
        pop_size_statistics, multilocus_f_statistics)``.
    """

    tabix_slice, chrm, start, stop, populations, header, min_samples = data

    #progress_meter(starting_time, chrm, stop, bp_processed, total_bp_in_dataset)

    # Guard clause: skip empty alignments ('is None' instead of '== None').
    if tabix_slice is None or len(tabix_slice) == 0:
        return None

    # Accumulators for the final output line.
    output_line = [chrm, start, stop]
    total_depth = []
    snp_wise_f_statistics = []
    population_sizes = defaultdict(list)

    Hs_est_dict = {}
    Ht_est_dict = {}
    snp_count = 0

    for count, line in enumerate(tabix_slice):

        vcf_line_dict = parse_vcf_line(line, header)

        # CREATE FILTERS HERE: only PASS sites are analyzed.
        if vcf_line_dict["FILTER"] != 'PASS':
            continue

        # COUNT SAMPLES IN EACH POPULATION
        # .items() is Python 2/3 compatible (original used .iteritems()).
        for pop, size in get_population_sizes(vcf_line_dict,
                                              populations).items():
            population_sizes[pop].append(size)

        # CALCULATE SNPWISE F-STATISTICS
        allele_counts = calc_allele_counts(populations, vcf_line_dict)
        f_statistics = calc_fstats(allele_counts)

        # UPDATE Hs AND Ht DICTIONARIES
        Hs_est_dict, Ht_est_dict = update_Hs_and_Ht_dicts(
            f_statistics, Hs_est_dict, Ht_est_dict)
        f_statistics['LOCATION'] = (chrm, start, stop)
        snp_wise_f_statistics.append(f_statistics)

        total_depth.append(int(vcf_line_dict['INFO']["DP"]))
        # NOTE(review): this stores the enumerate index of the last PASS
        # line, not a count of PASS SNPs — preserved as-is; confirm intent.
        snp_count = count

    # SUMMARIZE POPULATION WIDE STATISTICS
    pop_size_statistics = summarize_population_sizes(population_sizes)
    multilocus_f_statistics = calc_multilocus_f_statistics(
        Hs_est_dict, Ht_est_dict)

    # SKIP SAMPLES WITH TOO MANY NANs
    # list() makes the dict view indexable on Python 3 as well as 2.
    multilocus_values = list(multilocus_f_statistics.values())
    if len(multilocus_values) == 0 or multilocus_values[0] is None:
        return None

    # UPDATE OUTPUT LINE WITH DEPTH INFO (the original built this list
    # and then rebuilt the identical list inline in the return; reuse it).
    output_line += [
        snp_count,
        fstats._mean_(total_depth),
        fstats._stdev_(total_depth)
    ]

    return (output_line, pop_size_statistics, multilocus_f_statistics)
Пример #4
0
def calc_slice_stats(data):
    """Main function for calculating statistics over one tabix slice.

       Make it easy to add more statistics.

    :param data: 7-tuple ``(tabix_slice, chrm, start, stop, populations,
        header, min_samples)``; ``min_samples`` is unpacked but unused
        (kept so the caller's tuple shape is unchanged).
    :return: ``None`` when the slice is empty or yields no usable
        multilocus statistics, otherwise ``([chrm, start, stop, snp_count,
        mean_depth, stdev_depth], pop_size_statistics,
        multilocus_f_statistics)``.
    """

    tabix_slice, chrm, start, stop, populations, header, min_samples = data

    #progress_meter(starting_time, chrm, stop, bp_processed, total_bp_in_dataset)

    # Skip empty alignments; identity comparison ('is None'), not '== None'.
    if tabix_slice is None or len(tabix_slice) == 0:
        return None

    # Lists to store values for final printing.
    output_line = [chrm, start, stop]
    total_depth = []
    snp_wise_f_statistics = []
    population_sizes = defaultdict(list)

    Hs_est_dict = {}
    Ht_est_dict = {}
    snp_count = 0

    for count, line in enumerate(tabix_slice):

        vcf_line_dict = parse_vcf_line(line, header)

        # CREATE FILTERS HERE: analyze PASS sites only.
        if vcf_line_dict["FILTER"] != 'PASS':
            continue

        # COUNT SAMPLES IN EACH POPULATION
        # .items() instead of the Python-2-only .iteritems().
        for pop, size in get_population_sizes(vcf_line_dict, populations).items():
            population_sizes[pop].append(size)

        # CALCULATE SNPWISE F-STATISTICS
        allele_counts = calc_allele_counts(populations, vcf_line_dict)
        f_statistics = calc_fstats(allele_counts)

        # UPDATE Hs AND Ht DICTIONARIES
        Hs_est_dict, Ht_est_dict = update_Hs_and_Ht_dicts(f_statistics, Hs_est_dict, Ht_est_dict)
        f_statistics['LOCATION'] = (chrm, start, stop)
        snp_wise_f_statistics.append(f_statistics)

        total_depth.append(int(vcf_line_dict['INFO']["DP"]))
        # NOTE(review): records the last PASS line's enumerate index, not
        # the number of PASS SNPs — behavior preserved; confirm intent.
        snp_count = count

    # SUMMARIZE POPULATION WIDE STATISTICS
    pop_size_statistics = summarize_population_sizes(population_sizes)
    multilocus_f_statistics = calc_multilocus_f_statistics(Hs_est_dict, Ht_est_dict)

    # SKIP SAMPLES WITH TOO MANY NANs
    # Materialize the view so [0] works on Python 3 too.
    multilocus_values = list(multilocus_f_statistics.values())
    if not multilocus_values or multilocus_values[0] is None:
        return None

    # UPDATE OUTPUT LINE WITH DEPTH INFO and return it directly — the
    # original rebuilt the same list inline in its return statement.
    output_line += [snp_count, fstats._mean_(total_depth), fstats._stdev_(total_depth)]

    return (output_line, pop_size_statistics, multilocus_f_statistics)