示例#1
0
def prune_allrefs(vcf, fout):
    """
    Copies records from vcf to fout, skipping records without non-ref evidence

    Records that is_biallelic() accepts are kept when the first INFO:AC value
    is positive; every other record is kept when INFO:CN_NONREF_FREQ is
    positive.
    """
    for rec in vcf.fetch():
        # Pick the INFO field that matches the record type
        if is_biallelic(rec):
            keep = rec.info['AC'][0] > 0
        else:
            keep = rec.info['CN_NONREF_FREQ'] > 0

        if keep:
            fout.write(rec)
示例#2
0
def drop_nonref_gts(vcf, fout):
    """
    Writes to fout each record where at least one sample has a non-null call

    For records that is_biallelic() accepts, a sample counts when its
    FORMAT:GT is not one of the null/reference genotypes listed below. For
    all other records, a sample counts when its FORMAT:CN differs from 2.
    Each record is written at most once.
    """
    null_gts = [(0, 0), (None, None), (0, ), (None, ), (None, 2)]
    sample_ids = list(vcf.header.samples)

    def _has_nonnull_call(rec, sample_id):
        # Record type is checked before the sample lookup
        if is_biallelic(rec):
            return rec.samples[sample_id]['GT'] not in null_gts
        return rec.samples[sample_id]['CN'] != 2

    for rec in vcf.fetch():
        # any() stops at the first qualifying sample
        if any(_has_nonnull_call(rec, sid) for sid in sample_ids):
            fout.write(rec)
示例#3
0
def calc_allele_freq(record, samples, prefix=None, hemi=False):
    """
    Annotates one record with frequency statistics for a list of samples

    Records that svu.is_biallelic() accepts are summarised from FORMAT:GT and
    receive AN, AC, AF, N_BI_GENOS and the N_*/FREQ_* genotype fields. All
    other records are summarised from FORMAT:CN and receive CN_NUMBER,
    CN_COUNT, CN_FREQ, CN_NONREF_COUNT and CN_NONREF_FREQ.

    Every INFO key is written as '<prefix>_<KEY>' when prefix is truthy and
    as '<KEY>' otherwise.

    With hemi=True, biallelic AN is halved, AC becomes the number of
    genotypes carrying a non-ref allele, and the HEMIREF/HEMIALT fields are
    added; for other records the reference copy number is 1 instead of 2.

    Returns the same record object, which is modified in place.
    """
    tag = prefix + '_' if prefix else ''

    # Multiallelic sites reference FORMAT:CN rather than GT
    if not svu.is_biallelic(record):

        # Collect copy numbers, skipping missing values
        copy_states = []
        for sample in [record.samples[s]['CN'] for s in samples]:
            if sample is not None and sample not in ('.', 'NA'):
                copy_states.append(sample)
        n_called = len(copy_states)

        if n_called == 0:
            cn_counts = (0, )
            cn_freqs = (0, )
            n_nonref = 0
            nonref_freq = 0
        else:
            # Tally samples per copy state, listed from CN=0 up to the max
            tally = Counter(copy_states)
            top_cn = max(int(state) for state in tally)
            cn_counts = [int(tally.get(cn, 0)) for cn in range(top_cn + 1)]
            cn_freqs = [round(count / n_called, 6) for count in cn_counts]

            # Everything except the reference copy state is non-reference
            ref_cn = 1 if hemi else 2
            n_nonref = sum(count for cn, count in enumerate(cn_counts)
                           if cn != ref_cn)
            nonref_freq = round(n_nonref / n_called, 6)

        record.info[tag + 'CN_NUMBER'] = n_called
        record.info[tag + 'CN_COUNT'] = tuple(cn_counts)
        record.info[tag + 'CN_FREQ'] = tuple(cn_freqs)
        record.info[tag + 'CN_NONREF_COUNT'] = n_nonref
        record.info[tag + 'CN_NONREF_FREQ'] = nonref_freq
        return record

    # Biallelic sites: count alleles and genotypes from FORMAT:GT
    genotypes = [record.samples[s]['GT'] for s in samples]

    allele_number = 0
    allele_count = 0
    n_homref = 0
    n_het = 0
    n_homalt = 0
    n_carriers = 0  # Genotypes with at least one alt; used for hemizygous AC
    for gt in genotypes:
        # Classify the called (non-missing) alleles of this genotype once
        called = [a for a in gt if a != '.' and a is not None]
        n_alt = sum(1 for a in called if a != 0)
        n_ref = len(called) - n_alt

        allele_number += len(called)
        allele_count += n_alt

        if gt == (0, 0):
            n_homref += 1
        if n_ref == 1 and n_alt == 1:
            n_het += 1
            n_carriers += 1
        if n_alt == 2:
            n_homalt += 1
            n_carriers += 1

    # Hemizygous sites count non-reference *genotypes*, not alleles
    if hemi:
        allele_number = allele_number / 2
        allele_count = n_carriers

    if allele_number > 0:
        allele_freq = round(allele_count / allele_number, 6)
    else:
        allele_freq = 0

    # Genotype frequencies over the genotypes counted above
    n_genos = n_homref + n_het + n_homalt
    if n_genos > 0:
        freq_homref = n_homref / n_genos
        freq_het = n_het / n_genos
        freq_homalt = n_homalt / n_genos
    else:
        freq_homref = freq_het = freq_homalt = 0

    # INFO fields are assigned in this exact order
    annotations = [
        ('AN', allele_number),
        ('AC', allele_count),
        ('AF', allele_freq),
        ('N_BI_GENOS', n_genos),
    ]
    if hemi:
        annotations += [
            ('N_HEMIREF', n_homref),
            ('N_HEMIALT', n_carriers),
            ('FREQ_HEMIREF', freq_homref),
            ('FREQ_HEMIALT', freq_het + freq_homalt),
        ]
    annotations += [
        ('N_HOMREF', n_homref),
        ('N_HET', n_het),
        ('N_HOMALT', n_homalt),
        ('FREQ_HOMREF', freq_homref),
        ('FREQ_HET', freq_het),
        ('FREQ_HOMALT', freq_homalt),
    ]
    for key, value in annotations:
        record.info[tag + key] = value

    return record
示例#4
0
def gather_allele_freqs(record,
                        samples,
                        males_set,
                        females_set,
                        parbt,
                        pop_dict,
                        pops,
                        sex_chroms,
                        no_combos=False):
    """
    Runs calc_allele_freq for all samples and each sex/population subset

    The record is annotated in place and returned. Male subsets are computed
    with hemi=True when the record lies on one of sex_chroms and was not
    flagged as PAR. INFO:PAR is set when parbt is non-empty and in_par()
    accepts the record. Population x sex combinations are skipped when
    no_combos is set. POPMAX_AF is added for biallelic records when at least
    one population is given.
    """
    on_sex_chrom = record.chrom in sex_chroms

    # Add PAR annotation to record (only checked when parbt is non-empty)
    rec_in_par = False
    if on_sex_chrom and len(parbt) > 0 and in_par(record, parbt):
        rec_in_par = True
        record.info['PAR'] = True

    # Males are treated as hemizygous on sex chromosomes outside of PAR
    hemizygous = on_sex_chrom and not rec_in_par

    def _needs_sex_adjustment():
        # Re-evaluated at every call site, like the original inline condition
        return (hemizygous and svu.is_biallelic(record)
                and len(males_set) + len(females_set) > 0)

    # Frequencies across all samples, then split by sex
    calc_allele_freq(record, samples)
    if len(males_set) > 0:
        calc_allele_freq(record, males_set, prefix='MALE', hemi=hemizygous)
    if len(females_set) > 0:
        calc_allele_freq(record, females_set, prefix='FEMALE')

    # Adjust global allele frequencies on sex chromosomes
    if _needs_sex_adjustment():
        update_sex_freqs(record)

    # Nothing further to do without populations
    if len(pops) == 0:
        return record

    for pop in pops:
        members = [s for s in samples if pop_dict.get(s) == pop]
        calc_allele_freq(record, members, prefix=pop)

        if not no_combos:
            if len(males_set) > 0:
                calc_allele_freq(
                    record,
                    [s for s in members if s in males_set],
                    prefix=pop + '_MALE',
                    hemi=hemizygous)
            if len(females_set) > 0:
                calc_allele_freq(
                    record,
                    [s for s in members if s in females_set],
                    prefix=pop + '_FEMALE')

        # Adjust per-pop allele frequencies on sex chromosomes
        if _needs_sex_adjustment():
            update_sex_freqs(record, pop=pop)

    # POPMAX AF is defined for biallelic sites only
    if svu.is_biallelic(record):
        pop_afs = [record.info['{0}_AF'.format(p)][0] for p in pops]
        record.info['POPMAX_AF'] = max(pop_afs)

    return record
def cleanup_vcf(vcf,
                fout,
                callrates,
                min_callrate_global=0.85,
                min_callrate_smallDels=0.95):
    """
    Cleans FILTER/INFO/QUAL of each record in vcf and writes records to fout

    Per record:
      - every filter named in filts_for_info that is present in FILTER is
        also set as a flag in INFO
      - filters named in filts_to_remove are dropped from FILTER, and PASS is
        added if FILTER ends up empty
      - LOW_CALL_RATE is added when the record ID has an entry in callrates
        below the applicable threshold (min_callrate_smallDels for DEL
        records with 300 < SVLEN < 1000, min_callrate_global otherwise)
      - QUAL is replaced by recal_qual_score() for biallelic records when
        that function returns a value other than None

    Biallelic records are written only if at least one sample GT is not in
    NULL_and_REF_GTs; all other records are always written.
    """
    for record in vcf:
        # Copy selected filters from FILTER to INFO
        for name in filts_for_info:
            if name in record.filter:
                record.info[name] = True

        # Rebuild FILTER without the filters slated for removal
        kept = [name for name in record.filter if name not in filts_to_remove]
        record.filter.clear()
        for name in kept:
            record.filter.add(name)
        if len(record.filter) == 0:
            record.filter.add('PASS')

        # Mark sites with low call rate
        if record.id in callrates:
            callrate = callrates[record.id]

            # SVLEN is only inspected for deletions
            small_del = False
            if record.info['SVTYPE'] == 'DEL':
                svlen = record.info['SVLEN']
                small_del = svlen < 1000 and svlen > 300

            # Small deletions get their own threshold
            if small_del:
                threshold = min_callrate_smallDels
            else:
                threshold = min_callrate_global
            if callrate < threshold:
                record.filter.add('LOW_CALL_RATE')

        # Recalibrate QUAL score for biallelic variants
        if is_biallelic(record):
            recalibrated = recal_qual_score(record)
            if recalibrated is not None:
                record.qual = recalibrated

        # Only biallelic variants are checked for non-empty GTs
        if not is_biallelic(record):
            fout.write(record)
        elif any(record.samples[s]['GT'] not in NULL_and_REF_GTs
                 for s in record.samples):
            fout.write(record)