Пример #1
0
def get_filtered_phased_het_trio_variants(trio_vcf, trio_filtered_het_phased_vcf, sample_name):
    """Write one sample's PASS, phased, heterozygous calls to a new VCF.

    The input is restricted to ``sample_name``. A record is kept when its
    first FILTER entry is ``PASS``, the sample's genotype is phased, and the
    first two genotype alleles differ. Every kept record gets its ``PS``
    FORMAT value set to 1 before it is written.

    Args:
        trio_vcf: Path of the input VCF/BCF.
        trio_filtered_het_phased_vcf: Path of the VCF to write.
        sample_name: Name of the sample to keep.

    Returns:
        Always 0.
    """
    # Context managers guarantee the output is flushed and both handles are
    # closed even if a record raises midway through.
    with VariantFile(trio_vcf) as vcf_in:
        # Must happen before the output is created: the output reuses the
        # input header.
        vcf_in.subset_samples([sample_name])
        with VariantFile(trio_filtered_het_phased_vcf, 'w', header=vcf_in.header) as vcf_out:
            for rec in vcf_in.fetch():
                filters = list(rec.filter.keys())
                # Records with no FILTER set have an empty key list; indexing
                # it directly would raise IndexError.
                if not filters or filters[0] != 'PASS':
                    continue

                rec_sample = rec.samples[0]
                gt = rec_sample['GT']
                # Haploid or truncated genotypes cannot be heterozygous.
                if len(gt) < 2 or not rec_sample.phased or gt[0] == gt[1]:
                    continue

                # NOTE(review): presumably the input header already declares
                # the PS FORMAT field -- confirm against the producer.
                rec_sample.update({'PS': 1})
                vcf_out.write(rec)
    return 0
Пример #2
0
def read_vcf(vcf, sample):
    """Collect one sample's heterozygous calls into a pandas DataFrame.

    Records whose genotype consists of a single distinct allele are skipped.
    Each remaining record contributes one row.

    Args:
        vcf: Path of the VCF/BCF to read (format is auto-detected).
        sample: Name of the sample to extract.

    Returns:
        A DataFrame with the columns CHROM, POS, TYPE, DP, MAF, GT, QUAL and
        ALS. TYPE is "U" when a called allele is ``<NON_REF>``, "I" when any
        called allele is longer than one character or is ``*``, and "S"
        otherwise. MAF is the smallest AD value divided by DP. DP and MAF are
        ``None`` when they cannot be obtained.
    """
    columns = ("CHROM", "POS", "TYPE", "DP", "MAF", "GT", "QUAL", "ALS")
    variant_sites = {name: [] for name in columns}

    with VariantFile(vcf) as vcf_in:  # auto-detect input format
        vcf_in.subset_samples([sample])

        for i, rec in enumerate(vcf_in):
            if i % 200000 == 0:
                print("Elapsed records: {}".format(i))

            call = rec.samples[sample]
            gt = call["GT"]
            if len(set(gt)) == 1:
                continue  # skip homozygous calls

            # Alleles of the record that the sample's genotype refers to.
            als = [allele for n, allele in enumerate(rec.alleles) if n in gt]

            if "<NON_REF>" in als:  # undefined allele
                vtype = "U"
            elif any(len(allele) > 1 for allele in als) or "*" in als:  # indel
                vtype = "I"
            else:  # neither non-ref nor an indel
                vtype = "S"

            try:
                dp = call["DP"]
            except (KeyError, ValueError):
                dp = None

            # MAF is resolved independently of DP so that every column gets
            # exactly one value per record; otherwise a missing DP leaves the
            # MAF column shorter than the others and the DataFrame
            # construction fails.
            try:
                min_af = float(min(call["AD"]) / dp)
            except (KeyError, TypeError, ValueError, ZeroDivisionError):
                # AD absent, AD/DP holding missing values, or DP == 0.
                min_af = None

            variant_sites["CHROM"].append(rec.chrom)
            variant_sites["POS"].append(rec.pos)
            variant_sites["TYPE"].append(vtype)
            variant_sites["DP"].append(dp)
            variant_sites["MAF"].append(min_af)
            variant_sites["GT"].append(gt)
            variant_sites["QUAL"].append(rec.qual)
            variant_sites["ALS"].append(als)

    return pd.DataFrame.from_dict(variant_sites)
Пример #3
0
def get_hets(vcf, sample):
    """Yield the heterozygous STR and SNP records of one sample.

    A record is yielded when the first two genotype alleles of ``sample``
    differ and either both called alleles have the same length (treated as a
    SNP) or the record ID starts with ``STR_``. Calls with fewer than two
    alleles or with a missing allele are skipped.

    Args:
        vcf: Path of the VCF/BCF to read.
        sample: Name of the sample to inspect.

    Yields:
        The matching variant records, in file order.
    """
    # The file stays open while the generator is being consumed and is closed
    # once it is exhausted or closed.
    with VariantFile(vcf) as vcf_in:
        vcf_in.subset_samples([sample])
        # TODO: make sure you're considering cases where the POS is duplicated
        for rec in vcf_in.fetch():
            call = rec.samples[sample]
            gt = call['GT']
            # Only heterozygous calls are of interest; checking this first
            # also avoids touching the alleles of homozygous/missing calls.
            if len(gt) < 2 or gt[0] == gt[1]:
                continue

            first, second = call.alleles[:2]
            if first is None or second is None:
                continue  # half-missing genotype: no allele length to compare

            same_length = len(first) == len(second)
            # rec.id is None for records without an ID.
            if same_length or (rec.id or "").startswith("STR_"):
                yield rec
Пример #4
0
def main(varfile, keep, outprefix):
    """Write a SweepFinder2 allele frequency file for a subset of samples.

    The input VCF must not contain missing genotypes. Sites whose alternate
    allele count is 0 are left out of the output.

    Args:
        varfile: Path of the input VCF/BCF.
        keep: Path of a text file listing the samples to keep, one per line.
        outprefix: Output prefix; the result is written to
            ``<outprefix>.SF``.
    """
    with VariantFile(varfile) as varin:
        with open(keep) as keep_file:
            # Blank lines would otherwise turn into empty sample names.
            samples = [line.strip() for line in keep_file if line.strip()]
        varin.subset_samples(samples)
        print(f'keep samples:\n{samples}')
        ss = len(samples) * 2  # sample size; assumes two alleles per sample

        with open(f'{outprefix}.SF', 'w') as f:
            f.write('position\tx\tn\tfolded\n')
            for rec in varin.fetch():
                gts = [s['GT'] for s in rec.samples.values()]
                gts = np.array(gts, dtype='int8').flatten()
                # Count non-reference alleles rather than summing allele
                # indices, so a multi-allelic index (2, 3, ...) still counts
                # as a single alternate allele and x never exceeds n.
                ac = np.count_nonzero(gts)  # alt allele count
                if ac > 0:
                    f.write(f'{rec.pos}\t{ac}\t{ss}\t1\n')