def annotate_false_negs(folder):
    """
    Get information for any false negative results - returns basic variant info plus quality and genotype.

    Every record of ``<folder>/0000.vcf`` is reported. The genotype is read from the sample named
    ``INTEGRATION``, which is hard-coded here (presumably the truth-set sample name; verify against the VCF).
    No coverage information is collected by this function.

    :param folder: Folder containing output from bcftools isec
    :return: array of variant dictionaries containing information on false negatives
    """
    false_negs = VariantFile(folder + '/0000.vcf')
    try:
        # Single pass over the file: the count that is printed is simply the number of collected records.
        variants = [
            {'chrom': rec.contig,
             'pos': int(rec.pos),
             'ref': rec.alleles[0],
             'alt': rec.alleles[1],
             'QUAL': rec.qual,
             'GT': rec.samples['INTEGRATION']['GT']}
            for rec in false_negs.fetch()
        ]
    finally:
        false_negs.close()

    # print(<single argument>) behaves the same under Python 2 and Python 3.
    print(len(variants))
    if variants:
        print('false negatives')
    else:
        print('no false negatives')
    return variants
def _lookup_snv_coverage(coverage_file, contig, position, ref, alt):
    """Return the ``coverage`` dictionary of a single-base variant, looked up with grep in coverage_file."""
    # Anchor the pattern on both sides so that e.g. 'chr1 123' cannot match the line for 'chr1 1234'.
    pattern = '^' + contig + r'\s' + str(position) + r'\s'
    try:
        # Argument list (no shell): contig names and file paths are never interpreted by a shell.
        line = subprocess.check_output(['grep', '-e', pattern, '--', coverage_file],
                                       universal_newlines=True)
    except subprocess.CalledProcessError as e:
        if e.returncode != 1:
            print('Error executing command: ' + str(e.returncode))
            raise SystemExit(1)
        # grep exits with status 1 when nothing matched: that is "no coverage", not an error.
        line = ''

    fields = line.split()
    if not fields:
        return {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}

    # Column index of the per-base counts in a coverage line.
    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}

    def base_count(allele):
        # Single-character alleles other than A/C/G/T (e.g. N or *) have no count column.
        return fields[bases[allele]] if allele in bases else 'N/A'

    return {'total': fields[2], 'ref': base_count(ref), 'alt': base_count(alt)}


def annotate_false_pos(folder, coverage_file, sample):
    """
    Get information for any false positive results - returns basic variant info plus quality, genotype,
    VCF depth (DP and, when present, AD) and coverage (total, ref base and alt base if appropriate).

    Records are read from ``<folder>/0001.vcf``. Coverage is only looked up when both the reference and the
    first alternate allele are a single character; other records are reported as indels without coverage.
    The coverage file is searched for ``rec.pos - 1`` (presumably because its positions are 0-based;
    verify against the tool that produced it).

    :param folder: Folder containing output from bcftools isec
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :param sample: container ID used in vcf file
    :return: array of variant dictionaries containing information on false positives
    """
    false_pos = VariantFile(folder + '/0001.vcf')
    variants = []
    try:
        for rec in false_pos.fetch():
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            call = rec.samples[sample]
            genotype = call['GT']
            if 'AD' in call.keys():
                allelic_depth = call['AD']
            else:
                allelic_depth = 'N/A'
            total_depth = call['DP']

            if len(ref) == 1 and len(alt) == 1:
                coverage = _lookup_snv_coverage(coverage_file, rec.contig, rec.pos - 1, ref, alt)
            else:
                coverage = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}

            variants.append({'chrom': rec.contig, 'pos': int(rec.pos), 'ref': ref, 'alt': alt,
                             'QUAL': rec.qual, 'GT': genotype,
                             'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                             'coverage': coverage})
    finally:
        false_pos.close()

    # print(<single argument>) behaves the same under Python 2 and Python 3.
    print(len(variants))
    if variants:
        print('false positives')
    else:
        print('no false positives')
    return variants
def _same_genotype(first, second):
    """Return True when two genotype tuples hold the same alleles, ignoring allele order."""
    def allele_key(allele):
        # None (missing allele) cannot be ordered against ints on Python 3, so sort it last explicitly.
        return (allele is None, 0 if allele is None else allele)

    return sorted(first, key=allele_key) == sorted(second, key=allele_key)


def check_genotype(folder, sample):
    """
    Compares the genotype for all shared variants

    Genotypes of ``<folder>/0002.vcf`` (sample ``INTEGRATION``, hard-coded) are indexed by
    (contig, position, alleles) and compared with the genotypes of ``sample`` in ``<folder>/0003.vcf``.
    Two genotypes match when they contain the same alleles regardless of order, so 0/1 equals 1/0 and
    1/2 equals 2/1.

    :param folder: location of results from the NGS analysis pipeline
    :param sample: sample number (used in vcf file)
    :return: dictionary of number of matching variants and detailed information for any with mismatching
             genotypes
    :raises KeyError: if a record of 0003.vcf has no counterpart in 0002.vcf
    """
    shared_giab = VariantFile(folder + '/0002.vcf')
    shared_patient = VariantFile(folder + '/0003.vcf')
    variants = []
    matching = 0
    try:
        giab_genotypes = {}
        for rec in shared_giab.fetch():
            key = (rec.contig, rec.pos, rec.alleles)
            # Keep the first genotype seen for a given site/allele combination.
            if key not in giab_genotypes:
                giab_genotypes[key] = rec.samples['INTEGRATION']['GT']

        for rec in shared_patient.fetch():
            alleles = rec.alleles
            call = rec.samples[sample]
            if 'AD' in call.keys():
                allelic_depth = call['AD']
            else:
                allelic_depth = 'N/A'
            total_depth = call['DP']
            giab_genotype = giab_genotypes[(rec.contig, rec.pos, alleles)]
            sample_genotype = call['GT']

            if _same_genotype(sample_genotype, giab_genotype):
                matching += 1
            else:
                variants.append({'chrom': rec.contig, 'pos': rec.pos, 'ref': alleles[0], 'alt': alleles[1],
                                 'QUAL': rec.qual,
                                 'GT': {sample: sample_genotype, 'GIAB': giab_genotype},
                                 'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}})
    finally:
        shared_giab.close()
        shared_patient.close()

    # print(<single argument>) behaves the same under Python 2 and Python 3.
    print(str(matching) + ' matching variants')
    results = {'matching': matching, 'mismatching': variants}
    print(results)
    return results
def gen_report(vcf, sample, ref_flag):
    """Write ``<sample>.germline.vep91.xls`` with one prioritized annotation row per VCF record.

    The positions of the wanted annotation fields are read from the ``ANN`` INFO description in the VCF
    header and handed, with each record, to ``output_highest_impact``, which writes the row.

    :param vcf: path of the annotated VCF file
    :param sample: prefix of the report file name
    :param ref_flag: ``'n'``, or a value passed to ``create_index`` whose result is forwarded to
                     ``output_highest_impact``
    :return: 0
    """
    vcf_in = VariantFile(vcf)
    # run cadd twice over snv and indel file
    # (hence CADD_PHRED collects a list of field positions instead of a single one)
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'HGVSg': 0, 'Protein_position': 0,
               'Amino_acids': 0, 'Codons': 0, 'BIOTYPE': 0, 'SIFT': 0, 'Existing_variation': 0,
               'VARIANT_CLASS': 0, 'gnomAD_AF': 0, 'CLIN_SIG': 0, 'CADD_PHRED': []}

    # The context manager closes the report even when a record fails half way through.
    with open(sample + '.germline.vep91.xls', 'w') as out:
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')

        # Map every wanted field name to its position in the '|'-separated annotation.
        for i, field in enumerate(desc_string.split('|')):
            if field not in desired:
                continue
            if field == 'CADD_PHRED':
                desired[field].append(i)
            else:
                desired[field] = i

        out.write('CHROM\tPOS\tREF\tAllele\tTotal Allele Count\tTotal Position Coverage\tGene\tHGVSg\tTranscript_id'
                  '\tEffect\tIMPACT\tBIOTYPE\tCodons\tAmino_acids\tExisting_variation\tVARIANT_CLASS\tSIFT\tgnomAD_AF'
                  '\tCLIN_SIG\tCADD_PHRED\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)

        for record in vcf_in.fetch():
            (chrom, pos, ref, alt, alt_ct, tot_ct) = (record.contig, str(record.pos), record.ref,
                                                      record.alts[0], str(record.info['TR']),
                                                      str(record.info['TC']))
            ann_list = [_.split('|') for _ in record.info['ANN']]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, tot_ct, ann_list, desired, out, ref_flag)
    return 0
def gen_report(vcf):
    """Write ``<prefix>.indels.vep.prioritized_impact.report.xls`` for an annotated indel VCF.

    ``<prefix>`` is the part of the VCF base name before the first dot. The positions of the wanted
    annotation fields are read from the ``ANN`` INFO description in the VCF header and handed, with each
    record, to ``output_highest_impact``, which writes the row. Progress is logged to
    ``LOGS/<prefix>.indels.vep_priority.report.log``.

    :param vcf: path of the annotated VCF file
    :return: 0
    """
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0,
               'Amino_acids': 0, 'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0,
               'VARIANT_CLASS': 0}

    # The context manager closes the report even when a record fails half way through.
    with open(parts[0] + '.indels.vep.prioritized_impact.report.xls', 'w') as out:
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')

        # Map every wanted field name to its position in the '|'-separated annotation.
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i

        out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
                  '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')

        for record in vcf_in.fetch():
            # NOTE(review): alt_ct is read from MINCOV and non_alt_ct from ALTCOV - confirm that this
            # pairing of INFO keys and column names is intended.
            (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (record.contig, str(record.pos), record.ref,
                                                               record.alts[0], str(record.info['MINCOV']),
                                                               str(record.info['ALTCOV']),
                                                               str(record.info['COVRATIO']))
            ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def annotate_false_pos(folder, sample):
    """
    Get information for any false positive results - returns basic variant info plus quality, genotype and
    the depth values stored in the VCF (DP and, when present, AD).

    Records are read from ``<folder>/0001.vcf``.

    :param folder: Folder containing output from bcftools isec
    :param sample: container ID used in vcf file
    :return: array of variant dictionaries containing information on false positives
    """
    false_pos = VariantFile(folder + '/0001.vcf')
    variants = []
    try:
        # Single pass over the file: the count that is printed is simply the number of collected records.
        for rec in false_pos.fetch():
            call = rec.samples[sample]
            genotype = call['GT']
            if 'AD' in call.keys():
                allelic_depth = call['AD']
            else:
                allelic_depth = 'N/A'
            total_depth = call['DP']
            variants.append({'chrom': rec.contig, 'pos': int(rec.pos), 'ref': rec.alleles[0],
                             'alt': rec.alleles[1], 'QUAL': rec.qual, 'GT': genotype,
                             'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}})
    finally:
        false_pos.close()

    # print(<single argument>) behaves the same under Python 2 and Python 3.
    print(len(variants))
    if variants:
        print('false positives')
    else:
        print('no false positives')
    return variants
def gen_report(vcf, out, c, ref_flag):
    """Write ``<prefix>.subsitutions.vep.prioritized_impact.report.xls`` for an annotated substitution VCF.

    ``<prefix>`` is the part of the VCF base name before the first dot. The positions of the wanted
    annotation fields are read from the ``ANN`` INFO description in the VCF header and handed, with each
    record, to ``output_highest_impact``, which writes the row. When a target file is given, records marked
    ``'OFF'`` by ``mark_target`` are skipped.

    :param vcf: path of the annotated VCF file
    :param out: value passed to ``create_mutect_ind`` to build the index of added mutect info
    :param c: ``'n'``, or a target file passed to ``create_target``
    :param ref_flag: ``'n'``, or a value passed to ``create_index`` whose result is forwarded to
                     ``output_highest_impact``
    :return: 0
    """
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.subsitutions.vep.priority_report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0,
               'Amino_acids': 0, 'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0}

    # A separate name for the report handle keeps the ``out`` argument intact; the context manager closes
    # the report even when a record fails half way through.
    with open(parts[0] + '.subsitutions.vep.prioritized_impact.report.xls', 'w') as report:
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')

        # Map every wanted field name to its position in the '|'-separated annotation.
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i

        report.write('chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                     'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tgnomAD_AF\tgene\ttx_id\teffect\timpact\tbiotype\t'
                     'codon_change\tamino_acid_change\ton/off-target\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)

        for record in vcf_in.fetch():
            (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
            ann_list = [_.split('|') for _ in record.info['ANN']]
            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, report, ref_flag)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def get_variants(filename):
    """
    Function that parse the sample VCF file. This function get snp found
    in the representative genes, and uses the tag 'AD', a list containing
    the number of read mapped for reference and alternative variant.

    Every gene seen in the file gets an entry, including the last one, and
    records of the same gene are merged even when they are not contiguous.

    Args:
        filename [string] = sample filename

    Returns:
        var [dict] = contain snp variation informations of representative
                     genes
                     key : representative gene name
                     value : variant [dict] containing snp position as key,
                             and a list of (nucleotide variant, aligned
                             reads number)
    """
    # open VCF file
    vcf = VariantFile(filename)

    var = {}
    for rec in vcf.fetch():
        # rec.chrom is the representative gene name; create its variant
        # dictionary the first time the gene is seen
        variant = var.get(rec.chrom)
        if variant is None:
            variant = var[rec.chrom] = defaultdict(list)

        # read the snp informations
        for obj in rec.samples.values():
            if 'AD' in obj:
                # AD holds one read count per allele, in allele order
                for i, nb in enumerate(obj['AD']):
                    if nb != 0:
                        variant[rec.pos].append((rec.alleles[i], nb))
    return var
def parse_vcf(filename):
    """
    Function that parse a database VCF file obtained by a variant calling
    using a multiple alignment file. It parses the VCF file and output a
    matrix containing all the variant at each snp position of all the
    clustered genes

    Args :
        filename [string] = VCF filename

    Returns:
        a tuple ``(name, [index, matrix])`` where
        name [string] = representative gene name (0 when the file holds no
                        record)
        index [dict] = a dictionary containing index of snp position in
                       list:
                       key : snp position
                       value : index of the snp in the list of the dict
                               versions
        matrix [dict] = dictionary containing all variations
                        key : clustered gene
                        value : list of the nucleotide variation
    """
    # open VCF file
    vcf = VariantFile(filename)

    index = {}
    matrix = defaultdict(list)
    name = 0

    # i is the index of the snp
    for i, rec in enumerate(vcf.fetch()):
        name = rec.chrom  # representative gene name
        index[rec.pos] = i

        # creation of the matrix of a cluster, gene are the different
        # clustered genes, obj contain information about the snp
        for gene, obj in rec.samples.items():
            snp = obj.allele_indices[0]
            # A missing allele is None in current pysam; -1 is still
            # accepted for backward compatibility.
            if snp is None or snp == -1:  # if deletion
                matrix[gene].append('')
            else:
                matrix[gene].append(rec.alleles[snp])

    return name, [index, matrix]
def variants_missing_vcf(self, vcf_file):
    """Return the catalog rows whose variant could not be found in the VCF.

    For every chromosome of the catalog the matching VCF is opened and queried at each catalog position;
    the records found are turned into ``chrom:pos_ref/alt`` identifiers (one per alternate allele) and
    compared with the catalog column ``self.col_epacts``.

    :param vcf_file: path of a VCF file, or of a JSON file (name containing ``.json``) mapping a
                     chromosome to the VCF file to use for it
    :return: the rows of ``self.data`` whose ``self.col_epacts`` value was not seen in the VCF
    """
    import json

    cat_chroms = set(self.data[self.col_chr].unique())
    cat_variants = set(self.data[self.col_epacts].unique())
    vcf_variants = set()

    use_json = '.json' in vcf_file
    vcf_dict = None

    for cat_chrom in cat_chroms:
        sys.stderr.write("Checking chromosome %s...\n" % str(cat_chrom))

        if use_json:
            # The mapping does not change between chromosomes: read it once.
            if vcf_dict is None:
                with open(vcf_file) as jsin:
                    vcf_dict = json.load(jsin)

            vcf = vcf_dict.get(cat_chrom)
            if vcf is None:
                warning("GWAS catalog has variants on chromosome %s, but could not find this chromosome in your VCF (or JSON) file: %s" % (cat_chrom,vcf_file))
                continue
        else:
            vcf = vcf_file

        vcf_pysam = VariantFile(vcf)
        try:
            # Subset catalog to chromosome
            df_cat_for_chrom = self.data.query("{} == '{}'".format(self.col_chr,cat_chrom))

            # Catalog has repeated rows for variants depending on the number of traits * citations
            # But we just need each variant once
            df_cat_for_chrom = df_cat_for_chrom.drop_duplicates(self.col_epacts)

            # Loop over subsetted catalog, check if variant is in VCF
            for idx, row in df_cat_for_chrom.iterrows():
                chrom, pos = row[self.col_chr], row[self.col_pos]
                # fetch() takes a 0-based half-open interval, so the 1-based position pos is [pos - 1, pos).
                for rec in vcf_pysam.fetch(str(chrom), int(pos) - 1, int(pos)):
                    # A record exposes its alternate alleles as the tuple ``alts`` (None when there are none).
                    for alt in rec.alts or ():
                        vcf_variants.add("{}:{}_{}/{}".format(rec.chrom, rec.pos, rec.ref, alt))
        finally:
            vcf_pysam.close()

    missing_variants = cat_variants.difference(vcf_variants)
    missing_rows = self.data[self.data[self.col_epacts].isin(missing_variants)]

    return missing_rows
#!/group/ctan/anaconda3/envs/snakemake/bin/python
"""Write a tab-separated table of selected chr3H records for four samples.

Usage: <script> <input vcf/bcf> <output file>

A record is written when ``samvcf(record).flt`` is true and ``diff_repeat`` accepts it for the selected
samples; each output row is ``samvcf(record).opt`` followed, per sample, by the sum of the genotype allele
indices and the comma-joined AD values.
"""
import sys
from vcf_ctan import samvcf
from pysam import VariantFile

samples = ["AC","BD","Commander","EC2.1","EC2.2","EC7.1","EC7.2","Fleet","Hindmarsh","La_Trobe","Scope","Vlamingh","W1","WI4304","X1","barke","bowman","haruna_Nijo","igri","spontaneum_B1k-04-12"]
# The four samples that are compared: EC2.1, EC2.2, EC7.1 and EC7.2.
smps = samples[3:7]

ibcf = VariantFile(sys.argv[1])
#obcf = VariantFile(sys.argv[2],'w',header=ibcf.header)

hd = "\t".join(["#chr","pos","len","ref","ref_num","alt","alt_num"])

with open(sys.argv[2], "w") as ofile:
    # The header needs its own line; without the newline the first record was glued to it.
    # NOTE(review): the header names fewer columns than the rows contain - confirm the intended layout.
    ofile.write(hd + "\n")
    for one in ibcf.fetch("chr3H"):
        record = samvcf(one)
        if record.flt and record.diff_repeat(smps):
            per_sample = []
            for smp in smps:
                call = one.samples[smp]
                per_sample.append(str(sum(call['GT'])))
                per_sample.append(",".join(map(str, call['AD'])))
            opt = record.opt + per_sample
            ofile.write("\t".join(opt) + "\n")
def force_calling(bam_path, ivcf_path, output_path, sigs_dir, max_cluster_bias_dict, threshold_gloab_dict,
                  gt_round, threads):
    """Genotype every supported record of an input VCF against previously extracted signatures.

    For each record of type DEL, INS, DUP, INV or TRA the matching signature list is looked up, the
    supporting reads are collected with ``find_in_indel_list`` / ``find_in_list`` and one
    ``call_gt_wrapper`` job is submitted to a process pool. Records of any other type are skipped.

    :param bam_path: alignment file, forwarded to ``call_gt_wrapper``
    :param ivcf_path: VCF file holding the records to genotype
    :param output_path: not used by this function
    :param sigs_dir: directory handed to the ``parse_*sigs`` helpers
    :param max_cluster_bias_dict: per SV type value forwarded to the ``find_in_*`` helpers; also the lower
                                  bound of the bias handed to the genotyper (INS uses 1000 instead)
    :param threshold_gloab_dict: per SV type value forwarded to ``find_in_indel_list`` (INS and DEL only)
    :param gt_round: forwarded to ``call_gt_wrapper``
    :param threads: number of worker processes
    :return: list with the first result of every job that finished without raising
    """
    logging.info('Check the parameter -Ivcf: OK.')
    logging.info('Enable to perform force calling.')

    sv_dict = dict()
    for sv_type in ["DEL", "DUP"]:
        sv_dict[sv_type] = parse_sigs(sv_type, sigs_dir)
    sv_dict['INS'] = parse_inssigs(sigs_dir)
    sv_dict['INV'] = parse_invsigs(sigs_dir)
    sv_dict['TRA'] = parse_trasigs(sigs_dir)

    # First pass: only the number of records is needed (it is forwarded to every job).
    vcf_reader = VariantFile(ivcf_path, 'r')
    row_count = sum(1 for _ in vcf_reader.fetch())
    vcf_reader.close()

    gt_list = list()
    process_pool = Pool(processes=threads)
    vcf_reader = VariantFile(ivcf_path, 'r')
    for idx, record in enumerate(vcf_reader.fetch()):
        sv_type, chrom, sv_chr2, pos, sv_end, sv_strand = parse_record(record)
        if sv_type not in ["DEL", "INS", "DUP", "INV", "TRA"]:
            continue

        # Select the signatures to search: TRA is keyed by both chromosomes, INV by chromosome and strand,
        # every other type by chromosome only.
        search_id_list = []
        if sv_type == 'TRA':
            if chrom in sv_dict['TRA'] and sv_chr2 in sv_dict['TRA'][chrom]:
                search_id_list = sv_dict['TRA'][chrom][sv_chr2]
        elif sv_type == 'INV':
            if chrom in sv_dict['INV']:
                if sv_strand in sv_dict['INV'][chrom]:
                    search_id_list = sv_dict['INV'][chrom][sv_strand]
                else:
                    # The strand of the record is unknown to the signatures: fall back to the first one.
                    for strand_iter in sv_dict['INV'][chrom]:
                        sv_strand = strand_iter
                        search_id_list = sv_dict['INV'][chrom][strand_iter]
                        break
        elif chrom in sv_dict[sv_type]:
            search_id_list = sv_dict[sv_type][chrom]

        max_cluster_bias = 0
        if sv_type == 'INS' or sv_type == 'DEL':
            read_id_list, max_cluster_bias, indel_seq, CIPOS, CILEN = find_in_indel_list(
                sv_type, search_id_list, max_cluster_bias_dict[sv_type], pos, sv_end,
                threshold_gloab_dict[sv_type])
        else:
            read_id_list, max_cluster_bias = find_in_list(
                sv_type, search_id_list, max_cluster_bias_dict[sv_type], pos, sv_end)
            CIPOS = '.,.'
            CILEN = '.,.'

        # An inversion without support on the selected strand is retried on the other strands.
        if sv_type == 'INV' and chrom in sv_dict['INV'] and len(read_id_list) == 0:
            for strand_iter in sv_dict['INV'][chrom]:
                if strand_iter != sv_strand:
                    search_id_list = sv_dict['INV'][chrom][strand_iter]
                    read_id_list, max_cluster_bias = find_in_list(
                        sv_type, search_id_list, max_cluster_bias_dict[sv_type], pos, sv_end)
                    if len(read_id_list) != 0:
                        sv_strand = strand_iter
                        break

        if sv_type == 'INS':
            max_cluster_bias = max(1000, max_cluster_bias)
        else:
            max_cluster_bias = max(max_cluster_bias_dict[sv_type], max_cluster_bias)
        para = Para(record, CIPOS, CILEN)

        # The positional argument list of the genotyper differs per SV type.
        if sv_type in ('INS', 'DEL'):
            call_args = [bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round]
        elif sv_type in ('INV', 'DUP'):
            call_args = [bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias, gt_round]
        else:  # TRA
            call_args = [bam_path, pos, sv_end, chrom, sv_chr2, read_id_list, max_cluster_bias, gt_round]

        # Insertions carry the sequence found in the signatures, every other type a symbolic allele.
        alt_allele = indel_seq if sv_type == 'INS' else '<' + sv_type + '>'
        fx_para = [(call_args, idx, row_count, para, sv_strand, alt_allele, sv_type)]
        gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))

    process_pool.close()
    process_pool.join()
    vcf_reader.close()

    semi_result = list()
    for item in gt_list:
        try:
            semi_result.append(item.get()[0])
        except Exception as err:
            # Best effort: a failed record is left out of the result, but no longer silently.
            logging.warning('Force calling failed for one record, which is skipped: %s', err)
    logging.info('Finished force calling.')
    return semi_result
def Main():
    """Report the variants whose genotype pattern equals the interaction pattern of an interaction.

    For every interaction (one line of the interaction file, after its header line) the variants inside
    the interactor region are fetched from the VCF. A variant is written to the output file when the
    binary genotypes of its samples are identical to the binary interaction calls of the six samples.
    """
    parser = argparse.ArgumentParser(
        description="loading vcf and interaction files")
    parser.add_argument("interactionfile",
                        help="Interaction calls from HiCap method")
    parser.add_argument(
        "vcfile",
        help="Variant calls from either HiCap or sequencing samples")
    parser.add_argument("-o",
                        "--output",
                        help="output of interaction files",
                        action='store',
                        default=None)
    args = parser.parse_args()
    Vcfin = VariantFile(args.vcfile)
    result_title = [
        "RefSeqName", "TranscriptName", "Feature_ID", "Feature_Chr",
        "Feature_Start", "Feature_End", "Annotation", "Strand",
        "Interactor_Chr", "Interactor_Start", "Interactor_End", "Distance",
        "SNPs", "SNP_ID", "Ind_count", "Swed_Freq", "TAV2431", "TAV2515",
        "TAV2709", "BAV2375", "BAV2424", "BAV2714"
    ]
    # Rank of every VCF sample (in file order) used to bring the genotypes
    # into the order of the interaction columns.
    sample_list = [3, 4, 5, 0, 1, 2]

    # The output is opened once and kept open instead of being reopened in
    # append mode for every single row.
    with open(args.output, "w") as output_file, \
            open(args.interactionfile, 'r') as interactions:
        output_file.write("\t".join(result_title) + "\n")
        next(interactions)  # skip the header line
        for line in interactions:
            line = line.strip().split("\t")
            all_fields = tuple(line[:12])
            # Interactor region; the first three characters of the
            # chromosome name are dropped for the VCF query.
            region = ((line[8])[3:], line[9], line[10])
            # Two columns per sample (TAV2431, TAV2515, TAV2709, BAV2375,
            # BAV2424, BAV2714), every third column in between is skipped.
            interaction_sample = [[line[i], line[i + 1]]
                                  for i in range(12, 30, 3)]
            interaction_binary = int2binary(interaction_sample)

            for rec in Vcfin.fetch(region[0], int(region[1]),
                                   int(region[2])):
                genotype_binary = []
                for test in rec.samples.values():
                    genotype = "/".join([str(x) for x in test["GT"]])
                    if genotype == "None/None":
                        continue
                    elif genotype == "0/1" or genotype == "1/1":
                        genotype_binary.append("1")
                    elif genotype == "0/0":
                        genotype_binary.append("0")

                swed_freq = "0"
                for info_key, info_value in rec.info.items():
                    if pattern.match(info_key):
                        swed_freq = info_value

                snp_id = "X" if rec.id is None else rec.id

                sorted_genotype = [
                    x for _, x in sorted(zip(sample_list, genotype_binary))
                ]
                zip_array = list(zip(interaction_binary, sorted_genotype))

                # Only variants agreeing with the interaction in all six
                # samples are reported.
                count = sum(1 for a, b in zip_array if a == b)
                if count != 6:
                    continue

                allele = "|".join(rec.alleles)
                count_int_allele = sum(
                    1 for pair in zip_array if pair == ('1', '1'))
                changed_freq = "".join(str(x) for x in swed_freq)
                unzip_array = ["|".join(x) for x in zip_array]
                # A record with an empty FILTER column has no filter key.
                filter_names = list(rec.filter.keys())
                filter_name = filter_names[0] if filter_names else "."
                snp = (line[8], rec.start, rec.stop, allele, filter_name)
                str_snp = "_".join(str(x) for x in snp)
                result = ("\t".join(all_fields), str_snp, snp_id,
                          count_int_allele, changed_freq,
                          "\t".join(unzip_array))
                combined_result = "\t".join(str(x) for x in result)
                output_file.write(combined_result + "\n")
# Column names for ouptut writer = csv.writer(ofile) writer.writerow([ "chr", "pos", "reference", "call", "methylated", "unmethylated", "strand" ]) # The things in rec.format # GT FT DP MQ GQ QD GL MC8 AMQ CS CG CX # 480 minutes per one bcf--unacceptable!!! # 7 minutes for chrom 22--using 4 threads # 7 minutes for chrom 22--using 8 threads # Iterator #I = infile.fetch('chr1', 100000, 110000) I = infile.fetch('chr22') # Iterate two records at a time if merging... #for rec1, rec2 in zip_longest(*[I]*2): for rec2 in I: #data_1 = rec1.samples.items()[0][1].items() data_2 = rec2.samples.items()[0][1].items() # rec2 should be the base. Is it CpG? Then do the conditional tests # rec2 can be negative strand (and should still be written out) if (data_2[10][1] == 'Y'): m2, um2 = get_methylation_estimate(data_2[7][1], data_2[9][1]) # This is the merge condition. The records need to be one position away from each other # They need to both be CpGs # They need for the first position on #if (rec2.pos - rec1.pos == 1 and data_1[10][1] == 'Y' and data_1[9][1] == "+" and data_2[9][1] == "-"):
def main(self, args):
    """Convert one contig of a VCF file into the SMC++ input format.

    The populations given in ``args.pop1`` / ``args.pop2`` are validated (a sample list may be read from
    a file with the ``@file`` syntax), the distinguished and undistinguished lineages are derived, and
    every biallelic single-base record of ``args.contig`` is written to ``args.out`` as a span followed
    by ``(a, b, nb)`` triples per population. Regions of the optional mask file are written as missing.

    Raises RuntimeError for duplicated samples, unknown distinguished samples and (unless
    ``args.ignore_missing``) samples absent from the VCF; exits with status 1 when populations overlap,
    when the VCF cannot be fetched, or when no contig length is available.
    """
    command.Command.main(self, args)
    self.validate(args)

    # Expand "@file" arguments into the sample ids listed in that file, one per line.
    for i in [1, 2]:
        attr = "pop%d" % i
        pid, ary = getattr(args, attr)
        if len(ary) == 1 and ary[0][0] == "@":
            with open(ary[0][1:], "rt") as sample_file:
                sample_ids = sample_file.read().strip().split("\n")
            setattr(args, attr, SampleList(pid, sample_ids))

    pop_d = dict([args.pop1, args.pop2])
    for pid in pop_d:
        if pop_d[pid]:
            c = Counter(pop_d[pid])
            if max(c.values()) > 1:
                raise RuntimeError(
                    "Population %s has duplicated samples: %s" %
                    (pid, [item for item in c.items() if item[1] > 1]))

    dist = [[], []]
    if not args.d:
        # Default: both haplotypes of the first sample of population 1.
        first_sid = args.pop1.samples[0]
        args.d = [first_sid] * 2
        args.d = [args.d[0] + ":0", args.d[1] + ":1"]
    all_samples = set(args.pop1.samples) | set(args.pop2.samples)
    for sid_i in args.d:
        sid, i = sid_i.split(":")
        i = int(i)
        if sid not in all_samples:
            raise RuntimeError("%s is not in the sample list" % sid)
        if sid in args.pop1.samples:
            d = dist[0]
        else:
            assert sid in args.pop2.samples
            d = dist[1]
        d.append((sid, i))
    # Every (sample, haplotype) pair that is not distinguished.
    undist = [[(k, i) for k in p.samples for i in (0, 1) if (k, i) not in d]
              for p, d in zip((args.pop1, args.pop2), dist)]
    npop = 1

    def print_pop(i):
        logger.info("Population %d:" % i)
        logger.info("Distinguished lineages: " +
                    ", ".join("%s:%d" % t for t in dist[i - 1]))
        logger.info("Undistinguished lineages: " +
                    ", ".join("%s:%d" % t for t in undist[i - 1]))

    print_pop(1)
    if args.pop2.pid is not None:
        npop = 2
        common = set(args.pop1.samples) & set(args.pop2.samples)
        if common:
            logger.error("Populations 1 and 2 should be disjoint, "
                         "but both contain " + ", ".join(common))
            sys.exit(1)
        print_pop(2)

    # Start parsing
    vcf = VariantFile(args.vcf)
    with optional_gzip(args.out, "wt") as out:
        samples = list(vcf.header.samples)
        dist = dist[:npop]
        undist = undist[:npop]
        if not set([dd[0] for d in dist for dd in d]) <= set(samples):
            raise RuntimeError("Distinguished lineages not found in data?")
        missing = [s for u in undist for s, _ in u if s not in samples]
        if missing:
            msg = "The following samples were not found in the data: %s. " % ", ".join(
                missing)
            if args.ignore_missing:
                logger.warning(msg)
            else:
                msg += "If you want to continue without these samples, use --ignore-missing."
                raise RuntimeError(msg)
        undist = [[t for t in u if t[0] not in missing] for u in undist]

        # Write header
        pids = [a.pid for a in (args.pop1, args.pop2)[:npop]]
        out.write("# SMC++ ")
        json.dump({"version": version, "pids": pids,
                   "undist": undist, "dist": dist}, out)
        out.write("\n")
        na = list(map(len, dist))
        nb = list(map(len, undist))

        # function to convert a VCF record to our format:
        # <span, dist gt, # undist gt, # undist, [...]>
        def rec2gt(rec):
            ref = rec.alleles[0]
            da = [[rec.samples[d].alleles[i] for d, i in di] for di in dist]
            a = [sum([x != ref for x in d]) if None not in d else -1
                 for d in da]
            bs = [[rec.samples[d].alleles[i] != ref
                   for d, i in un
                   if rec.samples[d].alleles[i] is not None]
                  for un in undist]
            b = [sum(_) for _ in bs]
            nb = [len(_) for _ in bs]
            # Fold non-polymorphic (in subsample) sites
            if np.array_equal(b, nb) and np.array_equal(a, na):
                a = [0] * len(a)
                b = [0] * len(b)
            return list(sum(zip(a, b, nb), tuple()))

        try:
            region_iterator = vcf.fetch(contig=args.contig)
        except ValueError as e:
            logger.error("VCF reader threw an error: %s", e)
            logger.error("Make sure the VCF is indexed:")
            logger.error("")
            logger.error("    $ tabix %s", args.vcf)
            logger.error("")
            sys.exit(1)

        contig_length = args.length or vcf.header.contigs[args.contig].length
        if contig_length is None:
            logger.error("Failed to acquire contig length from VCF header. "
                         "See the --length option.")
            sys.exit(1)

        if args.mask:
            mask_iterator = TabixFile(
                args.mask).fetch(reference=args.contig)
            # With a mask, gaps between records are never treated as missing.
            args.missing_cutoff = np.inf
        else:
            mask_iterator = iter([])
            if args.missing_cutoff is None:
                args.missing_cutoff = np.inf
        mask_iterator = (x.split("\t") for x in mask_iterator)
        mask_iterator = ((x[0], int(x[1]), int(x[2])) for x in mask_iterator)
        # Keep records with at most two alleles, all of them one character long.
        snps_only = (
            rec for rec in region_iterator
            if len(rec.alleles) <= 2 and all(len(a) == 1 for a in rec.alleles)
        )

        def interleaved():
            # Merge records and mask intervals by position; records falling
            # inside a mask interval are dropped.
            cmask = next(mask_iterator, None)
            csnp = next(snps_only, None)
            while cmask or csnp:
                if cmask is None:
                    yield "snp", csnp
                    csnp = next(snps_only, None)
                elif csnp is None:
                    yield "mask", cmask
                    cmask = next(mask_iterator, None)
                else:
                    if csnp.pos < cmask[1]:
                        yield "snp", csnp
                        csnp = next(snps_only, None)
                    elif csnp.pos <= cmask[2]:
                        while csnp is not None and csnp.pos <= cmask[2]:
                            csnp = next(snps_only, None)
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)
                    else:
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)

        abnb_miss = [-1, 0, 0] * len(nb)
        abnb_nonseg = sum([[0, 0, x] for x in nb], [])
        multiples = set()
        with RepeatingWriter(out) as rw, \
                tqdm.tqdm(total=contig_length, unit='bases', unit_scale=True) as bar:
            def write(x):
                # With drop_first_last the very first entry is swallowed.
                if not write.first or not args.drop_first_last:
                    rw.write(x)
                write.first = False
            write.first = True
            last_pos = 0
            for ty, rec in interleaved():
                if ty == "mask":
                    span = rec[1] - last_pos
                    write([span] + abnb_nonseg)
                    write([rec[2] - rec[1] + 1] + abnb_miss)
                    last_pos = rec[2]
                    continue
                bar.update(rec.pos - last_pos)
                abnb = rec2gt(rec)
                if rec.pos == last_pos:
                    multiples.add(rec.pos)
                    continue
                span = rec.pos - last_pos - 1
                if 1 <= span <= args.missing_cutoff:
                    write([span] + abnb_nonseg)
                elif span > args.missing_cutoff:
                    write([span] + abnb_miss)
                write([1] + abnb)
                last_pos = rec.pos
            if not args.drop_first_last:
                write([contig_length - last_pos] + abnb_nonseg)
        if multiples:
            # FIXME: what to do with multiple records at same site
            logger.warning(
                "Multiple entries found at %d positions; skipped all but the first",
                len(multiples))
from pysam import VariantFile
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob

# For every VCF found below the matching data directories, store the position and the AF info value of
# each record as a pickled DataFrame in saveDir.
baseDir = "/scratch/users/fhol/elife_data/"
saveDir = "/scratch/users/fhol/elife_data/varfilesDENV01/"

dataDir = glob.glob(baseDir + '/10017006*')

for d in dataDir:
    filename = glob.glob(d + "/*.vcf")
    for i in filename:
        varFileName = os.path.basename(i)
        SNVs = VariantFile(i)
        # Collect the rows first and build the frame once: growing a DataFrame row by row is quadratic,
        # and DataFrame.append no longer exists in pandas 2.
        rows = [[rec.pos, rec.info["AF"]] for rec in SNVs.fetch()]
        df = pd.DataFrame(rows, columns=['pos', 'af'])

        # The pickle is written relative to saveDir; the input paths are absolute, so changing the
        # working directory does not affect the remaining iterations.
        os.chdir(saveDir)
        df.to_pickle(os.path.splitext(varFileName)[0] + '_df.pkl')
def gen_report(vcf, c, ref_flag):
    """Write the prioritized impact report of an annotated strelka VCF.

    The call type is ``indel`` when the VCF base name contains ``indel`` and ``snv`` otherwise; the report
    is named ``<prefix>.<call type>.strelka.vep.prioritized_impact.report.xls``, ``<prefix>`` being the
    part of the base name before the first dot. The positions of the wanted annotation fields are read
    from the ``ANN`` INFO description in the VCF header and handed, with each record, to
    ``output_highest_impact``, which writes the row. When a target file is given, records marked
    ``'OFF'`` by ``mark_target`` are skipped.

    :param vcf: path of the annotated VCF file
    :param c: ``'n'``, or a target file passed to ``create_target``
    :param ref_flag: ``'n'``, or a value passed to ``create_index`` whose result is forwarded to
                     ``output_highest_impact``
    :return: 0
    """
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.snv.strelka.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)
    call_type = 'indel' if re.search('indel', fn) else 'snv'
    report_name = parts[0] + '.' + call_type + '.strelka.vep.prioritized_impact.report.xls'
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0,
               'Amino_acids': 0, 'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0,
               'VARIANT_CLASS': 0}

    # The context manager closes the report even when a record fails half way through.
    with open(report_name, 'w') as out:
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')

        # Map every wanted field name to its position in the '|'-separated annotation.
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i

        if call_type == 'snv':
            out.write('chr\tpos\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                      'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\t'
                      'variant_class_effect\teffect\timpact\tbiotype\tcodon_change\tamino_acid_change\ton/off-target\n')
        else:
            out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact\t'
                      'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)

        for record in vcf_in.fetch():
            (chrom, pos, ref, alt) = (record.contig, str(record.pos), record.ref, record.alts[0])
            # dict contains what's different between strelka indel and snv reports
            if call_type == 'snv':
                # The counts are read from the sample field named <base>U, first value.
                normal = record.samples['NORMAL']
                tumor = record.samples['TUMOR']
                not_shared = {'norm_ref_ct': normal[(record.ref + 'U')][0],
                              'norm_alt_ct': normal[(record.alts[0] + 'U')][0],
                              'tum_ref_ct': tumor[(record.ref + 'U')][0],
                              'tum_alt_ct': tumor[(record.alts[0] + 'U')][0]}
            else:
                not_shared = {}
            ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, not_shared, ann_list, desired, tflag, out, ref_flag,
                                  call_type)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def __init__(self, vcf_path):
    """Populate ``self.pairs`` from the records of a VCF file.

    Every record whose contig name is ``chr`` followed by one or more
    digits, ``X`` or ``Y`` characters is passed to ``self.add_entry``;
    all other records are ignored.

    :param vcf_path: path of the VCF file to read
    """
    self.pairs = {}
    contig_pattern = re.compile('^chr[0-9XY]+$')
    reader = VariantFile(vcf_path)
    for record in reader.fetch():
        if contig_pattern.match(record.chrom) is None:
            continue
        self.add_entry(record)
def find_homopolymer_cases(vcf_file, fasta_file):
    """Print contig/start/stop for short variants that look like homopolymer edits.

    For every record whose first alternate allele and reference span are
    both at most 50 bases, the assembly sequence around the record is
    fetched and two short strings are built: the assembly sequence over the
    record (plus one base) and the same window with the alternate allele
    substituted.  A record is printed (tab-separated contig, start, stop)
    when, in both strings, every base from index 2 onwards equals the base
    before it.

    :param vcf_file: path of the VCF holding the small variants
    :param fasta_file: path of the assembly FASTA the variants refer to
    """
    small_variant_vcf = VariantFile(vcf_file)
    assembly_fasta_file = FastaFile(fasta_file)
    # NOTE(review): homopolymer_changes is never updated or read after this.
    homopolymer_changes = 0
    total_records = 0
    for rec in small_variant_vcf.fetch():
        alternate_allele = rec.alleles[1]
        # Skip long alternate alleles and long reference spans.
        if len(alternate_allele) > 50:
            continue
        rec_len = rec.stop - rec.start
        if rec_len > 50:
            continue
        total_records += 1
        # 200 bases of context on each side of the record.
        # NOTE(review): rec.start - 200 can be negative near a contig start.
        reference_start = rec.start - 200
        reference_end = rec.stop + 200
        reference_sequence = assembly_fasta_file.fetch(reference=rec.contig, start=rec.start - 200, end=rec.stop + 200)
        # homopolymer_positions[i] is the reference coordinate at which the
        # run of identical bases containing window offset i begins.
        homopolymer_positions = [0] * len(reference_sequence)
        for i in range(0, len(reference_sequence)):
            if i == 0:
                homopolymer_positions[i] = reference_start
            elif reference_sequence[i] == reference_sequence[i - 1]:
                homopolymer_positions[i] = homopolymer_positions[i - 1]
            else:
                homopolymer_positions[i] = i + reference_start
        homopolymer_start = 0
        homopolymer_end = 0
        for i in range(0, len(reference_sequence)):
            # Run start for the base sitting at rec.start.
            if i + reference_start == rec.start:
                homopolymer_start = homopolymer_positions[i]
            # First position past rec.stop that belongs to a different run.
            if i + reference_start > rec.stop and homopolymer_positions[i] != homopolymer_start:
                homopolymer_end = max(homopolymer_positions[i], rec.stop + 1)
                break
        sequence_in_assembly = assembly_fasta_file.fetch(reference=rec.contig, start=rec.start, end=rec.stop + 1)
        # NOTE(review): polished_homopolymer is computed but not used below.
        polished_homopolymer = assembly_fasta_file.fetch(
            reference=rec.contig, start=homopolymer_start,
            end=rec.start) + alternate_allele + assembly_fasta_file.fetch(
                reference=rec.contig, start=rec.stop, end=homopolymer_end)
        # The first fetch uses start == end, so it presumably contributes an
        # empty string; the result is the alternate allele plus one trailing base.
        sequence_in_polished = assembly_fasta_file.fetch(
            reference=rec.contig, start=rec.start,
            end=rec.start) + alternate_allele + assembly_fasta_file.fetch(
                reference=rec.contig, start=rec.stop, end=rec.stop + 1)
        # Walk right from the run start while the base stays the same.
        # NOTE(review): homopolymer_record_end is not used afterwards, and the
        # index is not bounded by the window length.
        homopolymer_record_end = homopolymer_start
        while reference_sequence[homopolymer_record_end - reference_start] == reference_sequence[
                homopolymer_start - reference_start]:
            homopolymer_record_end += 1
        # Both strings must consist of a single repeated base from index 1
        # onwards (index 0 is not compared).
        true_homopolymer = True
        if len(sequence_in_assembly) > 1:
            start_index = 2
            while start_index < len(sequence_in_assembly):
                if sequence_in_assembly[start_index] != sequence_in_assembly[start_index - 1]:
                    true_homopolymer = False
                    break
                start_index += 1
        if len(sequence_in_polished) > 1:
            start_index = 2
            while start_index < len(sequence_in_polished):
                if sequence_in_polished[start_index] != sequence_in_polished[start_index - 1]:
                    true_homopolymer = False
                    break
                start_index += 1
        if not true_homopolymer:
            pass
        else:
            print(rec.contig + "\t" + str(rec.start) + "\t" + str(rec.stop))
def gen_report(vcf, out, c, ref_flag, cache):
    """Write a prioritized-impact report for a VEP-annotated substitutions VCF.

    The report is a tab-separated text file named
    ``<sample>.subsitutions.vep<cache>.prioritized_impact.report.xls`` where
    ``<sample>`` is the first dot-delimited token of the VCF basename.

    :param vcf: path to the annotated VCF file
    :param out: value handed to ``create_mutect_ind`` to build the index of
        added mutect information
    :param c: target file handed to ``create_target``, or ``'n'`` to skip
        on/off-target marking
    :param ref_flag: handed to ``create_index`` unless it is ``'n'``; the
        resulting value (or the untouched ``'n'``) is forwarded to
        ``output_highest_impact``
    :param cache: string embedded in the log and report file names
    :return: 0 on completion
    """
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    sample = parts[0]
    loc = 'LOGS/' + sample + '.subsitutions.vep' + cache + '.priority_report.log'
    suffix = '.subsitutions.vep' + cache + '.prioritized_impact.report.xls'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')

    out_fn = sample + suffix
    # Maps each wanted VEP field name to its position inside an ANN entry.
    # Fields missing from the header keep the initial value 0.
    desired = {
        'Consequence': 0,
        'IMPACT': 0,
        'SYMBOL': 0,
        'Feature': 0,
        'HGVSg': 0,
        'Protein_position': 0,
        'Amino_acids': 0,
        'Codons': 0,
        'Existing_variation': 0,
        'gnomAD_AF': 0,
        'BIOTYPE': 0
    }

    # Context managers guarantee both handles are released even when a
    # record or a helper raises part-way through the report.
    with VariantFile(vcf) as vcf_in, open(out_fn, 'w') as report:
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i

        report.write(
            'chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
            'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tExAC_MAF\tgene\tHGVSg\ttx_id\teffect\timpact\t'
            'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)

        for record in vcf_in.fetch():
            (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
            ann_list = [_.split('|') for _ in record.info['ANN']]
            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, report, ref_flag)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
from pysam import VariantFile import sys bcfin = VariantFile(sys.argv[1]) var_calls = [] min_score = 10000 max_score = -1 for i in range(1, 23): for rec in bcfin.fetch('chr%d' % i): if rec.ref in rec.alts: continue var_calls.append(rec) header = '''##fileformat=VCFv4.1 ##FILTER=<ID=PASS,Description="All filters passed"> ##contig=<ID=chr1,length=248956422> ##contig=<ID=chr2,length=242193529> ##contig=<ID=chr3,length=198295559> ##contig=<ID=chr4,length=190214555> ##contig=<ID=chr5,length=181538259> ##contig=<ID=chr6,length=170805979> ##contig=<ID=chr7,length=159345973> ##contig=<ID=chr8,length=145138636> ##contig=<ID=chr9,length=138394717> ##contig=<ID=chr10,length=133797422> ##contig=<ID=chr11,length=135086622> ##contig=<ID=chr12,length=133275309> ##contig=<ID=chr13,length=114364328> ##contig=<ID=chr14,length=107043718> ##contig=<ID=chr15,length=101991189> ##contig=<ID=chr16,length=90338345>
class VcfAltParser():
    '''
    Class to iterate over a vcf in batches, returns strings of DNA-sequences.

    Loosely inspired by janggu.dna.VariantStreamer

    :ivar pysam.VariantFile vcf: VariantFile, the variant calls
    :ivar pysam.FastaFile ref: FastaFile, the reference sequence
    :ivar str idx_path: Path to the exported (compatible) variants in bed-format
    :ivar int bin_size: size of the DNA-sequences
    :ivar int n_variants: number of exported (compatible) variants
    '''

    def __init__(self,
                 ref_fa_path=None,
                 vcf_path=None,
                 idx_path=None,
                 batch_size=32,
                 bin_size=100,
                 tie='r'):
        '''
        :param str ref_fa_path: Path to indexed reference fasta
        :param str vcf_path: Path to indexed vcf
        :param str idx_path: Path to bed-file which will contain the names and locations of compatible variants
        :param int batch_size: Batch size
        :param int bin_size: Length of the DNA-sequences (centered on the start position of the variant)
        :param str tie: 'l' or 'r'; for an even bin_size, 'l' shifts the fetched window by one base (offset 1), 'r' does not
        '''
        self.vcf = VariantFile(vcf_path)
        self.ref = FastaFile(ref_fa_path)
        assert os.path.isfile(
            ref_fa_path + '.fai'), 'Error: no index found for Fasta-file: {}'.format(
                ref_fa_path)
        self.idx_path = idx_path
        self.batch_size = batch_size
        self.bin_size = bin_size
        assert tie in ['l', 'r']
        self.tie = tie
        if not bin_size % 2:
            self.offset = 0 if tie == 'r' else 1
        else:
            self.offset = 0
        self.n_variants = self._initialize_index()
        self._verify_refmatch()

    def get_flanking_centered(self, variant):
        '''
        get flanking sequence, variant will be centered
        '''
        # centers the alt variant (note: ref centering not implemented)
        # flank
        lf, rf = ceil(self.bin_size / 2), floor(self.bin_size / 2)
        lenref, lenalt = len(variant.ref), len(variant.alts[0])
        d = lenalt - lenref  # len diff
        ld, rd = ceil(d / 2), floor(d / 2)
        pos = variant.pos - 1  # 0-based
        left_seq = self.ref.fetch(variant.chrom,
                                  pos - lf + ld + self.offset, pos)
        right_seq = self.ref.fetch(variant.chrom, pos + lenref,
                                   pos + rf - rd + self.offset)
        return left_seq, right_seq

    def get_flanking_right(self, variant):
        '''
        get flanking sequence, variant will be aligned to the right of the center
        '''
        # aligns the variant to the right of the center
        # flank
        if self.bin_size % 2:
            rf = floor(self.bin_size / 2)
            lf = rf
        else:
            lf, rf = ceil(self.bin_size / 2), floor(self.bin_size / 2)
        lenref, lenalt = len(variant.ref), len(variant.alts[0])
        d = lenalt - lenref  # len diff
        pos = variant.pos - 1  # 0-based
        left_seq = self.ref.fetch(variant.chrom, pos - lf + self.offset, pos)
        right_seq = self.ref.fetch(variant.chrom, pos + lenref,
                                   pos + rf - d + self.offset)
        return left_seq, right_seq

    def get_alt(self, variant):
        '''
        get alternative sequence for a variant
        '''
        l, r = self.get_flanking_right(variant)
        return (l + variant.alts[0] + r).upper()

    def get_ref(self, variant):
        '''
        get reference sequence for a variant
        '''
        if self.bin_size % 2:
            rf = floor(self.bin_size / 2)
            lf = rf
        else:
            lf, rf = ceil(self.bin_size / 2), floor(self.bin_size / 2)
        return self.ref.fetch(variant.chrom,
                              variant.pos - lf - 1 + self.offset,
                              variant.pos + rf - 1 + self.offset).upper()

    def is_compatible(self, variant):
        '''
        simple test for compatibility

        A variant is compatible when it has exactly one alternative allele
        and that allele is shorter than half the bin size. Records without
        any alternative allele are reported as incompatible.
        '''
        alts = variant.alts
        if not alts:
            # no ALT allele on this record: nothing to build a sequence from
            return False
        if len(alts[0]) >= (self.bin_size / 2):
            return False
        if len(alts) > 1:
            return False
        return True

    def _verify_refmatch(self):
        '''
        compare the REF allele of (at most) the first 50000 variants with the
        reference fasta and print a warning listing up to 10 mismatches
        '''
        max_checked = 50000
        err = 0
        proc = 0
        mismatches = []  # (id, vcf ref, fasta ref) for the first 10 mismatches
        for variant in self.vcf.fetch():
            proc += 1
            true_ref = self.ref.fetch(variant.chrom, variant.pos - 1,
                                      variant.pos - 1 + len(variant.ref))
            if variant.ref != true_ref:
                err += 1
                if len(mismatches) < 10:
                    mismatches.append((variant.id, variant.ref, true_ref))
            if proc == max_checked:
                break
        if err:
            print(
                'Warning: {} mismatches with reference based on the first {} variants.'
                .format(err, min(proc, 50000)))
            for varid, ref, true_ref in mismatches:
                print('variant: {}, vcf ref : {}, actual ref: {}'.format(
                    varid, ref, true_ref))
            if err > 10:
                print('...')

    def _initialize_index(self):
        '''
        create a bed-file containing the variant locations and ids

        :return: number of lines written to the bed-file (0 when no variant is compatible)
        '''
        bedtool = BedTool(
            (Interval(record.chrom,
                      record.pos - 1,
                      record.pos - 1 + len(record.ref),
                      name='{}_{}>{}'.format(record.id, record.ref,
                                             record.alts[0]))
             for record in self.vcf.fetch() if self.is_compatible(record)))
        bedtool.saveas(self.idx_path)
        # start=1 makes n_lines the line count and leaves it at 0 for an
        # empty file instead of raising UnboundLocalError
        n_lines = 0
        with open(self.idx_path, 'r') as infile:
            for n_lines, _ in enumerate(infile, start=1):
                pass
        # compression is best-effort: keep the plain file when gzip is
        # missing or fails, and only rename the path when it succeeded
        try:
            import subprocess
            if subprocess.call(['gzip', '-f', self.idx_path]) == 0:
                self.idx_path += '.gz'
        except OSError:
            pass
        return n_lines

    def batch_generator(self):
        '''
        returns a generator that iterates over pairs of reference and alternative sequences

        Yields ``(ids, (ref_sequences, alt_sequences))`` as numpy arrays.
        Incompatible variants are skipped. The last batch holds whatever was
        collected when the variants ran out, so it may be shorter than
        batch_size or empty.
        '''
        variants = self.vcf.fetch()
        ibatch = 0
        try:
            while True:
                br = []
                ba = []
                ids = []
                while ibatch < self.batch_size:
                    variant = next(variants)
                    if not self.is_compatible(variant):
                        continue
                    ids.append(variant.id)
                    br.append(self.get_ref(variant))
                    ba.append(self.get_alt(variant))
                    ibatch += 1
                yield np.array(ids), (np.array(br), np.array(ba))
                ibatch = 0
        except StopIteration:
            yield np.array(ids), (np.array(br), np.array(ba))
pos = int(cols[pos_ind]) ref = cols[ref_ind] alt = cols[alt_ind] marker_id = cols[marker_id_ind] rsid = "." infos_out = [] allele = "." gene = "." annotation = "." hgvs_c = "." hgvs_p = "." ##### # Get annotation ##### for rec in vcf_handle.fetch(chrom, pos - 1, pos + 1): #### # Match by position #### if rec.chrom == chrom and rec.pos == pos: if "ANN" in rec.info: ann_field = rec.info["ANN"] #anns = ann_field.split(",") ann = ann_field[0] ann_cols = ann.split("|") allele = ann_cols[0] gene = ann_cols[3] annotation = ann_cols[1] hgvs_c = ann_cols[9] hgvs_p = ann_cols[10]
dest='call_vcf', help='Called vcf to search for variants not found in reference vcf') parser.add_argument( '-o', '--out-vcf', action='store', dest='out_vcf', help='Output vcf that is a subset of called vcf meeting criteria') args = parser.parse_args() ref_vcf = VariantFile(args.ref_vcf) called_vcf = VariantFile(args.call_vcf, threads=4) out_vcf = VariantFile(args.out_vcf, "w", header=called_vcf.header, threads=4) x = 0 m = 1000 for record in called_vcf.fetch(): if x % m == 0: sys.stderr.write('Processed ' + str(x) + " records\n") sys.stderr.flush() f = 0 for comp in ref_vcf.fetch(record.contig, record.start, record.stop): if record.pos == comp.pos and record.alleles == comp.alleles: f = 1 break if not f: out_vcf.write(record) x += 1 out_vcf.close() ref_vcf.close() called_vcf.close()
#!/bin/python3.6 import sys from pysam import VariantFile import subprocess vcf_in = VariantFile(sys.argv[1]) new_header = new_header = vcf_in.header vcf_out = VariantFile(sys.argv[2], 'w', header=new_header) sv_out = sys.argv[2] + '.svtypeDEL.txt' indelArteFile = sys.argv[3] for record in vcf_in.fetch(): # import pdb; pdb.set_trace() try: if record.info["SVTYPE"] == 'DEL': with open(sv_out, 'a+') as svtype_out: svtype_out.write(str(record)) except KeyError: if len(record.ref) != len(record.alts[0]): # if InDel if ( "mutect2" in record.info["CALLERS"] or "vardict" in record.info["CALLERS"] ): # Support by either Vardict or Manta, ok. # Check if indel artefact # import pdb; pdb.set_trace() write = 1 cmdIndelArte = 'grep -w ' + str(record.pos) + ' ' + indelArteFile artefactLines = ( subprocess.run(cmdIndelArte, stdout=subprocess.PIPE, shell='TRUE').stdout.decode('utf-8').strip() ) for artefactLine in artefactLines.split("\n"): if (
def run_process(opts, inputvcf):
    """Annotate indels with the length of the homopolymer that follows them.

    Every record of ``inputvcf`` is copied to the output VCF.  For each
    alternate allele that ``ngb_functions.pairdiff`` classifies as an
    insertion or deletion made of a single distinct base, the reference is
    read from ``pos`` to ``pos + maxpolypadding`` and the number of leading
    bases equal to that base is counted.  Counts of at least ``minpolybp``
    are written, comma-joined, to the ``HOMOPOLYX`` INFO field.

    :param opts: options object; ``reference``, ``output``,
        ``maxpolypadding`` and ``minpolybp`` are read from it.  A falsy
        ``output`` writes the VCF to standard output.
    :param inputvcf: path of the VCF to read
    """
    reference = opts.reference
    outputvcf = opts.output
    infoname = "HOMOPOLYX"
    maxbp = opts.maxpolypadding
    minbp = opts.minpolybp

    # STDERR
    sys.stderr.write("Maximum basepair of reference region around variant : " + str(maxbp) + "\n")
    sys.stderr.write("Minumum basepair of homopolymer detection : " + str(minbp) + "\n")

    # Converted once instead of on every indel.
    min_run = int(minbp)

    # Found count init
    homopolymer_cnt = 0

    # Context managers close (and thereby flush) every handle, including
    # the output VCF, even if a record raises.
    with FastaFile(reference) as genome, VariantFile(inputvcf) as vcf_in:
        # Add INFO to Header
        if not ngb_functions.vcfHeaderCheck(vcf_in.header.info, infoname):
            vcf_in.header.info.add(infoname, ".", "String", "Homepolymer Basepair Count")

        with VariantFile(outputvcf if outputvcf else '-', 'w', header=vcf_in.header) as vcf_out:
            for record in vcf_in.fetch():
                chrom = record.chrom
                pos = record.pos
                ref = record.ref

                info_value_list = []
                # alts is None for records without an ALT allele.
                for alt in record.alts or ():
                    ret = ngb_functions.pairdiff(ref, alt)
                    is_indel = ret['variant_type'] == 'ins' or ret['variant_type'] == 'del'
                    if not (is_indel and ret['diff_basepair_composition_count'] == 1):
                        continue
                    diffbasepair = ret['diff_basepair_composition'][0]
                    around_sequence = genome.fetch(chrom, pos, pos + maxbp).upper()
                    # Length of the leading run of the inserted/deleted base.
                    match_cnt = 0
                    for seq in around_sequence:
                        if diffbasepair != seq:
                            break
                        match_cnt += 1
                    if match_cnt >= min_run:
                        info_value_list.append(match_cnt)

                if info_value_list:
                    record.info[infoname] = ','.join(str(e) for e in info_value_list)
                    homopolymer_cnt += 1

                vcf_out.write(record)

    sys.stderr.write("Found homopolymer(s) : " + str(homopolymer_cnt) + "\n")
def find_dimer_repeats(bam_file, vcf_file, fasta_file):
    """Print dimer-repeat lengths seen in the assembly and in overlapping reads.

    For each short variant, the assembly sequence starting at the record is
    scanned for a two-base repeat; when one is accepted, the reads covering
    the site are fetched and the repeat length observed in each read is
    collected.  One tab-separated line is printed per record with at least
    one read measurement: contig, record start, record start + repeat end
    offset, repeat length in the assembly, and the comma-joined read lengths.

    :param bam_file: path of the BAM file holding the read alignments
    :param vcf_file: path of the VCF holding the small variants
    :param fasta_file: path of the assembly FASTA
    """
    assembly_fasta_file = FastaFile(fasta_file)
    small_variant_vcf = VariantFile(vcf_file)
    samfile = pysam.AlignmentFile(bam_file, "rb")
    for rec in small_variant_vcf.fetch():
        alternate_allele = rec.alleles[1]
        # Skip long alternate alleles and reference spans above 20 bases.
        if len(alternate_allele) > 50:
            continue
        rec_len = rec.stop - rec.start
        if rec_len > 20:
            continue
        # NOTE(review): reference_start and reference_end are not used below.
        reference_start = rec.start - 200
        reference_end = rec.stop + 200
        # Window begins at the record start and extends 200 bases past its stop.
        reference_sequence = assembly_fasta_file.fetch(reference=rec.contig, start=rec.start, end=rec.stop + 200)
        in_dimer = False
        end_index = 1
        dimer_base = '**'
        reference_dimer_length = 0
        for i in range(len(reference_sequence) - 1):
            # A candidate dimer starts wherever two neighbouring bases differ.
            if reference_sequence[i] != reference_sequence[i + 1]:
                dimer_base = reference_sequence[i] + reference_sequence[i + 1]
                # extend_dimers is defined elsewhere; presumably it returns the
                # offset at which the repeat of dimer_base ends - TODO confirm.
                end_index = extend_dimers(reference_sequence, dimer_base, i)
                reference_dimer_length = int((end_index - i) / 2)
                # Only a repeat starting at window offset 1 with more than one
                # unit is accepted.
                if i == 1 and reference_dimer_length > 1:
                    in_dimer = True
                    break
        if not in_dimer:
            continue
        all_reads = samfile.fetch(rec.contig, rec.start - 10, rec.start + end_index)
        read_dimers = []
        for read in all_reads:
            aligned_pairs = read.get_aligned_pairs()
            read_start_index = -1
            # Find the read offset aligned to the base before the record start,
            # then step two bases further into the read.
            for index, position in aligned_pairs:
                if index is None:
                    continue
                if position == rec.start - 1:
                    read_start_index = index + 2
                    break
            if read_start_index < 0:
                continue
            if read.query_sequence is None:
                continue
            read_end_index = read_start_index + end_index
            # Skip reads that end before the expected repeat window does.
            if read_start_index >= len(read.query_sequence) or read_end_index >= len(read.query_sequence):
                continue
            read_sequence = read.query_sequence[read_start_index:]
            read_dimer_base = read_sequence[0:2]
            # The read must begin with the same two-base unit as the assembly.
            if read_dimer_base != dimer_base:
                continue
            read_end_index_late = extend_dimers(read_sequence, dimer_base, 0)
            read_dimer_length = int((read_end_index_late - 0) / 2)
            read_dimers.append(read_dimer_length)
        if len(read_dimers) == 0:
            continue
        print(
            str(rec.contig) + "\t" + str(rec.start) + "\t" +
            str(rec.start + end_index) + "\t" + str(reference_dimer_length) +
            "\t" + str(','.join([str(x) for x in read_dimers])))
class AnnotateHelper:
    """Annotate CNVs against a set of region databases and score them.

    Each database named in ``settings`` is opened once in the constructor.
    ``annotate`` collects the overlaps for one CNV, derives the evidence
    rules for a deletion (``del``) or duplication (``dup``), and turns them
    into a score and a pathogenicity level.
    """

    def __init__(self):
        self._gene_database = DataBase(settings.GENE_DATABASE)
        self._omim_gene_database = DataBase(settings.OMIM_GENE_DATABASE)
        self._func_region_database = DataBase(settings.FUNC_REGION_DATABASE)
        self._hi_gene_database = DataBase(settings.HI_GENE_DATABASE)
        self._hi_exon_database = DataBase(settings.HI_EXON_DATABASE)
        self._hi_cds_database = DataBase(settings.HI_CDS_DATABASE)
        self._clinvar_pathogenic_database = VariantFile(
            settings.CLINVAR_PATHOGENIC_DATABASE)
        self._uhi_gene_database = DataBase(settings.UHI_GENE_DATABASE)
        self._hi_region_database = DataBase(settings.HI_REGION_DATABASE)
        self._uhi_region_database = DataBase(settings.UHI_REGION_DATABASE)
        self._decipher_gene_database = DataBase(
            settings.DECIPHER_GENE_DATABASE)
        self._ts_gene_database = DataBase(settings.TS_GENE_DATABASE)
        self._ts_region_database = DataBase(settings.TS_REGION_DATABASE)
        self._uts_gene_database = DataBase(settings.UTS_GENE_DATABASE)
        self._uts_region_database = DataBase(settings.UTS_REGION_DATABASE)
        self._dgv_gain_database = DataBase(settings.DGV_GAIN_DATABASE)
        self._dgv_loss_database = DataBase(settings.DGV_LOSS_DATABASE)
        self._gnomad_del_database = DataBase(settings.GNOMAD_DEL_DATABASE)
        self._gnomad_dup_database = DataBase(settings.GNOMAD_DUP_DATABASE)
        self._cnv_syndrome_del_database = DataBase(
            settings.CNV_SYNDROME_DEL_DATABASE)
        self._cnv_syndrome_dup_database = DataBase(
            settings.CNV_SYNDROME_DUP_DATABASE)

    @staticmethod
    def _norm_chrom(ch):
        """
        normalize chromosome name, eg. 2 -> chr2, 23 -> chrX
        :param ch: input chromosome name
        :return: normalized name
        >>> AnnotateHelper._norm_chrom(2)
        'chr2'
        >>> AnnotateHelper._norm_chrom('chr23')
        'chrX'
        """
        ch = str(ch).replace('chr', '')
        if ch == '23':
            return 'chrX'
        if ch == '24':
            return 'chrY'
        return f'chr{ch}'

    @staticmethod
    def _annotate_loss(**annotation):
        """
        Compute the evidence rules for a copy-number-loss CNV
        :param annotation: the annotated CNV
        :return: the CNV annotation with a ``rules`` entry added
        """
        loss = dict()

        # Section 1
        if len(annotation['outer_overlap_genes']) + len(
                annotation['overlap_func_regions']) > 0:
            loss['1A'] = True
        else:
            loss['1B'] = True

        # Section 2
        # HI regions
        for region, overlap, coverage in annotation['overlap_hi_regions']:
            if coverage == 1:  # region is completely covered
                loss['2A'] = True
            elif len(
                    set(gene.symbol
                        for gene, *_ in annotation['overlap_hi_genes'])) == 0:
                # no HI gene is covered
                loss['2B'] = True

        # HI genes
        for gene, overlap, coverage in annotation['overlap_hi_genes']:
            if coverage == 1:  # gene is completely covered
                loss['2A'] = True
            elif overlap < 1:  # CNV is not entirely inside the gene
                if any(exon.last_exon == 'True' for exon, *_ in
                       annotation['overlap_hi_exons'][gene.gene_id]):
                    # the last exon is covered
                    if len(annotation['overlap_hi_exons'][gene.gene_id]) >= 2:
                        # at least two exons are covered
                        loss['2D-4'] = True
                    elif gene.gene_id in annotation['overlap_hi_cds'] \
                            and len(annotation['overlap_hi_cds'][gene.gene_id]) > 0:
                        # CDS is covered
                        if len(annotation['variants']) > 0:
                            # pathogenic variants reported in the last exon
                            loss['2D-2'] = True
                        else:
                            # no pathogenic variants in the last exon
                            loss['2D-3'] = True
                    else:
                        # CDS is not covered
                        loss['2D-1'] = True
                # the last exon is not covered
                elif gene.gene_id in annotation['overlap_hi_cds'] \
                        and len(annotation['overlap_hi_cds'][gene.gene_id]) > 0:
                    # 5' CDS is covered
                    loss['2C-1'] = True
                else:
                    # 5' CDS is not covered
                    loss['2C-2'] = True
            # CNV lies inside the gene
            else:
                cnv = CNVRecord(annotation['chromosome'],
                                annotation['inner_start'],
                                annotation['inner_end'], annotation['func'])
                tx = get_transcript(gene.transcript, transcripts)
                pvs1 = PVS1CNV(cnv, None, tx)
                loss['2E'] = True
                #loss[PVS1[pvs1.verify_DEL()[0]]] = True
                loss['pvs1'] = PVS1[pvs1.verify_DEL()[0]]

        # contains predicted HI genes
        if len(annotation['overlap_hi_genes']) + len(annotation['overlap_hi_regions']) == 0 \
                and len(annotation['overlap_decipher_genes']) > 0:
            loss['2H'] = True

        # falls inside a UHI gene
        for gene, overlap, coverage in annotation['overlap_uhi_genes']:
            if overlap == 1:
                loss['2F'] = True

        # falls inside a UHI region
        genes = set(gene.symbol
                    for gene, *_ in annotation['outer_overlap_genes'])
        for region, overlap, coverage in annotation['overlap_uhi_regions']:
            if len(genes - set(region.genes.split(','))) > 0:
                loss['2G'] = True
            else:
                loss['2F'] = True

        # Section 3
        # number of covered genes
        gene_count = len(annotation['outer_overlap_genes'])
        if gene_count >= 35:
            loss['3C'] = True
        elif gene_count >= 25:
            loss['3B'] = True
        elif gene_count >= 0:
            loss['3A'] = True

        # Section 4
        # DGV gold standard and gnomAD
        genes = set(gene.symbol
                    for gene, *_ in annotation['outer_overlap_genes'])
        l, m = 0, 0
        for record, overlap, coverage in chain(
                annotation['dgv_loss_records'],
                annotation['gnomad_del_records']):
            if overlap == 1 and any(
                    float(v) >= 0.01
                    for f, v in record._asdict().items()
                    if f.startswith('af')):
                # fully covers the CNV under interpretation, frequency >= 1%
                loss['4O'] = True
                break
            elif overlap >= 0.5 and len(genes -
                                        set(record.genes.split(','))) == 0:
                # overlaps at least 50% of the CNV and covers all of its genes
                if any(
                        float(v) < 0.01
                        for f, v in record._asdict().items()
                        if f.startswith('af')):
                    # frequency below 1%
                    m += 1
                else:
                    # frequency of at least 1%
                    l += 1
        else:
            # loop finished without a full-cover hit
            if l > 0 and m == 0:
                # common records exist and no rare record does
                loss['4O'] = True

        annotation['rules'] = loss
        return annotation

    @staticmethod
    def _annotate_gain(**annotation):
        """
        Compute the evidence rules for a copy-number-gain CNV
        :param annotation: the annotated CNV
        :return: the CNV annotation with a ``rules`` entry added
        """
        gain = dict()

        # Section 1
        if len(annotation['outer_overlap_genes']) + len(
                annotation['overlap_func_regions']) > 0:
            gain['1A'] = True
        else:
            gain['1B'] = True

        # Section 2
        # TS regions
        for region, overlap, coverage in annotation['overlap_ts_regions']:
            if coverage == 1:  # the whole region is covered
                gain['2A'] = True
            elif len(
                    set(gene.symbol
                        for gene, *_ in annotation['overlap_ts_genes'])) == 0:
                # no TS gene is covered
                gain['2B'] = True
        for gene, overlap, coverage in annotation['overlap_ts_genes']:
            # the whole gene is covered
            if coverage == 1:
                gain['2A'] = True

        # falls inside a UTS gene
        for gene, overlap, coverage in annotation['overlap_uts_genes']:
            if overlap == 1:
                gain['2D'] = True

        # falls inside a UTS region
        for region, overlap, coverage in annotation['overlap_uts_regions']:
            genes = set(gene.symbol
                        for gene, *_ in annotation['inner_overlap_genes'])
            region_genes = set(region.genes.split(','))
            if overlap == coverage == 1:  # identical to the benign region
                gain['2C'] = True
            elif len(genes - region_genes) > 0:
                # more protein-coding genes than the benign region
                gain['2G'] = True
            # disrupts a protein-coding gene
            elif any(c < 1 for *_, c in annotation['inner_overlap_genes']):
                gain['2E'] = True
            elif overlap == 1:  # fully covered by the benign region
                gain['2D'] = True
            else:
                gain['2F'] = True

        # HI genes
        hi_genes = set()
        for gene, overlap, coverage in annotation['overlap_hi_genes']:
            hi_genes.add(gene.symbol)
            if coverage == 1:  # gene is completely covered
                gain['2H'] = True
            elif overlap == 1:  # both breakpoints lie inside the gene
                cnv = CNVRecord(annotation['chromosome'],
                                annotation['inner_start'],
                                annotation['inner_end'], annotation['func'])
                tx = get_transcript(gene.transcript, transcripts)
                pvs1 = PVS1CNV(cnv, None, tx)
                gain['2I'] = True
                # gain[PVS1[pvs1.verify_DUP()[0]]] = True
                gain['pvs1'] = PVS1[pvs1.verify_DUP()[0]]

        # non-HI genes
        for gene, overlap, coverage in annotation['inner_overlap_genes']:
            if gene.symbol not in hi_genes and coverage != 1:
                gain['2L'] = True
                annotation['break_point_genes'].append(gene.symbol)

        # Section 3
        # number of covered genes
        gene_count = len(annotation['inner_overlap_genes'])
        if gene_count >= 50:
            gain['3C'] = True
        elif gene_count >= 35:
            gain['3B'] = True
        elif gene_count >= 0:
            gain['3A'] = True

        # Section 4
        # DGV gold standard and gnomAD
        genes = set(gene.symbol
                    for gene, *_ in annotation['outer_overlap_genes'])
        l, m = 0, 0
        for record, overlap, coverage in chain(
                annotation['dgv_gain_records'],
                annotation['gnomad_dup_records']):
            if overlap == 1 and any(
                    float(v) >= 0.01
                    for f, v in record._asdict().items()
                    if f.startswith('af')):
                # fully covers the CNV under interpretation, frequency >= 1%
                gain['4O'] = True
                break
            elif overlap >= 0.5 and len(genes -
                                        set(record.genes.split(','))) == 0:
                # overlaps at least 50% of the CNV and covers all of its genes
                if any(
                        float(v) < 0.01
                        for f, v in record._asdict().items()
                        if f.startswith('af')):
                    # frequency below 1%
                    m += 1
                else:
                    # frequency of at least 1%
                    l += 1
        else:
            # loop finished without a full-cover hit
            if l > 0 and m == 0:
                # common records exist and no rare record does
                gain['4O'] = True

        annotation['rules'] = gain
        return annotation

    @staticmethod
    def merge_score(func, **rules):
        """
        Merge the scores of all evidence rules
        :param func: variant type
        :param rules: evidence rules mapped to their scores
        :return: generator over the scores that count towards the total
        """
        groups = defaultdict(list)
        for rule, score in rules.items():
            try:
                # rules that are scored as a group are collected first
                groups[SCORE_GROUP[func][rule]].append(score)
            except KeyError:
                # rules without a group are scored directly
                yield score
        for _, scores in groups.items():
            # a group only contributes its highest score
            yield max(scores)

    @staticmethod
    def judge(func, **rules):
        """
        Determine the pathogenicity of a combination of evidence rules
        :param func: variant type
        :param rules: the selected evidence rules
        :return: rule scores, total score and pathogenicity
        """
        # look up the score of every selected rule
        # rules = {
        #     rule: settings.DEFAULT_SCORE[func][rule] for rule, check in rules.items() if check
        # }
        rules_value = {}
        for rule, check in rules.items():
            if check in PVS1.values():
                rules_value['pvs1'] = settings.DEFAULT_SCORE[func][check]
            elif check:
                rules_value[rule] = settings.DEFAULT_SCORE[func][rule]
        # merge the scores of all rules
        score = sum(AnnotateHelper.merge_score(func, **rules_value))
        # determine pathogenicity: first level whose comparison holds wins,
        # otherwise the last level is used
        for op, cutoff, level in PATHOGENICITY_LEVELS[:-1]:
            if op(score, cutoff):
                pathogenicity = level
                break
        else:
            pathogenicity = PATHOGENICITY_LEVELS[-1][2]
        return rules_value, score, pathogenicity

    def annotate(self, chromosome, start, end, func, error=0):
        """
        Annotate the given CNV
        :param chromosome: chromosome name
        :param start: start position
        :param end: end position
        :param func: variant type (``del`` or ``dup``)
        :param error: position error margin; widens the outer interval and
            narrows the inner interval by this amount on each side
        :return: annotation result
        :raises ValueError: when ``func`` is neither ``del`` nor ``dup``
        """
        annotation = dict(chromosome=chromosome,
                          start=start,
                          end=end,
                          length=end - start,
                          error=error,
                          outer_start=start - error,
                          outer_end=end + error,
                          inner_start=start + error,
                          inner_end=end - error,
                          func=func,
                          break_point_genes=list())

        annotation['inner_overlap_genes'] = list(
            self._gene_database.overlap(
                chromosome,
                annotation['inner_start'],
                annotation['inner_end'],
            ))

        annotation['outer_overlap_genes'] = list(
            self._gene_database.overlap(
                chromosome,
                annotation['outer_start'],
                annotation['outer_end'],
            ))

        annotation['overlap_omim_genes'] = list(
            self._omim_gene_database.overlap(chromosome,
                                             annotation['inner_start'],
                                             annotation['inner_end']))

        annotation['overlap_func_regions'] = list(
            self._func_region_database.overlap(chromosome,
                                               annotation['outer_start'],
                                               annotation['outer_end']))

        annotation['overlap_hi_genes'] = list(
            self._hi_gene_database.overlap(chromosome,
                                           annotation['inner_start'],
                                           annotation['inner_end']))

        annotation['overlap_hi_exons'] = self._hi_exon_database.overlap_groups(
            chromosome, annotation['inner_start'], annotation['inner_end'],
            lambda record: record[0].gene_id)

        annotation['overlap_hi_cds'] = self._hi_cds_database.overlap_groups(
            chromosome, annotation['inner_start'], annotation['inner_end'],
            lambda record: record[0].gene_id)

        # a ValueError from fetch is treated as "no variants found"
        try:
            annotation['variants'] = list(
                self._clinvar_pathogenic_database.fetch(
                    chromosome, annotation['inner_start'],
                    annotation['inner_end']))
        except ValueError:
            annotation['variants'] = []

        annotation['overlap_hi_regions'] = list(
            self._hi_region_database.overlap(chromosome,
                                             annotation['inner_start'],
                                             annotation['inner_end']))

        annotation['overlap_decipher_genes'] = list(
            self._decipher_gene_database.overlap(chromosome,
                                                 annotation['inner_start'],
                                                 annotation['inner_end']))

        annotation['overlap_uhi_genes'] = list(
            self._uhi_gene_database.overlap(chromosome,
                                            annotation['outer_start'],
                                            annotation['outer_end']))

        annotation['overlap_uhi_regions'] = list(
            self._uhi_region_database.overlap(chromosome,
                                              annotation['outer_start'],
                                              annotation['outer_end']))

        annotation['overlap_ts_genes'] = list(
            self._ts_gene_database.overlap(chromosome,
                                           annotation['inner_start'],
                                           annotation['inner_end']))

        annotation['overlap_ts_regions'] = list(
            self._ts_region_database.overlap(chromosome,
                                             annotation['inner_start'],
                                             annotation['inner_end']))

        annotation['overlap_uts_genes'] = list(
            self._uts_gene_database.overlap(chromosome,
                                            annotation['outer_start'],
                                            annotation['outer_end']))

        annotation['overlap_uts_regions'] = list(
            self._uts_region_database.overlap(chromosome,
                                              annotation['outer_start'],
                                              annotation['outer_end']))

        annotation['dgv_gain_records'] = list(
            self._dgv_gain_database.overlap(chromosome,
                                            annotation['outer_start'],
                                            annotation['outer_end']))

        annotation['dgv_loss_records'] = list(
            self._dgv_loss_database.overlap(chromosome,
                                            annotation['outer_start'],
                                            annotation['outer_end']))

        annotation['gnomad_del_records'] = list(
            self._gnomad_del_database.overlap(chromosome,
                                              annotation['outer_start'],
                                              annotation['outer_end']))

        annotation['gnomad_dup_records'] = list(
            self._gnomad_dup_database.overlap(chromosome,
                                              annotation['outer_start'],
                                              annotation['outer_end']))

        annotation['cnv_syndrome_loss'] = list(
            self._cnv_syndrome_del_database.overlap(chromosome,
                                                    annotation['outer_start'],
                                                    annotation['outer_end']))

        annotation['cnv_syndrome_gain'] = list(
            self._cnv_syndrome_dup_database.overlap(chromosome,
                                                    annotation['outer_start'],
                                                    annotation['outer_end']))

        if func == 'del':
            annotation = self._annotate_loss(**annotation)
        elif func == 'dup':
            annotation = self._annotate_gain(**annotation)
        else:
            raise ValueError('Unknown func `{}`'.format(func))

        annotation['rules'], annotation['score'], annotation[
            'pathogenicity'] = self.judge(func, **annotation['rules'])

        # PVS1: the intragenic rule (2E for del, 2I for dup) takes the PVS1
        # score, and the 'pvs1' entry moves from the rules to the top level
        if func == 'del' and '2E' in annotation['rules'].keys():
            annotation['rules']['2E'] = annotation['rules'].get('pvs1')
        elif func == 'dup' and '2I' in annotation['rules'].keys():
            annotation['rules']['2I'] = annotation['rules'].get('pvs1')
        annotation['pvs1'] = annotation['rules'].pop('pvs1', None)

        return annotation

    def _serializer(self, anno_result):
        """Flatten an ``annotate`` result into a dict of printable values."""
        seri = {}
        seri['inner_gene'] = ','.join(
            x[0].symbol for x in anno_result['inner_overlap_genes'])
        seri['inner_omim_gene'] = ','.join(
            x[0].symbol for x in anno_result['overlap_omim_genes'])
        seri['HI_gene'] = ','.join(f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
                                   for x in anno_result['overlap_hi_genes'])
        seri['HI_region'] = SEP.join(
            f'{x[0].name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_hi_regions'])
        seri['TS_gene'] = ','.join(f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
                                   for x in anno_result['overlap_ts_genes'])
        seri['TS_region'] = ','.join(
            f'{x[0].name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_ts_regions'])
        seri['Pred_HI_gene'] = ','.join(
            f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_decipher_genes'])
        seri['auto_evidence'] = ','.join(sorted(anno_result['rules']))
        seri['auto_evidence_score'] = ','.join(
            f'{k}:{anno_result["rules"][k]}'
            for k in sorted(anno_result['rules']))
        seri['benign_hi_gene'] = ','.join(
            f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_uhi_genes'])
        seri['benign_hi_region'] = ','.join(
            f'{x[0].name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_uhi_regions'])
        seri['benign_ts_gene'] = ','.join(
            f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_uts_genes'])
        seri['benign_ts_region'] = ','.join(
            f'{x[0].name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_uts_regions'])
        seri['dgv_loss_records'] = ','.join(
            f'{x[0].id}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['dgv_loss_records'])
        seri['dgv_gain_records'] = ','.join(
            f'{x[0].id}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['dgv_gain_records'])
        seri['gnomad_loss_records'] = ','.join(
            f'{x[0].chrom}:{x[0].start}-{x[0].end}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['gnomad_del_records'])
        seri['gnomad_gain_records'] = ','.join(
            f'{x[0].chrom}:{x[0].start}-{x[0].end}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['gnomad_dup_records'])
        seri['cnv_syndrome_gain'] = ','.join(
            f'{x[0].disease_name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['cnv_syndrome_gain'])
        seri['cnv_syndrome_loss'] = ','.join(
            f'{x[0].disease_name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['cnv_syndrome_loss'])
        seri['auto_score'] = anno_result['score']
        seri['auto_pathogenicity'] = anno_result['pathogenicity']
        seri['pvs1'] = anno_result['pvs1']
        return seri

    def _seri_anno(self, seri: pd.Series) -> pd.Series:
        """Annotate one input row and return it with the result columns appended.

        Empty strings in the result are replaced by ``'-'`` and missing
        values by ``DEFAULT_EMPTY_VALUE``.
        """
        anno_result = self.annotate(seri['chr'], seri['start'], seri['end'],
                                    seri['type'], seri['error'])
        extra = pd.Series(self._serializer(anno_result)).replace(
            '', '-').fillna(DEFAULT_EMPTY_VALUE)
        # pd.concat instead of Series.append, which pandas 2.0 removed
        return pd.concat([seri, extra])

    def annotation_file(self, file_path, result_path):
        """
        annotate specified file, required columns: chr, start, end, type, error
        :param file_path: input file (TSV, or Excel when the name ends with ``xlsx``)
        :param result_path: result file path (TSV, or Excel when the name ends with ``xlsx``)
        :return: -
        """
        if file_path.endswith('xlsx'):
            input_df = pd.read_excel(file_path)
        else:
            input_df = pd.read_csv(file_path, sep='\t')
        input_df['chr'] = input_df['chr'].map(self._norm_chrom)
        # show a progress bar when tqdm is installed, otherwise run silently
        try:
            from tqdm import tqdm
            tqdm.pandas()
            input_df = input_df.progress_apply(self._seri_anno, axis=1)
        except ImportError:
            input_df = input_df.apply(self._seri_anno, axis=1)
        if result_path.endswith('xlsx'):
            input_df.to_excel(result_path, index=False)
        else:
            input_df.to_csv(result_path, sep='\t', index=False)
def get_metrics(ftest, fbase_vcf, fbase_bed, contigs, variant_types, min_ro,
                padding, samples, metric_prefix, max_warnings):
    """Compute summary metrics for a test VCF, optionally against a baseline.

    The test VCF is read fully into memory. Counts by variant type are taken
    over all records and over records that pass filtering; size, allele
    frequency, varGQ and evidence distributions are taken over passing
    records only. When a baseline VCF (``fbase_vcf``) or BED (``fbase_bed``)
    is given, evaluation metrics against it are added as well; the VCF takes
    precedence when both are supplied.

    :param ftest: path of the VCF under test
    :param fbase_vcf: path of a baseline VCF, or None
    :param fbase_bed: path of a baseline BED file, or None
    :param contigs: contigs passed on to error counting and tree building
    :param variant_types: variant type names used as keys of the count tables
    :param min_ro: passed through to ``add_evaluation_metrics``
    :param padding: passed through to the interval tree builders
    :param samples: samples passed to ``check_header``
    :param metric_prefix: string prepended to every metric name
    :param max_warnings: passed through to ``count_errors``
    :return: tuple ``(metrics, fp_intervals, fn_intervals, fp_intervals_pass,
        fn_intervals_pass)``; the interval entries are None when the
        corresponding comparison was not run
    :raises ValueError: if the test and baseline VCFs disagree on being
        genotyped or on carrying the varGQ / EVIDENCE fields
    """
    test_vcf = VariantFile(ftest)
    check_header(test_vcf, samples)
    genotyped = check_if_genotyped(test_vcf)
    has_vargq = check_if_vargq(test_vcf)
    collect_evidence = check_if_evidence(test_vcf)
    test_records = list(test_vcf.fetch())
    unfiltered_variant_type_counts = get_count_by_type(test_records,
                                                       variant_types)

    allowed_filters = set(PASSING_FILTERS)

    def is_passing(record):
        # Passing means PASS is set, or no filter outside PASSING_FILTERS is.
        return ("PASS" in record.filter
                or not (set(record.filter) - allowed_filters))

    def build_trees(records):
        return iu.create_trees_from_records(records, variant_types, contigs,
                                            padding=padding)

    pass_records = [rec for rec in test_records if is_passing(rec)]
    error_counts = count_errors(test_records, contigs, max_warnings)
    variant_type_counts = get_count_by_type(pass_records, variant_types)
    size_counts = get_distributions_by_type(pass_records, variant_types,
                                            "SVLEN", SIZES,
                                            exclude_types=['BND'])
    metrics = add_error_count_metrics({}, error_counts, metric_prefix)

    base_tree = None
    base_pass_tree = None
    if fbase_vcf is not None:
        base_vcf = VariantFile(fbase_vcf)
        if genotyped != check_if_genotyped(base_vcf):
            raise ValueError(
                "One of the vcfs seems to be genotyped but the other does not")
        if has_vargq != check_if_vargq(base_vcf):
            raise ValueError(
                "One of the vcfs has the varGQ field but the other does not")
        if collect_evidence != check_if_evidence(base_vcf):
            raise ValueError(
                "One of the vcfs has the EVIDENCE field but the other does not"
            )
        base_records = list(base_vcf.fetch())
        test_tree = build_trees(test_records)
        base_tree = build_trees(base_records)
        base_pass_tree = build_trees(
            [rec for rec in base_records if is_passing(rec)])
    elif fbase_bed is not None:
        base_records = parse_bed_file(fbase_bed)
        test_tree = build_trees(test_records)
        base_tree = iu.create_trees_from_bed_records(base_records,
                                                     variant_types, contigs,
                                                     padding=padding)

    fp_intervals = fn_intervals = None
    if base_tree is not None:
        metrics, fp_intervals, fn_intervals = add_evaluation_metrics(
            metrics, test_tree, base_tree, variant_types, min_ro,
            metric_prefix)

    fp_intervals_pass = fn_intervals_pass = None
    if base_pass_tree is not None:
        metrics, fp_intervals_pass, fn_intervals_pass = add_evaluation_metrics(
            metrics, test_tree, base_pass_tree, variant_types, min_ro,
            metric_prefix, metric_suffix="_pass")

    if genotyped:
        allele_frequencies, num_singletons = get_allele_frequency_counts(
            pass_records, test_vcf.header, variant_types)
    if has_vargq:
        vargq_counts = get_distributions_by_type(pass_records, variant_types,
                                                 "varGQ", VARGQ_BINS)
    if collect_evidence:
        evidence_counts = collect_evidence_fields(pass_records, variant_types)

    for sv_type in variant_types:
        key_stem = metric_prefix + VCF_METRIC_STR + sv_type
        metrics[key_stem + "_count"] = unfiltered_variant_type_counts[sv_type]
        metrics[key_stem + "_pass_count"] = variant_type_counts[sv_type]
        if sv_type != 'BND':
            metrics = add_binned_metrics(size_counts, SIZES, sv_type, metrics,
                                         metric_prefix, "pass_size")
        if genotyped:
            metrics = add_binned_metrics(allele_frequencies, AF_BINS, sv_type,
                                         metrics, metric_prefix, "pass_af")
            if sv_type in num_singletons:
                metrics[key_stem + "_pass_ac_1"] = num_singletons[sv_type]
        if has_vargq:
            metrics = add_binned_metrics(vargq_counts, VARGQ_BINS, sv_type,
                                         metrics, metric_prefix, "pass_vargq")
        if collect_evidence:
            metrics = add_metrics_from_dict(evidence_counts, sv_type, metrics,
                                            metric_prefix, "pass_evidence")

    return (metrics, fp_intervals, fn_intervals, fp_intervals_pass,
            fn_intervals_pass)
def _lookup_base_coverage(coverage_file, contig, position):
    """Return the fields of the coverage row for ``contig`` / ``position``.

    The coverage file is scanned line by line; the first row whose first two
    whitespace-separated fields equal ``contig`` and ``str(position)`` exactly
    is returned as a list of strings. Returns None when no row matches.
    An unreadable coverage file is reported and the process exits with
    status 1.
    """
    wanted = [contig, str(position)]
    try:
        with open(coverage_file) as handle:
            for row in handle:
                fields = row.split()
                if fields[:2] == wanted:
                    return fields
    except (IOError, OSError) as err:
        print('Error reading coverage file: ' + str(err))
        raise SystemExit(1)
    return None


def _genotypes_agree(called, truth):
    """Return True when two GT tuples are treated as the same genotype."""
    if called == truth:
        return True
    # Same alleles in swapped order, when the first called allele is
    # missing (None) or 1.
    if ((called[0] is None or called[0] == 1)
            and called[0] == truth[1] and called[1] == truth[0]):
        return True
    # Heterozygous 0/1 versus 1/0 (either way round) counts as a match.
    if called[0] == 0 and called[1] == 1 and truth[0] == 1 and truth[1] == 0:
        return True
    if called[0] == 1 and called[1] == 0 and truth[0] == 0 and truth[1] == 1:
        return True
    return False


def check_genotype(folder, sample, coverage_file):
    """
    Compares the genotype for all shared variants

    Genotypes of the 'INTEGRATION' sample in ``<folder>/0002.vcf`` are indexed
    by (contig, position, alleles) and compared with the genotype of
    ``sample`` for every record in ``<folder>/0003.vcf``. For mismatching
    single-base substitutions the per-base coverage is looked up in
    ``coverage_file`` at ``rec.pos - 1``: field 2 of the matching row is taken
    as total coverage and fields 3-6 as the A, C, G and T counts.

    :param folder: location of results from the NGS analysis pipeline
    :param sample: sample number (used in vcf file)
    :param coverage_file: file containing coverage information for each position in the panel
    :return: dictionary of number of matching variants and detailed information for any with mismatching genotypes
    :raises KeyError: if a record of 0003.vcf has no counterpart in 0002.vcf
    """
    base_column = {'A': 3, 'C': 4, 'G': 5, 'T': 6}

    giab_genotypes = {}
    with VariantFile(folder + '/0002.vcf') as shared_giab:
        for rec in shared_giab.fetch():
            # The first record seen for a site/allele combination wins.
            giab_genotypes.setdefault((rec.contig, rec.pos, rec.alleles),
                                      rec.samples['INTEGRATION']['GT'])

    matching = 0
    mismatching = []
    with VariantFile(folder + '/0003.vcf') as shared_patient:
        for rec in shared_patient.fetch():
            chrom = rec.contig
            pos = rec.pos
            alleles = rec.alleles
            call = rec.samples[sample]
            allelic_depth = call['AD'] if 'AD' in call.keys() else 'N/A'
            total_depth = call['DP']
            genotype = call['GT']
            giab_genotype = giab_genotypes[(chrom, pos, alleles)]

            if _genotypes_agree(genotype, giab_genotype):
                matching += 1
                continue

            if len(alleles[0]) == 1 and len(alleles[1]) == 1:
                fields = _lookup_base_coverage(coverage_file, chrom, pos - 1)
                if fields is None:
                    coverage = {'total': 'no coverage information',
                                'ref': 'N/A', 'alt': 'N/A'}
                else:
                    coverage = {'total': fields[2],
                                'ref': fields[base_column[alleles[0]]],
                                'alt': fields[base_column[alleles[1]]]}
            else:
                coverage = {'total': 'indel: no coverage could be obtained',
                            'ref': 'N/A', 'alt': 'N/A'}

            mismatching.append({
                'chrom': chrom,
                'pos': pos,
                'ref': alleles[0],
                'alt': alleles[1],
                'QUAL': rec.qual,
                'GT': {sample: genotype, 'GIAB': giab_genotype},
                'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                'coverage': coverage,
            })

    print(str(matching) + ' matching variants')
    results = {'matching': matching, 'mismatching': mismatching}
    print(results)
    return results
def _lookup_base_coverage(coverage_file, contig, position):
    """Return the fields of the coverage row for ``contig`` / ``position``.

    The coverage file is scanned line by line; the first row whose first two
    whitespace-separated fields equal ``contig`` and ``str(position)`` exactly
    is returned as a list of strings. Returns None when no row matches.
    An unreadable coverage file is reported and the process exits with
    status 1.
    """
    wanted = [contig, str(position)]
    try:
        with open(coverage_file) as handle:
            for row in handle:
                fields = row.split()
                if fields[:2] == wanted:
                    return fields
    except (IOError, OSError) as err:
        print('Error reading coverage file: ' + str(err))
        raise SystemExit(1)
    return None


def annotate_false_negs(folder, ref_sample, coverage_file):
    """
    Get information for any false negative results.

    Returns basic variant info plus quality, genotype, coverage (total, ref
    base and alt base if appropriate). Records of ``<folder>/0000.vcf`` are
    sorted into four categories:

    * ``indels`` - REF or ALT longer than one base; no coverage is looked up
    * ``no_coverage`` - no row in the coverage file, or total coverage of 0
    * ``evidence_of_alt`` - non-zero coverage of the alternate base
    * ``false_neg`` - covered position without any alternate-base reads

    Coverage is looked up at ``rec.pos - 1``; field 2 of the matching row is
    taken as total coverage and fields 3-6 as the A, C, G and T counts.

    :param folder: Folder containing output from bcftools isec
    :type folder: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :type coverage_file: String
    :return: Dictionary mapping each category name to a list of variant dictionaries
    :rtype: Dict
    """
    base_column = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
    variants = {'indels': [], 'no_coverage': [], 'evidence_of_alt': [],
                'false_neg': []}

    with VariantFile(folder + '/0000.vcf') as false_negs:
        records = list(false_negs.fetch())
        print(len(records))
        if not records:
            print('no false negatives')
            return variants

        print('false negatives')
        for rec in records:
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            variant = {'chrom': rec.contig, 'pos': int(rec.pos), 'ref': ref,
                       'alt': alt, 'QUAL': rec.qual,
                       'GT': rec.samples[ref_sample]['GT']}

            if len(ref) != 1 or len(alt) != 1:
                variant['coverage'] = {
                    'total': 'indel: no coverage could be obtained',
                    'ref': 'N/A', 'alt': 'N/A'}
                variants['indels'].append(variant)
                continue

            fields = _lookup_base_coverage(coverage_file, rec.contig,
                                           rec.pos - 1)
            if fields is None:
                variant['coverage'] = {'total': 'no coverage information',
                                       'ref': 'N/A', 'alt': 'N/A'}
                variants['no_coverage'].append(variant)
                continue

            cov = fields[2]
            alt_cov = fields[base_column[alt]]
            variant['coverage'] = {'total': cov,
                                   'ref': fields[base_column[ref]],
                                   'alt': alt_cov}
            # The counts are text; compare them numerically.
            if int(cov) == 0:
                category = 'no_coverage'
            elif int(alt_cov) != 0:
                category = 'evidence_of_alt'
            else:
                category = 'false_neg'
            variants[category].append(variant)

    return variants
def run_process(opts, inputvcf):
    """Annotate a VCF with INFO/TYPE and the NGB_DP/AO/RO/VAF FORMAT fields.

    For every record the values are derived from the ``AD`` field of the
    first sample: ``NGB_DP`` is the sum of ``AD``, ``NGB_RO`` its first entry,
    ``NGB_AO`` the per-alt entries, and ``NGB_VAF`` each alt count divided by
    ``NGB_DP`` (0.0 when the depth is zero). ``TYPE`` is the ``variant_type``
    reported by ``ngb_functions.pairdiff`` for each REF/ALT pair. Records
    without ALT alleles are written unchanged.

    :param opts: options object; ``opts.output`` is the output path, a falsy
        value writes to standard output
    :param inputvcf: path of the input VCF
    """
    outputvcf = opts.output

    with VariantFile(inputvcf) as vcf_in:
        header = vcf_in.header
        # Only declare fields that are missing, so an already-annotated
        # input does not fail on a duplicate header definition.
        if "TYPE" not in header.info:
            header.info.add(
                "TYPE", "A", "String",
                "The type of allele, either snp, ins, del, or complex.")
        format_fields = (
            ("NGB_DP", "1", "Integer",
             "Approximate read depth; some reads may have been filtered"),
            ("NGB_AO", "A", "Integer", "Alternate allele observation count"),
            ("NGB_RO", "1", "Integer", "Reference allele observation count"),
            ("NGB_VAF", "A", "Float",
             "Allele fractions of alternate alleles in the tumor"),
        )
        for field_id, number, field_type, description in format_fields:
            if field_id not in header.formats:
                header.formats.add(field_id, number, field_type, description)

        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=header) as vcf_out:
            for record in vcf_in.fetch():
                ref = record.ref
                call = record.samples[0]
                depths = call['AD']
                total_depth = sum(depths)
                ref_count = depths[0]

                variant_types = []
                alt_counts = []
                alt_fractions = []
                for n, alt in enumerate(record.alts or ()):
                    # Variant TYPE in freebayes format
                    variant_types.append(
                        ngb_functions.pairdiff(ref, alt)['variant_type'])
                    alt_count = int(depths[n + 1])
                    alt_counts.append(alt_count)
                    # Guard against sites where every AD entry is zero.
                    alt_fractions.append(
                        float(alt_count) / total_depth if total_depth else 0.0)

                if variant_types:
                    record.info['TYPE'] = variant_types
                    call["NGB_DP"] = total_depth
                    call["NGB_AO"] = tuple(alt_counts)
                    call["NGB_RO"] = ref_count
                    call["NGB_VAF"] = tuple(alt_fractions)

                vcf_out.write(record)
#!/group/ctan/anaconda3/envs/snakemake/bin/python
"""Write a TSV of records in a fixed chr5H window that differ between groups.

Usage: <script> <input vcf/bcf> <output tsv>
"""
import sys
from vcf_ctan import samvcf
from pysam import VariantFile

samples = [
    "AC", "BD", "Commander", "EC2.1", "EC2.2", "EC7.1", "EC7.2", "Fleet",
    "Hindmarsh", "La_Trobe", "Scope", "Vlamingh", "W1", "WI4304", "X1",
    "barke", "bowman", "haruna_Nijo", "igri", "spontaneum_B1k-04-12"
]
# The two sample groups that are compared against each other.
grp1 = [samples[1], samples[10], samples[15], samples[17]]
grp2 = [samples[2], samples[8], samples[9], samples[11], samples[16]]

ibcf = VariantFile(sys.argv[1])

# One "<sample>", "Reads" column pair per sample, after the fixed columns.
hd = ["#chr", "pos", "len", "ref", "alt", "gt_count"]
for one in grp1 + grp2:
    hd = hd + [one, "Reads"]

with open(sys.argv[2], "w") as ofile:
    ofile.write("\t".join(hd) + "\n")
    for one in ibcf.fetch("chr5H", 544822373, 546294499):
        record = samvcf(one)
        # Evaluate the comparison once; a falsy result means no row is written.
        group_columns = record.diff_group(grp1, grp2)
        if group_columns:
            opt = record.opt + group_columns
            ofile.write("\t".join(opt) + "\n")
def main():
    """Compare a results VCF against expected variants listed in a BED file.

    Each non-comment BED line must have four tab-separated columns; the
    fourth is ``pos:ref:alt`` and defines an expected variant on the contig
    in the first column. Every VCF record is classified as a true positive
    (contig, position, ref and alt all expected) or a false positive;
    expected variants never seen in the VCF become false negatives. The
    summary is written as JSON to ``<out>/<sample>_summary.json``.

    Command-line arguments: ``-vcf``, ``-bed``, ``-s`` (sample ID in the VCF)
    and ``-out`` (output folder), all required.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-vcf', help='Results VCF to be compared', required=True)
    parser.add_argument('-bed', help='The reference BED file', required=True)
    parser.add_argument('-s', help='Sample ID in VCF', required=True)
    parser.add_argument('-out', help='The folder to putt results files', required=True)
    args = parser.parse_args()

    out_dir = args.out if args.out.endswith('/') else args.out + '/'
    sample = args.s

    def no_coverage():
        # A fresh dict per variant so entries never share state.
        return {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}

    # variants[chrom][pos][(ref, alt)] -> True once seen in the VCF.
    # Positions are kept as strings, exactly as they appear in the BED name.
    variants = {}
    with open(args.bed, 'r') as bed_handle:
        for line in bed_handle:
            region = line.strip('\n')
            if not region or region.startswith('#'):
                continue
            chrom, _start, _end, name = region.split('\t')
            pos, ref, alt = name.split(':')
            variants.setdefault(chrom, {}).setdefault(pos, {})[(ref, alt)] = False

    false_pos = []
    false_neg = []
    true_pos = []
    with VariantFile(args.vcf) as vcf:
        for v in vcf.fetch():
            chrom = v.contig
            pos = str(v.pos)
            ref = v.alleles[0]
            alt = v.alleles[1]
            call = v.samples[sample]

            if 'AD' in call.keys():
                allelic_depth = call['AD']
            elif 'NV' in call.keys():
                allelic_depth = call['NV']
            else:
                allelic_depth = 'N/A'

            if 'DP' in call.keys():
                total_depth = call['DP']
            elif 'NR' in call.keys():
                total_depth = call['NR']
            else:
                total_depth = 0

            variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt,
                       'QUAL': v.qual, 'GT': call['GT'],
                       'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                       'coverage': no_coverage()}

            # A contig or position missing from the BED is a false positive,
            # not an error.
            expected = variants.get(chrom, {}).get(pos, {})
            if (ref, alt) in expected:
                expected[(ref, alt)] = True
                true_pos.append(variant)
            else:
                false_pos.append(variant)

    for chrom in variants:
        for pos in variants[chrom]:
            for alleles, seen in variants[chrom][pos].items():
                if not seen:
                    false_neg.append({'chrom': chrom, 'pos': pos,
                                      'ref': alleles[0], 'alt': alleles[1],
                                      'QUAL': 0, 'GT': (0, 0),
                                      'coverage': no_coverage()})

    out = {'false_negative': {'indels': [], 'no_coverage': [],
                              'evidence_of_alt': [], 'false_neg': false_neg},
           'false_positive': false_pos,
           'mismatching_genotype': [],
           'matching_variants': len(true_pos),
           'num_true_negatives': 0,
           'sensitivity': 0,
           'MCC': 0,
           'small_panel_remainder_length': 0,
           'percent_small_panel_covered': 0,
           'num_false_positive': len(false_pos),
           'num_false_negative': {'indel': 0, 'no_coverage': 0, 'ev_of_alt': 0,
                                  'false_neg': 0, 'total': len(false_neg)},
           'num_mismatching_genotype': 0}
    all_results = {sample: out}

    with open(out_dir + sample + '_summary.json', 'w') as handle:
        handle.write(json.dumps(all_results, indent=4) + '\n')
def read_vcf(fh, alleles, slh=None):
    """Read summary statistics from a VCF into a DataFrame.

    For every record the Z score is ``ES[0] / SE[0]`` of the first sample in
    the header. The sample size ``N`` comes from the per-record ``SS`` field
    when the first record carries it; otherwise from ``TotalControls`` (plus
    ``TotalCases`` when present) on the first ``SAMPLE`` header line, and is
    NaN when neither source is available.

    :param fh: path of the VCF to read
    :param alleles: when truthy, add columns ``A1`` (first ALT allele) and
        ``A2`` (REF allele)
    :param slh: optional path of a SNP list (plain or gzip, one ID per line);
        when given, only rows whose ``SNP`` is listed are kept
    :return: DataFrame with columns ``SNP``, ``Z``, ``N`` and, if requested,
        ``A1`` and ``A2``
    :raises ValueError: if the SNP list cannot be parsed
    """
    vcf_in = VariantFile(fh)
    try:
        sample = list(vcf_in.header.samples)[0]

        has_ss = None
        ids, z_scores, sizes, a1, a2 = [], [], [], [], []
        for rec in vcf_in.fetch():
            if has_ss is None:
                # The first record decides whether SS is used for every row.
                has_ss = 'SS' in rec.format.keys()
            call = rec.samples[sample]
            ids.append(rec.id)
            z_scores.append(call['ES'][0] / call['SE'][0])
            if has_ss:
                sizes.append(call['SS'][0])
            if alleles:
                a1.append(rec.alts[0])
                a2.append(rec.ref)

        if has_ss:
            N = pd.Series(sizes, dtype='float')
        else:
            # Fall back to study-wide totals from the SAMPLE header line.
            sample_meta = next(
                (x for x in vcf_in.header.records if x.key == "SAMPLE"), None)
            meta_keys = sample_meta.keys() if sample_meta is not None else ()
            if 'TotalControls' in meta_keys and 'TotalCases' in meta_keys:
                total = (float(sample_meta['TotalControls']) +
                         float(sample_meta['TotalCases']))
            elif 'TotalControls' in meta_keys:
                total = float(sample_meta['TotalControls'])
            else:
                total = np.nan
            N = pd.Series([total] * len(ids), dtype='float')
    finally:
        vcf_in.close()

    columns = {
        'SNP': pd.Series(ids, dtype='str'),
        'Z': pd.Series(z_scores, dtype='float'),
        'N': N,
    }
    if alleles:
        columns['A1'] = pd.Series(a1, dtype='str')
        columns['A2'] = pd.Series(a2, dtype='str')
    p = pd.DataFrame(columns)

    if slh is not None:
        compression = get_compression(slh)
        opener = gzip.open if compression == "gzip" else open
        try:
            # Text mode so gzip lines are str and can match the SNP column.
            with opener(slh, 'rt') as f:
                sl = [line.strip() for line in f]
        except (AttributeError, ValueError) as e:
            raise ValueError('Improperly formatted snplist file: ' +
                             str(e.args))
        p = p.loc[p['SNP'].isin(sl)]

    return p
#!/usr/bin/env python3
"""Copy a VCF to stdout, dropping records that repeat the previous position.

Only the first of any run of consecutive records sharing the same
(chrom, pos) is written.

Usage: <script> <input vcf>
"""
from pysam import VariantFile
import sys

with VariantFile(sys.argv[1], 'r') as vcf_in:
    with VariantFile('-', 'w', header=vcf_in.header) as vcf_out:
        cp = None  # (chrom, pos) of the last record written
        for rec in vcf_in.fetch():
            site = (rec.chrom, rec.pos)
            if site != cp:
                vcf_out.write(rec)
                cp = site
def _lookup_base_coverage(coverage_file, contig, position):
    """Return the fields of the coverage row for ``contig`` / ``position``.

    The coverage file is scanned line by line; the first row whose first two
    whitespace-separated fields equal ``contig`` and ``str(position)`` exactly
    is returned as a list of strings. Returns None when no row matches.
    An unreadable coverage file is reported and the process exits with
    status 1.
    """
    wanted = [contig, str(position)]
    try:
        with open(coverage_file) as handle:
            for row in handle:
                fields = row.split()
                if fields[:2] == wanted:
                    return fields
    except (IOError, OSError) as err:
        print('Error reading coverage file: ' + str(err))
        raise SystemExit(1)
    return None


def annotate_false_negs(folder, ref_sample, coverage_file):
    """
    Get information for any false negative results.

    Returns basic variant info plus quality, genotype, coverage (total, ref
    base and alt base if appropriate)

    False Negatives are split into categories to aid final comparison:

    * Zero coverage - No reads present
    * Evidence of alternate allele - Coverage or quality too low for variant call
    * Indels - Coverage is more difficult to obtain in these cases; currently
      they must be investigated by hand
    * All other false negatives - In these cases there are reads present and
      no evidence of the alternate allele

    A record whose (contig, position, ref, alt) was already processed is
    reported as "duplicate" and skipped. Coverage is looked up at
    ``rec.pos - 1``; field 2 of the matching row is taken as total coverage
    and fields 3-6 as the A, C, G and T counts.

    :param folder: Folder containing output from bcftools isec
    :type folder: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :type coverage_file: String
    :return: Dictionary mapping 'indels', 'no_coverage', 'evidence_of_alt'
        and 'false_neg' to lists of variant dictionaries
    :rtype: Dict
    """
    base_column = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
    variants = {'indels': [], 'no_coverage': [], 'evidence_of_alt': [],
                'false_neg': []}
    seen = set()
    count = 0

    with VariantFile(folder + '/0000.vcf') as false_negs:
        records = list(false_negs.fetch())
        print(len(records))
        if records:
            print('false negatives')
        else:
            print('no false negatives')

        for rec in records:
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]

            site = (chrom, pos, ref, alt)
            if site in seen:
                print("duplicate")
                continue
            seen.add(site)
            count += 1

            variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt,
                       'QUAL': rec.qual,
                       'GT': rec.samples[ref_sample]['GT']}

            if len(ref) != 1 or len(alt) != 1:
                variant['coverage'] = {
                    'total': 'indel: no coverage could be obtained',
                    'ref': 'N/A', 'alt': 'N/A'}
                variants['indels'].append(variant)
                continue

            fields = _lookup_base_coverage(coverage_file, chrom, rec.pos - 1)
            if fields is None:
                variant['coverage'] = {'total': 'no coverage information',
                                       'ref': 'N/A', 'alt': 'N/A'}
                variants['no_coverage'].append(variant)
                continue

            cov = fields[2]
            alt_cov = fields[base_column[alt]]
            variant['coverage'] = {'total': cov,
                                   'ref': fields[base_column[ref]],
                                   'alt': alt_cov}
            if int(cov) == 0:
                category = 'no_coverage'
            elif int(alt_cov) != 0:
                category = 'evidence_of_alt'
            else:
                category = 'false_neg'
            variants[category].append(variant)

    print("false_negatives=" + str(count))
    return variants
def fetch(self, chrm, pos_start, pos_end, return_samples=False):
    """Return the records of one region from the per-chromosome VCF.

    The file opened is ``<self.pop_vcf_stem>.<chrm>.vcf.gz``. Sample columns
    are dropped on open unless ``return_samples`` is true.

    :param chrm: chromosome name; used both in the file name and as the
        contig of the query
    :param pos_start: start of the region, passed to ``VariantFile.fetch``
    :param pos_end: end of the region, passed to ``VariantFile.fetch``
    :param return_samples: keep per-sample data when true
    :return: the iterator returned by ``VariantFile.fetch``
    """
    path = "%s.%s.vcf.gz" % (self.pop_vcf_stem, chrm)
    skip_samples = not return_samples
    handle = VariantFile(path, drop_samples=skip_samples)
    region_records = handle.fetch(chrm, pos_start, pos_end)
    return region_records
Args: query (pysam.VariantRecord): query breakend targets (pysam.VariantFile): vcf file with target breakends ignore_strands, ignore_alt_pos (bool): argument to pass to record_matches Returns: bool: If there is any match, returns True. If there is no match, returns False. """ within_distance_targets = targets.fetch(query.chrom, query.start - dist, query.start + dist) for candidate_hit in within_distance_targets: if record_matches(query, candidate_hit, ignore_strands = ignore_strands, ignore_alt_pos = ignore_alt_pos): return True return False ## Read vcf records from input file out_file = open(sys.argv[3], "w") for rec in vcf_query.fetch(): if rec.info["SVTYPE"] == "BND" or rec.info["SVTYPE"] == "DEL" or rec.info["SVTYPE"] == "DUP": rec_has_match = has_match(rec, vcf_target) elif rec.info["SVTYPE"] == "INS": rec_has_match = has_match(rec, vcf_target, ignore_alt_pos = True, ignore_strands = True) out_file.write(rec.id + "\t" + rec.info["SVTYPE"] + "\t" + str(rec_has_match) + "\n") out_file.close()
def run_process(opts, inputvcf):
    """Flag low-confidence calls via INFO/LOW_CONFIDENCE and FILTER.

    For every ALT allele of a record up to six labels are collected, in this
    order: the low-confidence database type, ``strand_biased``,
    ``homopolymer``, ``repeat_sequence``, ``lowcoverage_indel`` and
    ``lowcoverage_snv``. The labels of one allele are joined with ``|``; the
    non-empty per-allele strings are joined with ``,`` and stored in
    ``INFO/LOW_CONFIDENCE``. Every distinct label is then added as a FILTER,
    ``PASS`` is set when no filter remains, and filters named in the
    module-level ``remove_filter_list`` are removed again before the record
    is written.

    :param opts: options object providing ``database``, ``output``,
        ``minhomopolyx``, ``minrepeatcount``, ``maxvaf``, ``indelmaxdp``,
        ``indelmaxao``, ``indelmaxvaf`` and ``snvmaxdp``; a falsy ``output``
        writes to standard output
    :param inputvcf: path of the input VCF; records must carry the NGB_VAF,
        NGB_AO and NGB_DP FORMAT fields and the TYPE INFO field
    """
    db_file = opts.database
    outputvcf = opts.output
    minhomopolyx = int(opts.minhomopolyx)
    minrepeatcount = int(opts.minrepeatcount)
    maxvaf = float(opts.maxvaf)
    indelmaxdp = int(opts.indelmaxdp)
    indelmaxao = int(opts.indelmaxao)
    indelmaxvaf = float(opts.indelmaxvaf)
    snvmaxdp = int(opts.snvmaxdp)

    # Low-confidence database: obj1 is keyed by exact variant id,
    # obj2 is a sequence of chrom/start/end/type ranges.
    lowconfobj1, lowconfobj2 = lowconfdb2obj(db_file)

    filter_descriptions = (
        ("homopolymer", "Homopolymer Sequence Region"),
        ("repeat_sequence", "Repeat Sequence Region"),
        ("sequencing_error", "Sequencing Error Low Confidence Region"),
        ("mapping_error", "Mapping Error Low Confidence Region"),
        ("snp_candidate", "SNP Candidates"),
        ("strand_biased", "Strand Biased (Freebayes)"),
        ("lowcoverage_indel", "Low Coverage (DP,AO,VAF) Indels"),
        ("lowcoverage_snv", "Low Coverage (DP) SNVs"),
    )

    with VariantFile(inputvcf) as vcf_in:
        header = vcf_in.header
        if not ngb_functions.vcfHeaderCheck(header.info, "LOW_CONFIDENCE"):
            header.info.add("LOW_CONFIDENCE", ".", "String",
                            "Low Confidence Type")
        for filter_id, description in filter_descriptions:
            if not ngb_functions.vcfHeaderCheck(header.filters, filter_id):
                header.filters.add(filter_id, None, None, description)

        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=header) as vcf_out:
            for record in vcf_in.fetch():
                chrom = record.chrom
                pos = record.pos
                ref = record.ref
                alts = record.alts
                info = record.info
                call = record.samples[0]

                # NOTE(review): depth, VAF, TYPE and ALT length are taken from
                # the first ALT allele only and applied to every allele of the
                # record - confirm inputs are decomposed to one ALT per line.
                vaf = float(call["NGB_VAF"][0])
                ao = int(call["NGB_AO"][0])
                dp = int(call["NGB_DP"])
                vtype = info["TYPE"][0]
                reflen = len(ref)
                altlen = len(alts[0])

                # Labels that do not depend on the allele index are computed
                # once per record.
                # NOTE(review): the repeat check is assumed to sit under the
                # same ``vaf < maxvaf`` guard as the homopolymer check.
                homopolymerinfo = ""
                repeatinfo = ""
                if vaf < maxvaf:
                    if ("HOMOPOLYX" in info
                            and int(info["HOMOPOLYX"][0]) >= minhomopolyx):
                        homopolymerinfo = "homopolymer"
                    if ("REPEAT_COUNT" in info
                            and int(info["REPEAT_COUNT"][0]) >= minrepeatcount):
                        repeatinfo = "repeat_sequence"

                lowcovindelinfo = ""
                if (altlen != reflen and vtype in ("ins", "del", "complex")
                        and (vaf < indelmaxvaf or ao < indelmaxao
                             or dp < indelmaxdp)):
                    lowcovindelinfo = "lowcoverage_indel"

                lowcovsnvinfo = ""
                if (altlen == reflen and vtype in ("snp", "complex")
                        and dp < snvmaxdp):
                    lowcovsnvinfo = "lowcoverage_snv"

                lowconf_info_list = []
                for i, alt in enumerate(alts):
                    # Exact-match database lookup
                    id1 = chrom + '-' + str(pos) + '-' + ref + '-' + alt
                    lowconf = lowconfobj1[id1] if id1 in lowconfobj1 else ""
                    # NOTE(review): a matching range entry is assumed to
                    # override the exact-match result (last match wins) -
                    # confirm against the intended precedence.
                    for lowconfdata in lowconfobj2:
                        if (chrom == lowconfdata["chrom"]
                                and int(lowconfdata["start"]) <= pos
                                <= int(lowconfdata["end"])):
                            lowconf = lowconfdata["type"]

                    # Strand bias (freebayes SAF/SAR/RPR/RPL annotations)
                    strandbiased = ""
                    if "SAF" in info:
                        if (info["SAF"][i] == 0 or info["SAR"][i] == 0
                                or info["RPR"][i] < 1 or info["RPL"][i] < 1):
                            strandbiased = "strand_biased"

                    labels = (lowconf, strandbiased, homopolymerinfo,
                              repeatinfo, lowcovindelinfo, lowcovsnvinfo)
                    joined = "|".join(
                        label for label in labels if label != '')
                    if joined != '':
                        lowconf_info_list.append(joined)

                if lowconf_info_list:
                    record.info['LOW_CONFIDENCE'] = ','.join(
                        str(e) for e in lowconf_info_list)

                # Add FILTER: one entry per distinct label, in sorted order so
                # the output is reproducible.
                filter_names = set()
                if 'LOW_CONFIDENCE' in record.info:
                    for lowconf_info in record.info['LOW_CONFIDENCE']:
                        filter_names.update(lowconf_info.split("|"))
                for filter_name in sorted(filter_names):
                    record.filter.add(filter_name)

                # PASS FILTER
                if not list(record.filter):
                    record.filter.add("PASS")

                # Remove Filter
                for rf in remove_filter_list:
                    if rf in list(record.filter):
                        del record.filter[rf]

                vcf_out.write(record)
if not tb: return novel try: records = list(tb.query(chr, pos - 1, pos)) if not records: return novel return records[0][2] except tabix.TabixError: return novel reader = Vcf(infile) writer.meta.add('Variant') writer.meta.add(*tuple(reader.header.samples)) writer.writeHead() for r in reader.fetch(): alts = r.alts if not alts: continue if bialt and len(alts) != 1: continue if bialt and len(r.ref) != 1: continue alts = list(alts) refalts = [r.ref] + alts name = r.id if useid and r.id and r.id != '.' else getRsName( r.chrom, r.pos) for alt in alts: record = TsvRecord() record.Variant = '{chr}_{pos}_{name}_{ref}_{alt}'.format(chr=r.chrom, pos=r.pos, name=name, ref=r.ref, alt=alt)
import pysam
from pysam import VariantFile
from pysam import TabixFile
from pyfaidx import Fasta

# data files
reference_file = 'S_lycopersicum_chromosomes.2.40.fa'
annotation_file = 'gene_models.gff.gz'
variant_file = 'tomato_snps.bcf'

# load reference
reference = Fasta(reference_file)
# load annotations
annotations = TabixFile(annotation_file)
# load variants
variants = VariantFile(variant_file)

# regions to query
region1 = ("SL2.40ch01", 15000, 21000)
region2 = ("SL2.40ch01", 20000, 70000)

# For each region: reference slice, annotation rows parsed with the GTF
# parser, and the overlapping variant records.
region1_reference = reference[region1[0]][region1[1]: region1[2]]
region1_annotations = list(annotations.fetch(*region1, parser=pysam.asGTF()))
region1_variants = list(variants.fetch(*region1))

region2_reference = reference[region2[0]][region2[1]: region2[2]]
region2_annotations = list(annotations.fetch(*region2, parser=pysam.asGTF()))
region2_variants = list(variants.fetch(*region2))
# Rewrites the FILTER column of a VCF.
# With no `rmfilter` configured every record is reset to PASS; otherwise only
# the listed filter names are dropped, and a record left with no filter at all
# becomes PASS.
# NOTE(review): the {{...}} placeholders are presumably substituted by the
# pipeline's template engine before this script is executed.
from pysam import VariantFile as Vcf
from pyppl import Box
from bioprocs.utils import alwaysList

infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
rmfilter = {{args.rmfilter | repr}}
if rmfilter:
    rmfilter = alwaysList(rmfilter)

invcf = Vcf(infile)
try:
    # `with` guarantees the output is flushed and closed even if a record
    # fails half-way through.
    with open(outfile, 'w') as outvcf:
        outvcf.write(str(invcf.header))
        for rec in invcf.fetch():
            # Work on the text form of the record; column 7 (index 6) is FILTER.
            parts = str(rec).split('\t')
            if rmfilter:
                kept = [f for f in parts[6].split(';') if f not in rmfilter]
                parts[6] = ';'.join(kept) or 'PASS'
            else:
                parts[6] = 'PASS'
            outvcf.write('\t'.join(parts))
finally:
    invcf.close()
inF1 = VariantFile(args['<input1>'], 'r') inF2 = VariantFile(args['<input2>'], 'r') Record = Record(inF2) #check smaples in two input file, same samples, and same order. if len(inF1.header.samples) != len(inF2.header.samples): sys.stderr.write('ERROR: different number of samples in two input files.\n') sys.exit(-1) else: for x, y in zip( inF1.header.samples, inF2.header.samples): if x != y: sys.stderr.write('ERROR: two input files should have the same samples, and ordered in same order.\n') sys.exit(-1) #output vcf header sys.stdout.write('%s'%(str(inF1.header))) for line in inF1.fetch(): if len(line.alleles) != 2: sys.stderr.write('ERROR: please decompose the input vcf, only one alt allele permited each line, error record:\n%s\n' %(line)) sys.exit(-1) ss = str(line).strip().split() #print(ss[0]) key = ss[0] + ss[1] + ss[3] + ss[4] line2 = Record.getRecord(key, ss[0], int(ss[1])) if line2: out = ss[:vcfMetaCols] ss2 = str(line2).strip().split() for x, y in zip(ss[vcfMetaCols:], ss2[vcfMetaCols:]): if x[0] == '.' or y[0] == '.': out.append('.') elif x[0] == y[0] and x[2] == y[2]:
#!/group/ctan/anaconda3/envs/snakemake/bin/python
# Usage: <script> <input variant file> <output tsv>
# For every record of the input, writes record.opt plus the values returned by
# samvcf.extract() for the two BAM groups below, as one tab-separated row.
import sys
from vcf_ctan import samvcf
from pysam import VariantFile

samples = [
    "AC", "BD", "Commander", "EC2.1", "EC2.2", "EC7.1", "EC7.2", "Fleet",
    "Hindmarsh", "La_Trobe", "Scope", "Vlamingh", "W1", "WI4304", "X1",
    "barke", "bowman", "haruna_Nijo", "igri", "spontaneum_B1k-04-12"
]
grp = ["bam/YSX-W_HJMFHALXX_L5.rmdup.bam", "bam/TBT-M_HJMFHALXX_L4.rmdup.bam"]

ibcf = VariantFile(sys.argv[1])
#obcf = VariantFile(sys.argv[2],'w',header=ibcf.header)

# Header: fixed columns followed by a (group, "Reads") column pair per group.
hd = ["#chr", "pos", "len", "ref", "alt", "gt_count"]
for one in grp:
    hd = hd + [one, "Reads"]

# The context manager makes sure the table is flushed and closed, including
# when a record raises part-way through.
with open(sys.argv[2], "w") as ofile:
    ofile.write("\t".join(hd) + "\n")
    for one in ibcf.fetch():
        record = samvcf(one)
        # Extract once and reuse the result instead of recomputing it.
        # NOTE(review): assumes extract() is free of side effects - confirm.
        extracted = record.extract(grp)
        if extracted:
            opt = record.opt + extracted
            ofile.write("\t".join(opt) + "\n")
# Contigs to analyse: everything in the VCF header minus the excluded ones.
contigs = set(vcf_infile.header.contigs).difference(exclude_chr)
chrom_list = []
total_af = []
total_ac = []
total_sites = 0
total_callable = 0
with open(args.outfile, 'w') as outfile:
    print('Population', 'Chromosome', 'Chromosome_length', 'Sites', 'S',
          'thetaW', 'pi', 'tajd', sep='\t', file=outfile)
    for c in contigs:
        for site in vcf_infile.fetch(c):
            ac = site.info['AC'][0]
            af.append(ac / float(n))
            # total_ac is iterated count-by-count further down, so each
            # count has to be appended; `total_ac += ac` fails on an int.
            total_ac.append(ac)
        # NOTE(review): the source indentation was lost; the statements below
        # are read as running once per chromosome - confirm.
        total_af += af
        sites = calc_stats_chr(args.pop_id, af, c, args.callable, outfile)
        total_sites += sites[0]
        total_callable += sites[1]
        # Empty the shared per-chromosome buffer in place for the next contig.
        del af[:]
if args.sfs:
    # Folded spectrum: a site is binned by the smaller of its two allele counts.
    for c in total_ac:
        sfs[min(n - c, c)] += 1
def _classify_phasing(hap1_count, hap2_count, sv_support):
    """
    Decide which phased genotype the haplotype-tagged supporting reads imply

    :param hap1_count: Number of supporting reads tagged HP == 1
    :param hap2_count: Number of supporting reads tagged HP == 2
    :param sv_support: Value of the SUPPORT INFO field of the record
    :return: Tuple (genotype, statistics key), or None when there are too few
             tagged reads or no rule applies
    """
    threshold_het = 0.8
    threshold_hom = 0.2
    tagged = hap1_count + hap2_count
    # At least 85% of the reported support, and never fewer than 5 reads, so
    # the divisions below cannot hit zero.
    if tagged < max(int(0.85 * sv_support), 5):
        return None
    hap1_fraction = hap1_count / float(tagged)
    hap2_fraction = hap2_count / float(tagged)
    if threshold_hom <= hap1_fraction < threshold_het:
        return (1, 1), 'Phased HOM'
    if hap1_fraction >= threshold_het:
        return (1, 0), 'Phased HET'
    if hap2_fraction >= threshold_het:
        return (0, 1), 'Phased HET'
    return None


def phase_structural_variants(sv_vcf, long_reads_bam, workdir, chr_to_include=None):
    """
    Phase PASS structural variants using the HP tags of their supporting reads

    Writes '<input basename>.filtered.phased.vcf' and 'phasing_stat.txt' into
    workdir. Only records on the selected chromosomes whose first FILTER entry
    is PASS are counted and written; records with an empty FILTER are skipped.

    :param sv_vcf: Path to the input VCF, must end in '.vcf' or '.vcf.gz'
    :param long_reads_bam: Path to the alignment file read for HP tags
    :param workdir: Directory that receives both output files
    :param chr_to_include: Optional iterable of chromosome names to process;
                           defaults to '1'..'22', 'X', 'Y' (no 'chr' prefix)
    :return: None; returns early without output for any other file extension
    """
    sv_vcf_basename = os.path.basename(sv_vcf)
    if sv_vcf_basename.endswith('.vcf'):
        stem = sv_vcf_basename[:-4]
    elif sv_vcf_basename.endswith('.vcf.gz'):
        stem = sv_vcf_basename[:-7]
    else:
        return

    sv_filtered_phased_vcf = os.path.join(workdir, stem + '.filtered.phased.vcf')

    if chr_to_include is None:
        chr_to_include = [str(number) for number in range(1, 23)] + ['X', 'Y']
    chromosomes = set(chr_to_include)

    phasing_stat = {'INS': {'Total': 0, 'Phased HOM': 0, 'Phased HET': 0},
                    'DEL': {'Total': 0, 'Phased HOM': 0, 'Phased HET': 0},
                    'INV': {'Total': 0, 'Phased HOM': 0, 'Phased HET': 0},
                    'BND': {'Total': 0, 'Phased HOM': 0, 'Phased HET': 0},
                    'DUP:TANDEM': {'Total': 0, 'Phased HOM': 0, 'Phased HET': 0},
                    'DUP_INT': {'Total': 0, 'Phased HOM': 0, 'Phased HET': 0}}

    # Context managers close every handle, so the output VCF is flushed even
    # when processing fails part-way through.
    with VariantFile(sv_vcf) as vcf_in, \
            VariantFile(sv_filtered_phased_vcf, 'w', header=vcf_in.header) as vcf_out, \
            AlignmentFile(long_reads_bam) as bam_in:
        prev_chrom = ''
        for rec in vcf_in.fetch():
            sv_chrom = rec.chrom
            if sv_chrom not in chromosomes:
                continue
            if sv_chrom != prev_chrom:
                logging.info('Processing {0}'.format(sv_chrom))
                prev_chrom = sv_chrom

            # An empty FILTER column yields no keys; skip it instead of
            # failing on the [0] lookup.
            filter_names = list(rec.filter.keys())
            if not filter_names or filter_names[0] != 'PASS':
                continue

            sv_pos = rec.pos
            sv_read_ids = rec.info['READS']
            sv_support = rec.info['SUPPORT']
            sv_type = rec.info['SVTYPE']
            # SV types missing from the table get their own row rather than
            # aborting the run with a KeyError.
            type_stat = phasing_stat.setdefault(
                sv_type, {'Total': 0, 'Phased HOM': 0, 'Phased HET': 0})
            type_stat['Total'] += 1

            begin_pos = sv_pos - 1
            end_pos = rec.info['END'] if 'END' in rec.info else sv_pos

            # Look 2 kb either side of the event; if that window is rejected
            # (e.g. a negative start) fall back to the exact interval.
            try:
                read_iterator = bam_in.fetch(sv_chrom, begin_pos - 2000, end_pos + 2000)
            except ValueError:
                read_iterator = bam_in.fetch(sv_chrom, begin_pos, end_pos)

            hap1_counter = 0
            hap2_counter = 0
            for read in read_iterator:
                if read.query_name in sv_read_ids and read.has_tag('HP'):
                    read_hp = read.get_tag('HP')
                    hap1_counter += read_hp == 1
                    hap2_counter += read_hp == 2

            call = _classify_phasing(hap1_counter, hap2_counter, sv_support)
            if call is not None:
                genotype, stat_key = call
                rec.samples[0]['GT'] = genotype
                rec.samples[0].phased = True
                type_stat[stat_key] += 1

            # NOTE(review): the source indentation was lost; the write is
            # read as applying to every PASS record on an included
            # chromosome, phased or not - confirm.
            vcf_out.write(rec)

    with open(os.path.join(workdir, 'phasing_stat.txt'), 'w') as phasing_stat_f:
        phasing_stat_f.write('\tTotal\tPhased HOM\tPhased HET\n')
        for sv in phasing_stat:
            phasing_stat_f.write('{0}:\t{1}\t{2}\t{3}\n'.format(
                sv, phasing_stat[sv]['Total'], phasing_stat[sv]['Phased HOM'],
                phasing_stat[sv]['Phased HET']))
def _find_coverage_fields(coverage_file, contig, position):
    """
    Find the coverage row for one site

    :param coverage_file: Whitespace-delimited file, contig in column 1 and position in column 2
    :type coverage_file: String
    :param contig: Contig name to match exactly
    :type contig: String
    :param position: Position to match exactly against column 2
    :type position: Int
    :return: Columns of the first matching row, or None when no row matches
    :rtype: List
    """
    wanted_position = str(position)
    with open(coverage_file) as handle:
        for row in handle:
            fields = row.split()
            if len(fields) >= 2 and fields[0] == contig and fields[1] == wanted_position:
                return fields
    return None


def _genotypes_match(sample_gt, giab_gt):
    """
    Compare a sample genotype with the reference genotype

    Accepts identical genotypes plus the swapped-order cases listed below.

    :param sample_gt: Genotype tuple of the sample
    :param giab_gt: Genotype tuple of the reference sample
    :return: True when the genotypes are considered the same call
    :rtype: Bool
    """
    if sample_gt == giab_gt:
        return True
    # Swapped order, when the sample's first allele is missing or 1.
    if ((sample_gt[0] is None or sample_gt[0] == 1)
            and sample_gt[0] == giab_gt[1] and sample_gt[1] == giab_gt[0]):
        return True
    # 0/1 in the sample against 1/0 in the reference.
    if sample_gt[0] == 0 and sample_gt[1] == 1 and giab_gt[0] == 1 and giab_gt[1] == 0:
        return True
    # 1/0 in the sample against 0/1 in the reference.
    return (sample_gt[0] == 1 and sample_gt[1] == 0
            and giab_gt[0] == 0 and giab_gt[1] == 1)


def check_genotype(folder, sample, ref_sample, coverage_file):
    """
    Compares the genotype for all shared variants

    Reads <folder>/0002.vcf (reference calls) and <folder>/0003.vcf (sample calls).
    Matching genotypes are counted; every mismatch is reported with basic variant
    info plus quality, both genotypes, VCF depth and, for single-base changes,
    the total/ref/alt coverage found in the coverage file.

    :param folder: Folder containing 0002.vcf and 0003.vcf
    :type folder: String
    :param sample: Sample name used in 0003.vcf
    :type sample: String
    :param ref_sample: Sample name used in 0002.vcf
    :type ref_sample: String
    :param coverage_file: File containing coverage information for each position in the panel
    :type coverage_file: String
    :return: Number of matching variants, and a list of variant dictionaries with
             detailed information for mismatching genotypes
    :rtype: Tuple (Int, List)
    """
    # Column index of each base count in a coverage row.
    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}

    # Reference genotype per (contig, position, alleles); the first record wins.
    vars_giab = {}
    with VariantFile(folder + '/0002.vcf') as shared_giab:
        for rec in shared_giab.fetch():
            key = (rec.contig, rec.pos, rec.alleles)
            if key not in vars_giab:
                vars_giab[key] = rec.samples[ref_sample]['GT']

    variants = []
    matching = 0
    with VariantFile(folder + '/0003.vcf') as shared_patient:
        for rec in shared_patient.fetch():
            chrom = rec.contig
            pos = rec.pos
            alleles = rec.alleles
            call = rec.samples[sample]
            sample_gt = call['GT']
            giab_genotype = vars_giab[(chrom, pos, alleles)]

            if _genotypes_match(sample_gt, giab_genotype):
                matching += 1
                continue

            # Depth is only needed for the mismatch report.
            call_keys = call.keys()
            allelic_depth = call['AD'] if 'AD' in call_keys else 'N/A'
            if 'DP' in call_keys:
                total_depth = call['DP']
            elif 'NR' in call_keys:
                total_depth = call['NR']
            else:
                total_depth = 0

            ref, alt = alleles[0], alleles[1]
            if len(ref) == 1 and len(alt) == 1:
                # The coverage file is searched for position - 1. Reading it
                # here replaces a shell `grep`, whose exit status 1 on "no
                # match" ended the program before the no-coverage case below
                # could be reported.
                try:
                    fields = _find_coverage_fields(coverage_file, chrom, pos - 1)
                except (IOError, OSError) as e:
                    # An unreadable coverage file still ends the run, as before.
                    print('Error reading coverage file: ' + str(e))
                    raise SystemExit(1)
                if fields is None:
                    coverage = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
                else:
                    # One-character alleles that are not A/C/G/T (e.g. '*')
                    # have no count column.
                    ref_col = bases.get(ref)
                    alt_col = bases.get(alt)
                    coverage = {'total': fields[2],
                                'ref': fields[ref_col] if ref_col is not None else 'N/A',
                                'alt': fields[alt_col] if alt_col is not None else 'N/A'}
            else:
                coverage = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}

            variants.append({'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt,
                             'QUAL': rec.qual,
                             'GT': {"sample": sample_gt, 'GIAB': giab_genotype},
                             'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                             'coverage': coverage})

    print(str(matching) + ' matching variants')
    return matching, variants
worksheetIntron.write('A11', 'Regions: ') row = 11 col = 0 for gene in intronDict: worksheetIntron.write_row('B'+str(row), [gene]+intronDict[gene]) row += 1 row += 1 worksheetIntron.write('A'+str(row), 'Coverage below '+str(medCov)+'x', italicFormat) row += 1 tableheading = ['RunID', 'DNAnr', 'Gene', 'Chr', 'Pos', 'Ref', 'Alt', 'AF', 'DP', 'Transcript', 'Mutation cds', 'ENSP', 'Consequence', 'Max popAF', 'Max Pop', 'Callers'] worksheetIntron.write_row('A'+str(row), tableheading, tableHeadFormat) # 1 index for snv in vcf_snv.fetch(): if "PopAF" not in snv.filter.keys(): if snv.contig in introns: for pair in introns[snv.contig]: if snv.pos >= pair[0] and snv.pos <= pair[1] and snv.info["AF"][0] >= 0.2: # import pdb; pdb.set_trace() csq = snv.info["CSQ"][0] gene = csq.split("|")[3] transcript = csq.split("|")[10].split(":")[0] if len(csq.split("|")[10].split(":")) > 1: codingName = csq.split("|")[10].split(":")[1] else: codingName = '' ensp = csq.split("|")[11] consequence = csq.split("|")[1] popFreqsPop = ['AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomAD_AF', 'gnomAD_AFR_AF',