def main():
    global options, args

    # Open and parse each line of the vcf file
    input_vcf = vcf.Reader(open(options.input_vcf, 'r'))

    # If a FREQ/SFREQ/SDP field already exists in FORMAT or INFO, store it so
    # it can be restored for parsing after the new header definitions are set
    former_vcfformat_freq = input_vcf.formats['FREQ'] if 'FREQ' in input_vcf.formats else None
    former_vcfinfo_sfreq = input_vcf.infos['SFREQ'] if 'SFREQ' in input_vcf.infos else None
    former_vcfinfo_sdp = input_vcf.infos['SDP'] if 'SDP' in input_vcf.infos else None

    input_vcf.formats['FREQ'] = VcfFormat('FREQ', None, 'String',
                                          'Variant allele frequency')
    input_vcf.infos['SFREQ'] = VcfInfo(
        'SFREQ', 1, 'Float', 'Maximum variant allele frequency of all samples')
    input_vcf.infos['SDP'] = VcfInfo(
        'SDP', 1, 'Integer', 'Maximum sequencing depth of all samples')

    output_vcf = vcf.Writer(open(options.output_vcf, 'w'), input_vcf,
                            lineterminator='\n')

    if former_vcfformat_freq is not None:
        input_vcf.formats['FREQ'] = former_vcfformat_freq
    if former_vcfinfo_sfreq is not None:
        input_vcf.infos['SFREQ'] = former_vcfinfo_sfreq
    if former_vcfinfo_sdp is not None:
        input_vcf.infos['SDP'] = former_vcfinfo_sdp

    for record in input_vcf:
        if 'FREQ' not in record.FORMAT.split(':'):
            record.add_format('FREQ')
        # Default values for added INFO fields
        site_freq = None
        site_depth = 0
        # Iterate over all call objects of the record
        for call in record.samples:
            # Allele frequency and depth evaluation among samples
            try:
                site_freq = max(site_freq, max(call.aaf)) if call.aaf is not None else site_freq
                site_depth = max(call.depth, site_depth) if call.depth is not None else site_depth
            except Exception:
                print "ERROR: unforeseen exception when normalizing record:", record
                raise
            call.add_format('FREQ', norm_freq(call.aaf))
        # TODO: unfortunately GATK filtering doesn't yet deal correctly with "None" (.) values
        if site_freq is None or site_freq == '.':
            site_freq = 0
        record.add_info('SFREQ', site_freq)
        record.add_info('SDP', site_depth)
        output_vcf.write_record(record)
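# All of the snippets below share one PyVCF pattern: register the new INFO or
# FORMAT header line on the Reader *before* constructing the Writer (which
# copies the header), then call add_info()/add_format() per record. A minimal
# sketch of just that pattern, assuming PyVCF 0.6.x and hypothetical file
# names ('in.vcf', 'out.vcf', tag 'MYTAG'):
import vcf
from vcf.parser import _Info as VcfInfo

reader = vcf.Reader(open('in.vcf', 'r'))           # hypothetical input path
reader.infos['MYTAG'] = VcfInfo('MYTAG', 1, 'String', 'Example annotation',
                                source=None, version=None)
writer = vcf.Writer(open('out.vcf', 'w'), reader)  # header now includes MYTAG
for record in reader:
    record.add_info('MYTAG', 'value')              # placeholder value
    writer.write_record(record)
writer.close()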
def annotation_vcf(parsed_args, process_num):
    records, results = Queue(100 * process_num), Queue()
    input_finished = False
    output_finished = False
    wait_records = dict()
    processes = list()
    records_id = count()

    # Start the worker processes that consume variants and produce scores
    for i in range(process_num):
        p = Process(target=score_vcf,
                    args=(records, results, parsed_args.annotation))
        processes.append(p)
        p.start()

    vcf_reader = vcf.Reader(filename=parsed_args.file_in)
    vcf_reader.infos['dbscSNV'] = VcfInfo(
        'dbscSNV', vcf_field_counts['A'], 'String',
        'dbscSNV Score for VCF record alleles, Format: ALLELE|ada_score|rf_score',
        version=None, source=None)
    vcf_writer = vcf.Writer(open(parsed_args.file_out, 'w'), vcf_reader)

    while True:
        # Fill the input queue until it is full or the reader is exhausted
        while not records.full() and not input_finished:
            try:
                record = next(vcf_reader)
                record_id = next(records_id)
                wait_records[record_id] = record
                record_infos = list()
                chromosome = str(record.CHROM)
                pos = record.POS
                ref = record.REF
                for alt in record.ALT:
                    record_infos.append(
                        VariantRecord(chromosome, pos, ref, str(alt)))
                records.put((record_id, record_infos))
            except StopIteration:
                input_finished = True
                records.put('END')
                break
        # Once all workers have exited, signal the output loop to finish
        processes_status = list()
        for p in processes:
            processes_status.append(p.is_alive())
        if True not in processes_status:
            results.put('END')
        # Drain the results queue and write annotated records
        while True:
            try:
                result = results.get(False)
            except queue.Empty:
                break
            if result != 'END':
                record_id, record_score = result[0], result[1]
                record_write = wait_records.pop(record_id)
                record_write.add_info('dbscSNV', record_score)
                vcf_writer.write_record(record_write)
            else:
                output_finished = True
                break
        if output_finished:
            break
    vcf_writer.close()
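# The score_vcf worker used here (and by the SpliceAI variant of this function
# below) is not shown. A plausible minimal sketch of its queue protocol —
# consume (record_id, [VariantRecord, ...]) tuples until the 'END' sentinel,
# post (record_id, score_string) results — assuming a hypothetical
# lookup_score() helper that queries the annotation source and returns a
# 'ALLELE|ada_score|rf_score' string per allele:
def score_vcf(records, results, annotation):
    while True:
        item = records.get()
        if item == 'END':
            records.put('END')  # re-post the sentinel so sibling workers also stop
            break
        record_id, record_infos = item
        # One score string per ALT allele, comma-joined for the INFO field
        scores = ','.join(lookup_score(annotation, v) for v in record_infos)
        results.put((record_id, scores))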
def add_node_tag_randomly(self, tree, input_vcf, output_vcf, alpha,
                          add_info_tag='NODE', add_info_num=1,
                          add_info_type='String',
                          add_info_description='Nodes in a tree.',
                          add_info_source=None, add_info_version=None):
    weights = self.__sample_node_proportion(tree, alpha)
    original_reader = vcf.Reader(open(input_vcf, 'r'))
    original_reader.infos[add_info_tag] = VcfInfo(
        add_info_tag, add_info_num, add_info_type, add_info_description,
        add_info_source, add_info_version)
    writer = vcf.Writer(open(output_vcf, 'w'), original_reader,
                        lineterminator='\n')
    for record in original_reader:
        node = self.__sample_node(tree, weights)
        nodes = tree.sub_tree_nodes(at=node)
        nodes_string = '/'.join(map(str, nodes))
        record.add_info(add_info_tag, nodes_string)
        writer.write_record(record)
    writer.close()
def add_annotation(self):
    """
    <p>
    Read the input VCF file, add annotations to the #INFO column and
    write it back to the output VCF file.
    </p>
    """
    vcfReader = vcf.Reader(open(self.inputFile, 'r'))
    # How to add an info header:
    # <http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41>
    vcfReader.infos['TSSOL'] = VcfInfo(
        'TSSOL', vcf_field_counts['A'], 'String',
        'Info indicates whether the variant overlaps with the'
        ' transcription start site (TSS)')
    vcfReader.infos['CCURI'] = VcfInfo(
        'CCURI', vcf_field_counts['A'], 'String',
        'Info includes the URL of the cage cluster with which the'
        ' variant overlaps')
    vcfReader.infos['SAMPURI'] = VcfInfo(
        'SAMPURI', vcf_field_counts['A'], 'String',
        'Info includes the URL of the samples with which the'
        ' variant overlaps')
    vcfWriter = vcf.VCFWriter(open(self.outputFile, 'w'), vcfReader)
    cnt = 0
    cnt_block = 100
    t1 = time.time()
    #pool = Pool(self.n_parallel)
    #batch = list(itertools.islice(vcfReader, self.n_parallel))
    #res = pool.map(parallel_annotation_caller, zip([self]*len(batch), batch))
    for record in vcfReader:
        vcfWriter.write_record(self.get_annotation(record))
        if cnt % cnt_block == 1:
            t2 = time.time()
            ips = cnt_block / (t2 - t1)
            print "speed: %.2f iters/s = %d iters p/h = %.1f hours/million iters" % \
                (ips, ips * 3600, 1000000 / ips / 3600)
            t1 = time.time()
        cnt += 1
    vcfWriter.close()
def annotation_vcf(parsed_args, process_num):
    records, results = Queue(100 * process_num), Queue()
    input_finished = False
    output_finished = False
    wait_records = dict()
    processes = list()
    records_id = count()

    # Start the worker processes that consume variants and produce scores
    for i in range(process_num):
        p = Process(target=score_vcf,
                    args=(records, results, parsed_args.annotation))
        processes.append(p)
        p.start()

    vcf_reader = vcf.Reader(filename=parsed_args.file_in)
    vcf_reader.infos['SpliceAI'] = VcfInfo(
        'SpliceAI', vcf_field_counts['A'], 'String',
        'SpliceAIv1.3 variant annotation. These include delta scores (DS) and delta positions (DP) for '
        'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
        'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL',
        version=None, source=None)
    vcf_writer = vcf.Writer(open(parsed_args.file_out, 'w'), vcf_reader)

    while True:
        # Fill the input queue until it is full or the reader is exhausted
        while not records.full() and not input_finished:
            try:
                record = next(vcf_reader)
                record_id = next(records_id)
                wait_records[record_id] = record
                record_infos = list()
                chromosome = str(record.CHROM).replace('chr', '')
                pos = record.POS
                ref = record.REF
                for alt in record.ALT:
                    record_infos.append(
                        VariantRecord(chromosome, pos, ref, str(alt)))
                records.put((record_id, record_infos))
            except StopIteration:
                input_finished = True
                records.put('END')
                break
        # Once all workers have exited, signal the output loop to finish
        processes_status = list()
        for p in processes:
            processes_status.append(p.is_alive())
        if True not in processes_status:
            results.put('END')
        # Drain the results queue and write annotated records
        while True:
            try:
                result = results.get(False)
            except queue.Empty:
                break
            if result != 'END':
                record_id, record_score = result[0], result[1]
                record_write = wait_records.pop(record_id)
                record_write.add_info('SpliceAI', record_score)
                vcf_writer.write_record(record_write)
            else:
                output_finished = True
                break
        if output_finished:
            break
    vcf_writer.close()
def annotate_vcf(in_vcf, out_vcf, bw, ann_name):
    # Add annotation field to the header
    in_vcf.infos[ann_name] = VcfInfo(ann_name, vcf_field_counts['A'], 'Float',
                                     'Replication Value', source="douglas",
                                     version="none")
    # Annotate, read by read
    for i, record in enumerate(in_vcf):
        chrom = "chr" + record.CHROM
        pos = record.POS
        # Get the replication value at this base from the bigWig
        val = query_position(chrom, pos, bw)
        record.add_info(ann_name, val)
        out_vcf.write_record(record)
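# query_position is not shown above; only its call site is. A minimal sketch
# of what it could look like with pyBigWig — an assumption, including the
# rounding and the hypothetical 'replication_timing.bw' file name:
import math
import pyBigWig

def query_position(chrom, pos, bw):
    # bw.values() takes 0-based, half-open coordinates; VCF POS is 1-based
    val = bw.values(chrom, pos - 1, pos)[0]
    return None if math.isnan(val) else round(val, 4)

# Usage: bw = pyBigWig.open('replication_timing.bw')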
def main():
    parser, (options, args) = _get_args()
    if len(args) != 3:
        parser.error("Missing arguments!")
    pp2file = args[0]
    input_vcf = args[1]
    output_vcf = args[2]

    engine = create_engine('sqlite:///{}'.format(pp2file))
    conn = engine.connect()

    annotation_identifier = "PP2"
    reader = vcf.Reader(open(input_vcf, 'r'))
    reader.infos[annotation_identifier] = VcfInfo(
        annotation_identifier, 1, 'String',
        ("PolyPhen2 annotations in the following order: "
         "Gene name; "
         "UniProt id; "
         "Amino acid change; "
         "HVar effect category; "
         "Strength of var effect (probability); "
         "HDiv effect category; "
         "Strength of div effect (probability)"),
        'PolyPhen2', 'PGV001')
    writer = vcf.Writer(open(output_vcf, 'w'), reader, lineterminator='\n')

    for v in reader:
        res = annotate_variant(
            conn,
            v.CHROM if v.CHROM.startswith('chr') else 'chr{}'.format(v.CHROM),
            v.POS, v.REF, v.ALT[0])
        if res is None:
            annotation = ['.', '.', '.', '.', '.', '.', '.']
        else:
            annotation = [
                res.gene, res.protein, res.aa_change, res.hvar_pred,
                res.hvar_prob, res.hdiv_pred, res.hdiv_prob
            ]
        v.add_info(annotation_identifier, annotation)
        writer.write_record(v)
def main(argv):
    # Read and parse arguments
    infile = ''
    outfile = ''
    do_batch_lookups = True
    input_error = False
    parser = argparse.ArgumentParser(description='Simple VCF Client application')
    parser.add_argument('-i', help='Input VCF file', type=str,
                        metavar='Input File', required=True)
    parser.add_argument('-o', help='Output VCF file', type=str,
                        metavar='Output File', required=True)
    parser.add_argument('-k', help='Your key to the API', type=str,
                        metavar='API Key', required=False)
    parser.add_argument('-g', help='Reference genome either 1019 (default) or 1038',
                        type=int, metavar='Reference Genome', required=False,
                        default=1019)
    parser.add_argument('-nb', help="Do not do batch requests",
                        action='store_true')
    args = parser.parse_args()
    infile = args.i
    outfile = args.o
    api_key = args.k
    ref_genome = args.g if args.g is not None else _ref_genome
    do_batch_lookups = not args.nb

    # Open and load the VCF file into a vcf reader object
    print("Reading input file ", infile)
    vcf_reader = vcf.Reader(filename=infile, encoding='utf8')
    # Add a new GENE info field in the metadata description, so that we may
    # store such data for each record
    vcf_reader.infos['GENE'] = VcfInfo('GENE', ".", 'String',
                                       'Concatenated list of GENE symbols',
                                       "", "")
    # Prepare output for writing
    print("Opening output file ", outfile)
    vcf_writer = vcf.Writer(open(outfile, 'w'), vcf_reader, lineterminator='\n')

    # Declare an array of Variant_lookup_data objects to hold data for
    # executing the lookups and processing their outcome
    variant_lookup_data_array = []
    # A counter for the total number of rows processed thus far
    total_counter = 0

    # Initialize client connection to the API
    api = VariantAPIClient(api_key)
    if api is None:
        print("Failed to connect to API")
        sys.exit()

    print("Start parsing input file")
    # Iterate through all the records read from the input VCF file
    while True:
        try:
            # Get the next record (corresponds to a data row in the file)
            vcf_record = next(vcf_reader)
            # A vcf_record (i.e. row in the VCF file) may correspond to more
            # than one variant if it contains more than one ALT value. We
            # generate a Variant_lookup_data record for each variant and add
            # them to the variant_lookup_data_array.
            # Note: a reference to the same "vcf_record" object is stored in
            # each Variant_lookup_data record; however, the "alt" field will
            # contain a different ALT value.
            variant_lookup_data_from_vcf_record(vcf_record,
                                                variant_lookup_data_array)
        except StopIteration:
            # Reached the end of the input VCF file, no new vcf_record was read
            vcf_record = None

        # If we are performing batch lookups...
        if do_batch_lookups:
            # Check if we have reached (or slightly crossed) the limit of
            # variants we want for the batch request, or the end of the input
            # file. Note: in this implementation we may cross the limit if the
            # last vcf_record read contained more than one variant. This is OK.
            if len(variant_lookup_data_array) >= _batch_limit or vcf_record is None:
                # Extract variant strings from the array
                variant_string_array = [
                    vld.variant_string for vld in variant_lookup_data_array
                ]
                # Execute the batch lookup request
                batch_data = api.batch_lookup(variant_string_array,
                                              ref_genome=ref_genome,
                                              params={'add-all-data': 1})
                # Process the response, variant by variant
                batch_counter = 0
                for data in batch_data:
                    process_single_variant_response_data(
                        variant_lookup_data_array[batch_counter], data,
                        vcf_writer)
                    batch_counter += 1
                # Clear the array
                del variant_lookup_data_array[:]
            if vcf_record is None:
                # Reached the end of the file, finish
                break
        # If we are performing individual lookups for each variant (not
        # recommended for performance reasons), execute the lookup and
        # process the outcome
        else:
            if vcf_record is not None:
                # Execute lookup requests for each element in the array
                for vld in variant_lookup_data_array:
                    data = api.lookup(vld.variant_string, ref_genome=ref_genome)
                    process_single_variant_response_data(vld, data, vcf_writer)
                # Clear the array
                del variant_lookup_data_array[:]
            else:
                # Reached the end of the file, finish
                break
        total_counter += 1
        if total_counter % 1000 == 0:
            print("Read ", total_counter, " rows")
    print("Finished reading ", total_counter, " rows, exiting")
#!/usr/bin/env python
import sys
import vcf
from vcf.parser import _Info as VcfInfo

vcf_reader = vcf.Reader(sys.stdin)
vcf_reader.infos['ALT_idx'] = VcfInfo('ALT_idx', 'A', 'String',
                                      'index for the alternative alleles')
vcf_reader.infos['ALT_pos'] = VcfInfo(
    'ALT_pos', 1, 'String', 'original position for the multiallele complex')
vcf_reader.infos['ALT_num'] = VcfInfo(
    'ALT_num', 1, 'Integer',
    'number of alternative alleles for the multiallele complex')
vcf_reader.infos['ALTs_orig'] = VcfInfo('ALTs_orig', 1, 'String',
                                        'all original ALTs')
vcf_reader.infos['REF_orig'] = VcfInfo('REF_orig', 1, 'String', 'original REF')
vcf_reader.infos['ACs_orig'] = VcfInfo('ACs_orig', 1, 'String',
                                       'all original ACs')
writer = vcf.Writer(sys.stdout, vcf_reader, lineterminator='\n')

# INFO fields that might be multi-allelic
for Record in vcf_reader:
    # When encountering a multiallele, annotate the alt sequence
    if len(Record.ALT) > 1:
        ALT_idx = ','.join([str(x) for x in range(1, len(Record.ALT) + 1)])
        ALT_pos = str(Record.CHROM) + ':' + str(Record.POS)
###################################################################
# Override the snpEff EFF fields in both header and info, split the
# fields and add them in INFO for easier conversion to a table
###################################################################
import sys
import re
import vcf
from vcf.parser import _Info as VcfInfo

if __name__ == '__main__':
    vcf_reader = vcf.Reader(sys.stdin)
    vcf_reader.infos['Allele'] = VcfInfo('Allele', 1, 'String',
                                         'variant allele')
    vcf_reader.infos['ENSG'] = VcfInfo('ENSG', 1, 'String', 'Ensembl Gene ID')
    vcf_reader.infos['Feature'] = VcfInfo('Feature', 1, 'String',
                                          'Ensembl Transcript')
    vcf_reader.infos['Feature_type'] = VcfInfo('Feature_type', 1, 'String',
                                               'Feature type')
    vcf_reader.infos['Consequence'] = VcfInfo('Consequence', 1, 'String',
                                              'Functional effect')
    vcf_reader.infos['cDNA_position'] = VcfInfo('cDNA_position', 1, 'String',
                                                'cDNA position')
    vcf_reader.infos['CDS_position'] = VcfInfo('CDS_position', 1, 'String',
                                               'CDS position')
    vcf_reader.infos['Protein_position'] = VcfInfo('Protein_position', 1,
                                                   'String',
                                                   'Protein position')
    vcf_reader.infos['Amino_acids'] = VcfInfo('Amino_acids', 1, 'String',
help="Fasta file for the genomic reference") parser.add_argument('--length', '-l', type=int, default=25, help="length of sequence on the 3 prime side") parser.add_argument('--thresh_times', '-t', type=int, default=5, help="times of repeat above which STR is called") args = parser.parse_args() vcf_reader = vcf.Reader(sys.stdin) vcf_reader.infos['Primer3'] = VcfInfo('Primer3', 1, 'String', '3 prime side sequence') vcf_reader.infos['Primer5'] = VcfInfo('Primer5', 1, 'String', '5 prime side sequence') vcf_reader.infos['STR'] = VcfInfo('STR', 0, 'Flag', 'whether there is STR') vcf_reader.infos['STR_RU'] = VcfInfo('STR_RU', 1, 'String', 'repeating unit of STR') vcf_reader.infos['STR_times'] = VcfInfo('STR_times', 1, 'Integer', 'time of repeats for STR') vcf_reader.infos['STR_match'] = VcfInfo( 'STR_match', 0, 'Flag', 'whether the ALT sequence change from REF matches multiples of STR_RU') writer = vcf.Writer(sys.stdout, vcf_reader, lineterminator='\n') for Record in vcf_reader:
def run_std_filter(args):
    vcf_out = os.path.basename(args.inputVcf)
    vcf_out = os.path.splitext(vcf_out)[0]
    txt_out = os.path.basename(args.inputTxt)
    txt_out = os.path.splitext(txt_out)[0]
    if args.outdir:
        vcf_out = os.path.join(args.outdir, vcf_out)
        txt_out = os.path.join(args.outdir, txt_out)
    vcf_out = vcf_out + '_STDfilter.vcf'
    txt_out = txt_out + '_STDfilter.txt'

    vcf_reader = vcf.Reader(open(args.inputVcf, 'r'))
    vcf_reader.infos['FAILURE_REASON'] = VcfInfo(
        'FAILURE_REASON', '.', 'String',
        'Failure Reason from MuTect text File', 'muTect', 'v1.1.5')
    vcf_reader.infos['set'] = VcfInfo(
        'set', '.', 'String', 'The variant callers that reported this event',
        'mskcc/basicfiltering', 'v0.2.1')
    vcf_reader.formats['DP'] = VcfFormat('DP', '1', 'Integer',
                                         'Total read depth at this site')
    vcf_reader.formats['AD'] = VcfFormat(
        'AD', 'R', 'Integer',
        'Allelic depths for the ref and alt alleles in the order listed')

    allsamples = list(vcf_reader.samples)
    if len(allsamples) != 2:
        logger.critical(
            "The VCF does not have two genotype columns. Please input a proper vcf with Tumor/Normal columns"
        )
        sys.exit(1)

    # If the caller reported the normal genotype column before the tumor, swap those around
    if_swap_sample = False
    if allsamples[1] == args.tsampleName:
        if_swap_sample = True
        vcf_reader.samples[0] = allsamples[1]
        vcf_reader.samples[1] = allsamples[0]

    # Dictionary to store records to keep
    keepDict = {}

    # Filter each row (mutation)
    txtDF = pd.read_table(args.inputTxt, skiprows=1, dtype=str)
    txt_fh = open(txt_out, "wb")
    for index, row in txtDF.iterrows():
        chr = row.loc['contig']
        pos = row.loc['position']
        ref_allele = row.loc['ref_allele']
        alt_allele = row.loc['alt_allele']
        trd = int(row.loc['t_ref_count'])
        tad = int(row.loc['t_alt_count'])

        ##############################
        # Tumor Variant Calculations #
        ##############################
        # Total depth
        # Todo: Does this include indels? Soft clipping?
        tdp = trd + tad
        # Variant fraction
        if tdp != 0:
            tvf = int(tad) / float(tdp)
        else:
            tvf = 0

        ###############################
        # Normal Variant Calculations #
        ###############################
        nrd = int(row.loc['n_ref_count'])
        nad = int(row.loc['n_alt_count'])
        # Total depth
        ndp = nrd + nad
        # Variant fraction
        if ndp != 0:
            nvf = int(nad) / float(ndp)
        else:
            nvf = 0

        # Get REJECT or PASS
        judgement = row.loc['judgement']
        failure_reason = row.loc['failure_reasons']

        # nvfRF is one of the thresholds that the tumor variant fraction must
        # exceed in order to pass filtering.
        #
        # This threshold is equal to the normal variant fraction, multiplied by
        # the number of times greater we must see the mutation in the tumor (args.tnr):
        nvfRF = int(args.tnr) * nvf

        # This will help in filtering the VCF
        key_for_tracking = str(chr) + ':' + str(pos) + ':' + str(
            ref_allele) + ':' + str(alt_allele)

        if judgement != 'KEEP':
            # Check the failure reasons to determine if we should still consider this variant
            failure_tags = failure_reason.split(',')
            tag_count = 0
            for tag in failure_tags:
                if tag in ACCEPTED_TAGS:
                    tag_count += 1
            # All failure_reasons should be found in accepted tags to continue
            if tag_count != len(failure_tags):
                continue
        else:
            failure_reason = 'KEEP'

        if tvf > nvfRF:
            if (tdp >= int(args.dp)) & (tad >= int(args.ad)) & (tvf >= float(args.vf)):
                if key_for_tracking in keepDict:
                    print('MutectStdFilter: There is a repeat ', key_for_tracking)
                else:
                    keepDict[key_for_tracking] = failure_reason
                out_line = str.encode(args.tsampleName + "\t" + str(chr) +
                                      "\t" + str(pos) + "\t" +
                                      str(ref_allele) + "\t" +
                                      str(alt_allele) + "\t" +
                                      str(failure_reason) + "\n")
                txt_fh.write(out_line)
    txt_fh.close()

    # This section uses keepDict to write all passed mutations to the new VCF file
    vcf_writer = vcf.Writer(open(vcf_out, 'w'), vcf_reader)
    for record in vcf_reader:
        key_for_tracking = str(record.CHROM) + ':' + str(
            record.POS) + ':' + str(record.REF) + ':' + str(record.ALT[0])
        if key_for_tracking in keepDict:
            failure_reason = keepDict.get(key_for_tracking)
            # There was no failure reason for calls that had "KEEP" in their
            # judgement column, but this code uses "KEEP" as the key when they
            # are encountered
            if failure_reason == 'KEEP':
                failure_reason = 'None'
            record.add_info('FAILURE_REASON', failure_reason)
            record.add_info('set', 'MuTect')
            if if_swap_sample:
                nrm = record.samples[0]
                tum = record.samples[1]
                record.samples[0] = tum
                record.samples[1] = nrm
            if record.FILTER == 'PASS':
                vcf_writer.write_record(record)
            # Change the failure reason to PASS, for mutations for which we
            # want to override MuTect's assessment
            else:
                record.FILTER = 'PASS'
                vcf_writer.write_record(record)
        else:
            continue
    vcf_writer.close()

    # Normalize the events in the VCF, produce a bgzipped VCF, then tabix index it
    norm_gz_vcf = cmo_util.normalize_vcf(vcf_out, args.refFasta)
    cmo_util.tabix_file(norm_gz_vcf)
    return norm_gz_vcf
def serial_annotate(opts, trans_provided_no_acc):
    # Serial annotation, writing VCF-format output
    am, hp, hn3, hn5, hdp = annotator(opts.annotation)
    chrome_dic = generate_chrome_dic(opts.annotation)
    vcf_reader = vcf.Reader(filename=opts.file_in)
    vcf_reader.infos['HGVS'] = VcfInfo('HGVS', vcf_field_counts['A'], 'String',
                                       'VCF record alleles in HGVS syntax',
                                       version=None, source=None)
    vcf_reader.infos['HGVS_Normalise'] = VcfInfo(
        'HGVS_Normalise', vcf_field_counts['A'], 'String',
        'VCF record alleles in HGVS syntax (Normalised)',
        version=None, source=None)
    vcf_writer = vcf.Writer(open(opts.file_out, 'w'), vcf_reader)
    for record in vcf_reader:
        chrome = str(record.CHROM)
        start = record.affected_start
        stop = record.affected_end
        record_hgvs_list = list()
        record_hgvs_normalise_list = list()
        for alt in record.ALT:
            hgvs_list = list()
            hgvs_normalise_list = list()
            if record.is_snp:
                var_type = 'snv'
                ref = record.REF
                call = str(alt)
            else:
                if len(record.REF) == 1 and len(str(alt)) > 1:
                    var_type = 'ins'
                    ref = '.'
                    call = str(alt)[1:]
                elif len(record.REF) > 1 and len(str(alt)) == 1:
                    var_type = 'del'
                    ref = record.REF[1:]
                    call = '.'
                else:
                    var_type = 'delins'
                    if record.REF[0] == str(alt)[0]:
                        ref = record.REF[1:]
                        call = str(alt)[1:]
                    else:
                        ref = record.REF
                        call = str(alt)
                        start = record.affected_start - 1
            record_parser = VariantRecord(chrome, start, stop, ref, call,
                                          var_type)
            g = generate_g(record_parser, chrome_dic)
            try:
                g_parser = hp.parse_hgvs_variant(g)
                g_normalise_3 = hn3.normalize(g_parser)
                g_normalise_5 = hn5.normalize(g_parser)
                trans_related = am.relevant_transcripts(g_parser)
            except (HGVSParseError, HGVSError, HGVSUsageError) as e:
                error = str(e)
                logging.error(
                    '{chrome} {start} {stop} {ref} {call} {g} annotate error. {error}.'
                    .format(**locals()))
                record_hgvs_list.append('.|.|.')
                record_hgvs_normalise_list.append('.|.|.')
                continue
            trans = select_trans(trans_related, trans_provided_no_acc, opts.how)
            if len(trans) == 0:
                logging.warning(
                    '{chrome} {start} {stop} {ref} {call} {g} no related transcripts in UTA.'
                    .format(**locals()))
                record_hgvs_list.append(g + '|.|.')
                record_hgvs_normalise_list.append(str(g_normalise_3) + '|.|.')
                continue
            for tran in trans:
                try:
                    t = am.g_to_t(g_parser, tran)
                    strand = get_transcript_strand(opts, hdp, g, tran)
                    if strand == 3:
                        g_normalise = g_normalise_3
                    else:
                        g_normalise = g_normalise_5
                    t_normalise = am.g_to_t(g_normalise, tran)
                    p = am.t_to_p(t)
                    p_normalise = am.t_to_p(t_normalise)
                    hgvs_ = '|'.join([g, str(t), str(p)])
                    hgvs_normalise = '|'.join(
                        [str(g_normalise), str(t_normalise), str(p_normalise)])
                except (HGVSError, HGVSUsageError, NotImplementedError,
                        IndexError) as e:
                    error = str(e)
                    logging.error(
                        '{chrome} {start} {stop} {ref} {call} {tran} {g} annotate error. {error}.'
                        .format(**locals()))
                    hgvs_ = '|'.join([g, '.', '.'])
                    hgvs_normalise = '|'.join([str(g_normalise_3), '.', '.'])
                hgvs_list.append(hgvs_)
                hgvs_normalise_list.append(hgvs_normalise)
            hgvs_alt = '/'.join(hgvs_list)
            hgvs_normalise_alt = '/'.join(hgvs_normalise_list)
            record_hgvs_list.append(hgvs_alt)
            record_hgvs_normalise_list.append(hgvs_normalise_alt)
        record_hgvs = ','.join(record_hgvs_list)
        record_hgvs_normalise = ','.join(record_hgvs_normalise_list)
        record.add_info('HGVS', record_hgvs)
        record.add_info('HGVS_Normalise', record_hgvs_normalise)
        vcf_writer.write_record(record)
    vcf_writer.close()
def convert_grch38_ref_mismatch_sites_to_grch37(input_vcf_file, output_vcf_basename):
    """
    For the ACMG59 reportable range there are 4 sites that have a reference
    mismatch between GRCh37 and GRCh38. All refs and alts in variants
    overlapping these sites need to be updated to the 37 reference. The
    output file will contain variants overlapping mismatch sites plus all
    other variants with their original records.
    """
    logger = logging.getLogger(__name__)
    output_vcf_file = f'{output_vcf_basename}.vcf'
    reader = vcf.Reader(filename=input_vcf_file)
    records = list(reader)
    mismatched_site_overlap = {}
    for record in records:
        mismatched_site_key = find_overlapping_mismatch_site(record)
        if mismatched_site_key:
            mismatched_site_overlap[mismatched_site_key] = True
            try:
                update_grch38_ref_to_grch37_for_record_if_needed(
                    record, mismatched_site_key)
            except ValueError as e:
                logger.info(
                    f'Record {record.CHROM}:{record.POS} with mismatch site {mismatched_site_key} encountered error {e}'
                )
    reader.infos['PREPROCESSED'] = VcfInfo(
        'PREPROCESSED', 0, 'Flag',
        'The record was pre-processed. Added when a record needed to be changed for liftover',
        '', '',
    )
    # If there are no overlapping variants in mismatched sites, create a
    # homozygous variant matching 37 as ref and 38 as alt
    for key, site in MISMATCH_SITES.items():
        if key not in mismatched_site_overlap.keys():
            # TODO: separate out creation of a record
            mismatch_record = copy(record)
            mismatch_record.ID = '.'
            mismatch_record.QUAL = 100
            mismatch_record.FILTER = []
            mismatch_record.FORMAT = 'GT'
            mismatch_record.samples = []
            # Copy the objects within a record. Without an explicit copy it
            # would just be a pointer to the original record
            for sample in record.samples:
                mismatch_record.samples.append(copy(sample))
            mismatch_record.samples[0].data = calldata_spec('1/1')
            mismatch_record.INFO = {}
            mismatch_record.add_info('preprocessed')
            mismatch_record.CHROM = site['38_coordinates']['chrom']
            mismatch_record.POS = site['38_coordinates']['start']
            mismatch_record.REF = site['37_coordinates']['base']
            mismatch_record.ALT = [
                vcf.model._Substitution(site['38_coordinates']['base'])
            ]
            records.append(mismatch_record)

    contig_order = {c: i for i, c in enumerate(reader.contigs)}

    def sort_key(record):
        """
        Sorts records by (CHROM, POS, REF). If contigs are specified in the
        VCF file and the record CHROM matches a contig, contig order is
        maintained. Any unmatched CHROM will raise an error.
        """
        if record.CHROM not in contig_order:
            raise ValueError(
                f'Unexpected chrom {record.CHROM} found. Expected one of {contig_order.keys()}'
            )
        return (contig_order[record.CHROM], record.POS, record.REF)

    records.sort(key=sort_key)
    with open(output_vcf_file, 'w') as out_fp:
        writer = vcf.Writer(out_fp, reader, lineterminator='\n')
        for record in records:
            writer.write_record(record)
def main():
    global options, args

    # Make sure the input files are bgzipped and tabix indexed
    for vcf_file in options.input_vcf:
        #if not os.path.isfile(vcf_file + '.gz'):
        command_line = "bgzip -c " + vcf_file + " > " + vcf_file + ".gz"
        shlex.split(command_line)
        retcode = subprocess.check_output(command_line,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
        print retcode
        #if not os.path.isfile(vcf_file + '.gz.tbi'):
        command_line = "tabix -f -p vcf " + vcf_file + ".gz"
        shlex.split(command_line)
        retcode = subprocess.check_output(command_line,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
        print retcode

    # The first vcf file will be the template file for output parameters
    template_vcf = vcf.Reader(open(options.input_vcf[0], 'r'))

    # Add essential fields to both formats and infos (header information)
    template_vcf.formats['FREQ'] = VcfFormat('FREQ', 1, 'String',
                                             'Variant allele frequency')
    template_vcf.infos['SFREQ'] = VcfInfo(
        'SFREQ', 1, 'Float', 'Maximum variant allele frequency of all samples')
    template_vcf.infos['SDP'] = VcfInfo(
        'SDP', 1, 'Integer', 'Maximum sequencing depth of all samples')

    # Create a sorted list of variant sites, each holding chromosome and position
    variant_sites = []
    for vcf_file in options.input_vcf:
        tmp_vcf = vcf.Reader(open(vcf_file, 'r'))
        for record in tmp_vcf:
            new_variant_site = (get_chromosome_number(record.CHROM), record.POS)
            if new_variant_site not in variant_sites:
                variant_sites.append(new_variant_site)
    variant_sites.sort(key=lambda variant: (variant[0], variant[1]))

    # Open all files for random access
    input_vcf = []
    for index, vcf_file in enumerate(options.input_vcf):
        input_vcf.append(vcf.Reader(open(vcf_file + '.gz', 'r')))
        # Perform tests and checks
        if index > 0 and input_vcf[index].samples != template_vcf.samples:
            print "INFO: not same sample list in", vcf_file
        # Add necessary FORMAT or INFO field definitions to the template
        for info in input_vcf[index].infos:
            if info not in template_vcf.infos:
                template_vcf.infos[info] = input_vcf[index].infos[info]
        for myformat in input_vcf[index].formats:
            if myformat not in template_vcf.formats:
                template_vcf.formats[myformat] = input_vcf[index].formats[myformat]

    # Open output handles
    output_vcf = vcf.Writer(open(options.output_vcf, 'w'), template_vcf,
                            lineterminator='\n')
    output_indels_vcf = vcf.Writer(open(options.output_vcf + '_indels.vcf', 'w'),
                                   template_vcf, lineterminator='\n')
    output_snps_vcf = vcf.Writer(open(options.output_vcf + '_snps.vcf', 'w'),
                                 template_vcf, lineterminator='\n')

    # Now parse each variant site and fetch information from the vcfs
    for my_variant_site in variant_sites:
        records = []
        for my_vcf in input_vcf:
            try:
                for record in my_vcf.fetch(
                        chromosome_number2str(my_variant_site[0]),
                        my_variant_site[1], my_variant_site[1]):
                    # vcf.fetch also returns the next position if described,
                    # so it must be filtered out
                    if record.POS == my_variant_site[1]:
                        records.append(record)
            except KeyError:
                # Raised when the primary key is not found in one of the
                # files. No action required
                pass

        # master_records are the records to be output to the merged vcf. A
        # master record will be created for each group of variants from the
        # same variant site that can be merged
        master_records = [records[0]]
        for record in records[1:]:
            add_to_master = False
            already_added = False
            for master_record in master_records:
                if master_record != record:
                    add_to_master = True
                else:
                    if not master_record.merge(record):
                        add_to_master = True
                    else:
                        already_added = True
            if add_to_master and not already_added:
                master_records.append(record)

        for master_record in master_records:
            output_vcf.write_record(master_record)
            if master_record.is_snp:
                output_snps_vcf.write_record(master_record)
            elif master_record.is_indel:
                output_indels_vcf.write_record(master_record)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='vcfAnnoSTR')
    parser.add_argument('--ref', '-r', required=True,
                        help="Fasta file for the genomic reference")
    parser.add_argument('--anc_ref', '-a', required=True,
                        help="Fasta file for the ancestor genomic reference")
    args = parser.parse_args()

    vcf_reader = vcf.Reader(sys.stdin)
    vcf_reader.infos['ALT_Codon'] = VcfInfo('ALT_Codon', 1, 'String',
                                            'Alt codon')
    vcf_reader.infos['ANC_Codon'] = VcfInfo('ANC_Codon', 1, 'String',
                                            'Ancestor allele codon')
    writer = vcf.Writer(sys.stdout, vcf_reader, lineterminator='\n')

    for Record in vcf_reader:
        if Record.INFO.get('LoF_filter', False):
            if 'ANC_ALLELE' in Record.INFO['LoF_filter']:
                if Record.INFO['VARTYPE'] == 'SNP':
                    phase = int(Record.INFO['CDS_position'].split('|')[0]) % 3
                    strand = Record.INFO['STRAND'].split('|')[0]
                    # use samtools faidx to extract
    fields = line.split(' ')
    all_entries.append(fields)

# Initialize pfam_dict
pfam_dict = {}
for entry in all_entries:
    pfam_dict[entry[1]] = []
# Put each entry into its slot
for entry in all_entries:
    pfam_dict[entry[1]].append(
        (int(entry[4]), int(entry[5]), entry[3], entry[7]))

vcf_reader = vcf.Reader(sys.stdin)
vcf_reader.infos['AA_pos'] = VcfInfo('AA_pos', 1, 'Integer',
                                     'Amino acid change position')
vcf_reader.infos['Pfam'] = VcfInfo('Pfam', 0, 'Flag',
                                   'whether Gene is in pfam')
vcf_reader.infos['InDom'] = VcfInfo('InDom', 0, 'Flag',
                                    'whether in pfam domain')
vcf_reader.infos['PfamDom'] = VcfInfo('PfamDom', 1, 'String',
                                      'info of pfam domain')
vcf_reader.infos['AftDom'] = VcfInfo('AftDom', 0, 'Flag',
                                     'whether after pfam domain')
writer = vcf.Writer(sys.stdout, vcf_reader, lineterminator='\n')

numera = re.compile('[0-9]+')
for Record in vcf_reader:
    # If there is a coding change
    if Record.INFO.get('AAChange', False):
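# The domain check that follows is truncated. A minimal sketch of how the
# pfam_dict built above (gene -> [(dom_start, dom_end, dom_id, dom_name), ...])
# could support the Pfam/InDom/AftDom flags for an amino-acid position — an
# assumption, since only the header definitions are shown:
def classify_position(gene, aa_pos, pfam_dict):
    """Return 'no_pfam', 'in_domain', 'after_domain', or 'outside'."""
    if gene not in pfam_dict:
        return 'no_pfam'
    domains = pfam_dict[gene]
    if any(start <= aa_pos <= end for start, end, _, _ in domains):
        return 'in_domain'
    if any(aa_pos > end for _, end, _, _ in domains):
        return 'after_domain'
    return 'outside'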
def parallel_annotate(opts, trans_provided_no_acc, process_num):
    # Parallel annotation
    chrome_dic = generate_chrome_dic(opts.annotation)
    # Create the queues and initialize state
    records, results = Queue(100 * process_num), Queue()
    input_finished = False
    output_finished = False
    wait_records = dict()
    records_id = count()
    processes = list()
    # Start several worker processes that listen on the queue and annotate
    for i in range(process_num):
        p = Process(target=process_record,
                    args=(records, results, opts, trans_provided_no_acc))
        processes.append(p)
        p.start()
    # Read the vcf records and write a new vcf
    vcf_reader = vcf.Reader(filename=opts.file_in)
    vcf_reader.infos['HGVS'] = VcfInfo('HGVS', vcf_field_counts['A'], 'String',
                                       'VCF record alleles in HGVS syntax',
                                       version=None, source=None)
    vcf_reader.infos['HGVS_Normalise'] = VcfInfo(
        'HGVS_Normalise', vcf_field_counts['A'], 'String',
        'VCF record alleles in HGVS syntax (Normalised)',
        version=None, source=None)
    vcf_writer = vcf.Writer(open(opts.file_out, 'w'), vcf_reader)
    while True:
        while not records.full() and not input_finished:
            try:
                record = next(vcf_reader)
                chrome = str(record.CHROM)
                start = record.affected_start
                stop = record.affected_end
                record_id = next(records_id)
                wait_records[record_id] = record
                record_infos = list()
                for alt in record.ALT:
                    if record.is_snp:
                        var_type = 'snv'
                        ref = record.REF
                        call = str(alt)
                    else:
                        if len(record.REF) == 1 and len(str(alt)) > 1:
                            var_type = 'ins'
                            ref = '.'
                            call = str(alt)[1:]
                        elif len(record.REF) > 1 and len(str(alt)) == 1:
                            var_type = 'del'
                            ref = record.REF[1:]
                            call = '.'
                        else:
                            var_type = 'delins'
                            if record.REF[0] == str(alt)[0]:
                                ref = record.REF[1:]
                                call = str(alt)[1:]
                            else:
                                ref = record.REF
                                call = str(alt)
                                start = record.affected_start - 1
                    record_parser = VariantRecord(chrome, start, stop, ref,
                                                  call, var_type)
                    g = generate_g(record_parser, chrome_dic)
                    record_infos.append((record_parser, g))
                records.put((record_id, record_infos))
            except StopIteration:
                input_finished = True
                records.put('END')
                break
        processes_status = list()
        for p in processes:
            processes_status.append(p.is_alive())
        if True not in processes_status:
            results.put('END')
        while True:
            try:
                result = results.get(False)
            except queue.Empty:
                break
            if result != 'END':
                record_id, record_hgvs, record_hgvs_normalise = result[0], result[1], result[2]
                record_write = wait_records.pop(record_id)
                record_write.add_info('HGVS', record_hgvs)
                record_write.add_info('HGVS_Normalise', record_hgvs_normalise)
                vcf_writer.write_record(record_write)
            else:
                output_finished = True
                break
        if output_finished:
            break
    vcf_writer.close()
        info_item = info_item.strip()
        if not info_item.startswith('splice=') and info_item not in ['', '.']:
            new_INFO.append(info_item.strip())
    new_INFO.append(splice_predict.print_vcf(effect))
    new_line = '\t'.join(elems[:7] + [';'.join(new_INFO)] + elems[8:])
    return new_line

# Write the header
vcf_reader = vcf.Reader(vcf_input)
vcf_info_desc = 'Splice effect. Format: Transcript|Effect|MaxEntScan-wild|MaxEntScan-mut|MaxEntScan-closest|dist'
vcf_reader.infos['splice'] = VcfInfo(id='splice',
                                     num=1,
                                     type='String',
                                     desc=vcf_info_desc,
                                     source='spliceAnnotator',
                                     version=__version__)
vcf.Writer(vcf_output, vcf_reader)

# Distribute the annotation across a worker pool
pool = Pool(processes=args.np)
for line in vcf_input:
    if line.startswith('#'):
        pass
    else:
        pool.apply_async(process, args=(line,), callback=log_result)
pool.close()
pool.join()
flush(result_list)
help="Filepath to reference FASTA file") parser.add_argument("--in-vcf", required=True, help="Filepath to vcf file to be analyzed") parser.add_argument("--out-vcf", required=True, help="Filepath to vcf file to be output") args = parser.parse_args() ref_path = args.reference reference = Fasta(ref_path, sequence_always_upper=True, read_ahead=1000) in_vcf_path = args.in_vcf in_vcf_handle = open(in_vcf_path) in_vcf = vcf.Reader(in_vcf_handle) in_vcf.infos['HRUN'] = VcfInfo( 'HRUN', 1, 'Integer', 'Homopolymer length to the right of report indel position', "get_hrun", "1.0") out_vcf_path = args.out_vcf out_vcf_handle = open(out_vcf_path, 'w') out_vcf = vcf.Writer(out_vcf_handle, in_vcf) for record in in_vcf: chrom = record.CHROM pos = record.POS - 1 ref = record.REF calc_hrun = False for alt in record.ALT: if len(ref) != len(alt): calc_hrun = True if calc_hrun: window = 50 hrun = 1
###################################################################
# Override the snpEff EFF fields in both header and info, split the
# fields and add them in INFO for easier conversion to a table
###################################################################
import sys
import re
import vcf
from vcf.parser import _Info as VcfInfo

if __name__ == '__main__':
    vcf_reader = vcf.Reader(sys.stdin)
    vcf_reader.infos['AC'] = VcfInfo('AC', 1, 'Integer',
                                     'Allele count in genotypes')
    vcf_reader.infos['AF'] = VcfInfo('AF', 1, 'Float', 'Allele Frequency')
    vcf_reader.infos['MLEAC'] = VcfInfo(
        'MLEAC', 1, 'Integer',
        'Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC)'
    )
    vcf_reader.infos['MLEAF'] = VcfInfo(
        'MLEAF', 1, 'Float',
        'Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF)'
    )
    vcf_reader.infos['ALT_idx'] = VcfInfo('ALT_idx', 1, 'String',
                                          'index for the alternative alleles')
    vcf_reader.infos['length'] = VcfInfo(
        'length', 1, 'Integer', 'length(ALT) - length(REF) for each ALT')
    vcf_reader.infos['VARTYPE'] = VcfInfo('VARTYPE', 1, 'String',
                                          'variant types')
def addTSSInfo(self, vcfInputFile):
    vcf_reader = vcf.Reader(open(vcfInputFile, 'r'))
    vcf_reader.infos['TSSOL'] = VcfInfo(
        'TSSOL', vcf_field_counts['A'], 'String',
        'Info indicates whether the variant overlaps with the'
        ' transcription start site (TSS)')
    vcf_writer = vcf.VCFWriter(open('output.vcf', 'w'), vcf_reader)
    query = SPARQLQueries.sparqlQueries()
    totalVar = 0
    tssOLVar = 0
    lo = LiftOver('hg38ToHg19.over.chain.gz')
    for record in vcf_reader:
        variantStart = record.start
        variantEnd = record.end
        variantChromosome = record.CHROM
        variantSubType = record.var_subtype
        isOverlapping = False
        # Add the chr prefix to the chromosome
        if "chr" not in variantChromosome:
            variantChromosome = "chr" + str(record.CHROM)
        # Liftover from hg20 to hg19
        data = lo.convert_coordinate(variantChromosome, variantStart)
        #print variantChromosome
        print variantStart
        print variantEnd
        if data is not None:
            data2 = data.pop()
            variantChromosomehg19 = data2[0]
            variantStarthg19 = data2[1]
            data = lo.convert_coordinate(variantChromosome, variantEnd)
            data2 = data.pop()
            variantEndhg19 = data2[1]
            # SPARQL query
            result = query.getTSS('http://ep.dbcls.jp/fantom5/sparql',
                                  variantStarthg19, variantEndhg19,
                                  variantChromosomehg19)
            for row in result:
                values = sparql.unpack_row(row)
                cageStart = values[1]
                cageEnd = values[2]
                if (variantSubType == 'ins') & (variantStart > cageStart):
                    isOverlapping = True
                    tssOLVar = tssOLVar + 1
                    break
                elif (variantSubType != 'ins') & (cageStart > 0):
                    isOverlapping = True
                    tssOLVar = tssOLVar + 1
                    break
            totalVar = totalVar + 1
            record.add_info('TSSOL', [isOverlapping])
        else:
            print "No liftover found for this pos = " + record.ID
        vcf_writer.write_record(record)
    print "No of variants = " + str(totalVar)
    print "No of tss overlapping variants = " + str(tssOLVar)
#print(input)
# Ensure the bam and csv file match
#if input==bam:
#    print("working with vcf: "+input + " and bam: " +bam)
#else:
#    print("bam:"+bam +" does not match vcf: " +input)
#    sys.exit(1)

# Put a header on the true/false and exp csv files
in_var = vcf.Reader(open(sys.argv[1], 'r'))

## update infos ##
in_var.infos['MapQ'] = VcfInfo(
    id='MapQ', num=1, type='Float',
    desc="The average MapQ of the reads containing the called variant"
)  #, source=None, version=None)
in_var.infos['Read_pos'] = VcfInfo(
    id='Read_pos', num=1, type='Float',
    desc="The average read cycle that called the given variant"
)  #, source=None, version=None)
in_var.infos['Phred'] = VcfInfo(
    id='Phred', num=1, type='Float',
    desc="The average Phred score of the called variant"
)  #, source=None, version=None)
def run_std_filter(args):
    vcf_out = os.path.basename(args.inputVcf)
    vcf_out = os.path.splitext(vcf_out)[0]
    if args.outdir:
        vcf_out = os.path.join(args.outdir, vcf_out)
    txt_out = vcf_out + '_STDfilter.txt'
    vcf_out = vcf_out + '_STDfilter.vcf'

    vcf_reader = vcf.Reader(open(args.inputVcf, 'r'))
    vcf_reader.infos['set'] = VcfInfo(
        'set', '.', 'String', 'The variant callers that reported this event',
        'mskcc/basicfiltering', 'v0.2.1')
    vcf_reader.formats['DP'] = VcfFormat('DP', '1', 'Integer',
                                         'Total read depth at this site')
    vcf_reader.formats['AD'] = VcfFormat(
        'AD', 'R', 'Integer',
        'Allelic depths for the ref and alt alleles in the order listed')

    allsamples = list(vcf_reader.samples)
    if len(allsamples) != 2:
        if args.verbose:
            logger.critical(
                'The VCF does not have two genotype columns. Please input a proper vcf with Tumor/Normal columns'
            )
        sys.exit(1)

    # If the caller reported the normal genotype column before the tumor, swap those around
    if_swap_sample = False
    if allsamples[1] == args.tsampleName:
        if_swap_sample = True
        vcf_reader.samples[0] = allsamples[1]
        vcf_reader.samples[1] = allsamples[0]
    nsampleName = vcf_reader.samples[1]

    vcf_writer = vcf.Writer(open(vcf_out, 'w'), vcf_reader)
    txt_fh = open(txt_out, "wb")

    # Iterate through rows and filter mutations
    for record in vcf_reader:
        tcall = record.genotype(args.tsampleName)

        keep_based_on_status = True
        if "Somatic" not in record.INFO['STATUS'] and args.filter_germline:
            keep_based_on_status = False

        if tcall['QUAL'] is not None:
            tmq = int(tcall['QUAL'])
        else:
            tmq = 0
        if tcall['DP'] is not None:
            tdp = int(tcall['DP'])
        else:
            tdp = 0
        if tcall['VD'] is not None:
            tad = int(tcall['VD'])
        else:
            tad = 0
        if tdp != 0:
            tvf = int(tad) / float(tdp)
        else:
            tvf = 0

        ncall = record.genotype(nsampleName)
        if ncall:
            if ncall['QUAL'] is not None:
                nmq = int(ncall['QUAL'])
            else:
                nmq = 0
            if ncall['DP'] is not None:
                ndp = int(ncall['DP'])
            else:
                ndp = 0
            if ncall['VD'] is not None:
                nad = int(ncall['VD'])
            else:
                nad = 0
            if ndp != 0:
                nvf = nad / ndp
            else:
                nvf = 0
            nvfRF = int(args.tnr) * nvf
        else:
            logger.critical(
                "filter_vardict: There are no genotype values for Normal. We will exit."
            )
            sys.exit(1)

        record.add_info('set', 'VarDict')
        if if_swap_sample:
            nrm = record.samples[0]
            tum = record.samples[1]
            record.samples[0] = tum
            record.samples[1] = nrm

        if tvf > nvfRF:
            if keep_based_on_status & (tmq >= int(args.mq)) & \
                    (nmq >= int(args.mq)) & (tdp >= int(args.dp)) & \
                    (tad >= int(args.ad)) & (tvf >= float(args.vf)):
                vcf_writer.write_record(record)
                out_line = str.encode(args.tsampleName + "\t" + record.CHROM +
                                      "\t" + str(record.POS) + "\t" +
                                      str(record.REF) + "\t" +
                                      str(record.ALT[0]) + "\t" + "." + "\n")
                txt_fh.write(out_line)

    vcf_writer.close()
    txt_fh.close()

    # Normalize the events in the VCF, produce a bgzipped VCF, then tabix index it
    norm_gz_vcf = cmo_util.normalize_vcf(vcf_out, args.refFasta)
    cmo_util.tabix_file(norm_gz_vcf)
    return norm_gz_vcf
def RunStdFilter(args):
    vcf_out = os.path.basename(args.inputVcf)
    vcf_out = os.path.splitext(vcf_out)[0]
    txt_out = os.path.basename(args.inputTxt)
    txt_out = os.path.splitext(txt_out)[0]
    if args.outdir:
        vcf_out = os.path.join(args.outdir, vcf_out + "_STDfilter.vcf")
        txt_out = os.path.join(args.outdir, txt_out + "_STDfilter.txt")
    else:
        vcf_out = vcf_out + "_STDfilter.vcf"
        txt_out = txt_out + "_STDfilter.txt"

    vcf_reader = vcf.Reader(open(args.inputVcf, 'r'))
    vcf_reader.infos['FAILURE_REASON'] = VcfInfo(
        'FAILURE_REASON', '1', 'String',
        'Failure Reason from MuTect text File')
    vcf_writer = vcf.Writer(open(vcf_out, 'w'), vcf_reader)
    txtDF = pd.read_table(args.inputTxt, skiprows=1, low_memory=False)
    txt_fh = open(txt_out, "wb")

    allsamples = vcf_reader.samples
    sample1 = allsamples[0]
    sample2 = allsamples[1]
    if sample1 == args.tsampleName:
        nsampleName = sample2
    else:
        nsampleName = sample1

    # Dictionary to store records to keep
    keepDict = {}
    for index, row in txtDF.iterrows():
        chr = row.loc['contig']  # Get chromosome
        pos = row.loc['position']  # Get position
        ref_allele = row.loc['ref_allele']
        alt_allele = row.loc['alt_allele']
        trd = int(row.loc['t_ref_count'])
        tad = int(row.loc['t_alt_count'])
        tdp = trd + tad
        if tdp != 0:
            tvf = int(tad) / float(tdp)
        else:
            tvf = 0
        nrd = int(row.loc['n_ref_count'])
        nad = int(row.loc['n_alt_count'])
        ndp = nrd + nad
        if ndp != 0:
            nvf = int(nad) / float(ndp)
        else:
            nvf = 0
        judgement = row.loc['judgement']  # Get REJECT or PASS
        failure_reason = row.loc['failure_reasons']  # Get the reject reason
        nvfRF = int(args.tnr) * nvf
        if args.hotspotVcf:
            hotspotFlag = checkHotspot(args.hotspotVcf, chr, pos)
        else:
            hotspotFlag = False
        # This will help in filtering the VCF
        key_for_tracking = str(chr) + ":" + str(pos) + ":" + str(
            ref_allele) + ":" + str(alt_allele)
        if judgement == "KEEP":
            if key_for_tracking in keepDict:
                print("MutectStdFilter: There is a repeat ", key_for_tracking)
            else:
                keepDict[key_for_tracking] = judgement
            txt_fh.write(args.tsampleName + "\t" + str(chr) + "\t" +
                         str(pos) + "\t" + str(ref_allele) + "\t" +
                         str(alt_allele) + "\t" + str(judgement) + "\n")
        else:
            accepted_tags = [
                "alt_allele_in_normal", "nearby_gap_events",
                "triallelic_site", "possible_contamination",
                "clustered_read_position"
            ]
            failure_tags = failure_reason.split(",")
            tag_count = 0
            for tag in failure_tags:
                if tag in accepted_tags:
                    tag_count = tag_count + 1
                else:
                    continue
            if tag_count != len(failure_tags):
                continue
            if tvf > nvfRF:
                if (tdp >= int(args.dp)) & (tad >= int(args.ad)) & (tvf >= float(args.vf)):
                    if key_for_tracking in keepDict:
                        print("MutectStdFilter: There is a repeat ",
                              key_for_tracking)
                    else:
                        keepDict[key_for_tracking] = failure_reason
                    txt_fh.write(args.tsampleName + "\t" + str(chr) + "\t" +
                                 str(pos) + "\t" + str(ref_allele) + "\t" +
                                 str(alt_allele) + "\t" +
                                 str(failure_reason) + "\n")
            else:
                if hotspotFlag:
                    if (tdp >= int(args.dp)) & (tad >= int(args.ad)) & (tvf >= float(args.vf)):
                        if key_for_tracking in keepDict:
                            print("MutectStdFilter: There is a repeat ",
                                  key_for_tracking)
                        else:
                            keepDict[key_for_tracking] = failure_reason
                        txt_fh.write(args.tsampleName + "\t" + str(chr) +
                                     "\t" + str(pos) + "\t" +
                                     str(ref_allele) + "\t" +
                                     str(alt_allele) + "\t" +
                                     str(failure_reason) + "\n")
    txt_fh.close()

    for record in vcf_reader:
        key_for_tracking = str(record.CHROM) + ":" + str(
            record.POS) + ":" + str(record.REF) + ":" + str(record.ALT[0])
        if key_for_tracking in keepDict:
            failure_reason = keepDict.get(key_for_tracking)
            if failure_reason == "KEEP":
                failure_reason = "None"
            record.add_info('FAILURE_REASON', failure_reason)
            if record.FILTER == "PASS":
                vcf_writer.write_record(record)
            else:
                record.FILTER = "PASS"
                vcf_writer.write_record(record)
        else:
            continue
    vcf_writer.close()
    return vcf_out
###################################################################
# Override the snpEff EFF fields in both header and info, split the
# fields and add them in INFO for easier conversion to a table
###################################################################
import sys
import re
import vcf
from vcf.parser import _Info as VcfInfo

if __name__ == '__main__':
    vcf_reader = vcf.Reader(sys.stdin)
    vcf_reader.infos['EFF'] = VcfInfo('EFF', 1, 'String', 'Effect of mutation')
    vcf_reader.infos['Impact'] = VcfInfo('Impact', 1, 'String',
                                         'Likely impact of mutation')
    vcf_reader.infos['FunClass'] = VcfInfo('FunClass', 1, 'String', 'Class')
    vcf_reader.infos['CodonChange'] = VcfInfo('CodonChange', 1, 'String',
                                              'Nucleotide Change')
    vcf_reader.infos['AAChange'] = VcfInfo('AAChange', 1, 'String',
                                           'Protein Change')
    #vcf_reader.infos['AAChange.p'] = VcfInfo('AAChange.p', 1, 'String', 'Protein Change')
    #vcf_reader.infos['AAChange.c'] = VcfInfo('AAChange.c', 1, 'String', 'Protein Change')
    vcf_reader.infos['AALength'] = VcfInfo('AALength', 1, 'Integer',
                                           'Protein Length')
    vcf_reader.infos['Gene'] = VcfInfo('Gene', 1, 'String', 'Gene')
    vcf_reader.infos['BioType'] = VcfInfo('BioType', 1, 'String', 'BioType')
    vcf_reader.infos['Coding'] = VcfInfo('Coding', 1, 'String', 'Coding')
    vcf_reader.infos['Transcript'] = VcfInfo('Transcript', 1, 'String',