def test_add_flag():
    """Round-trip a Flag INFO field through ``Writer``.

    A flag set to ``True`` must read back as ``True``.  A flag set to
    ``False`` must be absent from the written record, so looking it up
    raises ``KeyError``.
    """
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({
        'ID': 'myflag',
        'Description': 'myflag',
        'Type': 'Flag',
        'Number': '0'
    })
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["myflag"] is True, dict(v.INFO)

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close the writer so the record is flushed before the file is read back.
    w.close()

    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
def filter_vcf(vcf, output, minlength=0, truncate_svlen=float("inf"), suffix=""):
    """Write the records of ``vcf`` whose SVLEN is at least ``minlength``.

    Records whose SVLEN exceeds ``truncate_svlen`` are still written, but
    with SVLEN set to 1, END set to ``start + 1`` and the TRUNCATED flag set.
    Counts of truncated and filtered records are reported on stderr.

    Parameters:
        vcf: path of the input VCF (a string; it is also used to derive the
            default output name).
        output: path of the output VCF.  When falsy, the input path with
            ``.vcf`` replaced by ``_<suffix>.vcf`` is used.
        minlength: records with a smaller SVLEN are dropped.
        truncate_svlen: records with a larger SVLEN are truncated.
        suffix: inserted into the default output name.
    """
    vcf_in = VCF(vcf)
    if not output:
        output = vcf.replace(".vcf", "_{}.vcf".format(suffix))
    vcf_in.add_info_to_header({
        'ID': 'TRUNCATED',
        'Description': "SVLEN truncated",
        'Type': 'Flag',
        'Number': '0'
    })
    vcf_out = Writer(output, vcf_in)
    records_truncated = 0
    records_filtered = 0
    try:
        for v in vcf_in:
            svlen = get_svlen(v)
            if svlen >= minlength:
                if svlen > truncate_svlen:
                    v.INFO['SVLEN'] = 1
                    v.INFO['END'] = v.start + 1
                    v.INFO['TRUNCATED'] = True
                    records_truncated += 1
                vcf_out.write_record(v)
            else:
                records_filtered += 1
    finally:
        # Always release both handles so the output is flushed to disk,
        # even when a record fails half-way through.
        vcf_out.close()
        vcf_in.close()
    if records_truncated != 0:
        sys.stderr.write("Truncated {} records where SVLEN > {}\n".format(
            records_truncated, int(truncate_svlen)))
    if records_filtered != 0:
        sys.stderr.write("Filtered {} records where SVLEN < {}\n".format(
            records_filtered, int(minlength)))
def filter_annotate_calls():
    """Command-line entry point: print the retained calls with a Confidence tag.

    Reads ``<calls>.vcf.gz``, ``<alignments>.bam``, ``<regions>.bed.gz`` and
    ``<parameters>.json`` (the extensions are appended to the argument
    values).  For every region of the BED file, each variant in that region
    is passed to ``retainCall_reportConfidence``; retained variants are
    printed to stdout after the VCF header.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--alignments', type=str, help='')
    parser.add_argument('--regions', type=str, help='')
    parser.add_argument('--calls', type=str, help='')
    parser.add_argument('--parameters', type=str, help='')
    args = parser.parse_args()

    import json
    # Use a context manager so the parameter file is closed deterministically.
    with open('{}.json'.format(args.parameters)) as parameters_file:
        parameters = json.load(parameters_file)

    vcf = VCF(args.calls + '.vcf.gz')
    vcf.add_info_to_header({
        'ID': 'Confidence',
        'Description': 'Measure of confidence in call based upon unitig structure',
        'Type': 'String',
        'Number': '1'
    })

    with pysam.AlignmentFile(args.alignments + '.bam', 'rb') as unitigs, gzip.open(
            args.regions + '.bed.gz', 'rt') as regions:
        print(vcf.raw_header, end='')
        for bed_line in regions:
            chromosome, start, end = bed_line.strip().split('\t')
            region = '{}:{}-{}'.format(chromosome, start, end)
            for variant in vcf(region):
                retain_call, call_confidence = retainCall_reportConfidence(
                    unitigs, variant, region, parameters)
                if retain_call:
                    print(annotate(variant, call_confidence), end='')

    vcf.close()
def cli(context, vcf, repeats_file, loglevel):
    """Annotate str variants with str status"""
    coloredlogs.install(level=loglevel)

    # Parse the repeat definitions first; without them there is nothing to do.
    with open(repeats_file, 'r') as repeats_handle:
        repeat_information = parse_repeat_file(repeats_handle)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    status_tag = 'STR_STATUS'
    status_header = {
        "ID": status_tag,
        "Number": 'A',
        "Type": "String",
        "Description": "Repeat expansion status. Alternatives in ['normal', 'pre_mutation', 'full_mutation']"
    }

    vcf_obj = VCF(vcf)
    vcf_obj.add_info_to_header(status_header)
    print_headers(vcf_obj)

    # NOTE(review): indentation was lost in this file; every variant is
    # assumed to be echoed, annotated or not - confirm.
    for variant in vcf_obj:
        status = get_repeat_info(variant, repeat_information)
        if status:
            variant.INFO[status_tag] = status
        click.echo(str(variant).rstrip())
def annotate_allelic_balance(vcffile, region):
    """Stream ``region`` of ``vcffile`` to stdout with allele-balance INFO tags.

    Biallelic sites get the values returned by ``compute_allelic_balances``
    applied through ``update_variant``.  All sites are written.  A summary
    of how many sites were annotated is sent to ``log``.
    """
    vcf = VCF(vcffile)

    # (ID, Description) of the two Float INFO fields declared on the header.
    balance_fields = (
        ('HetAB', 'heterozygous genotype allele balance'),
        ('HetHomAltAB',
         'heterozygous + homozygous ALT genotype allele balance'),
    )
    for field_id, description in balance_fields:
        vcf.add_info_to_header({
            'ID': field_id,
            'Description': description,
            'Type': 'Float',
            'Number': '1'
        })

    out = Writer('-', vcf)
    total_sites = 0
    noted_sites = 0
    for site in vcf(region):
        total_sites += 1
        if is_biallelic(site):
            noted_sites += 1
            hetab, het_hom_alt_ab = compute_allelic_balances(site)
            site = update_variant(site, hetab, het_hom_alt_ab)
        out.write_record(site)
    out.close()

    log("Annotated {} out of a possible {} sites".format(
        noted_sites, total_sites))
def main():
    """Write the records whose longest ALT differs from REF by more than 49 bp.

    Each written record gets SVLEN (longest ALT length minus REF length) and
    SVTYPE ("INS" when SVLEN is positive, otherwise "DEL").
    """
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_in.add_info_to_header({
        'ID': 'SVLEN',
        'Description': 'length of sv',
        'Type': 'Integer',
        'Number': '1'
    })
    vcf_in.add_info_to_header({
        'ID': 'SVTYPE',
        'Description': 'type of sv - just DEL or INS based on SVLEN',
        'Type': 'String',
        'Number': '1'
    })
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        # A record without any ALT allele has no length difference to
        # measure; max() over an empty sequence would raise ValueError.
        if not v.ALT:
            continue
        svlen = max(len(alt) for alt in v.ALT) - len(v.REF)
        # NOTE(review): indentation was lost in this file; the write is
        # assumed to belong to the size check (only SV-sized records are
        # written) - confirm.
        if abs(svlen) > 49:
            v.INFO["SVLEN"] = svlen
            v.INFO["SVTYPE"] = "INS" if svlen > 0 else "DEL"
            vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close()
def merge(in_vcf, cadd_tsv):
    """Write ``in_vcf`` to stdout with the CADD annotations from ``cadd_tsv``.

    Every record goes through ``update_variant``.  The keys it returns are
    collected and finally checked against the keys of the CADD annotation
    dictionary by ``ensure_cadd_completed_successfully``.
    """
    info_headers = annotation_info_headers()

    log("Collecting the CADD annotation information")
    cadd_annotations = create_CADD_annotation_dictionary(cadd_tsv)

    log("Processing the build37 vcf")
    vcf = VCF(in_vcf)
    for header in info_headers:
        vcf.add_info_to_header(header)

    out = Writer('-', vcf)
    seen_keys = set()
    for record in vcf:
        updated, key = update_variant(record, cadd_annotations)
        seen_keys.add(key)
        out.write_record(updated)
    out.close()

    log("Checking whether CADD completed correctly")
    cadd_keys = frozenset(list(cadd_annotations.keys()))
    ensure_cadd_completed_successfully(in_vcf, cadd_tsv, seen_keys, cadd_keys)
    log("All Done!")
def test_add_info_to_header():
    """Round-trip a newly declared INFO field through ``Writer``.

    The value written for the new tag must be read back unchanged from the
    written file (bytes are decoded before comparing).
    """
    reader = VCF(VCF_PATH)
    reader.add_info_to_header({
        'ID': 'abcdefg',
        'Description': 'abcdefg',
        'Type': 'Character',
        'Number': '1'
    })
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, reader)
    rec = next(reader)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    written = next(VCF(f))
    ret = written.INFO["abcdefg"]
    if isinstance(ret, bytes):
        ret = ret.decode()
    assert ret == "XXX", (dict(written.INFO), written.INFO["abcdefg"])
def mark_missing_sites(vcffile, region, missing_threshold, soft_filter):
    """Stream ``region`` of ``vcffile`` to stdout, marking site missingness.

    Each site is passed through ``update_variant`` together with the verdict
    of ``variant_missing_criteria``.  Passing sites are always written;
    failing sites are written only when ``soft_filter`` is truthy.  A
    summary line is printed to stderr.
    """
    vcf = VCF(vcffile)
    missing_filter = {
        'ID': 'MISSING',
        'Description':
        'failed variant site missingness threshold ({} %)'.format(
            missing_threshold)
    }
    missing_pct_info = {
        'ID': 'MISSINGPCT',
        'Description': 'site missingness percentage',
        'Type': 'Float',
        'Number': '1'
    }
    vcf.add_filter_to_header(missing_filter)
    vcf.add_info_to_header(missing_pct_info)

    out = Writer('-', vcf)
    total_sites = 0
    noted_sites = 0
    for site in vcf(region):
        total_sites += 1
        # Only the percentage is needed; the raw counts are ignored.
        missing_pct, _missing, _total = compute_missingness(site)
        verdict = variant_missing_criteria(missing_threshold, missing_pct)
        site = update_variant(site, verdict, missing_pct)

        passed = verdict == "pass"
        if passed:
            noted_sites += 1
        if passed or (verdict == "fail" and soft_filter):
            out.write_record(site)
    out.close()

    summary = "After filtering, passed {} out of a possible {} Sites ({})"
    print(summary.format(noted_sites, total_sites, 'pass'), file=sys.stderr)
def test_add_flag():
    """Round-trip a Flag INFO field through ``Writer``.

    A flag set to ``True`` must read back as ``True``.  A flag set to
    ``False`` must be absent from the written record, so looking it up
    raises ``KeyError``.
    """
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({'ID': 'myflag', 'Description': 'myflag',
                            'Type': 'Flag', 'Number': '0'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    # next() works on Python 2 and 3; the .next() method is Python 2 only.
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    # A present flag reads back as True, matching the other test_add_flag
    # in this file.
    assert v.INFO["myflag"] is True, dict(v.INFO)

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close the writer so the record is flushed before the file is read back.
    w.close()

    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
def processVCF(invcf, remm, dann, out):
    """Annotate every record of ``invcf`` with ReMM and DANN and write ``out``.

    Parameters:
        invcf: path of the input VCF.
        remm: path of a tabix-indexed ReMM file; column 2 is compared with
            the record position and column 3 is stored as ``ReMM``.
        dann: path of a tabix-indexed DANN file; columns 2-4 are compared
            with POS/REF/first ALT and column 5, rounded to three decimals,
            is stored as ``DANN``.
        out: path of the output VCF.

    A record with no matching row, or whose lookup raises ``ValueError``,
    gets ``"."`` for that tag.
    """
    vcf_data = VCF(invcf, gts012=True)
    tbx_remm = pysam.TabixFile(remm)
    tbx_dann = pysam.TabixFile(dann)
    # The record loop writes INFO["ReMM"], so that tag must be declared as
    # well, unless the input header already carries it.
    if not vcf_data.contains('ReMM'):
        vcf_data.add_info_to_header({
            'ID': 'ReMM',
            'Description': 'ReMM score at the variant position.',
            'Type': 'String',
            'Number': '.'
        })
    vcf_data.add_info_to_header({
        'ID': 'DANN',
        'Description': 'A deep neural network aimed to recognize pathogenic variants by annotating genetic variants, especially in noncoding regions.',
        'Type': 'String',
        'Number': '.'
    })
    w = Writer(out, vcf_data)
    try:
        for record in vcf_data:
            try:
                for row in tbx_remm.fetch(record.CHROM, record.start,
                                          record.end):
                    fields = str(row).split()
                    if int(fields[1]) == record.POS:
                        record.INFO["ReMM"] = fields[2]
                # .get() instead of []: a record for which no row matched
                # has no ReMM key, and indexing would raise KeyError.
                if not record.INFO.get("ReMM"):
                    record.INFO["ReMM"] = "."
            except ValueError:
                record.INFO["ReMM"] = "."

            dann_score = "."
            first_alt = record.ALT[0] if record.ALT else None
            try:
                for row in tbx_dann.fetch(record.CHROM, record.start,
                                          record.end):
                    fields = row.split()
                    if (int(fields[1]) == record.POS
                            and fields[2] == record.REF
                            and fields[3] == first_alt):
                        dann_score = round(float(fields[4]), 3)
                        break
            except ValueError:
                dann_score = "."
            record.INFO["DANN"] = dann_score

            w.write_record(record)
    finally:
        # Release every handle so the output is flushed even on failure.
        w.close()
        tbx_remm.close()
        tbx_dann.close()
        vcf_data.close()
def setUp(self):
    """Load ``test.vcf`` from this file's directory and build the fixture.

    Creates ``self.test_filter`` and stores every variant of the test VCF
    in ``self.variants``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    vcf_path = os.path.join(here, "test.vcf")
    reader = VCF(vcf_path)

    self.test_filter = refilter.Filter(
        0.3, 0.7, 'AB', 'VAR_DP', 5, ['MISSING'], ['DB'])

    # Register the filter's FILTER and INFO header lines on the reader
    # before any record is read.
    reader.add_filter_to_header(self.test_filter.filtered_header())
    reader.add_info_to_header(self.test_filter.rescued_header())

    self.variants = list(reader)
def main():
    """Aggregate the variants of several VCFs, keyed by CHROM_POS_REF_ALT.

    The first occurrence of each variant is kept in a dictionary.  When the
    kept record has no ``variant_callers`` INFO value yet, it is set to the
    caller reported by ``which_variantcaller``; otherwise the ``blaha`` tag
    is set to ``"TEST"`` (debugging placeholder).  Every processed variant
    is printed.

    Raises:
        NameError: when a multi-allelic site is encountered.
    """
    opt = parse_arguments()
    aggregated_variants = {}
    for vcf_fn in opt.vcfs:
        vcf_reader = VCF(vcf_fn)
        vcf_reader.add_info_to_header({
            'ID': 'blaha',
            'Description': 'aList of variant callers which detected the variant',
            'Type': 'Character',
            'Number': '1'
        })
        vcf_reader.add_info_to_header({
            'ID': 'variant_callers',
            'Description': 'List of variant callers which detected the variant',
            'Type': 'Character',
            'Number': '1'
        })
        variant_caller = which_variantcaller(vcf_reader)
        for var in vcf_reader:
            # Check if multi-allelic site
            if len(var.ALT) > 1:
                raise NameError('Split and normalize you variants!')

            # Save variant in aggregated_variants if it hasn't been found before
            var_id = str(var.CHROM) + "_" + str(var.POS) + "_" + str(
                var.REF) + "_" + str(var.ALT[0])
            if not aggregated_variants.get(var_id):
                aggregated_variants[var_id] = var

            # Add variant caller information to an INFO field
            vcs = ""
            if not aggregated_variants[var_id].INFO.get("variant_callers"):
                vcs = variant_caller
                aggregated_variants[var_id].INFO["variant_callers"] = vcs
            else:
                print("INTHERE")
                vcs = "TEST"
                aggregated_variants[var_id].INFO["blaha"] = vcs
            # NOTE(review): indentation was lost in this file; the debug
            # print is assumed to run once per variant - confirm.
            # The call form works on Python 2 and 3 (the bare ``print var``
            # statement is a syntax error on Python 3).
            print(var)
def main(min_allele_balance, max_allele_balance, allele_balance_tag,
         variant_sample_depth_tag, min_depth, exclude_filters, exclude_fields,
         vcf):
    """Re-filter every record of ``vcf`` and write the result to stdout.

    A ``Filter`` is built from the threshold and tag arguments, its FILTER
    and INFO header lines are registered on the reader, and each variant is
    passed through the filter before being written.
    """
    reader = VCF(vcf)
    refilter = Filter(min_allele_balance, max_allele_balance,
                      allele_balance_tag, variant_sample_depth_tag, min_depth,
                      exclude_filters, exclude_fields)
    reader.add_filter_to_header(refilter.filtered_header())
    reader.add_info_to_header(refilter.rescued_header())
    writer = Writer('-', reader)
    try:
        for variant in reader:
            refilter(variant)  # Modifies variant filter status in place
            writer.write_record(variant)
    finally:
        # Close explicitly so the output is flushed, as the other writers
        # in this file do.
        writer.close()
        reader.close()
def add_info_headers(vcf: VCF):
    """Declare the ``LOCI_ID`` and ``LOCI_POS_ID`` INFO fields on ``vcf``."""
    loci_header = {
        "ID": LOCI_ID,
        "Number": "1",
        "Type": "String",
        "Description": "CHROM (loci) in the original VCF",
    }
    loci_pos_header = {
        "ID": LOCI_POS_ID,
        "Number": "1",
        "Type": "Integer",
        "Description": f"{LOCI_ID} (see other INFO header) POS in the original VCF",
    }
    for header in (loci_header, loci_pos_header):
        vcf.add_info_to_header(header)
def test_add_info_to_header():
    """Round-trip a newly declared INFO field through ``Writer``.

    The value written for the new tag must be read back unchanged from the
    written file (bytes are decoded before comparing).
    """
    reader = VCF(VCF_PATH)
    reader.add_info_to_header({'ID': 'abcdefg', 'Description': 'abcdefg',
                               'Type': 'Character', 'Number': '1'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, reader)
    # next() works on Python 2 and 3; the .next() method is Python 2 only.
    rec = next(reader)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    written = next(VCF(f))
    ret = written.INFO["abcdefg"]
    # The value may come back as bytes; normalise before comparing, as the
    # other test_add_info_to_header in this file does.
    if isinstance(ret, bytes):
        ret = ret.decode()
    assert ret == "XXX", dict(written.INFO)
def process_vcf(vcf):
    """Print ``vcf`` to stdout with the SpliceAI-derived INFO fields.

    The value of the ``SpliceAI`` INFO tag (or, when that is empty, of
    ``SpliceAI_ind``) is handed to ``set_new_fields``; records with neither
    tag are printed unchanged.
    """
    vcf_data = VCF(vcf, gts012=True)

    # (ID, Description) of the String INFO fields declared on the header.
    spliceai_fields = (
        ('Gene_SpliceAI', 'Gene for which spliceAI gave the prediction.'),
        ('DS_AG', 'SpliceAI score for an acceptor gain.'),
        ('DS_AL', 'SpliceAI score for an acceptor lost.'),
        ('DS_DG', 'SpliceAI score for a donor gain.'),
        ('DS_DL', 'SpliceAI score for a donor lost.'),
    )
    for field_id, description in spliceai_fields:
        vcf_data.add_info_to_header({
            'ID': field_id,
            'Description': description,
            'Type': 'String',
            'Number': '.'
        })

    print(vcf_data.raw_header.rstrip())

    for record in vcf_data:
        # SNV predictions take precedence over indel predictions.
        prediction = (record.INFO.get('SpliceAI')
                      or record.INFO.get('SpliceAI_ind'))
        if prediction:
            record = set_new_fields(record, prediction)
        print(str(record).rstrip())

    vcf_data.close()
def processVariants(self):
    """Write the VCF at ``self.vcfFilePath`` to ``self.outputVCF`` with ACMG tags.

    For each record, the ClinVar entries matching any of its ALT alleles
    are collected and a ``variant.Variant`` is built.  Records whose
    ``printVariant`` is falsy are skipped; the others get the
    ``Evidence_Codes`` and ``Posterior_Pathogenic_Probability`` INFO values
    and are written after the header.
    """
    cyVCF = VCF(self.vcfFilePath)
    self.families.setSampleIdxs(cyVCF.samples)
    # Parsed from the header as it is before the new INFO lines are added.
    csq_list = self.getCSQList(cyVCF.raw_header)
    cyVCF.add_info_to_header({
        "ID": "Evidence_Codes",
        "Number": "1",
        "Type": "String",
        "Description": "All ACMG evidence codes that apply to this variant"
    })
    cyVCF.add_info_to_header({
        "ID": "Posterior_Pathogenic_Probability",
        "Number": "1",
        "Type": "String",
        "Description": "Posterior Pathogenic Probability"
    })
    self.outputVCF.write(cyVCF.raw_header)
    for v in cyVCF:
        matchingClinVarVariants = []
        for alt in v.ALT:
            key = "%s:%s:%s:%s" % (v.CHROM, v.POS, v.REF, alt)
            if key in self.clinVarData:
                matchingClinVarVariants.append(self.clinVarData[key])
        var = variant.Variant(v, self.families, self.gnomAD_AF_Threshold,
                              self.REVEL_Threshold, csq_list,
                              matchingClinVarVariants)
        if not var.printVariant:
            continue
        # Compute the posterior once and reuse it (it used to be computed,
        # discarded and then computed a second time).
        posterior = self.getPosterior(var)
        v.INFO["Evidence_Codes"] = var.getEvidenceCodesString()
        v.INFO["Posterior_Pathogenic_Probability"] = format(posterior, '.3f')
        self.outputVCF.write(str(v))
def parse_header_vcf(vcf_file, vep_field=None, vep_separator=None):
    """Open ``vcf_file``, declare the extra INFO fields and index the VEP field.

    Parameters:
        vcf_file: path of the VCF to open.
        vep_field: ID of the header line whose Description lists the VEP
            sub-fields.  When falsy, no index is built.
        vep_separator: separator used to split that Description.

    Returns:
        ``(vcf, index_dict)`` where ``index_dict`` maps every element of
        the split Description to its first position in it.
    """
    vcf = VCF(vcf_file)
    vcf.add_info_to_header({
        'ID': 'True_Label',
        'Description': 'Pathogenic/Benign labelled variant',
        'Type': 'Integer',
        'Number': '1'
    })
    vcf.add_info_to_header({
        'ID': 'Source',
        'Description': 'File source',
        'Type': 'String',
        'Number': '1'
    })
    vcf.add_info_to_header({
        'ID': 'SF',
        'Description': '',
        'Type': 'String',
        'Number': '1'
    })
    index_dict = dict()
    if vep_field:
        for h in vcf.header_iter():
            try:
                info = h.info()
                if info['ID'] != vep_field:
                    continue
                csq_header = info['Description'].split(vep_separator)
            except Exception:
                # Best effort, as before: header lines that cannot be
                # inspected (e.g. no ID or Description) are skipped.
                # ``Exception`` instead of a bare ``except`` lets
                # KeyboardInterrupt and SystemExit propagate.
                continue
            # Record the first position of each element in a single pass
            # (this is what repeated list.index() calls returned).
            first_positions = {}
            for position, elem in enumerate(csq_header):
                first_positions.setdefault(elem, position)
            index_dict.update(first_positions)
    return vcf, index_dict
def write_new_info_fields(vcf: VCF):
    """Declare the ``LOCI_ID``, ``START_ID`` and ``END_ID`` INFO fields on ``vcf``."""
    # (ID, Description, Type) of each field; all of them have Number "1".
    field_specs = (
        (LOCI_ID, "name of overlapping loci", "String"),
        (START_ID, "Loci start position; 1-based inclusive", "Integer"),
        (END_ID, "Loci end position; 1-based inclusive", "Integer"),
    )
    for field_id, description, field_type in field_specs:
        vcf.add_info_to_header(
            {
                "ID": field_id,
                "Description": description,
                "Type": field_type,
                "Number": "1",
            }
        )
def unliftover_vcf(b38_vcf, b37_vcf, annotation_type, auto_fill, update_id):
    """Write ``b38_vcf`` to stdout with annotations taken from ``b37_vcf``.

    The INFO headers and annotation fields for ``annotation_type`` are
    looked up, the build 37 annotations are collected into a dictionary,
    and every build 38 record is passed through ``update_variant`` before
    being written.
    """
    info_headers = annotation_type_headers(annotation_type)
    annotation_fields = annotation_type_info_fields(annotation_type)

    log("Collecting the build 37 vcf annotation information")
    b37_annotations = create_b37_annotation_dictionary(
        b37_vcf, annotation_type, update_id)

    log("Processing the build38 vcf")
    vcf = VCF(b38_vcf)
    for header in info_headers:
        vcf.add_info_to_header(header)

    out = Writer('-', vcf)
    for record in vcf:
        updated = update_variant(record, b37_annotations, annotation_fields,
                                 auto_fill, update_id)
        out.write_record(updated)
    out.close()

    log("All Done!")
key: line[key] for key in args.fields.split(',') } else: tsv_fields = line.copy() new_tags = tsv_fields.keys() annotations[var_id] = tsv_fields for tag in new_tags: vcf.add_info_to_header({ 'ID': tag, 'Description': 'Annotation from' + args.annotate, 'Type': 'String', 'Number': '1' }) w = Writer(args.output, vcf) for v in vcf: var_id = "_".join([v.CHROM, v.end, v.REF, ','.join(v.ALT)]) if var_id in annotations.keys(): for tag, value in annotations[var_id].items(): v.INFO[tag] = value if args.fields: out_info_fields = args.fields.split(',') for key, value in v.INFO:
parser.add_argument('--output_vcf', type=str, default=None, help='Output VCF file') return parser.parse_args() if __name__ == '__main__': args = parse_args() # Reading the VCF file and adding 2 more attributes into INFO header data_vcf = VCF(args.vcffile) data_vcf.add_info_to_header({ 'ID': 'ps_filter', 'Description': 'Mask/Caution', 'Type': 'String', 'Number': '1' }) data_vcf.add_info_to_header({ 'ID': 'ps_exc', 'Description': 'Reasons for mask/caution', 'Type': 'String', 'Number': '1' }) # create a new vcf Writer using the input vcf as a template. fname = args.output_vcf w = Writer(fname, data_vcf) prob_vcf_columns = [ 'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'
def read_vcf(vcf):
    """Open ``vcf`` and declare ``VARIANTICS_HIST_VCF_HEADER`` on its header.

    Returns the opened ``VCF`` reader.
    """
    reader = VCF(vcf)
    reader.add_info_to_header(VARIANTICS_HIST_VCF_HEADER)
    return reader
help='Server address(default http://127.0.0.1:8080)') parser.add_argument('--vcf', dest='vcf_file_name', required=True, help='Name of VCF to annotate') args = parser.parse_args() vcf = VCF(args.vcf_file_name) s = cmd2web.Client.connect(args.host) vcf.add_info_to_header({ 'ID': 'STIX_NONZERO', 'Description': 'The number of samples in cohort with non-zero evidence', 'Type': 'Integer', 'Number': '1' }) print(str(vcf.raw_header), end='', flush=True) for v in vcf: chrom = v.CHROM start = v.POS end = v.INFO.get('END') svtype = v.INFO.get('SVTYPE') cipos = v.INFO.get('CIPOS') ciend = v.INFO.get('CIEND') if None in [chrom, start, end, svtype]: continue
# Write the template VCF text to the job output directory and open it, so
# that header lines can be added to it below.
TEMPLATE_VCF_FILE = joboutdir / "template.vcf"
TEMPLATE_VCF_FILE.write_text(TEMPLATE_VCF)
vcf = VCF(TEMPLATE_VCF_FILE)
# Add source
vcf.add_to_header(f"##source=biopipen.ns.bed.Bed2Vcf")
# Add genome assembly (only when one was given)
if genome:
    vcf.add_to_header(f"##reference={genome}")
# Declare the END INFO field
vcf.add_info_to_header(
    {
        "ID": "END",
        "Number": "1",
        "Type": "Integer",
        "Description": "End position of the variant described in this record"
    }
)
# Declare the GT FORMAT field
vcf.add_format_to_header(
    {
        "ID": "GT",
        "Number": "1",
        "Type": "String",
        "Description": "Genotype",
    }
)
# Add contigs
contigs = set()
def extend_vcf_annotations(query_vcf, pcgr_db_directory, pcgr_predispose):
    """
    Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations, e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
    4. Variant effect predictions

    The annotated records are written to '<query_vcf stem>.annotated.vcf'. When that file exists and is
    non-empty it is compressed with bgzip, indexed with tabix and passed to write_pass_vcf(); otherwise
    pcgrutils.pcgr_error_message() is called.

    Parameters:
        query_vcf: path of the input VCF ('.vcf' or '.vcf.gz')
        pcgr_db_directory: directory that holds the 'pcgr_infotags*.tsv' tag definition files
        pcgr_predispose: when True, 'pcgr_infotags_predisposition.tsv' is used instead of 'pcgr_infotags.tsv'

    Records without a CSQ tag are logged and skipped (they are not written).
    NOTE(review): 'logger' is not a parameter here - presumably a module-level logger; confirm.
    NOTE(review): indentation was lost in this file; the nesting below is reconstructed - confirm.
    """

    ## read VEP and PCGR tags to be appended to VCF file
    pcgr_vcf_infotags_meta = pcgrutils.read_infotag_file(
        os.path.join(pcgr_db_directory, 'pcgr_infotags.tsv'))
    if pcgr_predispose is True:
        pcgr_vcf_infotags_meta = pcgrutils.read_infotag_file(
            os.path.join(pcgr_db_directory, 'pcgr_infotags_predisposition.tsv'))
    ## output file name: trailing '.vcf' / '.vcf.gz' replaced by '.annotated.vcf'
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    ## VEP allele frequency sub-field names -> the tag names used for them here
    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG'
    }

    vcf = VCF(query_vcf)
    vep_csq_index2fields = {}  ## position in a CSQ entry -> tag name
    vep_csq_fields2index = {}  ## tag name -> position in a CSQ entry
    dbnsfp_prediction_algorithms = []
    effect_predictions_description = ""

    ## parse the 'Format: a|b|c' part of the CSQ and DBNSFP header descriptions
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys():
            identifier = str(header_element['ID'])
            if identifier == 'CSQ' or identifier == 'DBNSFP':
                description = str(header_element['Description'])
                if 'Format: ' in description:
                    subtags = description.split('Format: ')[1].split('|')
                    if identifier == 'CSQ':
                        i = 0
                        for t in subtags:
                            v = t
                            if t in vep_to_pcgr_af:
                                v = str(vep_to_pcgr_af[t])
                            ## only sub-fields that have a tag definition are indexed
                            if v in pcgr_vcf_infotags_meta:
                                vep_csq_index2fields[i] = v
                                vep_csq_fields2index[v] = i
                            i = i + 1
                    if identifier == 'DBNSFP':
                        if len(subtags) > 7:
                            effect_predictions_description = "Format: " + '|'.join(
                                subtags[7:])
                        ## from sub-field 7 onwards: algorithm name = sub-field minus '_score'/'_pred' suffix
                        i = 7
                        while (i < len(subtags)):
                            dbnsfp_prediction_algorithms.append(
                                str(
                                    re.sub(r'((_score)|(_pred))"*$', '',
                                           subtags[i])))
                            i = i + 1

    ## declare every tag that the header does not contain yet
    for tag in pcgr_vcf_infotags_meta:
        if not vcf.contains(tag):
            vcf.add_info_to_header({
                'ID': tag,
                'Description': str(pcgr_vcf_infotags_meta[tag]['description']),
                'Type': str(pcgr_vcf_infotags_meta[tag]['type']),
                'Number': str(pcgr_vcf_infotags_meta[tag]['number'])
            })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    ## annotation name -> position within one '|'-separated PCGR_ONCO_XREF entry
    pcgr_onco_xref_map = {
        'SYMBOL': 1,
        'ENTREZ_ID': 2,
        'UNIPROT_ID': 3,
        'APPRIS': 4,
        'UNIPROT_ACC': 5,
        'CHORUM_ID': 6,
        'TUMOR_SUPPRESSOR': 7,
        'ONCOGENE': 8,
        'NETWORK_CG': 9,
        'DISGENET_CUI': 10,
        'CHEMBL_COMPOUND_ID': 11,
        'INTOGEN_DRIVER': 12,
        'ONCOSCORE': 13,
        'CANCER_PREDISPOSITION': 14
    }

    for rec in vcf:
        all_transcript_consequences = []
        ## log a per-chromosome summary whenever the chromosome changes
        if current_chrom is None:
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0
        else:
            if str(rec.CHROM) != current_chrom:
                logger.info(
                    'Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
                current_chrom = str(rec.CHROM)
                num_chromosome_records_processed = 0
        ## records without a CSQ tag are reported and skipped
        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?) - variant will be skipped'
            )
            continue
        pcgr_onco_xref = {}
        num_chromosome_records_processed += 1
        ## build {transcript id: {annotation: value}} from the PCGR_ONCO_XREF tag
        if not rec.INFO.get('PCGR_ONCO_XREF') is None:
            for transcript_onco_xref in rec.INFO.get('PCGR_ONCO_XREF').split(
                    ','):
                xrefs = transcript_onco_xref.split('|')
                ensembl_transcript_id = str(xrefs[0])
                pcgr_onco_xref[ensembl_transcript_id] = {}
                for annotation in pcgr_onco_xref_map.keys():
                    annotation_index = pcgr_onco_xref_map[annotation]
                    ## entry is shorter than this annotation's position
                    if annotation_index > (len(xrefs) - 1):
                        continue
                    if xrefs[annotation_index] != '':
                        pcgr_onco_xref[ensembl_transcript_id][
                            annotation] = xrefs[annotation_index]
        for identifier in ['CSQ', 'DBNSFP']:
            if identifier == 'CSQ':
                num_picks = 0
                for csq in rec.INFO.get(identifier).split(','):
                    csq_fields = csq.split('|')
                    if csq_fields[vep_csq_fields2index[
                            'PICK']] == "1":  ## only consider the primary/picked consequence when expanding with annotation tags
                        num_picks += 1
                        j = 0
                        ## loop over all CSQ elements and copy the indexed, non-empty ones into rec.INFO
                        while (j < len(csq_fields)):
                            if j in vep_csq_index2fields:
                                if csq_fields[j] != '':
                                    rec.INFO[vep_csq_index2fields[j]] = str(
                                        csq_fields[j])
                                    ## transcript id: add the cross-reference annotations of that transcript
                                    if vep_csq_index2fields[j] == 'Feature':
                                        ensembl_transcript_id = str(
                                            csq_fields[j])
                                        if ensembl_transcript_id in pcgr_onco_xref:
                                            for annotation in pcgr_onco_xref_map.keys(
                                            ):
                                                if annotation == 'CHORUM_ID' or annotation == 'UNIPROT_ACC':
                                                    continue
                                                if annotation in pcgr_onco_xref[
                                                        ensembl_transcript_id]:
                                                    ## these four are set as flags (True), the rest carry their value
                                                    if annotation == 'TUMOR_SUPPRESSOR' or annotation == 'ONCOGENE' or annotation == 'NETWORK_CG' or annotation == 'CANCER_PREDISPOSITION':
                                                        rec.INFO[
                                                            annotation] = True
                                                    else:
                                                        rec.INFO[annotation] = pcgr_onco_xref[
                                                            ensembl_transcript_id][
                                                                annotation]
                                    ## '&'-separated domains: keep the Pfam id without prefix and '.<digits>' suffix
                                    if vep_csq_index2fields[j] == 'DOMAINS':
                                        domain_identifiers = str(
                                            csq_fields[j]).split('&')
                                        for v in domain_identifiers:
                                            if v.startswith('Pfam_domain'):
                                                rec.INFO['PFAM_DOMAIN'] = str(
                                                    re.sub(
                                                        r'\.[0-9]{1,}$', '',
                                                        re.sub(
                                                            r'Pfam_domain:',
                                                            '', v)))
                                    ## '&'-separated identifiers: split into 'COSM*' ids and 'rs*' ids (leading 'rs' removed)
                                    if vep_csq_index2fields[
                                            j] == 'Existing_variation':
                                        var_identifiers = str(
                                            csq_fields[j]).split('&')
                                        cosmic_identifiers = []
                                        dbsnp_identifiers = []
                                        for v in var_identifiers:
                                            if v.startswith('COSM'):
                                                cosmic_identifiers.append(v)
                                            if v.startswith('rs'):
                                                dbsnp_identifiers.append(
                                                    re.sub('^rs', '', v))
                                        if len(cosmic_identifiers) > 0:
                                            rec.INFO[
                                                'COSMIC_MUTATION_ID'] = '&'.join(
                                                    cosmic_identifiers)
                                        if len(dbsnp_identifiers) > 0:
                                            rec.INFO['DBSNPRSID'] = '&'.join(
                                                dbsnp_identifiers)
                            j = j + 1
                        set_coding_change(rec)
                    ## every consequence (picked or not) contributes a
                    ## 'Consequence:SYMBOL:Feature_type:Feature:BIOTYPE' entry
                    symbol = '.'
                    if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                        symbol = str(
                            csq_fields[vep_csq_fields2index['SYMBOL']])
                    consequence_entry = str(
                        csq_fields[vep_csq_fields2index['Consequence']]
                    ) + ':' + str(symbol) + ':' + str(csq_fields[
                        vep_csq_fields2index['Feature_type']]) + ':' + str(
                            csq_fields[vep_csq_fields2index['Feature']]
                        ) + ':' + str(
                            csq_fields[vep_csq_fields2index['BIOTYPE']])
                    all_transcript_consequences.append(consequence_entry)
            if identifier == 'DBNSFP':
                if not rec.INFO.get('DBNSFP') is None:
                    map_variant_effect_predictors(
                        rec, dbnsfp_prediction_algorithms)
        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)
    w.close()
    ## summary for the last chromosome seen
    logger.info('Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    ## compress + index the annotated VCF, then hand it to write_pass_vcf()
    ## NOTE(review): the shell commands are built by string concatenation from the file name - confirm
    ## that paths with spaces or shell metacharacters cannot reach this point
    if os.path.exists(out_vcf):
        if os.path.getsize(out_vcf) > 0:
            os.system('bgzip -f ' + str(out_vcf))
            os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')
            annotated_vcf = out_vcf + '.gz'
            write_pass_vcf(annotated_vcf)
        else:
            pcgrutils.pcgr_error_message(
                'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
                logger)
    else:
        pcgrutils.pcgr_error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
            logger)
"ORF3a": "cds-YP_009724391.1", "E": "cds-YP_009724392.1", "M": "cds-YP_009724393.1", "ORF6": "cds-YP_009724394.1", "ORF7a": "cds-YP_009724395.1", "ORF7b": "cds-YP_009725318.1", "ORF8": "cds-YP_009724396.1", "N": "cds-YP_009724397.2", "ORF10": "cds-YP_009725255.1", } db = gffutils.create_db( args.annotation_file, 'ncov_annotation.db', force=True, merge_strategy="merge") data_vcf = VCF(args.vcf_file) data_vcf.add_info_to_header( {'ID': 'mat_pep_id', 'Description': 'Mature Peptide ID', 'Type': 'String', 'Number': '.'}) data_vcf.add_info_to_header( {'ID': 'mat_pep_desc', 'Description': 'Mature Peptide Description', 'Type': 'String', 'Number': '.'}) data_vcf.add_info_to_header( {'ID': 'mat_pep_acc', 'Description': 'Mature Peptide Accession Number', 'Type': 'String', 'Number': '.'}) output_file_name = args.output_vcf w = Writer(output_file_name, data_vcf) for record in data_vcf: gene = db[gene_protein[record.INFO.get('EFF').split("|")[5]]] record.INFO["mat_pep_id"] = "n/a"
def overlaps_match(q: str, qpos: int, p: str, ppos: int) -> bool:
    """Return whether ``q`` and ``p`` are equal on the span they share.

    ``q`` is treated as starting at offset ``qpos`` and ``p`` at ``ppos``.
    From each string the slice that falls inside the other string's span is
    taken, and the two slices are compared.
    NOTE(review): presumably ``q``/``p`` are allele sequences and the
    offsets are positions on the same reference - confirm against callers.
    """
    qend = qpos + len(q)
    pend = ppos + len(p)
    # Part of q covered by p, and part of p covered by q.
    qidx = slice(max(0, ppos - qpos), pend - qpos)
    pidx = slice(max(0, qpos - ppos), qend - ppos)
    return q[qidx] == p[pidx]


# INFO tag that receives the classification of each query record.
TAG = "CLF"
truth_rdr = VCF(snakemake.input.truth_vcf)
query_rdr = VCF(snakemake.input.query_vcf)
query_rdr.add_info_to_header(
    {
        "ID": TAG,
        "Description": "Classification of record",
        "Type": "String",
        "Number": ".",
    }
)
query_wtr = Writer(snakemake.output.annotated_query_vcf, tmpl=query_rdr)
classifications = []
classified_qrecords = set()
classified_trecords = set()
for query_record in query_rdr:
    # Records with a FILTER value set are not classified.
    if query_record.FILTER is not None:
        continue
    record_clfs = []
    # Genotype of the first sample of the record.
    query_gt = Genotype.from_arr(query_record.genotypes[0])
    qalt_idx = query_gt.alt_index()
def annotate(filepath, VCFDataFrame):
    """Function to write specific calculated and API values into desired VCF

    This is a very explicit function.

    Parameters:
        filepath: File path to desired .vcf to annotate
        VCFDataFrame: Dataframe with values that we want to annotate; it is
            read with ``.at[row, column]`` where ``row`` is the position of
            the variant in the VCF and ``column`` is the INFO ID

    Returns:
        None; the annotated records are written to ``'Annotated_' + filepath``
    """
    vcf = VCF(filepath)
    #This is hardcoded as it is curated
    list_of_annotations = [{
        'ID': 'VAR',
        # Adjacent literals instead of a backslash continuation inside the
        # string, which embedded the source indentation in the description.
        'Description': ("Selected Variant based on prioritization of "
                        "(1) 'complex', (2) 'ins', (3) 'del', (4) 'mnp', (5) 'snp'"),
        'Type': 'String',
        'Number': '1'
    }, {
        'ID': 'VAR_TYPE',
        'Description': ("Annotated variant type based on prioritization of "
                        "(1) 'complex', (2) 'ins', (3) 'del', (4) 'mnp', (5) 'snp'"),
        'Type': 'String',
        'Number': '1'
    }, {
        'ID': 'VAR_COUNT',
        'Description': "Count of times selected variant was observed",
        # 'Interger' is not a valid VCF INFO type.
        'Type': 'Integer',
        'Number': '1'
    }, {
        'ID': 'VAR_FRAC',
        'Description': "Fraction of total reads that the variant was observed",
        'Type': 'Float',
        'Number': '1'
    }, {
        'ID': 'FREQ_ExAC',
        'Description': "Allele frequency of prioritized alt according to ExAC",
        'Type': 'Float',
        'Number': '1'
    }, {
        'ID': 'TYPE_vep',
        'Description': "vep annotation of major consequence of variant",
        'Type': 'String',
        'Number': '1'
    }]
    for annotation in list_of_annotations:
        vcf.add_info_to_header(annotation)

    w = Writer('Annotated_{}'.format(filepath), vcf)
    for i, variant in enumerate(vcf):
        for annotation in list_of_annotations:
            variant.INFO[annotation['ID']] = str(
                VCFDataFrame.at[i, annotation['ID']])
        w.write_record(variant)
    w.close()
    vcf.close()
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Gene annotations, e.g. known oncogenes/tumor suppressors, curated disease associations (DisGenet), MIM phenotype associations etc
    3. Protein-relevant annotations, e.g. c functional protein features etc.
    4. Variant effect predictions

    The annotated records are written to '<query_vcf stem>.annotated.vcf'. When that file exists and is
    non-empty it is compressed with bgzip, indexed with tabix and passed to annoutils.write_pass_vcf();
    otherwise annoutils.error_message() is called.

    Parameters:
        query_vcf: path of the input VCF ('.vcf' or '.vcf.gz')
        gvanno_db_directory: directory that holds 'gvanno_infotags.tsv'
        lof_prediction: when 0 (default), tags whose name starts with 'LoF' are not declared in the header

    Records without a CSQ tag are logged and skipped (they are not written).
    NOTE(review): 'logger' is not a parameter here - presumably a module-level logger; confirm.
    NOTE(review): indentation was lost in this file; the nesting below is reconstructed - confirm.
    """

    ## read VEP and PCGR tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    ## output file name: trailing '.vcf' / '.vcf.gz' replaced by '.annotated.vcf'
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(
        query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info[
        'dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    vcf = VCF(query_vcf)
    ## declare the tags in the header; 'LoF*' tags only when lof_prediction is not 0
    for tag in vcf_infotags_meta:
        if lof_prediction == 0:
            if not tag.startswith('LoF'):
                vcf.add_info_to_header({
                    'ID': tag,
                    'Description': str(vcf_infotags_meta[tag]['description']),
                    'Type': str(vcf_infotags_meta[tag]['type']),
                    'Number': str(vcf_infotags_meta[tag]['number'])
                })
        else:
            vcf.add_info_to_header({
                'ID': tag,
                'Description': str(vcf_infotags_meta[tag]['description']),
                'Type': str(vcf_infotags_meta[tag]['type']),
                'Number': str(vcf_infotags_meta[tag]['number'])
            })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    ## annotation name -> index, handed to annoutils.make_transcript_xref_map() for the GVANNO_XREF tag
    gvanno_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'ENSEMBL_PROTEIN_ID': 2,
        'SYMBOL': 3,
        'SYMBOL_ENTREZ': 4,
        'ENTREZ_ID': 5,
        'UNIPROT_ID': 6,
        'UNIPROT_ACC': 7,
        'REFSEQ_MRNA': 8,
        'CORUM_ID': 9,
        'TUMOR_SUPPRESSOR': 10,
        'TUMOR_SUPPRESSOR_EVIDENCE': 11,
        'ONCOGENE': 12,
        'ONCOGENE_EVIDENCE': 13,
        'MIM_PHENOTYPE_ID': 14,
        'OPENTARGETS_DISEASE_ASSOCS': 15,
        'OPENTARGETS_TRACTABILITY_COMPOUND': 16,
        'OPENTARGETS_TRACTABILITY_ANTIBODY': 17,
        'PROB_HAPLOINSUFFICIENCY': 18,
        'PROB_EXAC_LOF_INTOLERANT': 19,
        'PROB_EXAC_LOF_INTOLERANT_HOM': 20,
        'PROB_EXAC_LOF_TOLERANT_NULL': 21,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT': 22,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM': 23,
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL': 24,
        'PROB_GNOMAD_LOF_INTOLERANT': 25,
        'PROB_GNOMAD_LOF_INTOLERANT_HOM': 26,
        'PROB_GNOMAD_LOF_TOLERANT_NULL': 27,
        'ESSENTIAL_GENE_CRISPR': 28,
        'ESSENTIAL_GENE_CRISPR2': 29
    }

    ## header ID -> declared Type, for every header line that carries ID, HeaderType and Type
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element:
            identifier = str(header_element['ID'])
            fieldtype = str(header_element['Type'])
            vcf_info_element_types[identifier] = fieldtype

    for rec in vcf:
        ## log a per-chromosome summary whenever the chromosome changes
        if current_chrom is None:
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0
        else:
            if str(rec.CHROM) != current_chrom:
                logger.info(
                    'Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
                current_chrom = str(rec.CHROM)
                num_chromosome_records_processed = 0
        ## records without a CSQ tag are reported and skipped
        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?) - variant will be skipped'
            )
            continue
        num_chromosome_records_processed += 1
        gvanno_xref = annoutils.make_transcript_xref_map(
            rec, gvanno_xref_map, xref_tag="GVANNO_XREF")

        csq_record_results = annoutils.parse_vep_csq(rec,
                                                     gvanno_xref,
                                                     vep_csq_fields_map,
                                                     logger,
                                                     pick_only=True,
                                                     csq_identifier='CSQ')
        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(
                csq_record_results['vep_all_csq'])
        if 'vep_block' in csq_record_results:
            vep_csq_records = csq_record_results['vep_block']
            ## only the first entry of the block is copied into the record
            block_idx = 0
            record = vep_csq_records[block_idx]
            for k in record:
                ## only keys that are declared in the header are written
                if k in vcf_info_element_types:
                    ## Flag-typed tags are set to True when the parsed value is "1"
                    if vcf_info_element_types[k] == "Flag" and record[k] == "1":
                        rec.INFO[k] = True
                    else:
                        if not record[k] is None:
                            rec.INFO[k] = record[k]
        if not rec.INFO.get('DBNSFP') is None:
            annoutils.map_variant_effect_predictors(
                rec, dbnsfp_prediction_algorithms)
        w.write_record(rec)
    w.close()
    ## summary for the last chromosome seen
    logger.info('Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    ## compress + index the annotated VCF, then hand it to annoutils.write_pass_vcf()
    ## NOTE(review): the shell commands are built by string concatenation from the file name - confirm
    ## that paths with spaces or shell metacharacters cannot reach this point
    if os.path.exists(out_vcf):
        if os.path.getsize(out_vcf) > 0:
            os.system('bgzip -f ' + str(out_vcf))
            os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')
            annotated_vcf = out_vcf + '.gz'
            annoutils.write_pass_vcf(annotated_vcf, logger)
        else:
            annoutils.error_message(
                'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
                logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, pon_annotation, cpsr):
    """
    Read a VEP/vcfanno-annotated VCF and extend its INFO column with tags from

    1. CSQ elements within the primary transcript consequence picked by VEP,
       e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations (PCGR_ONCO_XREF), e.g. known
       oncogenes/tumor suppressors, known antineoplastic drugs interacting
       with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations,
       functional protein features etc.
    4. Variant effect predictions
    5. Panel-of-normal (blacklisted variants) annotation

    The list of INFO tags to be produced is provided by the 'infotags' files
    in the pcgr_db_directory.

    The extended records are written to a sibling file of ``query_vcf`` whose
    ``.vcf`` / ``.vcf.gz`` suffix is replaced by ``.annotated.vcf``. If that
    file is non-empty it is compressed with ``bgzip``, indexed with ``tabix``
    and handed to ``annoutils.write_pass_vcf``; otherwise
    ``annoutils.error_message`` is called.

    Parameters:
        query_vcf: path of the input VCF (``.vcf`` or ``.vcf.gz``).
        pcgr_db_directory: directory holding ``pcgr_infotags.tsv`` and
            ``cpsr_infotags.tsv``.
        logger: object exposing ``info()`` and ``warning()``; it is also
            forwarded to the ``annoutils`` helpers and ``check_subprocess``.
        pon_annotation: when equal to 0, INFO tags whose name starts with
            ``PANEL_OF_NORMALS`` are not added to the header.
        cpsr: when exactly ``True``, tags are read from ``cpsr_infotags.tsv``
            and the consequence block is chosen through
            ``annoutils.get_correct_cpg_transcript`` instead of using the
            first block.

    Returns:
        None.
    """
    # Select the infotag definition file up front so that only the file that
    # is actually used gets parsed.
    infotags_fname = 'cpsr_infotags.tsv' if cpsr is True else 'pcgr_infotags.tsv'
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(pcgr_db_directory, infotags_fname))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    # Position of each field inside a PCGR_ONCO_XREF entry; the indices are
    # passed verbatim to annoutils.make_transcript_xref_map.
    pcgr_onco_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'ENSEMBL_PROTEIN_ID': 2,
        'SYMBOL': 3,
        'SYMBOL_ENTREZ': 4,
        'ENTREZ_ID': 5,
        'UNIPROT_ID': 6,
        'APPRIS': 7,
        'UNIPROT_ACC': 8,
        'REFSEQ_MRNA': 9,
        'CORUM_ID': 10,
        'TUMOR_SUPPRESSOR': 11,
        'TUMOR_SUPPRESSOR_EVIDENCE': 12,
        'ONCOGENE': 13,
        'ONCOGENE_EVIDENCE': 14,
        'NETWORK_CG': 15,
        'DISGENET_CUI': 16,
        'CHEMBL_COMPOUND_ID': 17,
        'CHEMBL_COMPOUND_ID_EARLY_PHASE': 18,
        'INTOGEN_DRIVER': 19,
        'TCGA_DRIVER': 20,
        'ONCOSCORE': 21,
        'MIM_PHENOTYPE_ID': 22,
        'CANCER_PREDISPOSITION_SOURCE': 23,
        'CANCER_SUSCEPTIBILITY_CUI': 24,
        'CANCER_SYNDROME_CUI': 25,
        'CANCER_PREDISPOSITION_MOI': 26,
        'CANCER_PREDISPOSITION_MOD': 27,
        'SIGNALING_PATHWAY': 28,
        'OPENTARGETS_DISEASE_ASSOCS': 29,
        'OPENTARGETS_TRACTABILITY_COMPOUND': 30,
        'OPENTARGETS_TRACTABILITY_ANTIBODY': 31,
        'GE_PANEL_ID': 32,
        'ACTIONABLE_TARGET': 33,
        'GENCODE_GENE_STATUS': 34,
        'PROB_HAPLOINSUFFICIENCY': 35,
        'PROB_EXAC_LOF_INTOLERANT': 36,
        'PROB_EXAC_LOF_INTOLERANT_HOM': 37,
        'PROB_EXAC_LOF_TOLERANT_NULL': 38,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT': 39,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM': 40,
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL': 41,
        'PROB_GNOMAD_LOF_INTOLERANT': 42,
        'PROB_GNOMAD_LOF_INTOLERANT_HOM': 43,
        'PROB_GNOMAD_LOF_TOLERANT_NULL': 44,
        'ESSENTIAL_GENE_CRISPR': 45,
        'ESSENTIAL_GENE_CRISPR2': 46,
    }

    current_chrom = None
    num_chromosome_records_processed = 0

    vcf = VCF(query_vcf)
    w = None
    try:
        # New INFO tags must be registered on the reader's header before the
        # writer is created from it.
        for tag in sorted(vcf_infotags_meta):
            if pon_annotation == 0 and tag.startswith('PANEL_OF_NORMALS'):
                continue
            tag_meta = vcf_infotags_meta[tag]
            vcf.add_info_to_header({
                'ID': tag,
                'Description': str(tag_meta['description']),
                'Type': str(tag_meta['type']),
                'Number': str(tag_meta['number']),
            })

        w = Writer(out_vcf, vcf)

        # Declared type of every header element that carries an ID and a Type.
        vcf_info_element_types = {}
        for e in vcf.header_iter():
            header_element = e.info()
            if all(key in header_element for key in ('ID', 'HeaderType', 'Type')):
                vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

        for rec in vcf:
            chrom = str(rec.CHROM)
            if current_chrom is None:
                current_chrom = chrom
                num_chromosome_records_processed = 0
            elif chrom != current_chrom:
                # Chromosome changed: report the finished one, reset the counter.
                logger.info('Completed summary of functional annotations for '
                            + str(num_chromosome_records_processed)
                            + ' variants on chromosome ' + str(current_chrom))
                current_chrom = chrom
                num_chromosome_records_processed = 0

            if rec.INFO.get('CSQ') is None:
                alt_allele = ','.join(rec.ALT)
                pos = rec.start + 1
                variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(rec.REF) + '>' + alt_allele
                logger.warning('Variant record ' + str(variant_id) + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?) - variant will be skipped')
                continue

            num_chromosome_records_processed += 1
            pcgr_onco_xref = annoutils.make_transcript_xref_map(
                rec, pcgr_onco_xref_map, xref_tag="PCGR_ONCO_XREF")
            csq_record_results = annoutils.parse_vep_csq(
                rec, pcgr_onco_xref, vep_csq_fields_map, logger,
                pick_only=True, csq_identifier='CSQ')

            if 'vep_all_csq' in csq_record_results:
                rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results['vep_all_csq'])

            if 'vep_block' in csq_record_results:
                vep_csq_records = csq_record_results['vep_block']
                block_idx = 0
                if cpsr is True:
                    block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records)
                record = vep_csq_records[block_idx]
                for k in record:
                    if k not in vcf_info_element_types:
                        continue
                    if vcf_info_element_types[k] == "Flag":
                        # A Flag is either present or absent; never store a
                        # string value (e.g. "0") on a Flag-typed INFO field.
                        if record[k] == "1":
                            rec.INFO[k] = True
                    elif record[k] is not None:
                        rec.INFO[k] = record[k]

            if rec.INFO.get('DBNSFP') is not None:
                annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

            w.write_record(rec)
    finally:
        # Release both handles even when annotation fails part-way through.
        if w is not None:
            w.close()
        vcf.close()

    if current_chrom is not None:
        logger.info('Completed summary of functional annotations for '
                    + str(num_chromosome_records_processed)
                    + ' variants on chromosome ' + str(current_chrom))

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, 'bgzip -f ' + str(out_vcf))
        check_subprocess(logger, 'tabix -f -p vcf ' + str(out_vcf) + '.gz')
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)', logger)