Exemplo n.º 1
0
def check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly,
                                 logger):
    """
    Check that no INFO tag declared in the header of the query VCF has the
    same ID as an INFO tag listed in the CPSR infotags file.

    The CPSR tag IDs are read from
    ``<pcgr_directory>/data/<genome_assembly>/cpsr_infotags.tsv``.

    :param input_vcf: path to the query VCF file
    :param pcgr_directory: base directory that contains the ``data`` folder
    :param genome_assembly: name of the assembly sub-directory under ``data``
    :param logger: logger used for progress and error messages
    :return: 1 when no tag coincides; on the first coinciding tag, the return
             value of ``annoutils.error_message`` is passed through
    """
    cpsr_infotags = annoutils.read_infotag_file(
        os.path.join(pcgr_directory, 'data', genome_assembly,
                     'cpsr_infotags.tsv'))

    vcf = VCF(input_vcf)
    logger.info(
        'Checking if existing INFO tags of query VCF file coincide with CPSR INFO tags'
    )
    try:
        for e in vcf.header_iter():
            header_element = e.info()
            # Only header lines carrying both an ID and a HeaderType can be
            # INFO declarations; everything else is irrelevant here.
            if 'ID' not in header_element or 'HeaderType' not in header_element:
                continue
            if header_element['HeaderType'] != 'INFO':
                continue
            if header_element['ID'] in cpsr_infotags:
                err_msg = 'INFO tag ' + str(
                    header_element['ID']
                ) + ' in the query VCF coincides with a VCF annotation tag produced by CPSR - please remove or rename this tag in your query VCF'
                return annoutils.error_message(err_msg, logger)
    finally:
        # Release the VCF handle on every exit path (normal return, early
        # return on a collision, or an exception raised while iterating).
        vcf.close()

    logger.info('No query VCF INFO tags coincide with CPSR INFO tags')
    return 1
Exemplo n.º 2
0
def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, pon_annotation, cpsr):
    """
    Read a VEP/vcfanno-annotated VCF, extend the INFO column of every record
    and write the result to ``<query>.annotated.vcf`` (subsequently bgzipped
    and tabix-indexed).

    As described by the original author, the added tags come from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations (PCGR_ONCO_XREF), e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
    4. Variant effect predictions
    5. Panel-of-normal (blacklisted variants) annotation

    The INFO tags to declare are listed in the 'infotags' files found in
    ``pcgr_db_directory`` ('cpsr_infotags.tsv' when ``cpsr`` is True,
    'pcgr_infotags.tsv' otherwise).

    :param query_vcf: path to the input VCF (``.vcf`` or ``.vcf.gz``)
    :param pcgr_db_directory: directory holding the infotags TSV files
    :param logger: logger used for progress, warning and error messages
    :param pon_annotation: when equal to 0, tags starting with
                           'PANEL_OF_NORMALS' are not added to the header
    :param cpsr: when True, CPSR infotags and CPSR transcript selection are used
    """

    def log_chromosome_summary(n_records, chromosome):
        # One progress line per completed chromosome.
        logger.info('Completed summary of functional annotations for ' + str(n_records) + ' variants on chromosome ' + str(chromosome))

    ## read VEP and PCGR tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_directory, 'pcgr_infotags.tsv'))
    if cpsr is True:
        vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_directory, 'cpsr_infotags.tsv'))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    ## declare the new INFO tags in the VCF header
    vcf = VCF(query_vcf)
    skip_pon_tags = pon_annotation == 0
    for tag in sorted(vcf_infotags_meta):
        if skip_pon_tags and tag.startswith('PANEL_OF_NORMALS'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)

    ## column name -> position within a '|'-separated PCGR_ONCO_XREF entry
    xref_columns = (
        'ENSEMBL_TRANSCRIPT_ID', 'ENSEMBL_GENE_ID', 'ENSEMBL_PROTEIN_ID', 'SYMBOL', 'SYMBOL_ENTREZ',
        'ENTREZ_ID', 'UNIPROT_ID', 'APPRIS', 'UNIPROT_ACC', 'REFSEQ_MRNA', 'CORUM_ID', 'TUMOR_SUPPRESSOR',
        'TUMOR_SUPPRESSOR_EVIDENCE', 'ONCOGENE', 'ONCOGENE_EVIDENCE',
        'NETWORK_CG', 'DISGENET_CUI', 'CHEMBL_COMPOUND_ID', 'CHEMBL_COMPOUND_ID_EARLY_PHASE', 'INTOGEN_DRIVER',
        'TCGA_DRIVER', 'ONCOSCORE', 'MIM_PHENOTYPE_ID', 'CANCER_PREDISPOSITION_SOURCE',
        'CANCER_SUSCEPTIBILITY_CUI', 'CANCER_SYNDROME_CUI', 'CANCER_PREDISPOSITION_MOI',
        'CANCER_PREDISPOSITION_MOD', 'SIGNALING_PATHWAY', 'OPENTARGETS_DISEASE_ASSOCS',
        'OPENTARGETS_TRACTABILITY_COMPOUND', 'OPENTARGETS_TRACTABILITY_ANTIBODY', 'GE_PANEL_ID',
        'ACTIONABLE_TARGET', 'GENCODE_GENE_STATUS',
        'PROB_HAPLOINSUFFICIENCY', 'PROB_EXAC_LOF_INTOLERANT', 'PROB_EXAC_LOF_INTOLERANT_HOM',
        'PROB_EXAC_LOF_TOLERANT_NULL', 'PROB_EXAC_NONTCGA_LOF_INTOLERANT',
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM', 'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL',
        'PROB_GNOMAD_LOF_INTOLERANT', 'PROB_GNOMAD_LOF_INTOLERANT_HOM', 'PROB_GNOMAD_LOF_TOLERANT_NULL',
        'ESSENTIAL_GENE_CRISPR', 'ESSENTIAL_GENE_CRISPR2',
    )
    pcgr_onco_xref_map = {column: position for position, column in enumerate(xref_columns)}

    ## header ID -> declared Type, for every header line that carries a Type
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element:
            vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    current_chrom = None
    num_chromosome_records_processed = 0
    for rec in vcf:
        record_chrom = str(rec.CHROM)
        if current_chrom is None:
            current_chrom = record_chrom
            num_chromosome_records_processed = 0
        elif record_chrom != current_chrom:
            # chromosome changed: report the one just finished, reset the counter
            log_chromosome_summary(num_chromosome_records_processed, current_chrom)
            current_chrom = record_chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            # records without a VEP consequence are neither annotated nor written
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(rec.REF) + '>' + alt_allele
            logger.warning('Variant record ' + str(variant_id) + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped')
            continue

        num_chromosome_records_processed += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(rec, pcgr_onco_xref_map, xref_tag = "PCGR_ONCO_XREF")
        csq_record_results = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = True, csq_identifier = 'CSQ')

        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results['vep_all_csq'])

        if 'vep_block' in csq_record_results:
            vep_csq_records = csq_record_results['vep_block']
            block_idx = 0
            if cpsr is True:
                block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records)
            record = vep_csq_records[block_idx]
            for key in record:
                if key not in vcf_info_element_types:
                    continue
                value = record[key]
                if vcf_info_element_types[key] == "Flag" and value == "1":
                    rec.INFO[key] = True
                elif value is not None:
                    rec.INFO[key] = value

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    w.close()
    if current_chrom is not None:
        log_chromosome_summary(num_chromosome_records_processed, current_chrom)
    vcf.close()

    ## compress + index the annotated VCF, or report that nothing was written
    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, 'bgzip -f ' + str(out_vcf))
        check_subprocess(logger, 'tabix -f -p vcf ' + str(out_vcf) + '.gz')
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)', logger)
Exemplo n.º 3
0
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Read a VEP/vcfanno-annotated VCF, extend the INFO column of every record
    and write the result to ``<query>.annotated.vcf`` (subsequently bgzipped
    and tabix-indexed).

    As described by the original author, the added tags come from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Gene annotations, e.g. known oncogenes/tumor suppressors, curated disease associations (DisGenet), MIM phenotype associations etc
    3. Protein-relevant annotations, e.g. functional protein features etc.
    4. Variant effect predictions

    NOTE: this function uses a ``logger`` name that is not a parameter; it
    must be available in the enclosing module scope.

    :param query_vcf: path to the input VCF (``.vcf`` or ``.vcf.gz``)
    :param gvanno_db_directory: directory holding 'gvanno_infotags.tsv'
    :param lof_prediction: when equal to 0, tags starting with 'LoF' are not
                           added to the VCF header
    """
    # Local import: only needed to quote the file path in the shell commands.
    import shlex

    ## read VEP and gvanno tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(
        query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info[
        'dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    ## declare the new INFO tags in the VCF header
    vcf = VCF(query_vcf)
    for tag in vcf_infotags_meta:
        if lof_prediction == 0 and tag.startswith('LoF'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number'])
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    ## column name -> position within a '|'-separated GVANNO_XREF entry
    gvanno_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'ENSEMBL_PROTEIN_ID': 2,
        'SYMBOL': 3,
        'SYMBOL_ENTREZ': 4,
        'ENTREZ_ID': 5,
        'UNIPROT_ID': 6,
        'UNIPROT_ACC': 7,
        'REFSEQ_MRNA': 8,
        'CORUM_ID': 9,
        'TUMOR_SUPPRESSOR': 10,
        'TUMOR_SUPPRESSOR_EVIDENCE': 11,
        'ONCOGENE': 12,
        'ONCOGENE_EVIDENCE': 13,
        'MIM_PHENOTYPE_ID': 14,
        'OPENTARGETS_DISEASE_ASSOCS': 15,
        'OPENTARGETS_TRACTABILITY_COMPOUND': 16,
        'OPENTARGETS_TRACTABILITY_ANTIBODY': 17,
        'PROB_HAPLOINSUFFICIENCY': 18,
        'PROB_EXAC_LOF_INTOLERANT': 19,
        'PROB_EXAC_LOF_INTOLERANT_HOM': 20,
        'PROB_EXAC_LOF_TOLERANT_NULL': 21,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT': 22,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM': 23,
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL': 24,
        'PROB_GNOMAD_LOF_INTOLERANT': 25,
        'PROB_GNOMAD_LOF_INTOLERANT_HOM': 26,
        'PROB_GNOMAD_LOF_TOLERANT_NULL': 27,
        'ESSENTIAL_GENE_CRISPR': 28,
        'ESSENTIAL_GENE_CRISPR2': 29
    }

    ## header ID -> declared Type, for every header line that carries a Type
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element:
            identifier = str(header_element['ID'])
            fieldtype = str(header_element['Type'])
            vcf_info_element_types[identifier] = fieldtype

    for rec in vcf:
        if current_chrom is None:
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0
        elif str(rec.CHROM) != current_chrom:
            # chromosome changed: report the one just finished, reset counter
            logger.info(
                'Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            # records without a VEP consequence are neither annotated nor written
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue

        num_chromosome_records_processed += 1
        gvanno_xref = annoutils.make_transcript_xref_map(
            rec, gvanno_xref_map, xref_tag="GVANNO_XREF")

        csq_record_results = annoutils.parse_vep_csq(rec,
                                                     gvanno_xref,
                                                     vep_csq_fields_map,
                                                     logger,
                                                     pick_only=True,
                                                     csq_identifier='CSQ')
        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(
                csq_record_results['vep_all_csq'])
        if 'vep_block' in csq_record_results:
            # only the first consequence block is copied into INFO
            record = csq_record_results['vep_block'][0]
            for k in record:
                if k not in vcf_info_element_types:
                    continue
                if vcf_info_element_types[k] == "Flag" and record[k] == "1":
                    rec.INFO[k] = True
                elif record[k] is not None:
                    rec.INFO[k] = record[k]

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(
                rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)
    w.close()
    # Skip the final summary when the VCF had no records at all; otherwise the
    # message would read "... variants on chromosome None".
    if current_chrom is not None:
        logger.info('Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    ## compress + index the annotated VCF, or report that nothing was written
    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        annotated_vcf = out_vcf + '.gz'
        # Quote the paths so that spaces or shell metacharacters in the file
        # name cannot break (or alter) the command line.
        commands = ('bgzip -f ' + shlex.quote(str(out_vcf)),
                    'tabix -f -p vcf ' + shlex.quote(str(annotated_vcf)))
        for command in commands:
            exit_status = os.system(command)
            if exit_status != 0:
                # Make the failure visible instead of silently ignoring it;
                # control flow is deliberately left as before.
                logger.error('Command failed with exit status ' +
                             str(exit_status) + ': ' + command)
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
Exemplo n.º 4
0
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Read a VEP/vcfanno-annotated VCF, extend the INFO column of every record
    and write the result to ``<query>.annotated.vcf`` (subsequently bgzipped
    and tabix-indexed).

    As described by the original author, the added tags come from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Gene annotations, e.g. known oncogenes/tumor suppressors, curated disease associations (DisGenet), MIM phenotype associations etc
    3. Protein-relevant annotations, e.g. functional protein features etc.
    4. Variant effect predictions

    NOTE: this function uses a ``logger`` name that is not a parameter; it
    must be available in the enclosing module scope.

    :param query_vcf: path to the input VCF (``.vcf`` or ``.vcf.gz``)
    :param gvanno_db_directory: directory holding 'gvanno_infotags.tsv'
    :param lof_prediction: when equal to 0, tags starting with 'LoF' are not
                           added to the VCF header
    """
    # Local import: only needed to quote the file path in the shell commands.
    import shlex

    ## read VEP and gvanno tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(
        query_vcf, vcf_infotags_meta)
    vep_csq_index2fields = meta_vep_dbnsfp_info['vep_csq_index2fields']
    vep_csq_fields2index = meta_vep_dbnsfp_info['vep_csq_fields2index']
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info[
        'dbnsfp_prediction_algorithms']

    ## declare the new INFO tags in the VCF header
    vcf = VCF(query_vcf)
    for tag in vcf_infotags_meta:
        if lof_prediction == 0 and tag.startswith('LoF'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number'])
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    ## column name -> position within a '|'-separated GVANNO_XREF entry
    gvanno_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'SYMBOL': 2,
        'ENTREZ_ID': 3,
        'UNIPROT_ID': 4,
        'APPRIS': 5,
        'UNIPROT_ACC': 6,
        'REFSEQ_MRNA': 7,
        'CORUM_ID': 8,
        'TUMOR_SUPPRESSOR': 9,
        'ONCOGENE': 10,
        'DISGENET_CUI': 11,
        'MIM_PHENOTYPE_ID': 12
    }

    for rec in vcf:
        if current_chrom is None:
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0
        elif str(rec.CHROM) != current_chrom:
            # chromosome changed: report the one just finished, reset counter
            logger.info(
                'Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            # records without a VEP consequence are neither annotated nor written
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue

        num_chromosome_records_processed += 1

        ## transcript id -> {annotation name: value} parsed from GVANNO_XREF
        gvanno_xref = {}
        xref_info = rec.INFO.get('GVANNO_XREF')
        if xref_info is not None:
            for transcript_xref in xref_info.split(','):
                xrefs = transcript_xref.split('|')
                transcript_annotations = {}
                gvanno_xref[str(xrefs[0])] = transcript_annotations
                for annotation, annotation_index in gvanno_xref_map.items():
                    # entries may be shorter than the map; empty values are dropped
                    if annotation_index < len(xrefs) and xrefs[annotation_index] != '':
                        transcript_annotations[annotation] = xrefs[annotation_index]

        all_transcript_consequences = []
        for csq in rec.INFO.get('CSQ').split(','):
            csq_fields = csq.split('|')

            ## only consider the primary/picked consequence when expanding with annotation tags
            if csq_fields[vep_csq_fields2index['PICK']] == "1":
                for j, value in enumerate(csq_fields):
                    if j not in vep_csq_index2fields or value == '':
                        continue
                    field = vep_csq_index2fields[j]
                    rec.INFO[field] = str(value)

                    if field == 'Feature':
                        # attach the gene cross-references of this transcript
                        ensembl_transcript_id = str(value)
                        if ensembl_transcript_id in gvanno_xref:
                            transcript_annotations = gvanno_xref[ensembl_transcript_id]
                            for annotation in gvanno_xref_map:
                                if annotation not in transcript_annotations:
                                    continue
                                if annotation in ('TUMOR_SUPPRESSOR', 'ONCOGENE'):
                                    rec.INFO[annotation] = True
                                else:
                                    rec.INFO[annotation] = transcript_annotations[annotation]
                    elif field == 'DOMAINS':
                        # keep the Pfam identifier without prefix and version suffix
                        for v in str(value).split('&'):
                            if v.startswith('Pfam_domain'):
                                rec.INFO['PFAM_DOMAIN'] = str(
                                    re.sub(r'\.[0-9]{1,}$', '',
                                           re.sub(r'Pfam_domain:', '', v)))
                    elif field == 'Existing_variation':
                        var_identifiers = str(value).split('&')
                        cosmic_identifiers = [
                            v for v in var_identifiers if v.startswith('COSM')
                        ]
                        dbsnp_identifiers = [
                            v for v in var_identifiers if v.startswith('rs')
                        ]
                        if cosmic_identifiers:
                            rec.INFO['COSMIC_MUTATION_ID'] = '&'.join(
                                cosmic_identifiers)
                        if dbsnp_identifiers:
                            rec.INFO['DBSNPRSID'] = '&'.join(dbsnp_identifiers)
                annoutils.set_coding_change(rec)

            ## every consequence (picked or not) is summarised in VEP_ALL_CONSEQUENCE
            symbol = '.'
            if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                symbol = str(csq_fields[vep_csq_fields2index['SYMBOL']])
            consequence_entry = ':'.join([
                str(csq_fields[vep_csq_fields2index['Consequence']]),
                str(symbol),
                str(csq_fields[vep_csq_fields2index['Feature_type']]),
                str(csq_fields[vep_csq_fields2index['Feature']]),
                str(csq_fields[vep_csq_fields2index['BIOTYPE']]),
            ])
            all_transcript_consequences.append(consequence_entry)

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(
                rec, dbnsfp_prediction_algorithms)

        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)
    w.close()
    # Skip the final summary when the VCF had no records at all; otherwise the
    # message would read "... variants on chromosome None".
    if current_chrom is not None:
        logger.info('Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    ## compress + index the annotated VCF, or report that nothing was written
    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        annotated_vcf = out_vcf + '.gz'
        # Quote the paths so that spaces or shell metacharacters in the file
        # name cannot break (or alter) the command line.
        commands = ('bgzip -f ' + shlex.quote(str(out_vcf)),
                    'tabix -f -p vcf ' + shlex.quote(str(annotated_vcf)))
        for command in commands:
            exit_status = os.system(command)
            if exit_status != 0:
                # Make the failure visible instead of silently ignoring it;
                # control flow is deliberately left as before.
                logger.error('Command failed with exit status ' +
                             str(exit_status) + ': ' + command)
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
Exemplo n.º 5
0
def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, cpsr):
    """
   Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
   1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
   2. Cancer-relevant gene annotations, e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
   3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
   4. Variant effect predictions
   """

    ## read VEP and PCGR tags to be appended to VCF file
    pcgr_vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(pcgr_db_directory, 'pcgr_infotags.tsv'))
    if cpsr is True:
        pcgr_vcf_infotags_meta = annoutils.read_infotag_file(
            os.path.join(pcgr_db_directory, 'cpsr_infotags.tsv'))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG'
    }

    vcf = VCF(query_vcf)
    vep_csq_index2fields = {}
    vep_csq_fields2index = {}
    dbnsfp_prediction_algorithms = []
    effect_predictions_description = ""
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys():
            identifier = str(header_element['ID'])
            if identifier == 'CSQ' or identifier == 'DBNSFP':
                description = str(header_element['Description'])
                if 'Format: ' in description:
                    subtags = description.split('Format: ')[1].split('|')
                    if identifier == 'CSQ':
                        i = 0
                        for t in subtags:
                            v = t
                            if t in vep_to_pcgr_af:
                                v = str(vep_to_pcgr_af[t])
                            if v in pcgr_vcf_infotags_meta:
                                vep_csq_index2fields[i] = v
                                vep_csq_fields2index[v] = i
                            i = i + 1
                    if identifier == 'DBNSFP':
                        if len(subtags) > 7:
                            effect_predictions_description = "Format: " + '|'.join(
                                subtags[7:])
                        i = 7
                        while (i < len(subtags)):
                            dbnsfp_prediction_algorithms.append(
                                str(
                                    re.sub(r'((_score)|(_pred))"*$', '',
                                           subtags[i])))
                            i = i + 1

    for tag in pcgr_vcf_infotags_meta:
        #if not vcf.contains(tag):
        vcf.add_info_to_header({
            'ID':
            tag,
            'Description':
            str(pcgr_vcf_infotags_meta[tag]['description']),
            'Type':
            str(pcgr_vcf_infotags_meta[tag]['type']),
            'Number':
            str(pcgr_vcf_infotags_meta[tag]['number'])
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    pcgr_onco_xref_map = {
        'SYMBOL': 1,
        'ENTREZ_ID': 2,
        'UNIPROT_ID': 3,
        'APPRIS': 4,
        'UNIPROT_ACC': 5,
        'CHORUM_ID': 6,
        'TUMOR_SUPPRESSOR': 7,
        'ONCOGENE': 8,
        'NETWORK_CG': 9,
        'DISGENET_CUI': 10,
        'CHEMBL_COMPOUND_ID': 11,
        'INTOGEN_DRIVER': 12,
        'ONCOSCORE': 13,
        'CANCER_PREDISPOSITION_SOURCE': 15,
        'CANCER_SUSCEPTIBILITY_CUI': 16,
        'CANCER_SYNDROME_CUI': 17,
        'CANCER_PREDISPOSITION_MOI': 18
    }
    for rec in vcf:
        all_transcript_consequences = []
        if current_chrom is None:
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0
        else:
            if str(rec.CHROM) != current_chrom:
                logger.info(
                    'Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
                current_chrom = str(rec.CHROM)
                num_chromosome_records_processed = 0
        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue
        pcgr_onco_xref = {}
        num_chromosome_records_processed += 1
        if not rec.INFO.get('PCGR_ONCO_XREF') is None:
            for transcript_onco_xref in rec.INFO.get('PCGR_ONCO_XREF').split(
                    ','):
                xrefs = transcript_onco_xref.split('|')
                ensembl_transcript_id = str(xrefs[0])
                pcgr_onco_xref[ensembl_transcript_id] = {}
                for annotation in pcgr_onco_xref_map.keys():
                    annotation_index = pcgr_onco_xref_map[annotation]
                    if annotation_index > (len(xrefs) - 1):
                        continue
                    if xrefs[annotation_index] != '':
                        pcgr_onco_xref[ensembl_transcript_id][
                            annotation] = xrefs[annotation_index]
        for identifier in ['CSQ', 'DBNSFP']:
            if identifier == 'CSQ':
                num_picks = 0
                for csq in rec.INFO.get(identifier).split(','):
                    csq_fields = csq.split('|')
                    if csq_fields[vep_csq_fields2index[
                            'PICK']] == "1":  ## only consider the primary/picked consequence when expanding with annotation tags
                        num_picks += 1
                        j = 0
                        ## loop over all CSQ elements and set them in the vep_info_tags dictionary (for each alt_allele)
                        while (j < len(csq_fields)):
                            if j in vep_csq_index2fields:
                                if csq_fields[j] != '':
                                    rec.INFO[vep_csq_index2fields[j]] = str(
                                        csq_fields[j])
                                    if vep_csq_index2fields[j] == 'Feature':
                                        ensembl_transcript_id = str(
                                            csq_fields[j])
                                        if ensembl_transcript_id in pcgr_onco_xref:
                                            for annotation in pcgr_onco_xref_map.keys(
                                            ):
                                                if annotation == 'CHORUM_ID' or annotation == 'UNIPROT_ACC':
                                                    continue
                                                if annotation in pcgr_onco_xref[
                                                        ensembl_transcript_id]:
                                                    if annotation == 'TUMOR_SUPPRESSOR' or annotation == 'ONCOGENE' or annotation == 'NETWORK_CG':
                                                        rec.INFO[
                                                            annotation] = True
                                                    else:
                                                        if annotation.startswith(
                                                                'CANCER_'):
                                                            if cpsr is True:
                                                                rec.INFO[
                                                                    annotation] = pcgr_onco_xref[
                                                                        ensembl_transcript_id][
                                                                            annotation]
                                                        else:
                                                            rec.INFO[annotation] = pcgr_onco_xref[
                                                                ensembl_transcript_id][
                                                                    annotation]
                                    if vep_csq_index2fields[j] == 'DOMAINS':
                                        domain_identifiers = str(
                                            csq_fields[j]).split('&')
                                        for v in domain_identifiers:
                                            if v.startswith('Pfam_domain'):
                                                rec.INFO['PFAM_DOMAIN'] = str(
                                                    re.sub(
                                                        r'\.[0-9]{1,}$', '',
                                                        re.sub(
                                                            r'Pfam_domain:',
                                                            '', v)))

                                    if vep_csq_index2fields[
                                            j] == 'Existing_variation':
                                        var_identifiers = str(
                                            csq_fields[j]).split('&')
                                        cosmic_identifiers = []
                                        dbsnp_identifiers = []
                                        for v in var_identifiers:
                                            if v.startswith('COSM'):
                                                cosmic_identifiers.append(v)
                                            if v.startswith('rs'):
                                                dbsnp_identifiers.append(v)
                                        if len(cosmic_identifiers) > 0:
                                            rec.INFO[
                                                'COSMIC_MUTATION_ID'] = '&'.join(
                                                    cosmic_identifiers)
                                        if len(dbsnp_identifiers) > 0:
                                            rec.INFO['DBSNPRSID'] = '&'.join(
                                                dbsnp_identifiers)
                            j = j + 1
                        annoutils.set_coding_change(rec)
                    symbol = '.'
                    if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                        symbol = str(
                            csq_fields[vep_csq_fields2index['SYMBOL']])
                    consequence_entry = str(
                        csq_fields[vep_csq_fields2index['Consequence']]
                    ) + ':' + str(symbol) + ':' + str(csq_fields[
                        vep_csq_fields2index['Feature_type']]) + ':' + str(
                            csq_fields[vep_csq_fields2index['Feature']]
                        ) + ':' + str(
                            csq_fields[vep_csq_fields2index['BIOTYPE']])
                    all_transcript_consequences.append(consequence_entry)

            if identifier == 'DBNSFP':
                if not rec.INFO.get('DBNSFP') is None:
                    annoutils.map_variant_effect_predictors(
                        rec, dbnsfp_prediction_algorithms)
        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)
    w.close()
    logger.info('Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf):
        if os.path.getsize(out_vcf) > 0:
            os.system('bgzip -f ' + str(out_vcf))
            os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')
            annotated_vcf = out_vcf + '.gz'
            annoutils.write_pass_vcf(annotated_vcf, logger)
        else:
            annoutils.error_message(
                'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
                logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
            logger)
Exemplo n.º 6
0
def extend_vcf_annotations(query_vcf, pcgr_db_dir, logger, pon_annotation, regulatory_annotation, cpsr, debug):
    """
    Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations (PCGR_ONCO_XREF), e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
    4. Variant effect predictions
    5. Panel-of-normal (blacklisted variants) annotation

    List of INFO tags to be produced is provided by the 'infotags' files in the pcgr_db_dir

    Records without a CSQ tag are not written to the output; their identifiers are
    collected and reported once after the whole file has been processed.

    The output is written next to the query VCF with the '.vcf'/'.vcf.gz' suffix
    replaced by '.annotated.vcf', then bgzipped, tabix-indexed and handed to
    annoutils.write_pass_vcf.

    :param query_vcf: path to the VEP/vcfanno-annotated input VCF
    :param pcgr_db_dir: directory holding 'pcgr_infotags.tsv', 'cpsr_infotags.tsv' and
        'pcgr_onco_xref/pcgr_onco_xref_namemap.tsv'
    :param logger: logger used for progress, warning and error messages
    :param pon_annotation: 0/1 flag; together with regulatory_annotation it decides whether
        'PANEL_OF_NORMALS*' tags are declared in the output header
    :param regulatory_annotation: 0/1 flag; when 1, REGULATORY_ANNOTATION is derived from all
        CSQ blocks of each record; also decides whether 'REGULATORY_*' tags are declared
    :param cpsr: when True, the CPSR infotag file is used and the transcript block is chosen
        by annoutils.get_correct_cpg_transcript instead of taking the first picked block
    :param debug: not used by this function (the bgzip/tabix calls pass debug=False)
    :return: None
    """

    ## read VEP and PCGR tags to be appended to VCF file (only the file that is actually used)
    infotags_filename = 'cpsr_infotags.tsv' if cpsr is True else 'pcgr_infotags.tsv'
    vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_dir, infotags_filename))
    pcgr_onco_xref_map = annoutils.read_genexref_namemap(os.path.join(pcgr_db_dir, 'pcgr_onco_xref', 'pcgr_onco_xref_namemap.tsv'))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$','.annotated.vcf',query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']
    vcf = cyvcf2.VCF(query_vcf)

    ## Decide once which tag families are left out of the header. Any flag combination
    ## other than the three listed ones declares every tag.
    if pon_annotation == 0 and regulatory_annotation == 0:
        excluded_prefixes = ('PANEL_OF_NORMALS', 'REGULATORY_')
    elif pon_annotation == 1 and regulatory_annotation == 0:
        excluded_prefixes = ('REGULATORY_',)
    elif pon_annotation == 0 and regulatory_annotation == 1:
        excluded_prefixes = ('PANEL_OF_NORMALS',)
    else:
        excluded_prefixes = ()

    for tag in sorted(vcf_infotags_meta):
        ## str.startswith with an empty tuple is always False, i.e. nothing is excluded
        if tag.startswith(excluded_prefixes):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({'ID': tag, 'Description': str(tag_meta['description']),'Type':str(tag_meta['type']), 'Number': str(tag_meta['number'])})

    w = cyvcf2.Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0

    ## Map INFO tag -> declared type. Only INFO header lines are considered: a FORMAT
    ## line may reuse the same ID with a different type, which must not influence how
    ## values are written to rec.INFO below.
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if 'HeaderType' not in header_element or header_element['HeaderType'] != 'INFO':
            continue
        if 'ID' in header_element and 'Type' in header_element:
            vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    vars_no_csq = []
    for rec in vcf:
        rec_chrom = str(rec.CHROM)
        if current_chrom is None:
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0
        elif rec_chrom != current_chrom:
            ## chromosome changed - report the finished one and reset the counter
            logger.info(f"Completed summary of functional annotations for {num_chromosome_records_processed} variants on chr{current_chrom}")
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            ## no VEP consequence - remember the variant for the summary warning and skip it
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = f"g.{rec.CHROM}:{pos}{rec.REF}>{alt_allele}"
            vars_no_csq.append(variant_id)
            continue

        num_chromosome_records_processed += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(rec, pcgr_onco_xref_map, xref_tag = "PCGR_ONCO_XREF")

        if regulatory_annotation == 1:
            ## regulatory annotation is derived from all CSQ blocks, not only the picked one
            csq_record_results_all = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = False, csq_identifier = 'CSQ')
            if 'vep_block' in csq_record_results_all:
                vep_csq_records_all = csq_record_results_all['vep_block']
                rec.INFO['REGULATORY_ANNOTATION'] = annoutils.map_regulatory_variant_annotations(vep_csq_records_all)

        csq_record_results_pick = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = True, csq_identifier = 'CSQ')
        if 'vep_all_csq' in csq_record_results_pick:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results_pick['vep_all_csq'])
        if 'vep_block' in csq_record_results_pick:
            vep_csq_records = csq_record_results_pick['vep_block']
            block_idx = 0
            if cpsr is True:
                block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records)
            record = vep_csq_records[block_idx]
            for k in record:
                ## only tags declared as INFO in the header are copied onto the record
                if k not in vcf_info_element_types:
                    continue
                if vcf_info_element_types[k] == "Flag" and record[k] == "1":
                    rec.INFO[k] = True
                elif record[k] is not None:
                    rec.INFO[k] = record[k]
        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    if vars_no_csq:
        logger.warning(f"There were {len(vars_no_csq)} records with no CSQ tag from VEP (was --vep_no_intergenic flag set?). Skipping them and showing (up to) the first 100:")
        print('----')
        print(', '.join(vars_no_csq[:100]))
        print('----')
    w.close()
    if current_chrom is not None:
        ## report the last chromosome, which never triggers the change-of-chromosome log above
        logger.info(f"Completed summary of functional annotations for {num_chromosome_records_processed} variants on chr{current_chrom}")
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, f'bgzip -f {out_vcf}', debug=False)
        check_subprocess(logger, f'tabix -f -p vcf {out_vcf}.gz', debug=False)
        annotated_vcf = f'{out_vcf}.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        ## a missing and an empty output file are reported identically
        error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4', logger)