def get_varcnt(invcf):
    '''Computes number of exonic variants per gene

    Args:
        invcf(str): VARANT annotated VCF

    Returns:
        varcnt(dictionary): Maps (chromosome, gene) to the number of VCF
            records whose prioritized transcript for that gene is
            coding-exonic, is not synonymous and passes the ESPAF cut-off.
            A (chromosome, gene) pair is counted at most once per record.
    '''
    tally = {}
    parser = VCFParser(invcf)
    for rec in parser:
        parser.parseinfo(rec)
        prioritized = vp.prio_trans(vp.parse(rec.info))
        # Keys already counted for this record, so that several ALT alleles
        # hitting the same gene contribute a single count.
        counted = set()
        for altid, per_gene in prioritized.items():
            if altid == 'intergenic':
                continue
            for gene, gene_ant in per_gene.items():
                transcript = gene_ant['TRANSCRIPT']
                key = (rec.chrom, gene)
                # NOTE(review): ESPAF is compared directly with a float here;
                # if the parser yields a list for this field the comparison
                # will not behave as intended - confirm the field type.
                # TODO (to be replaced by ExAC?)
                qualifies = (
                    'CodingExonic' in transcript.region.split('_')
                    and transcript.mutation != 'Syn'
                    and rec.info['ESPAF'] < 5.0
                    and key not in counted
                )
                if not qualifies:
                    continue
                counted.add(key)
                tally[key] = tally.get(key, 0) + 1
    return tally
def vtexonic(self, rec):
    """Check if variant is in the exonic region (varant annotated).

    Returns True as soon as one annotated transcript carries a high-impact
    mutation type, a splice donor/acceptor annotation, or the
    CDS_NOT_MULTIPLE_OF_3 warning; returns False when none does.
    """
    high_impact = 'StopGain StopLoss StartLoss NonSyn FrameShiftInsert FrameShiftDelete NonFrameShiftInsert NonFrameShiftDelete'.split(
    )
    splice_sites = ['SpliceDonor', 'SpliceAcceptor']
    warnings = ['CDS_NOT_MULTIPLE_OF_3']

    for per_gene in vp.parse(rec.info).values():
        for gene_data in per_gene.values():
            if not gene_data:
                # No annotation payload for this gene.
                continue
            for tx in gene_data['TRANSCRIPTS']:
                if (tx.mutation in high_impact
                        or tx.splice in splice_sites
                        or tx.warning in warnings):
                    return True
    return False
def ranking_vcf(self):
    '''
    this function is obsolete and replaced by vcf2xls_varant()

    Annotate every record of self.vcf with a DVN INFO field holding the
    largest Divine gene damage score (from self.gene_dmg) among the genes
    annotated on that record, or 0. when none of them has a score. The
    annotated records are written to '<self.vcf>.ranked', which then
    replaces self.vcf.
    '''
    import gcn.lib.io.vcf as vcf
    job_name = 'ranking_vcf'
    msg = 'annotating Divine prediction score into filtered VCF ... [%s;%s]' % (
        job_name, self.vcf)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    ranked_vcf = '%s.ranked' % self.vcf
    # The context manager / finally pair guarantees both streams are closed
    # even when parsing or writing fails part-way through.
    with open(ranked_vcf, 'w') as ostream:
        v = vcf.VCFParser(self.vcf)
        try:
            v.add_meta_info("DVN", "1", "Float",
                            "Gene damage score predicted by Divine:%s" % self.command)
            v.writeheader(ostream)
            for rec in v:
                v.parseinfo(rec)
                vpop = vp.parse(rec.info)
                # Reset per record so a score never leaks from the previous
                # record, and compare against the running maximum (the old
                # code initialised one name and updated/read another).
                max_dmg_score = 0.
                for val in vpop.values():
                    for gene in val:
                        if gene in self.gene_dmg:
                            gene_score = self.gene_dmg[gene]
                            if gene_score > max_dmg_score:
                                max_dmg_score = gene_score
                rec.info.DVN = max_dmg_score
                v.write(ostream, rec)
        finally:
            v.stream.close()

    # Only replace the input once the ranked file has been fully written.
    os.rename(ranked_vcf, self.vcf)
    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
def _store_variants(self, beta_fits):
    '''
    collect essential info on each variant

    Reads self.vcf and returns a list with one entry per (record, gene):
    [gene, INDEL, CLASS_TAG, protein_len, px, maf_offset], where px is the
    CADD- or GERP-derived score (0.5 when neither is available) and
    maf_offset is the non-negative MAF term computed from self.dm.beta1 and
    self.dm.beta2. beta_fits is forwarded to the CADD/GERP score helpers.
    '''
    job_name = '_store_variants'
    msg = 'collecting variant information and class label to determine genetic damage [%s;%s]...' % (
        job_name, self.vcf)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    # (database flag, allele-frequency field) in lookup order:
    # ExAC, ESP, and 1K
    maf_sources = (('EXACDB', 'EXACAF'), ('ESPDB', 'ESPAF'), ('KGDB', 'KGAF'))

    collected = []
    parser = vcf.VCFParser(self.vcf)
    for rec in parser:
        parser.parseinfo(rec)

        normalized = normalize_variant(rec.chrom, rec.pos, rec.ref, rec.alt)
        mut_type = normalized[0][-1]
        if ':' in rec.id[0]:
            mut_type = 'mnp'

        # conservation prediction scores (CADD and GERP++)
        cadd_aa = './.'
        px_cadd = None
        if rec.info.CADD_raw:
            # CADD_raw (average)
            px_cadd, cadd_aa = vcf.get_CADD_scores(mut_type, rec.info.CADD_aa,
                                                   rec.info.CADD_raw,
                                                   beta_fits)
        px_gerp = None
        if rec.info.GerpConserve:
            px_gerp = vcf.get_GERP_scores(mut_type, cadd_aa,
                                          rec.info.GerpRSScore, beta_fits)

        # pick the score: CADD when enabled and present, else GERP, else 0.5
        if self.cadd > 0 and px_cadd is not None:
            px = px_cadd
        elif px_gerp is not None:
            px = px_gerp
        else:
            px = 0.5

        annotations = vp.parse(rec.info)

        # first available MAF wins
        maf = 0.
        for db_flag, af_field in maf_sources:
            if getattr(rec.info, db_flag):
                maf = float(getattr(rec.info, af_field)[0])
                break

        # significance of the MAF, clamped at zero from below
        # NOTE(review): math.exp(1000. * maf) overflows for maf above ~0.709
        maf_offset = 0.
        if maf > 0:
            maf_offset = (
                1. - self.dm.beta1 * math.exp(1000. * maf)) / self.dm.beta2
            if maf_offset < 0.:
                maf_offset = 0.

        seen_genes = []
        for per_gene in annotations.values():
            # for each gene involved with the variant
            for gene, gene_data in per_gene.items():
                # transcript length: first non-empty protein_len, else average
                protein_len = self.dm.avg_protein_len
                if gene_data:
                    for tx in gene_data['TRANSCRIPTS']:
                        if tx.protein_len:
                            protein_len = float(tx.protein_len)
                            break
                # one entry per gene and record
                if gene in seen_genes:
                    continue
                collected.append([
                    gene, rec.info.INDEL, rec.info.CLASS_TAG, protein_len,
                    px, maf_offset
                ])
                seen_genes.append(gene)

    # done reading filtered VCF file
    parser.stream.close()
    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
    return collected
def vtexonic(self, rec):
    """
    Check if variant is in the exonic region (varant annotated)

    Returns a list of [trans_id, region, protein_len, inherited, tag]
    entries, one per transcript that lies in an included region
    (self.regincl) and is tagged as 'splc_coding' (optionally ';hpm'),
    'hpm', 'warn' or 'splc_ext_intron'. When no transcript qualifies and
    self.dconf['regulome'] is set, a single flat 5-item list for the first
    annotated transcript is returned instead. Otherwise returns None.

    TODO: apply a max distance of donor/acceptor distance; a cascade filter
    """
    high_impact = 'StopGain StopLoss StartLoss NonSyn FrameShiftInsert FrameShiftDelete NonFrameShiftInsert NonFrameShiftDelete'.split(
    )
    coding_splice = ['ESS', 'ESE']
    warnings = ['CDS_NOT_MULTIPLE_OF_3']
    intronic = ['CodingIntronic', 'NonCodingIntronic']
    exonic = ['CodingExonic', 'NonCodingExonic']
    utr = ['UTR5', 'UTR3']

    # (pattern for the first cDNA part, pattern for the second part) and the
    # regex group holding the distance, per region family.
    intron_patterns = (r'c\.(.+)[+-](\d+)', r'(.+)[+-](\d+)')
    utr_patterns = (r'c\.[-\*](\d+)', r'[-\*](\d+)')

    annotations = vp.parse(rec.info)
    hits = []
    for per_gene in annotations.values():
        for gene_data in per_gene.values():
            if not gene_data:
                continue

            # to extract recessive/dominant
            # TODO:https://ghr.nlm.nih.gov/handbook/inheritance/inheritancepatterns
            inherited = 'as_recessive'
            for mim in gene_data['MIM_PHENS']:
                if 'AUTOSOMAL_DOMINANT' in mim:
                    inherited = 'as_dominant'

            for tx in gene_data['TRANSCRIPTS']:
                if not (self.regincl == 'all' or tx.region in self.regincl):
                    continue
                entry = [tx.trans_id, tx.region, tx.protein_len, inherited]

                if tx.region in exonic:  # CodingExonic,NonCodingExonic
                    tag = None
                    if tx.splice in coding_splice:
                        tag = 'splc_coding'
                        if tx.mutation in high_impact:
                            tag += ';hpm'
                    elif tx.mutation in high_impact:
                        tag = 'hpm'
                    elif tx.warning in warnings:
                        tag = 'warn'
                    if tag is not None:
                        entry.append(tag)
                        hits.append(entry)
                elif self.dconf['splice_dist'] > 0:
                    cdna_parts = tx.cdna.split('_')
                    if tx.region in intronic:
                        patterns, dist_group = intron_patterns, 2
                    elif tx.region in utr:
                        patterns, dist_group = utr_patterns, 1
                    else:
                        patterns, dist_group = (), 0
                    # zip stops after the first part when the cDNA string
                    # has no second '_'-separated part.
                    dists = []
                    for pattern, part in zip(patterns, cdna_parts):
                        found = re.search(pattern, part)
                        if found:
                            dists.append(int(found.group(dist_group)))
                    if dists and min(dists) <= self.dconf['splice_dist']:
                        entry.append('splc_ext_intron')
                        hits.append(entry)

    if hits:
        return hits
    if not self.dconf['regulome']:
        return None

    # Regulome fallback: report the first annotated transcript only.
    for per_gene in annotations.values():
        for gene_data in per_gene.values():
            if not gene_data:
                continue
            inherited = 'as_recessive'
            for mim in gene_data['MIM_PHENS']:
                if 'AUTOSOMAL_DOMINANT' in mim:
                    inherited = 'as_dominant'
                    break
            for tx in gene_data['TRANSCRIPTS']:
                if rec.info.RegulomeScore:
                    return [
                        tx.trans_id, tx.region, tx.protein_len, inherited,
                        'regulome'
                    ]
                return [tx.trans_id, None, None, inherited, None]
    return None
def in_gene(self, rec, genes):
    """Return True if any gene annotated on the variant is in *genes*.

    When no annotated gene matches, the function returns None (falsy)
    rather than False.
    """
    for per_gene in vp.parse(rec.info).values():
        for gene in per_gene:
            if gene in genes:
                return True
    return None
def _ggd_store_coding_exonic(store, n_slots, slot, rec, va, altid, pg,
                             genotypes):
    """Record the CodingExonic transcripts of ALT allele *altid* in
    store[(gene, trans_id)][slot], creating *n_slots* empty dicts on first use."""
    if altid == 0 or altid not in va:
        return
    # list(...) keeps this working on Python 3, where dict.keys() is a view
    # and cannot be indexed.
    gene_names = list(va[altid].keys())
    if not gene_names:
        return
    # Only the first annotated gene of the allele is considered.
    gene = gene_names[0]
    gene_ant = va[altid][gene]
    if len(gene_ant) > 0:
        for ta in gene_ant['TRANSCRIPTS']:
            if ta.region != 'CodingExonic':
                continue
            key = (rec.chrom, rec.pos, ','.join(rec.id), rec.ref,
                   rec.alt[altid - 1], altid)
            gi = (gene, ta.trans_id)
            if gi not in store:
                store[gi] = [{} for _ in range(n_slots)]
            store[gi][slot][key] = [ta.mutation, pg, genotypes[0],
                                    genotypes[1]]


def _ggd_is_truncating(mutation):
    """Tell whether a mutation label is a frameshift or a stop-gain."""
    return mutation.startswith('FrameShift') or mutation == 'StopGain'


def _ggd_select(collected, is_hit):
    """Keep the variant dicts holding more than one variant of which at
    least one satisfies *is_hit*, grouped under their (gene, trans_id) key."""
    selected = {}
    for gi, slots in collected.items():
        for variants in slots:
            if len(variants) > 1 and any(is_hit(x) for x in variants.values()):
                selected.setdefault(gi, []).append(variants)
    return selected


def get_gene_data(vcffile, pedigree, GQ_THRES):
    """Retrieves gene_transcript wise variants where there exits at least one
    frameshift/stopgain mutation.

    Only records whose FILTER contains 'PASS' or
    'VQSRTrancheSNP99.00to99.90' and for which check_genotype() returns
    genotypes are used, and only CodingExonic transcripts are recorded.

    Args:
        - vcffile(str): Input VCF file.
                        Note - VCF should be VARANT annotated.
        - pedigree(list): [Father SampleID, Mother SampleID, Child SampleID].
                          Expects the order in which the SampleIDs are
                          mentioned above.
        - GQ_THRES(int): Threshold Genotype Quality

    Returns:
        - gene_data_phased(dictionary): Maps (gene, trans_id) to a list of
            per-haplotype variant dicts. A dict is kept when it holds more
            than one variant and at least one is Frameshift/Stopgain.
        - gene_data_unphased(dictionary): Same layout for unphased
            genotypes. A dict is kept when it holds more than one variant
            and at least one is a Frameshift/Stopgain whose two allele
            indexes are equal (homozygous state).

        Each variant dict maps (chrom, pos, id, ref, alt, altid) to
        [mutation, phased genotype, genotypes[0], genotypes[1]].
    """
    FILTER = ('PASS', 'VQSRTrancheSNP99.00to99.90')
    phased = {}
    unphased = {}

    parser = vcf.VCFParser(vcffile)
    for rec in parser:
        parser.parseinfo(rec)
        parser.parsegenotypes(rec)
        varfltr = rec['filter']
        if not any(flt in varfltr for flt in FILTER):
            continue
        genotypes = check_genotype(rec, pedigree, GQ_THRES)
        if not genotypes:
            continue

        pg = phase(*genotypes)
        # NOTE(review): single-position indexing assumes one-character
        # allele indexes (e.g. '0|1'); confirm phase() never yields >9.
        alleles = [int(pg[0]), int(pg[-1])]
        va = vp.parse(rec.info)
        if pg[1] == '|':
            # Phased: keep the two haplotypes apart (slot 0 and slot 1).
            for slot, altid in enumerate(alleles):
                _ggd_store_coding_exonic(phased, 2, slot, rec, va, altid, pg,
                                         genotypes)
        else:
            # Unphased: both alleles share a single slot.
            for altid in alleles:
                _ggd_store_coding_exonic(unphased, 1, 0, rec, va, altid, pg,
                                         genotypes)

    gene_data_phased = _ggd_select(
        phased, lambda x: _ggd_is_truncating(x[0]))
    # x[1] is the genotype string; positions 0 and 2 are its two alleles.
    gene_data_unphased = _ggd_select(
        unphased,
        lambda x: _ggd_is_truncating(x[0]) and int(x[1][0]) == int(x[1][2]))
    return gene_data_phased, gene_data_unphased
def _extract_mutation_info(self, beta_fits):
    '''
    objective: to extract, for every (record, gene) of the annotated/filtered
    VCF file, the list
    [gene_qsymbol, variant_type, variant_class_tag, transcript_length,
     insillico_prediction_score, MAF_significance_offset,
     pathogenic_domain_p, pathogenic_domain_density].

    When self.proband_id is set the VCF is also rewritten in place: the
    'Exonic', 'Annotation' and 'Compounds' INFO keys are removed (to get rid
    of some redundancy in VCF info) and the genmod inheritance model is
    appended to CLASS_TAG.

    beta_fits is forwarded to the CADD/GERP score helpers.
    Returns the list of per-gene entries described above.
    '''
    job_name = '_extract_mutation_info'
    msg = 'collecting variant information and class label to determine genetic damage [%s;%s]...' % (
        job_name, self.vcf)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    rewrite_vcf = bool(self.proband_id)
    rmInfo = ['Exonic', 'Annotation', 'Compounds']
    vcf_tmp = self.vcf + '.tmp'

    # First pass over the VCF: gather pathogenic-domain scores.
    if rewrite_vcf:
        v = vcf.VCFParser(self.vcf, sampleids=[self.proband_id])
    else:
        v = vcf.VCFParser(self.vcf)
    try:
        pdom, pdom0 = self.gather_pdomain_scores(v)
    finally:
        v.stream.close()

    msg = 'Importing max transcript length for each gene ...'
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
    cds_lens = Refgene().get_max_cds_length()
    # .items() instead of the Python 2-only .iteritems()
    tx_lens = {}
    for gene, cds_len in cds_lens.items():
        tx_lens[gene] = int(cds_len / 3.)

    mutation_info = []
    ostream = open(vcf_tmp, 'w') if rewrite_vcf else None
    try:
        # Second pass: per-record annotation (and optional rewrite).
        v = vcf.VCFParser(self.vcf)
        try:
            if rewrite_vcf:
                v.writeheader(ostream, to_del_info=rmInfo)

            for ridx, rec in enumerate(v):
                v.parseinfo(rec)

                # to remove redundant gene symbols annotated by genmod but
                # add transcript version
                if rewrite_vcf:
                    for rkey in rmInfo:
                        v.delete_info(rec, rkey)
                    if rec.info.GeneticModels:
                        genmod_tag = lib_ped.parse_genmod_inherit_model(
                            rec.info.GeneticModels[0].split(':')[1])
                        rec.info.CLASS_TAG += genmod_tag
                    v.write(ostream, rec)

                varlist = normalize_variant(rec.chrom, rec.pos, rec.ref,
                                            rec.alt)
                mut_type = varlist[0][-1]
                if ':' in rec.id[0]:
                    mut_type = 'mnp'

                # collect conservation prediction score (CADD and GERP++)
                cadd_aa = './.'
                px_cadd = None
                if rec.info.CADD_raw:
                    # to get CADD_raw (average)
                    px_cadd, cadd_aa = vcf.get_CADD_scores(
                        mut_type, rec.info.CADD_aa, rec.info.CADD_raw,
                        beta_fits)

                # to get GERP++ score
                px_gerp = None
                if rec.info.GerpConserve:
                    px_gerp = vcf.get_GERP_scores(
                        mut_type, cadd_aa, rec.info.GerpRSScore, beta_fits)

                # which score can be chosen
                px = 0.5
                if self.cadd > 0 and px_cadd is not None:
                    px = px_cadd
                elif px_gerp is not None:
                    px = px_gerp

                vpop = vp.parse(rec.info)

                # to get MAF in the order of ExAC, ESP, and 1K
                if rec.info.EXACDB:
                    maf = get_min_maf(rec.info.EXACAF[0])
                elif rec.info.ESPDB:
                    maf = get_min_maf(rec.info.ESPAF[0])
                elif rec.info.KGDB:
                    maf = get_min_maf(rec.info.KGAF[0])
                else:
                    maf = 0.

                # to compute a significance of MAF
                maf_offset = self.dm.get_maf_xoffset(maf)

                # pathogenic-domain scores of the row whose ridx equals the
                # position of this record in the VCF
                pdom_idx = pdom.index[pdom.ridx == ridx].tolist()
                if pdom_idx:
                    patho_p = pdom.phat_lo[pdom_idx[0]]
                    patho_pden = pdom.patho_dens_p[pdom_idx[0]]
                else:
                    # assign a default pathogenic domain value
                    # (15% quantile value)
                    patho_p = pdom0.phat_lo
                    patho_pden = pdom0.patho_dens_p

                vartype = get_var_type(rec.ref, rec.alt)

                genes = []
                for val in vpop.values():
                    # for each gene involved with the variant
                    for gene in val:
                        # store a set of essential annotation to be used for
                        # genetic damage, once per gene and record
                        if gene in genes:
                            continue
                        # transcript length: refgene max CDS length when
                        # known, the model average otherwise
                        protein_len = tx_lens.get(gene,
                                                  self.dm.avg_protein_len)
                        mutation_info.append([
                            gene, vartype, rec.info.CLASS_TAG, protein_len,
                            px, maf_offset, patho_p, patho_pden
                        ])
                        genes.append(gene)
        finally:
            # Closed on both the rewrite and the read-only path (the latter
            # used to leave the parser stream open).
            v.stream.close()
    finally:
        if ostream is not None:
            ostream.close()

    # done reading filtered VCF file
    if rewrite_vcf:
        os.rename(vcf_tmp, self.vcf)

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
    return mutation_info
def _load(invcf, thres_af, nmethod, data=None, sc=1):
    """Collect, per sample and gene, the variants that meet a search criterion.

    Args:
        invcf: VCF file handed to VCFParser.
        thres_af: allele-frequency threshold forwarded to isRare().
        nmethod: forwarded to _is_Damaging() (used by search criterion 2).
        data: optional {sample: {gene: [entries]}} dictionary to extend; a
            new one is created when None.
        sc: search criterion.
            1 - ClinVar Pathogenic / Likely pathogenic.
            2 - criterion 1, or damaging protein-altering SNV, nonsense,
                splicing, protein-altering indel, or dbscSNV 'Damaging'.
            3 - criterion 2 without the damaging-prediction requirement on
                protein-altering SNVs, plus intronic and UTR variants.
            Any other value selects nothing.

    Returns:
        The (possibly caller-supplied) data dictionary. Each entry is the
        tuple (transcript, af, cadd, gt, eqtl_flag, var, disease_name,
        clinical_significance, lcr, [scpred, ada_score, rf_score],
        snps3d_pred).
    """
    # 'is None' rather than truthiness: an empty dict passed by the caller
    # must be filled in place, not silently replaced.
    if data is None:
        data = {}
    pathogenic = ('Pathogenic', 'Likely pathogenic')

    vcfs = VCFParser(invcf)
    samples = vcfs.samples
    for rec in vcfs:
        vcfs.parseinfo(rec)
        vcfs.parsegenotypes(rec)
        if not _is_HQVar(rec.filter):  # Checks variant is PASS
            continue
        for sid in samples:
            if sid not in data:
                data[sid] = {}
            gi = rec[sid]
            gt, gq = gi.GT, gi.GQ
            # Genotype / genotype-quality gate
            if not _genotype_check(gt, gq):
                continue
            # NOTE(review): only the second allele of an unphased 'a/b'
            # genotype is used - confirm phased ('a|b') calls cannot occur.
            altid = int(gt.split('/')[1])
            alt = rec.alt[altid - 1]
            var = rec.chrom + ':' + str(rec.pos) + ':' + rec.ref + ':' + alt

            af, flag = isRare(altid, rec.info, thres_af)
            if not flag:  # Variant is not rare
                continue
            if 'LCR' in rec.info:
                continue
            # Records flagged LCR never get past the check above, so the
            # stored flag is always empty.
            lcr = ''

            # ClinVar disease name and clinical significance
            dn, cln_sig = '', ''
            if 'CLNDBN' in rec.info:
                dn = rec.info.CLNDBN[altid - 1]
                sig_raw = rec.info.CLNSIG[altid - 1]
                # '|'-separated codes; '.' means missing
                codes = [int(e) for e in sig_raw.split('|') if e != '.']
                if codes:
                    # the numerically largest code decides
                    cln_sig = CLNSIG_MAP[max(codes)]
                else:
                    dn = ''

            cadd = ''
            if 'CADD_phred' in rec.info:
                val = rec.info['CADD_phred'][altid - 1]
                if val != '.':
                    cadd = float(val)

            # dbscSNV splicing prediction, single-nucleotide changes only
            if len(rec.ref) == len(alt) and len(rec.ref) == 1:
                ada_score, rf_score = get_dbscSNV_ant(rec.chrom, rec.pos,
                                                      rec.ref, alt)
                if (ada_score and ada_score > 0.6) or \
                        (rf_score and rf_score > 0.6):
                    scpred = 'Damaging'
                else:
                    scpred = ''
            else:
                ada_score, rf_score, scpred = '', '', ''
            sc_ant = [scpred, ada_score, rf_score]

            # Parse annotation and prioritize transcript
            pa = vp.prio_trans(vp.parse(rec.info))
            # Ignore the intergenic variants
            if altid not in pa:
                continue

            eqtl_flag = False
            for gene, ant in pa[altid].items():
                ta = ant['TRANSCRIPT']
                snps3d_pred = ['', '', '', '']
                in_clinvar = cln_sig in pathogenic

                if sc == 1:
                    # SC-1 variant present in Clinvar as Pathogenic or
                    # Likely Pathogenic
                    selected = in_clinvar
                elif sc == 2:
                    # SC-2 variant is protein altering + SC-1
                    # (snps3d_pred is handed to _is_Damaging)
                    selected = (
                        (_is_PASnv(ta) and _is_Damaging(
                            altid, rec.info, ta, snps3d_pred, nmethod))
                        or _is_NonSense(ta) or _is_Splicing(ta)
                        or _is_PAIndel(ta) or scpred == 'Damaging'
                        or in_clinvar)
                elif sc == 3:
                    # SC-3 intronic and UTR variants + SC-1 + SC-2
                    selected = (
                        _is_Intronic(ta) or _is_UTR(ta) or _is_PASnv(ta)
                        or _is_NonSense(ta) or _is_Splicing(ta)
                        or _is_PAIndel(ta) or scpred == 'Damaging'
                        or in_clinvar)
                else:
                    selected = False

                if selected:
                    data[sid].setdefault(gene, []).append(
                        (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig, lcr,
                         sc_ant, snps3d_pred))
    return data