def _makedb(self):
    """Internal method. Do not use.

    Builds the REFMRNA database: for every RefGene entry on the human
    chromosomes chr1-chr22, chrX, chrY and chrM the spliced sequence is
    fetched with ``self.getseq`` and stored (lower-cased) together with
    the gene id, the RefSeq accession and the sequence length.  Rows are
    written through ``self._insert`` in batches of 1000, and a version
    record derived from the modification time of the RefGene source file
    is added at the end.

    Exits the process with status 1 when ``self.inname`` does not exist.
    """
    self.logger.info('Creating REFMRNA database ...')
    self.logger.info('Input file: %s' % self.inname)
    if not os.path.exists(self.inname):
        self.logger.error('%s: No such file' % self.inname)
        self.logger.error('Database not created')
        sys.exit(1)

    self.load(db=self.outname)
    for schema in SCHEMA:
        self.createtable(schema, True)
    self.curs = self.conn.cursor()

    refgene_db = Refgene()
    chromosomes = ['chr%d' % num for num in range(1, 23)]
    chromosomes.extend(['chrX', 'chrY', 'chrM'])

    batch_size = 1000
    pending = []
    entry_cnt = 0
    for chrom in chromosomes:
        self.logger.info('Processing chrom-%s' % (chrom))
        self.load_hg(chrom)
        for entry in refgene_db.iterate_db(chrom):
            entry_chrom = entry.chrom
            # Entries whose chromosome name contains '_' are skipped.
            if '_' in entry_chrom:
                continue
            exon_starts = list(map(int, entry.str_exon_pos.split(',')))
            exon_ends = list(map(int, entry.end_exon_pos.split(',')))
            seq = self.getseq(entry_chrom, exon_starts, exon_ends,
                              entry.strand)
            if seq:
                pending.append((entry.gid, entry.refseq_acc,
                                len(seq), seq.lower()))
            # Flush a full batch; a fresh list is bound afterwards so the
            # list handed to _insert is never modified.
            if len(pending) == batch_size:
                self._insert(pending)
                entry_cnt += batch_size
                pending = []

    # Write whatever is left over from the last, partial batch.
    if pending:
        self._insert(pending)
        entry_cnt += len(pending)

    # Add version details
    refgene_path = dbconfig.DBCONFIG['REFGENE']['name']
    fd = dt.fromtimestamp(os.path.getmtime(refgene_path)).strftime('%Y-%m-%d')
    year, month = fd.split('-')[:2]
    version = "v%s_%s" % (year, month)
    self.set_version('refmrna', fd, version, entry_cnt)

    self.conn.commit()
    self.curs.close()
    self.conn.close()
    self.logger.info('... REFMRNA database created')
def get_genelens():
    '''Computes gene's CDS length based on RefGene definition

    All RefGene records of a gene (on one chromosome) are pooled: their
    exons are collected into one set and the CDS span is taken as the
    smallest CDS start / largest CDS end seen over those records.  The
    CDS length is then summed over the sorted exons that fall inside
    that span.

    Returns:
        gene_lens(dictionary): Maps ``(chrom, gene)`` to the gene's CDS
            length; ``chrom`` has its leading 'chr' removed.
    '''
    gene_lens = {}
    gene_exons = {}
    gene_cds_se = {}
    rfg = Refgene()
    for rec in rfg.iterate_db():
        chrom = rec.chrom
        if 'chr' in chrom:
            chrom = chrom[3:]
        key = (chrom, rec.gene)

        # Explicit first-seen initialisation instead of a bare ``except``,
        # so a malformed coordinate surfaces as its own error without
        # first discarding the exons already collected for this gene.
        exons = gene_exons.setdefault(key, set())
        for exs, exe in zip(rec.str_exon_pos.split(','),
                            rec.end_exon_pos.split(',')):
            exons.add((int(exs), int(exe)))

        # The CDS bounds are per record, so widen the span once per record
        # rather than once per exon.
        str_cds = int(rec.str_cds)
        end_cds = int(rec.end_cds)
        if key in gene_cds_se:
            span = gene_cds_se[key]
            if str_cds < span[0]:
                span[0] = str_cds
            if end_cds > span[1]:
                span[1] = end_cds
        else:
            gene_cds_se[key] = [str_cds, end_cds]

    for chrom_gene, exons in gene_exons.items():
        str_cds, end_cds = gene_cds_se[chrom_gene]
        cds_len = 0
        in_cds = False
        # NOTE(review): exons pooled from several transcripts may overlap,
        # in which case the shared bases are counted more than once --
        # confirm whether that is intended.
        for spos, epos in sorted(exons):
            has_start = spos <= str_cds <= epos
            has_end = spos <= end_cds <= epos
            if has_start and has_end:
                # Whole CDS lies inside this exon.
                cds_len += end_cds - str_cds
                break
            elif has_start:
                in_cds = True
                cds_len += epos - str_cds
            elif has_end:
                cds_len += end_cds - spos
                break
            elif in_cds:
                # Exon entirely between CDS start and CDS end.
                cds_len += epos - spos
        gene_lens[chrom_gene] = cds_len
    return gene_lens
def known_pathov_stats(reuse=True, has_hgmd_license=False):
    """
    to retrieve variant types (LOF, missense, etc) from known pathogenic
    mutation database (clinvar or HGMD)

    The per-gene profile is cached with ``dill`` in the file configured as
    ``FILECONFIG['PATHOG_PROF']``.

    :param reuse: when true and the cache file exists, load the profile
        from it; otherwise rebuild the profile and overwrite the cache.
    :param has_hgmd_license: passed to ``pathogenic_per_gene`` as
        ``hgmd_on``.
    :return: the object produced by ``pathogenic_per_gene`` (or its cached
        copy).
    """
    pathog_prof_pyv = fileconfig.FILECONFIG['PATHOG_PROF']
    if reuse and os.path.exists(pathog_prof_pyv):
        msg = 'loading some statistics on known pathogenic variants (%s) ...' % pathog_prof_pyv
        msgout('notice', msg)
        # NOTE(review): dill.load can execute arbitrary code while
        # unpickling; the cache file must come from a trusted location.
        # ``with`` closes the handle even if loading fails.
        with open(pathog_prof_pyv, 'rb') as fp:
            pathov_prof_gene = dill.load(fp)
    else:
        refgene = Refgene()
        cds_len_per_gene = refgene.get_cds_len_per_gene()
        pathov_prof_gene = pathogenic_per_gene(cds_len_per_gene, hgmd_on=has_hgmd_license)
        # The cache is opened only after the profile was computed, and is
        # closed even if dumping fails.
        with open(pathog_prof_pyv, 'wb') as fpw:
            dill.dump(pathov_prof_gene, fpw)
    #TODO: use SVM to infer optimal variables to classify benign vs. pathogenic
    return pathov_prof_gene
def __init__(self, gene):
    """Remember the gene and create the annotation objects used later.

    Creates a ``Refgene`` and a ``Refmrna`` object, stores the result of
    ``self.load_refgene`` for the gene in ``self.genelist`` and finally
    creates a ``Splicedb`` object.
    """
    self.gene = gene
    # Evaluated left to right, i.e. Refgene() is built before Refmrna().
    self.refgene, self.refmrna = Refgene(), Refmrna()
    gene_records = self.load_refgene(self.gene)
    self.genelist = gene_records
    self.splicedb = Splicedb()
class GeneMap():
    """Maps the coding positions of a gene's RefGene transcripts to
    protein positions.

    Attributes set by the constructor:
        gene: the gene passed in.
        refgene / refmrna / splicedb: ``Refgene``, ``Refmrna`` and
            ``Splicedb`` objects used for lookups.
        genelist: RefGene records returned by ``load_refgene``.
    """

    def __init__(self, gene):
        self.gene = gene
        self.refgene = Refgene()
        self.refmrna = Refmrna()
        self.genelist = self.load_refgene(self.gene)
        self.splicedb = Splicedb()

    def load_refgene(self, gene):
        '''Loads the refgene data for given Gene

        Returns the list of records yielded by
        ``self.refgene.iterate_db(gene=gene)``.
        '''
        # Honour the argument (it used to be ignored in favour of
        # self.gene; the constructor passes self.gene, so it is unaffected).
        return list(self.refgene.iterate_db(gene=gene))

    def get_cds_exonnum(self, frames):
        '''Returns the Start coding exon number and End Coding exon number

        ``frames`` is a sequence of per-exon frame values in which -1
        marks an exon without a frame.  The start number is the count of
        leading -1 entries; the end number is the start number plus the
        count of all non -1 entries, minus one.
        '''
        str_cds_exon_number = 0
        for frame in frames:
            if frame != -1:
                break
            str_cds_exon_number += 1
        coding_exons = sum(1 for frame in frames if frame != -1)
        end_cds_exon_number = str_cds_exon_number + coding_exons - 1
        return str_cds_exon_number, end_cds_exon_number

    def translate(self, mrna):
        '''Translates a coding sequence into a protein string.

        The last three characters of ``mrna`` are dropped before
        translation and a trailing incomplete codon is ignored.  Each
        codon is upper-cased and looked up in ``CODONTABLE['codons']``; an
        unknown codon raises ``KeyError``.
        '''
        coding = mrna[:-3]
        usable = len(coding) - len(coding) % 3
        codons = CODONTABLE['codons']
        # join() instead of repeated string concatenation.
        return "".join(codons[coding[pos:pos + 3].upper()]
                       for pos in range(0, usable, 3))

    def _expand_exon(self, entry):
        '''Internal Method

        Walks the exons of one RefGene record base by base and returns
        ``(gene_info, protein_seq)``.  ``gene_info`` holds one row per
        coding base:
        ``[cds_number, chrom, coordinate, strand, base, exon_label,
        splice_annotation]``; ``protein_seq`` is ``translate`` applied to
        the concatenated coding bases.
        '''
        chrom = entry.chrom
        strand = entry.strand
        refseq_acc = entry.refseq_acc
        seq = self.refmrna.retrieve(entry.refseq_acc, entry.gid).sequence
        str_cds = int(entry.str_cds)
        end_cds = int(entry.end_cds)
        exon_str = [int(e) for e in entry.str_exon_pos.split(',')]
        exon_end = [int(e) for e in entry.end_exon_pos.split(',')]
        frames = [int(e) for e in entry.frames.split(',')]
        cds_flag = False   # True while the walk is inside the CDS
        cds_cnt = 0        # running number of coding bases
        neu_cnt = 0        # index of the current base within ``seq``
        cds_seq = ""
        gene_info = []
        if strand == '+':
            exon_cnt = 0
            for start, end, frame in zip(exon_str, exon_end, frames):
                exon_cnt += 1
                if frame != -1:
                    for i in range(start, end):
                        if i == str_cds:
                            cds_flag = True
                        elif i == end_cds:
                            cds_flag = False
                        if cds_flag:
                            splice_reg_site = self.splicedb.get_annot(
                                chrom, i + 1, refseq_acc)
                            if not splice_reg_site:
                                splice_reg_site = ''
                            cds_cnt += 1
                            # NOTE(review): the label is 'E-<n>' here but
                            # 'E<n>' on the '-' strand -- confirm intended.
                            gene_info.append([cds_cnt, chrom, i + 1, strand,
                                              str(seq[neu_cnt]),
                                              'E-' + str(exon_cnt),
                                              splice_reg_site])
                            cds_seq += seq[neu_cnt]
                        neu_cnt += 1
                else:
                    # Exon without a frame: skip its bases in ``seq``.
                    neu_cnt += end - start
        elif strand == '-':
            exon_cnt = len(exon_str) + 1
            exon_str.reverse()
            exon_end.reverse()
            frames.reverse()
            for start, end, frame in zip(exon_str, exon_end, frames):
                exon_cnt -= 1
                if frame != -1:
                    for i in reversed(range(start + 1, end + 1)):
                        if i == end_cds:
                            cds_flag = True
                        elif i == str_cds:
                            cds_flag = False
                        if cds_flag:
                            # NOTE(review): the annotation is looked up at
                            # i + 1 while the row records coordinate i --
                            # confirm this offset is intended.
                            splice_reg_site = self.splicedb.get_annot(
                                chrom, i + 1, refseq_acc)
                            if not splice_reg_site:
                                splice_reg_site = ''
                            cds_cnt += 1
                            gene_info.append([cds_cnt, chrom, i, strand,
                                              str(seq[neu_cnt]),
                                              'E' + str(exon_cnt),
                                              splice_reg_site])
                            cds_seq += seq[neu_cnt]
                        neu_cnt += 1
                else:
                    # Exon without a frame: skip its bases in ``seq``.
                    neu_cnt += end - start
        protein_seq = self.translate(cds_seq)
        return gene_info, protein_seq

    def mapcoord(self):
        '''Maps the gene coordinates to protein position and return a
        dictionary of format -
        map_dict[Refseq_acc]={aa_pos: [aa, codon, cds_numbers,
        Exon_numers, chrom, coordinates, strand, splice_annotations]}

        ``aa_pos`` starts at 1; the last codon of every transcript is
        reported with ``aa`` set to 'STOP_CODON'.  The number of coding
        bases is expected to be a multiple of three (otherwise an
        ``IndexError`` is raised).
        '''
        refgene_entries = self.genelist
        map_dict = {}
        for entry in refgene_entries:
            acc = entry.refseq_acc
            map_dict[acc] = {}
            gene_info, protein_seq = self._expand_exon(entry)
            aa_pos = 0
            for idx in range(0, len(gene_info), 3):
                if idx == len(gene_info) - 3:
                    aa = 'STOP_CODON'
                else:
                    aa = protein_seq[aa_pos]
                aa_pos += 1
                # Transpose the three per-base rows into per-field columns.
                # list() is required because zip() returns a
                # non-subscriptable iterator on Python 3.
                d = list(zip(gene_info[idx], gene_info[idx + 1],
                             gene_info[idx + 2]))
                map_dict[acc][aa_pos] = [aa, ''.join(list(d[4])), list(d[0]),
                                         list(d[5]), d[1][0], list(d[2]),
                                         d[3][0], list(d[6])]
        return map_dict
def filter_dwnmut(gene_data):
    """Removes the variants upstream to Frameshift/StopGain mutation.

    Variants of each transcript are walked in transcription order
    (ascending keys on the '+' strand, descending on the '-' strand).
    The first StopGain/FrameShift variant met while no such mutation is
    active is kept under its key with '#' appended; variants before it are
    dropped and variants after it are kept.  A running indel size is
    tracked, and once it is a multiple of three again the mutation is no
    longer considered active.  Transcripts for which
    ``Refgene.get_strand`` returns nothing are skipped.

    Args:
        - gene_data(dictionary): gene_transcript wise variants where
                                there is at least one
                                Frameshift/Stopgain mutation.
                                Presumably variant keys carry ref/alt
                                alleles at index 3/4 -- TODO confirm.

    Returns:
        - flt_data(dictionary): gene_transcript wise variants where there
                                is at least one Frameshift/StopGain
                                mutation and at least one downstream
                                coding exonic variant.  Keys are the input
                                keys with the strand appended.
    """
    def _indel_shift(var, effect, prefix_len):
        # Signed size change of an indel; ``prefix_len`` is the length of
        # the 'FrameShift' / 'NonFrameShift' prefix before Insert/Delete.
        kind = effect[prefix_len:]
        if kind == 'Insert':
            return len(var[4]) - 1
        if kind == 'Delete':
            return -(len(var[3]) - 1)
        return 0

    rfgene = Refgene()
    flt_gene_data = {}
    for gene_info, val in gene_data.items():
        trans_id = gene_info[1]
        strand = rfgene.get_strand(trans_id)
        if not strand:
            continue
        for e in val:
            t = {}
            # Copy the keys into a list before sorting: dict.keys() is a
            # view without .sort() on Python 3.
            variants = list(e.keys())
            if strand == '+':
                variants.sort()
            elif strand == '-':
                variants.sort(reverse=True)
            size = 0
            mut_type = ''
            flag = False   # True while a truncating mutation is active
            for var in variants:
                effect = e[var][0]
                if not flag:
                    if effect == 'StopGain':
                        mut_type = 'StopGain'
                    elif effect.startswith('FrameShift'):
                        size += _indel_shift(var, effect, 10)
                    else:
                        # Upstream of any truncating mutation: dropped.
                        continue
                    # '#' marks the variant that starts the affected region.
                    t[tuple(list(var) + ['#'])] = e[var]
                    flag = True
                elif mut_type == 'StopGain':
                    t[var] = e[var]
                elif effect.startswith('FrameShift'):
                    size += _indel_shift(var, effect, 10)
                    t[var] = e[var]
                    if size % 3 == 0:
                        flag = False
                elif effect.startswith('NonFrameShift'):
                    size += _indel_shift(var, effect, 13)
                    t[var] = e[var]
                    if size % 3 == 0:
                        flag = False
                else:
                    t[var] = e[var]
            # Keep only sets with at least one variant besides the marker.
            if len(t) > 1:
                key = tuple(list(gene_info) + [strand])
                if key not in flt_gene_data:
                    flt_gene_data[key] = [t]
                elif t != flt_gene_data[key][0]:
                    flt_gene_data[key].append(t)
    return flt_gene_data
def _extract_mutation_info(self,beta_fits):
    '''
    objective: to extract (gene_qsymbol,mutation_type,variant_class_tag,transcript_length,insilico_prediction_score,MAF_significance_offset,zygosity) from annotated/filtered VCF file
        to transfer genmod information to class_tag and also get rid of some redundancy in VCF info

    beta_fits is passed through to vcf.get_CADD_scores / vcf.get_GERP_scores.

    Returns a list with one entry per (record, gene) pair:
    [gene, vartype, CLASS_TAG, protein_len, px, maf_offset, patho_p, patho_pden].

    Side effect: when self.proband_id is set, the VCF is rewritten to
    self.vcf + '.tmp' (without the Exonic/Annotation/Compounds info keys)
    and that file is finally renamed over self.vcf.
    '''
    job_name = '_extract_mutation_info'
    msg='collecting variant information and class label to determine genetic damage [%s;%s]...'%(job_name,self.vcf)
    lib_utils.msgout('notice',msg);self.logger.info(msg)

    mutation_info = []
    if self.proband_id:
        rewrite_vcf = True
        # first pass over the VCF (restricted to the proband) to gather
        # the pathogenic-domain scores
        v = vcf.VCFParser(self.vcf,sampleids=[self.proband_id])
        pdom,pdom0 = self.gather_pdomain_scores(v)
        v.stream.close()

        # second pass: reopen the VCF and prepare the temporary output
        vcf_tmp = self.vcf+'.tmp'
        ostream = open(vcf_tmp, 'w')
        rmInfo = ['Exonic','Annotation','Compounds']
        v = vcf.VCFParser(self.vcf)
        v.writeheader(ostream,to_del_info = rmInfo)
    else:
        rewrite_vcf = False
        v = vcf.VCFParser(self.vcf)
        pdom,pdom0 = self.gather_pdomain_scores(v)
        v.stream.close()
        # reopen the VCF for the main pass below
        v = vcf.VCFParser(self.vcf)

    msg = 'Importing max transcript length for each gene ...'
    lib_utils.msgout('notice', msg); self.logger.info(msg)

    refgene = Refgene()
    cds_lens = refgene.get_max_cds_length()
    # CDS length divided by three, truncated to an integer, is used as the
    # protein length of the gene
    tx_lens = {}
    for gene, cds_len in cds_lens.iteritems():
        tx_lens[gene] = int(cds_len/3.)

    # ridx counts the records read so far; it is matched against pdom.ridx
    ridx = 0
    for rec in v:
        v.parseinfo(rec)

        #to remove redundant gene symbols annotated by genmod but add transcript version
        if rewrite_vcf:
            for rkey in rmInfo:
                v.delete_info(rec, rkey)

            if rec.info.GeneticModels:
                genmod_tag = lib_ped.parse_genmod_inherit_model(\
                    rec.info.GeneticModels[0].split(':')[1])
                rec.info.CLASS_TAG += genmod_tag

            v.write(ostream, rec)

        varlist = normalize_variant(rec.chrom, rec.pos, rec.ref, rec.alt)
        mut_type = varlist[0][-1]

        # a ':' in the first id overrides the mutation type with 'mnp'
        if ':' in rec.id[0]:
            mut_type = 'mnp'

        # collect conservation prediction score (CADD and GERP++)
        cadd_aa = './.'
        px_cadd = None
        if rec.info.CADD_raw:
            # to get CADD_raw (average)
            px_cadd, cadd_aa = vcf.get_CADD_scores(mut_type, rec.info.CADD_aa, rec.info.CADD_raw, beta_fits)

        # to get GERP++ score
        px_gerp = None
        if rec.info.GerpConserve:
            px_gerp = vcf.get_GERP_scores(mut_type, cadd_aa, rec.info.GerpRSScore, beta_fits)

        # which score can be chosen: CADD when enabled and available,
        # otherwise GERP++, otherwise the default of 0.5
        px = 0.5
        if self.cadd>0 and px_cadd is not None:
            px = px_cadd
        elif px_gerp is not None:
            px = px_gerp

        vpop = vp.parse(rec.info)
        # genes already reported for this record
        genes = []

        # to get MAF in the order of ExAC, ESP, and 1K
        if rec.info.EXACDB:
            maf = get_min_maf(rec.info.EXACAF[0])
        elif rec.info.ESPDB:
            maf = get_min_maf(rec.info.ESPAF[0])
        elif rec.info.KGDB:
            maf = get_min_maf(rec.info.KGAF[0])
        else:
            maf = 0.

        # to compute a significance of MAF
        maf_offset = self.dm.get_maf_xoffset(maf)

        # look up the pdom row(s) whose ridx equals the current record index
        #pdom.iloc[ridx]==ridx
        pdom_idx = pdom.index[pdom.ridx == ridx].tolist()
        if pdom_idx:
            patho_p = pdom.phat_lo[pdom_idx[0]]
            patho_pden = pdom.patho_dens_p[pdom_idx[0]]
        else:
            # assign a default pathogenic domain value (15% quantile value)
            patho_p = pdom0.phat_lo
            patho_pden = pdom0.patho_dens_p

        vartype = get_var_type(rec.ref,rec.alt)

        # to get transcript length
        for altnum, val in vpop.items():
            # for each gene involved with the variant
            for gene, gd in val.items():
                # fall back to the average protein length for genes
                # missing from tx_lens
                protein_len = self.dm.avg_protein_len
                if gene in tx_lens:
                    protein_len = tx_lens[gene]

                # store a set of essential annotation to be used for genetic damage
                if gene not in genes:
                    mutation_info.append([gene, vartype, rec.info.CLASS_TAG, protein_len, px, maf_offset, patho_p, patho_pden])
                    genes.append(gene)
        ridx += 1

    # done reading filtered VCF file
    if rewrite_vcf:
        v.stream.close()
        ostream.close()
        # replace the input VCF with the rewritten copy
        os.rename(vcf_tmp,self.vcf)

    msg = 'done. [%s]'%job_name
    lib_utils.msgout('notice',msg); self.logger.info(msg)

    return mutation_info