示例#1
0
文件: mkmrna.py 项目: kalon33/divine
    def _makedb(self):
        """Internal method. Do not use.

        Creates the REFMRNA database in ``self.outname``: every table in
        SCHEMA is created, the RefGene entries are walked chromosome by
        chromosome, and for each entry whose ``self.getseq`` call yields a
        sequence a ``(gid, refseq_acc, length, lower-cased sequence)`` tuple
        is handed to ``self._insert`` in batches. The version details are
        stored last. Terminates the process with exit status 1 when
        ``self.inname`` does not exist.
        """
        self.logger.info('Creating REFMRNA database ...')
        self.logger.info('Input file: %s' % self.inname)
        if not os.path.exists(self.inname):
            self.logger.error('%s: No such file' % self.inname)
            self.logger.error('Database not created')
            sys.exit(1)

        self.load(db=self.outname)

        for schema in SCHEMA:
            self.createtable(schema, True)

        self.curs = self.conn.cursor()

        refg = Refgene()

        # Rows are flushed to the database every `batch_size` transcripts.
        batch_size = 1000
        autosomes = ['chr' + str(num) for num in range(1, 23)]
        human_chrom = autosomes + ['chrX', 'chrY', 'chrM']

        pending = []
        entry_cnt = 0
        for chrom in human_chrom:
            self.logger.info('Processing chrom-%s' % (chrom))
            self.load_hg(chrom)
            for entry in refg.iterate_db(chrom):
                entry_chrom = entry.chrom
                # Skip entries whose chromosome name contains an underscore.
                if '_' in entry_chrom:
                    continue

                strand = entry.strand
                exon_starts = [int(pos)
                               for pos in entry.str_exon_pos.split(',')]
                exon_ends = [int(pos)
                             for pos in entry.end_exon_pos.split(',')]
                seq = self.getseq(entry_chrom, exon_starts, exon_ends, strand)
                if not seq:
                    continue

                pending.append((entry.gid, entry.refseq_acc,
                                len(seq), seq.lower()))
                if len(pending) == batch_size:
                    self._insert(pending)
                    entry_cnt += batch_size
                    pending = []

        # Flush whatever is left over from the last partial batch.
        if pending:
            self._insert(pending)
            entry_cnt += len(pending)

        # Add version details (derived from the RefGene file's mtime)
        refgene_path = dbconfig.DBCONFIG['REFGENE']['name']
        modified = dt.fromtimestamp(os.path.getmtime(refgene_path))
        fd = modified.strftime('%Y-%m-%d')
        year_month = tuple(fd.split('-')[:2])
        version = "v%s_%s" % year_month
        self.set_version('refmrna', fd, version, entry_cnt)

        self.conn.commit()
        self.curs.close()
        self.conn.close()
        self.logger.info('... REFMRNA database created')
示例#2
0
def get_genelens():
    '''Computes gene's CDS length based on RefGene definition

    All RefGene records are grouped by (chrom, gene), where chrom has its
    first three characters removed when it contains 'chr'. For each group
    the distinct exons of all records are pooled and the CDS span is the
    minimum str_cds / maximum end_cds over the records. The length is then
    accumulated over the sorted exons from the exon containing the CDS
    start to the exon containing the CDS end.

    Returns:
        gene_lens(dictionary):    Maps (chrom, gene) to the gene's CDS length
    '''
    gene_exons = {}
    gene_cds_se = {}
    rfg = Refgene()
    for rec in rfg.iterate_db():
        chrom = rec.chrom
        if 'chr' in chrom:
            chrom = chrom[3:]
        key = (chrom, rec.gene)

        # The CDS bounds are per record, so parse them once rather than
        # once per exon.
        rec_str_cds = int(rec.str_cds)
        rec_end_cds = int(rec.end_cds)

        # Explicit "first time we see this key" handling; a bare except here
        # would also swallow unrelated errors (and KeyboardInterrupt).
        exons = gene_exons.setdefault(key, set())
        for exs, exe in zip(rec.str_exon_pos.split(','),
                            rec.end_exon_pos.split(',')):
            exons.add((int(exs), int(exe)))

        cds_span = gene_cds_se.get(key)
        if cds_span is None:
            gene_cds_se[key] = [rec_str_cds, rec_end_cds]
        else:
            if rec_str_cds < cds_span[0]:
                cds_span[0] = rec_str_cds
            if rec_end_cds > cds_span[1]:
                cds_span[1] = rec_end_cds

    gene_lens = {}
    for chrom_gene, exons in gene_exons.items():
        str_cds, end_cds = gene_cds_se[chrom_gene]
        cds_len = 0
        in_cds = False  # True once the exon holding the CDS start was seen
        for spos, epos in sorted(exons):
            has_start = spos <= str_cds <= epos
            has_end = spos <= end_cds <= epos
            if has_start and has_end:
                # Whole CDS lies inside a single exon.
                cds_len += end_cds - str_cds
                break
            elif has_start:
                in_cds = True
                cds_len += epos - str_cds
            elif has_end:
                cds_len += end_cds - spos
                break
            elif in_cds:
                # Exon between the start exon and the end exon.
                cds_len += epos - spos
        gene_lens[chrom_gene] = cds_len
    return gene_lens
示例#3
0
def known_pathov_stats(reuse=True, has_hgmd_license=False):
    """
    Retrieve variant-type statistics (LOF, missense, etc.) from the known
    pathogenic mutation database (ClinVar or HGMD).

    The result is cached with dill in the file configured as
    FILECONFIG['PATHOG_PROF'].

    :param reuse: when True and the cache file exists, load the cached
        profile instead of recomputing it
    :param has_hgmd_license: forwarded to pathogenic_per_gene as `hgmd_on`
    :return: the per-gene profile, either loaded from the cache file or
        freshly returned by pathogenic_per_gene (and then written to the
        cache file)
    """
    pathog_prof_pyv = fileconfig.FILECONFIG['PATHOG_PROF']
    if reuse and os.path.exists(pathog_prof_pyv):
        msg = 'loading some statistics on known pathogenic variants (%s) ...' % pathog_prof_pyv
        msgout('notice', msg)
        # NOTE: dill.load, like pickle, can execute arbitrary code while
        # deserializing; the cache file must come from a trusted location.
        # `with` guarantees the handle is closed even if loading fails.
        with open(pathog_prof_pyv, 'rb') as fp:
            pathov_prof_gene = dill.load(fp)
    else:
        refgene = Refgene()
        cds_len_per_gene = refgene.get_cds_len_per_gene()
        pathov_prof_gene = pathogenic_per_gene(cds_len_per_gene,
                                               hgmd_on=has_hgmd_license)
        with open(pathog_prof_pyv, 'wb') as fpw:
            dill.dump(pathov_prof_gene, fpw)

    #TODO: use SVM to infer optimal variables to classify benign vs. pathogenic

    return pathov_prof_gene
示例#4
0
 def __init__(self, gene):
     """Store the gene and set up the data sources used for it.

     Args:
         gene: Gene identifier; kept as ``self.gene`` and passed to
               ``self.load_refgene``.
     """
     self.gene = gene
     # Data-access objects, each constructed without arguments.
     self.refgene = Refgene()
     self.refmrna = Refmrna()
     # NOTE(review): load_refgene presumably reads self.refgene, so it has
     # to run after the assignment above - confirm before reordering.
     self.genelist = self.load_refgene(self.gene)
     self.splicedb = Splicedb()
示例#5
0
class GeneMap():
    '''Maps the coding positions of a gene's RefGene transcripts to
    protein positions, using the RefGene, RefMrna and splice databases.'''

    def __init__(self, gene):
        '''Stores the gene, opens the data sources and loads the RefGene
        records of the gene into self.genelist.'''
        self.gene = gene
        self.refgene = Refgene()
        self.refmrna = Refmrna()
        self.genelist = self.load_refgene(self.gene)
        self.splicedb = Splicedb()

    def load_refgene(self, gene):
        '''Loads the refgene data for given Gene and returns it as a list'''
        # Query with the argument that was passed in, not with self.gene.
        return list(self.refgene.iterate_db(gene=gene))

    def get_cds_exonnum(self, frames):
        '''Returns the Start coding exon number and End Coding exon number.

        frames holds one reading frame per exon, -1 marking an exon without
        coding sequence. The start number is the count of leading -1
        entries; the end number is start + (number of non -1 entries) - 1.
        '''
        str_cds_exon_number = 0
        for frame in frames:
            if frame != -1:
                break
            str_cds_exon_number += 1
        coding_exons = sum(1 for frame in frames if frame != -1)
        end_cds_exon_number = str_cds_exon_number + coding_exons - 1
        return str_cds_exon_number, end_cds_exon_number

    def translate(self, mrna):
        '''Translates mrna codon by codon with CODONTABLE['codons'].

        The last three nucleotides are left out of the translation and a
        trailing incomplete codon is ignored. Raises KeyError for a codon
        that is missing from the table.
        '''
        coding = mrna[:-3]
        usable = len(coding) - len(coding) % 3
        return ''.join(CODONTABLE['codons'][coding[pos:pos + 3].upper()]
                       for pos in range(0, usable, 3))

    def _expand_exon(self, entry):
        '''Internal Method.

        Walks the exons of a RefGene entry in transcript order and returns
        (gene_info, protein_seq). gene_info has one row per CDS nucleotide:
        [cds_number, chrom, coordinate, strand, nucleotide, exon_label,
        splice_annotation]; protein_seq is the translation of the collected
        CDS nucleotides. For a strand other than '+' or '-' gene_info stays
        empty.
        '''
        chrom = entry.chrom
        strand = entry.strand
        refseq_acc = entry.refseq_acc
        seq = self.refmrna.retrieve(entry.refseq_acc, entry.gid).sequence
        str_cds = int(entry.str_cds)
        end_cds = int(entry.end_cds)
        exon_str = [int(e) for e in entry.str_exon_pos.split(',')]
        exon_end = [int(e) for e in entry.end_exon_pos.split(',')]
        frames = [int(e) for e in entry.frames.split(',')]
        cds_flag = False    # True while the walk is inside the CDS
        cds_cnt = 0         # running CDS nucleotide number
        neu_cnt = 0         # index of the current nucleotide within seq
        cds_bases = []
        gene_info = []
        if strand == '+':
            exon_cnt = 0
            for start, end, frame in zip(exon_str, exon_end, frames):
                exon_cnt += 1
                if frame == -1:
                    # Exon without coding sequence: only advance in seq.
                    neu_cnt += end - start
                    continue
                for i in range(start, end):
                    if i == str_cds:
                        cds_flag = True
                    elif i == end_cds:
                        cds_flag = False
                    if cds_flag:
                        splice_reg_site = self.splicedb.get_annot(
                            chrom, i + 1, refseq_acc)
                        if not splice_reg_site:
                            splice_reg_site = ''
                        cds_cnt += 1
                        gene_info.append([cds_cnt, chrom, i + 1, strand,
                                          str(seq[neu_cnt]),
                                          'E-' + str(exon_cnt),
                                          splice_reg_site])
                        cds_bases.append(seq[neu_cnt])
                    neu_cnt += 1
        elif strand == '-':
            # Transcript order on the minus strand is the reverse of the
            # genomic order, so exons and positions are walked backwards.
            exon_cnt = len(exon_str) + 1
            exon_str.reverse()
            exon_end.reverse()
            frames.reverse()
            for start, end, frame in zip(exon_str, exon_end, frames):
                exon_cnt -= 1
                if frame == -1:
                    neu_cnt += end - start
                    continue
                for i in range(end, start, -1):
                    if i == end_cds:
                        cds_flag = True
                    elif i == str_cds:
                        cds_flag = False
                    if cds_flag:
                        # NOTE(review): the annotation is looked up at i + 1
                        # while the stored coordinate is i, and the exon
                        # label is 'E<n>' here but 'E-<n>' on the plus
                        # strand - confirm whether both are intended.
                        splice_reg_site = self.splicedb.get_annot(
                            chrom, i + 1, refseq_acc)
                        if not splice_reg_site:
                            splice_reg_site = ''
                        cds_cnt += 1
                        gene_info.append([cds_cnt, chrom, i, strand,
                                          str(seq[neu_cnt]),
                                          'E' + str(exon_cnt),
                                          splice_reg_site])
                        cds_bases.append(seq[neu_cnt])
                    neu_cnt += 1
        protein_seq = self.translate(''.join(cds_bases))
        return gene_info, protein_seq

    def mapcoord(self):
        '''Maps the gene coordinates to protein position and
        return a dictionary of format - map_dict[Refseq_acc]={aa_pos: [aa,
        codon, cds_numbers, Exon_numbers, chrom, coordinates, strand,
        splice_annotations]}. aa_pos starts at 1 and the last codon of a
        transcript is reported as 'STOP_CODON'.'''
        map_dict = {}
        for entry in self.genelist:
            acc = entry.refseq_acc
            map_dict[acc] = {}
            gene_info, protein_seq = self._expand_exon(entry)
            last_codon_idx = len(gene_info) - 3
            aa_pos = 0
            for idx in range(0, len(gene_info), 3):
                if idx == last_codon_idx:
                    aa = 'STOP_CODON'
                else:
                    aa = protein_seq[aa_pos]
                aa_pos += 1
                # Transpose the three nucleotide rows into per-field tuples.
                # list() is required because zip() is a one-shot iterator
                # on Python 3 and cannot be indexed.
                d = list(zip(gene_info[idx], gene_info[idx + 1],
                             gene_info[idx + 2]))
                map_dict[acc][aa_pos] = [aa, ''.join(d[4]), list(d[0]),
                                         list(d[5]), d[1][0], list(d[2]),
                                         d[3][0], list(d[6])]
        return map_dict
示例#6
0
def _indel_size_change(var, indel_kind):
    """Return the signed length change of an indel variant.

    'Insert' contributes len(var[4]) - 1, 'Delete' contributes
    -(len(var[3]) - 1); any other kind contributes 0.
    """
    if indel_kind == 'Insert':
        return len(var[4]) - 1
    if indel_kind == 'Delete':
        return -(len(var[3]) - 1)
    return 0


def filter_dwnmut(gene_data):
    """Removes the variants upstream to Frameshift/StopGain mutation.

    Variants are visited in transcript order (ascending for '+' strand
    transcripts, descending for '-'). Collection starts at the first
    StopGain/FrameShift variant, whose key gets a trailing '#' marker.
    After a StopGain every later variant is kept; after a FrameShift
    variants are kept until the accumulated indel size becomes a multiple
    of three again. Transcripts for which no strand is found are skipped.

    Args:
        - gene_data(dictionary):     gene_transcript wise variants where
                                     there is at least one Frameshift/Stopgain
                                     mutation.

    Returns:
        - flt_data(dictionary):    gene_transcript wise variants where there
                                   is at least one Frameshift/StopGain mutation
                                   and at least one downstream coding exonic
                                   variant. Keys are the input keys extended
                                   with the strand.
    """
    rfgene = Refgene()
    flt_gene_data = {}
    for gene_info, val in gene_data.items():
        trans_id = gene_info[1]
        strand = rfgene.get_strand(trans_id)
        if not strand:
            continue
        for e in val:
            t = {}
            # sorted() instead of keys().sort(): dict.keys() is a view
            # without a sort() method on Python 3.
            if strand == '+':
                variants = sorted(e)
            elif strand == '-':
                variants = sorted(e, reverse=True)
            else:
                variants = list(e)
            size = 0
            mut_type = ''
            flag = False  # True while downstream variants are collected

            for var in variants:
                if not flag:
                    mut_name = e[var][0]
                    if mut_name == 'StopGain':
                        mut_type = 'StopGain'
                        t[tuple(list(var) + ['#'])] = e[var]
                        flag = True
                    elif mut_name.startswith('FrameShift'):
                        size += _indel_size_change(var, mut_name[10:])
                        t[tuple(list(var) + ['#'])] = e[var]
                        flag = True
                elif mut_type == 'StopGain':
                    t[var] = e[var]
                else:
                    mut_name = e[var][0]
                    if mut_name.startswith('FrameShift'):
                        size += _indel_size_change(var, mut_name[10:])
                        t[var] = e[var]
                        # Reading frame restored: stop collecting.
                        if size % 3 == 0:
                            flag = False
                    elif mut_name.startswith('NonFrameShift'):
                        size += _indel_size_change(var, mut_name[13:])
                        t[var] = e[var]
                        if size % 3 == 0:
                            flag = False
                    else:
                        t[var] = e[var]

            if len(t) > 1:
                key = tuple(list(gene_info) + [strand])
                if key not in flt_gene_data:
                    flt_gene_data[key] = [t]
                elif t != flt_gene_data[key][0]:
                    flt_gene_data[key].append(t)
    return flt_gene_data
示例#7
0
文件: divine.py 项目: kalon33/divine
	def _extract_mutation_info(self,beta_fits):
		'''
		objective: to extract (gene_qsymbol,mutation_type,variant_class_tag,transcript_length,insillico_prediction_score,MAF_significance_offset,zygosity) from annotated/filtered VCF file
			to transfer genmod information to class_tag and also get rid of some redundancy in VCF info

		beta_fits is forwarded to vcf.get_CADD_scores and vcf.get_GERP_scores.

		When self.proband_id is set, the VCF is rewritten through self.vcf+'.tmp'
		(INFO keys Exonic, Annotation and Compounds removed, genmod inheritance tag
		appended to CLASS_TAG) and the temporary file then replaces self.vcf.

		returns a list with one row per (variant record, gene):
			[gene, vartype, CLASS_TAG, protein_len, px, maf_offset, patho_p, patho_pden]
		'''
		job_name = '_extract_mutation_info'
		msg='collecting variant information and class label to determine genetic damage [%s;%s]...'%(job_name,self.vcf)
		lib_utils.msgout('notice',msg)
		self.logger.info(msg)

		mutation_info = []
		if self.proband_id:
			rewrite_vcf = True
			# first pass over the VCF: gather the pathogenic domain scores
			v = vcf.VCFParser(self.vcf,sampleids=[self.proband_id])
			pdom,pdom0 = self.gather_pdomain_scores(v)
			v.stream.close()
			vcf_tmp = self.vcf+'.tmp'
			ostream = open(vcf_tmp, 'w')
			rmInfo = ['Exonic','Annotation','Compounds']
			# reopen the VCF for the second pass
			v = vcf.VCFParser(self.vcf)
			v.writeheader(ostream,to_del_info = rmInfo)
		else:
			rewrite_vcf = False
			v = vcf.VCFParser(self.vcf)
			pdom,pdom0 = self.gather_pdomain_scores(v)
			v.stream.close()
			v = vcf.VCFParser(self.vcf)

		msg = 'Importing max transcript length for each gene ...'
		lib_utils.msgout('notice', msg)
		self.logger.info(msg)

		refgene = Refgene()
		cds_lens = refgene.get_max_cds_length()
		tx_lens = {}
		# items() works on both Python 2 and 3 (iteritems() is Python 2 only)
		for gene, cds_len in cds_lens.items():
			# CDS length in nucleotides -> length in codons
			tx_lens[gene] = int(cds_len/3.)

		ridx = 0
		for rec in v:

			v.parseinfo(rec)

			#to remove redundant gene symbols annotated by genmod but add transcript version
			if rewrite_vcf:
				for rkey in rmInfo:
					v.delete_info(rec, rkey)

				if rec.info.GeneticModels:
					genmod_tag = lib_ped.parse_genmod_inherit_model(\
												rec.info.GeneticModels[0].split(':')[1])
					rec.info.CLASS_TAG += genmod_tag

				v.write(ostream, rec)

			varlist = normalize_variant(rec.chrom, rec.pos, rec.ref, rec.alt)
			mut_type = varlist[0][-1]

			if ':' in rec.id[0]:
				mut_type = 'mnp'

			# collect conservation prediction score (CADD and GERP++)
			cadd_aa = './.'
			px_cadd = None
			if rec.info.CADD_raw:
				# to get CADD_raw (average)
				px_cadd, cadd_aa = vcf.get_CADD_scores(mut_type, rec.info.CADD_aa, rec.info.CADD_raw, beta_fits)

			# to get GERP++ score
			px_gerp = None
			if rec.info.GerpConserve:
				px_gerp = vcf.get_GERP_scores(mut_type, cadd_aa, rec.info.GerpRSScore, beta_fits)

			# which score can be chosen (0.5 when neither score is available)
			px = 0.5
			if self.cadd>0 and px_cadd is not None:
				px = px_cadd
			elif px_gerp is not None:
				px = px_gerp

			vpop = vp.parse(rec.info)
			genes = []

			# to get MAF in the order of ExAC, ESP, and 1K
			if rec.info.EXACDB:
				maf = get_min_maf(rec.info.EXACAF[0])
			elif rec.info.ESPDB:
				maf = get_min_maf(rec.info.ESPAF[0])
			elif rec.info.KGDB:
				maf = get_min_maf(rec.info.KGAF[0])
			else:
				maf = 0.

			# to compute a significance of MAF
			maf_offset = self.dm.get_maf_xoffset(maf)

			# look up the pathogenic domain row recorded for this record index
			pdom_idx = pdom.index[pdom.ridx == ridx].tolist()
			if pdom_idx:
				patho_p = pdom.phat_lo[pdom_idx[0]]
				patho_pden = pdom.patho_dens_p[pdom_idx[0]]
			else:
				# assign a default pathogenic domain value (15% quantile value)
				patho_p = pdom0.phat_lo
				patho_pden = pdom0.patho_dens_p

			vartype = get_var_type(rec.ref,rec.alt)

			# to get transcript length
			for altnum, val in vpop.items():
				# for each gene involved with the variant
				for gene, gd in val.items():
					protein_len = self.dm.avg_protein_len
					if gene in tx_lens:
						protein_len = tx_lens[gene]

					# store a set of essential annotation to be used for genetic damage
					# (each gene is reported once per record)
					if gene not in genes:
						mutation_info.append([gene, vartype, rec.info.CLASS_TAG, protein_len, px, maf_offset, patho_p, patho_pden])
						genes.append(gene)

			ridx += 1

		# done reading filtered VCF file
		if rewrite_vcf:
			v.stream.close()
			ostream.close()
			os.rename(vcf_tmp,self.vcf)

		msg = 'done. [%s]'%job_name
		lib_utils.msgout('notice',msg)
		self.logger.info(msg)

		return mutation_info