예제 #1
0
    def collapse_bed(self, tmp_bed, job_name, ext_bp):
        """Sort ``tmp_bed`` and write its merged intervals to ``self.bed_fn``.

        The input is handed to ``lib_utils.sort_tsv_by_col2`` (columns 1-3)
        to produce a temporary sorted file under ``self.work_dir``.  The
        sorted rows are then scanned once: a row on the same chromosome whose
        start is not greater than the running end is folded into the running
        interval, otherwise the running interval is written out.  The
        temporary sorted file is removed at the end.

        Args:
            tmp_bed: path of a tab-separated file with exactly four columns
                per row (chrom, start, end, annotation).
            job_name: label appended to the progress messages.
            ext_bp: integer used only to build the temporary file name.

        Raises:
            ValueError: if a row does not have exactly four columns or its
                coordinates are not integers.
        """
        msg = 'sorting bed file ... @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        tmp_so_bed = os.path.join(self.work_dir, 'refGene_e%d_so.bed' % ext_bp)
        # sort
        lib_utils.sort_tsv_by_col2(tmp_bed, [1, 2, 3], ['V', 'n', 'n'], True, tmp_so_bed)

        msg = 'merging exon coordinates overlapped each other... @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        row_fmt = '%s\t%d\t%d\t%s\n'

        # merge boundaries if any overlapped; the context manager closes both
        # handles even when a malformed row raises.
        with open(tmp_so_bed, 'r') as fp, open(self.bed_fn, 'w') as fp2:
            chromp = None  # None until the first row has been read
            e1p = e2p = 0
            annotp = ''
            for line in fp:
                chrom, e1, e2, annot = line.rstrip().split('\t')
                e1 = int(e1)
                e2 = int(e2)

                if chromp is None:
                    # first row just seeds the running interval
                    chromp, e1p, e2p, annotp = chrom, e1, e2, annot
                    continue

                if chrom != chromp or e2p < e1:
                    # no overlap: wrap up the running interval
                    fp2.write(row_fmt % (chromp, e1p, e2p, annotp))
                    chromp, e1p, e2p, annotp = chrom, e1, e2, annot
                elif e2p < e2:
                    # overlap that extends the running interval.
                    # NOTE(review): a row fully contained in the running
                    # interval contributes no annotation (kept as before) --
                    # confirm this is intended.
                    e2p = e2
                    annotp += '|%s' % annot

            # flush the last interval; nothing to flush for an empty input
            if chromp is not None:
                fp2.write(row_fmt % (chromp, e1p, e2p, annotp))

        os.unlink(tmp_so_bed)

        msg = 'done. @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)
예제 #2
0
    def gdna_to_vcf(self, mutalyzer_batch_outfn):
        """Build ``self.out_vcf`` from a mutalyzer batch output file.

        First pass: every row of ``mutalyzer_batch_outfn`` (after the first
        line, which is skipped as a header) whose second column is empty is
        converted with ``Hgvs2.gdna_to_vcf`` using its third column; results
        are cached under the row's first column.  Second pass: each record
        from ``self._iterfile()`` that is not rejected by ``self.may_pass``
        and whose ``nt_change`` is in the cache is written as one VCF row per
        cached variant.  The rows go to a temporary file which is then passed
        to ``lib_utils.sort_tsv_by_col2`` to produce ``self.out_vcf``.

        Args:
            mutalyzer_batch_outfn: path to the tab-separated mutalyzer batch
                output.

        Raises:
            RuntimeError: if ``mutalyzer_batch_outfn`` does not exist.
        """
        if not os.path.exists(mutalyzer_batch_outfn):
            raise RuntimeError('check if input file [%s] exists' %
                               mutalyzer_batch_outfn)

        cHgvs = Hgvs2()
        cHgvs.load_resource()

        gdna_cache = {}
        with open(mutalyzer_batch_outfn, 'r') as fp:
            next(fp, None)  # skip the header; an empty file is tolerated
            for mutalyzer in fp:
                mut = mutalyzer.split('\t')
                # blank or truncated rows cannot be converted
                if len(mut) < 3: continue
                # a non-empty 2nd column means the row is not usable
                # (presumably a mutalyzer error message -- TODO confirm)
                if mut[1].strip(): continue
                gdna = mut[2].strip()
                variants = cHgvs.gdna_to_vcf(gdna)
                if variants:
                    gdna_cache[mut[0].strip()] = variants

        self.out_vcf = lib_utils.file_tag(self.tsv, None, 'vcf')

        tmp_vcf = self.out_vcf + '.tmp'
        qual = 100
        vcf_filter = 'PASS'  # renamed local: do not shadow builtin filter()
        rsid = '.'

        with open(tmp_vcf, 'w') as fpw:
            self._write_vcf_head(fpw)

            for cvt in self._iterfile():
                if self.may_pass(cvt): continue
                if cvt.nt_change not in gdna_cache: continue

                for chrom, pos, ref, alt in gdna_cache[cvt.nt_change]:
                    # alleles longer than 100 characters are not reported
                    if len(ref) > 100 or len(alt) > 100: continue
                    info = 'cDNA=%s;' % cvt.nt_change
                    info += 'VC=%s;' % self.determine_vclass(cvt.rep_class)
                    info += 'SRC=%s;' % cvt.source
                    info += 'UPD=%s;' % cvt.last_upd
                    info += 'URL=%s' % cvt.url
                    # drop the 'chr' prefix; anything starting with 'chrM'
                    # becomes 'MT'
                    if chrom.startswith('chr'):
                        if chrom.startswith('chrM'): chrom = 'MT'
                        else: chrom = chrom[3:]
                    cols = [chrom, pos, rsid, ref, alt, qual, vcf_filter, info]
                    fpw.write('%s\n' % lib_utils.joined(cols, '\t'))

        lib_utils.sort_tsv_by_col2(tmp_vcf, [1, 2],
                                   ['V', 'n'], False, self.out_vcf)
        os.unlink(tmp_vcf)
예제 #3
0
파일: divine.py 프로젝트: kalon33/divine
	def rank_pheno_gene(self):
		"""Dump every gene's phenotypic score and rank the result.

		One ``gene<TAB>score`` row per entry of ``self.pheno_dmg`` is written
		(after a header line) to ``<self.gene_rank_fn>.tmp``; that file is
		handed to ``lib_utils.sort_tsv_by_col2`` (column 2) to produce
		``self.gene_rank_fn`` and is then deleted.
		"""
		job_name = 'rank_pheno_gene'

		start_msg = 'selecting genes matched by patient phenotypes ... [%s;%s]' % (
			job_name, self.hpo_query)
		lib_utils.msgout('notice', start_msg)
		self.logger.info(start_msg)

		# unsorted scores go to a scratch file next to the final output
		unsorted_fn = '%s.tmp' % self.gene_rank_fn
		out = open(unsorted_fn, 'w')
		out.write('#gene\tphenotypic_score\n')
		out.writelines(
			'%s\t%g\n' % (name, pheno.score)
			for name, pheno in self.pheno_dmg.iteritems())
		out.close()

		lib_utils.sort_tsv_by_col2(unsorted_fn, [2], ['gr'], False, self.gene_rank_fn)
		os.unlink(unsorted_fn)

		done_msg = 'done. [%s]' % job_name
		lib_utils.msgout('notice', done_msg)
		self.logger.info(done_msg)
예제 #4
0
def _inverse_neg_log10(score):
    """Return ``-1 / log10(score)``, or 0 when ``score`` is not positive.

    ``log10(1) == 0`` would divide by zero, so a score of exactly 1 maps to
    ``+inf`` (the limit of the transform as the score approaches 1 from
    below).
    """
    if not score > 0:
        return 0.
    if score == 1:
        return float('inf')
    return -1. / math.log10(score)


def convert_node2gene(FinalNodeScores, PerturbedGenes, dProtein2gene,
                      lnkProteins, rank_fn):
    """Write per-gene predicted and seed scores, ranked, to ``rank_fn``.

    Only genes whose ``gdmg`` attribute is greater than zero are written.
    Rows are first written to a temporary file derived from ``rank_fn`` via
    ``lib_utils.file_tag2``; that file is passed to
    ``lib_utils.sort_tsv_by_col2`` (column 2) to produce ``rank_fn`` and is
    then removed.

    Args:
        FinalNodeScores: pair ``(nodeScores, dangledScores)``; the first is
            indexed in step with ``lnkProteins``, the second in step with
            the dangled genes.
        PerturbedGenes: pair ``(cPerturbedGenes, dangledGenes)``;
            ``cPerturbedGenes`` maps a gene to an object exposing ``score``
            and ``gdmg``.
        dProtein2gene: mapping from protein id to gene name; proteins
            missing from it are never written.
        lnkProteins: iterable of protein ids.
        rank_fn: path of the final ranked output file.

    Raises:
        KeyError: if a dangled gene is missing from ``cPerturbedGenes``.
    """
    nodeScores, dangledScores = FinalNodeScores
    cPerturbedGenes, dangledGenes = PerturbedGenes

    row_fmt = '%s\t%g\t%g\n'

    rank_fn2 = lib_utils.file_tag2(rank_fn, 'tmp', None)
    fp2 = lib_utils.open2(rank_fn2, 'w')
    try:
        fp2.write('#gene\tpredicted_score[-1/log10(x)]\tseed_score\n')

        for n, protein in enumerate(lnkProteins):
            # a row is only produced for a protein that maps to a perturbed
            # gene with a positive genetic damage score
            if protein not in dProtein2gene:
                continue
            gene = dProtein2gene[protein]
            if gene not in cPerturbedGenes:
                continue
            perturbed = cPerturbedGenes[gene]
            if perturbed.gdmg > 0.:
                fp2.write(row_fmt % (gene,
                                     _inverse_neg_log10(nodeScores[n]),
                                     perturbed.score))

        # add dangled node score
        for n, gene in enumerate(dangledGenes):
            perturbed = cPerturbedGenes[gene]
            if perturbed.gdmg > 0.:
                fp2.write(row_fmt % (gene,
                                     _inverse_neg_log10(dangledScores[n]),
                                     perturbed.score))
    finally:
        # close even when a lookup above raises
        fp2.close()

    # sort by score
    lib_utils.sort_tsv_by_col2(rank_fn2, [2], ['gr'], False, rank_fn)
    os.unlink(rank_fn2)
예제 #5
0
    def _preproces(self, min_go_score=0.5):
        """Convert the input to gene pairs and keep one best score per pair.

        Steps, all operating on files next to ``self.inname``:

        1. Records from ``self._iterfile()`` with ``score >= min_go_score``
           are rewritten to ``<inname>.gene`` with both products mapped
           through ``self.uni2gene`` and the two gene names sorted.
        2. That file is passed to ``lib_utils.sort_tsv_by_col2`` (columns
           1-3) to produce ``<inname>.gene.so``.
        3. ``self.inname`` is pointed at the sorted file and it is re-read
           through ``self._iterfile()``; for each consecutive run of the same
           pair the highest score is kept, preferring records whose
           denominator is greater than 1 over the others.  The result
           overwrites ``<inname>.gene``.

        On return ``self.inname`` is ``<original inname>.gene``; the sorted
        file and the ``tmp`` directory beside the input are removed.

        Args:
            min_go_score: minimum score a record needs to be kept.

        Raises:
            KeyError: if a product is missing from ``self.uni2gene``.
        """
        import shutil  # local import: only needed for the final cleanup

        #to get temp dir
        tmpD = os.path.join(os.path.dirname(self.inname), 'tmp')
        if not os.path.exists(tmpD):
            os.makedirs(tmpD)

        self.uniprot_to_gene()

        inname_g = self.inname + '.gene'

        #convert uniprot to hgnc
        #uniprot1, uniprot2, score=(BP+MF+CC)/3, denominator (1<=x<=3)
        with open(inname_g, 'w') as fp2:
            for entry in self._iterfile():
                if float(entry.score) >= min_go_score:
                    gene1 = self.uni2gene[entry.prod1]
                    gene2 = self.uni2gene[entry.prod2]
                    fp2.write('%s\t%s\t%s\n' %
                              ('\t'.join(sorted([gene1, gene2])),
                               entry.score, entry.denominator))

        #sorting
        inname_gso = inname_g + '.so'
        lib_utils.sort_tsv_by_col2(inname_g, [1, 2, 3], ['V', 'V', 'g'], False,
                                   inname_gso, tmpD)  #temp dir

        #take max funSim for each pair
        # best[0] tracks records with denominator <= 1, best[1] the others;
        # each slot is [max score, denominator of that record]
        best = [[0., 0], [0., 0]]

        def _flush(out, pair):
            # Prefer the denominator > 1 slot; write nothing when neither
            # slot was filled (this also covers the very first record).
            for mx_score, mx_denom in (best[1], best[0]):
                if mx_denom > 0:
                    out.write('%s\t%g\t%d\n' % (pair, mx_score, mx_denom))
                    return

        # No need to peek at the first line of the sorted file: with both
        # slots empty the first pair change writes nothing, so starting from
        # None is equivalent and also works for an empty file.
        prev_pair = None

        self.inname = inname_gso
        with open(inname_g, 'w') as fp2:
            for entry in self._iterfile():
                pair = '%s\t%s' % (entry.prod1, entry.prod2)
                if pair != prev_pair:  #wrap up prev
                    _flush(fp2, prev_pair)
                    prev_pair = pair
                    best = [[0., 0], [0., 0]]

                score = float(entry.score)
                denominator = int(entry.denominator)
                slot = best[1] if denominator > 1 else best[0]
                if score > slot[0]:
                    slot[0] = score
                    slot[1] = denominator

            #don't forget the last entries
            _flush(fp2, prev_pair)

        self.inname = inname_g
        os.unlink(inname_gso)
        # shutil instead of a shell 'rm -rf': safe for paths containing
        # spaces or shell metacharacters
        shutil.rmtree(tmpD, ignore_errors=True)
예제 #6
0
    def convert_node2gene(self):
        '''
        Write one ranked row per reported gene to ``self.dv.gene_rank_fn``.

        Each row holds: gene, predicted (harmonic) score, seed score,
        genotype damage score, phenotype score and a known-pathogenic flag
        ('Y'/'N', or 'NA' when ``self.dv.vknown`` is false).  A gene is
        reported when ``self.dv.gt_dmg`` is empty, or when the gene is in
        ``self.dv.gt_dmg`` with a positive score.  As a side effect the
        predicted score is stored in ``self.dv.gene_dmg[gene][1]``.

        Rows are written to ``<gene_rank_fn>.tmp``, which is passed to
        ``lib_utils.sort_tsv_by_col2`` (column 2) and then removed.
        '''
        dv = self.dv
        row_fmt = '%s\t%g\t%g\t%g\t%g\t%s\n'

        def _is_reported(gene):
            # no genotype table at all means "report everything"
            return (not dv.gt_dmg) or \
                (gene in dv.gt_dmg and dv.gt_dmg[gene].score > 0.)

        def _gt_score(gene):
            # when gt_dmg is empty/None there is nothing to index; report 0
            # instead of failing on the lookup
            if not dv.gt_dmg:
                return 0.
            return dv.gt_dmg[gene].score

        def _pheno_and_known(gene):
            pheno_sc = 0.
            if gene in dv.pheno_dmg:
                pheno_sc = dv.pheno_dmg[gene].score

            if dv.vknown:
                if gene in dv.vknown_genes: is_vknown = 'Y'
                else: is_vknown = 'N'
            else: is_vknown = 'NA'
            return pheno_sc, is_vknown

        rank_fn_tmp = '%s.tmp' % dv.gene_rank_fn
        fp2 = lib_utils.open2(rank_fn_tmp, 'w')
        try:
            fp2.write(
                '#gene\tpredicted_score\tseed_score\tgt_dmg_score\tpheno_score\tcontain_known_pathogenic\n'
            )
            genes_printed = set()

            #browsing each node in the whole (original) ppi network
            for n, protein in enumerate(self.Prots):
                seed_score = 0.
                gene = protein

                #check if this node (restart value) was assigned previously
                if protein in self.dProt2Gene:
                    gene = self.dProt2Gene[protein]

                    if gene in dv.gene_dmg:
                        seed_score = dv.gene_dmg[gene][0]

                #to get harmonic score and save into dv.gene_dmg
                pred_score = 0.
                if self.harmonic_sc[n][0] > 0.:
                    pred_score = self.harmonic_sc[n][0]
                    if gene in dv.gene_dmg:
                        dv.gene_dmg[gene][1] = pred_score

                if not _is_reported(gene):
                    continue

                pheno_sc, is_vknown = _pheno_and_known(gene)

                # a gene reached through several proteins is disambiguated
                # as 'gene|protein' from its second occurrence on
                if gene in genes_printed:
                    gene2 = '%s|%s' % (gene, protein)
                else:
                    gene2 = gene
                    genes_printed.add(gene)

                fp2.write(row_fmt % (gene2, pred_score, seed_score,
                                     _gt_score(gene), pheno_sc, is_vknown))

            #repeat the same procedure to dangled nodes
            for n, gene in enumerate(self.dangledGenes):

                dv.gene_dmg[gene][1] = self.harmonic_dng_sc[n][0]

                if not _is_reported(gene):
                    continue

                pheno_sc, is_vknown = _pheno_and_known(gene)

                # same numeric formatting as the rows written above
                fp2.write(row_fmt % (gene, dv.gene_dmg[gene][1],
                                     dv.gene_dmg[gene][0],
                                     _gt_score(gene), pheno_sc, is_vknown))
        finally:
            # close even when a lookup above raises
            fp2.close()

        #sort by score
        lib_utils.sort_tsv_by_col2(
            rank_fn_tmp, [2], ['gr'], False, dv.gene_rank_fn)
        os.unlink(rank_fn_tmp)
예제 #7
0
    def create_bed(self, ext_bp=0, reuse=False):
        """Create a merged BED file from ``self.refGene_fn`` and return its path.

        Each row of ``self.refGene_fn`` is expanded into one BED row per
        (start, end) pair taken from its 10th and 11th tab-separated columns
        (comma-terminated lists), with the start decreased and the end
        increased by ``ext_bp``.  The annotation column is built from the
        row's 13th and 2nd columns (presumably gene symbol and transcript id
        of the UCSC refGene table -- TODO confirm).  The rows are sorted with
        ``lib_utils.sort_tsv_by_col2`` and then overlapping or touching
        intervals on the same chromosome are merged.

        Args:
            ext_bp: number of bases by which each interval is extended on
                both sides.
            reuse: when true and ``lib_utils.check_if_file_valid`` accepts an
                existing output file, return it without rebuilding.

        Returns:
            ``self.bed_fn``, i.e. ``refGene_e<ext_bp>_so_merged.bed`` under
            ``self.work_dir``.
        """
        job_name = 'RefGeneUcscTB.create_bed'

        self.bed_fn = os.path.join(self.work_dir,
                                   'refGene_e%d_so_merged.bed' % ext_bp)

        msg = 'creating a bed file[%s] containing RefGene coding region (cmpl/incmpl/unk) @ %s' % (
            self.bed_fn, job_name)

        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        if reuse and lib_utils.check_if_file_valid(self.bed_fn):
            msg = 'reuse bed file [%s] generated previously @ %s' % (
                self.bed_fn, job_name)
            lib_utils.msgout('notice', msg)
            if self.logger: self.logger.info(msg)
            return self.bed_fn

        #to get a working directory
        tmp_bed = os.path.join(self.work_dir, 'refGene_e%d.bed' % ext_bp)

        # one BED row per exon; handles are closed even if a row is malformed
        with open(self.refGene_fn, 'r') as fp, open(tmp_bed, 'w') as fp2:
            for i in fp:
                j = i.rstrip().split('\t')
                chrom = j[2]

                # both lists end with a trailing comma, hence the [:-1]
                for e1, e2 in zip(j[9].split(',')[:-1], j[10].split(',')[:-1]):
                    e1_ext = int(e1) - ext_bp
                    e2_ext = int(e2) + ext_bp
                    fp2.write('%s\t%d\t%d\t%s;%s\n' %
                              (chrom, e1_ext, e2_ext, j[12], j[1]))

        msg = 'sorting bed file ... @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        tmp_so_bed = os.path.join(self.work_dir, 'refGene_e%d_so.bed' % ext_bp)
        #sort
        lib_utils.sort_tsv_by_col2(tmp_bed, [1, 2, 3], ['V', 'n', 'n'], True,
                                   tmp_so_bed)

        msg = 'merging exon coordinates overlapped each other... @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        row_fmt = '%s\t%d\t%d\t%s\n'

        #merge boundaries if any overlapped
        with open(tmp_so_bed, 'r') as fp, open(self.bed_fn, 'w') as fp2:
            chromp = None  # None until the first row has been read
            e1p = e2p = 0
            annotp = ''
            for i in fp:
                chrom, e1, e2, annot = i.rstrip().split('\t')
                e1 = int(e1)
                e2 = int(e2)

                if chromp is None:
                    # first row just seeds the running interval
                    chromp, e1p, e2p, annotp = chrom, e1, e2, annot
                    continue

                if chrom != chromp or e2p < e1:
                    # no overlap: wrap up the running interval
                    fp2.write(row_fmt % (chromp, e1p, e2p, annotp))
                    chromp, e1p, e2p, annotp = chrom, e1, e2, annot
                elif e2p < e2:
                    # overlap that extends the running interval.
                    # NOTE(review): a row fully contained in the running
                    # interval contributes no annotation (kept as before) --
                    # confirm this is intended.
                    e2p = e2
                    annotp += '|%s' % annot

            # flush the last interval; nothing to flush for an empty input
            if chromp is not None:
                fp2.write(row_fmt % (chromp, e1p, e2p, annotp))

        os.unlink(tmp_bed)
        os.unlink(tmp_so_bed)

        msg = 'done. @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        return self.bed_fn
예제 #8
0
def reformat_go_sim_fns(go_sim_fns,
                        out_fn,
                        method_id=1):  #method_id (1) means SimRel
    """Merge per-category GO similarity files into one dense table.

    Stage 1 concatenates every input file (minus its first line) into
    ``out_fn``, appending the category label as a last column; rows with
    only two columns get ``-1.`` as their score.  Stage 2 passes the file to
    ``lib_utils.sort_tsv_by_col2`` (columns 1, 2 and 4).  Stage 3 collapses
    consecutive rows sharing the first two columns into a single row
    ``prot1, prot2, BP, MF, CC, method_id`` where a missing category is
    ``-1``; the result replaces ``out_fn``.

    Args:
        go_sim_fns: mapping whose values are input file paths.  When a key is
            one of 'BP', 'MF', 'CC' it is used as the category label;
            otherwise the label is taken by iteration position, in the
            order BP, MF, CC.
        out_fn: path of the output file (also used for intermediate data).
        method_id: integer written as the last column of every output row.

    Raises:
        KeyError: if a sorted row carries a category other than BP/MF/CC.
        ValueError: if a sorted row does not have exactly four columns.
    """
    suflabs = ['BP', 'MF', 'CC']
    fp2 = lib_utils.open2(out_fn, 'w')
    try:
        for v, (key, go_sim_fn) in enumerate(go_sim_fns.items()):  #BP,MF,CC
            print('appending root node at the end of [%s]' % go_sim_fn)
            # Trust the key when it names a category: the iteration order of
            # a plain dict does not have to be BP, MF, CC.
            suflab = key if key in suflabs else suflabs[v]
            fp = lib_utils.open2(go_sim_fn, 'r')
            try:
                next(fp, None)  #strip off head (an empty file is tolerated)
                for i in fp:
                    j = i.rstrip().split('\t')
                    if len(j) == 2:
                        j.append('-1.')
                    #uniprot1,uniprot2,score,BP
                    fp2.write('%s\t%s\n' % ('\t'.join(j), suflab))
            finally:
                fp.close()
            print('done.')
    finally:
        fp2.close()

    print('sorting...')
    #to get temporary file to sort
    out_fn2 = lib_utils.file_tag2(out_fn, 'sort', None)
    temp_sort_dir, _, _, _ = lib_utils.separateDirFn2(out_fn)
    lib_utils.sort_tsv_by_col2(out_fn, [1, 2, 4], ['V', 'V', 'V'],
                               False,
                               out_fn2,
                               temp_dir=temp_sort_dir)
    os.rename(out_fn2, out_fn)
    print('done.')

    #grouping
    print('collapsing GO sim scores to make the format easier to import SQL [%s] ...' % out_fn)
    out_fn2 = lib_utils.file_tag2(out_fn, 'dense', None)

    idx = {'BP': 0, 'MF': 1, 'CC': 2}
    prev_key = None  # None until the first row has been read
    gosim_holder = ['-1', '-1', '-1']  #-1 means N/A

    fp2 = lib_utils.open2(out_fn2, 'w')
    try:
        fp = lib_utils.open2(out_fn, 'r')
        try:
            for i in fp:
                prot1, prot2, score, go_class = i.rstrip().split('\t')
                key = '%s\t%s' % (prot1, prot2)
                if key != prev_key:
                    if prev_key is not None:  #wrap up the previous pair
                        fp2.write('%s\t%s\t%d\n' %
                                  (prev_key, '\t'.join(gosim_holder),
                                   method_id))
                        gosim_holder = ['-1', '-1', '-1']
                    prev_key = key
                gosim_holder[idx[go_class]] = score
        finally:
            fp.close()

        #don't forget the last entry (there is none when the input is empty)
        if prev_key is not None:
            fp2.write('%s\t%s\t%d\n' %
                      (prev_key, '\t'.join(gosim_holder), method_id))
    finally:
        fp2.close()
    os.rename(out_fn2, out_fn)

    print('done.')