Пример #1
0
def exonSkip_proc_annot(inReportFileName,outReportFileName,inCnaGctFileName=None):
	"""Annotate an exon-skipping report and write it out as tab-separated text.

	Every input line must hold eight tab-separated fields:
	sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg.  t1 and t2 are
	comma-separated entries of the form '<transcript>.exon<N>/<M>'; entries
	that do not follow that form are ignored.

	Every output line holds sixteen tab-separated fields:
	sampN, bp1, bp2, t1, t2, '<transcript>:<frameCons result>' entries,
	gene names, '<gene>:<cnaDB.query result>' entries, gene desc/summary,
	census attributes, 'go' values, 'kegg' values, 'bioc' values,
	nmatch, nseq, nreg.

	Arguments:
	inReportFileName -- path of the report to read
	outReportFileName -- path of the annotated report (opened with mode 'w')
	inCnaGctFileName -- optional file handed to mygenome.tcgaCnaDB; when it
		is given, sampN must contain an id of the form TCGA-NN-NNNN, which
		is used as the first argument of cnaDB.query
	"""

	geneDB = mygenome.getGeneDB()
	frameInfoH = mygenome.getFrameInfoH()

	if inCnaGctFileName:
		cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
	else:
		cnaDB = None

	# Compiled once instead of once per line; raw strings keep '\.' a valid escape.
	exonPat = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
	indivPat = re.compile(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*')

	# The context managers close both files (flushing the output) even on error.
	with open(outReportFileName,'w') as outReportFile, open(inReportFileName) as inReportFile:

		for line in inReportFile:

			# rstrip('\n') instead of line[:-1]: a final line without a newline
			# must not lose the last character of nreg.
			(sampN,bp1,bp2,t1,t2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

			if inCnaGctFileName:
				indivId = indivPat.match(sampN).group(1)

			geneS = set()
			geneH = {}

			for tL in (t1,t2):

				for entry in tL.split(','):

					ro = exonPat.match(entry)

					# Skip entries (e.g. an empty field) that are not transcript/exon pairs.
					if not ro:
						continue

					transId = ro.group(1)
					exnNum = int(ro.group(2))

					# presumably collects exon numbers per transcript in a list -- confirm in mybasic
					mybasic.addHash(geneH,transId,exnNum)

					g = mygenome.gene(transId,geneDB=geneDB)

					if g.geneName:
						geneS.add(g.geneName)

			frameL = []

			for (transId,exnList) in geneH.items():

				# Only transcripts with exactly two recorded exons are evaluated.
				if len(exnList) != 2:
					continue

				# The exons are passed in the order they were recorded (not sorted).
				cons = mygenome.frameCons(transId,exnList[0], transId,exnList[1],frameInfoH)

				if cons:
					frameL.append('%s:%s' % (transId,cons))

			cnaInfo = []
			geneInfo = []
			censusInfo = []

			goInfoS = set()
			keggInfoS = set()
			biocInfoS = set()

			for geneName in geneS:

				gene = mygenome.gene(geneName,geneDB=geneDB)

				if cnaDB:
					cnaInfo.append('%s:%s' % (geneName,cnaDB.query(indivId,geneName)))

				geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
				censusInfo.append('%s:%s:%s:%s' % (gene.getAttr('census_somatic'),gene.getAttr('census_germline'),gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))

				goInfoS.update(gene.getAttr('go'))
				keggInfoS.update(gene.getAttr('kegg'))
				# NOTE(review): fusion_proc_annot reads 'biocarta' here -- confirm which name the gene DB uses.
				biocInfoS.update(gene.getAttr('bioc'))

			outReportFile.write('\t'.join((
				sampN, bp1, bp2, t1, t2,
				','.join(frameL), ';'.join(geneS), ','.join(cnaInfo),
				';'.join(geneInfo), ';'.join(censusInfo),
				';'.join(map(str,goInfoS)), ';'.join(map(str,keggInfoS)), ';'.join(map(str,biocInfoS)),
				nmatch, nseq, nreg)) + '\n')
Пример #2
0
def fusion_proc_annot(inReportFileName,outReportFileName,inCnaGctFileName=None):
	"""Annotate a fusion report and write it out as tab-separated text.

	Every input line must hold nine tab-separated fields:
	splice_type, sampN, bp1, bp2, teStr1, teStr2, nmatch, nseq, nreg.
	bp1/bp2 have the form '<+|-><chr...>:<position>'.  teStr1/teStr2 are
	comma-separated entries of the form '<transcript>.exon<N>/<M>'; entries
	that do not follow that form are ignored.

	Every output line holds 25 tab-separated fields:
	sampN, splice_type, 'intra' or 'inter' (same / different chromosome),
	bp1, bp2, teStr1, teStr2, the transcript/exon pairs for which
	mygenome.frameCons returned 'Y', CNA entries of side 1, CNA entries of
	side 2, then for each side in turn: '<transcript genes>;<other genes
	overlapping the breakpoint>', gene desc/summary, census attributes,
	'go' values, 'kegg' values, 'biocarta' values; finally nmatch, nseq, nreg.

	Arguments:
	inReportFileName -- path of the report to read
	outReportFileName -- path of the annotated report (opened with mode 'w')
	inCnaGctFileName -- optional file handed to mygenome.tcgaCnaDB; when it
		is given, sampN must contain an id of the form TCGA-NN-NNNN, which
		is used as the first argument of cnaDB.query
	"""

	geneDB = mygenome.getGeneDB()
	frameInfoH = mygenome.getFrameInfoH()
	refFlatH = mygenome.loadRefFlatByChr()

	if inCnaGctFileName:
		cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
	else:
		cnaDB = None

	# Compiled once instead of once per line; raw strings keep '\.' a valid escape.
	exonPat = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
	bpPat = re.compile(r'([+-])(chr[^:]*):([0-9]*)')
	indivPat = re.compile(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*')

	# The context managers close both files (flushing the output) even on error.
	with open(outReportFileName,'w') as outReportFile, open(inReportFileName) as inReportFile:

		for line in inReportFile:

			# rstrip('\n') instead of line[:-1]: a final line without a newline
			# must not lose the last character of nreg.
			(splice_type,sampN,bp1,bp2,teStr1,teStr2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

			if inCnaGctFileName:
				indivId = indivPat.match(sampN).group(1)

			# One entry of per-side annotations for each of the two breakpoints.
			geneStatL = []

			for (bp,teStr) in ((bp1,teStr1),(bp2,teStr2)):

				geneS = set()
				teL = []

				for te in teStr.split(','):

					rm = exonPat.match(te)

					if rm:

						t = rm.group(1)
						e = int(rm.group(2))

						g = mygenome.gene(t,geneDB=geneDB)

						if g.geneName:
							geneS.add(g.geneName)

						teL.append((t,e))

				(strand,chrom,pos) = bpPat.match(bp).groups()

				# One-base locus ending at the breakpoint, on the breakpoint's strand.
				bpLocus = mygenome.locus('%s:%s-%s%s' % (chrom,int(pos)-1,pos,strand))

				# Genes overlapping the breakpoint that the transcripts did not already name.
				bp_geneS = set(bpLocus.overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True))
				bp_geneS = bp_geneS.difference(geneS)

				cnaInfo = []
				geneInfo = []
				censusInfo = []

				goInfoS = set()
				keggInfoS = set()
				biocartaInfoS = set()

				for geneName in list(geneS) + list(bp_geneS):

					gene = mygenome.gene(geneName,geneDB=geneDB)

					if cnaDB:
						cnaInfo.append('%s:%s' % (geneName,cnaDB.query(indivId,geneName)))

					geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
					censusInfo.append('%s:%s:%s:%s' % (gene.getAttr('census_somatic'),gene.getAttr('census_germline'),gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))
					goInfoS.update(gene.getAttr('go'))
					keggInfoS.update(gene.getAttr('kegg'))
					biocartaInfoS.update(gene.getAttr('biocarta'))

				# Record the chromosome of THIS breakpoint (without the strand sign).
				# Taking it from bp1 for both sides made every fusion look 'intra'.
				geneStatL.append((chrom,teL,geneS,bp_geneS,cnaInfo,geneInfo,censusInfo,goInfoS,keggInfoS,biocartaInfoS))

			(chrom1,teL1,geneS1,bp_geneS1,cnaInfo1,geneInfo1,censusInfo1,goInfoS1,keggInfoS1,biocartaInfoS1) = geneStatL[0]
			(chrom2,teL2,geneS2,bp_geneS2,cnaInfo2,geneInfo2,censusInfo2,goInfoS2,keggInfoS2,biocartaInfoS2) = geneStatL[1]

			if chrom1 == chrom2:
				fusionType = 'intra'
			else:
				fusionType = 'inter'

			frameL = []

			# Try every transcript/exon pairing across the two sides; keep the 'Y' calls.
			for (t1,e1) in teL1:

				for (t2,e2) in teL2:

					cons = mygenome.frameCons(t1,e1, t2,e2, frameInfoH)

					if cons=='Y':
						frameL.append('%s.%s-%s.%s:%s' % (t1,e1,t2,e2,cons))

			outReportFile.write('\t'.join((
				sampN, splice_type, fusionType, bp1, bp2, teStr1, teStr2,
				','.join(frameL), ','.join(cnaInfo1), ','.join(cnaInfo2),
				'%s;%s' % (','.join(geneS1),','.join(bp_geneS1)), ';'.join(geneInfo1), ';'.join(censusInfo1),
				';'.join(map(str,goInfoS1)), ';'.join(map(str,keggInfoS1)), ';'.join(map(str,biocartaInfoS1)),
				'%s;%s' % (','.join(geneS2),','.join(bp_geneS2)), ';'.join(geneInfo2), ';'.join(censusInfo2),
				';'.join(map(str,goInfoS2)), ';'.join(map(str,keggInfoS2)), ';'.join(map(str,biocartaInfoS2)),
				nmatch, nseq, nreg)) + '\n')
Пример #3
0
def exonSkip_proc_annot(inReportFileName,
                        outReportFileName,
                        inCnaGctFileName=None):
    """Annotate an exon-skipping report and write it out as tab-separated text.

    Every input line must hold eight tab-separated fields:
    sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg.  t1 and t2 are
    comma-separated entries of the form '<transcript>.exon<N>/<M>'; entries
    that do not follow that form are ignored.

    Every output line holds sixteen tab-separated fields:
    sampN, bp1, bp2, t1, t2, '<transcript>:<frameCons result>' entries,
    gene names, '<gene>:<cnaDB.query result>' entries, gene desc/summary,
    census attributes, 'go' values, 'kegg' values, 'bioc' values,
    nmatch, nseq, nreg.

    Arguments:
    inReportFileName -- path of the report to read
    outReportFileName -- path of the annotated report (opened with mode 'w')
    inCnaGctFileName -- optional file handed to mygenome.tcgaCnaDB; when it
        is given, sampN must contain an id of the form TCGA-NN-NNNN, which
        is used as the first argument of cnaDB.query
    """

    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()

    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
    else:
        cnaDB = None

    # Compiled once instead of once per line; raw strings keep '\.' a valid
    # escape.
    exonPat = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
    indivPat = re.compile(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*')

    # The context managers close both files (flushing the output) even on
    # error.
    with open(outReportFileName, 'w') as outReportFile, \
            open(inReportFileName) as inReportFile:

        for line in inReportFile:

            # rstrip('\n') instead of line[:-1]: a final line without a
            # newline must not lose the last character of nreg.
            (sampN, bp1, bp2, t1, t2, nmatch, nseq,
             nreg) = line.rstrip('\n').split('\t')

            if inCnaGctFileName:
                indivId = indivPat.match(sampN).group(1)

            geneS = set()
            geneH = {}

            for tL in (t1, t2):

                for entry in tL.split(','):

                    ro = exonPat.match(entry)

                    # Skip entries (e.g. an empty field) that are not
                    # transcript/exon pairs.
                    if not ro:
                        continue

                    transId = ro.group(1)
                    exnNum = int(ro.group(2))

                    # presumably collects exon numbers per transcript in a
                    # list -- confirm in mybasic
                    mybasic.addHash(geneH, transId, exnNum)

                    g = mygenome.gene(transId, geneDB=geneDB)

                    if g.geneName:
                        geneS.add(g.geneName)

            frameL = []

            for (transId, exnList) in geneH.items():

                # Only transcripts with exactly two recorded exons are
                # evaluated.
                if len(exnList) != 2:
                    continue

                # The exons are passed in the order they were recorded (not
                # sorted).
                cons = mygenome.frameCons(transId, exnList[0], transId,
                                          exnList[1], frameInfoH)

                if cons:
                    frameL.append('%s:%s' % (transId, cons))

            cnaInfo = []
            geneInfo = []
            censusInfo = []

            goInfoS = set()
            keggInfoS = set()
            biocInfoS = set()

            for geneName in geneS:

                gene = mygenome.gene(geneName, geneDB=geneDB)

                if cnaDB:
                    cnaInfo.append('%s:%s' %
                                   (geneName, cnaDB.query(indivId, geneName)))

                geneInfo.append(
                    '%s:%s:%s' %
                    (geneName, gene.getAttr('desc'), gene.getAttr('summary')))
                censusInfo.append('%s:%s:%s:%s' %
                                  (gene.getAttr('census_somatic'),
                                   gene.getAttr('census_germline'),
                                   gene.getAttr('census_mutType'),
                                   gene.getAttr('census_translocPartners')))

                goInfoS.update(gene.getAttr('go'))
                keggInfoS.update(gene.getAttr('kegg'))
                # NOTE(review): fusion_proc_annot reads 'biocarta' here --
                # confirm which name the gene DB uses.
                biocInfoS.update(gene.getAttr('bioc'))

            outReportFile.write('\t'.join((
                sampN, bp1, bp2, t1, t2,
                ','.join(frameL), ';'.join(geneS), ','.join(cnaInfo),
                ';'.join(geneInfo), ';'.join(censusInfo),
                ';'.join(map(str, goInfoS)),
                ';'.join(map(str, keggInfoS)),
                ';'.join(map(str, biocInfoS)),
                nmatch, nseq, nreg)) + '\n')
Пример #4
0
def fusion_proc_annot(inReportFileName,
                      outReportFileName,
                      inCnaGctFileName=None):
    """Annotate a fusion report and write it out as tab-separated text.

    Every input line must hold nine tab-separated fields:
    splice_type, sampN, bp1, bp2, teStr1, teStr2, nmatch, nseq, nreg.
    bp1/bp2 have the form '<+|-><chr...>:<position>'.  teStr1/teStr2 are
    comma-separated entries of the form '<transcript>.exon<N>/<M>'; entries
    that do not follow that form are ignored.

    Every output line holds 25 tab-separated fields:
    sampN, splice_type, 'intra' or 'inter' (same / different chromosome),
    bp1, bp2, teStr1, teStr2, the transcript/exon pairs for which
    mygenome.frameCons returned 'Y', CNA entries of side 1, CNA entries of
    side 2, then for each side in turn: '<transcript genes>;<other genes
    overlapping the breakpoint>', gene desc/summary, census attributes,
    'go' values, 'kegg' values, 'biocarta' values; finally nmatch, nseq,
    nreg.

    Arguments:
    inReportFileName -- path of the report to read
    outReportFileName -- path of the annotated report (opened with mode 'w')
    inCnaGctFileName -- optional file handed to mygenome.tcgaCnaDB; when it
        is given, sampN must contain an id of the form TCGA-NN-NNNN, which
        is used as the first argument of cnaDB.query
    """

    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()
    refFlatH = mygenome.loadRefFlatByChr()

    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
    else:
        cnaDB = None

    # Compiled once instead of once per line; raw strings keep '\.' a valid
    # escape.
    exonPat = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
    bpPat = re.compile(r'([+-])(chr[^:]*):([0-9]*)')
    indivPat = re.compile(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*')

    # The context managers close both files (flushing the output) even on
    # error.
    with open(outReportFileName, 'w') as outReportFile, \
            open(inReportFileName) as inReportFile:

        for line in inReportFile:

            # rstrip('\n') instead of line[:-1]: a final line without a
            # newline must not lose the last character of nreg.
            (splice_type, sampN, bp1, bp2, teStr1, teStr2, nmatch, nseq,
             nreg) = line.rstrip('\n').split('\t')

            if inCnaGctFileName:
                indivId = indivPat.match(sampN).group(1)

            # One entry of per-side annotations for each of the two
            # breakpoints.
            geneStatL = []

            for (bp, teStr) in ((bp1, teStr1), (bp2, teStr2)):

                geneS = set()
                teL = []

                for te in teStr.split(','):

                    rm = exonPat.match(te)

                    if rm:

                        t = rm.group(1)
                        e = int(rm.group(2))

                        g = mygenome.gene(t, geneDB=geneDB)

                        if g.geneName:
                            geneS.add(g.geneName)

                        teL.append((t, e))

                (strand, chrom, pos) = bpPat.match(bp).groups()

                # One-base locus ending at the breakpoint, on the
                # breakpoint's strand.
                bpLocus = mygenome.locus('%s:%s-%s%s' %
                                         (chrom, int(pos) - 1, pos, strand))

                # Genes overlapping the breakpoint that the transcripts did
                # not already name.
                bp_geneS = set(
                    bpLocus.overlappingGeneL(refFlatH=refFlatH,
                                             strand_sensitive=True))
                bp_geneS = bp_geneS.difference(geneS)

                cnaInfo = []
                geneInfo = []
                censusInfo = []

                goInfoS = set()
                keggInfoS = set()
                biocartaInfoS = set()

                for geneName in list(geneS) + list(bp_geneS):

                    gene = mygenome.gene(geneName, geneDB=geneDB)

                    if cnaDB:
                        cnaInfo.append(
                            '%s:%s' %
                            (geneName, cnaDB.query(indivId, geneName)))

                    geneInfo.append('%s:%s:%s' %
                                    (geneName, gene.getAttr('desc'),
                                     gene.getAttr('summary')))
                    censusInfo.append(
                        '%s:%s:%s:%s' %
                        (gene.getAttr('census_somatic'),
                         gene.getAttr('census_germline'),
                         gene.getAttr('census_mutType'),
                         gene.getAttr('census_translocPartners')))
                    goInfoS.update(gene.getAttr('go'))
                    keggInfoS.update(gene.getAttr('kegg'))
                    biocartaInfoS.update(gene.getAttr('biocarta'))

                # Record the chromosome of THIS breakpoint (without the
                # strand sign).  Taking it from bp1 for both sides made every
                # fusion look 'intra'.
                geneStatL.append(
                    (chrom, teL, geneS, bp_geneS, cnaInfo, geneInfo,
                     censusInfo, goInfoS, keggInfoS, biocartaInfoS))

            (chrom1, teL1, geneS1, bp_geneS1, cnaInfo1, geneInfo1,
             censusInfo1, goInfoS1, keggInfoS1, biocartaInfoS1) = geneStatL[0]
            (chrom2, teL2, geneS2, bp_geneS2, cnaInfo2, geneInfo2,
             censusInfo2, goInfoS2, keggInfoS2, biocartaInfoS2) = geneStatL[1]

            if chrom1 == chrom2:
                fusionType = 'intra'
            else:
                fusionType = 'inter'

            frameL = []

            # Try every transcript/exon pairing across the two sides; keep
            # the 'Y' calls.
            for (t1, e1) in teL1:

                for (t2, e2) in teL2:

                    cons = mygenome.frameCons(t1, e1, t2, e2, frameInfoH)

                    if cons == 'Y':
                        frameL.append('%s.%s-%s.%s:%s' %
                                      (t1, e1, t2, e2, cons))

            outReportFile.write('\t'.join((
                sampN, splice_type, fusionType, bp1, bp2, teStr1, teStr2,
                ','.join(frameL), ','.join(cnaInfo1), ','.join(cnaInfo2),
                '%s;%s' % (','.join(geneS1), ','.join(bp_geneS1)),
                ';'.join(geneInfo1), ';'.join(censusInfo1),
                ';'.join(map(str, goInfoS1)),
                ';'.join(map(str, keggInfoS1)),
                ';'.join(map(str, biocartaInfoS1)),
                '%s;%s' % (','.join(geneS2), ','.join(bp_geneS2)),
                ';'.join(geneInfo2), ';'.join(censusInfo2),
                ';'.join(map(str, goInfoS2)),
                ';'.join(map(str, keggInfoS2)),
                ';'.join(map(str, biocartaInfoS2)),
                nmatch, nseq, nreg)) + '\n')