Пример #1
0
def gene_annot(geneN_idx=0, inFileN='', outFileN='', have_header=True):
    """Append gene annotation columns to each row of a tab-delimited table.

    Every input row is written back followed by five extra columns
    (geneInfo, census, GO, KEGG, Biocarta) built from the attributes that
    ``mygenome.gene`` returns for the gene named in column ``geneN_idx``.

    Args:
        geneN_idx: 0-based index of the column that holds the gene name.  If
            the cell is a comma-separated list only its first entry is used.
        inFileN: Input path; the empty string selects ``sys.stdin``.
        outFileN: Output path; the empty string selects ``sys.stdout``.
        have_header: When true the first line is copied through as the
            header.  When false a header ``X1 .. Xn`` is synthesised from the
            column count of the first line, which is then processed as data.
    """
    import itertools  # function-scope import; keeps this block self-contained

    geneDB = mygenome.getGeneDB()

    inFile = open(inFileN, 'r') if inFileN != '' else sys.stdin
    try:
        outFile = open(outFileN, 'w') if outFileN != '' else sys.stdout
        try:
            first = inFile.readline()
            if have_header:
                header = first.rstrip('\n')
                rows = inFile
            else:
                # The first line is data: count its columns, then push it back
                # in front of the remaining lines.  This avoids tell()/seek(),
                # which fail when the input is a pipe.
                ncol = len(first.rstrip().split('\t'))
                header = '\t'.join('X%s' % (i + 1) for i in range(ncol))
                rows = itertools.chain([first] if first else [], inFile)
            outFile.write(
                '%s\tgeneInfo\tcensus\tGO\tKEGG\tBiocarta\n' % header)

            for line in rows:
                # Strip only the newline so a final line without one keeps
                # its last character.
                tokL = line.rstrip('\n').split('\t')
                geneName = tokL[geneN_idx].split(',')[0]

                gene = mygenome.gene(geneName, geneDB=geneDB)

                geneInfo = '%s:%s:%s' % (
                    geneName, gene.getAttr('desc'), gene.getAttr('summary'))
                censusInfo = '%s:%s:%s:%s' % (
                    gene.getAttr('census_somatic'),
                    gene.getAttr('census_germline'),
                    gene.getAttr('census_mutType'),
                    gene.getAttr('census_translocPartners'))

                # Sets de-duplicate repeated pathway/term entries.
                goInfoS = set(gene.getAttr('go'))
                keggInfoS = set(gene.getAttr('kegg'))
                biocInfoS = set(gene.getAttr('biocarta'))

                outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    '\t'.join(tokL), geneInfo, censusInfo,
                    ';'.join(map(str, goInfoS)),
                    ';'.join(map(str, keggInfoS)),
                    ';'.join(map(str, biocInfoS))))

            outFile.flush()
        finally:
            # Never close the process-wide standard streams.
            if outFile is not sys.stdout:
                outFile.close()
    finally:
        if inFile is not sys.stdin:
            inFile.close()
Пример #2
0
def gene_annot(inReportFileName, outReportFileName):
    """Append gene annotation columns to a tab-delimited report with a header.

    The gene-name column is located by header name.  Recognised names, from
    highest to lowest priority, are ``SYMBOL``, ``gene_symL`` and ``geneN``.
    If a cell holds a ';'-separated list only its first entry is annotated.
    Each output row is the input row followed by geneInfo, census, GO, KEGG
    and Biocarta columns built from ``mygenome.gene`` attributes.

    Args:
        inReportFileName: Path of the input report (first line is the header).
        outReportFileName: Path of the annotated report to write.

    Raises:
        ValueError: If the header contains none of the recognised gene
            columns.  Raised before the output file is created.
    """
    with open(inReportFileName) as inFile:

        header = inFile.readline().rstrip('\n')
        headerL = header.split('\t')

        geneN_idx = None
        for colName in ('SYMBOL', 'gene_symL', 'geneN'):
            if colName in headerL:
                geneN_idx = headerL.index(colName)
                break

        if geneN_idx is None:
            raise ValueError(
                'no gene-name column (geneN, gene_symL or SYMBOL) in header '
                'of %s' % inReportFileName)

        # Load the gene database only once the input is known to be usable.
        geneDB = mygenome.getGeneDB()

        with open(outReportFileName, 'w') as outReportFile:

            outReportFile.write(
                '%s\tgeneInfo\tcensus\tGO\tKEGG\tBiocarta\n' % header)

            for line in inFile:
                # Strip only the newline so a final line without one keeps
                # its last character.
                tokL = line.rstrip('\n').split('\t')
                geneName = tokL[geneN_idx].split(';')[0]

                gene = mygenome.gene(geneName, geneDB=geneDB)

                geneInfo = '%s:%s:%s' % (
                    geneName, gene.getAttr('desc'), gene.getAttr('summary'))
                censusInfo = '%s:%s:%s:%s' % (
                    gene.getAttr('census_somatic'),
                    gene.getAttr('census_germline'),
                    gene.getAttr('census_mutType'),
                    gene.getAttr('census_translocPartners'))

                # Sets de-duplicate repeated pathway/term entries.
                goInfoS = set(gene.getAttr('go'))
                keggInfoS = set(gene.getAttr('kegg'))
                biocInfoS = set(gene.getAttr('biocarta'))

                outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    '\t'.join(tokL), geneInfo, censusInfo,
                    ';'.join(map(str, goInfoS)),
                    ';'.join(map(str, keggInfoS)),
                    ';'.join(map(str, biocInfoS))))
Пример #3
0
def gene_annot(inReportFileName,outReportFileName):
	"""Append gene annotation columns to a tab-delimited report with a header.

	The gene-name column is located by header name.  Recognised names, from
	highest to lowest priority, are ``SYMBOL``, ``gene_symL`` and ``geneN``.
	If a cell holds a ';'-separated list only its first entry is annotated.
	Each output row is the input row followed by geneInfo, census, GO, KEGG
	and Biocarta columns built from ``mygenome.gene`` attributes.

	Args:
		inReportFileName: Path of the input report (first line is the header).
		outReportFileName: Path of the annotated report to write.

	Raises:
		ValueError: If the header contains none of the recognised gene
			columns.  Raised before the output file is created.
	"""
	with open(inReportFileName) as inFile:

		header = inFile.readline().rstrip('\n')
		headerL = header.split('\t')

		geneN_idx = None
		for colName in ('SYMBOL','gene_symL','geneN'):
			if colName in headerL:
				geneN_idx = headerL.index(colName)
				break

		if geneN_idx is None:
			raise ValueError('no gene-name column (geneN, gene_symL or SYMBOL) in header of %s' % inReportFileName)

		# Load the gene database only once the input is known to be usable.
		geneDB = mygenome.getGeneDB()

		with open(outReportFileName,'w') as outReportFile:

			outReportFile.write('%s\tgeneInfo\tcensus\tGO\tKEGG\tBiocarta\n' % header)

			for line in inFile:

				# Strip only the newline so a final line without one keeps
				# its last character.
				tokL = line.rstrip('\n').split('\t')
				geneName = tokL[geneN_idx].split(';')[0]

				gene = mygenome.gene(geneName,geneDB=geneDB)

				geneInfo = '%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary'))
				censusInfo = '%s:%s:%s:%s' % (gene.getAttr('census_somatic'),gene.getAttr('census_germline'),gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners'))

				# Sets de-duplicate repeated pathway/term entries.
				goInfoS = set(gene.getAttr('go'))
				keggInfoS = set(gene.getAttr('kegg'))
				biocInfoS = set(gene.getAttr('biocarta'))

				outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % \
					('\t'.join(tokL), geneInfo, censusInfo, \
					';'.join(map(str,goInfoS)), ';'.join(map(str,keggInfoS)), ';'.join(map(str,biocInfoS))))
Пример #4
0
def gene_annot(geneN_idx=0, inFileN="", outFileN="", have_header=True):
    """Append gene annotation columns to each row of a tab-delimited table.

    Every input row is written back followed by five extra columns
    (geneInfo, census, GO, KEGG, Biocarta) built from the attributes that
    ``mygenome.gene`` returns for the gene named in column ``geneN_idx``.

    Args:
        geneN_idx: 0-based index of the column that holds the gene name.  If
            the cell is a comma-separated list only its first entry is used.
        inFileN: Input path; the empty string selects ``sys.stdin``.
        outFileN: Output path; the empty string selects ``sys.stdout``.
        have_header: When true the first line is copied through as the
            header.  When false a header ``X1 .. Xn`` is synthesised from the
            column count of the first line, which is then processed as data.
    """
    import itertools  # function-scope import; keeps this block self-contained

    geneDB = mygenome.getGeneDB()

    inFile = open(inFileN, "r") if inFileN != "" else sys.stdin
    try:
        outFile = open(outFileN, "w") if outFileN != "" else sys.stdout
        try:
            first = inFile.readline()
            if have_header:
                header = first.rstrip("\n")
                rows = inFile
            else:
                # The first line is data: count its columns, then push it back
                # in front of the remaining lines.  This avoids tell()/seek(),
                # which fail when the input is a pipe.
                ncol = len(first.rstrip().split("\t"))
                header = "\t".join("X%s" % (i + 1) for i in range(ncol))
                rows = itertools.chain([first] if first else [], inFile)
            outFile.write("%s\tgeneInfo\tcensus\tGO\tKEGG\tBiocarta\n" % header)

            for line in rows:
                # Strip only the newline so a final line without one keeps
                # its last character.
                tokL = line.rstrip("\n").split("\t")
                geneName = tokL[geneN_idx].split(",")[0]

                gene = mygenome.gene(geneName, geneDB=geneDB)

                geneInfo = "%s:%s:%s" % (
                    geneName,
                    gene.getAttr("desc"),
                    gene.getAttr("summary"),
                )
                censusInfo = "%s:%s:%s:%s" % (
                    gene.getAttr("census_somatic"),
                    gene.getAttr("census_germline"),
                    gene.getAttr("census_mutType"),
                    gene.getAttr("census_translocPartners"),
                )

                # Sets de-duplicate repeated pathway/term entries.
                goInfoS = set(gene.getAttr("go"))
                keggInfoS = set(gene.getAttr("kegg"))
                biocInfoS = set(gene.getAttr("biocarta"))

                outFile.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\n"
                    % (
                        "\t".join(tokL),
                        geneInfo,
                        censusInfo,
                        ";".join(map(str, goInfoS)),
                        ";".join(map(str, keggInfoS)),
                        ";".join(map(str, biocInfoS)),
                    )
                )

            outFile.flush()
        finally:
            # Never close the process-wide standard streams.
            if outFile is not sys.stdout:
                outFile.close()
    finally:
        if inFile is not sys.stdin:
            inFile.close()
Пример #5
0
def exonSkip_proc(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    """Group GSNAP split-read alignments by junction and write a report.

    Every record read through ``mygsnap.gsnapFile`` must be flagged
    ``(transloc)`` and have exactly two segments in its first match.  Records
    are grouped by the pair of breakpoints (ordered donor-to-acceptor
    according to the ``dir:`` field), and groups are reported in descending
    order of their number of distinct (direction, read offset) pairs.

    Args:
        inGsnapFileName: GSNAP result file to read.
        outGsnapFileName: Receives the raw text of every grouped record, in
            report order.
        outReportFileName: Receives one tab-delimited line per junction.
        sampN: Sample name copied into the third report column.

    Raises:
        ValueError: If a record is not flagged ``(transloc)``, does not have
            two segments, or has a direction other than sense/antisense.
    """

    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH, geneSetH)
    refFlatH = mygenome.loadRefFlatByChr()

    def labelled_genes(annot):
        """Return (transcript ids, gene-name set) from a ``label_N:`` field."""
        rm = re.search(r'label_[12]:([^,\t]*)', annot)
        if not rm:
            return (), set()
        transcripts = tuple(
            [x.split('.exon')[0] for x in rm.group(1).split('|')])
        genes = set()
        for t in transcripts:
            g = mygenome.gene(t, geneNameH, geneSetH, geneInfoH)
            if g.geneName:
                genes.add(g.geneName)
        return transcripts, genes

    def transcript_strand(read_strand, direction):
        """Combine alignment strand and splice direction into a strand."""
        if (read_strand, direction) in (('+', 'sense'), ('-', 'antisense')):
            return '+'
        if (read_strand, direction) in (('+', 'antisense'), ('-', 'sense')):
            return '-'
        raise ValueError('unexpected strand/direction: %r/%r' %
                         (read_strand, direction))

    def breakpoint_genes(bp, strand):
        """Genes overlapping the 1-base locus ending at the breakpoint."""
        locus = mygenome.locus('%s:%s-%s%s' % (
            bp.group(2), int(bp.group(3)) - 1, bp.group(3), strand))
        return locus.overlappingGeneL(refFlatH=refFlatH,
                                      strand_sensitive=True)

    def describe(geneNameL):
        """Return (geneInfo list, censusInfo list) for the given gene names."""
        geneInfo = []
        censusInfo = []
        for geneName in geneNameL:
            gene = mygenome.gene(geneName, geneNameH, geneSetH, geneInfoH)
            geneInfo.append(
                '%s:%s:%s' %
                (geneName, gene.getAttr('desc'), gene.getAttr('summary')))
            censusInfo.append('%s:%s:%s:%s' %
                              (gene.getAttr('census_somatic'),
                               gene.getAttr('census_germline'),
                               gene.getAttr('census_mutType'),
                               gene.getAttr('census_translocPartners')))
        return geneInfo, censusInfo

    juncHH = {}

    for r in mygsnap.gsnapFile(inGsnapFileName, False):

        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise ValueError('record is not flagged (transloc): %r' %
                             (r.pairRel,))

        if len(match.segL) != 2:
            raise ValueError('expected 2 segments, got %d' % len(match.segL))

        seg1, seg2 = match.segL

        splice_type = re.search(r'splice_type:([^,\t]*)', seg1[3]).group(1)
        direction = re.search(r'dir:([^,\t]*)', seg1[3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', seg1[1]).group(1))

        transcript1, gene1 = labelled_genes(seg1[3])
        transcript2, gene2 = labelled_genes(seg2[3])

        # Side 1 breaks at the END of its range, side 2 at the START.
        bp1 = re.match(r'([+-])([^:]+):[0-9]+\.\.([0-9]+)', seg1[2])
        bp2 = re.match(r'([+-])([^:]+):([0-9]+)\.\.[0-9]+', seg2[2])

        trans_strand1 = transcript_strand(bp1.group(1), direction)
        trans_strand2 = transcript_strand(bp2.group(1), direction)

        bp_gene1 = breakpoint_genes(bp1, trans_strand1)
        bp_gene2 = breakpoint_genes(bp2, trans_strand2)

        # Order both sides donor-first; for antisense reads that means
        # swapping side 1 and side 2.
        if direction == 'sense':
            key = (bp1.groups()[1:], bp2.groups()[1:])
            transcript = (transcript1, transcript2)
            gene = (tuple(gene1), tuple(gene2))
            bp_gene = (bp_gene1, bp_gene2)
        elif direction == 'antisense':
            key = (bp2.groups()[1:], bp1.groups()[1:])
            transcript = (transcript2, transcript1)
            gene = (tuple(gene2), tuple(gene1))
            bp_gene = (bp_gene2, bp_gene1)
        else:
            raise ValueError('unexpected direction: %r' % (direction,))

        if key in juncHH:
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['reg'].append((direction, offset))
        else:
            # Annotation is taken from the first record seen for a junction.
            juncHH[key] = {
                'match': [r],
                'splice_type': splice_type,
                'seq': [r.seq()],
                'reg': [(direction, offset)],
                'transcript': transcript,
                'gene': gene,
                'bp_gene': bp_gene
            }

    # Most distinct (direction, offset) positions first; ties keep their
    # original relative order (stable sort).
    juncKH = sorted(juncHH.items(),
                    key=lambda kv: len(set(kv[1]['reg'])),
                    reverse=True)

    with open(outGsnapFileName, 'w') as outGsnapFile, \
            open(outReportFileName, 'w') as outReportFile:

        for (key, juncH) in juncKH:

            # Same chromosome on both sides -> intra-chromosomal.
            if key[0][0] == key[1][0]:
                junc_type = 'intra'
            else:
                junc_type = 'inter'

            geneInfo1, censusInfo1 = describe(juncH['gene'][0])
            geneInfo2, censusInfo2 = describe(juncH['gene'][1])

            fields = (
                junc_type, juncH['splice_type'], sampN,
                ':'.join(key[0]), ':'.join(key[1]),
                ';'.join(juncH['transcript'][0]),
                ';'.join(juncH['transcript'][1]),
                ';'.join(juncH['gene'][0]), ';'.join(juncH['gene'][1]),
                ';'.join(geneInfo1), ';'.join(geneInfo2),
                ';'.join(censusInfo1), ';'.join(censusInfo2),
                ','.join(juncH['bp_gene'][0]), ','.join(juncH['bp_gene'][1]),
                len(juncH['match']), len(set(juncH['seq'])),
                len(set(juncH['reg'])))

            # One '%s' per field, tab-separated.
            outReportFile.write('\t'.join(['%s'] * len(fields)) % fields +
                                '\n')

            for m in juncH['match']:
                outGsnapFile.write(m.rawText() + '\n')
Пример #6
0
def genCompositeModel(outTextFileName,outFaFileName,intronSize=100): 
	"""Write merged ("composite") gene models and their padded exon sequence.

	For each chromosome chr1..chr22, chrX, chrY and chrM, genes from
	``mygenome.loadKgByChr()`` whose gene name is in ``mygenome.TK`` are
	merged per strand with ``mygenome.mergeLoci``.  For every merged
	transcription locus the exons of its member genes are merged as well.

	Args:
		outTextFileName: Receives one tab-delimited line per merged locus:
			id, chrom, strand, start, end, exon count, exon starts, exon ends.
		outFaFileName: Receives a ``>id|chrom|strand|start|end`` header per
			locus followed by the concatenated ``nibFrag()`` output of its
			exons, always taken on the '+' strand.
		intronSize: Maximum number of flanking bases added on each internal
			side of an exon.  The flank is capped at half the gap to the
			neighbouring exon.
	"""

	geneNameH = mygenome.geneNameH()
	geneSetH = mygenome.geneSetH()
	geneInfoH = mygenome.geneInfoH(geneNameH,geneSetH)

	geneH = mygenome.loadKgByChr()
	#geneH = mygenome.loadLincByChr(h=geneH)

	def by_start_then_end(loc):
		"""Sort key: start coordinate, ties broken by end coordinate."""
		return (loc.chrSta, loc.chrEnd)

	with open(outTextFileName, 'w') as outTextFile, open(outFaFileName, 'w') as outFaFile:

		for chrNum in list(range(1,23))+['X','Y','M']:

			chrom = 'chr%s' % chrNum

			# A real list (not a lazy filter) because it is traversed several times below.
			geneH_byChr = [h for h in geneH[chrom] if mygenome.gene(h['geneId'],geneNameH,geneSetH,geneInfoH).geneName in mygenome.TK]

			txnLocusL_combined = []

			for strand in ['+','-']:

				txnLocusL = [mygenome.locus('%s:%s-%s%s' % (chrom,h['txnSta'],h['txnEnd'],strand),h['geneId']) for h in geneH_byChr if h['strand']==strand]

				txnLocusL_combined += mygenome.mergeLoci(txnLocusL)

			txnLocusL_combined.sort(key=by_start_then_end)

			for txnLoc in txnLocusL_combined:

				exnLocusL = []

				for h in geneH_byChr:
					if h['geneId'] in txnLoc.id:
						for (exnSta,exnEnd) in h['exnList']:
							exnLocusL.append(mygenome.locus('%s:%s-%s%s' % (chrom, exnSta, exnEnd, h['strand'])))

				exnLocusL.sort(key=by_start_then_end)

				exnLocusL = mygenome.mergeLoci(exnLocusL)

				exnStaL = [str(exnLoc.chrSta) for exnLoc in exnLocusL]
				exnEndL = [str(exnLoc.chrEnd) for exnLoc in exnLocusL]

				outTextFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (txnLoc.id,txnLoc.chrom,txnLoc.strand,txnLoc.chrSta,txnLoc.chrEnd,len(exnLocusL),','.join(exnStaL),','.join(exnEndL)))

				outFaFile.write('>%s|%s|%s|%s|%s\n' % (txnLoc.id,txnLoc.chrom,txnLoc.strand,txnLoc.chrSta,txnLoc.chrEnd))

				lastIdx = len(exnLocusL)-1

				for i, exnLoc in enumerate(exnLocusL):

					# Work on a copy so the merged exon list itself stays untouched.
					exnLocCopy = copy.deepcopy(exnLoc)

					exnLocCopy.strand = '+'

					# Extend into the upstream gap (not before the first exon).
					if i > 0:
						exnLocCopy.chrSta -= min(intronSize, (exnLoc.chrSta - exnLocusL[i-1].chrEnd)//2)

					# Extend into the downstream gap (not after the last exon).
					if i < lastIdx:
						exnLocCopy.chrEnd += min(intronSize, (exnLocusL[i+1].chrSta - exnLoc.chrEnd)//2)

					outFaFile.write(exnLocCopy.nibFrag())

				outFaFile.write('\n')
Пример #7
0
def gsnap_process_junction(inGsnapFileName,outGsnapFileName,outReportFileName,sampN):
	"""Group GSNAP split-read alignments by junction and write a report.

	Every record read through ``mygsnap.gsnapFile`` must be flagged
	``(transloc)`` and have exactly two segments in its first match.  Records
	are grouped by the pair of breakpoints (ordered donor-to-acceptor
	according to the ``dir:`` field), and groups are reported in descending
	order of their number of distinct (direction, read offset) pairs.

	Args:
		inGsnapFileName: GSNAP result file to read.
		outGsnapFileName: Receives the raw text of every grouped record, in
			report order.
		outReportFileName: Receives one tab-delimited line per junction.
		sampN: Sample name copied into the third report column.

	Raises:
		ValueError: If a record is not flagged ``(transloc)``, does not have
			two segments, or has a direction other than sense/antisense.
	"""

	geneNameH = mygenome.geneNameH()
	geneSetH = mygenome.geneSetH()
	geneInfoH = mygenome.geneInfoH(geneNameH,geneSetH)
	refFlatH = mygenome.loadRefFlatByChr()

	def labelled_genes(annot):
		"""Return (transcript.exon labels, gene-name set) from a ``label_N:`` field."""
		rm = re.search(r'label_[12]:([^,\t]*)', annot)
		if not rm:
			return (), set()
		trans_exonL = rm.group(1).split('|')
		genes = set()
		for te in trans_exonL:
			g = mygenome.gene(te.split('.exon')[0],geneNameH,geneSetH,geneInfoH)
			if g.geneName:
				genes.add(g.geneName)
		return trans_exonL, genes

	def transcript_strand(read_strand, direction):
		"""Combine alignment strand and splice direction into a strand."""
		if (read_strand,direction) in (('+','sense'),('-','antisense')):
			return '+'
		if (read_strand,direction) in (('+','antisense'),('-','sense')):
			return '-'
		raise ValueError('unexpected strand/direction: %r/%r' % (read_strand,direction))

	def describe(geneNameL):
		"""Return (geneInfo list, censusInfo list) for the given gene names."""
		geneInfo = []
		censusInfo = []
		for geneName in geneNameL:
			gene = mygenome.gene(geneName,geneNameH,geneSetH,geneInfoH)
			geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
			censusInfo.append('%s:%s:%s:%s' % (gene.getAttr('census_somatic'),gene.getAttr('census_germline'),gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))
		return geneInfo, censusInfo

	juncHH = {}

	for r in mygsnap.gsnapFile(inGsnapFileName,False):

		match = r.matchL()[0]

		if '(transloc)' not in r.pairRel:
			raise ValueError('record is not flagged (transloc): %r' % (r.pairRel,))

		if len(match.segL) != 2:
			raise ValueError('expected 2 segments, got %d' % len(match.segL))

		seg1, seg2 = match.segL

		splice_type = re.search(r'splice_type:([^,\t]*)', seg1[3]).group(1)
		direction = re.search(r'dir:([^,\t]*)', seg1[3]).group(1)
		offset = int(re.search(r'\.\.([0-9]*)', seg1[1]).group(1))

		trans_exon1, gene1 = labelled_genes(seg1[3])
		# The second side is read from the SECOND segment; reading segment 0
		# twice made side 2 a duplicate of side 1.
		trans_exon2, gene2 = labelled_genes(seg2[3])

		# Side 1 breaks at the END of its range, side 2 at the START.
		bp1 = re.match(r'([+-])([^:]+):[0-9]+\.\.([0-9]+)',seg1[2])
		bp2 = re.match(r'([+-])([^:]+):([0-9]+)\.\.[0-9]+',seg2[2])

		trans_strand1 = transcript_strand(bp1.group(1),direction)
		trans_strand2 = transcript_strand(bp2.group(1),direction)

		# Genes overlapping the breakpoint that the labels did not already name.
		locus1 = mygenome.locus('%s:%s-%s%s' % (bp1.group(2),int(bp1.group(3))-1,bp1.group(3),trans_strand1))
		bp_gene1 = list(set(locus1.overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True)).difference(gene1))

		# NOTE(review): this side subtracts 2 while side 1 subtracts 1 - confirm whether the asymmetry is intended.
		locus2 = mygenome.locus('%s:%s-%s%s' % (bp2.group(2),int(bp2.group(3))-2,bp2.group(3),trans_strand2))
		bp_gene2 = list(set(locus2.overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True)).difference(gene2))

		# Order both sides donor-first; for antisense reads that means
		# swapping side 1 and side 2.
		if direction=='sense':
			key = (bp1.groups()[1:],bp2.groups()[1:])
			trans_exon = (trans_exon1,trans_exon2)
			gene = (list(gene1),list(gene2))
			bp_gene = (bp_gene1,bp_gene2)
		elif direction=='antisense':
			key = (bp2.groups()[1:],bp1.groups()[1:])
			trans_exon = (trans_exon2,trans_exon1)
			gene = (list(gene2),list(gene1))
			bp_gene = (bp_gene2,bp_gene1)
		else:
			raise ValueError('unexpected direction: %r' % (direction,))

		if key in juncHH:
			juncHH[key]['match'].append(r)
			juncHH[key]['seq'].append(r.seq())
			juncHH[key]['reg'].append((direction,offset))
		else:
			# Annotation is taken from the first record seen for a junction.
			juncHH[key] = {'match':[r], 'splice_type':splice_type, 'seq':[r.seq()], 'reg':[(direction,offset)], 'trans_exon':trans_exon, 'gene':gene, 'bp_gene':bp_gene}

	# Most distinct (direction, offset) positions first; ties keep their
	# original relative order (stable sort).
	juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['reg'])), reverse=True)

	with open(outGsnapFileName,'w') as outGsnapFile, open(outReportFileName,'w') as outReportFile:

		for (key, juncH) in juncKH:

			# Same chromosome on both sides -> intra-chromosomal.
			if key[0][0] == key[1][0]:
				junc_type = 'intra'
			else:
				junc_type = 'inter'

			geneInfo1, censusInfo1 = describe(juncH['gene'][0]+juncH['bp_gene'][0])
			geneInfo2, censusInfo2 = describe(juncH['gene'][1]+juncH['bp_gene'][1])

			fields = (
				junc_type, juncH['splice_type'], sampN, ':'.join(key[0]), ':'.join(key[1]),
				','.join(juncH['trans_exon'][0]), ','.join(juncH['trans_exon'][1]),
				'%s;%s' % (','.join(juncH['gene'][0]), ','.join(juncH['bp_gene'][0])),
				'%s;%s' % (','.join(juncH['gene'][1]), ','.join(juncH['bp_gene'][1])),
				';'.join(geneInfo1), ';'.join(geneInfo2), ';'.join(censusInfo1), ';'.join(censusInfo2),
				len(juncH['match']), len(set(juncH['seq'])), len(set(juncH['reg'])))

			# One '%s' per field, tab-separated.
			outReportFile.write('\t'.join(['%s'] * len(fields)) % fields + '\n')

			for m in juncH['match']:
				outGsnapFile.write(m.rawText()+'\n')
Пример #8
0
def exonSkip_proc_annot(inReportFileName,outReportFileName,inCnaGctFileName=None):
	"""Annotate an exon-skipping report with frame, gene and pathway columns.

	Each input line must have eight tab-separated fields:
	sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg.  ``t1`` and ``t2`` are
	comma-separated ``<transcript>.exon<N>/<total>`` labels.  The output
	line repeats the first five fields, then adds frame consistency, gene
	names, copy-number info, gene info, census info, GO, KEGG and Biocarta
	columns, and ends with nmatch, nseq, nreg.

	Args:
		inReportFileName: Path of the report to annotate.
		outReportFileName: Path of the annotated report to write.
		inCnaGctFileName: Optional file passed to ``mygenome.tcgaCnaDB``.
			When given, ``sampN`` must contain a ``TCGA-XX-XXXX`` identifier.
	"""

	geneDB = mygenome.getGeneDB()
	frameInfoH = mygenome.getFrameInfoH()

	if inCnaGctFileName:
		cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
	else:
		cnaDB = None

	with open(inReportFileName) as inReportFile, open(outReportFileName,'w') as outReportFile:

		for line in inReportFile:

			# Strip only the newline so a final line without one keeps the
			# last character of nreg.
			(sampN,bp1,bp2,t1,t2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

			if inCnaGctFileName:
				indivId = re.match(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*',sampN).group(1)

			geneS = set()
			geneH = {}

			for tL in (t1,t2):

				for te in tL.split(','):

					ro = re.match(r'(.*)\.exon([0-9]*)/[0-9]*',te)

					# Skip empty or unparseable labels instead of crashing on None.
					if not ro:
						continue

					t = ro.group(1)
					e = int(ro.group(2))

					# presumably collects the exon numbers seen per transcript - verify against mybasic.addHash
					mybasic.addHash(geneH,t,e)

					g = mygenome.gene(t,geneDB=geneDB)

					if g.geneName:
						geneS.add(g.geneName)

			frameL = []

			for transId in geneH:

				exnList = geneH[transId]

				# Frame consistency is only evaluated for exactly two exons.
				if len(exnList) != 2:
					continue

				cons = mygenome.frameCons(transId,exnList[0], transId,exnList[1],frameInfoH)

				if cons:
					frameL.append('%s:%s' % (transId,cons))

			cnaInfo = []
			geneInfo = []
			censusInfo = []

			goInfoS = set()
			keggInfoS = set()
			biocInfoS = set()

			for geneName in geneS:

				gene = mygenome.gene(geneName,geneDB=geneDB)

				if cnaDB:
					cnaInfo.append('%s:%s' % (geneName,cnaDB.query(indivId,geneName)))

				geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
				censusInfo.append('%s:%s:%s:%s' % (gene.getAttr('census_somatic'),gene.getAttr('census_germline'),gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))

				goInfoS.update(gene.getAttr('go'))
				keggInfoS.update(gene.getAttr('kegg'))
				# NOTE(review): sibling functions read 'biocarta' here - confirm 'bioc' is a valid attribute name.
				biocInfoS.update(gene.getAttr('bioc'))

			fields = (
				sampN, bp1, bp2, t1, t2, ','.join(frameL), ';'.join(geneS), ','.join(cnaInfo), ';'.join(geneInfo), ';'.join(censusInfo),
				';'.join(map(str,goInfoS)), ';'.join(map(str,keggInfoS)), ';'.join(map(str,biocInfoS)), nmatch, nseq, nreg)

			# One '%s' per field, tab-separated.
			outReportFile.write('\t'.join(['%s'] * len(fields)) % fields + '\n')
Пример #9
0
def gsnap_process_junction(inReportFileName,outReportFileName):
	"""Annotate a junction report with gene names and gene/census information.

	Each input line must have nine tab-separated fields:
	spliceType, sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg.  ``t1``/``t2``
	are ';'-separated transcript ids.  ``bp1``/``bp2`` are '|'-separated
	tokens: tokens containing "uc" are looked up as transcripts, and the
	first token containing "chr" on each side decides intra vs inter.

	Args:
		inReportFileName: Path of the report to annotate.
		outReportFileName: Path of the 18-column annotated report to write.
	"""

	geneNameH = mygenome.geneNameH()
	geneSetH = mygenome.geneSetH()
	geneInfoH = mygenome.geneInfoH(geneNameH,geneSetH)

	def gene_names(transcriptL):
		"""Return the set of non-empty gene names for the given transcript ids."""
		names = set()
		for t in transcriptL:
			g = mygenome.gene(t,geneNameH,geneSetH,geneInfoH)
			if g.geneName:
				names.add(g.geneName)
		return names

	def describe(geneNameL):
		"""Return (geneInfo list, censusInfo list) for the given gene names."""
		geneInfo = []
		censusInfo = []
		for geneName in geneNameL:
			gene = mygenome.gene(geneName,geneNameH,geneSetH,geneInfoH)
			geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
			censusInfo.append('%s:%s:%s:%s' % (gene.getAttr('census_somatic'),gene.getAttr('census_germline'),gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))
		return geneInfo, censusInfo

	def first_chrom(bp):
		"""Return the first '|'-separated token of *bp* that contains "chr"."""
		return [x for x in bp.split('|') if "chr" in x][0]

	with open(inReportFileName) as inReportFile, open(outReportFileName,'w') as outReportFile:

		for line in inReportFile:

			# Strip only the newline so a final line without one keeps the
			# last character of nreg.
			(spliceType,sampN,bp1,bp2,t1,t2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

			# An empty transcript field yields no genes.
			gene1 = gene_names(t1.split(';')) if t1 else set()
			gene2 = gene_names(t2.split(';')) if t2 else set()

			bp_gene1 = gene_names([x for x in bp1.split('|') if "uc" in x])
			bp_gene2 = gene_names([x for x in bp2.split('|') if "uc" in x])

			if first_chrom(bp1) == first_chrom(bp2):
				junc_type = 'intra'
			else:
				junc_type = 'inter'

			geneInfo1, censusInfo1 = describe(gene1)
			geneInfo2, censusInfo2 = describe(gene2)

			fields = (
				junc_type, spliceType, sampN, bp1, bp2,
				t1, t2, ';'.join(gene1), ';'.join(gene2), ';'.join(geneInfo1), ';'.join(geneInfo2),
				';'.join(censusInfo1), ';'.join(censusInfo2), ','.join(bp_gene1), ','.join(bp_gene2),
				nmatch, nseq, nreg)

			# One '%s' per field, tab-separated.
			outReportFile.write('\t'.join(['%s'] * len(fields)) % fields + '\n')
Пример #10
0
def exonSkip_proc_annot(inReportFileName,
                        outReportFileName,
                        inCnaGctFileName=None):
    """Annotate an exon-skipping report with frame, gene and pathway columns.

    Each input line must have eight tab-separated fields:
    sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg.  ``t1`` and ``t2`` are
    comma-separated ``<transcript>.exon<N>/<total>`` labels.  The output
    line repeats the first five fields, then adds frame consistency, gene
    names, copy-number info, gene info, census info, GO, KEGG and Biocarta
    columns, and ends with nmatch, nseq, nreg.

    Args:
        inReportFileName: Path of the report to annotate.
        outReportFileName: Path of the annotated report to write.
        inCnaGctFileName: Optional file passed to ``mygenome.tcgaCnaDB``.
            When given, ``sampN`` must contain a ``TCGA-XX-XXXX`` identifier.
    """

    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()

    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
    else:
        cnaDB = None

    with open(inReportFileName) as inReportFile, \
            open(outReportFileName, 'w') as outReportFile:

        for line in inReportFile:

            # Strip only the newline so a final line without one keeps the
            # last character of nreg.
            (sampN, bp1, bp2, t1, t2, nmatch, nseq,
             nreg) = line.rstrip('\n').split('\t')

            if inCnaGctFileName:
                indivId = re.match(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*',
                                   sampN).group(1)

            geneS = set()
            geneH = {}

            for tL in (t1, t2):

                for te in tL.split(','):

                    ro = re.match(r'(.*)\.exon([0-9]*)/[0-9]*', te)

                    # Skip empty or unparseable labels instead of crashing
                    # on None.
                    if not ro:
                        continue

                    t = ro.group(1)
                    e = int(ro.group(2))

                    # presumably collects the exon numbers seen per
                    # transcript - verify against mybasic.addHash
                    mybasic.addHash(geneH, t, e)

                    g = mygenome.gene(t, geneDB=geneDB)

                    if g.geneName:
                        geneS.add(g.geneName)

            frameL = []

            for transId in geneH:

                exnList = geneH[transId]

                # Frame consistency is only evaluated for exactly two exons.
                if len(exnList) != 2:
                    continue

                cons = mygenome.frameCons(transId, exnList[0], transId,
                                          exnList[1], frameInfoH)

                if cons:
                    frameL.append('%s:%s' % (transId, cons))

            cnaInfo = []
            geneInfo = []
            censusInfo = []

            goInfoS = set()
            keggInfoS = set()
            biocInfoS = set()

            for geneName in geneS:

                gene = mygenome.gene(geneName, geneDB=geneDB)

                if cnaDB:
                    cnaInfo.append('%s:%s' %
                                   (geneName, cnaDB.query(indivId, geneName)))

                geneInfo.append(
                    '%s:%s:%s' %
                    (geneName, gene.getAttr('desc'), gene.getAttr('summary')))
                censusInfo.append('%s:%s:%s:%s' %
                                  (gene.getAttr('census_somatic'),
                                   gene.getAttr('census_germline'),
                                   gene.getAttr('census_mutType'),
                                   gene.getAttr('census_translocPartners')))

                goInfoS.update(gene.getAttr('go'))
                keggInfoS.update(gene.getAttr('kegg'))
                # NOTE(review): sibling functions read 'biocarta' here -
                # confirm 'bioc' is a valid attribute name.
                biocInfoS.update(gene.getAttr('bioc'))

            fields = (sampN, bp1, bp2, t1, t2, ','.join(frameL),
                      ';'.join(geneS), ','.join(cnaInfo), ';'.join(geneInfo),
                      ';'.join(censusInfo), ';'.join(map(str, goInfoS)),
                      ';'.join(map(str, keggInfoS)),
                      ';'.join(map(str, biocInfoS)), nmatch, nseq, nreg)

            # One '%s' per field, tab-separated.
            outReportFile.write('\t'.join(['%s'] * len(fields)) % fields +
                                '\n')
Пример #11
0
def fusion_proc_annot(inReportFileName,outReportFileName,inCnaGctFileName=None):
	"""Annotate a fusion report with frame, copy-number, gene and pathway columns.

	Each input line must have nine tab-separated fields:
	splice_type, sampN, bp1, bp2, teStr1, teStr2, nmatch, nseq, nreg.
	``bp1``/``bp2`` start with ``<strand>chr<name>:<position>``.
	``teStr1``/``teStr2`` are comma-separated
	``<transcript>.exon<N>/<total>`` labels; labels that do not match that
	shape are ignored.  For each side, genes named by the labels are
	combined with genes overlapping the breakpoint.  The event is 'intra'
	when both breakpoints are on the same chromosome, otherwise 'inter'.

	Args:
		inReportFileName: Path of the report to annotate.
		outReportFileName: Path of the 25-column annotated report to write.
		inCnaGctFileName: Optional file passed to ``mygenome.tcgaCnaDB``.
			When given, ``sampN`` must contain a ``TCGA-XX-XXXX`` identifier.
	"""

	geneDB = mygenome.getGeneDB()
	frameInfoH = mygenome.getFrameInfoH()
	refFlatH = mygenome.loadRefFlatByChr()

	if inCnaGctFileName:
		cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
	else:
		cnaDB = None

	with open(inReportFileName) as inReportFile, open(outReportFileName,'w') as outReportFile:

		for line in inReportFile:

			# Strip only the newline so a final line without one keeps the
			# last character of nreg.
			(splice_type,sampN,bp1,bp2,teStr1,teStr2,nmatch,nseq,nreg) = line.rstrip('\n').split('\t')

			if inCnaGctFileName:
				indivId = re.match(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*',sampN).group(1)

			geneStatL = []

			for (bp,teStr) in ((bp1,teStr1),(bp2,teStr2)):

				geneS = set()
				teL = []

				for te in teStr.split(','):

					rm = re.match(r'(.*)\.exon([0-9]*)/[0-9]*',te)

					if rm:

						t = rm.group(1)
						e = int(rm.group(2))

						g = mygenome.gene(t,geneDB=geneDB)

						if g.geneName:
							geneS.add(g.geneName)

						teL.append((t,e))

				rm = re.match(r'([+-])(chr[^:]*):([0-9]*)',bp)

				# Chromosome of THIS side; taking it from bp1 on both passes
				# made every event look intra-chromosomal.
				chrom = rm.group(2)

				# Genes overlapping the breakpoint that the labels did not already name.
				bp_geneS = set(mygenome.locus('%s:%s-%s%s' % (rm.group(2),int(rm.group(3))-1,rm.group(3),rm.group(1))).overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True))
				bp_geneS = bp_geneS.difference(geneS)

				cnaInfo = []
				geneInfo = []
				censusInfo = []

				goInfoS = set()
				keggInfoS = set()
				biocartaInfoS = set()

				for geneName in list(geneS) + list(bp_geneS):

					gene = mygenome.gene(geneName,geneDB=geneDB)

					if cnaDB:
						cnaInfo.append('%s:%s' % (geneName,cnaDB.query(indivId,geneName)))

					geneInfo.append('%s:%s:%s' % (geneName,gene.getAttr('desc'),gene.getAttr('summary')))
					censusInfo.append('%s:%s:%s:%s' % (gene.getAttr('census_somatic'),gene.getAttr('census_germline'),gene.getAttr('census_mutType'),gene.getAttr('census_translocPartners')))
					goInfoS.update(gene.getAttr('go'))
					keggInfoS.update(gene.getAttr('kegg'))
					biocartaInfoS.update(gene.getAttr('biocarta'))

				geneStatL.append((chrom,teL,geneS,bp_geneS,cnaInfo,geneInfo,censusInfo,goInfoS,keggInfoS,biocartaInfoS))

			(chrom1,teL1,geneS1,bp_geneS1,cnaInfo1,geneInfo1,censusInfo1,goInfoS1,keggInfoS1,biocartaInfoS1) = geneStatL[0]
			(chrom2,teL2,geneS2,bp_geneS2,cnaInfo2,geneInfo2,censusInfo2,goInfoS2,keggInfoS2,biocartaInfoS2) = geneStatL[1]

			if chrom1 == chrom2:
				fusion_type = 'intra'
			else:
				fusion_type = 'inter'

			# Keep only transcript/exon pairings reported as in-frame ('Y').
			frameL = []

			for (t1,e1) in teL1:

				for (t2,e2) in teL2:

					cons = mygenome.frameCons(t1,e1, t2,e2, frameInfoH)

					if cons=='Y':
						frameL.append('%s.%s-%s.%s:%s' % (t1,e1,t2,e2,cons))

			fields = (
				sampN, splice_type, fusion_type, bp1, bp2, teStr1, teStr2, ','.join(frameL), ','.join(cnaInfo1), ','.join(cnaInfo2),
				'%s;%s' % (','.join(geneS1),','.join(bp_geneS1)), ';'.join(geneInfo1), ';'.join(censusInfo1),
				';'.join(map(str,goInfoS1)), ';'.join(map(str,keggInfoS1)), ';'.join(map(str,biocartaInfoS1)),
				'%s;%s' % (','.join(geneS2),','.join(bp_geneS2)), ';'.join(geneInfo2), ';'.join(censusInfo2),
				';'.join(map(str,goInfoS2)), ';'.join(map(str,keggInfoS2)), ';'.join(map(str,biocartaInfoS2)),
				nmatch, nseq, nreg)

			# One '%s' per field, tab-separated.
			outReportFile.write('\t'.join(['%s'] * len(fields)) % fields + '\n')
Пример #12
0
def _gsnap_gene_names(transcriptL, geneNameH, geneSetH, geneInfoH):
    """Return the set of gene names resolved from an iterable of transcript IDs.

    Every ID is looked up through ``mygenome.gene``; IDs whose resulting
    ``geneName`` is empty/falsy are dropped.
    """
    geneS = set()

    for t in transcriptL:
        g = mygenome.gene(t, geneNameH, geneSetH, geneInfoH)
        if g.geneName:
            geneS.add(g.geneName)

    return geneS


def _gsnap_gene_annot(geneNames, geneNameH, geneSetH, geneInfoH):
    """Return ``(geneInfo, censusInfo)`` string lists for the given gene names.

    ``geneInfo`` entries are ``name:desc:summary``; ``censusInfo`` entries are
    ``census_somatic:census_germline:census_mutType:census_translocPartners``.
    Both lists follow the iteration order of ``geneNames``.
    """
    geneInfo = []
    censusInfo = []

    for geneName in geneNames:
        gene = mygenome.gene(geneName, geneNameH, geneSetH, geneInfoH)
        geneInfo.append(
            '%s:%s:%s' %
            (geneName, gene.getAttr('desc'), gene.getAttr('summary')))
        censusInfo.append('%s:%s:%s:%s' %
                          (gene.getAttr('census_somatic'),
                           gene.getAttr('census_germline'),
                           gene.getAttr('census_mutType'),
                           gene.getAttr('census_translocPartners')))

    return geneInfo, censusInfo


def gsnap_process_junction(inReportFileName, outReportFileName):
    """Annotate a tab-separated junction report with gene-level information.

    Every input line must contain exactly nine tab-separated fields::

        spliceType, sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg

    ``t1``/``t2`` are ';'-separated transcript IDs (may be empty) and
    ``bp1``/``bp2`` are '|'-separated breakpoint descriptors.

    For each line, one 18-column tab-separated line is written::

        type, spliceType, sampN, bp1, bp2, t1, t2,
        genes1, genes2, geneInfo1, geneInfo2, censusInfo1, censusInfo2,
        bp_genes1, bp_genes2, nmatch, nseq, nreg

    where ``type`` is 'intra' when the first "chr"-containing token of
    ``bp1`` equals that of ``bp2`` and 'inter' otherwise.

    Args:
        inReportFileName: path of the report to read.
        outReportFileName: path of the annotated report to write
            (created/truncated).

    Raises:
        ValueError: if a line does not have exactly nine fields.
        IndexError: if ``bp1`` or ``bp2`` has no token containing "chr".
    """
    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH, geneSetH)
    lookupArgs = (geneNameH, geneSetH, geneInfoH)

    # Context managers guarantee both files are closed (and the output is
    # flushed) even when a malformed line raises part-way through.
    with open(outReportFileName, 'w') as outReportFile:
        with open(inReportFileName) as inReportFile:

            for line in inReportFile:

                # rstrip('\n') rather than line[:-1]: a final line with no
                # trailing newline must not lose the last character of nreg.
                (spliceType, sampN, bp1, bp2, t1, t2, nmatch, nseq,
                 nreg) = line.rstrip('\n').split('\t')

                # An empty transcript field yields no genes; it is not
                # looked up as an empty transcript ID.
                gene1 = _gsnap_gene_names(
                    t1.split(';') if t1 else [], *lookupArgs)
                gene2 = _gsnap_gene_names(
                    t2.split(';') if t2 else [], *lookupArgs)

                bpTokL1 = bp1.split('|')
                bpTokL2 = bp2.split('|')

                # Breakpoint tokens containing "uc" are resolved as
                # transcript IDs (presumably UCSC known-gene IDs -- confirm).
                bp_gene1 = _gsnap_gene_names(
                    [x for x in bpTokL1 if 'uc' in x], *lookupArgs)
                bp_gene2 = _gsnap_gene_names(
                    [x for x in bpTokL2 if 'uc' in x], *lookupArgs)

                chrom1 = [x for x in bpTokL1 if 'chr' in x][0]
                chrom2 = [x for x in bpTokL2 if 'chr' in x][0]

                if chrom1 == chrom2:
                    junctionType = 'intra'
                else:
                    junctionType = 'inter'

                geneInfo1, censusInfo1 = _gsnap_gene_annot(gene1, *lookupArgs)
                geneInfo2, censusInfo2 = _gsnap_gene_annot(gene2, *lookupArgs)

                columnL = [
                    junctionType, spliceType, sampN, bp1, bp2,
                    t1, t2,
                    ';'.join(gene1), ';'.join(gene2),
                    ';'.join(geneInfo1), ';'.join(geneInfo2),
                    ';'.join(censusInfo1), ';'.join(censusInfo2),
                    ','.join(bp_gene1), ','.join(bp_gene2),
                    nmatch, nseq, nreg,
                ]

                outReportFile.write('\t'.join(columnL) + '\n')
Пример #13
0
def _fusion_side_annot(bp, teStr, geneDB, refFlatH, cnaDB, indivId):
    """Collect the annotation for one side of a fusion junction.

    Args:
        bp: breakpoint string; must match ``([+-])(chr[^:]*):([0-9]*)``,
            i.e. strand, chromosome and position.
        teStr: comma-separated entries; only those matching
            ``<transcript>.exon<N>/<M>`` contribute a (transcript, exon) pair.
        geneDB: gene database passed through to ``mygenome.gene``.
        refFlatH: refFlat table passed to ``overlappingGeneL``.
        cnaDB: copy-number DB, or a falsy value to skip CNA look-ups.
        indivId: individual ID used for ``cnaDB.query``; unused without cnaDB.

    Returns:
        Tuple ``(chrom, teL, geneS, bp_geneS, cnaInfo, geneInfo, censusInfo,
        goInfoS, keggInfoS, biocartaInfoS)``.

    Raises:
        AttributeError: if ``bp`` does not match the expected pattern.
    """
    geneS = set()
    teL = []

    for te in teStr.split(','):

        # Raw string: '\.' in a plain literal is an invalid escape sequence.
        rm = re.match(r'(.*)\.exon([0-9]*)/[0-9]*', te)

        if rm:

            t = rm.group(1)
            e = int(rm.group(2))

            g = mygenome.gene(t, geneDB=geneDB)

            if g.geneName:
                geneS.add(g.geneName)

            teL.append((t, e))

    rm = re.match(r'([+-])(chr[^:]*):([0-9]*)', bp)
    strand, chrom, pos = rm.group(1), rm.group(2), rm.group(3)

    # Genes overlapping the single position (pos-1, pos] on the same strand,
    # excluding those already found through the transcript/exon list.
    locus = mygenome.locus('%s:%s-%s%s' % (chrom, int(pos) - 1, pos, strand))
    bp_geneS = set(
        locus.overlappingGeneL(refFlatH=refFlatH, strand_sensitive=True))
    bp_geneS = bp_geneS.difference(geneS)

    cnaInfo = []
    geneInfo = []
    censusInfo = []

    goInfoS = set()
    keggInfoS = set()
    biocartaInfoS = set()

    for geneName in list(geneS) + list(bp_geneS):

        gene = mygenome.gene(geneName, geneDB=geneDB)

        if cnaDB:
            cnaInfo.append('%s:%s' %
                           (geneName, cnaDB.query(indivId, geneName)))

        geneInfo.append(
            '%s:%s:%s' %
            (geneName, gene.getAttr('desc'), gene.getAttr('summary')))
        censusInfo.append('%s:%s:%s:%s' %
                          (gene.getAttr('census_somatic'),
                           gene.getAttr('census_germline'),
                           gene.getAttr('census_mutType'),
                           gene.getAttr('census_translocPartners')))
        goInfoS.update(gene.getAttr('go'))
        keggInfoS.update(gene.getAttr('kegg'))
        biocartaInfoS.update(gene.getAttr('biocarta'))

    return (chrom, teL, geneS, bp_geneS, cnaInfo, geneInfo, censusInfo,
            goInfoS, keggInfoS, biocartaInfoS)


def fusion_proc_annot(inReportFileName,
                      outReportFileName,
                      inCnaGctFileName=None):
    """Annotate a tab-separated fusion report with gene, frame and CNA info.

    Every input line must contain exactly nine tab-separated fields::

        splice_type, sampN, bp1, bp2, teStr1, teStr2, nmatch, nseq, nreg

    For each line, one 25-column tab-separated line is written::

        sampN, splice_type, type, bp1, bp2, teStr1, teStr2, frame, cna1, cna2,
        genes1, geneInfo1, censusInfo1, GO1, KEGG1, Biocarta1,
        genes2, geneInfo2, censusInfo2, GO2, KEGG2, Biocarta2,
        nmatch, nseq, nreg

    ``type`` is 'intra' when both breakpoints are on the same chromosome and
    'inter' otherwise.  ``frame`` lists the transcript/exon pairings for which
    ``mygenome.frameCons`` returns 'Y'.

    Args:
        inReportFileName: path of the report to read.
        outReportFileName: path of the annotated report to write
            (created/truncated).
        inCnaGctFileName: optional GCT file loaded via ``mygenome.tcgaCnaDB``;
            when given, ``sampN`` must contain an ID of the form
            ``TCGA-NN-NNNN``, which is used to query per-gene CNA values.

    Raises:
        ValueError: if a line does not have exactly nine fields.
        AttributeError: if a breakpoint does not match the expected pattern,
            or a CNA file is given and ``sampN`` holds no TCGA ID.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()
    refFlatH = mygenome.loadRefFlatByChr()

    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
    else:
        cnaDB = None

    # Context managers guarantee both files are closed (and the output is
    # flushed) even when a malformed line raises part-way through.
    with open(outReportFileName, 'w') as outReportFile:
        with open(inReportFileName) as inReportFile:

            for line in inReportFile:

                # rstrip('\n') rather than line[:-1]: a final line with no
                # trailing newline must not lose the last character of nreg.
                (splice_type, sampN, bp1, bp2, teStr1, teStr2, nmatch, nseq,
                 nreg) = line.rstrip('\n').split('\t')

                indivId = None
                if inCnaGctFileName:
                    indivId = re.match('.*(TCGA-[0-9]{2}-[0-9]{4}).*',
                                       sampN).group(1)

                # Each side carries its OWN chromosome.  Previously the
                # chromosome of bp1 was recorded for both sides, so every
                # junction was reported as 'intra'.
                (chrom1, teL1, geneS1, bp_geneS1, cnaInfo1, geneInfo1,
                 censusInfo1, goInfoS1, keggInfoS1,
                 biocartaInfoS1) = _fusion_side_annot(
                     bp1, teStr1, geneDB, refFlatH, cnaDB, indivId)
                (chrom2, teL2, geneS2, bp_geneS2, cnaInfo2, geneInfo2,
                 censusInfo2, goInfoS2, keggInfoS2,
                 biocartaInfoS2) = _fusion_side_annot(
                     bp2, teStr2, geneDB, refFlatH, cnaDB, indivId)

                if chrom1 == chrom2:
                    fusionType = 'intra'
                else:
                    fusionType = 'inter'

                frameL = []

                for (t1, e1) in teL1:

                    for (t2, e2) in teL2:

                        cons = mygenome.frameCons(t1, e1, t2, e2, frameInfoH)

                        if cons == 'Y':
                            frameL.append('%s.%s-%s.%s:%s' %
                                          (t1, e1, t2, e2, cons))

                columnL = [
                    sampN, splice_type, fusionType, bp1, bp2,
                    teStr1, teStr2,
                    ','.join(frameL),
                    ','.join(cnaInfo1), ','.join(cnaInfo2),
                    '%s;%s' % (','.join(geneS1), ','.join(bp_geneS1)),
                    ';'.join(geneInfo1), ';'.join(censusInfo1),
                    ';'.join(map(str, goInfoS1)),
                    ';'.join(map(str, keggInfoS1)),
                    ';'.join(map(str, biocartaInfoS1)),
                    '%s;%s' % (','.join(geneS2), ','.join(bp_geneS2)),
                    ';'.join(geneInfo2), ';'.join(censusInfo2),
                    ';'.join(map(str, goInfoS2)),
                    ';'.join(map(str, keggInfoS2)),
                    ';'.join(map(str, biocartaInfoS2)),
                    nmatch, nseq, nreg,
                ]

                outReportFile.write('\t'.join(columnL) + '\n')