예제 #1
0
def parse(exon1,exon2):
    """Summarise the exon pairs shared by two exon-annotation lists.

    Both arguments are comma-separated tokens of the form
    ``<transcript>.exon<number>/<total>``.  A transcript (keyed together with
    its exon total) contributes an exon-number pair when it occurs in
    ``exon1`` and exactly once in ``exon2``.

    Returns a comma-separated string holding one ``<first+1>-<second-1>``
    entry per distinct exon-number pair, ordered by the size of the value
    collected for that pair (largest first).  Returns ``''`` when the two
    lists share no transcript.

    Raises AttributeError when a token does not have the expected form.
    """
    token_re = re.compile(r'(.+)\.exon(.+)\/(.+)')

    # (transcript, exon total) -> [exon number in exon1, exon number in exon2]
    exon_nums_by_trans = {}

    for token in exon1.split(','):
        trans, exon_num, exon_tot = token_re.match(token).groups()
        exon_nums_by_trans[(trans, int(exon_tot))] = [int(exon_num)]

    for token in exon2.split(','):
        trans, exon_num, exon_tot = token_re.match(token).groups()
        key = (trans, int(exon_tot))
        if key in exon_nums_by_trans:
            exon_nums_by_trans[key].append(int(exon_num))

    # exon-number pair -> transcripts supporting it
    # (mybasic.addHash presumably appends to a list stored under the key
    # -- confirm against mybasic).
    trans_by_pair = {}
    for key, exon_nums in exon_nums_by_trans.items():
        if len(exon_nums) == 2:
            mybasic.addHash(trans_by_pair, tuple(exon_nums), key)

    # key/reverse replaces the cmp-function sort; ties keep their original
    # relative order exactly as the stable cmp sort did.
    ranked = sorted(trans_by_pair.items(),
                    key=lambda item: len(item[1]), reverse=True)

    return ','.join(['%s-%s' % (first + 1, second - 1)
                     for ((first, second), _) in ranked])
예제 #2
0
def geneInfoH(geneNameH, geneSetH,
              refSeqSummaryFileName='/Z/Sequence/ucsc_hg19/annot/refSeqSummary.txt',
              hugoFileName='/Z/Sequence/geneinfo/hugo.txt',
              censusFileName='/Z/Sequence/geneinfo/cancer_gene_census.txt',
              biocartaFileName='/Z/Sequence/geneinfo/BIOCARTA.gmt',
              goFileName='/Z/Sequence/geneinfo/hugo.txt',
              keggFileName='/Z/Sequence/geneinfo/hugo.txt'):
    """Build a per-gene annotation dictionary from several tab-separated files.

    Parameters:
        geneNameH: mapping refSeq id -> gene name; only refSeq ids present
            here contribute a ``summary``.
        geneSetH: mapping gene-set database name ->
            {gene-set name: (description, [gene names])}.
        refSeqSummaryFileName: 3 columns (refSeq id, status, summary).
        hugoFileName: 5 columns (gene name, description, aliases,
            gene-card name, refSeq ids).
        censusFileName: at least 14 columns; columns 0, 1, 7, 8, 12 and 13
            are read and the row whose first column is ``Symbol`` is skipped.
        biocartaFileName, goFileName, keggFileName: accepted for interface
            compatibility; not read by this function.

    Returns:
        dict gene name -> dict with any of the keys ``summary``, ``desc``,
        ``aliases``, ``refSeqIds``, ``census_*`` plus one key per gene-set
        database.

    Raises:
        ValueError / IndexError when a line has an unexpected column count.
    """
    infoH = {}

    with open(refSeqSummaryFileName) as refSeqFile:
        for line in refSeqFile:
            # rstrip('\n') rather than [:-1]: a final line without a newline
            # must not lose its last character.
            (refSeqId, status, summary) = line.rstrip('\n').split('\t')

            if refSeqId in geneNameH:
                infoH.setdefault(geneNameH[refSeqId], {})['summary'] = summary

    with open(hugoFileName) as hugoFile:
        for line in hugoFile:
            (geneName, desc, aliases, geneCardName, refSeqIds) = \
                line.rstrip('\n').split('\t')

            entry = infoH.setdefault(geneName, {})
            entry['desc'] = desc
            entry['aliases'] = aliases
            entry['refSeqIds'] = refSeqIds

    with open(censusFileName) as censusFile:
        for line in censusFile:
            tokL = line.rstrip('\n').split('\t')

            (geneName, desc, somatic, germline, mutType, translocPartners) = \
                (tokL[0], tokL[1], tokL[7], tokL[8], tokL[12], tokL[13])

            if geneName == 'Symbol':  # header row
                continue

            # The census description is only used for genes not seen before.
            entry = infoH.setdefault(geneName, {'desc': desc})
            entry['census_somatic'] = somatic
            entry['census_germline'] = germline
            entry['census_mutType'] = mutType
            entry['census_translocPartners'] = translocPartners

    for geneSetDB in geneSetH:
        for (geneSetName, (geneSetDesc, geneNameL)) in geneSetH[geneSetDB].items():
            for geneName in geneNameL:
                if geneName in infoH:
                    mybasic.addHash(infoH[geneName], geneSetDB,
                                    (geneSetName, geneSetDesc))
                else:
                    infoH[geneName] = {geneSetDB: [(geneSetName, geneSetDesc)]}

    return infoH
def main(dirPath):
    """Print, per TCGA individual, the sequencing types found among BAM files.

    File names in ``dirPath`` are kept when they match the coarse pattern
    ``TCGA-..-....-0`` ... ``.bam`` (the original comment labels these
    "normal sample, bam").  From each kept name the individual id
    (``TCGA-xx-xxxx``) and a ``D``/``W`` code are extracted; ``-SD`` or
    ``-GA`` is appended when the name contains ``SOLiD`` or ``IlluminaGA``.

    One line ``<individual>\\tXSeq_<sorted,unique,types>`` is written to
    stdout per individual.  Names that pass the coarse pattern but do not
    carry the detailed barcode are skipped instead of raising
    AttributeError on a failed match.
    """
    coarse_re = re.compile(r'.*TCGA-..-....-0..*\.bam')
    detail_re = re.compile(r'.*(TCGA-..-....)-...-..([DW]).*\.bam')

    typesBySample = {}

    for fileName in os.listdir(dirPath):

        if not coarse_re.match(fileName):
            continue

        rm = detail_re.match(fileName)
        if rm is None:
            # Coarse match only: nothing to extract from this name.
            continue

        sN = rm.group(1)
        seqType = rm.group(2)

        if 'SOLiD' in rm.group(0):
            seqType += '-SD'
        elif 'IlluminaGA' in rm.group(0):
            seqType += '-GA'

        # mybasic.addHash presumably appends seqType to a list under sN.
        mybasic.addHash(typesBySample, sN, seqType)

    for (sN, typeL) in typesBySample.items():
        sys.stdout.write('%s\tXSeq_%s\n' % (sN, ','.join(sorted(set(typeL)))))
예제 #4
0
def parse(exon1, exon2):
    """Report exon-number pairs for transcripts present in both exon lists.

    ``exon1`` and ``exon2`` are comma-separated tokens shaped like
    ``<transcript>.exon<number>/<total>``.  For every (transcript, total)
    seen in ``exon1`` and exactly once in ``exon2`` the two exon numbers
    form a pair.

    Returns the pairs as ``<first+1>-<second-1>`` joined by commas, ordered
    by how much was collected per pair (largest first); ``''`` if there is
    no common transcript.

    Raises AttributeError for a token that does not match the pattern.
    """
    pattern = re.compile(r'(.+)\.exon(.+)\/(.+)')

    def split_token(token):
        # -> ((transcript, exon total), exon number as a still-unparsed string)
        trans, exon_num, exon_tot = pattern.match(token).groups()
        return (trans, int(exon_tot)), exon_num

    h = {}

    for token in exon1.split(','):
        key, exon_num = split_token(token)
        h[key] = [int(exon_num)]

    for token in exon2.split(','):
        key, exon_num = split_token(token)
        if key in h:
            h[key].append(int(exon_num))

    # pair of exon numbers -> supporting transcripts (addHash is assumed to
    # append to a list stored under the key -- confirm against mybasic).
    h2 = {}
    for key in h:
        if len(h[key]) == 2:
            mybasic.addHash(h2, tuple(h[key]), key)

    # Stable descending sort by support size; replaces the cmp-based sort.
    h2_items = sorted(h2.items(), key=lambda kv: len(kv[1]), reverse=True)

    return ','.join(['%s-%s' % (eS + 1, eE - 1) for ((eS, eE), _) in h2_items])
예제 #5
0
def loadKgByChr(dataFileName='/Z/Sequence/ucsc_hg19/annot/knownGene.txt',h=None):
    """Load knownGene records grouped by chromosome.

    Every line of ``dataFileName`` is parsed with ``processKgLine`` and the
    resulting record is added to ``h`` under ``record['chrom']``.

    Parameters:
        dataFileName: path of the knownGene table.
        h: optional dictionary to extend.  A fresh dictionary is created when
           omitted, so results no longer accumulate in one shared default
           across calls.

    Returns:
        The dictionary that was filled (``h`` itself when one was passed).
    """
    if h is None:
        h = {}

    with open(dataFileName) as dataFile:
        for line in dataFile:
            r = processKgLine(line)
            mybasic.addHash(h, r['chrom'], r)

    return h
예제 #6
0
def loadLincByChr(dataFileName='/Z/Sequence/ucsc_hg19/annot/lincRNAsTranscripts.txt',h=None):
    """Load lincRNA transcript records grouped by chromosome.

    Every line of ``dataFileName`` is parsed with ``processLincLine`` and the
    record is added to ``h`` under ``record['chrom']``.

    Parameters:
        dataFileName: path of the lincRNA transcript table.
        h: optional dictionary to extend; a new one is created per call when
           omitted (the former ``{}`` default was shared between calls).

    Returns:
        The filled dictionary (``h`` itself when one was supplied).
    """
    if h is None:
        h = {}

    with open(dataFileName) as dataFile:
        for line in dataFile:
            r = processLincLine(line)
            mybasic.addHash(h, r['chrom'], r)

    return h
예제 #7
0
def loadRefFlatByGene(refFlatFileName):
    """Load refFlat records grouped by gene name.

    Each line of ``refFlatFileName`` is parsed with ``processRefFlatLine``
    and added to the result under ``record['geneName']``.  The file is
    closed as soon as reading finishes.

    Returns:
        dict built through ``mybasic.addHash`` (presumably gene name ->
        list of records -- confirm against mybasic).
    """
    h = {}

    with open(refFlatFileName) as refFlatFile:
        for line in refFlatFile:
            r = processRefFlatLine(line)
            mybasic.addHash(h, r['geneName'], r)

    return h
예제 #8
0
def process_bp(inGsnapFileName):

    result = mygsnap.gsnapFile(inGsnapFileName, False)
    #outBpFile = open(outBpFileName, 'w')

    seqH = {}

    for r in result:

        match = r.matchL()[0]

        if not '(transloc)' in r.pairRel:
            raise Exception

        if len(match.segL) != 2:
            raise Exception

        s1 = match.segL[0][2]
        s2 = match.segL[1][2]

        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)

        bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
        bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

        #		if bp1[0] == bp2[0]:
        #			continue

        if direction == 'sense':
            seq = r.seq()
            offset = int(match.segL[0][1].split('..')[1])
            bp12 = (bp1, bp2)
        else:
            seq = mybasic.rc(r.seq(), 'DNA')
            offset = len(seq) - int(match.segL[0][1].split('..')[1])
            bp12 = (bp2, bp1)

        mybasic.addHash(seqH, bp12, (offset, seq))

    seqL = seqH.items()
    seqL.sort(lambda x, y: cmp(len(y[1]), len(x[1])))

    for ((bp1, bp2), vL) in seqL:

        vL.sort(lambda x, y: cmp(y[0], x[0]))

        maxOffset = vL[0][0]

        print '\n', bp1, bp2, len(vL), '\n'

        for (offset, seq) in vL:

            print '%s%s %s' % (' ' * (maxOffset - offset), seq[:offset],
                               seq[offset:])
def process_bp(inGsnapFileName, outBpFileName):
    """Write one line per junction with the reads that span it.

    Every gsnap record must be a ``(transloc)`` pair whose first match has
    exactly two segments on the same strand; otherwise ``Exception`` is
    raised.  Minus-strand reads are reverse-complemented and their junction
    ends are swapped.  For each junction the output line is::

        <chrom>:<pos-blockSize>-<pos>,<chrom>:<pos>-<pos+blockSize>,<off:seq|...>

    The output file is closed when the function returns or raises.

    NOTE(review): ``blockSize`` is not defined in this function and must
    exist at module level.  Both regions on the output line are derived from
    the first junction end (``j1``); ``j2`` is unused -- confirm intended.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)

    with open(outBpFileName, 'w') as outBpFile:

        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('expected a (transloc) pair relation')

            if len(match.segL) != 2:
                raise Exception('expected exactly two alignment segments')

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            # The first character of each segment locus is its strand sign.
            if s1[0] != s2[0]:
                raise Exception('segments are on different strands')

            strand = s1[0]

            s1T = re.match(r'[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match(r'[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            # upper bound of the first segment's "a..b" range
            splitPos = int(match.segL[0][1].split('..')[1])

            if strand == '+':
                seq = r.seq()
                offset = splitPos
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - splitPos
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((j1, j2), vL) in seqH.items():

            vL.sort(key=lambda v: v[0])  # ascending split offset

            vL_mod = ['%s:%s' % (blockSize - offset + 1, seq)
                      for (offset, seq) in vL]

            chrom = j1[0].split('_')[0]

            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' %
                            (chrom, int(j1[1]) - blockSize, j1[1],
                             chrom, j1[1], int(j1[1]) + blockSize,
                             '|'.join(vL_mod)))
예제 #10
0
def loadRefFlatByChr(refFlatFileName='/data1/Sequence/ucsc_hg19/annot/refFlat_splice_EGFR.txt'):
    """Load refFlat records grouped by chromosome.

    Each line of ``refFlatFileName`` is parsed with
    ``mygenome.processRefFlatLine`` and added to the result under
    ``record['chrom']``.  The file is closed once it has been read.

    Returns:
        dict built through ``mybasic.addHash`` (presumably chromosome ->
        list of records -- confirm against mybasic).
    """
    h = {}

    with open(refFlatFileName) as refFlatFile:
        for line in refFlatFile:
            r = mygenome.processRefFlatLine(line)
            mybasic.addHash(h, r['chrom'], r)

    return h
예제 #11
0
def process_bp(inGsnapFileName,outBpFileName):
    """Write one line per junction listing the reads that span it.

    Every gsnap record must be a ``(transloc)`` pair whose first match has
    exactly two segments on the same strand; otherwise ``Exception`` is
    raised.  Minus-strand reads are reverse-complemented and their junction
    ends are swapped.  Output line format::

        <chrom1>:<pos1>,<chrom2>:<pos2>,<offset:seq|offset:seq|...>

    with the reads ordered by decreasing split offset.  The output file is
    closed when the function returns or raises.

    TODO: the original draft contained unfinished placeholders (k1T/k2T,
    k1_pos/k2_pos, k1_seq/k2_seq) that made the function unparsable; they
    did not feed into the output and were removed.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)

    with open(outBpFileName, 'w') as outBpFile:

        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('expected a (transloc) pair relation')

            if len(match.segL) != 2:
                raise Exception('expected exactly two alignment segments')

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            # The first character of each segment locus is its strand sign.
            if s1[0] != s2[0]:
                raise Exception('segments are on different strands')

            strand = s1[0]

            s1T = re.match(r'[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match(r'[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            # upper bound of the first segment's "a..b" range
            splitPos = int(match.segL[0][1].split('..')[1])

            if strand == '+':
                seq = r.seq()
                offset = splitPos
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - splitPos
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((k1, k2), v) in seqH.items():

            v.sort(key=lambda item: item[0], reverse=True)

            reads = '|'.join(['%s:%s' % (offset, seq) for (offset, seq) in v])

            outBpFile.write('%s,%s,%s\n' % (':'.join(k1), ':'.join(k2), reads))
예제 #12
0
def process_bp(inGsnapFileName):

	result = mygsnap.gsnapFile(inGsnapFileName,False)
	#outBpFile = open(outBpFileName, 'w')

	seqH = {}

	for r in result:

		match = r.matchL()[0]

		if not '(transloc)' in r.pairRel:
			raise Exception

		if len(match.segL) != 2:
			raise Exception

		s1 = match.segL[0][2]
		s2 = match.segL[1][2]

		direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)

		bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)',s1).groups()
		bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+',s2).groups()

#		if bp1[0] == bp2[0]:
#			continue

		if direction == 'sense':
			seq = r.seq()
			offset = int(match.segL[0][1].split('..')[1])
			bp12 = (bp1, bp2)
		else:
			seq = mybasic.rc(r.seq(),'DNA')
			offset = len(seq)-int(match.segL[0][1].split('..')[1])
			bp12 = (bp2, bp1)

		mybasic.addHash(seqH,bp12,(offset,seq))

	seqL = seqH.items()
	seqL.sort(lambda x,y: cmp(len(y[1]),len(x[1])))

	for ((bp1,bp2), vL) in seqL:

		vL.sort(lambda x,y: cmp(y[0],x[0]))

		maxOffset = vL[0][0]

		print '\n',bp1,bp2,len(vL),'\n'

		for (offset,seq) in vL:

			print '%s%s %s' % (' ' * (maxOffset-offset),seq[:offset],seq[offset:])
예제 #13
0
def loadRefFlatByChr(
        refFlatFileName='/data1/Sequence/ucsc_hg19/annot/refFlat_splice_EGFR.txt'
):
    """Read a refFlat table and group its records by chromosome.

    Lines are parsed with ``mygenome.processRefFlatLine``; each record is
    added under ``record['chrom']``.  The input file is closed explicitly
    once all lines have been consumed.

    Returns:
        dict filled through ``mybasic.addHash`` (presumably chromosome ->
        list of records -- confirm against mybasic).
    """
    byChrom = {}

    with open(refFlatFileName) as refFlatFile:
        for line in refFlatFile:
            rec = mygenome.processRefFlatLine(line)
            mybasic.addHash(byChrom, rec['chrom'], rec)

    return byChrom
예제 #14
0
def process_bp(inGsnapFileName,outBpFileName):
    """Write, per junction, the flanking regions and the spanning reads.

    Each gsnap record must be a ``(transloc)`` pair with exactly two
    segments on one strand in its first match; anything else raises
    ``Exception``.  Reads on the minus strand are reverse-complemented and
    their junction ends swapped.  Each output line has the form::

        <chrom>:<pos-blockSize>-<pos>,<chrom>:<pos>-<pos+blockSize>,<off:seq|...>

    The output file is closed on return and on error.

    NOTE(review): ``blockSize`` is read from module scope (not defined
    here).  Both regions are computed from ``j1`` only and ``j2`` is never
    used -- confirm that this is intended.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)

    with open(outBpFileName, 'w') as outBpFile:

        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception('expected a (transloc) pair relation')

            if len(match.segL) != 2:
                raise Exception('expected exactly two alignment segments')

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            # Leading character of a segment locus is the strand sign.
            if s1[0] != s2[0]:
                raise Exception('segments are on different strands')

            plusStrand = (s1[0] == '+')

            s1T = re.match(r'[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match(r'[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if plusStrand:
                seq = r.seq()
            else:
                seq = mybasic.rc(r.seq(), 'DNA')

            # upper bound of the first segment's "a..b" range
            splitPos = int(match.segL[0][1].split('..')[1])

            if plusStrand:
                offset, junction = splitPos, (s1T, s2T)
            else:
                offset, junction = len(seq) - splitPos, (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((j1, j2), vL) in seqH.items():

            vL.sort(key=lambda v: v[0])  # ascending split offset

            vL_mod = []
            for (offset, seq) in vL:
                vL_mod.append('%s:%s' % (blockSize - offset + 1, seq))

            chrom = j1[0].split('_')[0]
            pos = j1[1]

            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' %
                            (chrom, int(pos) - blockSize, pos,
                             chrom, pos, int(pos) + blockSize,
                             '|'.join(vL_mod)))
예제 #15
0
def loadRefFlatByChr(refFlatFileName='/Z/Sequence/ucsc_hg19/annot/refFlat.txt'):
    """Load refFlat records grouped by chromosome, always including ``chrM``.

    Each line of ``refFlatFileName`` is parsed with ``processRefFlatLine``
    and added under ``record['chrom']``.  The file is closed once it has
    been read.

    Returns:
        dict built through ``mybasic.addHash`` (presumably chromosome ->
        list of records -- confirm); ``'chrM'`` maps to an empty list when
        the file holds no record for it.
    """
    h = {}

    with open(refFlatFileName) as refFlatFile:
        for line in refFlatFile:
            r = processRefFlatLine(line)
            mybasic.addHash(h, r['chrom'], r)

    # Callers can index 'chrM' even when the table has no such record.
    h.setdefault('chrM', [])

    return h
예제 #16
0
def loadAnnot(geneL=[]):
    """Index exon boundary positions per chromosome from the refFlat table.

    For every transcript (restricted to ``geneL`` unless it equals ``[]``)
    and every exon, one boundary is recorded: element 1 of the exon entry on
    the ``+`` strand, element 0 otherwise.  Exons are numbered 1..n along
    the list on ``+`` and n..1 on the other strand.

    Returns a 4-tuple ``(eiH, ei_keyH, juncInfoH, ei_cntH)``:
        eiH[chrom][pos]       -> 0 for every recorded position
        ei_keyH[chrom]        -> sorted list of the positions
        juncInfoH[chrom][pos] -> '<strand><gene>:<refSeqId>:<exon>/<total>'
                                 labels, collected through mybasic.addHash
        ei_cntH[chrom][pos]   -> 1-based rank of pos among the sorted
                                 positions of that chromosome
    """
    refFlatH = mygenome.loadRefFlatByChr()

    eiH, ei_keyH, juncInfoH = {}, {}, {}

    for chrom, transL in refFlatH.items():

        posH = eiH[chrom] = {}
        labelH = juncInfoH[chrom] = {}

        for tH in transL:

            if geneL != [] and tH['geneName'] not in geneL:
                continue

            exnL = tH['exnList']
            nExon = len(exnL)

            for idx, exn in enumerate(exnL):

                if tH['strand'] == '+':
                    pos, e_num = exn[1], idx + 1
                else:
                    pos, e_num = exn[0], nExon - idx

                label = '%s%s:%s:%s/%s' % (tH['strand'], tH['geneName'],
                                           tH['refSeqId'], e_num, nExon)
                mybasic.addHash(labelH, pos, label)
                posH[pos] = 0

        ei_keyH[chrom] = sorted(posH)

    ei_cntH = {}
    for chrom, labelH in juncInfoH.items():
        ei_cntH[chrom] = dict(
            (pos, rank + 1) for rank, pos in enumerate(sorted(labelH)))

    return eiH, ei_keyH, juncInfoH, ei_cntH
예제 #17
0
def parse(loc, juncInfo):
    """Group junction annotations at one location by strand-aware locus and gene.

    ``loc`` is ``<chrom>:<pos>``; ``juncInfo`` is a comma-separated list of
    ``<strand><gene>:<id>:<exonIdx>/<exonTot>`` labels.  For ``+`` labels the
    parsed locus keeps ``pos``; otherwise it uses ``pos + 1``.

    Returns a list of tuples
    ``(locParsed, geneN, alias, isLastExon, joinedLabels)`` where ``alias``
    is ``exonIdx/exonTot`` of the label with the largest exon total and
    ``isLastExon`` is True only if exonIdx equals exonTot in every label of
    the group.
    """
    chrom, pos = re.match('([^:]+):([^:]+)', loc).groups()

    grouped = {}

    for junc in juncInfo.split(','):

        fields = re.match('([+-])([^:]+):[^:]+:(.*)\/(.*)', junc).groups()
        strand, geneN, exonIdx, exonTot = fields

        coord = pos if strand == '+' else int(pos) + 1
        locParsed = '%s%s:%s' % (strand, chrom, coord)

        mybasic.addHash(grouped, (locParsed, geneN), junc)

    parseL = []

    for ((locParsed, geneN), juncL) in grouped.items():

        maxTrans = 0
        isLastExon = True

        for junc in juncL:

            fields = re.match('([+-])([^:]+):[^:]+:(.*)\/(.*)', junc).groups()
            strand, geneN, exonIdx, exonTot = fields

            isLastExon = isLastExon and exonIdx == exonTot

            # remember the label belonging to the longest transcript so far
            if int(exonTot) > maxTrans:
                alias = '%s/%s' % (exonIdx, exonTot)
                maxTrans = int(exonTot)

        parseL.append((locParsed, geneN, alias, isLastExon, ','.join(juncL)))

    return parseL
예제 #18
0
def parse(loc,juncInfo):
    """Collapse the junction labels of one location into per-gene summaries.

    ``loc`` has the form ``<chrom>:<pos>`` and ``juncInfo`` is a
    comma-separated list of ``<strand><gene>:<id>:<exonIdx>/<exonTot>``
    labels.  Labels are grouped by (strand-prefixed locus, gene); the locus
    uses ``pos`` for ``+`` labels and ``pos + 1`` for the others.

    Returns one tuple per group:
    ``(locParsed, geneN, alias, isLastExon, labels joined by ',')``.
    ``alias`` comes from the label with the highest exon total;
    ``isLastExon`` stays True only while every label has exonIdx == exonTot.
    """
    juncRe = '([+-])([^:]+):[^:]+:(.*)\/(.*)'

    rm = re.match('([^:]+):([^:]+)', loc)
    chrom = rm.group(1)
    pos = rm.group(2)

    labelsByKey = {}

    for junc in juncInfo.split(','):

        strand, geneN, exonIdx, exonTot = re.match(juncRe, junc).groups()

        if strand != '+':
            locParsed = '%s%s:%s' % (strand, chrom, int(pos) + 1)
        else:
            locParsed = '%s%s:%s' % (strand, chrom, pos)

        mybasic.addHash(labelsByKey, (locParsed, geneN), junc)

    parseL = []

    for key, juncL in labelsByKey.items():

        locParsed, geneN = key
        maxTrans = 0
        isLastExon = True

        for junc in juncL:

            strand, geneN, exonIdx, exonTot = re.match(juncRe, junc).groups()

            if exonIdx != exonTot:
                isLastExon = False

            total = int(exonTot)
            if total > maxTrans:
                alias = '%s/%s' % (exonIdx, exonTot)
                maxTrans = total

        parseL.append((locParsed, geneN, alias, isLastExon, ','.join(juncL)))

    return parseL
예제 #19
0
def loadAnnot(geneL=[]):
    """Index exon boundary positions per chromosome and mirror them to the DB.

    For every transcript (restricted to ``geneL`` unless it equals ``[]``)
    and every exon, one boundary is recorded: element 1 of the exon entry on
    the ``+`` strand, element 0 otherwise.  Each recorded (chrom, pos) is
    also written to ``temp_table`` through ``cursor``, a name that is not
    defined in this function and must exist at module level.

    Returns a 3-tuple ``(eiH, ei_keyH, juncInfoH)``:
        eiH[chrom][pos]       -> 0 for every recorded position
        ei_keyH[chrom]        -> sorted list of the positions
        juncInfoH[chrom][pos] -> '<strand><gene>:<refSeqId>:<exon>/<total>'
                                 labels, collected through mybasic.addHash
    """
    refFlatH = mygenome.loadRefFlatByChr()

    eiH, ei_keyH, juncInfoH = {}, {}, {}

    for chrom, transL in refFlatH.items():

        posH = eiH[chrom] = {}
        labelH = juncInfoH[chrom] = {}

        for tH in transL:

            if geneL != [] and tH['geneName'] not in geneL:
                continue

            exnL = tH['exnList']
            nExon = len(exnL)

            for idx, exn in enumerate(exnL):

                onPlus = (tH['strand'] == '+')
                pos = exn[1] if onPlus else exn[0]
                e_num = idx + 1 if onPlus else nExon - idx

                label = '%s%s:%s:%s/%s' % (tH['strand'], tH['geneName'],
                                           tH['refSeqId'], e_num, nExon)
                mybasic.addHash(labelH, pos, label)
                posH[pos] = 0

                cursor.execute('replace into temp_table (chrom,pos) values ("%s",%s)' % (chrom,pos))

        ei_keyH[chrom] = sorted(posH)

    return eiH, ei_keyH, juncInfoH
예제 #20
0
def main(dirPath):
    """List, for each TCGA individual, the sequencing types of its BAM files.

    Only names in ``dirPath`` matching ``TCGA-..-....-0`` ... ``.bam`` are
    considered (the original comment calls these "normal sample, bam").
    The individual id (``TCGA-xx-xxxx``) and a ``D``/``W`` code are taken
    from the name; ``-SD`` / ``-GA`` is appended when the name contains
    ``SOLiD`` / ``IlluminaGA``.

    Writes ``<individual>\\tXSeq_<types>`` to stdout, types unique, sorted
    and comma-joined.  A name that satisfies the coarse pattern but lacks
    the detailed barcode is skipped; previously it raised AttributeError
    because the failed match returned None.
    """
    coarse_re = re.compile(r'.*TCGA-..-....-0..*\.bam')
    detail_re = re.compile(r'.*(TCGA-..-....)-...-..([DW]).*\.bam')

    candidates = [name for name in os.listdir(dirPath) if coarse_re.match(name)]

    h = {}

    for name in candidates:

        rm = detail_re.match(name)
        if rm is None:
            continue  # no barcode detail to extract

        sN = rm.group(1)
        seqType = rm.group(2)
        matched = rm.group(0)

        if 'SOLiD' in matched:
            seqType += '-SD'
        elif 'IlluminaGA' in matched:
            seqType += '-GA'

        # mybasic.addHash presumably appends seqType to a list under sN.
        mybasic.addHash(h, sN, seqType)

    for (sN, typeL) in h.items():
        uniqueTypes = sorted(set(typeL))
        sys.stdout.write('%s\tXSeq_%s\n' % (sN, ','.join(uniqueTypes)))
예제 #21
0
def exonSkip_proc_annot(inReportFileName,
                        outReportFileName,
                        inCnaGctFileName=None):
    """Annotate an exon-skipping report with frame, gene and CNA information.

    Each input line has 8 tab-separated columns
    ``sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg`` where ``t1``/``t2`` are
    comma-separated ``<transcript>.exon<N>/<total>`` labels.  The output
    line has 16 columns: the first five inputs, frame consistency per
    transcript, gene names, CNA values, gene descriptions, census
    attributes, GO / KEGG / BioCarta sets, then ``nmatch, nseq, nreg``.

    Parameters:
        inReportFileName: report to annotate.
        outReportFileName: destination file (overwritten).
        inCnaGctFileName: optional file handed to ``mygenome.tcgaCnaDB``;
            when given, the sample name must contain a ``TCGA-dd-dddd`` id.

    Both files are closed when the function returns or raises, and a final
    line without a trailing newline keeps its last character.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()

    if inCnaGctFileName:
        cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName)
    else:
        cnaDB = None

    with open(outReportFileName, 'w') as outReportFile:
        with open(inReportFileName) as inReportFile:

            for line in inReportFile:

                (sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg) = \
                    line.rstrip('\n').split('\t')

                if inCnaGctFileName:
                    indivId = re.match(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*',
                                       sampN).group(1)

                geneS = set()
                geneH = {}  # transcript id -> exon numbers seen in t1 and t2

                for tL in (t1, t2):
                    for label in tL.split(','):

                        ro = re.match(r'(.*)\.exon([0-9]*)/[0-9]*', label)
                        transId = ro.group(1)

                        mybasic.addHash(geneH, transId, int(ro.group(2)))

                        g = mygenome.gene(transId, geneDB=geneDB)
                        if g.geneName:
                            geneS.add(g.geneName)

                frameL = []

                for transId in geneH:

                    exnList = geneH[transId]

                    # frame consistency needs exactly one exon from each side
                    if len(exnList) != 2:
                        continue

                    cons = mygenome.frameCons(transId, exnList[0],
                                              transId, exnList[1], frameInfoH)
                    if cons:
                        frameL.append('%s:%s' % (transId, cons))

                cnaInfo = []
                geneInfo = []
                censusInfo = []

                goInfoS = set()
                keggInfoS = set()
                biocInfoS = set()

                for geneName in geneS:

                    gene = mygenome.gene(geneName, geneDB=geneDB)

                    if cnaDB:
                        cnaInfo.append('%s:%s' %
                                       (geneName, cnaDB.query(indivId, geneName)))

                    geneInfo.append('%s:%s:%s' % (geneName,
                                                  gene.getAttr('desc'),
                                                  gene.getAttr('summary')))
                    censusInfo.append('%s:%s:%s:%s' %
                                      (gene.getAttr('census_somatic'),
                                       gene.getAttr('census_germline'),
                                       gene.getAttr('census_mutType'),
                                       gene.getAttr('census_translocPartners')))

                    goInfoS.update(gene.getAttr('go'))
                    keggInfoS.update(gene.getAttr('kegg'))
                    biocInfoS.update(gene.getAttr('bioc'))

                outReportFile.write(
                    '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                    (sampN, bp1, bp2, t1, t2,
                     ','.join(frameL), ';'.join(geneS), ','.join(cnaInfo),
                     ';'.join(geneInfo), ';'.join(censusInfo),
                     ';'.join(map(str, goInfoS)),
                     ';'.join(map(str, keggInfoS)),
                     ';'.join(map(str, biocInfoS)),
                     nmatch, nseq, nreg))
예제 #22
0
파일: drugRespByFus.py 프로젝트: SMC1/JK1
def main(inDrugFileName,outDirName,outFileName,geneL='wg',cutoff=0.05, plottype='AUC', plot='FALSE', outPlotDirName='/home/heejin/DrugScreening/figure',seqType='WTS'):
    """Test drug response against fusion status, gene by gene, via an R script.

    Parameters:
        inDrugFileName: comma-separated table; the header lists drug names
            from column 1 on, each further row is ``sample id, value, ...``.
        outDirName: directory receiving the scratch file ``temp4test.txt``.
        outFileName: result table; rewritten with a header line here and
            passed to the R script for every gene.
        geneL: ``'wg'`` (all gene names in refFlat_hg19), ``'cs'`` (gene
            symbols in cs_gene) or an explicit iterable of gene names.
        cutoff: fusions whose allele fraction is below this are ignored.
        plottype, plot, outPlotDirName: forwarded to the R script.
        seqType: ``'WES'`` selects ``WESidH``, anything else ``WTSidH``
            (module-level id mappings: screening id -> database sample id).

    For every gene with at least one in-frame fusion row, a scratch table
    ``Drug, Gene, ds_id, db_id, AUC, Alt`` is written and
    ``drugRespByFus.R`` is run on it.  Genes without rows are skipped
    without touching the scratch file.
    """
    idH = WESidH if seqType == 'WES' else WTSidH

    # drug name -> {screening sample id: response value as read (string)}
    drugH = {}

    with open(inDrugFileName) as inFile:

        # NOTE(review): [:-2] removes the last two characters of each line;
        # this presumably assumes "\r\n" (or ",\n") line ends -- confirm
        # against the input format before changing it.
        drugL = inFile.readline()[:-2].split(',')[1:]

        for drug in drugL:
            drugH[drug] = {}

        for line in inFile:

            dataL = line[:-2].split(',')
            sId = dataL[0]

            for i, drug in enumerate(drugL):
                drugH[drug][sId] = dataL[i + 1]

    con, cursor = mymysql.connectDB(db='common')

    if geneL == 'wg':
        cursor.execute('SELECT distinct geneName FROM refFlat_hg19')
        geneL = [x for (x,) in cursor.fetchall()]
    elif geneL == 'cs':
        cursor.execute('SELECT distinct gene_sym FROM cs_gene')
        geneL = [x for (x,) in cursor.fetchall()]

    # fusion

    with open(outFileName, 'w') as outFile:
        outFile.write('Drug\tGene\tp_twosided\tp_greater\tp_less\tD\tp_twosided2\tD2\twtN\tmutN\twt_sampN\tmut_sampN\twilcox_p\tttest_p\tmed_z.score\tmean_z.score\tAltInfo\n')

    con, cursor = mymysql.connectDB(db='ircr1')

    cursor.execute('select distinct samp_id from rpkm_gene_expr')
    procSampS = set([x for (x,) in cursor.fetchall()])

    # Every drug holds the same sample ids (each row fills all drugs), so the
    # screened-id set is loop-invariant and taken from the first drug.
    if drugL:
        scr_idS = set(drugH[drugL[0]])
    else:
        scr_idS = set()

    tempFileName = '%s/temp4test.txt' % outDirName

    for gN in geneL:

        # NOTE(review): gN is interpolated into the SQL text; a gene name
        # containing a double quote would break the statement.
        cursor.execute('SELECT samp_id,gene_sym1,gene_sym2,nReads/(nReads+nReads_w1) as maf '
                       'FROM splice_fusion_AF '
                       'where (gene_sym1 ="%s" or gene_sym2="%s") and nPos>2 and frame like "%s:Y%s" ' % (gN, gN, '%', '%'))
        result = cursor.fetchall()

        # Checked before the scratch file is opened so that no file handle
        # is left open for genes without fusion rows.
        if len(result) == 0:
            continue

        mutH = {}

        for (dbId, gs, gs2, maf) in result:

            try:
                belowCutoff = float(maf) < cutoff
            except (TypeError, ValueError):
                # maf is NULL or not numeric
                continue

            if belowCutoff:
                continue

            mybasic.addHash(mutH, dbId, ('%s_%s' % (gs, gs2), maf))

        with open(tempFileName, 'w') as tempFile:

            tempFile.write('Drug\tGene\tds_id\tdb_id\tAUC\tAlt\n')

            for drug in drugH.keys():

                for sampId in idH.keys():

                    if sampId not in scr_idS:
                        continue

                    dbId = idH[sampId]

                    if dbId not in procSampS:
                        continue

                    if dbId in mutH:
                        Alt = '/'.join(map(str, mutH[dbId]))
                    else:
                        Alt = 'NA'

                    tempFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (drug, gN, sampId, dbId, drugH[drug][sampId], Alt))

        os.system('Rscript ~/JK1/NSL/HTS/drugRespByFus.R %s %s %s %s %s' % (tempFileName, outFileName,plot,outPlotDirName,plottype))
예제 #23
0
def main(inFileDir,outFileName):
    """Collapse probe-level methylation files into per-gene mean values.

    Two kinds of files are read from ``inFileDir``:

    * ``*HumanMethylation*``: the sample id is the 28 characters starting at
      ``TCGA-`` in the file name; after two header lines column 3 is the
      gene field and column 2 the value (``NA`` rows are skipped).  The
      platform is ``Infinium450k`` when the name contains
      ``HumanMethylation450``, else ``Infinium27k``.  Files are visited in
      reverse name order.
    * ``*OMA002*`` (``GoldenGate3k``): the sample id is the second column of
      the first line; after two header lines each row is ``<gene>_<loc>``
      and a value (``N/A`` rows are skipped).

    For each sample (first file wins) one line
    ``platform, sample id, patient id, T/N, gene, mean`` is written per gene;
    gene fields containing ``;`` yield one line per name.  ``T`` is used
    when characters 13-14 of the sample id are below 10, ``N`` otherwise.

    Fixes over the previous revision: the GoldenGate section now computes
    the mean of its own values (it used to reuse a stale ``v`` from the
    Infinium section or fail with NameError); all files are closed; a
    GoldenGate name without ``_`` keeps its last character.
    """
    registry = set()  # sample ids already written

    with open(outFileName, 'w') as outFile:

        inFileNameL = glob.glob('%s/*HumanMethylation*' % (inFileDir,))
        inFileNameL.sort(reverse=True)

        for inFileName in inFileNameL:

            start = inFileName.index('TCGA-')
            sId = inFileName[start:start + 28]

            if sId in registry:
                continue

            registry.add(sId)

            pId = sId[:12]

            if int(sId[13:15]) < 10:
                TN = 'T'
            else:
                TN = 'N'

            if 'HumanMethylation450' in inFileName:
                platform = 'Infinium450k'
            else:
                platform = 'Infinium27k'

            print('%s %s' % (sId, platform))

            geneH = {}

            with open(inFileName) as inFile:

                # two header lines
                inFile.readline()
                inFile.readline()

                for line in inFile:

                    tokL = line.rstrip().split('\t')

                    geneN = tokL[3]
                    value = tokL[2]

                    if not geneN or value == 'NA':
                        continue

                    mybasic.addHash(geneH, geneN, float(value))

            for geneN, valueL in geneH.items():

                v = numpy.mean(valueL)

                for g in geneN.split(';'):
                    outFile.write('%s\t%s\t%s\t%s\t%s\t%.2f\n' % (platform, sId, pId, TN, g, v))

        platform = 'GoldenGate3k'

        for inFileName in glob.glob('%s/*OMA002*' % (inFileDir,)):

            geneH = {}

            with open(inFileName) as inFile:

                sId = inFile.readline().rstrip().split('\t')[1]

                if sId in registry:
                    continue

                registry.add(sId)

                pId = sId[:12]

                if int(sId[13:15]) < 10:
                    TN = 'T'
                else:
                    TN = 'N'

                inFile.readline()  # second header line

                print('%s %s' % (sId, platform))

                for line in inFile:

                    name, value = line.rstrip().split('\t')
                    geneN = name.partition('_')[0]

                    if not geneN or value == 'N/A':
                        continue

                    mybasic.addHash(geneH, geneN, float(value))

            for geneN, valueL in geneH.items():

                v = numpy.mean(valueL)

                for g in geneN.split(';'):
                    outFile.write('%s\t%s\t%s\t%s\t%s\t%.2f\n' % (platform, sId, pId, TN, g, v))
예제 #24
0
def exonSkip_filter(inFileName, outFileName):
    """Filter in exon-skipping candidates from splice-mapped gsnap output.

    A record is examined when it maps to exactly one locus and its first
    match has two segments.  It is dropped when any segment has more than 2
    unmatched positions, under 90 percent match or a span below 5.  Exon
    labels (``<transcript>.exon<N>/<total>``, ``|``-separated) are collected
    per transcript until a segment without label is met.  The raw record is
    written when the smallest exon-number distance among transcripts seen
    in exactly two labels equals 1.

    Parameters:
        inFileName: gsnap result file.
        outFileName: destination; written gzip-compressed when the name ends
            in ``.gz``.

    Prints ``Results: <written> <examined>``.  The output file is closed
    explicitly, on success and on error.
    """
    result = mygsnap.gsnapFile(inFileName, False)

    if outFileName.endswith('.gz'):
        outFile = gzip.open(outFileName, 'wb')
    else:
        outFile = open(outFileName, 'w')

    count_all = 0
    count_include = 0

    label_re = re.compile(r'(.*)\.exon([0-9]+)\/[0-9]+')

    try:
        for r in result:

            if r.nLoci != 1:
                continue

            match = r.matchL()[0]

            if len(match.segL) != 2:
                continue

            jncH = {}  # transcript id -> exon numbers, one per labelled segment
            skip = False

            for segObj in match.getSegInfo():

                if segObj.span - segObj.numMatch > 2 or segObj.percMatch < 90 or segObj.span < 5:
                    skip = True
                    break

                # An unlabelled segment ends label collection but does not
                # reject the record.
                if segObj.label == '':
                    break

                for b in segObj.label.split('|'):
                    rm2 = label_re.match(b)
                    mybasic.addHash(jncH, rm2.group(1), int(rm2.group(2)))

            if skip:
                continue

            exonNumLL = list(jncH.values())

            if exonNumLL and max([len(e) for e in exonNumLL]) > 1:

                # 100 is the starting distance used when no transcript has
                # exactly two exon numbers.
                minDist = min([100] + [abs(e[0] - e[1])
                                       for e in exonNumLL if len(e) == 2])

                if minDist == 1:  # only difference
                    outFile.write(r.rawText() + '\n')
                    count_include += 1

            count_all += 1
    finally:
        outFile.close()

    print('Results: %s %s' % (count_include, count_all))
예제 #25
0
def exonSkip_proc_annot(inReportFileName,outReportFileName,inCnaGctFileName=None):
    """Add frame, gene, census, gene-set and CNA columns to an exon-skip report.

    Input lines carry 8 tab-separated columns
    ``sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg``; ``t1`` and ``t2`` are
    comma-separated ``<transcript>.exon<N>/<total>`` labels.  Each output
    line has 16 columns: the first five inputs, per-transcript frame
    consistency, gene names, CNA values, gene descriptions, census
    attributes, GO / KEGG / BioCarta sets, and ``nmatch, nseq, nreg``.

    Parameters:
        inReportFileName: report to read.
        outReportFileName: annotated report to write (overwritten).
        inCnaGctFileName: optional input for ``mygenome.tcgaCnaDB``; when
            set, every sample name must contain a ``TCGA-dd-dddd`` id.

    Input and output files are closed on return and on error; the last line
    is handled correctly even without a trailing newline.
    """
    geneDB = mygenome.getGeneDB()
    frameInfoH = mygenome.getFrameInfoH()

    cnaDB = mygenome.tcgaCnaDB(inCnaGctFileName) if inCnaGctFileName else None

    label_re = re.compile(r'(.*)\.exon([0-9]*)/[0-9]*')
    indiv_re = re.compile(r'.*(TCGA-[0-9]{2}-[0-9]{4}).*')

    with open(outReportFileName, 'w') as outReportFile:
        with open(inReportFileName) as inReportFile:

            for line in inReportFile:

                (sampN, bp1, bp2, t1, t2, nmatch, nseq, nreg) = line.rstrip('\n').split('\t')

                if inCnaGctFileName:
                    indivId = indiv_re.match(sampN).group(1)

                geneS = set()
                geneH = {}  # transcript id -> exon numbers from t1 and t2

                for tL in (t1, t2):
                    for label in tL.split(','):

                        ro = label_re.match(label)
                        transId = ro.group(1)
                        exonNum = int(ro.group(2))

                        mybasic.addHash(geneH, transId, exonNum)

                        g = mygenome.gene(transId, geneDB=geneDB)
                        if g.geneName:
                            geneS.add(g.geneName)

                frameL = []

                for transId, exnList in geneH.items():

                    # only transcripts seen once on each side are evaluated
                    if len(exnList) != 2:
                        continue

                    cons = mygenome.frameCons(transId, exnList[0], transId, exnList[1], frameInfoH)
                    if cons:
                        frameL.append('%s:%s' % (transId, cons))

                cnaInfo = []
                geneInfo = []
                censusInfo = []

                goInfoS = set()
                keggInfoS = set()
                biocInfoS = set()

                for geneName in geneS:

                    gene = mygenome.gene(geneName, geneDB=geneDB)

                    if cnaDB:
                        cnaInfo.append('%s:%s' % (geneName, cnaDB.query(indivId, geneName)))

                    geneInfo.append('%s:%s:%s' % (geneName, gene.getAttr('desc'), gene.getAttr('summary')))
                    censusInfo.append('%s:%s:%s:%s' % (gene.getAttr('census_somatic'),
                                                       gene.getAttr('census_germline'),
                                                       gene.getAttr('census_mutType'),
                                                       gene.getAttr('census_translocPartners')))

                    goInfoS.update(gene.getAttr('go'))
                    keggInfoS.update(gene.getAttr('kegg'))
                    biocInfoS.update(gene.getAttr('bioc'))

                outReportFile.write(
                    '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                    (sampN, bp1, bp2, t1, t2,
                     ','.join(frameL), ';'.join(geneS), ','.join(cnaInfo),
                     ';'.join(geneInfo), ';'.join(censusInfo),
                     ';'.join(map(str, goInfoS)),
                     ';'.join(map(str, keggInfoS)),
                     ';'.join(map(str, biocInfoS)),
                     nmatch, nseq, nreg))
예제 #26
0
파일: exonSkip_filter.py 프로젝트: SMC1/JK1
def exonSkip_filter(inFileName,outFileName):
    """Filter in exon-skipping candidates from splice-mapped gsnap output.

    A record is examined when it maps to exactly one locus and its first
    match has two segments.  It is dropped when any segment has more than 2
    unmatched positions, under 90 percent match or a span below 5.  Exon
    labels (``<transcript>.exon<N>/<total>``, ``|``-separated) are collected
    per transcript until a segment without label is met.  The raw record is
    written when some transcript carries more than one exon number and the
    smallest distance among transcripts with exactly two exon numbers is
    greater than 1 (the distance defaults to 100 when there is none).

    Prints ``Results: <written> <examined>``.  The output file is closed
    explicitly, on success and on error.
    """
    result = mygsnap.gsnapFile(inFileName, False)

    count_all = 0
    count_include = 0

    label_re = re.compile(r'(.*)\.exon([0-9]+)\/[0-9]+')

    with open(outFileName, 'w') as outFile:

        for r in result:

            if r.nLoci != 1:
                continue

            match = r.matchL()[0]

            if len(match.segL) != 2:
                continue

            jncH = {}  # transcript id -> exon numbers, one per labelled segment
            skip = False

            for segObj in match.getSegInfo():

                if segObj.span - segObj.numMatch > 2 or segObj.percMatch < 90 or segObj.span < 5:
                    skip = True
                    break

                # An unlabelled segment stops label collection without
                # rejecting the record.
                if segObj.label == '':
                    break

                for b in segObj.label.split('|'):
                    rm2 = label_re.match(b)
                    mybasic.addHash(jncH, rm2.group(1), int(rm2.group(2)))

            if skip:
                continue

            exonNumLL = list(jncH.values())

            if exonNumLL and max([len(e) for e in exonNumLL]) > 1:

                minDist = min([100] + [abs(e[0] - e[1])
                                       for e in exonNumLL if len(e) == 2])

                if minDist > 1:
                    outFile.write(r.rawText() + '\n')
                    count_include += 1

            count_all += 1

    print('Results: %s %s' % (count_include, count_all))
예제 #27
0
def main(inFileDir, outFileName):
    """Summarise probe-level methylation files as per-gene mean values.

    Reads two file families from ``inFileDir``:

    * ``*HumanMethylation*`` files: sample id = 28 characters from ``TCGA-``
      in the file name; two header lines; gene field in column 3, value in
      column 2; rows with an empty gene or value ``NA`` are skipped.
      Platform is ``Infinium450k`` if the name contains
      ``HumanMethylation450``, otherwise ``Infinium27k``.  Files are
      processed in reverse name order.
    * ``*OMA002*`` files (platform ``GoldenGate3k``): sample id = second
      column of the first line; two header lines; rows are
      ``<gene>_<loc>`` and a value; ``N/A`` values are skipped.

    Only the first file seen for a sample id is used.  Output lines are
    ``platform, sample id, patient id (first 12 chars), T/N, gene, mean``
    with ``T`` when characters 13-14 of the sample id are below 10.

    Fixes over the previous revision: the GoldenGate loop computes its own
    per-gene mean (it used to write a stale ``v`` left over from the
    Infinium loop, or raise NameError); every file is closed; a GoldenGate
    probe name without ``_`` is no longer truncated by one character.
    """

    def tumor_or_normal(sampleId):
        # characters 13-14 of the sample id hold a two-digit sample code
        return 'T' if int(sampleId[13:15]) < 10 else 'N'

    def write_gene_means(outFile, platform, sId, geneH):
        # one output line per gene name; ';'-joined names share one mean
        pId = sId[:12]
        TN = tumor_or_normal(sId)
        for geneN, valueL in geneH.items():
            v = numpy.mean(valueL)
            for g in geneN.split(';'):
                outFile.write('%s\t%s\t%s\t%s\t%s\t%.2f\n' %
                              (platform, sId, pId, TN, g, v))

    registry = set()  # sample ids already processed

    with open(outFileName, 'w') as outFile:

        inFileNameL = glob.glob('%s/*HumanMethylation*' % (inFileDir, ))
        inFileNameL.sort(reverse=True)

        for inFileName in inFileNameL:

            start = inFileName.index('TCGA-')
            sId = inFileName[start:start + 28]

            if sId in registry:
                continue
            registry.add(sId)

            # Validate the sample code before reading, as the previous
            # revision did.
            tumor_or_normal(sId)

            if 'HumanMethylation450' in inFileName:
                platform = 'Infinium450k'
            else:
                platform = 'Infinium27k'

            print('%s %s' % (sId, platform))

            geneH = {}

            with open(inFileName) as inFile:

                inFile.readline()
                inFile.readline()

                for line in inFile:

                    tokL = line.rstrip().split('\t')
                    geneN = tokL[3]
                    value = tokL[2]

                    if not geneN or value == 'NA':
                        continue

                    mybasic.addHash(geneH, geneN, float(value))

            write_gene_means(outFile, platform, sId, geneH)

        platform = 'GoldenGate3k'

        for inFileName in glob.glob('%s/*OMA002*' % (inFileDir, )):

            geneH = {}

            with open(inFileName) as inFile:

                sId = inFile.readline().rstrip().split('\t')[1]

                if sId in registry:
                    continue
                registry.add(sId)

                tumor_or_normal(sId)

                inFile.readline()

                print('%s %s' % (sId, platform))

                for line in inFile:

                    name, value = line.rstrip().split('\t')
                    geneN = name.partition('_')[0]

                    if not geneN or value == 'N/A':
                        continue

                    mybasic.addHash(geneH, geneN, float(value))

            write_gene_means(outFile, platform, sId, geneH)