示例#1
0
文件: loh2gene.py 项目: SMC1/JK1
def main(inFileName, outFileName, inRefFlatFileName='/data1/Sequence/ucsc_hg19/annot/refFlat.txt', assembly='hg19'):
	"""Write every non-gain segment that overlaps the coding sequence of a gene.

	Gene names are the distinct values of the first tab-separated column of
	the refFlat file, processed in sorted order. Each name is resolved with
	mygenome.transcript(); genes for which that call raises are skipped.
	For every remaining gene, each data row of the input file is tested with
	trans.cdsOverlap((chrNum, start, end)); when the result is positive and
	the row's type is not 'gain', one line 'sample<TAB>gene<TAB>type' is
	written to the output file.

	Args:
		inFileName: tab-separated file with six columns per row
			(sample ID, chromosome, start, end, type, cn). Rows whose first
			column is 'ID' are treated as headers; blank lines are ignored.
		outFileName: path of the file to (over)write.
		inRefFlatFileName: refFlat annotation file, also passed on to
			mygenome.transcript().
		assembly: assembly name passed on to mygenome.transcript().

	Raises:
		ValueError: if a data row does not have exactly six columns or its
			start/end is not an integer.
	"""

	# Distinct gene names, sorted so the output order is deterministic.
	with open(inRefFlatFileName) as refFlatFile:
		geneNameL = sorted(set(line.split('\t')[0] for line in refFlatFile))

	# Parse the segment file once; the coordinates do not depend on the gene.
	segL = []
	with open(inFileName) as inFile:
		for line in inFile:
			# rstrip instead of line[:-1]: the last line may lack a newline.
			line = line.rstrip('\r\n')
			if not line:
				continue
			tokL = line.split('\t')
			if tokL[0] == 'ID':
				continue
			# cn is not used below; unpacking it still enforces six columns.
			(sId, chrNum, chrSta, chrEnd, segType, cn) = tokL
			segL.append((sId, (chrNum, int(chrSta), int(chrEnd)), segType))

	with open(outFileName, 'w') as outFile:
		for geneName in geneNameL:

			try:
				trans = mygenome.transcript(geneName, inRefFlatFileName, assembly)
			except Exception:
				# Deliberate best effort: a gene that cannot be resolved is skipped.
				continue

			for (sId, region, segType) in segL:
				overlap = trans.cdsOverlap(region)
				if overlap > 0 and segType != 'gain':
					outFile.write('%s\t%s\t%s\n' % (sId, geneName, segType))
示例#2
0
def main(inFileName,
         outFileName,
         inRefFlatFileName='/data1/Sequence/ucsc_hg19/annot/refFlat.txt',
         assembly='hg19'):
    """Write every non-gain segment that overlaps the coding sequence of a gene.

    Gene names are the distinct values of the first tab-separated column of
    the refFlat file, processed in sorted order. Each name is resolved with
    mygenome.transcript(); genes for which that call raises are skipped.
    For every remaining gene, each data row of the input file is tested with
    trans.cdsOverlap((chrNum, start, end)); when the result is positive and
    the row's type is not 'gain', one line 'sample<TAB>gene<TAB>type' is
    written to the output file.

    Args:
        inFileName: tab-separated file with six columns per row
            (sample ID, chromosome, start, end, type, cn). Rows whose first
            column is 'ID' are treated as headers; blank lines are ignored.
        outFileName: path of the file to (over)write.
        inRefFlatFileName: refFlat annotation file, also passed on to
            mygenome.transcript().
        assembly: assembly name passed on to mygenome.transcript().

    Raises:
        ValueError: if a data row does not have exactly six columns or its
            start/end is not an integer.
    """

    # Distinct gene names, sorted so the output order is deterministic.
    with open(inRefFlatFileName) as refFlatFile:
        geneNameL = sorted(set(line.split('\t')[0] for line in refFlatFile))

    # Parse the segment file once; the coordinates do not depend on the gene.
    segL = []
    with open(inFileName) as inFile:
        for line in inFile:
            # rstrip instead of line[:-1]: the last line may lack a newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            tokL = line.split('\t')
            if tokL[0] == 'ID':
                continue
            # cn is not used below; unpacking it still enforces six columns.
            (sId, chrNum, chrSta, chrEnd, segType, cn) = tokL
            segL.append((sId, (chrNum, int(chrSta), int(chrEnd)), segType))

    with open(outFileName, 'w') as outFile:
        for geneName in geneNameL:

            try:
                trans = mygenome.transcript(geneName, inRefFlatFileName,
                                            assembly)
            except Exception:
                # Deliberate best effort: an unresolvable gene is skipped.
                continue

            for (sId, region, segType) in segL:
                overlap = trans.cdsOverlap(region)
                if overlap > 0 and segType != 'gain':
                    outFile.write('%s\t%s\t%s\n' % (sId, geneName, segType))
示例#3
0
def main(inSegFileName, inRefFlatFileName, outFileName, geneNameL, assembly='hg19'):
	"""Summarise segment values per gene, weighted by CDS overlap.

	For every gene, each segment on the gene's chromosome contributes
	overlap / trans.cdsLen * value to the total of the segment's sample ID,
	where overlap is trans.cdsOverlap((chrNum, start, end)) and only positive
	overlaps count. One output line is written per gene:
	sample name, gene name, then one total per sample ID (sorted), all
	tab-separated. Each gene name is also printed to stdout as progress.

	Args:
		inSegFileName: path of the segment file. Its base name must match
			'<sid>_<T|X..|C..>_<anything>.ngCGH.seg'; the sample name is
			<sid> when the postfix is 'T', otherwise '<sid>_<postfix>'.
			Rows have six tab-separated columns (sample ID, chromosome,
			start, end, number of markers, value). Rows whose first column
			is 'ID' are headers; blank lines are ignored; rows whose value
			is 'NA', 'null' or 'NULL' contribute nothing. A leading 'chr'
			on the chromosome is removed before comparing with trans.chrNum.
		inRefFlatFileName: refFlat annotation file, passed on to
			mygenome.transcript() and used as the gene list when geneNameL
			is empty.
		outFileName: path of the file to (over)write.
		geneNameL: gene names to process; when empty (or None) all distinct
			names from the first refFlat column are used, sorted.
		assembly: assembly name passed on to mygenome.transcript().

	Raises:
		AttributeError: if the segment file name does not match the
			expected pattern.
		ValueError: if a data row does not have exactly six columns, or a
			used row has a non-numeric start, end or value.
	"""

	(sid, postfix) = re.match('(.*)_([TXC].{,2})_.*.ngCGH.seg', inSegFileName.split('/')[-1]).groups()
	sampN = sid if postfix == 'T' else sid + '_' + postfix

	if not geneNameL:
		with open(inRefFlatFileName) as refFlatFile:
			geneNameL = sorted(set(line.split('\t')[0] for line in refFlatFile))

	# Read the segments once and group them by chromosome so that each gene
	# only visits the rows that can overlap it.
	sIdSet = set()
	segByChrH = {}
	with open(inSegFileName) as inSegFile:
		for line in inSegFile:
			# rstrip instead of line[:-1]: the last line may lack a newline,
			# in which case line[:-1] would truncate the value column.
			line = line.rstrip('\r\n')
			if not line:
				continue
			tokL = line.split('\t')
			if tokL[0] == 'ID':
				continue
			(sId, chrNum, chrSta, chrEnd, numMarker, value) = tokL
			# Every sample gets an output column, even if all its rows are skipped.
			sIdSet.add(sId)
			if value in ('NA', 'null', 'NULL'):
				continue
			if chrNum.startswith('chr'):
				chrNum = chrNum[3:]
			# Numeric conversion is left to the gene loop so that rows which
			# never match a gene's chromosome are not validated.
			segByChrH.setdefault(chrNum, []).append((sId, chrSta, chrEnd, value))

	sIdL = sorted(sIdSet)

	with open(outFileName, 'w') as outFile:
		for geneName in geneNameL:

			print(geneName)

			try:
				trans = mygenome.transcript(geneName, inRefFlatFileName, assembly)
			except Exception:
				# Deliberate best effort: a gene that cannot be resolved is skipped.
				continue

			h = dict.fromkeys(sIdL, 0.)
			chrNum = trans.chrNum

			for (sId, chrSta, chrEnd, value) in segByChrH.get(chrNum, []):
				overlap = trans.cdsOverlap((chrNum, int(chrSta), int(chrEnd)))
				if overlap > 0:
					h[sId] += overlap / float(trans.cdsLen) * float(value)

			fieldL = ['%s' % sampN, '%s' % geneName] + ['%s' % h[sId] for sId in sIdL]
			outFile.write('\t'.join(fieldL) + '\n')
示例#4
0
文件: seg2gct.py 项目: SMC1/JK1
# Distinct sample IDs (first column of every segment row), in sorted order.
sIdL = sorted(set(tokL[0] for tokL in inSegFileMem))

# The output file is named after the input segment file, with a .gct suffix.
outGctFileName = '%s.gct' % mygp.stripPath(inSegFileName)[0]
outGctFile = open(outGctFileName,'w')

# Header: version line, then gene and sample counts, then the column names.
outGctFile.writelines([
	'#1.2\n',
	'%s\t%s\n' % (len(geneNameL),len(sIdL)),
	'NAME\tDescription\t%s\n' % '\t'.join(sIdL),
])


for geneName in geneNameL:

	# Progress output: one gene name per line.
	print(geneName)

	# consider only the longest transcript per gene
	trans = mygenome.transcript(geneName,inRefFlatFileName,'hg18')

	# Per-sample accumulator for this gene, starting at zero for every sample.
	h = dict.fromkeys(sIdL, 0.)

	for tokL in inSegFileMem:

		(sId,chrNum,chrSta,chrEnd,numMarker,value) = tokL

		# Only segments on the transcript's chromosome can contribute.
		if chrNum != trans.chrNum:
			continue

		# Segments without a usable value are ignored.
		if value in ('NA','null','NULL'):
			continue

		region = (chrNum,int(chrSta),int(chrEnd))
		overlap = trans.cdsOverlap(region)
示例#5
0
# Distinct sample IDs (first column of every segment row), in sorted order.
sIdL = sorted(set(tokL[0] for tokL in inSegFileMem))

# The output file is named after the input segment file, with a .gct suffix.
outGctFileName = '%s.gct' % mygp.stripPath(inSegFileName)[0]
outGctFile = open(outGctFileName, 'w')

# Header: version line, then gene and sample counts, then the column names.
outGctFile.writelines([
    '#1.2\n',
    '%s\t%s\n' % (len(geneNameL), len(sIdL)),
    'NAME\tDescription\t%s\n' % '\t'.join(sIdL),
])

for geneName in geneNameL:

    # Progress output: one gene name per line.
    print(geneName)

    # consider only the longest transcript per gene
    trans = mygenome.transcript(geneName, inRefFlatFileName, 'hg18')

    # Per-sample accumulator for this gene, starting at zero for every sample.
    h = dict.fromkeys(sIdL, 0.)

    for tokL in inSegFileMem:

        (sId, chrNum, chrSta, chrEnd, numMarker, value) = tokL

        # Only segments on the transcript's chromosome can contribute.
        if chrNum != trans.chrNum:
            continue

        # Segments without a usable value are ignored.
        if value in ('NA', 'null', 'NULL'):
            continue

        region = (chrNum, int(chrSta), int(chrEnd))
        overlap = trans.cdsOverlap(region)
示例#6
0
文件: seg2gct.py 项目: SMC1/JK1
# Distinct sample IDs (first column of every segment row), in sorted order.
sIdL = sorted(set(tokL[0] for tokL in inSegFileMem))

# NOTE(review): the lines that opened outGctFile here are disabled, so
# outGctFile is presumably opened elsewhere before this point - confirm.
#outGctFileName = '%s.gct' % mygp.stripPath(inSegFileName)[0]
#outGctFile = open(outGctFileName,'w')

# Header: version line, then gene and sample counts, then the column names.
outGctFile.writelines([
	'#1.2\n',
	'%s\t%s\n' % (len(geneNameL),len(sIdL)),
	'NAME\tDescription\t%s\n' % '\t'.join(sIdL),
])

for geneName in geneNameL:

	# Progress output: one gene name per line.
	print(geneName)

	trans = mygenome.transcript(geneName,inRefFlatFileName,assembly)

	# Per-sample accumulator for this gene, starting at zero for every sample.
	h = dict.fromkeys(sIdL, 0.)

	for tokL in inSegFileMem:

		(sId,chrNum,chrSta,chrEnd,numMarker,value) = tokL

		# Only segments on the transcript's chromosome can contribute.
		if chrNum != trans.chrNum:
			continue

		# Segments without a usable value are ignored.
		if value in ('NA','null','NULL'):
			continue

		region = (chrNum,int(chrSta),int(chrEnd))
		overlap = trans.cdsOverlap(region)
示例#7
0
# Distinct sample IDs (first column of every segment row), in sorted order.
sIdL = sorted(set(tokL[0] for tokL in inSegFileMem))

# NOTE(review): the lines that opened outGctFile here are disabled, so
# outGctFile is presumably opened elsewhere before this point - confirm.
#outGctFileName = '%s.gct' % mygp.stripPath(inSegFileName)[0]
#outGctFile = open(outGctFileName,'w')

# Header: version line, then gene and sample counts, then the column names.
outGctFile.writelines([
    '#1.2\n',
    '%s\t%s\n' % (len(geneNameL), len(sIdL)),
    'NAME\tDescription\t%s\n' % '\t'.join(sIdL),
])

for geneName in geneNameL:

    # Progress output: one gene name per line.
    print(geneName)

    trans = mygenome.transcript(geneName, inRefFlatFileName, assembly)

    # Per-sample accumulator for this gene, starting at zero for every sample.
    h = dict.fromkeys(sIdL, 0.)

    for tokL in inSegFileMem:

        (sId, chrNum, chrSta, chrEnd, numMarker, value) = tokL

        # Only segments on the transcript's chromosome can contribute.
        if chrNum != trans.chrNum:
            continue

        # Segments without a usable value are ignored.
        if value in ('NA', 'null', 'NULL'):
            continue

        region = (chrNum, int(chrSta), int(chrEnd))
        overlap = trans.cdsOverlap(region)