def main(inFileName, outFileName, inRefFlatFileName='/data1/Sequence/ucsc_hg19/annot/refFlat.txt', assembly='hg19'):
    """Report which genes' CDS is overlapped by each non-'gain' segment.

    Segment rows are read from ``inFileName`` as tab-separated text with six
    fields, unpacked as (sample ID, chromosome, start, end, type, cn).  Rows
    whose first field is ``'ID'`` are skipped.  For every gene name found in
    the first column of ``inRefFlatFileName`` (sorted, de-duplicated) a
    transcript is requested from ``mygenome.transcript``; genes for which that
    call raises are skipped.  One ``sampleID<TAB>gene<TAB>type`` line is
    written to ``outFileName`` for each gene/segment pair whose type is not
    ``'gain'`` and for which ``trans.cdsOverlap(...)`` is positive.

    Args:
        inFileName: path of the tab-separated segment file.
        outFileName: path of the report to create (overwritten).
        inRefFlatFileName: path of the refFlat annotation file.
        assembly: assembly label passed through to ``mygenome.transcript``.

    Raises:
        ValueError: if a segment row does not have exactly six fields or its
            start/end field is not an integer.
    """
    # Gene names: unique values of the first refFlat column, sorted.
    with open(inRefFlatFileName) as refFlatFile:
        geneNameL = sorted(set(line.split('\t')[0] for line in refFlatFile))

    # Parse every segment once instead of once per gene.  rstrip('\n') is used
    # rather than line[:-1] so that a final line without a trailing newline
    # does not lose the last character of its last field.
    segL = []
    with open(inFileName) as inFile:
        for line in inFile:
            tokL = line.rstrip('\n').split('\t')
            if tokL[0] == 'ID':
                continue
            (sId, chrNum, chrSta, chrEnd, segType, cn) = tokL
            segL.append((sId, (chrNum, int(chrSta), int(chrEnd)), segType))

    with open(outFileName, 'w') as outFile:
        for geneName in geneNameL:
            try:
                trans = mygenome.transcript(geneName, inRefFlatFileName, assembly)
            except Exception:
                # Best effort, as before: genes whose transcript cannot be
                # built are skipped.  Unlike a bare ``except:``, this lets
                # KeyboardInterrupt / SystemExit stop a long run.
                continue

            for (sId, region, segType) in segL:
                # Testing the type first avoids computing the overlap for
                # 'gain' rows.  NOTE(review): assumes cdsOverlap has no side
                # effects - confirm against mygenome.
                if segType == 'gain':
                    continue
                if trans.cdsOverlap(region) > 0:
                    outFile.write('%s\t%s\t%s\n' % (sId, geneName, segType))
def main(inFileName,
         outFileName,
         inRefFlatFileName='/data1/Sequence/ucsc_hg19/annot/refFlat.txt',
         assembly='hg19'):
    """Write the gene hits of every segment that is not typed 'gain'.

    Input (``inFileName``): tab-separated rows with six fields, unpacked as
    (sample ID, chromosome, start, end, type, cn); rows whose first field is
    ``'ID'`` are ignored.

    Genes: the sorted set of first-column values of ``inRefFlatFileName``.
    Each gene is turned into a transcript via ``mygenome.transcript``; a gene
    is skipped when that call raises.

    Output (``outFileName``): one ``sampleID<TAB>gene<TAB>type`` line per
    gene/segment pair where the type differs from ``'gain'`` and
    ``trans.cdsOverlap((chrom, start, end))`` is greater than zero.  Lines
    are grouped by gene (sorted) and, within a gene, follow input order.

    Args:
        inFileName: segment file path.
        outFileName: output file path (created or truncated).
        inRefFlatFileName: refFlat annotation file path.
        assembly: assembly label forwarded to ``mygenome.transcript``.

    Raises:
        ValueError: on a segment row without exactly six fields, or with a
            non-integer start/end.
    """
    refFlatFile = open(inRefFlatFileName)
    try:
        geneNameL = sorted(set(line.split('\t')[0] for line in refFlatFile))
    finally:
        # The original never closed this handle.
        refFlatFile.close()

    inFile = open(inFileName)
    try:
        # Strip only the newline: slicing with [:-1] would eat a real
        # character from a last line that has no trailing newline.
        rowL = [line.rstrip('\n').split('\t') for line in inFile]
    finally:
        inFile.close()

    # Convert coordinates once, up front, rather than once per gene.  The
    # local is called segType because ``type`` would shadow the builtin.
    segL = []
    for rowTokL in rowL:
        if rowTokL[0] == 'ID':
            continue
        (sId, chrNum, chrSta, chrEnd, segType, cn) = rowTokL
        segL.append((sId, segType, (chrNum, int(chrSta), int(chrEnd))))

    with open(outFileName, 'w') as outFile:
        for geneName in geneNameL:
            try:
                trans = mygenome.transcript(geneName, inRefFlatFileName,
                                            assembly)
            except Exception:
                # Skip genes that cannot be resolved, but do not swallow
                # KeyboardInterrupt / SystemExit as a bare except would.
                continue

            for (sId, segType, region) in segL:
                # NOTE(review): 'gain' rows are filtered before the overlap
                # call; this presumes cdsOverlap is side-effect free.
                if segType != 'gain' and trans.cdsOverlap(region) > 0:
                    outFile.write('%s\t%s\t%s\n' % (sId, geneName, segType))
def main(inSegFileName, inRefFlatFileName, outFileName, geneNameL, assembly='hg19'):
    """Write, per gene, the CDS-overlap-weighted segment value of each sample.

    The sample label is parsed from the base name of ``inSegFileName``, which
    must match ``(.*)_([TXC].{,2})_.*.ngCGH.seg``: the label is the first
    group, with ``_<postfix>`` appended unless the postfix is exactly ``'T'``.

    Segment rows are tab-separated with six fields, unpacked as
    (sample ID, chromosome, start, end, numMarker, value); rows whose first
    field is ``'ID'`` are skipped and a leading ``'chr'`` is removed from the
    chromosome.  For each gene a transcript is requested from
    ``mygenome.transcript``; genes for which that call raises are skipped.
    For every segment on the transcript's chromosome whose value is not one
    of ``'NA'``, ``'null'``, ``'NULL'`` and whose ``trans.cdsOverlap(...)`` is
    positive, ``overlap / trans.cdsLen * value`` is added to that sample's
    total.  One line per gene is written to ``outFileName``:
    ``label<TAB>gene`` followed by one total per sample ID (sorted).

    Args:
        inSegFileName: path of the ``.ngCGH.seg`` segment file.
        inRefFlatFileName: path of the refFlat annotation file.
        outFileName: path of the table to create (overwritten).
        geneNameL: gene names to process, in the given order.  When empty or
            None, every distinct first-column value of the refFlat file is
            used, sorted.
        assembly: assembly label passed through to ``mygenome.transcript``.

    Raises:
        AttributeError: if the base name of ``inSegFileName`` does not match
            the expected pattern (``re.match`` returns None).
        ValueError: if a segment row does not have exactly six fields, or a
            start/end/value that is actually used is not numeric.
    """
    (sid, postfix) = re.match('(.*)_([TXC].{,2})_.*.ngCGH.seg', inSegFileName.split('/')[-1]).groups()
    sampN = sid if postfix == 'T' else sid + '_' + postfix

    # ``not geneNameL`` also covers None and empty tuples, which the original
    # ``== []`` comparison let through to fail (None) or do nothing (()).
    if not geneNameL:
        with open(inRefFlatFileName) as refFlatFile:
            geneNameL = sorted(set(line.split('\t')[0] for line in refFlatFile))

    # Read the segments once and bucket them by chromosome, so each gene only
    # visits the segments that can overlap it.  File order is kept inside a
    # bucket, so per-sample sums accumulate in the same order as before.
    sIdSet = set()
    segByChrH = {}
    with open(inSegFileName) as segFile:
        for line in segFile:
            # rstrip('\n') instead of [:-1]: a last line without a trailing
            # newline must not lose the final character of its value.
            tokL = line.rstrip('\n').split('\t')
            if tokL[0] == 'ID':
                continue
            (sId, chrNum, chrSta, chrEnd, numMarker, value) = tokL
            sIdSet.add(sId)
            # Prefix test instead of "'chr' in ..." + re.match, which raised
            # AttributeError when 'chr' appeared anywhere but the start.
            if chrNum.startswith('chr'):
                chrNum = chrNum[3:]
            if value in ('NA', 'null', 'NULL'):
                continue
            segByChrH.setdefault(chrNum, []).append((sId, chrNum, chrSta, chrEnd, value))
    sIdL = sorted(sIdSet)

    with open(outFileName, 'w') as outFile:
        for geneName in geneNameL:
            # Progress trace; the parenthesised form prints the same text
            # under Python 2 and also compiles under Python 3.
            print(geneName)

            try:
                trans = mygenome.transcript(geneName, inRefFlatFileName, assembly)
            except Exception:
                # Best effort, as before: skip genes whose transcript cannot
                # be built, without swallowing KeyboardInterrupt/SystemExit.
                continue

            h = dict.fromkeys(sIdL, 0.)

            for (sId, chrNum, chrSta, chrEnd, value) in segByChrH.get(trans.chrNum, ()):
                overlap = trans.cdsOverlap((chrNum, int(chrSta), int(chrEnd)))
                if overlap > 0:
                    h[sId] += overlap / float(trans.cdsLen) * float(value)

            fieldL = [sampN, geneName] + ['%s' % h[sId] for sId in sIdL]
            outFile.write('\t'.join(fieldL) + '\n')
# Fragment of a larger routine: inSegFileMem, inSegFileName, geneNameL and
# inRefFlatFileName come from code that is not visible in this excerpt.

# Sorted, de-duplicated sample IDs (first field of every parsed segment row).
sIdL = sorted(set([tokL[0] for tokL in inSegFileMem]))

# The GCT file is named after the first element returned by mygp.stripPath().
outGctFileName = '%s.gct' % mygp.stripPath(inSegFileName)[0]
outGctFile = open(outGctFileName, 'w')

# Header: version tag, "<gene count>\t<sample count>", then the column names.
outGctFile.write('#1.2\n')
outGctFile.write('%s\t%s\n' % (len(geneNameL), len(sIdL)))
outGctFile.write('NAME\tDescription\t%s\n' % '\t'.join(sIdL))

for geneName in geneNameL:
    # Progress trace (same output as the Python 2 print statement).
    print(geneName)

    # Original author's note: consider only the longest transcript per gene.
    trans = mygenome.transcript(geneName, inRefFlatFileName, 'hg18')

    # One accumulator per sample, starting at 0.0.
    h = dict.fromkeys(sIdL, 0.)

    for tokL in inSegFileMem:
        (sId, chrNum, chrSta, chrEnd, numMarker, value) = tokL

        # Skip segments on another chromosome or without a usable value.
        if chrNum != trans.chrNum:
            continue
        if value in ('NA', 'null', 'NULL'):
            continue

        # NOTE(review): the statements that consume `overlap` lie beyond the
        # end of this excerpt.
        overlap = trans.cdsOverlap((chrNum, int(chrSta), int(chrEnd)))
# Excerpt from the middle of a routine; inSegFileMem, inSegFileName,
# geneNameL and inRefFlatFileName are defined outside the visible text.

# Collect each distinct sample ID (column 0 of the segment rows), sorted.
sIdL = sorted(set([tokL[0] for tokL in inSegFileMem]))

# Output path: "<first element of mygp.stripPath(inSegFileName)>.gct".
outGctFileName = '%s.gct' % mygp.stripPath(inSegFileName)[0]
outGctFile = open(outGctFileName, 'w')

# Three header lines: '#1.2', the gene/sample counts, and the column titles.
outGctFile.write('#1.2\n')
outGctFile.write('%s\t%s\n' % (len(geneNameL), len(sIdL)))
outGctFile.write('NAME\tDescription\t%s\n' % '\t'.join(sIdL))

for geneName in geneNameL:
    # Echo the gene being processed (identical output to `print geneName`).
    print(geneName)

    # Per the original comment: consider only the longest transcript per gene.
    trans = mygenome.transcript(geneName, inRefFlatFileName, 'hg18')

    # Per-sample running totals, all initialised to 0.0.
    h = dict.fromkeys(sIdL, 0.)

    for tokL in inSegFileMem:
        (sId, chrNum, chrSta, chrEnd, numMarker, value) = tokL

        # Ignore rows from other chromosomes and rows with a missing value.
        if chrNum != trans.chrNum:
            continue
        if value in ('NA', 'null', 'NULL'):
            continue

        # NOTE(review): `overlap` is used by code past the end of this
        # excerpt, which is not shown here.
        overlap = trans.cdsOverlap((chrNum, int(chrSta), int(chrEnd)))
# Fragment of a larger routine: inSegFileMem, geneNameL, inRefFlatFileName,
# assembly and outGctFile come from code that is not visible in this excerpt.

# Sorted, de-duplicated sample IDs (first field of every parsed segment row).
sIdL = sorted(set([tokL[0] for tokL in inSegFileMem]))

# NOTE(review): the local creation of outGctFile (a '<name>.gct' file built
# via mygp.stripPath) was commented out in the original; outGctFile is
# presumably supplied by the enclosing scope - confirm.

# Header: version tag, "<gene count>\t<sample count>", then the column names.
outGctFile.write('#1.2\n')
outGctFile.write('%s\t%s\n' % (len(geneNameL), len(sIdL)))
outGctFile.write('NAME\tDescription\t%s\n' % '\t'.join(sIdL))

for geneName in geneNameL:
    # Progress trace (same output as the Python 2 print statement).
    print(geneName)

    trans = mygenome.transcript(geneName, inRefFlatFileName, assembly)

    # One accumulator per sample, starting at 0.0.
    h = dict.fromkeys(sIdL, 0.)

    for tokL in inSegFileMem:
        (sId, chrNum, chrSta, chrEnd, numMarker, value) = tokL

        # Skip segments on another chromosome or without a usable value.
        if chrNum != trans.chrNum:
            continue
        if value in ('NA', 'null', 'NULL'):
            continue

        # NOTE(review): the statements that consume `overlap` lie beyond the
        # end of this excerpt.
        overlap = trans.cdsOverlap((chrNum, int(chrSta), int(chrEnd)))
# Excerpt from the middle of a routine; inSegFileMem, geneNameL,
# inRefFlatFileName, assembly and outGctFile are defined outside the visible
# text.

# Collect each distinct sample ID (column 0 of the segment rows), sorted.
sIdL = sorted(set([tokL[0] for tokL in inSegFileMem]))

# NOTE(review): the original carried commented-out lines that opened a
# '<name>.gct' file here; outGctFile is presumably opened by the surrounding
# code instead - confirm.

# Three header lines: '#1.2', the gene/sample counts, and the column titles.
outGctFile.write('#1.2\n')
outGctFile.write('%s\t%s\n' % (len(geneNameL), len(sIdL)))
outGctFile.write('NAME\tDescription\t%s\n' % '\t'.join(sIdL))

for geneName in geneNameL:
    # Echo the gene being processed (identical output to `print geneName`).
    print(geneName)

    trans = mygenome.transcript(geneName, inRefFlatFileName, assembly)

    # Per-sample running totals, all initialised to 0.0.
    h = dict.fromkeys(sIdL, 0.)

    for tokL in inSegFileMem:
        (sId, chrNum, chrSta, chrEnd, numMarker, value) = tokL

        # Ignore rows from other chromosomes and rows with a missing value.
        if chrNum != trans.chrNum:
            continue
        if value in ('NA', 'null', 'NULL'):
            continue

        # NOTE(review): `overlap` is used by code past the end of this
        # excerpt, which is not shown here.
        overlap = trans.cdsOverlap((chrNum, int(chrSta), int(chrEnd)))