doCDS = False limitNeighbor = True if '-force': limitNeighbor = False hitRDS = readDataset(hitfile, verbose=True, cache=doCache) readlen = hitRDS.getReadSize() normalizationFactor = 1.0 if normalizeBins: totalCount = len(hitRDS) normalizationFactor = totalCount / 1000000. hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) hg = Genome(genome) idb = geneinfoDB(cache=doCache) gidBins = {} gidLen = {} geneinfoDict = idb.getallGeneInfo(genome) if doFlank: locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor) else: locusByChromDict = getLocusByChromDict(hg,
print 'usage: python %s genome GOID1 [GOID2 ....] [-outfile outfilename] [-append] [-restrict genefile]' sys.exit(1) genome = sys.argv[1] writeOut = False if '-outfile' in sys.argv: writeOut = True outfilename = sys.argv[sys.argv.index('-outfile') + 1] restrict = False if '-restrict' in sys.argv: restrictfilename = sys.argv[sys.argv.index('-restrict') + 1] restrict = True hg = Genome(genome) idb = geneinfoDB() GOIDlist = [] for arg in sys.argv: if 'GO:' in arg: GOIDlist.append(arg) print sys.argv print GOIDlist firstGeneList = [] for GOID in GOIDlist: testList = hg.allGIDsbyGOID(GOID) print 'GOID: %s (%d)' % (GOID, len(testList)) firstGeneList += testList
pass #Main program if len(sys.argv) < 3: print 'usage: python2.5 %s genome snpsfile nondbsnp_geneinfo_outfile' % sys.argv[ 0] sys.exit(1) outStr = "" genome = sys.argv[1] snpfile = sys.argv[2] outfilename = sys.argv[3] infile = file(snpfile, 'r') hg = Genome(genome) additionalDict = {} outS = "" outfile = open(outfilename, 'w') outfile.write( "#Sl\tCl\tchrom\tmis pos\t\tmatch\tuniq_mis\ttot_mis\tbase_chg\tknown_snp\tfunction\tgene\tgeneId\trpkm\n" ) for line in infile: if line[0] == '#': continue fields = line.split() if fields[8].find('N\A') == -1: outfile.write(line) else: outS = ''
if '-cache' in sys.argv: doCache = True printSeq = False if '-printseq' in sys.argv: printSeq = True maxPvalue = 0.0001 mot = Motif('', motifFile=motfilename) motLen = len(mot) bestScore = mot.bestConsensusScore() if hasMotifExtension: print "will use cistematic.core.motif C-extension to speed up motif search" hg = Genome(genome) # minHits=-1 will force regions to be used regardless # maxDist= 0 prevents merging of non-overlapping regions if '-nomerge' in sys.argv: regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, doMerge=False, keepPeak=usePeak) else: regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True,
def _readExonsByTranscript(gtf, classCode=None):
    """Collect the exon records of a GTF file, grouped by transcript.

    Only lines whose third column is ``exon`` are used.  A transcript is keyed
    by its ``transcript_name`` attribute when present, otherwise by
    ``transcript_id``.  When *classCode* is given, exons whose ``class_code``
    attribute is missing or different are dropped.

    Returns a dict mapping transcript key -> list of
    ``(chromosome, left, right, strand)`` tuples in file order.
    """
    exonsByTranscript = {}
    lineCount = 0
    gtfFile = open(gtf)
    try:
        for line in gtfFile:
            lineCount += 1
            if lineCount % 100000 == 0:
                print('%d lines processed' % lineCount)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or truncated lines have no feature/attribute columns.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if classCode is not None:
                if 'class_code "' not in attributes:
                    continue
                if attributes.split('class_code "')[1].split('";')[0] != classCode:
                    continue
            if 'transcript_name "' in attributes:
                marker = 'transcript_name "'
            else:
                marker = 'transcript_id "'
            transcriptID = attributes.split(marker)[1].split('";')[0]
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            exonsByTranscript.setdefault(transcriptID, []).append(exon)
    finally:
        gtfFile.close()
    return exonsByTranscript


def _fetchExonSequence(hg, chrom, start, length, reverse=False):
    """Fetch one exon window, falling back to base-by-base retrieval.

    The whole window is requested first; if that raises, every position of
    the same window is requested individually and unreadable positions become
    ``N``.  With *reverse* set the result is reverse-complemented.

    Returns ``(sequence, missed)`` where *missed* counts the ``N`` fill-ins.
    """
    try:
        exonSequence = hg.sequence(chrom, start, length)
        if reverse:
            exonSequence = getReverseComplement(exonSequence)
        return exonSequence, 0
    except Exception:
        print("can't retrieve sequence %s %d %d" % (chrom, start, length))

    bases = []
    missed = 0
    # Walk exactly the window requested above: start .. start + length - 1.
    for position in range(start, start + length):
        try:
            base = hg.sequence(chrom, position, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    if reverse:
        # Bases were complemented one at a time; reversing the order
        # completes the reverse complement of the exon.
        bases.reverse()
    return ''.join(bases), missed


def main(argv):
    """Write the spliced sequence of every transcript in a GTF file as FASTA.

    Command line: ``genome gtf outfilename [-polyA length]``.

    Exons are grouped per transcript, sorted, and concatenated; transcripts
    whose first exon is on ``-``/``R`` are assembled from the last exon to the
    first and reverse-complemented.  Each record header is
    ``>transcript:chrom:leftmost-rightmost-sense``.  With ``-polyA`` a run of
    ``length`` A's is appended to every sequence.  Transcripts on any other
    strand symbol are reported and skipped.

    Exits with status 1 after printing usage when arguments are missing.
    """
    if len(argv) < 4:
        print('usage: python %s genome gtf outfilename [-polyA length]' % argv[0])
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]

    tail = ''
    if '-polyA' in argv:
        tailsize = int(argv[argv.index('-polyA') + 1])
        tail = 'A' * tailsize
        # Two spaces reproduce the original comma-separated print output.
        print('will add a polyA tail of  %d nt' % tailsize)

    outfile = open(outputfilename, 'w')
    try:
        hg = Genome(genome)
        TranscriptDict = _readExonsByTranscript(gtf)
        print('Found %d transcripts' % len(TranscriptDict))

        missed = 0
        g = 0
        for transcript in TranscriptDict:
            g += 1
            if g % 1000 == 0:
                print('%d transcripts sequences processed' % g)

            exons = sorted(TranscriptDict[transcript])
            orientation = exons[0][3]
            if orientation in ('+', 'F'):
                sense = 'plus_strand'
                reverse = False
            elif orientation in ('-', 'R'):
                sense = 'minus_strand'
                reverse = True
                exons.reverse()
            else:
                print('skipping %s: unsupported strand %r' % (transcript, orientation))
                continue

            pieces = []
            leftEnds = []
            rightEnds = []
            for (chrom, left, right, _strand) in exons:
                leftEnds.append(left)
                rightEnds.append(right)
                if reverse:
                    start, length = left - 1, right - left + 1
                else:
                    # NOTE(review): this plus-strand window is kept from the
                    # original; it is shifted and one base shorter than the
                    # minus-strand window above.  Confirm against the
                    # coordinate convention of Genome.sequence.
                    start, length = left, right - left
                # chrom[3:] drops the leading three characters (e.g. 'chr').
                exonSequence, exonMissed = _fetchExonSequence(hg, chrom[3:], start, length, reverse)
                pieces.append(exonSequence)
                missed += exonMissed

            # chrom is the chromosome of the last exon processed above.
            outfile.write('>%s:%s:%d-%d-%s\n' % (transcript, chrom, min(leftEnds), max(rightEnds), sense))
            outfile.write(''.join(pieces) + tail + '\n')

        if missed:
            print('%d bases could not be retrieved and were written as N' % missed)
    finally:
        outfile.close()
from cistematic.genomes import Genome

# Dump every merlen-long window of a genomic region, upper-cased, one per
# line.  The region is given as chrAny:start-stop on the command line.
print('%s: version 1.1' % sys.argv[0])
if len(sys.argv) < 5:
    print('usage: python %s genome merlen chrAny:start-stop outfile' % sys.argv[0])
    sys.exit(1)

genome, merlen, location, outfilename = sys.argv[1], int(sys.argv[2]), sys.argv[3], sys.argv[4]

# Split "chrN:start-stop"; the first three characters of the chromosome
# label are dropped before it is handed to the genome object.
(chrom, pos) = location.split(':')
chrom = chrom[3:]
(start, stop) = pos.split('-')
start = int(start)
regionlength = int(stop) - start + 1

hg = Genome(genome)
seq = hg.sequence(chrom, start, regionlength)

# NOTE(review): regionlength - merlen windows are written, so the window
# starting at the last possible offset is not emitted - confirm intended.
merCount = regionlength - merlen
outfile = open(outfilename, 'w')
print('writing %d %d-mers' % (merCount, merlen))
outfile.writelines(seq[offset:offset + merlen].upper() + '\n' for offset in range(merCount))
outfile.close()
def _readExonsByTranscript(gtf, classCode=None):
    """Collect the exon records of a GTF file, grouped by transcript.

    Only lines whose third column is ``exon`` are used.  A transcript is keyed
    by its ``transcript_name`` attribute when present, otherwise by
    ``transcript_id``.  When *classCode* is given, exons whose ``class_code``
    attribute is missing or different are dropped.

    Returns a dict mapping transcript key -> list of
    ``(chromosome, left, right, strand)`` tuples in file order.
    """
    exonsByTranscript = {}
    lineCount = 0
    gtfFile = open(gtf)
    try:
        for line in gtfFile:
            lineCount += 1
            if lineCount % 100000 == 0:
                print('%d lines processed' % lineCount)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or truncated lines have no feature/attribute columns.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if classCode is not None:
                if 'class_code "' not in attributes:
                    continue
                if attributes.split('class_code "')[1].split('";')[0] != classCode:
                    continue
            if 'transcript_name "' in attributes:
                marker = 'transcript_name "'
            else:
                marker = 'transcript_id "'
            transcriptID = attributes.split(marker)[1].split('";')[0]
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            exonsByTranscript.setdefault(transcriptID, []).append(exon)
    finally:
        gtfFile.close()
    return exonsByTranscript


def _fetchExonSequence(hg, chrom, start, length, reverse=False):
    """Fetch one exon window, falling back to base-by-base retrieval.

    The whole window is requested first; if that raises, every position of
    the same window is requested individually and unreadable positions become
    ``N``.  With *reverse* set the result is reverse-complemented.

    Returns ``(sequence, missed)`` where *missed* counts the ``N`` fill-ins.
    """
    try:
        exonSequence = hg.sequence(chrom, start, length)
        if reverse:
            exonSequence = getReverseComplement(exonSequence)
        return exonSequence, 0
    except Exception:
        print("can't retrieve sequence %s %d %d" % (chrom, start, length))

    bases = []
    missed = 0
    # Walk exactly the window requested above: start .. start + length - 1.
    for position in range(start, start + length):
        try:
            base = hg.sequence(chrom, position, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    if reverse:
        # Bases were complemented one at a time; reversing the order
        # completes the reverse complement of the exon.
        bases.reverse()
    return ''.join(bases), missed


def main(argv):
    """Write the spliced sequence of every transcript in a GTF file as FASTA.

    Command line: ``genome gtf outfilename [-polyA length]``.

    Exons are grouped per transcript, sorted, and concatenated; transcripts
    whose first exon is on ``-``/``R`` are assembled from the last exon to the
    first and reverse-complemented.  Each record header is
    ``>transcript:chrom:leftmost-rightmost-sense``.  With ``-polyA`` a run of
    ``length`` A's is appended to every sequence.  Transcripts on any other
    strand symbol are reported and skipped.

    Exits with status 1 after printing usage when arguments are missing.
    """
    if len(argv) < 4:
        print('usage: python %s genome gtf outfilename [-polyA length]' % argv[0])
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]

    tail = ''
    if '-polyA' in argv:
        tailsize = int(argv[argv.index('-polyA') + 1])
        tail = 'A' * tailsize
        # Two spaces reproduce the original comma-separated print output.
        print('will add a polyA tail of  %d nt' % tailsize)

    outfile = open(outputfilename, 'w')
    try:
        hg = Genome(genome)
        TranscriptDict = _readExonsByTranscript(gtf)
        print('Found %d transcripts' % len(TranscriptDict))

        missed = 0
        g = 0
        for transcript in TranscriptDict:
            g += 1
            if g % 1000 == 0:
                print('%d transcripts sequences processed' % g)

            exons = sorted(TranscriptDict[transcript])
            orientation = exons[0][3]
            if orientation in ('+', 'F'):
                sense = 'plus_strand'
                reverse = False
            elif orientation in ('-', 'R'):
                sense = 'minus_strand'
                reverse = True
                exons.reverse()
            else:
                print('skipping %s: unsupported strand %r' % (transcript, orientation))
                continue

            pieces = []
            leftEnds = []
            rightEnds = []
            for (chrom, left, right, _strand) in exons:
                leftEnds.append(left)
                rightEnds.append(right)
                if reverse:
                    start, length = left - 1, right - left + 1
                else:
                    # NOTE(review): this plus-strand window is kept from the
                    # original; it is shifted and one base shorter than the
                    # minus-strand window above.  Confirm against the
                    # coordinate convention of Genome.sequence.
                    start, length = left, right - left
                # chrom[3:] drops the leading three characters (e.g. 'chr').
                exonSequence, exonMissed = _fetchExonSequence(hg, chrom[3:], start, length, reverse)
                pieces.append(exonSequence)
                missed += exonMissed

            # chrom is the chromosome of the last exon processed above.
            outfile.write('>%s:%s:%d-%d-%s\n' % (transcript, chrom, min(leftEnds), max(rightEnds), sense))
            outfile.write(''.join(pieces) + tail + '\n')

        if missed:
            print('%d bases could not be retrieved and were written as N' % missed)
    finally:
        outfile.close()
fullOnly = False if '-fullOnly' in sys.argv: fullOnly = True #mot = Motif('',motifFile = motifDir + 'NRSE2.mot') #motL = Motif('',motifFile = motifDir + 'NRSE2left.mot') #motR = Motif('',motifFile = motifDir + 'NRSE2right.mot') mot = Motif('',motifFile = motifDir + 'NRSE3.mot') motL = Motif('',motifFile = motifDir + 'NRSE3left.mot') motR = Motif('',motifFile = motifDir + 'NRSE3right.mot') bestScore = mot.bestConsensusScore() bestLeft = motL.bestConsensusScore() bestRight = motR.bestConsensusScore() hg = Genome(genome) regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False) outfile = open(outfilename,'w') outfile.write('#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n' % (chipfilename, infilename, normalize, doMarkov1)) outfile.write('#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %d bp\n' % (enforcePeakDist, maxpeakdist, fullOnly)) outfile.write('#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n') countList = [] posList = [] index = 0 regionList = [] for rchrom in regions:
doCache = False if '-cache' in sys.argv: doCache = True bins = 10 standardMinThresh = standardMinDist / bins hitRDS = readDataset(hitfile, verbose=True, cache=doCache) readlen = hitRDS.getReadSize() normalizationFactor = 1.0 if normalize: totalCount = len(hitRDS) normalizationFactor = totalCount / 1000000. hg = Genome(genome) idb = geneinfoDB(cache=True) gidDict = {} geneinfoDict = idb.getallGeneInfo(genome) featuresDict = hg.getallGeneFeatures() #infile = open(infilename) outfile = open(outfilename, 'w') gidList = hg.allGIDs() gidList.sort() for gid in gidList: symbol = 'LOC' + gid geneinfo = '' featureList = []
altPosList.append(altPos) posLine[pos] = line if trackStrand: if 'RNAFARP' in line: posStrand[pos] = '+' posStrand[altPos] = '+' else: posStrand[pos] = '-' posStrand[altPos] = '-' geneList = [] geneDict = {} if maxRadius < step: step = maxRadius - 2 hg = Genome(genome, inRAM=True) if extendGenome != '': hg.extendFeatures(extendGenome, replace=replaceModels) geneannotDict = hg.allAnnotInfo() #featureTypes = ['CDS'] + hg.getFeatureTypes('UT%') featureTypes = ['CDS', 'UTR'] for radius in range(1, maxRadius, step): print 'radius %d' % radius print len(posList) if radius == 1: posDict = genesIntersecting(genome, posList, extendGen=extendGenome, replaceMod=replaceModels) else:
RDS = readDataset(rdsfile, verbose=True, cache=doCache) rdsChromList = RDS.getChromosomes() if doVerbose: print time.ctime() distinct = 0 total = 0 outfile = open(outfilename, 'w') idb = geneinfoDB() if genome == 'dmelanogaster': geneinfoDict = idb.getallGeneInfo(genome, infoKey='locus') else: geneinfoDict = idb.getallGeneInfo(genome) hg = Genome(genome) geneannotDict = hg.allAnnotInfo() assigned = {} farConnected = {} for achrom in rdsChromList: if achrom == 'chrM': continue print achrom uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
# Optional gene-model extension: "-models <name>" names the extra models and
# "-replacemodels" makes them replace, rather than extend, the built-in ones.
replaceModels = False
extendGenome = ''
if '-models' in sys.argv:
    modelsArgIndex = sys.argv.index('-models')
    extendGenome = sys.argv[modelsArgIndex + 1]
    replaceModels = '-replacemodels' in sys.argv
    if replaceModels:
        print("will replace gene models with %s" % extendGenome)
    else:
        print("will extend gene models with %s" % extendGenome)

# "-cache <pages>" switches to the cached gene database and records the
# requested page count for the read dataset.
cachePages = 0
doCache = False
if '-cache' not in sys.argv:
    hg = Genome(genome, inRAM=True)
    idb = geneinfoDB()
else:
    cacheGeneDB(genome)
    hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
    idb = geneinfoDB(cache=True)
    print('%s cached' % genome)
    doCache = True
    cacheArgIndex = sys.argv.index('-cache')
    cachePages = int(sys.argv[cacheArgIndex + 1])

if extendGenome:
    hg.extendFeatures(extendGenome, replace=replaceModels)

hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
# Only change the dataset cache when more pages than its default were asked for.
if hitRDS.getDefaultCacheSize() < cachePages:
    hitRDS.setDBcache(cachePages)
psyco.full() except: pass from cistematic.genomes import Genome from math import log import os.path import sys print '%s: version 2.1' % sys.argv[0] if len(sys.argv) < 6: print 'usage: python %s genome outimage gofileroot1 title1 cohortsize1 [gofileroot2 title2 cohortsize2 ...] [-fontsize pts] [-length in] [-width in]' % sys.argv[ 0] sys.exit(1) hg = Genome(sys.argv[1]) allgodesc = hg.allGOterms() godesc = [] import matplotlib matplotlib.use('Agg') from pylab import * doGray = False rootdir = './' imagename = sys.argv[2] options = 0 fontSize = 5
# Optional gene-model extension: "-models <name>" names the extra models and
# "-replacemodels" makes them replace, rather than extend, the built-in ones.
replaceModels = False
extendGenome = ''
if '-models' in sys.argv:
    modelsArgIndex = sys.argv.index('-models')
    extendGenome = sys.argv[modelsArgIndex + 1]
    replaceModels = '-replacemodels' in sys.argv
    if replaceModels:
        print("will replace gene models with %s" % extendGenome)
    else:
        print("will extend gene models with %s" % extendGenome)

# "-cache" selects the cached gene database and a cached read dataset.
doCache = '-cache' in sys.argv
if not doCache:
    hg = Genome(genome, inRAM=True)
else:
    cacheGeneDB(genome)
    hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
    print('%s cached' % genome)

if extendGenome:
    hg.extendFeatures(extendGenome, replace=replaceModels)

RDS = readDataset(hitfile, verbose=True, cache=doCache, reportCount=False)
uniqcount = RDS.getUniqsCount()
print('%d unique reads' % uniqcount)

# Accumulators filled in by the code that follows this setup.
farList = []
gidList = []
countDict = {}
splicecount = 0
for line in infile: if line[0] == '#': continue fields = line.split('\t') chrom = fields[2][3:] start = int(fields[3]) pos = (chrom, start) posList.append(pos) posLine[pos] = line geneList = [] geneDict = {} geneSense = {} hg = Genome(genome) #featureTypes = ['CDS'] + hg.getFeatureTypes('UT%') featureTypes = ['CDS', 'UTR'] for ftype in featureTypes: if flankBP > 0: posDict = genesIntersecting(genome, posList, flank=flankBP) else: posDict = genesIntersecting(genome, posList) for pos in posDict: #print pos geneID = posDict[pos][0][0] try: symbol = geneinfoDict[geneID][0][0] except: symbol = 'LOC' + geneID try:
outfilename = sys.argv[3] # maxBorder should be readlen - 4 maxBorder = int(sys.argv[4]) doVerbose = False if '-verbose' in sys.argv: doVerbose = True spacer = 2 if '-spacer' in sys.argv: spacer = int(sys.argv[sys.argv.index('-spacer') + 1]) spacerseq = 'N' * spacer datafile = open(datafilename) #seqfile = open('knownGeneMrna.txt') hg = Genome(genome) spliceCountDict = {} exonStartDict = {} exonStopDict = {} exonLengthDict = {} nameToChromDict = {} nameToComplementDict = {} alreadySeen = {} counter = 0 for line in datafile: fields = line.split() name = fields[0] spliceCount = int(fields[7]) - 1 if spliceCount < 1:
doDataset = False if '-dataset' in sys.argv: if usePeaks: print "ignoring dataset and relying on peak data" else: hitfile = sys.argv[sys.argv.index('-dataset') + 1] doDataset = True hitRDS = readDataset(hitfile, verbose=True, cache=doCache) readlen = hitRDS.getReadSize() doCompact = False if '-compact' in sys.argv: doCompact = True hg = Genome(genome) outfile = open(outfilename, 'w') #readlen = readSize(hitfile) #hitDict = getReadDict(hitfile) if doCompact: regionDict = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True, chromField=0, compact=True, keepPeak=usePeaks, returnTop=topRegions) else: regionDict = getMergedRegions(regionfile,
def _readExonsByTranscript(gtf, classCode=None):
    """Collect the exon records of a GTF file, grouped by transcript.

    Only lines whose third column is ``exon`` are used.  A transcript is keyed
    by its ``transcript_name`` attribute when present, otherwise by
    ``transcript_id``.  When *classCode* is given, exons whose ``class_code``
    attribute is missing or different are dropped.

    Returns a dict mapping transcript key -> list of
    ``(chromosome, left, right, strand)`` tuples in file order.
    """
    exonsByTranscript = {}
    lineCount = 0
    gtfFile = open(gtf)
    try:
        for line in gtfFile:
            lineCount += 1
            if lineCount % 100000 == 0:
                print('%d lines processed' % lineCount)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or truncated lines have no feature/attribute columns.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            attributes = fields[8]
            if classCode is not None:
                if 'class_code "' not in attributes:
                    continue
                if attributes.split('class_code "')[1].split('";')[0] != classCode:
                    continue
            if 'transcript_name "' in attributes:
                marker = 'transcript_name "'
            else:
                marker = 'transcript_id "'
            transcriptID = attributes.split(marker)[1].split('";')[0]
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            exonsByTranscript.setdefault(transcriptID, []).append(exon)
    finally:
        gtfFile.close()
    return exonsByTranscript


def _fetchExonSequence(hg, chrom, start, length, reverse=False):
    """Fetch one exon window, falling back to base-by-base retrieval.

    The whole window is requested first; if that raises, every position of
    the same window is requested individually and unreadable positions become
    ``N``.  With *reverse* set the result is reverse-complemented.

    Returns ``(sequence, missed)`` where *missed* counts the ``N`` fill-ins.
    """
    try:
        exonSequence = hg.sequence(chrom, start, length)
        if reverse:
            exonSequence = getReverseComplement(exonSequence)
        return exonSequence, 0
    except Exception:
        print("can't retrieve sequence %s %d %d" % (chrom, start, length))

    bases = []
    missed = 0
    # Walk exactly the window requested above: start .. start + length - 1.
    for position in range(start, start + length):
        try:
            base = hg.sequence(chrom, position, 1)
            if reverse:
                base = getReverseComplement(base)
        except Exception:
            base = 'N'
            missed += 1
        bases.append(base)
    if reverse:
        # Bases were complemented one at a time; reversing the order
        # completes the reverse complement of the exon.
        bases.reverse()
    return ''.join(bases), missed


def _translateFrame(rna, offset, codonTable):
    """Translate *rna* codon by codon starting at *offset* (0, 1 or 2).

    Every complete codon is translated, including the last one.  Codons that
    contain ``N`` or are absent from *codonTable* are written as ``.``, the
    same symbol the table uses for stop codons.
    """
    residues = []
    # i + 3 <= len(rna)  <=>  i < len(rna) - 2
    for i in range(offset, len(rna) - 2, 3):
        codon = rna[i:i + 3]
        if 'N' in codon:
            residues.append('.')
        else:
            residues.append(codonTable.get(codon, '.'))
    return ''.join(residues)


def main(argv):
    """Translate every transcript of a GTF file in all reading frames.

    Command line: ``genome gtf outfilename [-spliced] [-class_code symbol]``.

    Exons are de-duplicated, sorted and concatenated per transcript
    (``-`` strand transcripts are assembled from the last exon to the first
    and reverse-complemented).  Transcripts on ``+`` or ``-`` get frames 1-3;
    transcripts on ``.`` get frames 1-3 of the sequence and frames 4-6 of its
    reverse complement.  Each frame is written as a FASTA record whose header
    ends in ``::frameN``.  ``-spliced`` drops single-exon transcripts and
    ``-class_code`` keeps only exons carrying that ``class_code`` attribute.
    Transcripts on any other strand symbol are reported and skipped.

    Exits with status 1 after printing usage when arguments are missing.
    """
    # argv[3] is read below, so four entries are required.
    if len(argv) < 4:
        print('usage: python %s genome gtf outfilename [-spliced] [-class_code symbol]' % argv[0])
        print(' this script will output the translation of all three possible reading frames; stop codons will be converted to a .')
        sys.exit(1)

    genome = argv[1]
    gtf = argv[2]
    outputfilename = argv[3]

    doSpliced = '-spliced' in argv
    if doSpliced:
        print('will only look at transciprs with more than one exon')

    class_code = None
    if '-class_code' in argv:
        class_code = argv[argv.index('-class_code') + 1]
        print('will only look at transciprs if class code %s' % class_code)

    CodonDict = {
        'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'UUA': 'L', 'UUG': 'L', 'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
        'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R',
        'AAA': 'K', 'AAG': 'K',
        'AAU': 'N', 'AAC': 'N',
        'AUG': 'M',
        'GAU': 'D', 'GAC': 'D',
        'UUU': 'F', 'UUC': 'F',
        'UGU': 'C', 'UGC': 'C',
        'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAA': 'Q', 'CAG': 'Q',
        'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'AGU': 'S', 'AGC': 'S',
        'GAA': 'E', 'GAG': 'E',
        'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
        'UGG': 'W',
        'CAU': 'H', 'CAC': 'H',
        'UAU': 'Y', 'UAC': 'Y',
        'AUU': 'I', 'AUC': 'I', 'AUA': 'I',
        'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
        'START': 'AUG',
        'UAA': '.', 'UGA': '.', 'UAG': '.',
    }

    outfile = open(outputfilename, 'w')
    try:
        hg = Genome(genome)
        TranscriptDict = _readExonsByTranscript(gtf, class_code)
        print('Found %d transcripts' % len(TranscriptDict))

        missed = 0
        g = 0
        for transcript in TranscriptDict:
            g += 1
            if g % 1000 == 0:
                print('%d transcripts sequences processed' % g)

            # The same exon can be listed more than once; keep one copy.
            exons = sorted(set(TranscriptDict[transcript]))
            if doSpliced and len(exons) == 1:
                continue

            orientation = exons[0][3]
            if orientation == '+':
                sense = 'plus_strand'
                reverse = False
                ordered = exons
            elif orientation == '-':
                sense = 'minus_strand'
                reverse = True
                ordered = exons[::-1]
            elif orientation == '.':
                sense = 'unknown_strand'
                reverse = False
                ordered = exons
            else:
                print('skipping %s: unsupported strand %r' % (transcript, orientation))
                continue

            pieces = []
            for (chrom, left, right, _strand) in ordered:
                if reverse:
                    start, length = left - 1, right - left + 1
                else:
                    # NOTE(review): this forward-strand window is kept from
                    # the original; it is shifted and one base shorter than
                    # the minus-strand window above.  Confirm against the
                    # coordinate convention of Genome.sequence.
                    start, length = left, right - left
                # chrom[3:] drops the leading three characters (e.g. 'chr').
                exonSequence, exonMissed = _fetchExonSequence(hg, chrom[3:], start, length, reverse)
                pieces.append(exonSequence)
                missed += exonMissed
            sequence = ''.join(pieces)

            LeftEnd = exons[0][1]
            RightEnd = exons[-1][2]
            # chrom is the chromosome of the last exon processed above.
            header = '>%s:%s:%d-%d-%s::frame' % (transcript, chrom, LeftEnd, RightEnd, sense)

            strandSequences = [sequence.upper().replace('T', 'U')]
            if orientation == '.':
                # Strand unknown: also translate the opposite strand
                # (written as frames 4-6).
                strandSequences.append(getReverseComplement(sequence).upper().replace('T', 'U'))

            frame = 0
            for rna in strandSequences:
                for offset in range(3):
                    frame += 1
                    outfile.write('%s%d\n' % (header, frame))
                    outfile.write(_translateFrame(rna, offset, CodonDict) + '\n')

        if missed:
            print('%d bases could not be retrieved and were written as N' % missed)
    finally:
        outfile.close()