# Copyright (C)2016 William H. Majoros ([email protected]).
#=========================================================================
from __future__ import (absolute_import, division, print_function,
   unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3. You might need to update your version of module "future".
from GffTranscriptReader import GffTranscriptReader

# Scratch script: selects a GFF path and constructs a GffTranscriptReader.
# All code that actually loads or prints anything is disabled below.

# Alternative input paths (disabled):
#filename="/home/bmajoros/1000G/assembly/local-genes.gff"
#filename="/home/bmajoros/1000G/assembly/tmp.gff"
#filename="test/data/tmp.gff"
filename = "test/data/local-genes.gff"

reader = GffTranscriptReader()

# Disabled experiment: load transcripts and print each ID and its GFF text.
#transcripts=reader.loadGFF(filename)
#for transcript in transcripts:
    #print(transcript.getID())
    #gff=transcript.toGff()
    #print(gff)

# Disabled experiment: load genes and print each transcript's ID, begin
# and end, tab-separated.
#genes=reader.loadGenes(filename)
#for gene in genes:
#    print("gene",gene.getID())
#    n=gene.getNumTranscripts()
#    for i in range(n):
#        transcript=gene.getIthTranscript(i)
#        transID=transcript.getID()
#        print("\t"+transID+"\t"+str(transcript.getBegin())+"\t"
#              +str(transcript.getEnd()))
# Read the read-counts file.  Each non-blank line is either the
# "TOTAL MAPPED READS: <n>" summary line or a whitespace-separated
# "<gene> <count>" pair.
readCounts={}
with open(readCountsFile,"rt") as IN:
    for line in IN:
        # Raw string: \s and \d are regex escapes, not Python string escapes
        if(rex.find(r"TOTAL MAPPED READS:\s*(\d+)",line)):
            # Stored exactly as captured by the regex (no numeric conversion)
            totalMappedReads=rex[1]
            continue
        fields=line.split()
        if(not fields): continue # tolerate blank lines (e.g., at end of file)
        # Any other line must have exactly two fields; otherwise the
        # unpacking below raises ValueError, exposing malformed input
        (gene,count)=fields
        readCounts[gene]=count # count is kept as text, as read from the file

# Read GFF file to find annotated sites to exclude
#   gff:     captured ID prefix (text before "_<digit>") -> transcript
#   exclude: substrate -> { "<exon end>-<next exon begin>" : True }
gff={}
exclude={}
reader=GffTranscriptReader()
transcripts=reader.loadGFF(gffFile)
for transcript in transcripts:
    transcriptID=transcript.getID()
    if(transcriptID[0:3]=="ALT"): continue # ALT transcripts are not used here
    if(rex.find(r"(\S+)_\d",transcriptID)):
        gff[rex[1]]=transcript
    # NOTE(review): block structure was reconstructed from collapsed text;
    # confirm that exclusions are meant to be recorded for every non-ALT
    # transcript, not only those whose ID matched the pattern above.
    substrate=transcript.getSubstrate()
    exclusions=exclude.setdefault(substrate,{})
    exons=transcript.getRawExons()
    exons.sort(key=lambda exon:exon.begin) # in-place sort, as before
    # Each pair of exons adjacent in begin-order defines one excluded site,
    # keyed by the first exon's end and the second exon's begin
    for (thisExon,nextExon) in zip(exons,exons[1:]):
        key=str(thisExon.getEnd())+"-"+str(nextExon.getBegin())
        exclusions[key]=True

# Read broken-sites file
from __future__ import (absolute_import, division, print_function,
   unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3. You might need to update your version of module "future".
import sys
from FastaReader import FastaReader
from FastaWriter import FastaWriter
from GffTranscriptReader import GffTranscriptReader

# Copies to <out.fasta> only those sequences of <in.fasta> whose ID is the
# substrate of at least one transcript in <in.gff> with an ID beginning
# with "ALT".

# Process command line
if(len(sys.argv)!=4):
    # sys.exit() rather than exit(): the latter is a helper installed by
    # the site module and is not guaranteed to exist
    sys.exit(sys.argv[0]+" <in.fasta> <in.gff> <out.fasta>")
(fastaFile,gffFile,outFile)=sys.argv[1:]

# Collect the substrates that carry at least one ALT transcript
reader=GffTranscriptReader()
transcripts=reader.loadGFF(gffFile)
keep={transcript.getSubstrate() for transcript in transcripts
      if transcript.getID()[:3]=="ALT"}

# Stream the FASTA file, writing only the sequences to be kept
reader=FastaReader(fastaFile)
writer=FastaWriter()
with open(outFile,"wt") as fh: # ensures the output is flushed and closed
    while(True):
        (defline,seq)=reader.nextSequence()
        if(not defline): break # no more sequences
        (seqID,_)=FastaReader.parseDefline(defline) # attributes not needed
        if(seqID not in keep): continue
        writer.addToFasta(defline,seq,fh)
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys from FastaReader import FastaReader from FastaWriter import FastaWriter from GffTranscriptReader import GffTranscriptReader from Rex import Rex rex=Rex() # Process command line if(len(sys.argv)!=4): exit(sys.argv[0]+" <in.fasta> <in.gff> <out.fasta>") (fastaFile,gffFile,outFile)=sys.argv[1:] # Read GFF reader=GffTranscriptReader() hash=reader.hashBySubstrate(gffFile) # Open output file OUT=open(outFile,"wt") writer=FastaWriter() # Process each substrate in the FASTA file reader=FastaReader(fastaFile) while(True): [defline,seq]=reader.nextSequence() if(not defline): break if(not rex.find("^\s*>\s*(\S+)",defline)): exit("Can't parse defline: "+defline) id=rex[1] transcripts=hash.get(id,None)
unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". from GffTranscriptReader import GffTranscriptReader #filename="/home/bmajoros/1000G/assembly/local-genes.gff" #filename="/home/bmajoros/1000G/assembly/tmp.gff" #filename="test/data/tmp.gff" #filename="test/data/local-genes.gff" filename = "/home/bmajoros/ensembl/protein-coding.gff" reader = GffTranscriptReader() genes = reader.loadGenes(filename) for gene in genes: exons = gene.getMergedExons() unmerged = 0 for transcript in gene.transcripts: unmerged += len(transcript.getRawExons()) print(unmerged, "exons merged to", len(exons)) #for i in range(len(exons)): # print("MERGED TO:",exons[i].begin,exons[i].end) # print() #transcripts=reader.loadGFF(filename) #for transcript in transcripts: #print(transcript.getID()) #gff=transcript.toGff()