def main(self):
    """Run every model read from stdin through the validation checks,
    then write the model's features to stdout.

    Checks applied per model: miRNAs, pseudogenes, transcript names.
    """
    for model in gff3.models(self.pre(sys.stdin)):
        self.checkMiRnas(model)
        self.checkPseudogene(model)
        self.checkTranscriptNames(model)
        # Emit each feature; str(feature) carries its own line ending.
        for feat in gff3.flattenModel(model):
            sys.stdout.write(str(feat))
def main():
    """Entry point for trimForAgr: write the AGR file header, then
    process every GFF3 model read from stdin."""
    log("Starting trimForAgr...")
    #
    soterm2id = loadSOTerms()
    #
    # Header metadata pairs; build versions come from the environment.
    header = [
        ('data-source', 'MGI'),
        ('date-produced', time.asctime()),
        ('assembly', os.environ["ENSEMBLbuild"]),
        ('annotationSource RefSeq', os.environ["NCBIver"]),
        ('annotationSource ENSEMBL', os.environ["ENSEMBLver"]),
    ]
    writeHeader(header)
    #
    for model in gff3.models(sys.stdin):
        processModel(model, soterm2id)
def main(featureSources):
    """Tally feature types over all sources, then print the reports.

    Counting state (roots, mids, leaves, exemplars, paths) lives at
    module level and is filled in by count().
    """
    for src in featureSources:
        for model in gff3.models(src):
            soName = model.attributes.get("so_term_name", None)
            # Label is the type, suffixed with the SO term name if present.
            label = model.type + ("[%s]" % soName if soName else "")
            count(model, [label], model)
    #
    pcounts("root", roots)
    pcounts("mid", mids)
    pcounts("leaf", leaves)
    for key in exemplars:
        examples = sorted(exemplars[key])
        if len(examples) > 5:
            # Thin to roughly 5 evenly spaced examples.
            examples = examples[::int(len(examples) / 5)]
        paths[key] = str(paths[key]) + "\t" + ",".join(examples)
    pcounts("path", paths)
def loadPslFile(self):
    # Each PSL line is parsed into a gff3 feature hierarchy
    # (match -> match_part*). We iterate over the model roots (the
    # matches); the match_parts dangle below them (f.children).
    self.counts = {}
    for match in gff3.models(psl.toGff(self.pslFile)):
        #
        seqid = match.qName.split(DOT)[0]   # seqid w/o version number
        mgiid = self.seqid2gene[seqid]      # the corresponding MGI id
        # List holding the gene followed by its match features.
        mfeats = self.mgi2feats[mgiid]
        # Reject matches covering too little of the sequence.
        if match.pctLength < MIN_PCT_LENGTH:
            self.logRejects("REJECTING SEQUENCE (%s) for GENE (%s) - pctLength (%1.2f) less than minimum (%1.2f)" % (seqid, mgiid, match.pctLength, MIN_PCT_LENGTH))
            self.logRejects(str(match))
            continue
        mfeats.append(match)
        # Count accepted matches per seqid.
        self.counts[seqid] = self.counts.get(seqid, 0) + 1
import sys
import gff3

# Read GFF3 models from stdin (with flattening enabled) and echo
# each one to stdout.
for model in gff3.models(sys.stdin, flatten=True):
    print(model)
import sys
import gff3
from OrderedSet import OrderedSet

# Sources whose features are filtered out entirely.
EXCLUDE_SOURCES = OrderedSet(["NCBI"])
# Feature types that are filtered out entirely.
EXCLUDE_TYPES = OrderedSet([
    "chromosome",
    "biological_region",
    "supercontig",
    "three_prime_UTR",
    "five_prime_UTR"
])
# Keep a feature only if neither its type nor its source is excluded.
filtFcn = lambda f: f.type not in EXCLUDE_TYPES and f.source not in EXCLUDE_SOURCES
feats = filter(filtFcn, gff3.iterate(sys.stdin))
for m in gff3.models(feats):
    for f in gff3.flattenModel(m):
        # Transcript features: use the transcript_id as the display Name.
        if f.attributes.get("ID", "").startswith("transcript:"):
            f.Name = f.transcript_id
        # NOTE(review): indentation reconstructed — confirm this applies to
        # all features, not only transcripts.
        f.source = "ENSEMBL"
        # Root features (no parents): attach a curie derived from the ID,
        # e.g. "gene:ENSMUSG..." -> "ENSEMBL:ENSMUSG..."
        if len(f.parents) == 0:
            f.attributes["curie"] = "ENSEMBL:" + f.ID.split(":")[1]
        biotype = f.attributes.get("biotype", None)
        # Root features with a biotype: record it as so_term_name,
        # mapping "protein_coding" to the SO-style "protein_coding_gene".
        if biotype and len(f.parents) == 0:
            if biotype == "protein_coding":
                biotype = "protein_coding_gene"
            f.attributes["so_term_name"] = biotype
        # Strip attributes not wanted downstream (no-op if absent).
        f.attributes.pop("biotype", None)
        f.attributes.pop("version", None)
        f.attributes.pop("description", None)
        f.attributes.pop("logic_name", None)
        f.attributes.pop("gene_id", None)
        f.attributes.pop("transcript_support_level", None)
# canonicalize.py # # Turns tree models in canonical form DAG-shaped) models by merging of identical subfeatures. # import gff3 import sys def mergeExon(ex, f): ex.Parent.extend(f.Parent) if f.source not in ex.source: ex.source += ("," + f.source) for feats in gff3.models(sys.stdin, flatten=True): exons = {} # (start,end) -> index into ofeats ofeats = [] # list of feats in current model. Root is 0th item. # merge exons. exons merge if they have the same coordinates. for f in feats: if f.type == "exon": k = (f.start, f.end) if k in exons: # merge f with a previously seen exon, ex # i = exons[k] ex = ofeats[i] mergeExon(ex, f) # # ...and move ex to end of the list ofeats[i] = None