def slop_list2gff():
    """
    Yield one interval per entry of the module-level `slop_list`.

    Entries whose first three characters are "MGG" are looked up in
    `MGG_db` using the first nine characters of the entry. Every other
    entry is parsed with `strain_protein_id_pattern`: capture group 1
    selects the module-level object named `<group 1>_db` and capture
    group 2 builds the key `gene_<group 2>` used to index it.
    """
    module_namespace = globals()
    for entry in slop_list:
        if entry[0:3] != "MGG":
            match = strain_protein_id_pattern.search(entry)
            strain, ordinal = match.group(1, 2)
            # The per-strain database is resolved by name from module globals.
            strain_db = module_namespace.get(strain + "_db")
            yield asinterval(strain_db["gene_" + ordinal])
            continue
        # Retained from the original; not read again inside this function.
        head_id = entry
        yield asinterval(MGG_db[entry[0:9]])
def intron_generator():
    """
    Yield intron features: gene intervals with all exon intervals removed.

    Only 'gene' and 'exon' features from `settings.G` whose chromosome is
    listed in `settings.CHROMS` take part in the subtraction.
    """
    gene_intervals = (
        asinterval(gene)
        for gene in settings.G.features_of_type('gene')
        if gene.chrom in settings.CHROMS
    )
    gene_bt = pybedtools.BedTool(gene_intervals)

    exon_intervals = (
        asinterval(exon)
        for exon in settings.G.features_of_type('exon')
        if exon.chrom in settings.CHROMS
    )
    exon_bt = pybedtools.BedTool(exon_intervals)

    # saveas() is called on the subtraction result before iterating over it.
    introns = gene_bt.subtract(exon_bt).saveas()
    for intron in introns:
        yield intron
def features(self, ignore_unknown=False):
    """
    Generator of currently-selected features.

    Looks up each entry of `self.data[self.id_column]` in the attached
    `gffutils.FeatureDB` (`self.gffdb`) and converts it into a
    `pybedtools.Interval` object for use with `pybedtools`.

    :param ignore_unknown:
        If `ignore_unknown=False` then an exception will be raised if a
        feature cannot be found; if `ignore_unknown=True` then silently
        ignore these cases. Consider using the `strip_unknown_features()`
        method to handle these cases up front.

    :raises ValueError:
        If no database is attached (`self.gffdb` is falsy). Because this
        is a generator, the error surfaces on first iteration, not at
        call time.
    :raises gffutils.FeatureNotFoundError:
        If a feature cannot be found and `ignore_unknown` is false.
    """
    if not self.gffdb:
        raise ValueError('Please attach a GFF database created by '
                         'gffutils by setting the .gffdb attribute to the '
                         'database\'s path.')
    for feature_id in self.data[self.id_column]:
        try:
            yield asinterval(self.gffdb[feature_id])
        except gffutils.FeatureNotFoundError:
            if ignore_unknown:
                continue
            # The looked-up entry is normally a plain ID, which has no
            # `.id` attribute; fall back to the entry itself so the
            # intended error is raised instead of an AttributeError.
            missing = getattr(feature_id, 'id', feature_id)
            raise gffutils.FeatureNotFoundError('%s not found' % missing)
def _make_track(self, d, cls):
    """
    Build a `Track` from the items stored under `d[cls]`.

    The track uses the height configured in `self.heights[cls]` and is
    shifted up from `self.ybase` by half the difference between the
    'full' height and that height. Extra keyword arguments come from
    `self.kwargs`.
    """
    track_height = self.heights[cls]
    track_base = self.ybase + (self.heights['full'] - track_height) * 0.5
    # Items are converted lazily as the Track consumes them.
    intervals = (asinterval(item) for item in d[cls])
    return Track(
        intervals,
        ybase=track_base,
        yheight=track_height,
        **self.kwargs
    )
def gen():
    """
    Generator of pybedtools.Intervals representing TSSes.

    For every level-1 child of every 'gene' feature in `db`, the child is
    collapsed in place to a single coordinate (its stop on the '-'
    strand, its start otherwise), its featuretype gets a '_TSS' suffix,
    and the result is converted with `helpers.asinterval`.
    """
    for gene in db.features_of_type('gene'):
        for tx in db.children(gene, level=1):
            if tx.strand != '-':
                tx.stop = tx.start
            else:
                tx.start = tx.stop
            tx.featuretype += '_TSS'
            yield helpers.asinterval(tx)
def gene_generator():
    """
    Yield an interval for each 'gene' feature in `settings.G` whose
    chromosome is listed in `settings.CHROMS`.

    Gene features are taken straight from the database. More elaborate
    generators could be written in the same way, e.g. one yielding only
    unique TSS sites across all isoforms of all genes.
    """
    for gene in settings.G.features_of_type('gene'):
        if gene.chrom in settings.CHROMS:
            yield asinterval(gene)
def test_pbt_interval_conversion():
    """
    Check that `helpers.asinterval` keeps chromosome, coordinates and name.

    Returns without checking anything when `pybedtools` cannot be imported.
    """
    try:
        import pybedtools
    except ImportError:
        return

    gff_line = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690"
    gff_feature = feature.feature_from_line(
        gff_line, strict=False, keep_order=True)
    interval = helpers.asinterval(gff_feature)

    assert interval.chrom == gff_feature.chrom == gff_feature.seqid
    # The converted start is expected to be one less than the feature's
    # start, while the stop is unchanged.
    assert interval.start == gff_feature.start - 1
    assert interval.stop == gff_feature.stop == gff_feature.end

    interval_name = interval.name
    feature_name = gff_feature.attributes['Name'][0]
    assert interval_name == feature_name, '%s, %s' % (interval_name, feature_name)
def test_pbt_interval_conversion():
    """
    Verify the feature-to-interval conversion done by `helpers.asinterval`.

    The check is skipped (the function simply returns) if `pybedtools`
    is not importable.
    """
    try:
        import pybedtools
    except ImportError:
        return

    source_line = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690"
    parsed = feature.feature_from_line(
        source_line, strict=False, keep_order=True)
    converted = helpers.asinterval(parsed)

    # Chromosome is preserved under both of the feature's names for it.
    assert converted.chrom == parsed.chrom == parsed.seqid
    # Start is expected to shift down by one; stop stays the same.
    assert converted.start == parsed.start - 1
    assert converted.stop == parsed.stop == parsed.end

    converted_name = converted.name
    parsed_name = parsed.attributes['Name'][0]
    assert converted_name == parsed_name, '%s, %s' % (converted_name, parsed_name)
def plot(self, feature):
    """
    Spawns a new figure showing data for `feature`.

    :param feature:
        A `pybedtools.Interval` object. A `gffutils.Feature` is also
        accepted and is converted with `asinterval` first.

    Calls `self.make_fig()` and then runs every `(ax, method)` pair from
    `self.panels()` as `method(ax, feature)`. The value returned by each
    method becomes the feature handed to the next panel. Returns the
    list of axes in panel order.
    """
    is_gff_feature = isinstance(feature, gffutils.Feature)
    if is_gff_feature:
        feature = asinterval(feature)

    self.make_fig()

    plotted_axes = []
    for axis, draw_panel in self.panels():
        # Each panel may hand back a modified feature for later panels.
        feature = draw_panel(axis, feature)
        plotted_axes.append(axis)
    return plotted_axes
def TSS(feature, upstream=1000, downstream=1000):
    """
    Turn `feature` into a TSS window extended by `upstream`/`downstream`.

    The anchor is `feature.stop` on the '-' strand and `feature.start`
    otherwise. The new start is clipped at 0 and the new stop at
    `chromsizes[feature.chrom][1]`. The feature is modified in place
    (its third field is set to "TSS") and the result of
    `asinterval(feature)` is returned.
    """
    chrom_limit = chromsizes[feature.chrom][1]

    if feature.strand == '-':
        # On the minus strand the downstream side lies at lower coordinates.
        anchor, pad_low, pad_high = feature.stop, downstream, upstream
    else:
        anchor, pad_low, pad_high = feature.start, upstream, downstream

    window_start = max(0, anchor - pad_low)
    window_stop = min(anchor + pad_high, chrom_limit)

    # Modify featuretype
    feature[2] = 'TSS'
    feature.start = window_start
    feature.stop = window_stop
    return asinterval(feature)
def scored_feature_generator(d):
    """
    Yield one scored, extended interval per position `0..len(d)-1` of `d`.

    For each position the feature `db[d.id[i]]` is fetched and its score
    is set to `-10 * log10(d.padj[i])`, replaced by 0 when
    `d.log2foldchange[i]` is NaN and negated when it is negative. The
    feature is converted with `asinterval`, `gff2bed` and
    `extend_fields(..., 9)`; fields 1 and 2 are copied into fields 6 and
    7, and padj, pval, log2foldchange, basemeana and basemeanb are
    appended as strings before the interval is rebuilt from the list.

    Raises `gffutils.FeatureNotFoundError` carrying the ID when a
    feature is missing from `db`.
    """
    for idx in range(len(d)):
        try:
            gff_feature = db[d.id[idx]]
        except gffutils.FeatureNotFoundError:
            raise gffutils.FeatureNotFoundError(d.id[idx])

        signed_score = -10 * np.log10(d.padj[idx])
        log2fc = d.log2foldchange[idx]
        if np.isnan(log2fc):
            signed_score = 0
        if log2fc < 0:
            signed_score *= -1
        gff_feature.score = str(signed_score)

        extended = extend_fields(gff2bed(asinterval(gff_feature)), 9)
        columns = extended.fields[:]
        # Duplicate the 2nd/3rd columns into the 7th/8th
        # (presumably BED thickStart/thickEnd; confirm against consumers).
        columns[6], columns[7] = columns[1], columns[2]
        columns.extend([
            str(d.padj[idx]),
            str(d.pval[idx]),
            '%.3f' % d.log2foldchange[idx],
            '%.3f' % d.basemeana[idx],
            '%.3f' % d.basemeanb[idx],
        ])
        yield pybedtools.create_interval_from_list(columns)
def tss_generator():
    """
    Yield `TSS(..., upstream=1000, downstream=1000)` for every
    "transcript" feature in `db`, after converting it with `asinterval`.
    """
    transcripts = db.features_of_type("transcript")
    for tx in transcripts:
        tx_interval = asinterval(tx)
        yield TSS(tx_interval, upstream=1000, downstream=1000)
def generator():
    """
    Yield an interval for each entry of `df.index`, using the entry as
    the key into `db` and converting the result with `asinterval`.
    """
    for feature_id in df.index:
        record = db[feature_id]
        yield asinterval(record)
try: # if this transcript has an entry for 'transcript_support_level' # and if the level is below the acceptable threshold: # keep that transcript if int([ i[1] for i in t.attributes.items() if i[0] == 'transcript_support_level' ][0][0]) <= max_TSL: temp_txpts.append(t) except: pass txpts = temp_txpts if (len(txpts) > 0 ): # if there are any transcripts with a sufficiently low TSL: all_exons = (pybedtools.BedTool([ helpers.asinterval(i) for i in db.children(gene, featuretype='exon') ])) all_exons = all_exons.sort().merge( ) # define the ends of the genic region by using the first # and last exon in the annotation as the limits gene_extent = pybedtools.BedTool([ pybedtools.cbedtools.Interval(chrom=chrom, start=min(i.start for i in all_exons), end=max(i.end for i in all_exons)) ]) t_introns = [] for t in txpts: # for each transcript, get all of the exons t_exons = (pybedtools.BedTool([
def generate_interval(category_fl):
    """
    Yield a BED-style interval for each "MGG" ID read from `category_fl`.

    Each item has newline characters stripped from both ends. Items
    whose first three characters are not "MGG" are skipped; the rest are
    looked up in `MGG_db`, converted with `asinterval` and passed to
    `gff2bed` with `name_field=2`.
    """
    for raw_line in category_fl:
        identifier = raw_line.strip('\n')
        if identifier[0:3] == "MGG":
            record = MGG_db[identifier]
            yield gff2bed(asinterval(record), name_field=2)
def generator(limit=5000):
    """
    Yield intervals for at most `limit` 'gene' features.

    Opens the `gffutils.FeatureDB` at `dbfn` and converts each gene with
    `asinterval`. If the database holds fewer than `limit` genes, the
    generator simply ends after the last one.

    :param limit: Maximum number of genes to yield (default 5000).
    """
    # Local import so this block does not rely on a module-level import.
    from itertools import islice

    G = gffutils.FeatureDB(dbfn)
    genes = G.features_of_type('gene')
    # islice replaces the Python-2-only `genes.next()` call and also stops
    # cleanly when the genes run out; a StopIteration escaping a generator
    # body becomes a RuntimeError under PEP 479.
    for gene in islice(genes, limit):
        yield asinterval(gene)
def _make_track(self, d, cls):
    """
    Create the `Track` for class `cls` from the items in `d[cls]`.

    The height comes from `self.heights[cls]`. The base is `self.ybase`
    plus half of the gap between the 'full' height and this track's
    height. `self.kwargs` is forwarded to `Track` unchanged.
    """
    height = self.heights[cls]
    base = self.ybase + (self.heights['full'] - height) * 0.5
    # Lazy conversion: nothing is converted until Track iterates.
    converted_items = (asinterval(entry) for entry in d[cls])
    return Track(converted_items, ybase=base, yheight=height, **self.kwargs)
def tss_generator():
    """
    Yield `TSS(..., upstream=1, downstream=0)` for every 'mRNA' feature
    in `db`, after converting it with `asinterval`.
    """
    # Original note listed CDS/gene/mRNA here, presumably as the feature
    # types that could be selected instead.
    messenger_rnas = db.features_of_type('mRNA')
    for mrna in messenger_rnas:
        mrna_interval = asinterval(mrna)
        yield TSS(mrna_interval, upstream=1, downstream=0)
def tss_generator():
    """
    Generator yielding one TSS per 'transcript' feature in `db`.

    Each transcript is converted with `asinterval` and passed to `TSS`
    with `upstream=1` and `downstream=0`.
    """
    annotated = db.features_of_type('transcript')
    for tx in annotated:
        tx_interval = asinterval(tx)
        yield TSS(tx_interval, upstream=1, downstream=0)
def generate_bed(gff_feature_item):
    """
    Single-item generator: yield `gff_feature_item` converted with
    `asinterval`.
    """
    converted = asinterval(gff_feature_item)
    yield converted
def gen():
    """
    Yield `helpers.asinterval(item)` for each item of `iterator`.

    `iterator` is not defined in this function and must be available
    from an outer scope.
    """
    for item in iterator:
        converted = helpers.asinterval(item)
        yield converted
def tss_generator():
    """
    Yield one TSS interval per 'transcript' feature in `db`.

    Every transcript is converted with `asinterval` and handed to `TSS`
    with `upstream=1000` and `downstream=1000`.
    """
    for tx in db.features_of_type('transcript'):
        tx_interval = asinterval(tx)
        tss = TSS(tx_interval, upstream=1000, downstream=1000)
        yield tss
def fsdu(which_id):
    """
    Single-item generator: look up `which_id` in `db` and yield the
    result converted with `asinterval`.
    """
    record = db[which_id]
    yield asinterval(record)
def tss_generator(gtf):
    """
    Generator yielding a TSS window for each 'transcript' feature in `db`.

    Each transcript is converted with `asinterval` and passed to `TSS`
    with `upstream=1000` and `downstream=1000`.

    NOTE(review): the `gtf` argument is not used in the body; features
    are read from `db`, which is not defined in this function.
    """
    transcripts = db.features_of_type('transcript')
    for tx in transcripts:
        tx_interval = asinterval(tx)
        yield TSS(tx_interval, upstream=1000, downstream=1000)