def setup(self): def gff_id_func(f): if 'ID' in f.attributes: return f.attributes['ID'][0] elif 'Name' in f.attributes: return f.attributes['Name'][0] else: return '{0.featuretype}:{0.seqid}:{0.start}-{0.end}:{0.strand}'.format(f) def gtf_id_func(f): if f.featuretype == 'gene': if 'gene_id' in f.attributes: return f.attributes['gene_id'][0] elif f.featuretype == 'transcript': if 'transcript_id' in f.attributes: return f.attributes['transcript_id'][0] else: return '{0.featuretype}:{0.seqid}:{0.start}-{0.end}:{0.strand}'.format(f) if self.orig_fn.endswith('.gtf'): id_func = gtf_id_func if self.orig_fn.endswith('.gff'): id_func = gff_id_func self.db = create.create_db( self.orig_fn, ':memory:', id_spec=id_func, merge_strategy='create_unique', verbose=False, keep_order=True ) self.c = self.db.conn.cursor() self.dialect = self.db.dialect
def setup(self): def gff_id_func(f): if 'ID' in f.attributes: return f.attributes['ID'][0] elif 'Name' in f.attributes: return f.attributes['Name'][0] else: return '{0.featuretype}:{0.seqid}:{0.start}-{0.end}:{0.strand}'.format( f) def gtf_id_func(f): if f.featuretype == 'gene': if 'gene_id' in f.attributes: return f.attributes['gene_id'][0] elif f.featuretype == 'transcript': if 'transcript_id' in f.attributes: return f.attributes['transcript_id'][0] else: return '{0.featuretype}:{0.seqid}:{0.start}-{0.end}:{0.strand}'.format( f) if self.orig_fn.endswith('.gtf'): id_func = gtf_id_func if self.orig_fn.endswith('.gff'): id_func = gff_id_func self.db = create.create_db(self.orig_fn, ':memory:', id_spec=id_func, merge_strategy='create_unique', verbose=False, keep_order=True) self.c = self.db.conn.cursor() self.dialect = self.db.dialect
def update(self, features, make_backup=True, **kwargs):
    """
    Update database with features.

    Parameters
    ----------
    features : str, iterable, FeatureDB instance
        If FeatureDB, all features will be used. If string, assume it's
        a filename of a GFF or GTF file.  Otherwise, assume it's an
        iterable of Feature objects.  The classes in gffutils.iterators may
        be helpful in this case.

    make_backup : bool
        If True, and the database you're about to update is a file on disk,
        makes a copy of the existing database and saves it with a .bak
        extension.

    Remaining kwargs are passed to create_db (only when `features` is a
    filename) and to the `_GTFDBCreator` / `_GFFDBCreator` constructor.  If
    `id_spec` is not supplied, a default matching the database's dialect is
    used.

    Raises
    ------
    ValueError
        If ``self.dialect['fmt']`` is neither ``'gtf'`` nor ``'gff3'``.
    """
    from gffutils import create

    if make_backup and isinstance(self.dbfn, six.string_types):
        shutil.copy2(self.dbfn, self.dbfn + '.bak')

    # No matter what `features` came in as, convert to gffutils.Feature
    # instances.  Since the tricky part -- attribute strings -- have been
    # parsed into dicts in a Feature, we no longer have to worry about
    # that.  This also allows GTF features to be used to update a GFF
    # database, or vice versa.
    if isinstance(features, six.string_types):
        # Parse the file into a throwaway in-memory database.  (The
        # original referenced an undefined name here, so passing a
        # filename always raised NameError.)
        indb = create.create_db(features, ':memory:', **kwargs)
        features = indb.all_features()
    elif isinstance(features, FeatureDB):
        features = features.all_features()

    fmt = self.dialect['fmt']
    if fmt == 'gtf':
        kwargs.setdefault(
            'id_spec', {'gene': 'gene_id', 'transcript': 'transcript_id'})
        creator_class = create._GTFDBCreator
    elif fmt == 'gff3':
        kwargs.setdefault('id_spec', 'ID')
        creator_class = create._GFFDBCreator
    else:
        raise ValueError(
            "Unsupported dialect format %r; expected 'gtf' or 'gff3'"
            % (fmt,))

    db = creator_class(
        data=features, dbfn=self.dbfn, dialect=self.dialect, **kwargs)

    db._populate_from_lines(features)
    db._update_relations()
    db._finalize()
def test_inconsistent_dialect():
    """
    Attribute parsing must cope with features that disagree on the
    trailing semicolon.

    Only the first feature ends its attributes with a semicolon
    (wormbase_gff2_alt is like this).  The db's dialect is set from that
    first feature, which can cause errors when parsing the attributes of
    the second one.
    """
    # NOTE(review): the literal below is reproduced exactly as it appears in
    # the copy under review (single spaces, one line).  Confirm its
    # whitespace against the repository -- the docstring implies two
    # separate feature records.
    gtf_text = """ chr1 . gene 1 100 . + . gene_id "gene1"; chr1 . mRNA 1 100 . + . transcript_id "mRNA1" """
    parsed_db = create.create_db(gtf_text, ':memory:', from_string=True)

    all_items = list(parsed_db.all_features())
    print(all_items[0])

    # before, was ['"mRNA1'] -- note extra "
    transcript_ids = all_items[1].attributes['transcript_id']
    assert transcript_ids == ['mRNA1'], transcript_ids
def test_inconsistent_dialect():
    """
    Check that a missing trailing semicolon on a later feature does not
    corrupt its parsed attributes.

    The first feature has a trailing semicolon and the second does not
    (wormbase_gff2_alt is like this).  Because the db's dialect comes from
    the first feature, parsing the second feature's attributes can go
    wrong.
    """
    expected_ids = ['mRNA1']

    # NOTE(review): literal kept exactly as shown in the copy under review;
    # verify its internal whitespace (record and column separators) against
    # the repository.
    feature_db = create.create_db(
        """ chr1 . gene 1 100 . + . gene_id "gene1"; chr1 . mRNA 1 100 . + . transcript_id "mRNA1" """,
        ':memory:',
        from_string=True,
    )

    features_found = list(feature_db.all_features())
    print(features_found[0])

    second_feature = features_found[1]
    # before, was ['"mRNA1'] -- note extra "
    assert second_feature.attributes['transcript_id'] == expected_ids, (
        second_feature.attributes['transcript_id'])
def test_update():
    """
    Exercise ``db.update`` with no merge strategy, with "merge" and with
    "replace", on a GFF-derived database and then on a GTF-derived one.
    """
    # NOTE(review): only ':memory:' databases are created below, although
    # the original comment here said "check both in-memory and file-based
    # dbs".
    # NOTE(review): string literals are reproduced exactly as they appear in
    # the copy under review; confirm the whitespace inside the expected
    # str(x) values against the repository.
    db = create.create_db(
        example_filename('FBgn0031208.gff'), ':memory:', verbose=False,
        keep_order=True, force=True)

    # Baseline count that every later count assertion is compared against.
    orig_num_features = len(list(db.all_features()))

    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)

    # no merge strategy required because we're adding a new feature
    db.update([f])
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True
    assert str(x) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1", str(x)

    # ought to be one more now . . .
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Now try updating with the same feature, but using merge_strategy="merge",
    # which appends items to attributes ( n=1 --> n=1,2 )
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect,
        strict=False)
    f.keep_order = True
    f.attributes['n'] = ['2']
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1

    # Merging does a list(set()) operation, so the order is not guaranteed.
    # Fix it here for testing...
    x = x[0]
    x.attributes['n'].sort()

    assert str(x) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1,2", str(x)

    # The merge must not add a second 'testing' feature: the total is still
    # exactly one more than the baseline.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Merging while iterating.  e.g., if you're updating children with gene
    # IDs.  A fresh database is created first, so the 'testing' feature
    # added above is no longer present.
    db = create.create_db(example_filename('FBgn0031208.gff'), ':memory:',
                          verbose=False, force=True, keep_order=True)
    for gene in db.features_of_type('gene'):
        # list() materializes the children before the db is updated.
        for child in list(db.children(gene)):
            # important: the FBgn0031208.gff file was designed to have some
            # funky features: there are two exons without ID attributes.  These
            # are assigned to ids "exon_1" and "exon_2".  Upon update, with
            # still no ID, we then have two new features "exon_3" and "exon_4".
            # To prevent this issue, we ensure that the ID attribute exists...
            child.attributes['gene_id'] = [gene.id]
            if 'ID' not in child.attributes:
                child.attributes['ID'] = [child.id]
            db.update([child], merge_strategy='replace')

    print("\n\nafter\n\n")
    # Relies on `gene` still being bound to the last gene from the loop
    # above.
    for child in db.children(gene):
        print(child.id)
        assert child.attributes['gene_id'] == ['FBgn0031208'], (child, child.attributes)

    num_entries = 0
    for gene_recs in list(db.iter_by_parent_childs()):
        # Add attribute to each gene record
        rec = gene_recs[0]
        rec.attributes["new"] = ["new_value"]
        db.update([rec])
        num_entries += 1
    print(list(db.all_features()))

    assert (num_entries > 1), "Only %d left after update" % (num_entries)

    # Replace
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect,
        strict=False)
    f.keep_order = True
    f.attributes['n'] = ['3']
    db.update([f], merge_strategy='replace')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    assert str(x[0]) == "chr2L . testing 1 10 . + . ID=testing_feature;n=3", str(x[0])

    # Exactly one 'testing' feature on top of the baseline count.
    # NOTE(review): this compares against the baseline taken from the first
    # database; presumably the re-created database starts from the same
    # count because it is built from the same file -- confirm.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Same thing, but GTF instead of GFF.
    db = create.create_db(
        example_filename('FBgn0031208.gtf'), ':memory:', verbose=False,
        force=True, keep_order=True)
    f = feature.feature_from_line('chr2L . testing 1 10 . + . gene_id "fake"; n "1"', strict=False)
    f.keep_order = True
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True

    # note the trailing semicolon.  That's because the db's dialect has
    # ['trailing semicolon'] = True.
    assert str(x) == 'chr2L . testing 1 10 . + . gene_id "fake"; n "1";', str(x)
def test_update():
    """
    Exercise ``db.update`` with no merge strategy, with "merge" and with
    "replace" (including a generator of modified children), on a
    GFF-derived database and then on a GTF-derived one.
    """
    # NOTE(review): only ':memory:' databases are created below, although
    # the original comment here said "check both in-memory and file-based
    # dbs".
    # NOTE(review): string literals are reproduced exactly as they appear in
    # the copy under review; confirm the whitespace inside the expected
    # str(x) values against the repository.
    db = create.create_db(example_filename('FBgn0031208.gff'),
                          ':memory:',
                          verbose=False,
                          keep_order=True,
                          force=True)

    # Baseline count that every later count assertion is compared against.
    orig_num_features = len(list(db.all_features()))

    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect,
        strict=False)

    # no merge strategy required because we're adding a new feature
    db.update([f])
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True
    assert str(x) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1", str(
        x)

    # ought to be one more now . . .
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Now try updating with the same feature, but using merge_strategy="merge",
    # which appends items to attributes ( n=1 --> n=1,2 )
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect,
        strict=False)
    f.keep_order = True
    f.attributes['n'] = ['2']
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1

    # Merging does a list(set()) operation, so the order is not guaranteed.
    # Fix it here for testing...
    x = x[0]
    x.attributes['n'].sort()

    assert str(
        x) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1,2", str(x)

    # The merge must not add a second 'testing' feature: the total is still
    # exactly one more than the baseline.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Merging while iterating.  e.g., if you're updating children with gene
    # IDs.  A fresh database is created first, so the 'testing' feature
    # added above is no longer present.
    db = create.create_db(example_filename('FBgn0031208.gff'),
                          ':memory:',
                          verbose=False,
                          force=True,
                          keep_order=True)

    def gen():
        # Yields every child of every gene, tagged with its gene's id, so
        # that a single db.update() call can consume them all.
        for gene in db.features_of_type('gene'):
            for child in list(db.children(gene)):
                # important: the FBgn0031208.gff file was designed to have some
                # funky features: there are two exons without ID attributes.  These
                # are assigned to ids "exon_1" and "exon_2".  Upon update, with
                # still no ID, we then have two new features "exon_3" and "exon_4".
                # To prevent this issue, we ensure that the ID attribute exists...
                child.attributes['gene_id'] = [gene.id]
                if 'ID' not in child.attributes:
                    child.attributes['ID'] = [child.id]
                yield child

    db.update(gen(), merge_strategy='replace')

    print("\n\nafter\n\n")
    # The gene id is spelled out here because `gene` is local to gen().
    for child in db.children('FBgn0031208'):
        print(child.id)
        assert child.attributes['gene_id'] == ['FBgn0031208'
                                               ], (child, child.attributes)

    num_entries = 0
    for gene_recs in list(db.iter_by_parent_childs()):
        # Add attribute to each gene record
        rec = gene_recs[0]
        rec.attributes["new"] = ["new_value"]
        db.update([rec])
        num_entries += 1
    print(list(db.all_features()))

    assert (num_entries > 1), "Only %d left after update" % (num_entries)

    # Replace
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect,
        strict=False)
    f.keep_order = True
    f.attributes['n'] = ['3']
    db.update([f], merge_strategy='replace')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    assert str(
        x[0]) == "chr2L . testing 1 10 . + . ID=testing_feature;n=3", str(x[0])

    # Exactly one 'testing' feature on top of the baseline count.
    # NOTE(review): this compares against the baseline taken from the first
    # database; presumably the re-created database starts from the same
    # count because it is built from the same file -- confirm.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Same thing, but GTF instead of GFF.
    db = create.create_db(example_filename('FBgn0031208.gtf'),
                          ':memory:',
                          verbose=False,
                          force=True,
                          keep_order=True)
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . gene_id "fake"; n "1"', strict=False)
    f.keep_order = True
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True

    # note the trailing semicolon.  That's because the db's dialect has
    # ['trailing semicolon'] = True.
    assert str(x) == 'chr2L . testing 1 10 . + . gene_id "fake"; n "1";', str(
        x)