def test_issue_85():
    """Regression check for issue #85: empty start/stop fields must parse.

    Issue #85 made parsing fail when start or stop was empty.  A line made
    of nine blank fields and a line made of nine "." placeholders are both
    expected to parse without raising.
    """
    for placeholder in ('', '.'):
        feature.feature_from_line('\t'.join([placeholder] * 9))
def test_string_representation():
    """str() of a parsed feature must reproduce the input line exactly.

    Checked once for a plain line and once for a line carrying extra
    trailing text after the attributes.
    """
    plain = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690"
    parsed = feature.feature_from_line(plain, keep_order=True)
    rendered = str(parsed)
    assert plain == rendered, str(parsed)

    with_extras = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690 some more stuff"
    parsed = feature.feature_from_line(with_extras, keep_order=True)
    assert with_extras == str(parsed)
def test_merge_Attributes(self):
    """helpers.merge_attributes combines the attributes of two features.

    Keys present in only one input keep their single value; keys present in
    both carry the distinct values from each side.
    """
    left = feature.feature_from_line(
        'chr2L . testing 1 10 . + . foo=1; baz=1; buz=1; biz=1; boo=1;',
        strict=False)
    right = feature.feature_from_line(
        'chr2L . testing 1 10 . + . bar=2; baz=2; buz=2; biz=1; boo=1;',
        strict=False)

    merged = helpers.merge_attributes(left.attributes, right.attributes)

    # Sort every value list so the comparison below is order-independent.
    for key, values in list(merged.items()):
        merged[key] = sorted(values)

    expected = {
        'foo': ['1'],
        'bar': ['2'],
        "baz": ['1', '2'],
        "boo": ['1'],
        "buz": ['1', '2'],
        "biz": ['1'],
    }
    self.assertDictEqual(merged, expected)
def test_attribute_order():
    """Attribute order in str() follows whichever dialect is in effect."""
    # The default order is gene_id, transcript_id.  When no dialect is
    # passed, feature_from_line infers one from the line itself; here that
    # puts transcript_id first.
    attributes = 'transcript_id "mRNA1"; gene_id "gene1";'
    raw_line = """ chr1 . mRNA 1 100 . + . %s """ % attributes
    feat = feature.feature_from_line(raw_line, strict=False, keep_order=True)
    feat.strict = True
    feat.keep_order = True
    expected_inferred = 'chr1 . mRNA 1 100 . + . transcript_id "mRNA1"; gene_id "gene1";'
    assert str(feat) == expected_inferred, str(feat)

    # Switching to the default dialect must give the default order (and, as
    # a side effect, GFF3-style key=value formatting).
    orig_dialect = feat.dialect  # read kept from the original; not used afterwards
    feat.dialect = constants.dialect
    feat.keep_order = True
    expected_default = 'chr1 . mRNA 1 100 . + . gene_id=gene1;transcript_id=mRNA1'
    assert str(feat) == expected_default, str(feat)

    # An attribute that is not part of the dialect's order is expected to be
    # rendered last once it is added.
    feat['dummy'] = ['asdf']
    feat.strict = True
    expected_appended = 'chr1 . mRNA 1 100 . + . gene_id=gene1;transcript_id=mRNA1;dummy=asdf'
    assert str(feat) == expected_appended, str(feat)
def test_repr():
    """repr() shows featuretype, location, strand and the object's address."""
    source_line = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690 some more stuff"
    feat = feature.feature_from_line(source_line, keep_order=True)

    # Printed to help diagnose a failure.
    print(repr(feat))
    address = hex(id(feat))
    print(address)

    expected = "<Feature exon (chr2L:7529-8116[+]) at %s>" % address
    assert repr(feat) == expected
def test_attribute_order():
    """Check how the active dialect controls attribute order in str()."""
    # Default order is gene_id then transcript_id, but with no dialect given
    # feature_from_line infers its own; for this line transcript_id is first.
    attributes = 'transcript_id "mRNA1"; gene_id "gene1";'
    template = """ chr1 . mRNA 1 100 . + . %s """
    parsed = feature.feature_from_line(
        template % attributes, strict=False, keep_order=True)
    parsed.strict = True
    parsed.keep_order = True
    want = 'chr1 . mRNA 1 100 . + . transcript_id "mRNA1"; gene_id "gene1";'
    assert str(parsed) == want, str(parsed)

    # With the default dialect the default order applies (and the output
    # happens to become GFF3-style).
    orig_dialect = parsed.dialect  # read kept from the original; unused below
    parsed.dialect = constants.dialect
    parsed.keep_order = True
    want = 'chr1 . mRNA 1 100 . + . gene_id=gene1;transcript_id=mRNA1'
    assert str(parsed) == want, str(parsed)

    # A newly added attribute that the dialect order does not mention is
    # expected at the very end.
    parsed['dummy'] = ['asdf']
    parsed.strict = True
    want = 'chr1 . mRNA 1 100 . + . gene_id=gene1;transcript_id=mRNA1;dummy=asdf'
    assert str(parsed) == want, str(parsed)
def test_sequence():
    """Feature.sequence() honours the feature's interval and strand.

    The expected minus-strand string is the reverse complement of the
    plus-strand one, and both are as long as the feature.
    """
    fasta_path = gffutils.example_filename('dm6-chr2L.fa')
    gene = feature.feature_from_line('chr2L FlyBase gene 154 170 . + . ID=one;')

    forward = gene.sequence(fasta_path)
    assert forward == 'aCGAGATGATAATATAT'
    assert len(forward) == len(gene)

    gene.strand = '-'
    reverse = gene.sequence(fasta_path)
    assert reverse == 'ATATATTATCATCTCGt'
    assert len(reverse) == len(gene)
def test_aliases():
    """chrom/seqid and stop/end read the same values, before and after writes."""
    source_line = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690"
    exon = feature.feature_from_line(source_line, keep_order=True)

    # Values straight from the parsed line.
    assert exon.chrom == 'chr2L' == exon.seqid
    assert exon.end == 8116 == exon.stop

    # Assigning through one name must be visible through the other.
    exon.chrom = 'fake'
    exon.stop = 1
    assert exon.chrom == 'fake' == exon.seqid
    assert exon.stop == 1 == exon.end
def test_issue_82():
    """Regression check for issue #82: "=" inside an unquoted attribute value.

    The Note value contains the key/value separator; it must come back as a
    single value.  An example file with the same feature of concern must
    also load into an in-memory database.
    """
    note = 'marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126'
    gff_line = (
        'Spenn-ch12\tsgn_markers\tmatch\t2621812\t2622049\t.\t+\t.\t'
        'Alias=SGN-M1347;ID=T0028;Note=' + note
    )
    parsed = feature.feature_from_line(gff_line)
    assert parsed.attributes['Note'] == [note]

    gffutils.create_db(
        gffutils.example_filename('keyval_sep_in_attrs.gff'), ':memory:')
def test_sequence():
    """Sequence lookup returns the expected bases on both strands."""
    reference = gffutils.example_filename('dm6-chr2L.fa')
    feat = feature.feature_from_line(
        'chr2L FlyBase gene 154 170 . + . ID=one;')

    # Strand as parsed from the line ("+").
    plus_bases = feat.sequence(reference)
    assert plus_bases == 'aCGAGATGATAATATAT'
    assert len(plus_bases) == len(feat)

    # Flip the strand; the expected string is the reverse complement.
    feat.strand = '-'
    minus_bases = feat.sequence(reference)
    assert minus_bases == 'ATATATTATCATCTCGt'
    assert len(minus_bases) == len(feat)
def test_merge_Attributes(self):
    """Merging two attribute sets keeps every key and every distinct value."""
    first_line = 'chr2L . testing 1 10 . + . foo=1; baz=1; buz=1; biz=1; boo=1;'
    second_line = 'chr2L . testing 1 10 . + . bar=2; baz=2; buz=2; biz=1; boo=1;'
    first = feature.feature_from_line(first_line, strict=False)
    second = feature.feature_from_line(second_line, strict=False)

    combined = helpers.merge_attributes(first.attributes, second.attributes)

    # Sorted in place of whatever order the merge produced, so the dict
    # comparison is deterministic.
    for name, vals in list(combined.items()):
        combined[name] = sorted(vals)

    wanted = {
        'foo': ['1'],
        'bar': ['2'],
        "baz": ['1', '2'],
        "boo": ['1'],
        "buz": ['1', '2'],
        "biz": ['1'],
    }
    self.assertDictEqual(combined, wanted)
def test_pbt_interval_conversion():
    """helpers.asinterval converts a feature to a matching pybedtools interval.

    Returns immediately, without checking anything, when pybedtools cannot
    be imported.
    """
    try:
        import pybedtools
    except ImportError:
        return

    source_line = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690"
    feat = feature.feature_from_line(source_line, strict=False, keep_order=True)
    interval = helpers.asinterval(feat)

    assert interval.chrom == feat.chrom == feat.seqid
    # The interval's start is one less than the feature's start.
    assert interval.start == feat.start - 1
    assert interval.stop == feat.stop == feat.end

    interval_name = interval.name
    feature_name = feat.attributes['Name'][0]
    assert interval_name == feature_name, '%s, %s' % (interval_name, feature_name)
def test_pbt_interval_conversion():
    """Compare a feature with the interval produced by helpers.asinterval.

    The test is a no-op when pybedtools is not importable.
    """
    try:
        import pybedtools
    except ImportError:
        return

    gff_line = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690"
    exon = feature.feature_from_line(gff_line, strict=False, keep_order=True)
    as_interval = helpers.asinterval(exon)

    # Chromosome and stop agree; start differs by exactly one.
    assert as_interval.chrom == exon.chrom == exon.seqid
    assert as_interval.start == exon.start - 1
    assert as_interval.stop == exon.stop == exon.end

    # The interval's name is the first value of the Name attribute.
    got = as_interval.name
    want = exon.attributes['Name'][0]
    assert got == want, '%s, %s' % (got, want)
def test_unquoting():
    """Percent-encoded values survive str() but are decoded on access."""
    # The incoming Note value is encoded ("%2C" stands for a comma).
    attribute_text = (
        'ID=INIL01g00009;GeneSymbol=Ndufaf6;Note=NADH dehydrogenase '
        '(ubiquinone) complex I%2C assembly factor 6;GO_Terms=GO:0005743|'
        'GO:0016740|GO:0009058|GO:0032981;PFam=PF00494'
    )
    columns = ['chr1', 'AUGUSTUS', 'gene', '6950084', '6951407', '0.26',
               '-', '.', attribute_text]
    encoded_line = '\t'.join(columns)

    parsed = feature.feature_from_line(encoded_line, keep_order=True)

    # Round trip: the string form is identical to the input.
    assert str(parsed) == encoded_line

    # Attribute access yields the decoded text.
    note = parsed['Note']
    assert note == [
        'NADH dehydrogenase (ubiquinone) complex I, assembly factor 6'
    ]
def test_unreasonable_unquoting():
    """Unusual percent-escapes decode on access and round-trip through str()."""
    attribute_text = ''.join([
        'newline=%0A;',
        'percent=%25;',
        'null=%00;',
        'comma=%2C;',
        # The first parent is "A," (A with a comma), the second is "B%".
        'Parent=A%2C,B%25,C;',
    ])
    line = '\t'.join(
        ['chr1', '.', '.', '1', '2', '0.26', '-', '.', attribute_text])

    parsed = feature.feature_from_line(line, keep_order=True)

    # Each single-valued attribute decodes to the character it encodes.
    decoded_singles = (
        ('newline', '\n'),
        ('percent', '%'),
        ('null', '\x00'),
        ('comma', ','),
    )
    for key, character in decoded_singles:
        assert parsed.attributes[key][0] == character

    # Literal commas separate the values; the encoded comma stays inside
    # the first value.
    assert parsed.attributes['Parent'] == ['A,', 'B%', 'C']

    assert str(parsed) == line
def _custom_iter(self): valid_lines = 0 for i, line in enumerate(self.open_function(self.data)): if isinstance(line, six.binary_type): line = line.decode('utf-8') line = line.rstrip('\n\r') self.current_item = line self.current_item_number = i if line == '##FASTA' or line.startswith('>'): raise StopIteration if line.startswith('##'): self._directive_handler(line) continue if line.startswith(('#')) or len(line) == 0: continue # (If we got here it should be a valid line) valid_lines += 1 yield feature_from_line(line, dialect=self.dialect)
def test_hash():
    """A feature hashes to the same value as the line it was parsed from."""
    source_line = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690 some more stuff"
    parsed = feature.feature_from_line(source_line, keep_order=True)
    feature_hash = hash(parsed)
    assert feature_hash == hash(source_line)
def test_attributes():
    """A line with a trailing semicolon round-trips once keep_order is set."""
    source_line = "chr2L FlyBase mRNA 7529 9484 . + . ID=FBtr0300690;Name=CG11023-RC;Parent=FBgn0031208;"
    parsed = feature.feature_from_line(source_line)
    # keep_order is switched on after parsing rather than passed in.
    parsed.keep_order = True
    assert str(parsed) == source_line, str(parsed)
def test_feature_from_line():
    """Space-delimited and tab-delimited lines should parse to equal features."""
    # NOTE(review): the two literals below are identical as they appear
    # here; presumably one of them used tab separators originally --
    # confirm against version control before relying on this test.
    first = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690"
    second = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690"

    from_first = feature.feature_from_line(first, strict=False, keep_order=True)
    from_second = feature.feature_from_line(second, strict=False, keep_order=True)
    assert from_first == from_second
def test_feature_single_item(self):
    """Indexing a feature by attribute name returns that attribute's values."""
    source_line = "chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690 some more stuff"
    exon = feature.feature_from_line(source_line, keep_order=True)
    name_values = exon['Name']
    assert name_values == ['CG11023:1']
def test_update():
    """Exercise db.update(): plain insert, 'merge' and 'replace' strategies.

    Runs against in-memory databases built from the FBgn0031208 GFF and GTF
    example files.  The steps mutate the database in sequence, so their
    order matters.
    """
    gff_db_kwargs = dict(verbose=False, keep_order=True, force=True)
    db = create.create_db(
        example_filename('FBgn0031208.gff'), ':memory:', **gff_db_kwargs)

    orig_num_features = len(list(db.all_features()))

    new_feature_line = 'chr2L . testing 1 10 . + . ID=testing_feature;n=1'

    # --- Plain insert -----------------------------------------------------
    # The feature is new, so no merge strategy is needed.
    incoming = feature.feature_from_line(
        new_feature_line, dialect=db.dialect, strict=False)
    db.update([incoming])
    hits = list(db.features_of_type('testing'))
    assert len(hits) == 1
    stored = hits[0]
    stored.keep_order = True
    assert str(stored) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1", str(stored)

    # Exactly one feature more than at the start.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # --- merge_strategy="merge" -------------------------------------------
    # Same feature again, now with n=2; merging appends to the existing
    # attribute values ( n=1 --> n=1,2 ).
    incoming = feature.feature_from_line(
        new_feature_line, dialect=db.dialect, strict=False)
    incoming.keep_order = True
    incoming.attributes['n'] = ['2']
    db.update([incoming], merge_strategy='merge')
    hits = list(db.features_of_type('testing'))
    assert len(hits) == 1

    # Merging goes through list(set()), so value order is not guaranteed;
    # sort in place to get a stable string.
    stored = hits[0]
    stored.attributes['n'].sort()

    assert str(stored) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1,2", str(stored)

    # The feature count must not have changed since the insert.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # --- Updating while iterating -----------------------------------------
    # E.g. when annotating children with their gene's ID.
    db = create.create_db(
        example_filename('FBgn0031208.gff'), ':memory:',
        verbose=False, force=True, keep_order=True)
    for gene in db.features_of_type('gene'):
        for child in list(db.children(gene)):
            # Important: FBgn0031208.gff deliberately contains two exons
            # without ID attributes, which get the ids "exon_1" and
            # "exon_2".  Updating them while they still lack an ID would
            # create two additional features, "exon_3" and "exon_4".
            # Setting the ID attribute explicitly prevents that.
            child.attributes['gene_id'] = [gene.id]
            if 'ID' not in child.attributes:
                child.attributes['ID'] = [child.id]
            db.update([child], merge_strategy='replace')

    print("\n\nafter\n\n")
    # ``gene`` still refers to the last gene visited by the loop above.
    for child in db.children(gene):
        print(child.id)
        assert child.attributes['gene_id'] == ['FBgn0031208'], (child, child.attributes)

    num_entries = 0
    for gene_recs in list(db.iter_by_parent_childs()):
        # Tag the first record of each group with a new attribute.
        first_rec = gene_recs[0]
        first_rec.attributes["new"] = ["new_value"]
        db.update([first_rec])
        num_entries += 1
    print(list(db.all_features()))

    assert (num_entries > 1), "Only %d left after update" % (num_entries)

    # --- merge_strategy="replace" -----------------------------------------
    incoming = feature.feature_from_line(
        new_feature_line, dialect=db.dialect, strict=False)
    incoming.keep_order = True
    incoming.attributes['n'] = ['3']
    db.update([incoming], merge_strategy='replace')
    hits = list(db.features_of_type('testing'))
    assert len(hits) == 1
    assert str(hits[0]) == "chr2L . testing 1 10 . + . ID=testing_feature;n=3", str(hits[0])

    # Again one feature more than the original count.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # --- Same idea with GTF instead of GFF --------------------------------
    db = create.create_db(
        example_filename('FBgn0031208.gtf'), ':memory:',
        verbose=False, force=True, keep_order=True)
    incoming = feature.feature_from_line(
        'chr2L . testing 1 10 . + . gene_id "fake"; n "1"', strict=False)
    incoming.keep_order = True
    db.update([incoming], merge_strategy='merge')
    hits = list(db.features_of_type('testing'))
    assert len(hits) == 1
    stored = hits[0]
    stored.keep_order = True

    # Note the trailing semicolon: the db's dialect has
    # ['trailing semicolon'] = True.
    assert str(stored) == 'chr2L . testing 1 10 . + . gene_id "fake"; n "1";', str(stored)
def test_update():
    """Walk db.update() through insert, merge, generator input and replace.

    Every database used here is an in-memory one built from the
    FBgn0031208 example files.  Steps depend on the state left behind by
    earlier steps and must stay in this order.
    """
    db = create.create_db(
        example_filename('FBgn0031208.gff'), ':memory:',
        verbose=False, keep_order=True, force=True)

    orig_num_features = len(list(db.all_features()))

    testing_line = 'chr2L . testing 1 10 . + . ID=testing_feature;n=1'

    # Step 1: add a brand-new feature; no merge strategy is required.
    candidate = feature.feature_from_line(
        testing_line, dialect=db.dialect, strict=False)
    db.update([candidate])
    found = list(db.features_of_type('testing'))
    assert len(found) == 1
    found = found[0]
    found.keep_order = True
    assert str(found) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1", str(
        found)

    # The database now holds one more feature than before.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Step 2: update with the same feature using merge_strategy="merge",
    # which appends items to attributes ( n=1 --> n=1,2 ).
    candidate = feature.feature_from_line(
        testing_line, dialect=db.dialect, strict=False)
    candidate.keep_order = True
    candidate.attributes['n'] = ['2']
    db.update([candidate], merge_strategy='merge')
    found = list(db.features_of_type('testing'))
    assert len(found) == 1

    # Merging performs a list(set()) operation, so order is not guaranteed;
    # sort the values in place for a deterministic comparison.
    found = found[0]
    found.attributes['n'].sort()

    assert str(
        found) == "chr2L . testing 1 10 . + . ID=testing_feature;n=1,2", str(found)

    # The count is unchanged by the merge.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Step 3: merge while iterating, e.g. when updating children with gene
    # IDs.  The children are fed to update() lazily through a generator.
    db = create.create_db(
        example_filename('FBgn0031208.gff'), ':memory:',
        verbose=False, force=True, keep_order=True)

    def gen():
        for gene in db.features_of_type('gene'):
            for child in list(db.children(gene)):
                # Important: FBgn0031208.gff was designed with two exons
                # that have no ID attribute; they receive the ids "exon_1"
                # and "exon_2".  Updating them while still ID-less would
                # produce two new features, "exon_3" and "exon_4", so the
                # ID attribute is filled in first.
                child.attributes['gene_id'] = [gene.id]
                if 'ID' not in child.attributes:
                    child.attributes['ID'] = [child.id]
                yield child

    db.update(gen(), merge_strategy='replace')

    print("\n\nafter\n\n")
    for child in db.children('FBgn0031208'):
        print(child.id)
        assert child.attributes['gene_id'] == ['FBgn0031208'
                                               ], (child, child.attributes)

    num_entries = 0
    for gene_recs in list(db.iter_by_parent_childs()):
        # Add an attribute to the first record of each group.
        head = gene_recs[0]
        head.attributes["new"] = ["new_value"]
        db.update([head])
        num_entries += 1
    print(list(db.all_features()))

    assert (num_entries > 1), "Only %d left after update" % (num_entries)

    # Step 4: replace.
    candidate = feature.feature_from_line(
        testing_line, dialect=db.dialect, strict=False)
    candidate.keep_order = True
    candidate.attributes['n'] = ['3']
    db.update([candidate], merge_strategy='replace')
    found = list(db.features_of_type('testing'))
    assert len(found) == 1
    assert str(
        found[0]) == "chr2L . testing 1 10 . + . ID=testing_feature;n=3", str(found[0])

    # One more feature than the original count, as after step 1.
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Step 5: same thing, but GTF instead of GFF.
    db = create.create_db(
        example_filename('FBgn0031208.gtf'), ':memory:',
        verbose=False, force=True, keep_order=True)
    candidate = feature.feature_from_line(
        'chr2L . testing 1 10 . + . gene_id "fake"; n "1"', strict=False)
    candidate.keep_order = True
    db.update([candidate], merge_strategy='merge')
    found = list(db.features_of_type('testing'))
    assert len(found) == 1
    found = found[0]
    found.keep_order = True

    # The trailing semicolon is expected because the db's dialect has
    # ['trailing semicolon'] = True.
    assert str(found) == 'chr2L . testing 1 10 . + . gene_id "fake"; n "1";', str(
        found)