示例#1
0
    def setup(self):
        """
        Build an in-memory database from ``self.orig_fn``.

        The primary-key function handed to ``create.create_db`` is selected
        from the filename extension: ``.gtf`` files use ``gtf_id_func`` and
        ``.gff`` files use ``gff_id_func``.  On return, ``self.db`` holds the
        database, ``self.c`` a cursor on its connection and ``self.dialect``
        the database's dialect.

        Raises
        ------
        ValueError
            If ``self.orig_fn`` ends with neither ``.gtf`` nor ``.gff``.
        """

        def positional_id(f):
            # Fallback key assembled from the feature's type and coordinates.
            return '{0.featuretype}:{0.seqid}:{0.start}-{0.end}:{0.strand}'.format(f)

        def gff_id_func(f):
            # Prefer the ID attribute, then Name, then the positional key.
            for key in ('ID', 'Name'):
                if key in f.attributes:
                    return f.attributes[key][0]
            return positional_id(f)

        def gtf_id_func(f):
            # Genes and transcripts are keyed by their own *_id attribute;
            # every other feature type gets the positional key.
            if f.featuretype == 'gene':
                if 'gene_id' in f.attributes:
                    return f.attributes['gene_id'][0]
            elif f.featuretype == 'transcript':
                if 'transcript_id' in f.attributes:
                    return f.attributes['transcript_id'][0]
            else:
                return positional_id(f)
            # A gene/transcript without its id attribute yields None.
            # NOTE(review): presumably create_db then chooses a key on its
            # own -- confirm against the create_db documentation.
            return None

        # Pick the key function explicitly so an unexpected extension fails
        # with a clear message instead of an unbound local variable.
        if self.orig_fn.endswith('.gtf'):
            id_func = gtf_id_func
        elif self.orig_fn.endswith('.gff'):
            id_func = gff_id_func
        else:
            raise ValueError(
                "unsupported file extension for %r; expected '.gtf' or '.gff'"
                % (self.orig_fn,))

        self.db = create.create_db(
            self.orig_fn,
            ':memory:',
            id_spec=id_func,
            merge_strategy='create_unique',
            verbose=False,
            keep_order=True
        )
        self.c = self.db.conn.cursor()
        self.dialect = self.db.dialect
示例#2
0
    def setup(self):
        """
        Create the in-memory database used by this object.

        ``self.orig_fn`` is loaded with ``create.create_db`` using a
        primary-key function that depends on the file extension
        (``gtf_id_func`` for ``.gtf``, ``gff_id_func`` for ``.gff``).  The
        resulting database is stored in ``self.db``; ``self.c`` is a cursor on
        its connection and ``self.dialect`` is the database's dialect.

        Raises
        ------
        ValueError
            If ``self.orig_fn`` has neither a ``.gtf`` nor a ``.gff``
            extension.
        """
        # Shared template for the coordinate-based fallback key.
        fallback_template = '{0.featuretype}:{0.seqid}:{0.start}-{0.end}:{0.strand}'

        def gff_id_func(f):
            # ID wins over Name; features with neither get the fallback key.
            if 'ID' in f.attributes:
                return f.attributes['ID'][0]
            if 'Name' in f.attributes:
                return f.attributes['Name'][0]
            return fallback_template.format(f)

        def gtf_id_func(f):
            # Only gene and transcript features are keyed by an attribute.
            if f.featuretype == 'gene':
                if 'gene_id' in f.attributes:
                    return f.attributes['gene_id'][0]
            elif f.featuretype == 'transcript':
                if 'transcript_id' in f.attributes:
                    return f.attributes['transcript_id'][0]
            else:
                return fallback_template.format(f)
            # Gene/transcript lacking the expected attribute: return None.
            # NOTE(review): presumably create_db falls back to its own key
            # generation in that case -- confirm.
            return None

        # Explicit dispatch: an unrecognised extension now raises a clear
        # error rather than leaving ``id_func`` unbound.
        if self.orig_fn.endswith('.gtf'):
            id_func = gtf_id_func
        elif self.orig_fn.endswith('.gff'):
            id_func = gff_id_func
        else:
            raise ValueError(
                "unsupported file extension for %r; expected '.gtf' or '.gff'"
                % (self.orig_fn,))

        self.db = create.create_db(self.orig_fn,
                                   ':memory:',
                                   id_spec=id_func,
                                   merge_strategy='create_unique',
                                   verbose=False,
                                   keep_order=True)
        self.c = self.db.conn.cursor()
        self.dialect = self.db.dialect
示例#3
0
    def update(self, features, make_backup=True, **kwargs):
        """
        Update database with features.

        features : str, iterable, FeatureDB instance
            If FeatureDB, all features will be used. If string, assume it's
            a filename of a GFF or GTF file; it is first loaded into
            a temporary in-memory database and the features of that database
            are used.  Otherwise, assume it's an iterable of Feature objects.
            The classes in gffutils.iterators may be helpful in this case.

        make_backup : bool
            If True, and the database you're about to update is a file on disk,
            makes a copy of the existing database and saves it with a .bak
            extension.

        Remaining kwargs are passed to create_db (when `features` is
        a filename) and to the creator object that performs the update.  If
        `id_spec` is not supplied, a default matching this database's format
        is used.

        Raises
        ------
        ValueError
            If this database's dialect format is neither 'gtf' nor 'gff3'.
        """
        from gffutils import create

        # Only a string dbfn is copied; any other kind of dbfn is skipped.
        if make_backup and isinstance(self.dbfn, six.string_types):
            shutil.copy2(self.dbfn, self.dbfn + '.bak')

        # No matter what `features` came in as, convert to gffutils.Feature
        # instances.  Since the tricky part -- attribute strings -- have been
        # parsed into dicts in a Feature, we no longer have to worry about
        # that.  This also allows GTF features to be used to update a GFF
        # database, or vice versa.
        if isinstance(features, six.string_types):
            # Parse the file into a throwaway in-memory database so nothing
            # extra is written to disk (previously this referenced an
            # undefined name and raised NameError).
            indb = create.create_db(features, ':memory:', **kwargs)
            features = indb.all_features()

        if isinstance(features, FeatureDB):
            features = features.all_features()

        fmt = self.dialect['fmt']
        if fmt == 'gtf':
            kwargs.setdefault(
                'id_spec', {'gene': 'gene_id', 'transcript': 'transcript_id'})
            creator_class = create._GTFDBCreator
        elif fmt == 'gff3':
            kwargs.setdefault('id_spec', 'ID')
            creator_class = create._GFFDBCreator
        else:
            raise ValueError(
                "cannot update a database whose dialect format is %r; "
                "expected 'gtf' or 'gff3'" % (fmt,))

        db = creator_class(
            data=features, dbfn=self.dbfn, dialect=self.dialect, **kwargs)

        db._populate_from_lines(features)
        db._update_relations()
        db._finalize()
示例#4
0
def test_inconsistent_dialect():
    """
    Regression test for input whose lines disagree on the trailing semicolon.

    The first feature ends its attributes with a semicolon but the second one
    does not (wormbase_gff2_alt is like this).  The db's dialect is set from
    the first feature, which can cause errors when parsing the attributes of
    the second.
    """
    gtf_text = """
    chr1	.	gene	1	100	.	+	.	gene_id "gene1";
    chr1	.	mRNA	1	100	.	+	.	transcript_id "mRNA1"
    """
    db = create.create_db(gtf_text, ':memory:', from_string=True)
    features = list(db.all_features())
    print(features[0])

    # This used to come back as ['"mRNA1'] -- note the stray leading quote.
    transcript_ids = features[1].attributes['transcript_id']
    assert transcript_ids == ['mRNA1'], transcript_ids
示例#5
0
def test_inconsistent_dialect():
    """
    Check attribute parsing when only the first line has a trailing semicolon.

    The second feature does not have a trailing semicolon (wormbase_gff2_alt
    is like this), while the first one does.  The first feature determines the
    db's dialect, which can cause errors when the second feature's attributes
    are parsed.
    """
    lines = """
    chr1	.	gene	1	100	.	+	.	gene_id "gene1";
    chr1	.	mRNA	1	100	.	+	.	transcript_id "mRNA1"
    """
    db = create.create_db(lines, ':memory:', from_string=True)
    parsed = list(db.all_features())
    print(parsed[0])

    mrna = parsed[1]
    # The buggy result was ['"mRNA1'] -- an extra double quote at the front.
    assert mrna.attributes['transcript_id'] == ['mRNA1'], \
        mrna.attributes['transcript_id']
示例#6
0
def test_update():
    """
    Exercise ``db.update()`` on in-memory databases.

    Covers, in order: adding a brand-new feature, re-adding it with
    ``merge_strategy='merge'``, updating children while walking over genes,
    updating the first record of every parent/children group, re-adding
    a feature with ``merge_strategy='replace'``, and finally a merge into
    a database built from a GTF file.
    """
    # Every database in this test is created in memory (':memory:').
    db = create.create_db(
        example_filename('FBgn0031208.gff'), ':memory:', verbose=False,
        keep_order=True,
        force=True)

    orig_num_features = len(list(db.all_features()))

    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)

    # no merge strategy required because we're adding a new feature
    db.update([f])
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True
    assert str(x) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=1", str(x)

    # ought to be one more now . . .
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Now try updating with the same feature, but using merge_strategy="merge",
    # which appends items to attributes ( n=1 --> n=1,2 )
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)
    f.keep_order = True
    f.attributes['n'] = ['2']
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1

    # Merging does a list(set()) operation, so the order is not guaranteed.
    # Fix it here for testing...
    x = x[0]
    x.attributes['n'].sort()

    assert str(x) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=1,2", str(x)

    # the merge must not have added a feature: still the original count + 1
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Merging while iterating.  e.g., if you're updating children with gene
    # IDs.
    db = create.create_db(example_filename('FBgn0031208.gff'), ':memory:',
                          verbose=False, force=True, keep_order=True)
    # Materialize the genes first (as is already done for the children) so
    # the gene query is finished before the database is modified.
    for gene in list(db.features_of_type('gene')):
        for child in list(db.children(gene)):
            # important: the FBgn0031208.gff file was designed to have some
            # funky features: there are two exons without ID attributes.  These
            # are assigned to ids "exon_1" and "exon_2".  Upon update, with
            # still no ID, we then have two new features "exon_3" and "exon_4".
            # To prevent this issue, we ensure that the ID attribute exists...
            child.attributes['gene_id'] = [gene.id]
            if 'ID' not in child.attributes:
                child.attributes['ID'] = [child.id]
            db.update([child], merge_strategy='replace')

    print("\n\nafter\n\n")
    # Look the gene up by its id instead of relying on the loop variable
    # that leaked out of the loop above.
    for child in db.children('FBgn0031208'):
        print(child.id)
        assert child.attributes['gene_id'] == ['FBgn0031208'], (child, child.attributes)

    num_entries = 0
    for gene_recs in list(db.iter_by_parent_childs()):
        # Add attribute to each gene record
        rec = gene_recs[0]
        rec.attributes["new"] = ["new_value"]
        db.update([rec])
        num_entries += 1
    print(list(db.all_features()))

    assert (num_entries > 1), "Only %d left after update" % (num_entries)

    # Replace
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)

    f.keep_order = True

    f.attributes['n'] = ['3']
    db.update([f], merge_strategy='replace')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    assert str(x[0]) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=3", str(x[0])
    # exactly one feature more than the file originally contained
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Same thing, but GTF instead of GFF.
    db = create.create_db(
        example_filename('FBgn0031208.gtf'), ':memory:', verbose=False,
        force=True, keep_order=True)
    f = feature.feature_from_line('chr2L . testing 1 10 . + . gene_id "fake"; n "1"', strict=False)
    f.keep_order = True
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True

    # note the trailing semicolon.  That's because the db's dialect has
    # ['trailing semicolon'] = True.
    assert str(x) == 'chr2L	.	testing	1	10	.	+	.	gene_id "fake"; n "1";', str(x)
示例#7
0
def test_update():
    """
    Exercise ``db.update()`` on in-memory databases.

    Steps, in order: add a new feature; re-add it with
    ``merge_strategy='merge'``; update every child of every gene through
    a generator passed to ``update``; update the first record of each
    parent/children group; re-add a feature with ``merge_strategy='replace'``;
    and finally merge a feature into a database built from a GTF file.
    """
    # Every database in this test is created in memory (':memory:').
    db = create.create_db(example_filename('FBgn0031208.gff'),
                          ':memory:',
                          verbose=False,
                          keep_order=True,
                          force=True)

    # Baseline count used by the "one more feature" assertions below.
    orig_num_features = len(list(db.all_features()))

    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect,
        strict=False)

    # no merge strategy required because we're adding a new feature
    db.update([f])
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    # NOTE(review): presumably keep_order makes str() emit attributes in
    # their original order -- confirm against the Feature class.
    x.keep_order = True
    assert str(x) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=1", str(
        x)

    # ought to be one more now . . .
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Now try updating with the same feature, but using merge_strategy="merge",
    # which appends items to attributes ( n=1 --> n=1,2 )
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect,
        strict=False)
    f.keep_order = True
    f.attributes['n'] = ['2']
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1

    # Merging does a list(set()) operation, so the order is not guaranteed.
    # Fix it here for testing...
    x = x[0]
    x.attributes['n'].sort()

    assert str(
        x) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=1,2", str(x)

    # the merge must not have added a feature: still the original count + 1
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Merging while iterating.  e.g., if you're updating children with gene
    # IDs.
    db = create.create_db(example_filename('FBgn0031208.gff'),
                          ':memory:',
                          verbose=False,
                          force=True,
                          keep_order=True)

    # Lazily yields each child of every gene, with 'gene_id' set to the
    # gene's id and an 'ID' attribute guaranteed to be present.
    def gen():
        for gene in db.features_of_type('gene'):
            for child in list(db.children(gene)):
                # important: the FBgn0031208.gff file was designed to have some
                # funky features: there are two exons without ID attributes.  These
                # are assigned to ids "exon_1" and "exon_2".  Upon update, with
                # still no ID, we then have two new features "exon_3" and "exon_4".
                # To prevent this issue, we ensure that the ID attribute exists...
                child.attributes['gene_id'] = [gene.id]
                if 'ID' not in child.attributes:
                    child.attributes['ID'] = [child.id]
                yield child

    # The generator itself is handed to update(), so the children are
    # produced while the update is running.
    db.update(gen(), merge_strategy='replace')

    print("\n\nafter\n\n")
    # Every child of the gene must now carry the gene's id.
    for child in db.children('FBgn0031208'):
        print(child.id)
        assert child.attributes['gene_id'] == ['FBgn0031208'
                                               ], (child, child.attributes)

    num_entries = 0
    # The groups are collected into a list before any update is issued.
    for gene_recs in list(db.iter_by_parent_childs()):
        # Add attribute to each gene record
        rec = gene_recs[0]
        rec.attributes["new"] = ["new_value"]
        db.update([rec])
        num_entries += 1
    print(list(db.all_features()))

    assert (num_entries > 1), "Only %d left after update" % (num_entries)

    # Replace
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect,
        strict=False)

    f.keep_order = True

    f.attributes['n'] = ['3']
    db.update([f], merge_strategy='replace')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    assert str(
        x[0]) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=3", str(x[0])
    # exactly one feature more than the file originally contained
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Same thing, but GTF instead of GFF.
    db = create.create_db(example_filename('FBgn0031208.gtf'),
                          ':memory:',
                          verbose=False,
                          force=True,
                          keep_order=True)
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . gene_id "fake"; n "1"', strict=False)
    f.keep_order = True
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True

    # note the trailing semicolon.  That's because the db's dialect has
    # ['trailing semicolon'] = True.
    assert str(x) == 'chr2L	.	testing	1	10	.	+	.	gene_id "fake"; n "1";', str(
        x)