Example #1
class TestPerformanceOnMouse(PerformanceTestFeatureDB, unittest.TestCase):
    '''
        Test frequent scenarios on large genome of mouse.
    '''
    gff_file = gffutils.example_filename('gencode.vM8.annotation.gff3')
    chromsizes_file = gffutils.example_filename('gencode.vM8.chromsizes.txt')
    gene_list = gffutils.example_filename('gencode.vM8.5000_gene_ids.txt')
    transcript_list = gffutils.example_filename(
        'gencode.vM8.5000_transcript_ids.txt')
Example #2
def test_valid_line_count():
    p = iterators.FileIterator(example_filename('ncbi_gff3.txt'))
    assert len(list(p)) == 17

    p = iterators.FileIterator(example_filename('hybrid1.gff3'))
    assert len(list(p)) == 6

    p = iterators.FileIterator(example_filename('FBgn0031208.gff'))
    assert len(list(p)) == 27
Example #3
def test_valid_line_count():
    p = iterators.FileIterator(example_filename('ncbi_gff3.txt'))
    assert len(list(p)) == 17

    p = iterators.FileIterator(example_filename('hybrid1.gff3'))
    assert len(list(p)) == 6

    p = iterators.FileIterator(example_filename('FBgn0031208.gff'))
    assert len(list(p)) == 27
Example #4
class TestPerformanceOnSacCer(PerformanceTestFeatureDB, unittest.TestCase):
    '''
        Test frequent scenarios on medium size genome of yeast.
    '''
    gff_file = gffutils.example_filename(
        'Saccharomyces_cerevisiae.R64-1-1.83.gff3')
    chromsizes_file = gffutils.example_filename(
        'Saccharomyces_cerevisiae.R64-1-1.83.chromsizes.txt')
    gene_list = gffutils.example_filename(
        'Saccharomyces_cerevisiae.R64-1-1.83.5000_gene_ids.txt')
    transcript_list = gffutils.example_filename(
        'Saccharomyces_cerevisiae.R64-1-1.83.5000_transcript_ids.txt')
Example #5
def test_clean_gff():
    # test the "full" cleaning -- remove some featuretypes, do sanity-checking,
    # add chr
    fn = gffutils.example_filename('dirty.gff')
    gffutils.clean_gff(fn, newfn='cleaned.tmp',
                       featuretypes_to_remove=['pcr_product', 'protein'],
                       addchr=True)
    observed = open('cleaned.tmp').readlines()
    expected = open(gffutils.example_filename('fully-cleaned.gff')).readlines()
    assert observed == expected
    os.unlink('cleaned.tmp')
    gffutils.clean_gff(fn, featuretypes_to_remove=None, sanity_check=False)
    observed = open(gffutils.example_filename('dirty.gff.cleaned')).read()
    expected = open(gffutils.example_filename('basic-cleaned.gff')).read()
    assert observed == expected
    os.unlink(gffutils.example_filename('dirty.gff.cleaned'))
Example #6
def test_delete():
    db_fname = gffutils.example_filename("gff_example1.gff3")

    # incrementally delete all features
    db = gffutils.create_db(db_fname, ':memory:')
    ids = [i.id for i in db.all_features()]
    current = set(ids)
    for _id in ids:
        db.delete(_id)
        expected = current.difference([_id])
        current = set([i.id for i in db.all_features()])
        assert current == expected, (current, expected)
    assert len(current) == 0

    # same thing, but as a list of Feature objects rather than string IDs
    db = gffutils.create_db(db_fname, ':memory:')
    features = list(db.all_features())
    current = set(features)
    for feature in features:
        db.delete(feature)
        expected = current.difference([feature])
        current = set(list(db.all_features()))
        assert current == expected, (current, expected)
    assert len(current) == 0, current

    # same thing, but use a FeatureDB.
    db1 = gffutils.create_db(db_fname, ':memory:')
    db2 = gffutils.create_db(db_fname, ':memory:')
    db1.delete(db2)
    assert len(list(db1.all_features())) == 0


    db = gffutils.create_db(db_fname, ':memory:')
    db.delete('nonexistent')
Example #7
def test_delete():
    db_fname = gffutils.example_filename("gff_example1.gff3")

    # incrementally delete all features
    db = gffutils.create_db(db_fname, ':memory:')
    ids = [i.id for i in db.all_features()]
    current = set(ids)
    for _id in ids:
        db.delete(_id)
        expected = current.difference([_id])
        current = set([i.id for i in db.all_features()])
        assert current == expected, (current, expected)
    assert len(current) == 0

    # same thing, but as a list of Feature objects rather than string IDs
    db = gffutils.create_db(db_fname, ':memory:')
    features = list(db.all_features())
    current = set(features)
    for feature in features:
        db.delete(feature)
        expected = current.difference([feature])
        current = set(list(db.all_features()))
        assert current == expected, (current, expected)
    assert len(current) == 0, current

    # same thing, but use a FeatureDB.
    db1 = gffutils.create_db(db_fname, ':memory:')
    db2 = gffutils.create_db(db_fname, ':memory:')
    db1.delete(db2)
    assert len(list(db1.all_features())) == 0

    db = gffutils.create_db(db_fname, ':memory:')
    db.delete('nonexistent')
Example #8
def test_empty_superclass_methods():
    dbcreator = gffutils.db.DBCreator(gffutils.example_filename('FBgn0031208.gff'), 'empty.db', verbose=False)
    dbcreator.populate_from_features([])
    dbcreator.update_relations()
    assert os.path.exists('empty.db')
    assert os.stat('empty.db').st_size == 0
    os.unlink('empty.db')
Example #9
def test_pr_131():
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),
                            ':memory:')

    # previously would raise ValueError("No lines parsed -- was an empty
    # file provided?")
    db2 = db.update([])
Example #10
def write_isoforms(gff, refs, outfolder):
    db_name = os.path.join(outfolder, 'database.db')
    fn = gffutils.example_filename(gff)
    db = gffutils.create_db(fn,
                            dbfn=db_name,
                            force=True,
                            keep_order=True,
                            merge_strategy='merge',
                            sort_attribute_values=True)
    db = gffutils.FeatureDB(db_name, keep_order=True)
    transcripts = {}
    for gene in db.features_of_type('gene'):
        ref_id = gene.seqid
        ref_seq = refs[ref_id]

        for transcript in db.children(gene,
                                      featuretype='transcript',
                                      order_by='start'):
            transcript_seq = []
            for j in db.children(transcript,
                                 featuretype='exon',
                                 order_by='start'):
                exon_seq = ref_seq[j.start - 1:j.end]
                transcript_seq.append(exon_seq)
            transcript_seq = ''.join(transcript_seq)
            transcripts[transcript.id] = transcript_seq
    return transcripts
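A note on usage: `refs` above is assumed to be a mapping from each reference seqid in the GFF to its full sequence string, and `gff` is resolved through `gffutils.example_filename()`, so it must name a file in the gffutils test-data directory. A minimal, hypothetical call (the GFF name and sequences are placeholders) might look like:

# Hypothetical usage sketch for write_isoforms(); the GFF name and sequences
# below are placeholders, not real gffutils test data.
refs = {
    "chr1": "ACGT" * 1000,   # sequence for seqid "chr1"
    "chr2": "TTGCA" * 800,   # sequence for seqid "chr2"
}
transcripts = write_isoforms("example.gff3", refs, "/tmp")
for transcript_id, seq in transcripts.items():
    print(transcript_id, len(seq))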
Example #11
File: test.py  Project: rbeagrie/gffutils
def test_create_db_from_url():
    """
    Test creation of FeatureDB from URL iterator.
    """
    print("Testing creation of DB from URL iterator")
    # run SimpleHTTPServer on port 0 so the OS picks the first available port
    Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    httpd = SocketServer.TCPServer(("", 0), Handler)
    port = str(httpd.socket.getsockname()[1])
    print("serving at port", port)

    # Serving test/data folder
    served_folder = gffutils.example_filename('')
    os.chdir(served_folder)

    print("Starting SimpleHTTPServer in thread")
    server_thread = threading.Thread(target=httpd.serve_forever)
    server_thread.daemon = True
    server_thread.start()
    try:
        url = ''.join(['http://localhost:', port, '/gff_example1.gff3'])
        db = gffutils.create_db(url, ":memory:", keep_order=True)
        def my_iterator():
            for rec in db.all_features():
                yield rec
        new_db = gffutils.create_db(my_iterator(), ":memory:", keep_order=True)

        print(list(new_db.all_features()))
        gene_feats = new_db.all_features(featuretype="gene")
        assert (len(list(gene_feats)) != 0), "Could not load genes from GFF."
    finally:
        print('Server shutdown.')
        httpd.shutdown()
        server_thread.join()
Example #12
def test_infer_gene_extent():
    # Before we deprecate this, make sure it still works but emits a warning.
    with warnings.catch_warnings(record=True) as w:
        gffutils.create_db(gffutils.example_filename('FBgn0031208.gtf'),
                           ':memory:',
                           infer_gene_extent=False)
        assert len(w) == 1
Example #13
def test_inspect_featuretypes():
    observed = gffutils.inspect_featuretypes(gffutils.example_filename('FBgn0031208.gff'))
    observed.sort()
    expected = ['CDS', 'exon', 'five_prime_UTR', 'gene', 'intron', 'mRNA', 'pcr_product', 'protein', 'three_prime_UTR']
    print(observed)
    print(expected)
    assert observed == expected
Example #14
def test_issue_79():
    gtf = gffutils.example_filename('keep-order-test.gtf')
    db = gffutils.create_db(gtf,
                            'tmp.db',
                            disable_infer_genes=False,
                            disable_infer_transcripts=False,
                            id_spec={
                                "gene": "gene_id",
                                "transcript": "transcript_id"
                            },
                            merge_strategy="create_unique",
                            keep_order=True,
                            force=True)

    exp = open(gtf).read()
    obs = '\n'.join([str(i) for i in db.all_features()])
    exp_1 = exp.splitlines(True)[0].strip()
    obs_1 = obs.splitlines(True)[0].strip()
    print('EXP')
    print(exp_1)
    print('OBS')
    print(obs_1)
    print('DIFF')
    print(''.join(difflib.ndiff([exp_1], [obs_1])))
    assert obs_1 == exp_1
Example #15
def create_gff_db(dicoNiourk):
    print("\x1b[0;38;2;"+dicoNiourk["color"]["light1"]+"m") ; sys.stdout.write("\033[F")
    dicoNiourk["spinner"].text = "    • Create RefSeq DB"
    dicoNiourk["spinner"].start()
    fn = gffutils.example_filename(dicoNiourk["refseq_gff"])
    gffutils.create_db(fn, dbfn=dicoNiourk["refseq_gff"].replace(".gff", ".db"),
                       force=True, keep_order=True, merge_strategy='merge',
                       sort_attribute_values=True)
    dicoNiourk["spinner"].stop()
    printcolor("    • RefSeq DB created\n", "0", dicoNiourk["color"]["light1"], None, dicoNiourk["color"]["bool"])
Example #16
def convertGffToBedGffUtils(gffFile):
    fn = gffutils.example_filename(gffFile)
    # db = gffutils.create_db(fn, dbfn=gffFile[:-4] + '.db', force=True, keep_order=True,
    #                         merge_strategy='merge', sort_attribute_values=True)
    db = gffutils.FeatureDB(gffFile[:-4] + '.db', keep_order=True)
    gffIterator = db.all_features(order_by='start')
    bedVersion = pybedtools_integration.to_bedtool(gffIterator)
    print(bedVersion)
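Since the `create_db` call above is commented out, this function assumes a `<gffFile minus extension>.db` database already exists on disk, and `pybedtools_integration.to_bedtool` additionally requires pybedtools to be installed. A sketch of building that database first (the file name is a placeholder and must live in the gffutils example-data directory, because the function resolves it with `gffutils.example_filename`):

# Sketch: build the sqlite database once, then call convertGffToBedGffUtils().
# "annotation.gff" is a placeholder; slicing with [:-4] assumes a 4-character
# ".gff" extension so the .db name matches what the function opens.
import gffutils

gff_file = "annotation.gff"
gffutils.create_db(gffutils.example_filename(gff_file),
                   dbfn=gff_file[:-4] + ".db",
                   force=True, keep_order=True,
                   merge_strategy="merge", sort_attribute_values=True)
convertGffToBedGffUtils(gff_file)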
Example #17
def test_deprecation_handler():
    return 

    # TODO: when infer_gene_extent actually gets deprecated, test here.
    assert_raises(ValueError, gffutils.create_db,
            gffutils.example_filename('FBgn0031208.gtf'),
            ':memory:',
            infer_gene_extent=False)
Example #18
def test_infer_gene_extent():
    # Before we deprecate this, make sure it still works but emits a warning.
    with warnings.catch_warnings(record=True) as w:
        gffutils.create_db(
            gffutils.example_filename('FBgn0031208.gtf'),
            ':memory:',
            infer_gene_extent=False)
        assert len(w) == 1
Example #19
def test_deprecation_handler():
    return

    # TODO: when infer_gene_extent actually gets deprecated, test here.
    assert_raises(ValueError, gffutils.create_db,
            gffutils.example_filename('FBgn0031208.gtf'),
            ':memory:',
            infer_gene_extent=False)
Example #20
def test_false_function():
    # smoke test: before commit ce4b7671f, this would raise "TypeError: object
    # of type 'function' has no len()"
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),
                            ':memory:',
                            keep_order=True,
                            id_spec=lambda x: False,
                            merge_strategy='create_unique')
Example #21
def read_to_database(d):
    gff3 = gffutils.example_filename(d)
    db = gffutils.create_db(gff3,
                            dbfn='gff3.db',
                            force=True,
                            keep_order=True,
                            merge_strategy='merge',
                            sort_attribute_values=True)
    return db
Example #22
    def setup(self):
        """
        Creates a new GFFDB or GTFDB (depending on self.__class__.featureclass)
        """
        self.featureclass = self.__class__.featureclass
        self.Feature = gffutils.Feature
        if self.featureclass == 'GFF':
            extension = '.gff'
            self.fn = gffutils.example_filename('FBgn0031208.gff')
            self.dbfn = testdbfn_gff
        if self.featureclass == 'GTF':
            extension = '.gtf'
            self.fn = gffutils.example_filename('FBgn0031208.gtf')
            self.dbfn = testdbfn_gtf

        self.G = gffutils.FeatureDB(self.dbfn)
        self.conn = sqlite3.connect(self.dbfn)
        self.c = self.conn.cursor()
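`testdbfn_gff` and `testdbfn_gtf` are module-level paths defined elsewhere in the test file; a plausible stand-in (the exact paths in the real module may differ) is:

# Assumed module-level database paths used by setup() above; placeholders only.
import os
import tempfile

testdbfn_gff = os.path.join(tempfile.gettempdir(), "FBgn0031208.gff.db")
testdbfn_gtf = os.path.join(tempfile.gettempdir(), "FBgn0031208.gtf.db")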
Example #23
def test_verbose():
    # just a smoke test to make sure it runs
    actual_stderr = sys.stderr
    import StringIO
    sys.stderr = StringIO.StringIO()
    gffdb = gffutils.db.GFFDBCreator(gffutils.example_filename('FBgn0031208.gff'),
            'deleteme.db', verbose=True, force=True).create()
    sys.stderr = actual_stderr
    os.unlink('deleteme.db')
Example #24
def test_iterator_update():
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    assert len(list(db.all_features())) == 12
    orig_exon_coords = set([(i.start, i.stop)
                            for i in db.features_of_type('exon')])

    # reset all features to have the same coords of start=1, stop=100
    def gen():
        for i in db.features_of_type('gene'):
            i.start = 1
            i.stop = 100
            yield i

    db.update(gen(), merge_strategy='replace')
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    g = six.next(db.features_of_type('gene'))
    assert g.start == 1, g.start
    assert g.stop == 100, g.stop

    # exons should have remained unchanged.
    assert orig_exon_coords == set([(i.start, i.stop)
                                    for i in db.features_of_type('exon')])

    def _transform(f):
        f.start = 1
        f.stop = 100
        return f

    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    db.update(db.features_of_type('gene'),
              merge_strategy='replace',
              transform=_transform)
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    g = six.next(db.features_of_type('gene'))
    assert g.start == 1, g.start
    assert g.stop == 100, g.stop

    # exons should have remained unchanged.
    assert orig_exon_coords == set([(i.start, i.stop)
                                    for i in db.features_of_type('exon')])
Example #25
File: test.py  Project: arnikz/gffutils
def test_issue_82():
    # key-val separator is inside an unquoted attribute value
    x = (
        'Spenn-ch12\tsgn_markers\tmatch\t2621812\t2622049\t.\t+\t.\t'
        'Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126'
    )
    y = feature.feature_from_line(x)
    assert y.attributes['Note'] == ['marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126']

    gffutils.create_db(gffutils.example_filename('keyval_sep_in_attrs.gff'), ':memory:')
Example #26
    def parse_gff3(self):
        print("-------- Ensembl data Parsing --------")
        print("\tParsing gff3 file...")
        print("\tcreating temporary database from file: " + self.gff)
        fn = gffutils.example_filename(self.gff)
        db = gffutils.create_db(fn, ":memory:", merge_strategy="create_unique")
        # gffutils.create_db(fn, "DB.Ensembl_" + self.species[0] + ".db", merge_strategy="create_unique")
        # db = gffutils.FeatureDB("DB.Ensembl_" + self.species[0] + ".db")
        self.collect_genes(db)
        self.collect_Transcripts(db)
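`collect_genes` and `collect_Transcripts` are not shown here; assuming they only need to walk the in-memory database, a minimal sketch of `collect_genes` could be:

    # Hypothetical sketch of collect_genes(); it simply records basic
    # coordinates for every gene feature, keyed by gene ID.
    def collect_genes(self, db):
        self.genes = {}
        for gene in db.features_of_type("gene"):
            self.genes[gene.id] = {
                "seqid": gene.seqid,
                "start": gene.start,
                "end": gene.end,
                "strand": gene.strand,
            }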
Example #27
def test_sequence():
    fasta = gffutils.example_filename('dm6-chr2L.fa')
    f = feature.feature_from_line('chr2L	FlyBase	gene	154	170	.	+	.	ID=one;')
    seq = f.sequence(fasta)
    assert seq == 'aCGAGATGATAATATAT'
    assert len(seq) == len(f)
    f.strand = '-'
    seq = f.sequence(fasta)
    assert seq == 'ATATATTATCATCTCGt'
    assert len(seq) == len(f)
Example #28
def test_pr_139():
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),
                            ':memory:')
    exons = list(db.features_of_type('exon'))
    inter = list(db.interfeatures(exons))

    # previously, the first exon's attributes would show up in subsequent merged features
    assert exons[0].attributes['Name'][0] not in inter[1].attributes['Name']
    assert exons[0].attributes['Name'][0] not in inter[2].attributes['Name']
    assert exons[0].attributes['Name'][0] not in inter[3].attributes['Name']
Example #29
def test_issue_105():
    fn = gffutils.example_filename('FBgn0031208.gtf')
    home = os.path.expanduser('~')
    newfn = os.path.join(home, '.gffutils.test')
    with open(newfn, 'w') as fout:
        fout.write(open(fn).read())
    f = gffutils.iterators.DataIterator(newfn)
    for i in f:
        pass
    os.unlink(newfn)
Example #30
def test_disable_infer():
    """
    tests the new semantics for disabling gene/transcript inference
    """
    # To start, we construct a GTF db by inferring genes and transcripts
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gtf'),
                            ':memory:')

    # Then create a file missing transcripts, and another missing genes.
    import tempfile
    tempfile.tempdir = None
    no_transcripts = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    no_genes = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    for feature in db.all_features():
        if feature.featuretype != 'transcript':
            no_transcripts.write(str(feature) + '\n')
        if feature.featuretype != 'gene':
            no_genes.write(str(feature) + '\n')
    no_genes.close()
    no_transcripts.close()

    no_tx_db = gffutils.create_db(no_transcripts.name,
                                  ':memory:',
                                  disable_infer_transcripts=True)
    no_gn_db = gffutils.create_db(no_genes.name,
                                  ':memory:',
                                  disable_infer_genes=True)
    no_xx_db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gtf'),
                                  ':memory:',
                                  disable_infer_genes=True,
                                  disable_infer_transcripts=True)

    # no transcripts but 3 genes
    assert len(list(no_tx_db.features_of_type('transcript'))) == 0
    assert len(list(no_tx_db.features_of_type('gene'))) == 3

    # no genes but 4 transcripts
    assert len(list(no_gn_db.features_of_type('gene'))) == 0
    assert len(list(no_gn_db.features_of_type('transcript'))) == 4

    # no genes or transcripts
    assert len(list(no_xx_db.features_of_type('gene'))) == 0
    assert len(list(no_xx_db.features_of_type('transcript'))) == 0
Example #31
def test_false_function():
    # smoke test: before commit ce4b7671f, this would raise "TypeError: object
    # of type 'function' has no len()"
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gff'),
        ':memory:',
        keep_order=True,
        id_spec=lambda x: False,
        merge_strategy='create_unique'
    )
Example #32
def test_issue_82():
    # key-val separator is inside an unquoted attribute value
    x = (
        'Spenn-ch12\tsgn_markers\tmatch\t2621812\t2622049\t.\t+\t.\t'
        'Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126'
    )
    y = feature.feature_from_line(x)
    assert y.attributes['Note'] == ['marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126']

    gffutils.create_db(gffutils.example_filename('keyval_sep_in_attrs.gff'), ':memory:')
Example #33
def test_sequence():
    fasta = gffutils.example_filename('dm6-chr2L.fa')
    f = feature.feature_from_line(
        'chr2L	FlyBase	gene	154	170	.	+	.	ID=one;')
    seq = f.sequence(fasta)
    assert seq == 'aCGAGATGATAATATAT'
    assert len(seq) == len(f)
    f.strand = '-'
    seq = f.sequence(fasta)
    assert seq == 'ATATATTATCATCTCGt'
    assert len(seq) == len(f)
Example #34
def test_iterator_update():
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    assert len(list(db.all_features())) == 12
    orig_exon_coords = set([(i.start, i.stop) for i in db.features_of_type('exon')])


    # reset all features to have the same coords of start=1, stop=100
    def gen():
        for i in db.features_of_type('gene'):
            i.start = 1
            i.stop = 100
            yield i

    db.update(gen(), merge_strategy='replace')
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    g = six.next(db.features_of_type('gene'))
    assert g.start == 1, g.start
    assert g.stop == 100, g.stop

    # exons should have remained unchanged.
    assert orig_exon_coords == set([(i.start, i.stop) for i in db.features_of_type('exon')])


    def _transform(f):
        f.start = 1
        f.stop = 100
        return f

    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    db.update(db.features_of_type('gene'), merge_strategy='replace', transform=_transform)
    assert len(list(db.all_features())) == 12
    assert len(list(db.features_of_type('gene'))) == 1
    g = six.next(db.features_of_type('gene'))
    assert g.start == 1, g.start
    assert g.stop == 100, g.stop

    # exons should have remained unchanged.
    assert orig_exon_coords == set([(i.start, i.stop) for i in db.features_of_type('exon')])
Example #35
def test_disable_infer():
    """
    tests the new semantics for disabling gene/transcript inference
    """
    # To start, we construct a GTF db by inferring genes and transcripts
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gtf'), ':memory:')

    # Then create a file missing transcripts, and another missing genes.
    import tempfile
    tempfile.tempdir = None
    no_transcripts = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    no_genes = open(tempfile.NamedTemporaryFile(delete=False).name, 'w')
    for feature in db.all_features():
        if feature.featuretype != 'transcript':
            no_transcripts.write(str(feature) + '\n')
        if feature.featuretype != 'gene':
            no_genes.write(str(feature) + '\n')
    no_genes.close()
    no_transcripts.close()

    no_tx_db = gffutils.create_db(no_transcripts.name, ':memory:', disable_infer_transcripts=True)
    no_gn_db = gffutils.create_db(no_genes.name, ':memory:', disable_infer_genes=True)
    no_xx_db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'),
        ':memory:',
        disable_infer_genes=True,
        disable_infer_transcripts=True
    )

    # no transcripts but 3 genes
    assert len(list(no_tx_db.features_of_type('transcript'))) == 0
    assert len(list(no_tx_db.features_of_type('gene'))) == 3

    # no genes but 4 transcripts
    assert len(list(no_gn_db.features_of_type('gene'))) == 0
    assert len(list(no_gn_db.features_of_type('transcript'))) == 4

    # no genes or transcripts
    assert len(list(no_xx_db.features_of_type('gene'))) == 0
    assert len(list(no_xx_db.features_of_type('transcript'))) == 0
Example #36
File: test.py  Project: roryk/gffutils
def test_nonascii():
    # smoke test (prev. version returned Unicode)
    #
    db = gffutils.create_db(gffutils.example_filename('nonascii'), ":memory:")
    for i in db.all_features():
        # this works in IPython, or using nosetests --with-doctest...
        try:
            print(i)

        # ...but fails using plain nosetests or when using regular Python
        # interpreter
        except UnicodeEncodeError:
            print(unicode(i))
Example #37
def test_issue_119():
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'), ':memory:')
    db1 = gffutils.create_db(gffutils.example_filename('F3-unique-3.v2.gff'), ':memory:')
    db2 = db1.update(db)
    obs = sorted(db2._autoincrements.keys())
    assert obs == ['exon', 'read'], obs

    # More isolated test, merging two databases each created from the same file
    # which itself contains only a single feature with no ID.
    tmp = tempfile.NamedTemporaryFile(delete=False).name
    with open(tmp, 'w') as fout:
        fout.write('chr1\t.\tgene\t10\t15\t.\t+\t.\t\n')

    db3 = gffutils.create_db(tmp, ':memory:')
    assert db3._autoincrements == {'gene': 1}

    db4 = gffutils.create_db(tmp, ':memory:')
    assert db4._autoincrements == {'gene': 1}

    db5 = db3.update(db4)
    assert db5._autoincrements == {'gene': 2}
    assert db3._autoincrements == db5._autoincrements
Example #38
def test_create_db_from_iter():
    """
    Test creation of FeatureDB from iterator.
    """
    print("Testing creation of DB from iterator")
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ":memory:", keep_order=True)
    def my_iterator():
        for rec in db.all_features():
            yield rec
    new_db = gffutils.create_db(my_iterator(), ":memory:", keep_order=True)
    print(list(new_db.all_features()))
    gene_feats = new_db.all_features(featuretype="gene")
    assert (len(list(gene_feats)) != 0), "Could not load genes from GFF."
Example #39
def test_nonascii():
    # smoke test (prev. version returned Unicode)
    #
    db = gffutils.create_db(gffutils.example_filename('nonascii'), ":memory:",
                            keep_order=True)
    for i in db.all_features():
        # this works in IPython, or using nosetests --with-doctest...
        try:
            print(i)

        # ...but fails using plain nosetests or when using regular Python
        # interpreter
        except UnicodeEncodeError:
            print(six.text_type(i))
Example #40
def test_roundtrip():
    """
    Feature -> SeqFeature -> Feature should be invariant.
    """
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    feature = db['ENSMUSG00000033845']
    feature.keep_order = True
    dialect = feature.dialect
    s = bp.to_seqfeature(feature)
    assert s.location.start.position == feature.start - 1
    assert s.location.end.position == feature.stop
    f = bp.from_seqfeature(s, dialect=dialect, keep_order=True)
    assert feature == f
Example #41
def test_create_db_from_url():
    """
    Test creation of FeatureDB from URL iterator.
    """
    print("Testing creation of DB from URL iterator")
    # run SimpleHTTPServer on port 0 so the OS picks the first available port
    Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
    httpd = SocketServer.TCPServer(("", 0), Handler)
    port = str(httpd.socket.getsockname()[1])
    print("serving at port", port)

    # Serving test/data folder
    served_folder = gffutils.example_filename('')
    savedir = os.getcwd()
    os.chdir(served_folder)

    print("Starting SimpleHTTPServer in thread")
    server_thread = threading.Thread(target=httpd.serve_forever)
    server_thread.daemon = True
    server_thread.start()
    try:
        url = ''.join(['http://localhost:', port, '/gff_example1.gff3'])
        db = gffutils.create_db(url, ":memory:", keep_order=True)

        def my_iterator():
            for rec in db.all_features():
                yield rec

        new_db = gffutils.create_db(my_iterator(), ":memory:", keep_order=True)
        print(list(new_db.all_features()))
        gene_feats = new_db.all_features(featuretype="gene")
        assert (len(list(gene_feats)) != 0), "Could not load genes from GFF."

        url = ''.join(['http://localhost:', port, '/gff_example1.gff3.gz'])
        db = gffutils.create_db(url, ":memory:", keep_order=True)

        def my_iterator():
            for rec in db.all_features():
                yield rec

        new_db = gffutils.create_db(my_iterator(), ":memory:", keep_order=True)
        print(list(new_db.all_features()))
        gene_feats = new_db.all_features(featuretype="gene")
        assert (len(list(gene_feats)) != 0), "Could not load genes from GFF."

    finally:
        print('Server shutdown.')
        httpd.shutdown()
        server_thread.join()
        os.chdir(savedir)
Example #42
def test_roundtrip():
    """
    Feature -> SeqFeature -> Feature should be invariant.
    """
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    feature = db['ENSMUSG00000033845']
    feature.keep_order = True
    dialect = feature.dialect
    s = bp.to_seqfeature(feature)
    assert s.location.start.position == feature.start - 1
    assert s.location.end.position == feature.stop
    f = bp.from_seqfeature(s, dialect=dialect, keep_order=True)
    assert feature == f
Example #43
def test_create_db_from_iter():
    """
    Test creation of FeatureDB from iterator.
    """
    print("Testing creation of DB from iterator")
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ":memory:", keep_order=True)
    def my_iterator():
        for rec in db.all_features():
            yield rec
    new_db = gffutils.create_db(my_iterator(), ":memory:", keep_order=True)
    print(list(new_db.all_features()))
    gene_feats = new_db.all_features(featuretype="gene")
    assert (len(list(gene_feats)) != 0), "Could not load genes from GFF."
Example #44
def test_random_chr():
    """
    Test on GFF files with random chromosome events.
    """
    gff_fname = gffutils.example_filename("random-chr.gff")
    db = helpers.get_gff_db(gff_fname)
    # Test that we can get children of only a selected type
    gene_id = \
        "chr1_random:165882:165969:-@chr1_random:137473:137600:-@chr1_random:97006:97527:-"
    mRNAs = db.children(gene_id, featuretype="mRNA")
    for mRNA_entry in mRNAs:
        assert (mRNA_entry.featuretype == "mRNA"), \
               "Not all entries are of type mRNA! %s" \
               %(",".join([entry.featuretype for entry in mRNAs]))
    print("Parsed random chromosome successfully.")
Example #45
File: test.py  Project: rbeagrie/gffutils
def test_region():
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ":memory:", keep_order=True)
    all_in_region = list(db.region("chr1:4000000-5000000"))
    all_minus = list(db.region("chr1:4000000-5000000:-"))
    all_plus = list(db.region("chr1:4000000-5000000:+"))
    all_unstranded = list(db.region("chr1:4000000-5000000:."))

    out_of_range = list(db.region("nowhere:1-100"))

    assert len(all_in_region) == 12
    assert len(all_minus) == 12
    assert len(all_plus) == 0
    assert len(all_unstranded) == 0
    assert len(out_of_range) == 0
Example #46
def test_random_chr():
    """
    Test on GFF files with random chromosome events.
    """
    gff_fname = gffutils.example_filename("random-chr.gff")
    db = helpers.get_gff_db(gff_fname)
    # Test that we can get children of only a selected type
    gene_id = \
        "chr1_random:165882:165969:-@chr1_random:137473:137600:-@chr1_random:97006:97527:-"
    mRNAs = db.children(gene_id, featuretype="mRNA")
    for mRNA_entry in mRNAs:
        assert (mRNA_entry.featuretype == "mRNA"), \
               "Not all entries are of type mRNA! %s" \
               %(",".join([entry.featuretype for entry in mRNAs]))
    print("Parsed random chromosome successfully.")
Example #47
def test_sanitize_gff():
    """
    Test sanitization of GFF. Should be merged with GFF cleaning
    I believe unless they are intended to have different functionalities.
    """
    # Get unsanitized GFF
    fn = gffutils.example_filename("unsanitized.gff")
    # Get its database
    db = helpers.get_gff_db(fn)
    # Sanitize the GFF
    sanitized_recs = helpers.sanitize_gff_db(db)
    # Ensure that sanitization work, meaning all
    # starts must be less than or equal to stops
    for rec in sanitized_recs.all_features():
        assert (rec.start <= rec.stop), "Sanitization failed."
    print("Sanitized GFF successfully.")
Example #48
File: test.py  Project: arnikz/gffutils
def test_for_analyze():
    db = gffutils.create_db(
            gffutils.example_filename('FBgn0031208.gtf'),
            'deleteme',
            force=True
    )
    assert db._analyzed()
    db.execute('DROP TABLE sqlite_stat1')
    assert not db._analyzed()

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        db2 = gffutils.FeatureDB('deleteme')
        assert len(w) == 1
        assert "analyze" in str(w[-1].message)
    db.analyze()
    assert db._analyzed()
    os.unlink('deleteme')
Example #49
def test_gffwriter():
    """
    Test GFFWriter.
    """
    print("Testing GFF writer..")
    fn = gffutils.example_filename("unsanitized.gff")
    # Make a copy of it as temporary named file
    temp_f = tempfile.NamedTemporaryFile(delete=False)
    temp_fname_source = temp_f.name
    shutil.copy(fn, temp_fname_source)
    # Now write file in place
    source_first_line = open(temp_fname_source, "r").readline().strip()
    assert (not source_first_line.startswith("#GFF3")), \
           "unsanitized.gff should not have a gffutils-style header."
    db_in = gffutils.create_db(fn, ":memory:", keep_order=True)
    # Fetch first record
    rec = six.next(db_in.all_features())
    ##
    ## Write GFF file in-place test
    ##
    print("Testing in-place writing")
    gff_out = gffwriter.GFFWriter(temp_fname_source,
                                  in_place=True,
                                  with_header=True)
    gff_out.write_rec(rec)
    gff_out.close()
    # Ensure that the file was written with header
    rewritten = open(temp_fname_source, "r")
    new_header = rewritten.readline().strip()
    assert new_header.startswith("#GFF3"), \
           "GFFWriter serialized files should have a #GFF3 header."
    print("  - Wrote GFF file in-place successfully.")
    ##
    ## Write GFF file to new file test
    ##
    print("Testing writing to new file")
    new_file = tempfile.NamedTemporaryFile(delete=False)
    gff_out = gffwriter.GFFWriter(new_file.name)
    gff_out.write_rec(rec)
    gff_out.close()
    new_line = open(new_file.name, "r").readline().strip()
    assert new_line.startswith("#GFF3"), \
           "GFFWriter could not write to a new GFF file."
    print("  - Wrote to new file successfully.")
Example #50
File: test.py  Project: YeoLab/gffutils
def test_attributes_modify():
    """
    Test that attributes can be modified in a GFF record.
    """
    # Test that attributes can be modified
    gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'), testdbfn_gff,
                       verbose=False,
                       force=True)
    db = gffutils.FeatureDB(testdbfn_gff)
    gene_id = "FBgn0031208"
    gene_childs = list(db.children(gene_id))
    print "old attributes: "
    print gene_childs[0].attributes
    assert str(gene_childs[0].attributes) == 'ID=FBtr0300689;Name=CG11023-RB;Parent=FBgn0031208;Dbxref=FlyBase_Annotation_IDs:CG11023-RB;score_text=Strongly Supported;score=11'

    gene_childs[0].attributes["ID"] = "Modified"
    print "new attributes: "
    print gene_childs[0].attributes
    assert str(gene_childs[0].attributes) == 'ID=Modified;Name=CG11023-RB;Parent=FBgn0031208;Dbxref=FlyBase_Annotation_IDs:CG11023-RB;score_text=Strongly Supported;score=11;ID=Modified'
Example #51
def test_add_relation():
    db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'), ':memory:', keep_order=True)
    L = len(list(db.children('FBgn0031208:3')))
    assert L == 0, L


    def func(parent, child):
        child['Parent'] = child['Parent'] + [parent.id]
        child['exon_parent'] = [parent.id]
        return child

    db.add_relation('FBgn0031208:3', 'CDS_FBgn0031208:1_737', 1, child_func=func)
    L = len(list(db.children('FBgn0031208:3')))
    assert L == 1, L

    L = list(db.children('FBgn0031208:3'))
    x = L[0]
    assert 'FBgn0031208:3' in x['Parent']
    assert x['exon_parent'] == ['FBgn0031208:3']
Example #52
File: test.py  Project: arnikz/gffutils
def test_issue_79():
    gtf = gffutils.example_filename('keep-order-test.gtf')
    db = gffutils.create_db(gtf, 'tmp.db',
                            disable_infer_genes=False,
                            disable_infer_transcripts=False,
                            id_spec={"gene": "gene_id", "transcript": "transcript_id"},
                            merge_strategy="create_unique",
                            keep_order=True,
                            force=True)

    exp = open(gtf).read()
    obs = '\n'.join([str(i) for i in db.all_features()])
    exp_1 = exp.splitlines(True)[0].strip()
    obs_1 = obs.splitlines(True)[0].strip()
    print('EXP')
    print(exp_1)
    print('OBS')
    print(obs_1)
    print('DIFF')
    print(''.join(difflib.ndiff([exp_1], [obs_1])))
    assert obs_1 == exp_1
Example #53
def test_tempfiles():

    # specify a writeable temp dir for testing
    tempdir = '/tmp/gffutils-test'

    def clean_tempdir():
        tempfile.tempdir = tempdir
        if os.path.exists(tempdir):
            shutil.rmtree(tempdir)
        os.makedirs(tempdir)

    clean_tempdir()

    # default keep_tempfiles=False should give us nothing.
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:')
    assert len(os.listdir(tempdir)) == 0

    # adding keep_tempfiles=True should give us 1 tempfile for gtf...
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:', _keep_tempfiles=True)
    filelist = os.listdir(tempdir)
    assert len(filelist) == 1, filelist
    assert filelist[0].endswith('.gffutils')

    # ...and another one for gff. This time, make sure the suffix is still '.gffutils'.
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gff'), ':memory:', _keep_tempfiles=True)
    filelist = os.listdir(tempdir)
    assert len(filelist) == 2, filelist
    for i in filelist:
        assert i.endswith('.gffutils')

    # OK, now delete what we have so far...
    clean_tempdir()

    # Make sure that works for custom suffixes
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:', _keep_tempfiles='.GTFtmp')
    filelist = os.listdir(tempdir)
    assert len(filelist) == 1, filelist
    assert filelist[0].endswith('.GTFtmp')

    clean_tempdir()
    db = gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'), ':memory:', _keep_tempfiles='.GFFtmp')
    filelist = os.listdir(tempdir)
    assert len(filelist) == 1, filelist
    assert filelist[0].endswith('.GFFtmp')

    # Test n parallel instances of gffutils across PROCESSES processes.
    #
    # Note that travis-ci doesn't like it when you use multiple cores, so the
    # .travis.yml file sets this to 1.  This also means that
    #   1) `n` shouldn't be too large because travis-ci will run one at a time,
    #      but more importantly,
    #   2) this will only truly test parallel processes on a local machine with
    #      multiple cpus.
    clean_tempdir()


    # .travis.yml sets the PROCESSES env var; otherwise use all available.
    PROCESSES = int(os.environ.get("PROCESSES", multiprocessing.cpu_count()))
    pool = multiprocessing.Pool(PROCESSES)
    n = 100
    res = pool.map(make_db, range(n))
    assert sorted(list(res)) == list(range(n))
    filelist = os.listdir(tempdir)
    assert len(filelist) == n, len(filelist)

    expected = dedent("""\
        FBtr0300689	chr2L	7529	9484	+	transcript	4681	{"transcript_id":["FBtr0300689"],"gene_id":["FBgn0031208"]}
        FBgn0031208	chr2L	7529	9484	+	gene	4681	{"gene_id":["FBgn0031208"]}
        FBtr0300690	chr2L	7529	9484	+	transcript	4681	{"transcript_id":["FBtr0300690"],"gene_id":["FBgn0031208"]}
        transcript_Fk_gene_1	chr2L	10000	11000	-	transcript	4681	{"transcript_id":["transcript_Fk_gene_1"],"gene_id":["Fk_gene_1"]}
        Fk_gene_1	chr2L	10000	11000	-	gene	4681	{"gene_id":["Fk_gene_1"]}
        transcript_Fk_gene_2	chr2L	11500	12500	-	transcript	4681	{"transcript_id":["transcript_Fk_gene_2"],"gene_id":["Fk_gene_2"]}
        Fk_gene_2	chr2L	11500	12500	-	gene	4681	{"gene_id":["Fk_gene_2"]}
        """)


    def matches_expected(fn):
        """
        Python 3 has unpredictable dictionary ordering. This function checks
        the *semantic* similarity of lines by parsing the attributes into
        a dictionary.
        """
        exp_features = expected.splitlines(True)
        new_features = list(open(fn))
        assert len(exp_features) == len(new_features)
        for expline, newline in zip(exp_features, new_features):
            exp_toks = expline.split()
            new_toks = newline.split()
            assert exp_toks[:-1] == new_toks[:-1]
            assert json.loads(exp_toks[-1]) == json.loads(new_toks[-1])


    # make sure that each of the `n` files matches the expected output.
    for fn in filelist:
        fn = os.path.join(tempdir, fn)
        try:
            matches_expected(fn)
        except AssertionError:
            print(''.join(difflib.ndiff(expected.splitlines(True),
                                        open(fn).read().splitlines(True))))
            raise

    clean_tempdir()
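`make_db`, the worker handed to `pool.map` above, is defined at module level elsewhere in the test file; a sketch consistent with how it is used (it must leave one tempfile behind per call and return its integer argument) would be:

# Sketch of the module-level make_db worker assumed by test_tempfiles(): each
# call builds an in-memory DB with _keep_tempfiles=True, leaving one tempfile
# in tempfile.tempdir, and returns its index so results can be compared to range(n).
def make_db(i):
    gffutils.create_db(
        gffutils.example_filename('FBgn0031208.gtf'),
        ':memory:',
        _keep_tempfiles=True)
    return i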
Example #54
def test_inspect():
    file_results = inspect.inspect(gffutils.example_filename('FBgn0031208.gff'), verbose=False)
    db_results = inspect.inspect(
        gffutils.create_db(
            gffutils.example_filename('FBgn0031208.gff'),
            ':memory:'),
        verbose=False
    )
    expected = {

        'featuretype': {
            'intron': 3,
            'five_prime_UTR': 1,
            'exon': 6,
            'mRNA': 4,
            'CDS': 5,
            'pcr_product': 1,
            'three_prime_UTR': 2,
            'protein': 2,
            'gene': 3,
        },

        'feature_count': 27,

        'chrom': {
            'chr2L': 27,
        },

        'attribute_keys': {
            u'': 3,
            'Dbxref': 6,
            'Name': 19,
            'Parent': 20,
            ' Parent': 1,
            'score_text': 2,
            'gbunit': 1,
            'derived_computed_cyto': 1,
            'Derives_from': 2,
            'derived_molecular_weight': 2,
            'score': 2,
            'ID': 25,
            'derived_isoelectric_point': 2,
            'Ontology_term': 1,
        }
    }
    assert file_results == db_results == expected


    # file and db results agree because the db is created from the same file

    kwargs = dict(
        look_for=['chrom', 'strand', 'attribute_keys', 'featuretype'],
        verbose=False,
        limit=10,
    )

    file_results = inspect.inspect(
        gffutils.example_filename('FBgn0031208.gff'),
        **kwargs
    )
    iter_results = inspect.inspect(
        iter(iterators._FileIterator(gffutils.example_filename('FBgn0031208.gff'))),
        **kwargs
    )
    db_results = inspect.inspect(
        gffutils.create_db(
            gffutils.example_filename('FBgn0031208.gff'),
            ':memory:'),
        **kwargs
    )

    expected = {
        'attribute_keys': {
            u'Name': 9,
            u'Parent': 9,
            u'score_text': 2,
            u'gbunit': 1,
            u'derived_computed_cyto': 1,
            u'score': 2,
            u'Dbxref': 3,
            u'ID': 8,
            u'Ontology_term': 1,
        },

        'feature_count': 10,

        'chrom': {u'chr2L': 10},

        'strand': {u'+': 10},

        'featuretype': {
            u'five_prime_UTR': 1,
            u'exon': 3,
            u'mRNA': 2,
            u'CDS': 1,
            'intron': 2,
            u'gene': 1}
    }
    assert file_results == db_results == iter_results == expected
Example #55
def test_update():
    # check both in-memory and file-based dbs
    db = create.create_db(
        example_filename('FBgn0031208.gff'), ':memory:', verbose=False,
        keep_order=True,
        force=True)

    orig_num_features = len(list(db.all_features()))

    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)

    # no merge strategy required because we're adding a new feature
    db.update([f])
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True
    assert str(x) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=1", str(x)

    # ought to be one more now . . .
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features

    # Now try updating with the same feature, but using merge_strategy="merge",
    # which appends items to attributes ( n=1 --> n=1,2 )
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)
    f.keep_order = True
    f.attributes['n'] = ['2']
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1

    # Merging does a list(set()) operation, so the order is not guaranteed.
    # Fix it here for testing...
    x = x[0]
    x.attributes['n'].sort()

    assert str(x) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=1,2", str(x)

    # still should have the same number of features as before (still 2)
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features


    # Merging while iterating.  e.g., if you're updating children with gene
    # IDs.
    db = create.create_db(example_filename('FBgn0031208.gff'), ':memory:',
                          verbose=False, force=True, keep_order=True)
    for gene in db.features_of_type('gene'):
        for child in list(db.children(gene)):
            # important: the FBgn0031208.gff file was designed to have some
            # funky features: there are two exons without ID attributes.  These
            # are assigned to ids "exon_1" and "exon_2".  Upon update, with
            # still no ID, we then have two new features "exon_3" and "exon_4".
            # To prevent this issue, we ensure that the ID attribute exists...
            child.attributes['gene_id'] = [gene.id]
            if 'ID' not in child.attributes:
                child.attributes['ID'] = [child.id]
            db.update([child], merge_strategy='replace')

    print("\n\nafter\n\n")
    for child in db.children(gene):
        print(child.id)
        assert child.attributes['gene_id'] == ['FBgn0031208'], (child, child.attributes)

    num_entries = 0
    for gene_recs in list(db.iter_by_parent_childs()):
        # Add attribute to each gene record
        rec = gene_recs[0]
        rec.attributes["new"] = ["new_value"]
        db.update([rec])
        num_entries += 1
    print(list(db.all_features()))


    assert (num_entries > 1), "Only %d left after update" % (num_entries)


    # Replace
    f = feature.feature_from_line(
        'chr2L . testing 1 10 . + . ID=testing_feature;n=1',
        dialect=db.dialect, strict=False)

    f.keep_order = True

    f.attributes['n'] = ['3']
    db.update([f], merge_strategy='replace')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    assert str(x[0]) == "chr2L	.	testing	1	10	.	+	.	ID=testing_feature;n=3", str(x[0])
    # still should have the same number of features as before (still 2)
    num_features = len(list(db.all_features()))
    assert num_features == orig_num_features + 1, num_features


    # Same thing, but GTF instead of GFF.
    db = create.create_db(
        example_filename('FBgn0031208.gtf'), ':memory:', verbose=False,
        force=True, keep_order=True)
    f = feature.feature_from_line('chr2L . testing 1 10 . + . gene_id "fake"; n "1"', strict=False)
    f.keep_order = True
    db.update([f], merge_strategy='merge')
    x = list(db.features_of_type('testing'))
    assert len(x) == 1
    x = x[0]
    x.keep_order = True

    # note the trailing semicolon.  That's because the db's dialect has
    # ['trailing semicolon'] = True.
    assert str(x) == 'chr2L	.	testing	1	10	.	+	.	gene_id "fake"; n "1";', str(x)