def test_fromtabix_noheader():
    """Region query against the no-header BED fixture yields one data row.

    The expected table consists solely of the single 'Core' record; no
    header row is expected in the output.
    """
    expected_rows = (
        ('Pf3D7_02_v3', '105800', '447300', 'Core'),
    )
    result = etl.fromtabix(
        'fixture/test_noheader.bed.gz',
        region='Pf3D7_02_v3:110000-120000',
    )
    # ieq compares the two tables row by row
    ieq(expected_rows, result)
def test_fromtabix():
    """Region query against the BED fixture yields header plus one row.

    The expected table starts with the '#chrom' header row followed by
    the single 'Core' record.
    """
    header_row = ('#chrom', 'start', 'end', 'region')
    core_row = ('Pf3D7_02_v3', '105800', '447300', 'Core')
    expected_rows = (header_row, core_row)
    result = etl.fromtabix(
        'fixture/test.bed.gz',
        region='Pf3D7_02_v3:110000-120000',
    )
    # ieq compares the two tables row by row
    ieq(expected_rows, result)
# -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division # fromtabix() ############# import petl as etl # activate bio extensions import petlx.bio table1 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3') table1 table2 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3:110000-120000') table2 # fromgff3() ############ import petl as etl # activate bio extensions import petlx.bio table1 = etl.fromgff3('fixture/sample.gff') table1.look(truncate=30) # extract from a specific genome region via tabix table2 = etl.fromgff3('fixture/sample.sorted.gff.gz', region='apidb|MAL5:1289593-1289595') table2.look(truncate=30)
def fromgff3(filename, region=None):
    """
    Load the feature rows of a GFF3 file as a table, e.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    The `region` argument, if supplied, is a query string of the form
    '[seqid]' or '[seqid]:[start]-[end]'. Using it requires the GFF3 file
    to be position sorted, bgzipped and tabix indexed, and requires pysam
    to be installed. E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    """

    # pick the row source: tabix lookup when a region is requested,
    # otherwise a plain tab-delimited read of the whole file
    if region is not None:
        raw = etl.fromtabix(filename, region=region)
    else:
        raw = etl.fromtsv(filename)

    # the file carries no column names of its own, so supply them
    with_header = raw.pushheader(GFF3_HEADER)
    # discard '#' comment/directive rows
    without_comments = with_header.skipcomments('#')
    # keep only 9-value rows (drops e.g. a trailing fasta section)
    features = without_comments.rowlenselect(9)
    # turn the attributes column into a dict
    with_attributes = features.convert('attributes', gff3_parse_attributes)
    # coordinates become integers
    return with_attributes.convert(('start', 'end'), int)
def fromgff3(filename, region=None):
    """
    Read a GFF3 file and return its feature rows as a table, e.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    To restrict the rows to a genome region, pass a query string of the
    form '[seqid]' or '[seqid]:[start]-[end]' as `region`. In that case
    the GFF3 file must be position sorted, bgzipped and tabix indexed, and
    pysam must be installed. E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    """

    if region is None:
        # whole file, parsed as tab-delimited text
        source = etl.fromtsv(filename)
    else:
        # only the requested region, fetched via tabix
        source = etl.fromtabix(filename, region=region)

    # name the columns, then drop '#' rows and any row that is not
    # exactly 9 values long (e.g., trailing fasta)
    rows = source.pushheader(GFF3_HEADER)
    rows = rows.skipcomments('#')
    rows = rows.rowlenselect(9)

    # parse the attributes column into a dict and the coordinates to int
    rows = rows.convert('attributes', gff3_parse_attributes)
    rows = rows.convert(('start', 'end'), int)
    return rows
# -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division # fromtabix() ############# import petl as etl # activate bio extensions import petlx.bio table1 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3') table1 table2 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3:110000-120000') table2 # fromgff3() ############ import petl as etl # activate bio extensions import petlx.bio table1 = etl.fromgff3('fixture/sample.gff') table1.look(truncate=30) # extract from a specific genome region via tabix table2 = etl.fromgff3('fixture/sample.sorted.gff.gz', region='apidb|MAL5:1289593-1289595') table2.look(truncate=30) # fromvcf() ###########