def test_fromgff3():
    """Check the header and one parsed record produced by ``fromgff3``.

    The record at index 1 of the materialised table is inspected twice:
    first through positional indexing, then through field-name indexing.
    """
    table = fromgff3(plasmodb_gff3_file.name)

    field_names = ('seqid', 'source', 'type', 'start', 'end',
                   'score', 'strand', 'phase', 'attributes')
    eq_(field_names, header(table))

    # Fixture record under test:
    # apidb|MAL1 ApiDB supercontig 1 643292 . + . ID=apidb|MAL1;Name=MAL1;description=MAL1;size=643292;web_id=MAL1;molecule_type=dsDNA;organism_name=Plasmodium falciparum;translation_table=11;topology=linear;localization=nuclear;Dbxref=ApiDB_PlasmoDB:MAL1,GenBank:NC_004325,taxon:36329
    record = list(table)[1]

    # Expected values for the first eight columns, in column order.
    wanted_columns = ('apidb|MAL1', 'ApiDB', 'supercontig', 1, 643292,
                      '.', '+', '.')
    # Expected entries of the parsed attributes column (column 8).
    wanted_attributes = (
        ('ID', 'apidb|MAL1'),
        ('Name', 'MAL1'),
        ('organism_name', 'Plasmodium falciparum'),
    )

    # Access by position.
    for position, wanted in enumerate(wanted_columns):
        eq_(wanted, record[position])
    for key, wanted in wanted_attributes:
        eq_(wanted, record[8][key])

    # test data wrapped in hybrid rows: the same values are reachable
    # by field name. zip() stops after the eight scalar columns.
    for name, wanted in zip(field_names, wanted_columns):
        eq_(wanted, record[name])
    for key, wanted in wanted_attributes:
        eq_(wanted, record['attributes'][key])
def test_gff3join():
    """Join a small SNP table against gene features with ``gff3join``.

    Two SNP positions on ``apidb|MAL1`` are supplied; the expected output
    holds a single joined row, for position 56915.
    """
    # Gene record the join is expected to hit:
    # apidb|MAL1 ApiDB gene 56913 57116 . - . ID=apidb|PFA0035c;Name=PFA0035c;description=hypothetical+protein%2C+conserved+in+P.+falciparum;size=204;web_id=PFA0035c;locus_tag=PFA0035c;size=204;Alias=MAL1P4.06b
    snps = (
        ('chr', 'pos'),
        ('apidb|MAL1', 56911),
        ('apidb|MAL1', 56915),
    )

    gene_table = selecteq(fromgff3(plasmodb_gff3_file.name), 'type', 'gene')
    actual = gff3join(snps, gene_table, seqid='chr', start='pos', end='pos')

    expected_header = ('chr', 'pos', 'seqid', 'source', 'type', 'start',
                       'end', 'score', 'strand', 'phase', 'attributes')
    expected_attributes = gff3_parse_attributes(
        "ID=apidb|PFA0035c;Name=PFA0035c;description=hypothetical+protein%2C+conserved+in+P.+falciparum;size=204;web_id=PFA0035c;locus_tag=PFA0035c;size=204;Alias=MAL1P4.06b"
    )
    expected_row = ('apidb|MAL1', 56915, 'apidb|MAL1', 'ApiDB', 'gene',
                    56913, 57116, '.', '-', '.', expected_attributes)
    expect = (expected_header, expected_row)

    ieq(expect, actual)
    # Compared a second time on purpose -- presumably to confirm the joined
    # table yields the same rows when iterated again.
    ieq(expect, actual)
def test_gff3lookup():
    """Query a ``gff3lookup`` of gene features by interval and by position.

    Both queries on ``apidb|MAL1`` must return exactly one feature whose
    start and end columns are 56913 and 57116.
    """
    gene_table = selecteq(fromgff3(plasmodb_gff3_file.name), 'type', 'gene')
    lkp = gff3lookup(gene_table)

    # Gene record both queries are expected to find:
    # apidb|MAL1 ApiDB gene 56913 57116 . - . ID=apidb|PFA0035c;Name=PFA0035c;description=hypothetical+protein%2C+conserved+in+P.+falciparum;size=204;web_id=PFA0035c;locus_tag=PFA0035c;size=204;Alias=MAL1P4.06b
    queries = (
        slice(56911, 56915),  # same as the subscript [56911:56915]
        56915,                # single-position lookup
    )
    for query in queries:
        hits = lkp['apidb|MAL1'][query]
        eq_(1, len(hits))
        eq_(56913, hits[0][3])
        eq_(57116, hits[0][4])
def test_fromgff3_trailing_semicolon():
    """Check a record whose attributes column ends with a trailing ``;``.

    The record at index 2 of the materialised table must still expose the
    expected column values and attribute entries.
    """
    table = fromgff3(plasmodb_gff3_file.name)

    # Fixture record under test (note the trailing semicolon):
    # apidb|MAL2 ApiDB supercontig 1 947102 . + . ID=apidb|MAL2;Name=MAL2;description=MAL2;size=947102;web_id=MAL2;molecule_type=dsDNA;organism_name=Plasmodium falciparum;translation_table=11;topology=linear;localization=nuclear;Dbxref=ApiDB_PlasmoDB:MAL2,GenBank:NC_000910,taxon:36329;
    record = list(table)[2]

    # Expected values for the first eight columns, in column order.
    wanted_columns = ('apidb|MAL2', 'ApiDB', 'supercontig', 1, 947102,
                      '.', '+', '.')
    for position, wanted in enumerate(wanted_columns):
        eq_(wanted, record[position])

    # Expected entries of the parsed attributes column (column 8).
    wanted_attributes = (
        ('ID', 'apidb|MAL2'),
        ('Name', 'MAL2'),
        ('organism_name', 'Plasmodium falciparum'),
    )
    for key, wanted in wanted_attributes:
        eq_(wanted, record[8][key])
strand TEXT NOT NULL, alias TEXT, description TEXT, fulltext TEXT, CONSTRAINT name_pk PRIMARY KEY (name) ) """ conn = sqlite3.connect('db.sqlite3') conn.execute('PRAGMA legacy_file_format=false') # needed otherwise index sort order is ignored log('drop gene table') conn.execute('DROP TABLE IF EXISTS gene') log('create gene table') conn.execute(schema) log('prepare the data for loading') gff = fromgff3('Pfalciparum_PlasmoDB-7.2.gff') # select only genes t0 = selecteq(gff, 'type', 'gene') # columns from attributes t1 = unpackdict(t0, 'attributes', keys=('ID', 'Name', 'Alias', 'description')) # add a 'chromosome' column with MAL.. style naming def mk_chromosome(row): seqid = row['seqid'] if seqid.startswith('apidb|Pf3D7_0'): return 'MAL' + seqid[13:] elif seqid.startswith('apidb|Pf3D7_'): return 'MAL' + seqid[12:] else: return seqid t2 = addfield(t1, 'chromosome', mk_chromosome) # add a 'fulltext' column
def test_fromgff3_region():
    """Check row counts when ``fromgff3`` is given a ``region`` argument.

    A whole-sequence region and a coordinate-restricted region are each
    read from the same compressed fixture and their row counts compared
    against the expected numbers.
    """
    fixture_path = 'fixture/sample.sorted.gff.gz'
    cases = (
        ('apidb|MAL5', 7),
        ('apidb|MAL5:1289593-1289595', 4),
    )
    for region_spec, expected_count in cases:
        tbl_features = fromgff3(fixture_path, region=region_spec)
        eq_(expected_count, nrows(tbl_features))