def t_ensembl_locus():
    """Check the name and size parsed from Ensembl-style patch LOCUS lines.

    Every LOCUS line is fed to a fresh ``GenBankScanner`` /
    ``_FeatureConsumer`` pair via ``_feed_first_line``.  The record name and
    the consumer's ``_expected_size`` must then match the values written in
    the line.  Prints ``Done`` once every case has passed.
    """
    # (LOCUS line, expected record name, expected size)
    cases = [
        (
            "LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011\n",
            "HG531_PATCH",
            1000000,
        ),
        (
            "LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011\n",
            "HG531_PATCH",
            759984,
        ),
        (
            "LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011\n",
            "HG506_HG1000_1_PATCH",
            814959,
        ),
        (
            "LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011\n",
            "HG506_HG1000_1_PATCH",
            1219964,
        ),
    ]
    for locus_line, expected_name, expected_size in cases:
        # A new scanner and consumer per line keeps the cases independent.
        scanner = GenBank.Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(True)
        scanner._feed_first_line(consumer, locus_line)
        assert consumer.data.name == expected_name, consumer.data.name
        assert consumer._expected_size == expected_size, consumer._expected_size
    print("Done")
def test_topology_genbank(self):
    """Check GenBank LOCUS line parsing.

    Only the LOCUS line is fed to the scanner.  The ``topology``,
    ``molecule_type`` and ``data_file_division`` annotations on the resulting
    record are compared with the expected values; an expected ``None``
    matches an annotation that was never set.
    """
    # This is a bit low level, but lets us test parsing of the LOCUS line only.
    # (LOCUS line, expected topology, expected molecule type, expected division)
    cases = [
        ("LOCUS U00096", None, None, None),
        # This example is actually fungal, accession U49845 from
        # Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None, "DNA", "PLN"),
        (
            "LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001",
            "linear",
            "DNA",
            "BCT",
        ),
        (
            "LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008",
            "circular",
            "DNA",
            "BCT",
        ),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001", "linear", None, "INV"),
    ]
    for locus_line, want_topology, want_mol_type, want_division in cases:
        scanner = Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, locus_line)
        annotations = consumer.data.annotations
        # (annotation key, expected value, failure message template)
        checks = (
            ("topology", want_topology, "Wrong topology %r not %r from %r"),
            (
                "molecule_type",
                want_mol_type,
                "Wrong molecule_type %r not %r from %r",
            ),
            (
                "data_file_division",
                want_division,
                "Wrong division %r not %r from %r",
            ),
        )
        for key, expected, template in checks:
            found = annotations.get(key)
            self.assertEqual(
                found, expected, template % (found, expected, locus_line)
            )
def test_topology_genbank(self):
    """Check GenBank LOCUS line parsing.

    Each LOCUS line goes through a fresh ``GenBankScanner`` and
    ``_FeatureConsumer``.  The parsed record's ``topology``,
    ``molecule_type`` and ``data_file_division`` annotations must then equal
    the expected values (``None`` matches a missing annotation).
    """
    # This is a bit low level, but lets us test parsing of the LOCUS line only.
    # (LOCUS line, expected topology, expected molecule type, expected division)
    cases = [
        ("LOCUS U00096", None, None, None),
        # This example is actually fungal, accession U49845 from
        # Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None, "DNA", "PLN"),
        (
            "LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001",
            "linear",
            "DNA",
            "BCT",
        ),
        (
            "LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008",
            "circular",
            "DNA",
            "BCT",
        ),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001", "linear", None, "INV"),
    ]
    for locus_line, want_topology, want_mol_type, want_division in cases:
        scanner = Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, locus_line)
        annotations = consumer.data.annotations

        topology = annotations.get("topology")
        message = "Wrong topology %r not %r from %r" % (
            topology,
            want_topology,
            locus_line,
        )
        self.assertEqual(topology, want_topology, message)

        molecule_type = annotations.get("molecule_type")
        message = "Wrong molecule_type %r not %r from %r" % (
            molecule_type,
            want_mol_type,
            locus_line,
        )
        self.assertEqual(molecule_type, want_mol_type, message)

        division = annotations.get("data_file_division")
        message = "Wrong division %r not %r from %r" % (
            division,
            want_division,
            locus_line,
        )
        self.assertEqual(division, want_division, message)
def test_topology_genbank(self):
    """Check GenBank topology parsing.

    Feeds single LOCUS lines to the GenBank scanner and verifies the
    ``topology`` annotation of the parsed record; an expected ``None``
    matches a record with no topology annotation.
    """
    # This is a bit low level, but lets us test parsing of the LOCUS line only.
    # (LOCUS line, expected topology)
    cases = [
        ("LOCUS U00096", None),
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001", "linear"),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008", "circular"),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001", "linear"),
    ]
    for locus_line, want_topology in cases:
        scanner = Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, locus_line)
        found = consumer.data.annotations.get("topology")
        message = "Wrong topology %r not %r from %r" % (
            found,
            want_topology,
            locus_line,
        )
        self.assertEqual(found, want_topology, message)
def test_topology_embl(self):
    """Check EMBL topology parsing.

    Feeds single ID lines to the EMBL scanner and verifies the ``topology``
    annotation of the parsed record; an expected ``None`` matches a record
    with no topology annotation.
    """
    # This is a bit low level, but lets us test parsing of the ID line only.
    # (ID line, expected topology)
    cases = [
        ("ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.", "linear"),
        ("ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.", "linear"),
        ("ID BSUB9999 standard; circular DNA; PRO; 4214630 BP.", "circular"),
        ("ID SC10H5 standard; DNA; PRO; 4870 BP.", None),
        ("ID NRP_AX000635; PRT; NR1; 15 SQ", None),
        ("ID NRP0000016E; PRT; NR2; 5 SQ", None),
    ]
    for id_line, want_topology in cases:
        scanner = Scanner.EmblScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, id_line)
        found = consumer.data.annotations.get("topology")
        message = "Wrong topology %r not %r from %r" % (
            found,
            want_topology,
            id_line,
        )
        self.assertEqual(found, want_topology, message)
def test_topology_genbank(self):
    """Check GenBank LOCUS line parsing.

    For each LOCUS line this verifies the ``topology``, ``molecule_type``
    and ``data_file_division`` annotations of the parsed record, and that
    the warnings raised while parsing match the expected categories, in
    order.  An expected warning list of ``None`` means no warning may be
    raised.
    """
    # This is a bit low level, but lets us test parsing of the LOCUS line only.
    # (LOCUS line, topology, molecule type, division, expected warning classes)
    cases = [
        ("LOCUS U00096", None, None, None, None),
        # This example is actually fungal, accession U49845 from
        # Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None, "DNA", "PLN", None),
        (
            "LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001",
            "linear",
            "DNA",
            "BCT",
            None,
        ),
        (
            "LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008",
            "circular",
            "DNA",
            "BCT",
            None,
        ),
        (
            "LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001",
            "linear",
            None,
            "INV",
            None,
        ),
        (
            "LOCUS pEH010 5743 bp DNA circular",
            "circular",
            "DNA",
            None,
            [BiopythonParserWarning],
        ),
        # This is a test of the format > 80 chars long
        (
            "LOCUS AZZZAA02123456789 1000000000 bp DNA linear PRI 15-OCT-2018",
            "linear",
            "DNA",
            "PRI",
            None,
        ),
    ]
    for locus_line, want_topology, want_mol_type, want_division, want_warnings in cases:
        with warnings.catch_warnings(record=True) as recorded:
            # Record every warning, even ones already shown earlier in the run.
            warnings.simplefilter("always")
            scanner = Scanner.GenBankScanner()
            consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
            scanner._feed_first_line(consumer, locus_line)
            annotations = consumer.data.annotations
            # (annotation key, expected value, failure message template)
            checks = (
                ("topology", want_topology, "Wrong topology %r not %r from %r"),
                (
                    "molecule_type",
                    want_mol_type,
                    "Wrong molecule_type %r not %r from %r",
                ),
                (
                    "data_file_division",
                    want_division,
                    "Wrong division %r not %r from %r",
                ),
            )
            for key, expected, template in checks:
                found = annotations.get(key)
                self.assertEqual(
                    found, expected, template % (found, expected, locus_line)
                )
            # ``None`` stands for "no warnings expected".
            expected_categories = [] if want_warnings is None else want_warnings
            self.assertEqual(len(recorded), len(expected_categories))
            for seen, category in zip(recorded, expected_categories):
                self.assertEqual(seen.category, category)
def test_topology_embl(self):
    """Check EMBL ID line parsing.

    Only the ID line is fed to the EMBL scanner.  The ``topology``,
    ``molecule_type`` and ``data_file_division`` annotations on the resulting
    record are compared with the expected values; an expected ``None``
    matches an annotation that was never set.
    """
    # This is a bit low level, but lets us test parsing of the ID line only.
    # (ID line, expected topology, expected molecule type, expected division)
    cases = [
        # Modern examples with sequence version
        (
            "ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.",
            "linear",
            "mRNA",
            "PLN",
        ),
        (
            "ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.",
            "linear",
            "genomic DNA",
            "MAM",
        ),
        # Example to match GenBank example used above:
        (
            "ID U49845; SV 1; linear; genomic DNA; STD; FUN; 5028 BP.",
            "linear",
            "genomic DNA",
            "FUN",
        ),
        # Old examples:
        (
            "ID BSUB9999 standard; circular DNA; PRO; 4214630 BP.",
            "circular",
            "DNA",
            "PRO",
        ),
        ("ID SC10H5 standard; DNA; PRO; 4870 BP.", None, "DNA", "PRO"),
        # Patent example from 2016-06-10
        # ftp://ftp.ebi.ac.uk/pub/databases/embl/patent/
        (
            "ID A01679; SV 1; linear; unassigned DNA; PAT; MUS; 12 BP.",
            "linear",
            "unassigned DNA",
            "MUS",
        ),
        # Old patent examples
        ("ID NRP_AX000635; PRT; NR1; 15 SQ", None, None, "NR1"),
        ("ID NRP0000016E; PRT; NR2; 5 SQ", None, None, "NR2"),
        # KIPO patent examples
        ("ID DI500001 STANDARD; PRT; 111 AA.", None, None, None),
        ("ID DI644510 standard; PRT; 1852 AA.", None, None, None),
    ]
    for id_line, want_topology, want_mol_type, want_division in cases:
        scanner = Scanner.EmblScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, id_line)
        annotations = consumer.data.annotations
        # (annotation key, expected value, failure message template)
        checks = (
            ("topology", want_topology, "Wrong topology %r not %r from %r"),
            (
                "molecule_type",
                want_mol_type,
                "Wrong molecule_type %r not %r from %r",
            ),
            (
                "data_file_division",
                want_division,
                "Wrong division %r not %r from %r",
            ),
        )
        for key, expected, template in checks:
            found = annotations.get(key)
            self.assertEqual(
                found, expected, template % (found, expected, id_line)
            )
def test_topology_embl(self):
    """Check EMBL ID line parsing.

    Each ID line goes through a fresh ``EmblScanner`` and
    ``_FeatureConsumer``.  The parsed record's ``topology``,
    ``molecule_type`` and ``data_file_division`` annotations must then equal
    the expected values (``None`` matches a missing annotation).
    """
    # This is a bit low level, but lets us test parsing of the ID line only.
    # (ID line, expected topology, expected molecule type, expected division)
    cases = [
        # Modern examples with sequence version
        (
            "ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.",
            "linear",
            "mRNA",
            "PLN",
        ),
        (
            "ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.",
            "linear",
            "genomic DNA",
            "MAM",
        ),
        # Example to match GenBank example used above:
        (
            "ID U49845; SV 1; linear; genomic DNA; STD; FUN; 5028 BP.",
            "linear",
            "genomic DNA",
            "FUN",
        ),
        # Old examples:
        (
            "ID BSUB9999 standard; circular DNA; PRO; 4214630 BP.",
            "circular",
            "DNA",
            "PRO",
        ),
        ("ID SC10H5 standard; DNA; PRO; 4870 BP.", None, "DNA", "PRO"),
        # Patent example from 2016-06-10
        # ftp://ftp.ebi.ac.uk/pub/databases/embl/patent/
        (
            "ID A01679; SV 1; linear; unassigned DNA; PAT; MUS; 12 BP.",
            "linear",
            "unassigned DNA",
            "MUS",
        ),
        # Old patent examples
        ("ID NRP_AX000635; PRT; NR1; 15 SQ", None, None, "NR1"),
        ("ID NRP0000016E; PRT; NR2; 5 SQ", None, None, "NR2"),
        # KIPO patent examples
        ("ID DI500001 STANDARD; PRT; 111 AA.", None, None, None),
        ("ID DI644510 standard; PRT; 1852 AA.", None, None, None),
    ]
    for id_line, want_topology, want_mol_type, want_division in cases:
        scanner = Scanner.EmblScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, id_line)
        annotations = consumer.data.annotations

        topology = annotations.get("topology")
        message = "Wrong topology %r not %r from %r" % (
            topology,
            want_topology,
            id_line,
        )
        self.assertEqual(topology, want_topology, message)

        molecule_type = annotations.get("molecule_type")
        message = "Wrong molecule_type %r not %r from %r" % (
            molecule_type,
            want_mol_type,
            id_line,
        )
        self.assertEqual(molecule_type, want_mol_type, message)

        division = annotations.get("data_file_division")
        message = "Wrong division %r not %r from %r" % (
            division,
            want_division,
            id_line,
        )
        self.assertEqual(division, want_division, message)
def test_first_line_imgt(self):
    """Check IMGT ID line parsing.

    Only the ID line is fed to the IMGT scanner.  The ``topology``,
    ``molecule_type`` and ``data_file_division`` annotations on the resulting
    record are compared with the expected values; an expected ``None``
    matches an annotation that was never set.
    """
    # This is a bit low level, but lets us test parsing of the ID line only.
    # (ID line, expected topology, expected molecule type, expected division)
    cases = [
        ("ID HLA00001 standard; DNA; HUM; 3503 BP.", None, "DNA", "HUM"),
        ("ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP.", None, "DNA", "HUM"),
    ]
    for id_line, want_topology, want_mol_type, want_division in cases:
        scanner = Scanner._ImgtScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, id_line)
        annotations = consumer.data.annotations
        # (annotation key, expected value, failure message template)
        checks = (
            ("topology", want_topology, "Wrong topology %r not %r from %r"),
            (
                "molecule_type",
                want_mol_type,
                "Wrong molecule_type %r not %r from %r",
            ),
            (
                "data_file_division",
                want_division,
                "Wrong division %r not %r from %r",
            ),
        )
        for key, expected, template in checks:
            found = annotations.get(key)
            self.assertEqual(
                found, expected, template % (found, expected, id_line)
            )
def test_first_line_imgt(self):
    """Check IMGT ID line parsing.

    Each ID line goes through a fresh ``_ImgtScanner`` and
    ``_FeatureConsumer``.  The parsed record's ``topology``,
    ``molecule_type`` and ``data_file_division`` annotations must then equal
    the expected values (``None`` matches a missing annotation).
    """
    # This is a bit low level, but lets us test parsing of the ID line only.
    # (ID line, expected topology, expected molecule type, expected division)
    cases = [
        ("ID HLA00001 standard; DNA; HUM; 3503 BP.", None, "DNA", "HUM"),
        ("ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP.", None, "DNA", "HUM"),
    ]
    for id_line, want_topology, want_mol_type, want_division in cases:
        scanner = Scanner._ImgtScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, id_line)
        annotations = consumer.data.annotations

        topology = annotations.get("topology")
        message = "Wrong topology %r not %r from %r" % (
            topology,
            want_topology,
            id_line,
        )
        self.assertEqual(topology, want_topology, message)

        molecule_type = annotations.get("molecule_type")
        message = "Wrong molecule_type %r not %r from %r" % (
            molecule_type,
            want_mol_type,
            id_line,
        )
        self.assertEqual(molecule_type, want_mol_type, message)

        division = annotations.get("data_file_division")
        message = "Wrong division %r not %r from %r" % (
            division,
            want_division,
            id_line,
        )
        self.assertEqual(division, want_division, message)