Exemplo n.º 1
0
 def test_topology_genbank(self):
     """Check the annotations parsed from bare GenBank LOCUS lines.

     Every case pairs a LOCUS line with the expected topology,
     molecule_type and data_file_division annotation values. None means
     no value is expected (the lookup falls back to None).
     """
     # Deliberately low level: only the LOCUS line is parsed, no full record.
     cases = [
         ("LOCUS       U00096", None, None, None),
         # This example is actually fungal, accession U49845 from
         # Saccharomyces cerevisiae:
         ("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
          None, "DNA", "PLN"),
         ("LOCUS       AB070938                6497 bp    DNA     linear   BCT 11-OCT-2001",
          "linear", "DNA", "BCT"),
         ("LOCUS       NC_005816               9609 bp    DNA     circular BCT 21-JUL-2008",
          "circular", "DNA", "BCT"),
         ("LOCUS       SCX3_BUTOC                64 aa            linear   INV 16-OCT-2001",
          "linear", None, "INV"),
     ]
     for locus_line, want_topology, want_mol_type, want_division in cases:
         scanner = Scanner.GenBankScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, locus_line)
         # (annotation key, expected value, failure message template)
         checks = (
             ("topology", want_topology,
              "Wrong topology %r not %r from %r"),
             ("molecule_type", want_mol_type,
              "Wrong molecule_type %r not %r from %r"),
             ("data_file_division", want_division,
              "Wrong division %r not %r from %r"),
         )
         for key, expected, template in checks:
             found = consumer.data.annotations.get(key)
             self.assertEqual(
                 found, expected, template % (found, expected, locus_line))
Exemplo n.º 2
0
 def test_topology_genbank(self):
     """Check annotations and warnings produced by GenBank LOCUS lines.

     Every case pairs a LOCUS line with the expected topology,
     molecule_type and data_file_division annotation values (None means
     no value is expected), followed by the warning categories expected
     while parsing, in order, or None when no warning should be raised.
     """
     # Deliberately low level: only the LOCUS line is parsed, no full record.
     cases = [
         ("LOCUS       U00096",
          None, None, None, None),
         # This example is actually fungal, accession U49845 from
         # Saccharomyces cerevisiae:
         ("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
          None, "DNA", "PLN", None),
         ("LOCUS       AB070938                6497 bp    DNA     linear   BCT 11-OCT-2001",
          "linear", "DNA", "BCT", None),
         ("LOCUS       NC_005816               9609 bp    DNA     circular BCT 21-JUL-2008",
          "circular", "DNA", "BCT", None),
         ("LOCUS       SCX3_BUTOC                64 aa            linear   INV 16-OCT-2001",
          "linear", None, "INV", None),
         ("LOCUS       pEH010                  5743 bp    DNA     circular",
          "circular", "DNA", None, [BiopythonParserWarning]),
         # This is a test of the format > 80 chars long
         ("LOCUS       AZZZAA02123456789 1000000000 bp    DNA     linear   PRI 15-OCT-2018",
          "linear", "DNA", "PRI", None)
     ]
     for locus_line, want_topology, want_mol_type, want_division, want_warnings in cases:
         with warnings.catch_warnings(record=True) as recorded:
             # Record every warning, including repeats of the same one.
             warnings.simplefilter("always")
             scanner = Scanner.GenBankScanner()
             consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
             scanner._feed_first_line(consumer, locus_line)
             # (annotation key, expected value, failure message template)
             checks = (
                 ("topology", want_topology,
                  "Wrong topology %r not %r from %r"),
                 ("molecule_type", want_mol_type,
                  "Wrong molecule_type %r not %r from %r"),
                 ("data_file_division", want_division,
                  "Wrong division %r not %r from %r"),
             )
             for key, expected, template in checks:
                 found = consumer.data.annotations.get(key)
                 self.assertEqual(
                     found, expected, template % (found, expected, locus_line))
             # None stands for "no warnings expected", i.e. an empty list.
             categories = [] if want_warnings is None else want_warnings
             self.assertEqual(len(recorded), len(categories))
             for entry, category in zip(recorded, categories):
                 self.assertEqual(entry.category, category)
Exemplo n.º 3
0
class GenBankScannerTests(unittest.TestCase):
    """Scanner tests that parse sample GenBank (gb/gbk) and EMBL files."""

    # Scanner instance used by the GenBank helper methods below.
    gb_s = Scanner.GenBankScanner()

    def gb_to_l_cds_f(self, filename, tags2id=None):
        """Return the CDS features parsed from a GenBank file as a list.

        ``tags2id`` is forwarded to the scanner only when it is truthy;
        otherwise the scanner is called without it.
        """
        extra = {"tags2id": tags2id} if tags2id else {}
        with open(filename) as handle:
            return list(self.gb_s.parse_cds_features(handle, **extra))

    def gb_to_l_r(self, filename, do_features=False):
        """Return the records parsed from a GenBank file as a list."""
        with open(filename) as handle:
            records = self.gb_s.parse_records(handle, do_features=do_features)
            return list(records)

    def test_genbank_cds_interaction(self):
        """Test CDS interaction, parse CDS features on gb(k) files."""
        id_tags = ("gene", "locus_tag", "product")

        # NC_000932.gb without tags2id: 85 CDS entries are expected.
        chloroplast_cds = self.gb_to_l_cds_f("GenBank/NC_000932.gb")
        self.assertEqual(len(chloroplast_cds), 85)
        # IDs of the first and last entries
        self.assertEqual(chloroplast_cds[0].id, "NP_051037.1")
        self.assertEqual(chloroplast_cds[84].id, "NP_051123.1")

        # NC_005816.gb with tags2id: 10 CDS entries are expected.
        plasmid_cds = self.gb_to_l_cds_f("GenBank/NC_005816.gb", tags2id=id_tags)
        self.assertEqual(len(plasmid_cds), 10)
        # ID and name of the first entry
        self.assertEqual(plasmid_cds[0].id, "<unknown id>")
        self.assertEqual(plasmid_cds[0].name, "YP_pPCP01")

        # Both files parsed with tags2id and concatenated: 95 entries expected.
        tagged_chloroplast = self.gb_to_l_cds_f("GenBank/NC_000932.gb",
                                                tags2id=id_tags)
        tagged_plasmid = self.gb_to_l_cds_f("GenBank/NC_005816.gb",
                                            tags2id=id_tags)
        combined = tagged_chloroplast + tagged_plasmid
        self.assertEqual(len(combined), 95)
        # ID and description of the first and last entries
        self.assertEqual(combined[0].id, "rps12")
        self.assertEqual(combined[0].description, "ribosomal protein S12")
        self.assertEqual(combined[94].id, "<unknown id>")
        self.assertEqual(combined[94].description, "hypothetical protein")

    def test_genbank_interaction(self):
        """Test GenBank records interaction on gbk files."""
        plasmid_description = ("Yersinia pestis biovar Microtus str. 91001 "
                               "plasmid pPCP1, complete sequence")
        chloroplast_description = ("Arabidopsis thaliana chloroplast, "
                                   "complete genome")
        # (file, do_features, record id, record name, description, features)
        cases = [
            ("GenBank/NC_005816.gb", False,
             "NC_005816.1", "NC_005816", plasmid_description, 0),
            ("GenBank/NC_005816.gb", True,
             "NC_005816.1", "NC_005816", plasmid_description, 41),
            ("GenBank/NC_000932.gb", False,
             "NC_000932.1", "NC_000932", chloroplast_description, 0),
            ("GenBank/NC_000932.gb", True,
             "NC_000932.1", "NC_000932", chloroplast_description, 259),
        ]
        for path, with_features, rec_id, rec_name, rec_desc, n_features in cases:
            parsed = self.gb_to_l_r(path, do_features=with_features)
            # Each file is expected to yield exactly one record.
            self.assertEqual(len(parsed), 1)
            record = parsed[0]
            self.assertEqual(record.id, rec_id)
            self.assertEqual(record.name, rec_name)
            self.assertEqual(record.description, rec_desc)
            self.assertEqual(len(record.features), n_features)

    def test_embl_cds_interaction(self):
        """Test EMBL CDS interaction, parse CDS features on embl files."""
        scanner = Scanner.EmblScanner()

        # Parse the CDS features of the EMBL sample file.
        with open("EMBL/AE017046.embl") as handle:
            cds_entries = list(scanner.parse_cds_features(handle))
        # 10 CDS entries are expected.
        self.assertEqual(len(cds_entries), 10)
        # ID and description of the first entry
        first_cds = cds_entries[0]
        self.assertEqual(first_cds.id, "AAS58758.1")
        self.assertEqual(first_cds.description, "putative transposase")

    def test_embl_record_interaction(self):
        """Test EMBL Record interaction on embl files."""
        scanner = Scanner.EmblScanner()

        # Parse the records of the EMBL sample file, features included.
        with open("EMBL/AE017046.embl") as handle:
            parsed = list(scanner.parse_records(handle, do_features=True))
        # Exactly one record is expected.
        self.assertEqual(len(parsed), 1)
        record = parsed[0]
        self.assertEqual(record.id, "AE017046.1")
        self.assertEqual(record.description,
                         "Yersinia pestis biovar Microtus str. 91001 plasmid "
                         "pPCP1, complete sequence.")
        self.assertEqual(len(record.features), 29)