Пример #1
0
 def test_parse_fasta_no_header(self):
     """Test parsing a fasta record without header.

     The default parse is expected to yield no records for the header-less
     file, while the robust parse is expected to yield exactly one.
     """
     # the 'U' mode flag was removed in Python 3.11 (open() raises ValueError);
     # plain text mode already applies universal newlines on Python 3
     with open(get_file_path('no_header.fasta'), 'r') as h:
         # plain BioPython parsing should fail
         records = list(seqio.parse(h))
         self.assertEqual(0, len(records))
         # rewind so the second parse reads the file from the beginning
         h.seek(0)
         # robust parsing should work
         records = list(seqio.parse(h, robust=True))
         self.assertEqual(1, len(records))
Пример #2
0
 def test_parse_fasta_no_header(self):
     """Test parsing a fasta record without header.

     The default parse is expected to yield no records for the header-less
     file, while the robust parse is expected to yield exactly one.
     """
     # the 'U' mode flag was removed in Python 3.11 (open() raises ValueError);
     # plain text mode already applies universal newlines on Python 3
     with open(get_file_path('no_header.fasta'), 'r') as h:
         # plain BioPython parsing should fail
         records = list(seqio.parse(h))
         self.assertEqual(0, len(records))
         # rewind so the second parse reads the file from the beginning
         h.seek(0)
         # robust parsing should work
         records = list(seqio.parse(h, robust=True))
         self.assertEqual(1, len(records))
Пример #3
0
    def test_prepeptide_adjustment(self):
        """ Check that the leader/tail location qualifiers of a prepeptide
            motif are shifted when a region is written to its own genbank file
        """
        record = Record(Seq("A"*400, generic_dna))
        sub = DummySubRegion(start=100, end=300)
        record.add_subregion(sub)
        region = Region(subregions=[sub])
        record.add_region(region)

        prepeptide = DummyFeature(200, 230, 1, "CDS_motif")
        # cover both location kinds: a simple FeatureLocation for the leader
        # and a CompoundLocation for the tail
        leader_location = FeatureLocation(200, 210, 1)
        tail_parts = [FeatureLocation(220, 223, -1), FeatureLocation(227, 230, -1)]
        tail_location = CompoundLocation(tail_parts)
        prepeptide._qualifiers["leader_location"] = [str(leader_location)]
        prepeptide._qualifiers["tail_location"] = [str(tail_location)]
        record.add_feature(prepeptide)
        # a CDS_motif lacking both qualifiers (e.g. NRPS/PKS motif) must not break anything
        record.add_feature(DummyFeature(250, 280, 1, "CDS_motif"))

        with NamedTemporaryFile(suffix=".gbk") as handle:
            region.write_to_genbank(handle.name)
            parsed = list(seqio.parse(handle.name))
            bio = parsed[0]
        assert len(bio.features) == 4

        seen_prepeptide = False
        for feature in bio.features:
            leader = feature.qualifiers.get("leader_location")
            tail = feature.qualifiers.get("tail_location")
            if not (tail and leader):
                continue
            # both part locations are expected 100 bases earlier than in the source record
            assert leader == ["[100:110](+)"]
            assert tail == ["join{[120:123](-), [127:130](-)}"]
            seen_prepeptide = True
        assert seen_prepeptide, "prepeptide feature missing in conversion"
Пример #4
0
def _strict_parse(filename: str) -> List[SeqRecord]:
    """ Parses the input record with extra wrappers to catch biopython warnings
        as errors.

        Arguments:
            filename: the name of the file to parse

        Returns:
            a list of SeqRecords parsed

        Raises:
            AntismashInputError: if parsing raises any exception, including
                                 the warnings promoted to errors here
    """
    filter_messages = [
        r".*invalid location.*",
        r".*Expected sequence length.*",
        r".*Couldn't parse feature location.*",
    ]
    try:
        # catch_warnings snapshots the warning filters on entry and restores
        # them on exit, so the temporary filters can't leak and no unrelated
        # filters are dropped (slicing the filter list by count removes the
        # wrong entries whenever an identical filter was already registered)
        with warnings.catch_warnings():
            # raise exceptions on certain warning messages
            for pattern in filter_messages:
                warnings.filterwarnings("error", message=pattern)
            records = list(seqio.parse(filename))
    except Exception as err:
        message = str(err)
        # strip the "Ignoring" part, since it's not being ignored
        if message.startswith("Ignoring invalid location"):
            message = message[len("Ignoring "):]
        logging.error('Parsing %r failed: %s', filename, message)
        raise AntismashInputError(message) from err
    return records
Пример #5
0
 def test_genbank(self):
     """Write a region to a genbank file, read it back and compare the
     rebuilt region with the one that was written.
     """
     source_record = Record(Seq("A" * 100, generic_dna))
     first = create_cluster(3, 20, "prodA")
     second = create_cluster(25, 41, "prodB")
     clusters = [first, second]
     for cluster in clusters:
         source_record.add_cluster(cluster)
     sub = SubRegion(FeatureLocation(35, 71), "test", 0.7)
     source_record.add_subregion(sub)
     supercluster = SuperCluster(SuperCluster.kinds.NEIGHBOURING, clusters)
     source_record.add_supercluster(supercluster)
     region = Region(superclusters=[supercluster], subregions=[sub])
     source_record.add_region(region)

     with NamedTemporaryFile(suffix=".gbk") as handle:
         region.write_to_genbank(handle.name)
         parsed = list(seqio.parse(handle.name))
     assert len(parsed) == 1

     rebuilt_record = Record.from_biopython(parsed[0], taxon="bacteria")
     assert len(rebuilt_record.get_regions()) == 1
     rebuilt = rebuilt_record.get_region(0)
     # coordinates in the written file are relative to the region's start
     offset = region.location.start
     assert rebuilt.location.start == 3 - offset
     assert rebuilt.location.end == 71 - offset
     assert rebuilt.products == region.products
     assert rebuilt.probabilities == region.probabilities
Пример #6
0
def parse_input_sequence(filename: str,
                         taxon: str = "bacteria",
                         minimum_length: int = -1,
                         start: int = -1,
                         end: int = -1) -> List[Record]:
    """ Reads all records from a sequence file and converts them to secmet
        records, optionally filtering by length and trimming

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records shorter than this are dropped unless they
                            carry a 'contig', 'wgs_scafld' or 'wgs' annotation,
                            if less than 1 then no records are dropped
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all

        Returns:
            A list of secmet.Record instances, one for each record kept

        Raises:
            TypeError: if minimum_length is not an int
            AntismashInputError: if parsing fails, no records remain after
                                 filtering, a record is protein, or the
                                 conversion to a secmet record fails
            ValueError: if trimming is requested with multiple records
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    try:
        parsed = list(seqio.parse(filename))
    except Exception as err:
        logging.error('Parsing %r failed: %s', filename, err)
        raise AntismashInputError(str(err)) from err

    # keep records that are long enough or that are annotated as
    # contig/WGS entries
    records = [
        candidate for candidate in parsed
        if minimum_length < 1
        or len(candidate.seq) >= minimum_length
        or 'contig' in candidate.annotations
        or 'wgs_scafld' in candidate.annotations
        or 'wgs' in candidate.annotations
    ]  # type: List[SeqRecord]

    # nothing left to work with is an input problem
    if not records:
        raise AntismashInputError("no valid records found in file %r" %
                                  filename)

    if any(isinstance(entry.seq.alphabet, Bio.Alphabet.ProteinAlphabet)
           for entry in records):
        raise AntismashInputError("protein records are not supported")

    # trimming happens on the biopython records, before conversion
    trim_requested = start > -1 or end > -1
    if trim_requested and len(records) > 1:
        raise ValueError(
            "--start and --end options cannot be used with multiple records"
        )
    if trim_requested:
        only_record = records[0]
        trim_start = max(start, 0)
        trim_end = min(len(only_record), end)
        records[0] = trim_sequence(only_record, trim_start, trim_end)

    try:
        return [Record.from_biopython(entry, taxon) for entry in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err
Пример #7
0
def parse_input_sequence(filename: str,
                         taxon: str = "bacteria",
                         minimum_length: int = -1,
                         start: int = -1,
                         end: int = -1) -> List[Record]:
    """ Parse input records contained in a file

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all

        Returns:
            A list of secmet.Record instances, one for each record kept

        Raises:
            TypeError: if minimum_length is not an int
            ValueError: if the file does not exist, if trimming is requested
                        with multiple records, or if trimming is requested but
                        no record passed the length filter
            RuntimeError: if no records could be read from the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    records = []
    if not os.path.exists(filename):
        msg = "Sequence file not found: %r" % filename
        logging.error(msg)
        raise ValueError(msg)

    try:
        record_list = list(seqio.parse(filename))
        if not record_list:
            raise RuntimeError('No records could be read from file %r' %
                               filename)
        # keep records that are long enough or that are annotated as
        # contig/WGS entries
        for record in record_list:
            if minimum_length < 1 \
                    or len(record.seq) >= minimum_length \
                    or 'contig' in record.annotations \
                    or 'wgs_scafld' in record.annotations \
                    or 'wgs' in record.annotations:
                records.append(record)
    except (ValueError, AssertionError) as err:
        logging.error('Parsing %r failed: %s', filename, err)
        raise
    except Exception as err:
        logging.error('Parsing %r failed with unhandled exception: %s',
                      filename, err)
        raise

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        # the length filter may have discarded every record, in which case
        # indexing records[0] would fail with an uninformative IndexError
        if not records:
            raise ValueError(
                "--start and --end options cannot be used: no records in %r "
                "have at least the minimum length of %d" % (filename, minimum_length)
            )
        if len(records) > 1:
            raise ValueError(
                "--start and --end options cannot be used with multiple records"
            )
        records[0] = trim_sequence(records[0], max(start, 0),
                                   min(len(records[0]), end))
    return [Record.from_biopython(record, taxon) for record in records]
Пример #8
0
 def test_parse_genbank(self):
     """Test parsing a gzipped GenBank file from an open binary handle."""
     gz_path = get_file_path('melanin.gbk.gz')
     with open(gz_path, 'rb') as handle:
         parsed = list(seqio.parse(handle))
     # the file is expected to hold exactly one record
     self.assertEqual(1, len(parsed))
Пример #9
0
 def test_parse_genbank(self):
     """Test parsing a gzipped GenBank file from an open binary handle."""
     gz_path = get_file_path('melanin.gbk.gz')
     with open(gz_path, 'rb') as handle:
         parsed = list(seqio.parse(handle))
     # the file is expected to hold exactly one record
     self.assertEqual(1, len(parsed))
Пример #10
0
 def test_parse_genbank_path(self):
     """Test parsing a gzipped GenBank file given as a path, not a handle."""
     parsed = list(seqio.parse(get_file_path('melanin.gbk.gz')))
     # the file is expected to hold exactly one record
     self.assertEqual(1, len(parsed))
Пример #11
0
 def test_parse_fasta_valid(self):
     """Test parsing a valid fasta record."""
     # the 'U' mode flag was removed in Python 3.11 (open() raises ValueError);
     # plain text mode already applies universal newlines on Python 3
     with open(get_file_path('melanin.fasta'), 'r') as h:
         records = list(seqio.parse(h))
     # the file is expected to hold exactly one record
     self.assertEqual(1, len(records))
Пример #12
0
 def test_parse_calls_biopython(self):
     """Test that parsing hands over to the Bio.SeqIO parser."""
     # stub out Bio.SeqIO.parse, recording calls to it in self.tt
     mock("Bio.SeqIO.parse", tracker=self.tt, returns=[])
     seqio.parse(self.handle)
     # no seqtype was passed; the recorded call is expected to use 'genbank'
     wanted = "    Called Bio.SeqIO.parse(DummyHandle('test.gbk'), 'genbank')"
     assert_same_trace(self.tt, wanted)
Пример #13
0
 def test_parse_genbank_valid(self):
     """Test parsing a valid genbank record."""
     # the 'U' mode flag was removed in Python 3.11 (open() raises ValueError);
     # plain text mode already applies universal newlines on Python 3
     with open(get_file_path('melanin.gbk'), 'r') as h:
         records = list(seqio.parse(h))
     # the file is expected to hold exactly one record
     self.assertEqual(1, len(records))
Пример #14
0
 def test_parse_calls_biopython(self):
     """Test that parsing hands over to the Bio.SeqIO parser."""
     # stub out Bio.SeqIO.parse, recording calls to it in self.tt
     mock("Bio.SeqIO.parse", tracker=self.tt, returns=[])
     seqio.parse(self.handle)
     # no seqtype was passed; the recorded call is expected to use 'genbank'
     wanted = "    Called Bio.SeqIO.parse(DummyHandle('test.gbk'), 'genbank')"
     assert_same_trace(self.tt, wanted)
Пример #15
0
 def test_parse_seqtype(self):
     """Test that an explicit seqtype is passed on to the Bio.SeqIO parser."""
     # stub out Bio.SeqIO.parse, recording calls to it in self.tt
     mock("Bio.SeqIO.parse", tracker=self.tt, returns=[])
     seqio.parse(self.handle, 'embl')
     # the explicit 'embl' seqtype is expected in the recorded call
     wanted = "    Called Bio.SeqIO.parse(DummyHandle('test.gbk'), 'embl')"
     assert_same_trace(self.tt, wanted)
Пример #16
0
 def test_parse_genbank_valid(self):
     """Test parsing a valid genbank record."""
     # the 'U' mode flag was removed in Python 3.11 (open() raises ValueError);
     # plain text mode already applies universal newlines on Python 3
     with open(get_file_path('melanin.gbk'), 'r') as h:
         records = list(seqio.parse(h))
     # the file is expected to hold exactly one record
     self.assertEqual(1, len(records))
Пример #17
0
 def test_parse_fasta_valid(self):
     """Test parsing a valid fasta record."""
     # the 'U' mode flag was removed in Python 3.11 (open() raises ValueError);
     # plain text mode already applies universal newlines on Python 3
     with open(get_file_path('melanin.fasta'), 'r') as h:
         records = list(seqio.parse(h))
     # the file is expected to hold exactly one record
     self.assertEqual(1, len(records))
Пример #18
0
 def test_parse_genbank_path(self):
     """Test parsing a gzipped GenBank file given as a path, not a handle."""
     parsed = list(seqio.parse(get_file_path('melanin.gbk.gz')))
     # the file is expected to hold exactly one record
     self.assertEqual(1, len(parsed))
Пример #19
0
 def test_parse_seqtype(self):
     """Test that an explicit seqtype is passed on to the Bio.SeqIO parser."""
     # stub out Bio.SeqIO.parse, recording calls to it in self.tt
     mock("Bio.SeqIO.parse", tracker=self.tt, returns=[])
     seqio.parse(self.handle, 'embl')
     # the explicit 'embl' seqtype is expected in the recorded call
     wanted = "    Called Bio.SeqIO.parse(DummyHandle('test.gbk'), 'embl')"
     assert_same_trace(self.tt, wanted)