Exemplo n.º 1
0
def dna_records():
    """Build five sample DNA records labelled ``DNA1`` to ``DNA5``.

    Every record wraps a ``SeqEM2.dna`` sequence and uses the same label for
    both its ``id`` and its ``name``.

    :return: a list of SeqRecordEM2 objects, ordered DNA1 .. DNA5
    """
    labelled_bases = (
        ('DNA1', 'ACAGTACCATGTAA'),
        ('DNA2', 'ACAG'),
        ('DNA3', 'ACAGTA'),
        ('DNA4', 'ACAGTACCAT'),
        ('DNA5', 'ACAGTACCATGT'),
    )
    records = []
    for label, bases in labelled_bases:
        records.append(SeqRecordEM2(SeqEM2.dna(bases), id=label, name=label))
    return records
Exemplo n.º 2
0
def test_get_orfs(dna_rec1_overlap_rec2):
    """Check ``get_orfs`` with its default start and with ``start=None``.

    :param dna_rec1_overlap_rec2: a record whose ``seq`` exposes ``get_orfs``
    """
    merged_seq = dna_rec1_overlap_rec2.seq
    default_orfs = {(0, 9), (14, 29), (18, 36), (35, 44), (22, 58), (49, 58), (53, 59)}

    # The default call is made twice and must give the same set both times.
    assert merged_seq.get_orfs() == default_orfs
    assert merged_seq.get_orfs() == default_orfs

    any_start_orfs = {(0, 9), (2, 29), (12, 36), (32, 44), (4, 58), (47, 59), (39, 60)}
    assert merged_seq.get_orfs(start=None) == any_start_orfs

    short_bases = 'ACAGTACCAT'
    assert SeqEM2.dna(short_bases).get_orfs(start=None) == {(0, 9), (1, 10), (2, 8)}
    # With the default start the short sequence is expected to yield no ORF.
    assert SeqEM2.dna(short_bases).get_orfs() == set()
Exemplo n.º 3
0
    def join(self, other=None, offset=0, keepself=True):
        """
        Join two SeqRecordEM2 objects into a new record holding the merged sequence.

        ``offset`` is the distance from the end of ``self`` to the start of ``other``:

        * ``offset >= 0``: the sequences are concatenated with ``offset`` copies of
          ``self.seq.any_residue`` inserted between them;
        * ``offset < 0``: ``other`` starts ``-offset`` residues before the end of
          ``self``, so the two sequences overlap;
        * ``len(self.seq) + offset < 0``: ``other`` starts before ``self``, so the call
          is delegated to ``other.join(self, ...)`` with the roles and ``keepself``
          swapped.

        Only the residues covered by both sequences are compared. When ``other`` lies
        entirely inside ``self``, the part of ``self`` located after ``other`` is always
        kept, so the merged length does not depend on ``keepself``.

        The new record id is ``self.id + '_' + other.id``. Features of ``self`` are
        copied with their location, features of ``other`` are shifted by
        ``len(self.seq) + offset``; only location, strand and id are carried over.

        :param other: the other SeqRecordEM2 object. Despite the ``None`` default it
            must be supplied, because its ``seq`` is always read.
        :param offset: the offset of the two sequences. If the value is negative, then
            the two sequences overlap.
        :param keepself: if True and overlapping subsequences are different, then keep
            the sequence from self record, otherwise keep the sequence of other record.
        :return: the result of merging records as a new SeqRecordEM2 object
        :raises ValueError: if one sequence is a protein and the other one is not
        """
        if len(self.seq) + offset < 0:
            # other begins before self: let other be the left-hand record instead.
            return other.join(self,
                              offset=-len(self.seq) - len(other.seq) - offset,
                              keepself=not keepself)

        self_str = str(self.seq)
        other_str = str(other.seq)

        if offset >= 0:
            new_seq = self_str + self.seq.any_residue * offset + other_str
        else:
            # Index in self where other begins (>= 0 thanks to the guard above).
            start = len(self_str) + offset
            # other may end before self does, so the shared region can be shorter
            # than -offset.
            overlap = min(-offset, len(other_str))
            if self_str[start:start + overlap] != other_str[:overlap]:
                warnings.warn(
                    'Warning!!! Overlapping subsequences are different.')
            if keepself is True:
                # other_str[-offset:] is empty when other is contained in self.
                new_seq = self_str + other_str[-offset:]
            else:
                # Re-append whatever self has after the end of other (empty when
                # other reaches or passes the end of self).
                new_seq = (self_str[:start] + other_str +
                           self_str[start + len(other_str):])

        if self.seq.is_protein() and other.seq.is_protein():
            new_record = SeqRecordEM2(SeqEM2.protein(new_seq))
        elif not (self.seq.is_protein() or other.seq.is_protein()):
            new_record = SeqRecordEM2(SeqEM2.dna(new_seq))
        else:
            raise ValueError(
                'Sequences are not of the same type. It is impossible to join them.'
            )

        new_record.id = self.id + '_' + other.id

        for feature in self.features:
            new_record.add_feature(location=feature.location,
                                   strand=feature.strand,
                                   id=feature.id)

        # Features of other move by the position at which other starts in the result.
        shift = len(self.seq) + offset
        for feature in other.features:
            new_record.add_feature(location=FeatureLocation(
                feature.location.start + shift,
                feature.location.end + shift),
                                   strand=feature.strand,
                                   id=feature.id)
        return new_record
Exemplo n.º 4
0
def dna_seq(name='rec'):
    """Return one of the named sample DNA sequences as a ``SeqEM2.dna`` object.

    :param name: key of the wanted sequence; an unknown key raises ``KeyError``
    :return: the SeqEM2 DNA sequence registered under ``name``
    """
    sample_bases = {
        'rec': 'ATGAGTCGGTAAAGATGCATGCGCGCCGCTGACGC',
        'rec1': 'ATGAGTCGGTAACGATGCATGCATGCAGCTGACGC',
        'rec2': 'CAGCTGACGCATGAGTCGGTAACGATGCATGCATG',
        'rec3': 'CACCTGACGCATGAGTCGGTAACGATGCATGCATG',
        'rec1NNNrec2': 'ATGAGTCGGTAACGATGCATGCATGCAGCTGACGCNNNCAGCTGACGCATGAGTCGGTAACGATGCATGCATG',
        'rec1_overlap_rec2/rec3': 'ATGAGTCGGTAACGATGCATGCATGCAGCTGACGCATGAGTCGGTAACGATGCATGCATG',
        'rec1_overlp_rec3_kpfls': 'ATGAGTCGGTAACGATGCATGCATGCACCTGACGCATGAGTCGGTAACGATGCATGCATG',
    }
    chosen = sample_bases[name]
    return SeqEM2.dna(chosen)
Exemplo n.º 5
0
    def reverse_complement(self,
                           id=False,
                           name=False,
                           description=False,
                           features=True,
                           annotations=False,
                           letter_annotations=True,
                           dbxrefs=False):
        """
        Reverse-complement the record adjusting features and their positions accordingly.

        The reversal itself is delegated to the parent method; its result is then
        rebuilt as a SeqRecordEM2 wrapping a ``SeqEM2.dna`` sequence, so that the
        SeqRecord and Seq objects are replaced by their SeqRecordEM2 and SeqEM2
        equivalents.

        When left at ``False``, ``id`` and ``description`` default to the values of
        this record and ``name`` defaults to this record's name followed by
        ``' reversed'``. Any other value is passed to the parent method, and the id,
        name and description it produces are the ones used for the returned record.

        :param id: the id for the reversed record
        :param name: the name for the reversed record
        :param description: the description for the reversed record
        :param features: keep and adjust location of features if True
        :param annotations: keep annotations if True
        :param letter_annotations: keep letter_annotations if True
        :param dbxrefs: keep dbxrefs if True
        :return: a reversed copy of the record
        """
        # False means "not specified": fall back to this record's own values.
        id = self.id if id is False else id
        name = self.name + ' reversed' if name is False else name
        description = self.description if description is False else description
        rev_record = super().reverse_complement(
            id=id,
            name=name,
            description=description,
            features=features,
            annotations=annotations,
            letter_annotations=letter_annotations,
            dbxrefs=dbxrefs)
        # Take id/name/description from the parent's result so that values supplied
        # by the caller are not overwritten by those of self.
        return SeqRecordEM2(
            SeqEM2.dna(str(rev_record.seq)),
            id=rev_record.id,
            name=rev_record.name,
            description=rev_record.description,
            features=rev_record.features,
            annotations=rev_record.annotations,
            letter_annotations=rev_record.letter_annotations,
            dbxrefs=rev_record.dbxrefs)
Exemplo n.º 6
0
class SeqFeatureTests(unittest.TestCase):
    """Tests for the positional helpers of SeqFeatureEM2.

    All tests share one protein record, ``sprot``, carrying eight features
    (``d1`` .. ``d8``). ``d5``, ``d6`` and ``d8`` use fuzzy positions
    (``AfterPosition`` / ``BeforePosition``).
    """

    sprot: SeqRecord = SeqRecord(
        SeqEM2.protein('MYNAMEISFREDHEREIAMWHEREARETHEYALLTHISISEXCELLENT'),
        id='X',
        name='DummyProt')
    sprot.features = [
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(0, 11),
                      type='domain',
                      id='d1'),  # MYNAMEISFRED
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(8, 18),
                      type='domain',
                      id='d2'),  # FREDHEREIAM
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(19, 30),
                      type='domain',
                      id='d3'),  # WHEREARETHEY
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(6, 23),
                      type='domain',
                      id='d4'),  # ISFREDHEREIAMWHERE
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(34, AfterPosition(39)),
                      id='d5'),  # THISIS
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(2), 5),
                      type='domain',
                      id='d6'),  # MYNAME
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(19, 23),
                      type='domain',
                      id='d7'),  # WHERE
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(30), 37),
                      type='domain',
                      id='d8')  # YALLTHI
    ]

    @classmethod
    def test_parent(cls):
        """Features keep their order and point back to the shared record."""
        assert [f.id for f in cls.sprot.features
                ] == ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']
        assert cls.sprot.features[1].parent.id == cls.sprot.id
        assert cls.sprot.features[1].parent.name == cls.sprot.name
        assert cls.sprot.features[1].parent.seq._data == cls.sprot.seq._data

    @classmethod
    def test_lies_within(cls):
        """``lies_within`` is true only when the window contains the feature."""
        assert cls.sprot.features[1].lies_within(5, 25)
        assert not cls.sprot.features[1].lies_within(10, 25)
        assert not cls.sprot.features[1].lies_within(19, 25)

    @classmethod
    def test_lies_within_fuzzy(cls):
        """``lies_within`` warns for each fuzzy-located feature."""
        # One context per call: a shared block would pass if only one call warned.
        with pytest.warns(UserWarning):
            cls.sprot.features[4].lies_within(30, 42)
        with pytest.warns(UserWarning):
            cls.sprot.features[5].lies_within(0, 10)

    @classmethod
    def test_overlaps(cls):
        """``overlaps`` accepts a range or a single position."""
        assert cls.sprot.features[2].overlaps(20, 25)
        assert cls.sprot.features[2].overlaps(20, 40)
        assert cls.sprot.features[2].overlaps(20)
        assert not cls.sprot.features[2].overlaps(35)
        assert not cls.sprot.features[2].overlaps(2, 5)

    @classmethod
    def test_overlaps_fuzzy(cls):
        """``overlaps`` warns for each fuzzy-located feature."""
        with pytest.warns(UserWarning):
            cls.sprot.features[4].overlaps(35)
        with pytest.warns(UserWarning):
            cls.sprot.features[5].overlaps(3)

    @classmethod
    def test_covers(cls):
        """``covers`` is true only when the feature contains the whole range."""
        assert cls.sprot.features[3].covers(15, 20)
        assert not cls.sprot.features[3].covers(4, 20)

    @classmethod
    def test_covers_fuzzy(cls):
        """``covers`` warns for each fuzzy-located feature."""
        with pytest.warns(UserWarning):
            cls.sprot.features[4].covers(35, 38)
        with pytest.warns(UserWarning):
            cls.sprot.features[5].covers(3, 4)

    @classmethod
    def test_intersect(cls):
        """``intersect`` returns a feature located on the common region."""
        assert cls.sprot.features[4].intersect(
            cls.sprot.features[7]).location == FeatureLocation(34, 37)
        assert cls.sprot.features[2].intersect(
            cls.sprot.features[3]).location == cls.sprot.features[6].location
        assert cls.sprot.features[1].intersect(
            cls.sprot.features[3]).location == FeatureLocation(8, 18)

    @classmethod
    def test_intersect_errors(cls):
        """Intersecting with a feature built without a parent raises ValueError."""
        with pytest.raises(ValueError, match=r'Undetermined .*'):
            cls.sprot.features[0].intersect(
                SeqFeatureEM2(location=FeatureLocation(30, 37)))

    @classmethod
    def test_intersect_fuzzy(cls):
        """``intersect`` warns when a fuzzy-located feature is involved."""
        with pytest.warns(UserWarning):
            cls.sprot.features[5].intersect(cls.sprot.features[0])

    @classmethod
    def test_move(cls):
        """``move(5)`` yields a feature whose location is shifted by five."""
        assert cls.sprot.features[0].move(5).location == FeatureLocation(5, 16)
Exemplo n.º 7
0
def protein_seq():
    """Return the sample protein sequence as a ``SeqEM2.protein`` object.

    :return: the SeqEM2 protein built from ``'HITHEREFREDANDGREG'``
    """
    residues = 'HITHEREFREDANDGREG'
    return SeqEM2.protein(residues)
Exemplo n.º 8
0
class GFFtests(unittest.TestCase):
    """Tests for the conversions between feature lists and GFF data frames.

    ``sprot`` is a DNA record with three features (strand 1, 0 and -1).
    ``df0`` is the frame expected for the first feature alone and ``df1`` the
    frame expected for all three.
    """

    sprot: SeqRecord = SeqRecord(
        SeqEM2.dna('ATGAGTCGGTAACGATGCATGCATGCAGCTGACGC'),
        id='X',
        name='DummyDNA')
    sprot.features = [
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(0, 2),
                      type='start',
                      strand=1,
                      qualifiers={
                          'codon': 'start',
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      }),
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(8, 18),
                      type='domain',
                      id='d1',
                      strand=0,
                      qualifiers={
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      }),
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(16, 30),
                      type='domain',
                      id='d2',
                      strand=-1,
                      qualifiers={
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      })
    ]

    df0 = DataFrame({
        'seq_id': ['X'],
        'source': [''],
        'type': ['start'],
        'start': ['0'],
        'end': ['2'],
        'score': ['0'],
        'strand': ['+'],
        'phase': ['0'],
        'attributes': ['codon=start;id=<unknown id>']
    })

    df1 = DataFrame({
        'seq_id': ['X', 'X', 'X'],
        'source': ['', '', ''],
        'type': ['start', 'domain', 'domain'],
        'start': ['0', '8', '16'],
        'end': ['2', '18', '30'],
        'score': ['0', '0', '0'],
        'strand': ['+', '?', '-'],
        'phase': ['0', '0', '0'],
        'attributes': ['codon=start;id=<unknown id>', 'id=d1', 'id=d2']
    })

    # Column layout of the frame expected when there is no feature to convert.
    _COLUMNS = [
        'seq_id', 'source', 'type', 'start', 'end', 'score', 'strand',
        'phase', 'attributes'
    ]

    @classmethod
    def _empty_frame(cls):
        """Return a fresh, row-less frame with the expected columns."""
        return DataFrame(columns=list(cls._COLUMNS)).reset_index(drop=True)

    @classmethod
    def test_df_from_feature(cls):
        """A single feature maps to ``df0``; ``None`` maps to an empty frame."""
        assert_frame_equal(
            GFF.df_from_feature(cls.sprot.features[0]).reset_index(drop=True),
            cls.df0.reset_index(drop=True))
        assert_frame_equal(
            GFF.df_from_feature(None).reset_index(drop=True),
            cls._empty_frame())

    @classmethod
    def test_from_feature_list(cls):
        """A feature list maps to ``df1``; empty or missing input to an empty frame."""
        assert_frame_equal(
            GFF(cls.sprot.features[0:3]).df.reset_index(drop=True),
            cls.df1.reset_index(drop=True))
        assert_frame_equal(GFF([]).df.reset_index(drop=True),
                           cls._empty_frame())
        assert_frame_equal(GFF(None).df.reset_index(drop=True),
                           cls._empty_frame())
        assert_frame_equal(GFF().df.reset_index(drop=True),
                           cls._empty_frame())

    @classmethod
    def test_add_list(cls):
        """Adding the last two features to a one-feature GFF gives ``df1``."""
        assert_frame_equal(
            GFF([cls.sprot.features[0]]).add_feature_list(
                cls.sprot.features[1:3]).df.reset_index(drop=True),
            cls.df1.reset_index(drop=True))

    @classmethod
    def test_to_feature_list(cls):
        """``to_feature_list`` rebuilds the features, with or without parents."""
        # The conversions do not depend on the loop index, so run each once.
        single_parent = GFF(input_df=cls.df1).to_feature_list(
            parents=cls.sprot)
        no_parent = GFF(input_df=cls.df1).to_feature_list()
        parent_per_feature = GFF(input_df=cls.df1).to_feature_list(
            parents=[cls.sprot, cls.sprot, cls.sprot])

        for i, expected in enumerate(cls.sprot.features):
            assert str(single_parent[i]) == str(expected)
            assert single_parent[i].parent.id == expected.parent.id
            assert no_parent[i].parent is None
        for i, expected in enumerate(cls.sprot.features):
            assert str(parent_per_feature[i]) == str(expected)

        # A parent list whose length differs from the feature count is rejected.
        with pytest.raises(
                ValueError,
                match=
                r'The number of parents should match the number of features .*'
        ):
            GFF(input_df=cls.df1).to_feature_list(parents=[cls.sprot])