def dna_records():
    """Build the list of DNA test records ``DNA1`` .. ``DNA5``.

    Every record wraps a ``SeqEM2`` DNA sequence and uses the same label for
    both its ``id`` and its ``name``.

    :return: a list of five SeqRecordEM2 objects, in label order
    """
    labelled_sequences = (
        ('DNA1', 'ACAGTACCATGTAA'),
        ('DNA2', 'ACAG'),
        ('DNA3', 'ACAGTA'),
        ('DNA4', 'ACAGTACCAT'),
        ('DNA5', 'ACAGTACCATGT'),
    )
    return [
        SeqRecordEM2(SeqEM2.dna(bases), id=label, name=label)
        for label, bases in labelled_sequences
    ]
def test_get_orfs(dna_rec1_overlap_rec2):
    """Check ``get_orfs`` on the merged rec1/rec2 record and on a short sequence.

    :param dna_rec1_overlap_rec2: record whose ``seq`` is searched for ORFs
    """
    merged_seq = dna_rec1_overlap_rec2.seq

    orfs_default = {(0, 9), (14, 29), (18, 36), (35, 44), (22, 58), (49, 58),
                    (53, 59)}
    orfs_any_start = {(0, 9), (2, 29), (12, 36), (32, 44), (4, 58), (47, 59),
                      (39, 60)}

    # The default call is deliberately asserted twice, exactly as before
    # (presumably to check that a repeated call gives the same answer --
    # TODO confirm).
    assert merged_seq.get_orfs() == orfs_default
    assert merged_seq.get_orfs() == orfs_default

    assert merged_seq.get_orfs(start=None) == orfs_any_start

    # Short sequence: ORFs are reported with start=None, none with defaults.
    assert SeqEM2.dna('ACAGTACCAT').get_orfs(start=None) == {(0, 9), (1, 10),
                                                             (2, 8)}
    assert SeqEM2.dna('ACAGTACCAT').get_orfs() == set()
def join(self, other=None, offset=0, keepself=True):
    """
    Joins two SeqRecordEM2 objects into a new one representing the resulting
    merged sequence.

    A non-negative offset inserts that many ``any_residue`` characters between
    the two sequences. A negative offset makes ``other`` start ``-offset``
    residues before the end of ``self``, so the two sequences overlap. If
    ``other`` would start before the beginning of ``self``, the roles are
    swapped and the join is delegated to ``other`` (with ``keepself``
    inverted so the same record still wins in the overlap).

    Features of both records are copied to the new record (location, strand
    and id only); features of ``other`` are shifted to their new position.

    :param other: the other SeqRecordEM2 object (required despite the default)
    :param offset: the offset of the two sequences. If the value is negative,
        then the two sequences overlap.
    :param keepself: if True and overlapping subsequences are different, then
        keep sequence from self record, otherwise keep the sequence of other
        record.
    :return: the result of merging records as a new SeqRecordEM2 object
    :raises ValueError: if one sequence is a protein and the other is not
    """
    # Position, in self coordinates, at which `other` starts.
    other_start = len(self.seq) + offset

    if other_start < 0:
        # `other` begins before `self`: swap the roles of the two records.
        return other.join(self,
                          offset=-len(self.seq) - len(other.seq) - offset,
                          keepself=not keepself)

    # Fail fast on incompatible sequence types, before building anything.
    self_is_protein = bool(self.seq.is_protein())
    other_is_protein = bool(other.seq.is_protein())
    if self_is_protein != other_is_protein:
        raise ValueError(
            'Sequences are not of the same type. It is impossible to join them.'
        )

    self_str = str(self.seq)
    other_str = str(other.seq)

    if offset >= 0:
        new_seq = self_str + self.seq.any_residue * offset + other_str
    else:
        # Only the region really shared by both sequences is compared: when
        # `other` is shorter than the requested overlap it lies entirely
        # inside `self`, and the overlap is limited to its own length.
        overlap_len = min(-offset, len(other_str))
        self_overlap = self_str[other_start:other_start + overlap_len]
        if self_overlap != other_str[:overlap_len]:
            warnings.warn(
                'Warning!!! Overlapping subsequences are different.')
        if keepself is True:
            new_seq = self_str + other_str[-offset:]
        else:
            # Keep whatever part of `self` extends past the end of `other`
            # (empty for an ordinary overlap) instead of dropping it.
            new_seq = (self_str[:other_start] + other_str +
                       self_str[other_start + len(other_str):])

    if self_is_protein:
        new_record = SeqRecordEM2(SeqEM2.protein(new_seq))
    else:
        new_record = SeqRecordEM2(SeqEM2.dna(new_seq))

    new_record.id = self.id + '_' + other.id

    for feature in self.features:
        new_record.add_feature(location=feature.location,
                               strand=feature.strand,
                               id=feature.id)
    for feature in other.features:
        new_record.add_feature(location=FeatureLocation(
            feature.location.start + other_start,
            feature.location.end + other_start),
                               strand=feature.strand,
                               id=feature.id)
    return new_record
def dna_seq(name='rec'):
    """Return one of the named DNA test sequences as a SeqEM2 DNA object.

    :param name: key of the wanted sequence in the table below
    :return: ``SeqEM2.dna`` built from the selected string
    :raises KeyError: if ``name`` is not one of the known keys
    """
    named_sequences = {
        'rec':
        'ATGAGTCGGTAAAGATGCATGCGCGCCGCTGACGC',
        'rec1':
        'ATGAGTCGGTAACGATGCATGCATGCAGCTGACGC',
        'rec2':
        'CAGCTGACGCATGAGTCGGTAACGATGCATGCATG',
        'rec3':
        'CACCTGACGCATGAGTCGGTAACGATGCATGCATG',
        'rec1NNNrec2':
        'ATGAGTCGGTAACGATGCATGCATGCAGCTGACGCNNNCAGCTGACGCATGAGTCGGTAACGATGCATGCATG',
        'rec1_overlap_rec2/rec3':
        'ATGAGTCGGTAACGATGCATGCATGCAGCTGACGCATGAGTCGGTAACGATGCATGCATG',
        'rec1_overlp_rec3_kpfls':
        'ATGAGTCGGTAACGATGCATGCATGCACCTGACGCATGAGTCGGTAACGATGCATGCATG',
    }
    bases = named_sequences[name]
    return SeqEM2.dna(bases)
def reverse_complement(self,
                       id=False,
                       name=False,
                       description=False,
                       features=True,
                       annotations=False,
                       letter_annotations=True,
                       dbxrefs=False):
    """
    Reverse-complement the record adjusting features and their positions
    accordingly.

    By default the record id and description are conserved, and ' reversed'
    is appended to the name. An explicit ``id``, ``name`` or ``description``
    string replaces the corresponding value. All other arguments are passed
    to and handled by the parent method.

    Note that the main goal for this method is to replace SeqRecord and Seq
    objects by their SeqRecordEM2 and SeqEM2 equivalents when
    reverse/complementing. The sequence of the result is always rebuilt with
    ``SeqEM2.dna``.

    :param id: the id for the reversed record (default: keep this record's id)
    :param name: the name for the reversed record (default: this record's
        name followed by ' reversed')
    :param description: the description for the reversed record (default, or
        any non-string value: keep this record's description)
    :param features: keep and adjust location of features if True
    :param annotations: keep annotations if True
    :param letter_annotations: keep letter_annotations if True
    :param dbxrefs: keep dbxrefs if True
    :return: a reversed copy of the record
    """
    id = self.id if id is False else id
    name = self.name + ' reversed' if name is False else name
    # Previously the caller's description was silently ignored; honour an
    # explicit string and fall back to the current description otherwise.
    description = (description
                   if isinstance(description, str) else self.description)

    parent_rev = super().reverse_complement(
        id=id,
        name=name,
        description=description,
        features=features,
        annotations=annotations,
        letter_annotations=letter_annotations,
        dbxrefs=dbxrefs)

    # Re-wrap the parent's result in the EM2 classes. The id, name and
    # description are taken from the parent's result so that explicit
    # arguments are not lost (the id used to be reset to self.id here).
    return SeqRecordEM2(SeqEM2.dna(str(parent_rev.seq)),
                        id=parent_rev.id,
                        name=parent_rev.name,
                        description=parent_rev.description,
                        features=parent_rev.features,
                        annotations=parent_rev.annotations,
                        letter_annotations=parent_rev.letter_annotations,
                        dbxrefs=parent_rev.dbxrefs)
class SeqFeatureTests(unittest.TestCase):
    """Tests of the SeqFeatureEM2 positional helpers on a dummy protein."""

    # Dummy protein record shared by every test; each feature below points
    # back to it through its ``parent`` argument.
    sprot: SeqRecord = SeqRecord(
        SeqEM2.protein('MYNAMEISFREDHEREIAMWHEREARETHEYALLTHISISEXCELLENT'),
        id='X',
        name='DummyProt')
    sprot.features = [
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(0, 11),
                      type='domain',
                      id='d1'),  # MYNAMEISFRED
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(8, 18),
                      type='domain',
                      id='d2'),  # FREDHEREIAM
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(19, 30),
                      type='domain',
                      id='d3'),  # WHEREARETHEY
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(6, 23),
                      type='domain',
                      id='d4'),  # ISFREDHEREIAMWHERE
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(34, AfterPosition(39)),
                      id='d5'),  # THISIS
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(2), 5),
                      type='domain',
                      id='d6'),  # MYNAME
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(19, 23),
                      type='domain',
                      id='d7'),  # WHERE
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(30), 37),
                      type='domain',
                      id='d8')  # YALLTHI
    ]

    @classmethod
    def test_parent(cls):
        """Features keep their order and refer back to the shared record."""
        record = cls.sprot
        expected_ids = ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']
        assert [f.id for f in record.features] == expected_ids

        parent = record.features[1].parent
        assert parent.id == record.id
        assert parent.name == record.name
        assert parent.seq._data == record.seq._data

    @classmethod
    def test_lies_within(cls):
        """``lies_within`` on d2 (8..18) for three candidate ranges."""
        d2 = cls.sprot.features[1]
        assert d2.lies_within(5, 25)
        assert not d2.lies_within(10, 25)
        assert not d2.lies_within(19, 25)

    @classmethod
    def test_lies_within_fuzzy(cls):
        """``lies_within`` on features with fuzzy ends emits a UserWarning."""
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[4].lies_within(30, 42)
            feats[5].lies_within(0, 10)

    @classmethod
    def test_overlaps(cls):
        """``overlaps`` on d3 (19..30) with ranges and single positions."""
        d3 = cls.sprot.features[2]
        assert d3.overlaps(20, 25)
        assert d3.overlaps(20, 40)
        assert d3.overlaps(20)
        assert not d3.overlaps(35)
        assert not d3.overlaps(2, 5)

    @classmethod
    def test_overlaps_fuzzy(cls):
        """``overlaps`` on features with fuzzy ends emits a UserWarning."""
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[4].overlaps(35)
            feats[5].overlaps(3)

    @classmethod
    def test_covers(cls):
        """``covers`` on d4 (6..23): inside range passes, wider one fails."""
        d4 = cls.sprot.features[3]
        assert d4.covers(15, 20)
        assert not d4.covers(4, 20)

    @classmethod
    def test_covers_fuzzy(cls):
        """``covers`` on features with fuzzy ends emits a UserWarning."""
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[4].covers(35, 38)
            feats[5].covers(3, 4)

    @classmethod
    def test_intersect(cls):
        """``intersect`` yields a feature located on the shared region."""
        feats = cls.sprot.features

        d5_and_d8 = feats[4].intersect(feats[7])
        assert d5_and_d8.location == FeatureLocation(34, 37)

        d3_and_d4 = feats[2].intersect(feats[3])
        assert d3_and_d4.location == feats[6].location

        d2_and_d4 = feats[1].intersect(feats[3])
        assert d2_and_d4.location == FeatureLocation(8, 18)

    @classmethod
    def test_intersect_errors(cls):
        """Intersecting with a parent-less feature raises ValueError."""
        orphan = SeqFeatureEM2(location=FeatureLocation(30, 37))
        with pytest.raises(ValueError, match=r'Undetermined .*'):
            cls.sprot.features[0].intersect(orphan)

    @classmethod
    def test_intersect_fuzzy(cls):
        """``intersect`` involving a fuzzy feature emits a UserWarning."""
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[5].intersect(feats[0])

    @classmethod
    def test_move(cls):
        """``move(5)`` on d1 (0..11) gives a feature located at 5..16."""
        moved = cls.sprot.features[0].move(5)
        assert moved.location == FeatureLocation(5, 16)
def protein_seq():
    """Return the protein test sequence as a SeqEM2 protein object.

    :return: ``SeqEM2.protein`` built from 'HITHEREFREDANDGREG'
    """
    residues = 'HITHEREFREDANDGREG'
    return SeqEM2.protein(residues)
class GFFtests(unittest.TestCase):
    """Tests of the GFF conversions between features and data frames."""

    # Column names of the frames produced by GFF, in order.
    _GFF_COLUMNS = ('seq_id', 'source', 'type', 'start', 'end', 'score',
                    'strand', 'phase', 'attributes')

    # Dummy DNA record shared by every test; the three features below use
    # strands 1, 0 and -1 respectively.
    sprot: SeqRecord = SeqRecord(
        SeqEM2.dna('ATGAGTCGGTAACGATGCATGCATGCAGCTGACGC'),
        id='X',
        name='DummyDNA')
    sprot.features = [
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(0, 2),
                      type='start',
                      strand=1,
                      qualifiers={
                          'codon': 'start',
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      }),
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(8, 18),
                      type='domain',
                      id='d1',
                      strand=0,
                      qualifiers={
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      }),
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(16, 30),
                      type='domain',
                      id='d2',
                      strand=-1,
                      qualifiers={
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      })
    ]

    # Expected frame for the first feature alone.
    df0 = DataFrame({
        'seq_id': ['X'],
        'source': [''],
        'type': ['start'],
        'start': ['0'],
        'end': ['2'],
        'score': ['0'],
        'strand': ['+'],
        'phase': ['0'],
        'attributes': ['codon=start;id=<unknown id>']
    })

    # Expected frame for all three features.
    df1 = DataFrame({
        'seq_id': ['X', 'X', 'X'],
        'source': ['', '', ''],
        'type': ['start', 'domain', 'domain'],
        'start': ['0', '8', '16'],
        'end': ['2', '18', '30'],
        'score': ['0', '0', '0'],
        'strand': ['+', '?', '-'],
        'phase': ['0', '0', '0'],
        'attributes': ['codon=start;id=<unknown id>', 'id=d1', 'id=d2']
    })

    @classmethod
    def _empty_frame(cls):
        """Return a fresh, empty frame holding only the GFF columns."""
        return DataFrame(columns=list(cls._GFF_COLUMNS)).reset_index(
            drop=True)

    @classmethod
    def test_df_from_feature(cls):
        """``df_from_feature`` for one feature and for ``None``."""
        single = GFF.df_from_feature(cls.sprot.features[0])
        assert_frame_equal(single.reset_index(drop=True),
                           cls.df0.reset_index(drop=True))

        nothing = GFF.df_from_feature(None)
        assert_frame_equal(nothing.reset_index(drop=True), cls._empty_frame())

    @classmethod
    def test_from_feature_list(cls):
        """Building a GFF from a feature list, ``[]``, ``None`` or nothing."""
        populated = GFF(cls.sprot.features[0:3]).df
        assert_frame_equal(populated.reset_index(drop=True),
                           cls.df1.reset_index(drop=True))

        from_empty_list = GFF([]).df
        assert_frame_equal(from_empty_list.reset_index(drop=True),
                           cls._empty_frame())

        from_none = GFF(None).df
        assert_frame_equal(from_none.reset_index(drop=True),
                           cls._empty_frame())

        from_no_args = GFF().df
        assert_frame_equal(from_no_args.reset_index(drop=True),
                           cls._empty_frame())

    @classmethod
    def test_add_list(cls):
        """Adding the last two features to a one-feature GFF gives ``df1``."""
        feats = cls.sprot.features
        grown = GFF([feats[0]]).add_feature_list(feats[1:3]).df
        assert_frame_equal(grown.reset_index(drop=True),
                           cls.df1.reset_index(drop=True))

    @classmethod
    def test_to_feature_list(cls):
        """``to_feature_list`` with one parent, no parent and parent lists."""
        feature_count = len(cls.sprot.features)

        # A single record as parent, then no parent at all. The GFF object
        # is rebuilt for every assertion, as in the original test.
        for idx in range(feature_count):
            expected = cls.sprot.features[idx]

            rebuilt = GFF(input_df=cls.df1).to_feature_list(
                parents=cls.sprot)[idx]
            assert rebuilt.__str__() == expected.__str__()

            rebuilt = GFF(input_df=cls.df1).to_feature_list(
                parents=cls.sprot)[idx]
            assert rebuilt.parent.id == expected.parent.id

            orphan = GFF(input_df=cls.df1).to_feature_list()[idx]
            assert orphan.parent is None

        # One parent per feature.
        for idx in range(feature_count):
            rebuilt = GFF(input_df=cls.df1).to_feature_list(
                parents=[cls.sprot, cls.sprot, cls.sprot])[idx]
            assert rebuilt.__str__() == cls.sprot.features[idx].__str__()

        # A parent list of the wrong length is rejected.
        with pytest.raises(
                ValueError,
                match=
                r'The number of parents should match the number of features .*'
        ):
            GFF(input_df=cls.df1).to_feature_list(parents=[cls.sprot])