Exemplo n.º 1
0
 def test_get_counts(self):
     """get_counts: should work with all parameters"""
     ref_pairs = Pairs([(1, 8), (2, 7)])
     pred_pairs = Pairs([(1, 8), (2, 6), (3, 6), (4, 9)])

     # default call: all FP_* sub-counts are expected to be 0
     exp = {'TP': 1, 'TN': 0, 'FN': 1, 'FP': 3,
            'FP_INCONS': 0, 'FP_CONTRA': 0, 'FP_COMP': 0}
     self.assertEqual(get_counts(ref_pairs, pred_pairs), exp)

     # split_fp=True: each FP_* sub-count is expected to be 1
     exp = {'TP': 1, 'TN': 0, 'FN': 1, 'FP': 3,
            'FP_INCONS': 1, 'FP_CONTRA': 1, 'FP_COMP': 1}
     self.assertEqual(get_counts(ref_pairs, pred_pairs, split_fp=True), exp)

     # passing a sequence and min_dist: a non-zero TN count is expected
     seq = RnaSequence('UCAG-NACGU')
     exp = {'TP': 1, 'TN': 7, 'FN': 1, 'FP': 3,
            'FP_INCONS': 1, 'FP_CONTRA': 1, 'FP_COMP': 1}
     self.assertEqual(get_counts(ref_pairs, pred_pairs, split_fp=True,
         sequences=[seq], min_dist=2), exp)

     # check against compare_ct.pm
     # (self.true / self.predicted are fixtures defined outside this method)
     exp = {'TP': 4, 'TN': 266, 'FN': 6, 'FP': 6,
            'FP_INCONS': 2, 'FP_CONTRA': 2, 'FP_COMP': 2}
     seq = 'agguugaaggggauccgauccacuccccggcuggucaaccu'.upper()
     self.assertEqual(get_counts(self.true, self.predicted, split_fp=True,
         sequences=[seq], min_dist=4), exp)
Exemplo n.º 2
0
 def test_get_all_pairs(self):
     """get_all_pairs: should return the number of possible pairs"""
     first = RnaSequence('UCAG-NACGU')
     second = RnaSequence('UAAG-CACGC')
     count_pairs = get_all_pairs

     # one sequence at a time
     self.assertEqual(count_pairs([first], min_dist=4), 6)
     self.assertEqual(count_pairs([second], min_dist=4), 4)

     # several sequences: the result should be the average over all of them
     self.assertEqual(count_pairs([first, second], min_dist=4), 5)

     # a smaller minimum distance changes the expected count
     self.assertEqual(count_pairs([first], min_dist=2), 10)

     # a negative minimum distance must be rejected
     self.assertRaises(ValueError, count_pairs, [first], min_dist=-2)
Exemplo n.º 3
0
    def test_seq_ungapping(self):
        """Degapping 'U-C-A-G-' should give 'UCAG' for both sequence types."""
        gapped = 'U-C-A-G-'
        rna1 = RnaSequence(gapped, Name='rna1')
        model1 = ModelSequence(gapped, Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)

        # plain sequence: compares equal to the raw string, gaps removable
        self.assertEqual(rna1, gapped)
        self.assertEqual(rna1.degap(), 'UCAG')

        # model sequence: check it produces the right string to begin with,
        # and that the underlying index data matches the expected encoding
        expected_indices = [0, 4, 1, 4, 2, 4, 3, 4]
        self.assertEqual(str(model1), gapped)
        self.assertEqual(model1._data, expected_indices)

        # ModelSequence should maybe have the same degap method as normal Seq
        self.assertEqual(str(model1.degap()), 'UCAG')
    def test_seq_ungapping(self):
        """Gapped RNA and model sequences should both degap to 'UCAG'."""
        raw = 'U-C-A-G-'
        degapped = 'UCAG'
        rna1 = RnaSequence(raw, Name='rna1')
        model1 = ModelSequence(raw, Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.assertEqual(rna1, raw)
        self.assertEqual(rna1.degap(), degapped)

        # check it produces the right string from the beginning
        self.assertEqual(str(model1), raw)
        # expected index encoding of the gapped string in the model sequence
        self.assertEqual(model1._data, [0, 4, 1, 4, 2, 4, 3, 4])
        # ModelSequence should maybe have the same degap method as normal Seq
        self.assertEqual(str(model1.degap()), degapped)
Exemplo n.º 5
0
    def test_subset_positions_Alignment(self):
        """takePositions should return an alignment of the chosen columns."""
        # expected sub-alignment for columns 0, 1 and 5 of self.aln
        # (self.aln is a fixture built outside this method)
        expected_seqs = [
            RnaSequence('UCG', Name='rna1'),
            RnaSequence('YCG', Name='rna2'),
            RnaSequence('CAR', Name='rna3'),
        ]
        sub_aln = Alignment(expected_seqs, MolType=RNA)

        columns = [0, 1, 5]
        obs_sub_aln = self.aln.takePositions(columns)

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # string representations should be the same. This fails right
        # now, because sequence order is not maintained. See separate test.
        self.assertEqual(str(obs_sub_aln), str(sub_aln))
Exemplo n.º 6
0
    def test_subset_seqs_Alignment(self):
        """takeSeqs should return an alignment of the named sequences."""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna2, rna3], MolType=RNA)
        full_aln = Alignment([rna1, rna2, rna3], MolType=RNA)

        # taking two of the three sequences by name
        obs_sub_aln = full_aln.takeSeqs(['rna2', 'rna3'])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should be in specified order?
        # (uses the self.aln fixture, which is built outside this method)
        reversed_order = self.aln.takeSeqs(['rna3', 'rna2'])
        given_order = self.aln.takeSeqs(['rna2', 'rna3'])
        self.assertNotEqual(str(reversed_order), str(given_order))
Exemplo n.º 7
0
    def setUp(self):
        """setUp: set up method for all tests"""
        # the same gapped, degenerate RNA string in three representations
        raw = 'UCAG-RYN-N'

        self.rna1 = RnaSequence(raw, Name='rna1')
        self.m1 = ModelSequence(raw, Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.s1 = raw
Exemplo n.º 8
0
 def test_extract_seqs(self):
     """extract_seqs: should handle different input formats"""
     degen_gapped = RNA.Alphabets.DegenGapped

     # the same two sequences in every supported input flavour
     fasta_string = ">seq1\nACGUAGC\n>seq2\nGGUAGCG"
     fasta_lines = [">seq1", "ACGUAGC", ">seq2", "GGUAGCG"]
     plain_strings = ['ACGUAGC', 'GGUAGCG']
     rna_seqs = [RnaSequence('ACGUAGC'), RnaSequence('GGUAGCG')]
     model_seqs = [
         ModelSequence('ACGUAGC', Name='rna1', Alphabet=degen_gapped),
         ModelSequence('GGUAGCG', Name='rna2', Alphabet=degen_gapped),
     ]

     # every flavour should yield the same list of plain strings
     expected = ['ACGUAGC', 'GGUAGCG']
     all_inputs = (fasta_string, fasta_lines, plain_strings, rna_seqs,
         model_seqs)
     for seqs in all_inputs:
         self.assertEqual(extract_seqs(seqs), expected)
Exemplo n.º 9
0
 def test_AlignmentToProfile_ignore(self):
     """AlignmentToProfile: should raise an error if too many chars ignored
     """
     # Same conditions as the basic AlnToProfile test, but the last column
     # contains only gaps, so normalization will fail at that position.
     # No expected profile is needed: the call itself must raise.
     a = Alignment({'a': RnaSequence('UCAGRYN-'),
         'b': RnaSequence('ACUGAAA-')})
     self.assertRaises(ValueError, AlnToProfile, a, alphabet=RNA,
         split_degenerates=True)
Exemplo n.º 10
0
 def test_gapped_to_ungapped_simple(self):
     """gapped_to_ungapped: should work for simple case"""
     # self.gapped, self.ungapped, self.simple_g and self.simple are
     # fixtures defined outside this method
     s = RnaSequence(self.gapped)
     p = self.simple_g
     obs_seq, obs_pairs = gapped_to_ungapped(s, p)
     self.assertEqual(obs_seq, self.ungapped)
     self.assertEqualItems(obs_pairs, self.simple)
     # use unittest assertions rather than bare asserts, which are
     # stripped when Python runs with -O
     self.assertTrue(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_pairs, Pairs))
Exemplo n.º 11
0
def parse_residues(residue_lines, num_base, unpaired_symbol):
    """Return RnaSequence and Pairs object from residue lines.

    residue_lines -- list of lines or anything that behaves like it.
        Lines should contain:
        residue_position, residue_identity, residue_partner.
    num_base -- int, basis of the residue numbering. In bpseq files from
        the CRW website, the numbering starts at 1.
    unpaired_symbol -- string, symbol in the 'partner' column that indicates
        that a base is unpaired. In bpseq files from the CRW website, the
        unpaired_symbol is '0'. This parameter should be a string to allow
        other symbols that can't be cast to an integer to indicate
        unpaired bases.

    Checks for double entries both in the sequence and the structure, and
    checks that the structure is valid in the sense that if (up,down) in
    there, that (down,up) is the same.

    Raises BpseqParseError if a line cannot be split into three fields or
    converted to integers, if a residue position occurs more than once, or
    if the collected pairs report conflicts.
    """
    # position -> residue, used to rebuild the sequence afterwards
    seq_dict = {}
    pairs = Pairs()

    for line in residue_lines:
        # Only the unpacking and the int() conversions can raise ValueError,
        # so the try block is limited to them.
        try:
            pos, res, partner = line.strip().split()
            # positions are stored relative to num_base
            pos = int(pos) - num_base
            if partner == unpaired_symbol:
                partner = None
            else:
                partner = int(partner) - num_base
        except ValueError:
            raise BpseqParseError("Failed to parse line: %s" % (line))

        pairs.append((pos, partner))

        # fill seq_dict, refusing a second entry for the same position
        if pos in seq_dict:
            # report the position as numbered in the input file, i.e. with
            # num_base added back (not a hard-coded +1)
            raise BpseqParseError(
                "Double entry for residue %s (%s in bpseq file)"
                % (str(pos), str(pos + num_base)))
        seq_dict[pos] = res

    # check for conflicts, remove unpaired bases
    # NOTE(review): directed() is presumably what drops the (pos, None)
    # entries -- confirm against the Pairs implementation
    if pairs.hasConflicts():
        raise BpseqParseError("Conflicts in the list of basepairs")
    pairs = pairs.directed()
    pairs.sort()

    # construct sequence from seq_dict
    seq = RnaSequence(construct_sequence(seq_dict))

    return seq, pairs
Exemplo n.º 12
0
 def test_gapped_to_ungapped_out_of_order(self):
     """gapped_to_ungapped: should work when pairs are out of order
     """
     # self.gapped, self.ungapped, self.out_order_g and self.out_order are
     # fixtures defined outside this method
     s = RnaSequence(self.gapped)
     p = Pairs(self.out_order_g)
     obs_seq, obs_pairs = gapped_to_ungapped(s, p)
     self.assertEqual(obs_seq, self.ungapped)
     self.assertEqualItems(obs_pairs, self.out_order)
     # use unittest assertions rather than bare asserts, which are
     # stripped when Python runs with -O
     self.assertTrue(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_pairs, Pairs))
Exemplo n.º 13
0
 def test_gapped_to_ungapped_duplicates(self):
     """gapped_to_ungapped: should work when pairs contains duplicates
     """
     # self.gapped, self.ungapped, self.duplicates_g and self.duplicates
     # are fixtures defined outside this method
     s = RnaSequence(self.gapped)
     p = Pairs(self.duplicates_g)
     obs_seq, obs_pairs = gapped_to_ungapped(s, p)
     self.assertEqual(obs_seq, self.ungapped)
     self.assertEqualItems(obs_pairs, self.duplicates)
     # use unittest assertions rather than bare asserts, which are
     # stripped when Python runs with -O
     self.assertTrue(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_pairs, Pairs))
Exemplo n.º 14
0
 def test_gapped_to_ungapped_pseudo(self):
     """gapped_to_ungapped: shouldn't care about pseudoknots
     """
     # self.gapped, self.ungapped, self.pseudo_g and self.pseudo are
     # fixtures defined outside this method
     s = RnaSequence(self.gapped)
     p = Pairs(self.pseudo_g)
     obs_seq, obs_pairs = gapped_to_ungapped(s, p)
     self.assertEqual(obs_seq, self.ungapped)
     self.assertEqualItems(obs_pairs, self.pseudo)
     # use unittest assertions rather than bare asserts, which are
     # stripped when Python runs with -O
     self.assertTrue(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_pairs, Pairs))
Exemplo n.º 15
0
 def test_gapped_to_ungapped_no_gaps(self):
     """gapped_to_ungapped: should return same pairs when no gaps
     """
     # input is already ungapped, so sequence and pairs should come back
     # unchanged (self.ungapped / self.simple are fixtures defined outside
     # this method)
     s = RnaSequence(self.ungapped)
     p = Pairs(self.simple)
     obs_seq, obs_pairs = gapped_to_ungapped(s, p)
     self.assertEqual(obs_seq, self.ungapped)
     self.assertEqualItems(obs_pairs, self.simple)
     # use unittest assertions rather than bare asserts, which are
     # stripped when Python runs with -O
     self.assertTrue(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_pairs, Pairs))
Exemplo n.º 16
0
 def test_ungapped_to_gapped(self):
     """ungapped_to_gapped: should work for basic case
     """
     # self.gapped, self.simple and self.simple_g are fixtures defined
     # outside this method
     s = RnaSequence(self.gapped)
     p = self.simple
     obs_seq, obs_pairs = ungapped_to_gapped(s, p)
     # the very same sequence object is expected back, not a copy;
     # unittest assertions are used because bare asserts vanish under -O
     self.assertTrue(obs_seq is s)
     self.assertEqualItems(obs_pairs, self.simple_g)
     self.assertTrue(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_pairs, Pairs))
Exemplo n.º 17
0
 def test_ungapped_to_gapped_out_of_order(self):
     """ungapped_to_gapped: should work when pairs out of order
     """
     # self.gapped, self.out_order and self.out_order_g are fixtures
     # defined outside this method
     s = RnaSequence(self.gapped)
     p = self.out_order
     obs_seq, obs_pairs = ungapped_to_gapped(s, p)
     # the very same sequence object is expected back, not a copy;
     # unittest assertions are used because bare asserts vanish under -O
     self.assertTrue(obs_seq is s)
     self.assertEqualItems(obs_pairs, self.out_order_g)
     self.assertTrue(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_pairs, Pairs))
Exemplo n.º 18
0
    def test_adjust_pairs_from_mapping(self):
        """adjust_pairs_from_mapping: should work both ways"""
        adjust = adjust_pairs_from_mapping

        # direction 1: ungapped -> gapped, using the first gap map
        gapped_seq = RnaSequence('UC-AG-UC-CG-A-')
        ungapped_to_gapped_map = gapped_seq.gapMaps()[0]
        # {0: 0, 1: 1, 2: 3, 3: 4, 4: 6, 5: 7, 6: 9, 7: 10, 8: 12}
        pairs_in = Pairs([(0, 8), (1, 6), (2, 5)])
        pairs_out = Pairs([(0, 12), (1, 9), (3, 7)])
        observed = adjust(pairs_in, ungapped_to_gapped_map)
        self.assertEqualItems(observed, pairs_out)

        # direction 2: gapped -> ungapped, using the second gap map
        gapped_seq = RnaSequence('UC-AG-UC-CG-A-')
        gapped_to_ungapped_map = gapped_seq.gapMaps()[1]
        # {0: 0, 1: 1, 3: 2, 4: 3, 6: 4, 7: 5, 9: 6, 10: 7, 12: 8}
        pairs_in = Pairs([(0, 12), (1, 9), (3, 7)])
        pairs_out = Pairs([(0, 8), (1, 6), (2, 5)])
        observed = adjust(pairs_in, gapped_to_ungapped_map)
        self.assertEqualItems(observed, pairs_out)
Exemplo n.º 19
0
 def test_AlignmentToProfile_basic(self):
     """AlignmentToProfile: should work under basic conditions"""
     # Conditions covered here:
     #  - sequences in the alignment are unweighted
     #  - Alphabet is the alphabet of the sequences (RNA)
     #  - CharOrder is set explicitly
     #  - degenerate bases are split up
     #  - gaps are ignored
     #  - every column has at least one character in the CharOrder
     seqs = {
         'a': RnaSequence('UCAGRYN-'),
         'b': RnaSequence('ACUGAAAA'),
     }
     aln = Alignment(seqs)

     # one row of expected frequencies per alignment column
     expected = array([
         [.5, 0, .5, 0],
         [0, 1, 0, 0],
         [.5, 0, .5, 0],
         [0, 0, 0, 1],
         [0, 0, .75, .25],
         [.25, .25, .5, 0],
         [.125, .125, .625, .125],
         [0, 0, 1, 0],
     ])

     profile = AlnToProfile(aln, alphabet=RNA, split_degenerates=True)
     self.assertEqual(profile.Data.tolist(), expected.tolist())
Exemplo n.º 20
0
    def setUp(self):
        """setUp method for all tests"""
        degen_gapped = RNA.Alphabets.DegenGapped

        # --- named sequences, in both sequence flavours ---
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1',
            Alphabet=degen_gapped)
        self.model2 = ModelSequence('YCU-RG', Name='rna2',
            Alphabet=degen_gapped)
        self.model3 = ModelSequence('CAA-NR', Name='rna3',
            Alphabet=degen_gapped)

        named_rnas = [self.rna1, self.rna2, self.rna3]
        named_models = [self.model1, self.model2, self.model3]
        self.aln = Alignment(named_rnas, MolType=RNA)
        self.da = DenseAlignment(named_models, MolType=RNA,
            Alphabet=degen_gapped)

        # --- the same sequences without names ("nn" = no name) ---
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
        self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
        self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)

        unnamed_rnas = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
        unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
        self.nn_aln = Alignment(unnamed_rnas, MolType=RNA)
        self.nn_da = DenseAlignment(unnamed_models, MolType=RNA,
            Alphabet=degen_gapped)
Exemplo n.º 21
0
    def test_full(self):
        """RdbParser: full data, valid and invalid"""
        def check_record(observed, expected):
            # a record matches when the sequence, its string form and its
            # Info all compare equal
            self.assertEqual(observed, expected)
            self.assertEqual(str(observed), str(expected))
            self.assertEqual(observed.Info, expected.Info)

        # expected records for the two valid entries
        first_info = Info({
            'Species': "unidentified Thermus OPB AF027020",
            'Refs': {'rRNA': ['AF027020']},
            'OriginalSeq': '-o[oGG-U{G}AA--C^GC]U---ACGU-Nooo---',
        })
        first = RnaSequence("-??GG-UGAA--CGCU---ACGU-N???---",
            Info=first_info)
        second_info = Info({
            'Species': 'Thermus silvanus X84211',
            'Refs': {'rRNA': ['X84211']},
            'OriginalSeq': '---CGAU[C(G){--UA}U]ACG-Nooo-',
        })
        second = RnaSequence("---CGAUCG--UAUACG-N???-", Info=second_info)

        # when only good record, should work independent of strict
        records = list(RdbParser(RDB_LINES_ONLY_GOOD.split('\n'),
            strict=True))
        self.assertEqual(len(records), 2)
        check_record(records[0], first)
        check_record(records[1], second)

        records = list(RdbParser(RDB_LINES_ONLY_GOOD.split('\n'),
            strict=False))
        self.assertEqual(len(records), 2)
        check_record(records[0], first)

        # when strict, should raise error on invalid record
        strict_parser = RdbParser(RDB_LINES_GOOD_BAD.split('\n'),
            strict=True)
        self.assertRaises(RecordError, list, strict_parser)

        # when not strict, the invalid record is skipped
        records = list(RdbParser(RDB_LINES_GOOD_BAD.split('\n'),
            strict=False))
        self.assertEqual(len(records), 2)
        check_record(records[0], first)
        check_record(records[1], second)
Exemplo n.º 22
0
 def test_ungapped_to_gapped_general(self):
     """ungapped_to_gapped: should return object of right type
     """
     # self.gapped, self.simple and self.simple_g are fixtures defined
     # outside this method. unittest assertions are used throughout
     # because bare asserts are stripped when Python runs with -O.
     s = RnaSequence(self.gapped)
     p = self.simple
     # in case of RnaSequence: the same object comes back, still an
     # RnaSequence
     obs_seq, obs_pairs = ungapped_to_gapped(s, p)
     self.assertTrue(obs_seq is s)
     self.assertEqualItems(obs_pairs, self.simple_g)
     self.assertTrue(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_pairs, Pairs))
     # in case of str input: the same string comes back and is not
     # converted to an RnaSequence
     s = self.gapped
     obs_seq, obs_pairs = ungapped_to_gapped(s, p)
     self.assertTrue(obs_seq is s)
     self.assertEqualItems(obs_pairs, self.simple_g)
     self.assertFalse(isinstance(obs_seq, RnaSequence))
     self.assertTrue(isinstance(obs_seq, str))
     self.assertTrue(isinstance(obs_pairs, Pairs))
Exemplo n.º 23
0
    def test_adjust_pairs_from_mapping(self):
        """adjust_pairs_from_mapping: should work both ways"""
        gapped_string = 'UC-AG-UC-CG-A-'
        ungapped_coords = [(0, 8), (1, 6), (2, 5)]
        gapped_coords = [(0, 12), (1, 9), (3, 7)]

        # ungapped to gapped
        seq = RnaSequence(gapped_string)
        mapping = seq.gapMaps()[0]
        # {0: 0, 1: 1, 2: 3, 3: 4, 4: 6, 5: 7, 6: 9, 7: 10, 8: 12}
        source = Pairs(ungapped_coords)
        target = Pairs(gapped_coords)
        self.assertEqualItems(
            adjust_pairs_from_mapping(source, mapping), target)

        # gapped to ungapped
        seq = RnaSequence(gapped_string)
        mapping = seq.gapMaps()[1]
        # {0: 0, 1: 1, 3: 2, 4: 3, 6: 4, 7: 5, 9: 6, 10: 7, 12: 8}
        source = Pairs(gapped_coords)
        target = Pairs(ungapped_coords)
        self.assertEqualItems(
            adjust_pairs_from_mapping(source, mapping), target)
Exemplo n.º 24
0
def rna_distance(first, second):
    """Return first.fracDiff(second) after wrapping first in RnaSequence.

    Only `first` is converted; `second` is handed to fracDiff as given.
    """
    return RnaSequence(first).fracDiff(second)
Exemplo n.º 25
0
def rna_distance(first,second):
    """Wrap `first` in an RnaSequence and return its fracDiff to `second`.

    `second` is passed through to fracDiff without conversion.
    """
    as_rna = RnaSequence(first)
    distance = as_rna.fracDiff(second)
    return distance
Exemplo n.º 26
0
def Rna(x, Info=None):
    """Build an RnaSequence from x, upper-cased and with T replaced by U.

    x -- a string, or a list of strings that is joined into one string.
    Info -- optional data passed through InfoClass; an empty dict is used
        when it is None.
    """
    residues = ''.join(x) if isinstance(x, list) else x
    info_data = {} if Info is None else Info
    normalized = residues.upper().replace('T','U')
    return RnaSequence(normalized, Info=InfoClass(info_data))
Exemplo n.º 27
0
    def test_fromPair(self):
        """Counts fromPair should return correct counts."""
        s = Counts.fromPair(RnaSequence('UCCGAUCGAUUAUCGGGUACGUA'),
                            RnaSequence('GUCGAGUAUAGCGUACGGCUACG'),
                            RnaPairs)

        # unittest assertion rather than a bare assert, which would be
        # stripped when Python runs with -O
        self.assertTrue(isinstance(s, Counts))

        # (row char, column char, expected count) for every RNA pair
        vals = [
            ('U', 'U', 0),
            ('U', 'C', 2.5),
            ('U', 'A', 1),
            ('U', 'G', 2.5),
            ('C', 'U', 2.5),
            ('C', 'C', 1),
            ('C', 'A', 1),
            ('C', 'G', 0.5),
            ('A', 'U', 1),
            ('A', 'C', 1),
            ('A', 'A', 1),
            ('A', 'G', 2),
            ('G', 'U', 2.5),
            ('G', 'C', 0.5),
            ('G', 'A', 2),
            ('G', 'G', 2),
        ]
        for i, j, val in vals:
            self.assertFloatEqual(s[i, j], val)

        # check that it works for big seqs
        s = Counts.fromPair(RnaSequence('UCAG'*1000),
                            RnaSequence('UGAG'*1000),
                            RnaPairs)

        self.assertTrue(isinstance(s, Counts))

        vals = [
            ('U', 'U', 1000),
            ('U', 'C', 0),
            ('U', 'A', 0),
            ('U', 'G', 0),
            ('C', 'U', 0),
            ('C', 'C', 0),
            ('C', 'A', 0),
            ('C', 'G', 500),
            ('A', 'U', 0),
            ('A', 'C', 0),
            ('A', 'A', 1000),
            ('A', 'G', 0),
            ('G', 'U', 0),
            ('G', 'C', 500),
            ('G', 'A', 0),
            ('G', 'G', 1000),
        ]
        for i, j, val in vals:
            self.assertFloatEqual(s[i, j], val)

        # check that it works for codon seqs: the raw count matrix should
        # sum to 2, with 0.5 in each of the four listed cells
        s1 = ModelRnaCodonSequence('UUCGCG')
        s2 = ModelRnaCodonSequence('UUUGGG')
        c = Counts.fromPair(s1, s2, RNA.Alphabet.Triples**2)
        self.assertEqual(c._data.sum(), 2)
        self.assertEqual(c._data[0, 1], 0.5)
        self.assertEqual(c._data[1, 0], 0.5)
        self.assertEqual(c._data[55, 63], 0.5)
        self.assertEqual(c._data[63, 55], 0.5)