def predict(self, seq):
    """Predict the secondary structure of an RNA sequence.

    The model is evaluated on the sequence and on its reverse. Whichever
    orientation yields the highest score wins; its arg-max index is looked
    up in ``self.a`` to recover the bracket string. When the reversed
    orientation wins, the bracket string is passed through
    ``rna.dot_reverse`` before being attached to the original sequence.

    Args:
        seq: RNA sequence.

    Returns:
        m: Molecule object with predicted bracket notation. If no entry of
            ``self.a`` carries the winning index, the bracket string passed
            to the Molecule is empty.

    Raises:
        ValueError: If the ``self.library`` / ``self.data_model``
            combination is not one this method knows how to evaluate.
    """
    reversed_seq = seq[::-1]

    if self.library == 'mxnet':
        batch = np.array([rna.encode_rna(seq), rna.encode_rna(reversed_seq)])
        prob = self.model.predict(mx.io.NDArrayIter(batch))
    elif self.library == 'lasagne' and self.data_model == 'linear':
        prob = self.model(
            np.array([rna.encode_rna(seq), rna.encode_rna(reversed_seq)]))
    elif self.library == 'lasagne' and self.data_model == 'matrix':
        prob = self.model(np.array([
            rna.complementarity_matrix(rna.Molecule(seq)),
            rna.complementarity_matrix(rna.Molecule(reversed_seq)),
        ]))
    else:
        # Fail loudly instead of crashing later on an empty placeholder.
        raise ValueError(
            "Unsupported library/data model combination: {!r}/{!r}".format(
                self.library, self.data_model))

    # Row 0 scores the sequence as given, row 1 scores its reverse.
    if prob[0].max() > prob[1].max():
        best, backwards = prob[0].argmax(), False
    else:
        best, backwards = prob[1].argmax(), True

    # First label (in dict order) whose index matches the winning class.
    dot = next((label for label, index in self.a.items() if index == best), '')

    if backwards:
        dot = rna.dot_reverse(dot)
    return rna.Molecule(seq, dot)
def test_substrings_method(self):
    """Check what ``Molecule.get_substrings`` returns for a known hairpin.

    For length 6 the result must be a plain list holding exactly one
    molecule whose bracket string is ``(....)``; for length 3 the result
    must contain two items.
    """
    molecule = rna.Molecule("GGCCUGAGGAGACUCAGAAGCC", "((((((((....)))))..)))")

    hexamers = molecule.get_substrings(6)
    self.assertIs(type(hexamers), list)
    self.assertEqual(len(hexamers), 1)
    self.assertEqual(hexamers[0].dot, '(....)')

    trimers = molecule.get_substrings(3)
    self.assertEqual(len(trimers), 2)
def predict(self, molecule):
    """Predict molecule's secondary structure.

    Resets the network state for the given molecule, runs
    ``self.num_epoch`` epochs and then decodes the neuron activations into
    bracket notation. Each neuron index ``k`` is mapped to a position pair
    ``(x, y)``; a pair is written only while both positions are still
    unpaired in the structure being built.

    Note: if ``molecule.dot`` is empty it is replaced in place by an
    all-dots string of the sequence length.

    Args:
        molecule: Molecule object whose structure is to be predicted.

    Returns:
        A new Molecule with the same sequence and the decoded structure.
    """
    if not molecule.dot:
        molecule.dot = '.' * len(molecule.seq)

    # Reset per-molecule network state.
    self.t = self.start_t
    self.seq = molecule.seq
    self.n = len(molecule.seq)
    self.neurons = np.random.uniform(0, 0, self.n * (self.n - 1) // 2)
    self.w = self.compute_weights()

    for epoch_index in range(self.num_epoch):
        self.epoch()
        self.t += self.t / (epoch_index + 1)

    def with_pair(structure, left, right):
        """Return structure with '(' at left and ')' at right."""
        return ''.join((structure[:left], '(', structure[left + 1: right],
                        ')', structure[right + 1:]))

    structure = molecule.dot
    for k in range(len(self.neurons)):
        size = self.n
        # Map the flat neuron index k to its (x, y) position pair.
        root = math.sqrt(-8 * k + 4 * size * (size - 1) - 7)
        x = size - 2 - math.floor(root / 2.0 - 0.5)
        remaining = size - x
        y = int(k + x + 1 - size * (size - 1) / 2
                + remaining * (remaining - 1) / 2)

        if structure[x] != '.' or structure[y] != '.':
            continue

        # Pairs weighted self.ni need activation above 0.5 ...
        if self.node_weight(x, y) == self.ni and self.neurons[k] > 0.5:
            structure = with_pair(structure, x, y)
        # ... pairs weighted self.ni / 2 need activation above 0.7.
        if self.node_weight(x, y) == self.ni / 2 and self.neurons[k] > 0.7:
            structure = with_pair(structure, x, y)

    return rna.Molecule(molecule.seq, structure)
def preprocess(self):
    """Preprocess loaded data.

    Walks ``self.X`` and builds encoded inputs plus integer class labels.
    With ``self.substrings`` set, every substring of length
    ``self.sequence_length`` of each molecule becomes an example (always
    encoded with ``rna.encode_rna``). Otherwise only records whose sequence
    has exactly that length are used, encoded according to
    ``self.data_model`` ('linear' or 'matrix').

    If the reversed form of a structure (``rna.dot_reverse``) is already
    among the stored labels, the example is stored reversed so that both
    orientations share one label.

    Side effects:
        Sets ``self.a`` (bracket string -> class index, assigned in sorted
        order so the mapping is reproducible between runs) and
        ``self.num_labels``.

    Returns:
        X, y: NDArray of sequences and NDArray of labels, both limited to
            ``self.max_examples`` entries.
    """
    encoded = []
    labels = []
    seen = set()  # labels stored so far; gives O(1) mirror lookups

    for record in self.X:
        if self.substrings:
            parent = rna.Molecule(record[0, 0], record[0, 1])
            candidates = [
                (sub.seq, sub.dot)
                for sub in parent.get_substrings(self.sequence_length)
            ]
        elif len(record[0, 0]) == self.sequence_length:
            candidates = [(record[0, 0], record[0, 1])]
        else:
            continue

        for seq, dot in candidates:
            mirrored = rna.dot_reverse(dot)
            if mirrored in seen:
                seq, dot = seq[::-1], mirrored

            if self.substrings or self.data_model == 'linear':
                encoded.append(rna.encode_rna(seq))
            elif self.data_model == 'matrix':
                encoded.append(rna.complementarity_matrix(rna.Molecule(seq)))

            labels.append(dot)
            seen.add(dot)

    X = np.array(encoded)
    labels = labels[:self.max_examples]

    # Sorted order keeps label indices stable across interpreter runs
    # (plain set iteration order depends on string hashing).
    self.a = {dot: index for index, dot in enumerate(sorted(set(labels)))}
    self.num_labels = len(self.a)

    y = np.array([self.a[dot] for dot in labels])
    # Slice only the first axis so that an empty result does not raise.
    return X[:self.max_examples], y[:self.max_examples]
def test_complementarity_matrix_funtion(self):
    """Check selected entries and the symmetry of the complementarity matrix.

    For the test sequence the first base against the last two bases must
    score 2, against the third-from-last base 0, and the first 'G' against
    the first 'U' must score 1. The matrix must equal its transpose.
    """
    m = rna.Molecule("GGCCUGAGGAGACUCAGAAGCC", "((((((((....)))))..)))")
    p = rna.complementarity_matrix(m)
    last = len(m.seq) - 1

    self.assertEqual(p[0, last], 2)
    self.assertEqual(p[0, last - 1], 2)
    self.assertEqual(p[0, last - 2], 0)
    self.assertEqual(p[m.seq.find('G'), m.seq.find('U')], 1)
    # Compare element-wise: p.all() == p.T.all() is always true and
    # therefore never detected an asymmetric matrix.
    self.assertTrue((p == p.T).all())
def mutate(self, molecule):
    """Mutate molecule by inserting or deleting basepairs.

    Two random positions at least five apart are drawn. If the pair matrix
    has an empty row for the first and an empty column for the second, a
    '(' / ')' pair is written at those positions. If the two positions are
    currently paired with each other, the pair is replaced by dots and the
    result is mutated again recursively.

    Args:
        molecule: Molecule that should be mutated.

    Returns:
        mutated: Mutated Molecule.
    """
    pairs = rna.pair_matrix(molecule)
    sequence = molecule.seq
    structure = molecule.dot
    size = len(sequence)

    left = random.randrange(size - 5)
    right = random.randrange(left + 5, size)

    def rewrite(first, second):
        """Return the structure with the two chosen positions replaced."""
        return ''.join((structure[:left], first, structure[left + 1: right],
                        second, structure[right + 1:]))

    left_free = pairs[left, :].sum() == 0
    if left_free and pairs[:, right].sum() == 0:
        structure = rewrite('(', ')')

    if pairs[left, right] == 1:
        structure = rewrite('.', '.')
        # A deletion alone is not the final result: mutate once more.
        structure = self.mutate(rna.Molecule(sequence, structure)).dot

    return rna.Molecule(sequence, structure)
def train(self, X=None, eta=0.001, limit=10, num_iter=5, log=False):
    """Train predictor on example data.

    For every example the network first predicts the structure, then the
    known pair matrix is flattened (upper triangle, row by row) and
    compared against the neuron state. The resulting difference is
    subtracted from ``self.alpha``, ``self.beta``, ``self.gamma`` or
    ``self.mi`` for every later neuron, depending on how the two neurons'
    coordinates relate.

    Args:
        X: List of Molecule objects that are known examples.
            (use loaded data if None; only sequences shorter than 50 are
            taken from the loaded data)
        eta: Learning ratio.
        limit: Maximum number of examples to train.
        num_iter: Number of training iterations.
        log: If true, print the trained sequence and the current
            parameters after each example.

    Raises:
        Exception: If X is None and no data has been loaded.
    """
    if X is None:
        if self.X.shape[0] == 0:
            raise Exception("Too few examples.")
        X = [
            rna.Molecule(row[0, 0], row[0, 1])
            for row in self.X
            if len(row[0, 0]) < 50
        ]
    X = X[:limit]

    for _ in range(num_iter):
        for s in X:
            # predict() sets self.n, self.neurons and self.w for s.
            self.predict(s)
            pair = rna.pair_matrix(s)
            example = np.array([
                pair[i, j]
                for i in range(self.n)
                for j in range(i + 1, self.n)
            ])

            total = len(self.neurons)
            for i in range(total):
                r, c = self.get_upper_triangular_coordinates(i)
                # Depends only on neuron i, so it is computed once here
                # instead of once per (i, j) combination.
                dif = eta * (math.tanh(np.dot(example, self.w[i]))
                             - math.tanh(np.dot(self.neurons, self.w[i])))
                for j in range(i + 1, total):
                    x, y = self.get_upper_triangular_coordinates(j)
                    if r == x:
                        self.alpha -= dif
                    elif c == y:
                        self.beta -= dif
                    # NOTE(review): this compares neuron indices i, j with
                    # coordinates r, c; a crossing test would presumably be
                    # `r < x < c < y or x < r < y < c` - confirm before
                    # changing, behaviour is kept as is.
                    elif r < i < c < j or i < r < j < c:
                        self.gamma -= dif
                    else:
                        self.mi -= dif

            if log:
                print("Sequence {} trained...".format(s))
                print(self.alpha, self.beta, self.gamma, self.mi)
def test_constructor_no_bracket(self):
    """A Molecule built from a sequence alone keeps that sequence."""
    sequence = 'AUGC'
    built = rna.Molecule(sequence)
    self.assertEqual(built.seq, sequence)
def test_constructor_incorrect_seq(self):
    """Constructing a Molecule from the sequence 'AUGB' must raise."""
    invalid_sequence = 'AUGB'
    self.assertRaises(Exception, rna.Molecule, invalid_sequence)
def test_constructor_with_bracket(self):
    """A Molecule built with a structure keeps its sequence, and the
    stored sequence and structure have equal length."""
    sequence, structure = 'AUGC', '(..)'
    built = rna.Molecule(sequence, structure)
    self.assertEqual(built.seq, sequence)
    self.assertEqual(len(built.seq), len(built.dot))
def test_constructor_with_incorrect_bracket2(self):
    """Constructing a Molecule with the structure ')..(' must raise."""
    sequence, inverted_structure = 'AUGC', ')..('
    self.assertRaises(Exception, rna.Molecule, sequence, inverted_structure)
def test_pair_matrix_funtion(self):
    """Check the entry sum and the symmetry of the pair matrix.

    For the test structure the matrix entries must sum to 16, and the
    matrix must equal its transpose.
    """
    m = rna.Molecule("GGCCUGAGGAGACUCAGAAGCC", "((((((((....)))))..)))")
    p = rna.pair_matrix(m)
    self.assertEqual(p.sum(), 16)
    # Compare element-wise: p.all() == p.T.all() is always true and
    # therefore never detected an asymmetric matrix.
    self.assertTrue((p == p.T).all())
def test_dot_reverse_function(self):
    """The reversed structure starts with '(((.' and has as many opening
    as closing brackets."""
    molecule = rna.Molecule("GGCCUGAGGAGACUCAGAAGCC", "((((((((....)))))..)))")
    reversed_dot = rna.dot_reverse(molecule.dot)

    prefix = reversed_dot[:4]
    self.assertEqual(prefix, '(((.')

    opening = reversed_dot.count('(')
    closing = reversed_dot.count(')')
    self.assertEqual(opening, closing)
def test_match_parentheses_function(self):
    """``match_parentheses`` called with index 3 of the test structure
    must return 16."""
    molecule = rna.Molecule("GGCCUGAGGAGACUCAGAAGCC", "((((((((....)))))..)))")
    partner = rna.match_parentheses(molecule.dot, 3)
    self.assertEqual(partner, 16)
def test_evaluate_method(self):
    """The reference structure must evaluate strictly higher than two
    alternative structures of the same sequence."""
    sequence = "GGCCUGAGGAGACUCAGAAGCC"
    reference = rna.Molecule(sequence, "((((((((....)))))..)))")
    first_variant = rna.Molecule(sequence, ".(((((((....)))))..)).")
    second_variant = rna.Molecule(sequence, "(((.(..((....))..).)))")

    self.assertGreater(reference.evaluate(), first_variant.evaluate())
    self.assertGreater(reference.evaluate(), second_variant.evaluate())
def test_repair_method(self):
    """``repair`` must rewrite the given structure into the expected one."""
    broken = rna.Molecule("GGCCUGAGGAGACUCAGAAGCA", "(((((((((..))))))..)))")
    broken.repair()
    expected_structure = ".(((((((....)))))..))."
    self.assertEqual(broken.dot, expected_structure)
def test_show_method(self):
    """``show`` must raise for a Molecule built from a sequence only."""
    sequence_only = rna.Molecule('AGGCU')
    self.assertRaises(Exception, sequence_only.show)