def remap(read_ref, ev, min_prob, kmer_len, prior, slip):
    """Map an event table onto a reference sequence with the transducer.

    Features are built from `ev`, turned into a posterior matrix via
    `calc_post` / `sloika.decode.prepare_post`, and aligned against the
    k-mers of `read_ref` with `sloika.transducer.map_to_sequence`.

    :param read_ref: reference sequence, split into k-mers of `kmer_len`
    :param ev: event table; returned with extra fields appended
    :param min_prob: passed through to `prepare_post`
    :param kmer_len: length of the k-mers taken from `read_ref`
    :param prior: pair (initial, final); each entry is either None (no
        prior) or the parameter given to `sloika.util.geometric_prior`
    :param slip: passed through to `map_to_sequence`

    :returns: tuple (score, ev, path, seq) where `ev` has gained the
        fields 'seq_pos', 'kmer' and 'good_emission'
    """
    features = sloika.features.from_events(ev, tag='')
    features = np.expand_dims(features, axis=1)
    posterior = sloika.decode.prepare_post(
        calc_post(features), min_prob=min_prob, drop_bad=False)

    ref_kmers = np.array(bio.seq_to_kmers(read_ref, kmer_len))
    # State labels are shifted up by one relative to kmer_to_state
    # (presumably index 0 is reserved by the transducer -- TODO confirm).
    seq = [kmer_to_state[kmer] + 1 for kmer in ref_kmers]
    n_positions = len(seq)

    initial_prior = prior[0]
    if initial_prior is not None:
        initial_prior = sloika.util.geometric_prior(n_positions, initial_prior)

    final_prior = prior[1]
    if final_prior is not None:
        final_prior = sloika.util.geometric_prior(
            n_positions, final_prior, rev=True)

    score, path = sloika.transducer.map_to_sequence(
        posterior, seq, slip=slip,
        prior_initial=initial_prior, prior_final=final_prior, log=False)

    # Every event is flagged as a good emission.
    new_names = ['seq_pos', 'kmer', 'good_emission']
    new_columns = [path, ref_kmers[path], np.ones(len(ev), dtype=bool)]
    ev = nprf.append_fields(ev, new_names, new_columns)

    return (score, ev, path, seq)
def test_de_bruijn_allkmers(self):
    """The padded de Bruijn sequence splits into alpha**dblen k-mers."""
    alpha, dblen = 4, 2
    symbols = bio.de_bruijn(alpha, dblen, pad=True)
    debruijn_seq = ''.join(map(str, symbols))
    kmers = bio.seq_to_kmers(debruijn_seq, dblen)
    expected_count = alpha ** dblen
    self.assertEqual(len(kmers), expected_count)
def test_de_bruijn_noduplicates(self):
    """No k-mer of the padded de Bruijn sequence occurs more than once."""
    alpha = 4
    dblen = 2
    debruijn_seq = ''.join(
        [str(y) for y in bio.de_bruijn(alpha, dblen, pad=True)])
    all_kmers = bio.seq_to_kmers(debruijn_seq, dblen)
    # assertEqual (rather than assertTrue on a comparison) reports both
    # counts when the check fails.
    self.assertEqual(len(all_kmers), len(set(all_kmers)))
def raw_remap(ref, signal, min_prob, kmer_len, prior, slip):
    """Map raw signal to reference sequence using transducer model.

    The signal is centred on its median and scaled by `mad(signal)`, run
    through `batch.calc_post` / `sloika.decode.prepare_post`, and aligned
    against the k-mers of `ref` with `sloika.transducer.map_to_sequence`.
    A mapping table with one row per row of the posterior is then built
    and passed through `trim_signal_and_mapping`.

    :param ref: reference sequence, split into k-mers of `kmer_len`
    :param signal: 1D numpy array of raw signal
    :param min_prob: passed through to `prepare_post`
    :param kmer_len: length of the k-mers taken from `ref`
    :param prior: pair (initial, final); each entry is either None (no
        prior) or the parameter given to `sloika.util.geometric_prior`
    :param slip: passed through to `map_to_sequence`

    :returns: tuple (score, mapping_table, path, seq); `mapping_table` is
        a structured array with fields 'start', 'length', 'seq_pos',
        'move', 'kmer' and 'good_emission'
    """
    from sloika import config  # local import to avoid CUDA init in main thread

    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)
    post = sloika.decode.prepare_post(batch.calc_post(inMat),
                                      min_prob=min_prob, drop_bad=False)

    kmers = np.array(bio.seq_to_kmers(ref, kmer_len))
    # State labels are shifted up by one relative to batch.kmer_to_state
    # (presumably index 0 is reserved by the transducer -- TODO confirm).
    seq = [batch.kmer_to_state[k] + 1 for k in kmers]
    prior0 = None if prior[0] is None else sloika.util.geometric_prior(
        len(seq), prior[0])
    prior1 = None if prior[1] is None else sloika.util.geometric_prior(
        len(seq), prior[1], rev=True)

    score, path = sloika.transducer.map_to_sequence(
        post, seq, slip=slip,
        prior_initial=prior0, prior_final=prior1, log=False)

    mapping_dtype = [
        ('start', '<i8'),
        ('length', '<i8'),
        ('seq_pos', '<i8'),
        ('move', '<i8'),
        ('kmer', 'S{}'.format(kmer_len)),
        ('good_emission', '?'),
    ]
    mapping_table = np.zeros(post.shape[0], dtype=mapping_dtype)

    # Number of signal samples per posterior row, rounded up.
    stride = int(np.ceil(signal.shape[0] / float(post.shape[0])))
    # Builtin `int` replaces the `np.int` alias, which was removed from
    # NumPy; the alias pointed at the builtin, so the dtype is unchanged.
    # NOTE(review): the arange has ceil(len(signal) / stride) entries, which
    # must match post.shape[0] for this assignment -- confirm this holds
    # for every model stride.
    mapping_table['start'] = np.arange(
        0, signal.shape[0], stride, dtype=int) - stride // 2
    mapping_table['length'] = stride
    mapping_table['seq_pos'] = path
    # First difference of the path; the first row is given a move of 1.
    mapping_table['move'] = np.ediff1d(path, to_begin=1)
    mapping_table['kmer'] = kmers[path]
    # We set 'good_emission' for compatibility only
    mapping_table['good_emission'] = True

    # Only the mapping table is kept; the returned signal is discarded.
    _, mapping_table = trim_signal_and_mapping(
        signal, mapping_table, 0, len(signal))

    return (score, mapping_table, path, seq)
def test_seq_to_kmers_returns_correct(self):
    """seq_to_kmers with k=10 on the fixture sequence matches kmers1."""
    observed = bio.seq_to_kmers(self.base_seq, 10)
    self.assertEqual(observed, self.kmers1)