def test_012_bad_trimming(self):
    rt = scrappy.RawTable(self.one_signal)
    rt.trim(start=200, end=len(rt._data) - 5)
    self.assertEqual(rt.start, 0, 'Empty gives start=0.')
    self.assertEqual(rt.end, 0, 'Empty gives end=0.')
    self.assertEqual(
        len(rt.data(as_numpy=True)), 0, 'Empty gives len=0 array.')
def read_fast5_signal(fname: str) -> np.ndarray:
    with h5py.File(fname, 'r') as input_data:
        raw_attr = input_data['Raw/Reads/']
        read_name = list(raw_attr.keys())[0]
        # read the raw signal dataset into memory; [()] replaces the
        # `.value` accessor removed in h5py 3
        raw_signal = np.array(raw_attr[read_name + "/Signal"][()])
        raw_signal = scrappy.RawTable(raw_signal).trim().scale().data(
            as_numpy=True)
        logger.debug(f"Read {fname} size: {len(raw_signal)}")
        return raw_signal
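# A minimal usage sketch for read_fast5_signal (not part of the original
# code); 'read.fast5' is a placeholder path, and module-level imports of
# h5py, numpy, scrappy, and a configured `logger` are assumed.
def _demo_read_fast5_signal():
    signal = read_fast5_signal('read.fast5')  # trimmed, scaled 1D array
    print(signal.shape, signal.dtype)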
def test_060_matrix_conversions(self):
    rt = scrappy.RawTable(self.one_signal)
    rt.trim().scale()
    original = scrappy.calc_post(rt, self.model, log=True)
    from_np = scrappy.ScrappyMatrix(
        original.data(as_numpy=True, sloika=False))
    # decoding both as a check on the C structure
    scores = [scrappy.decode_post(x)[1] for x in (original, from_np)]
    np.testing.assert_almost_equal(
        scores[0], scores[1], err_msg='Scores equal after round trip.')
def test_011_raw_table_methods(self):
    rt = scrappy.RawTable(self.one_signal)
    # check trim
    rt.trim(start=200)
    self.assertEqual(rt.start, 200, 'Trimming applied.')
    # check scale
    rt.scale()
    self.assertEqual(
        np.median(rt._data[rt.start:rt.end]), 0.0,
        'Scaling shifts median to 0.0.')
    # .data(as_numpy=True) should own its data
    new_data = rt.data(as_numpy=True)
    self.assertTrue(new_data.flags.owndata)
def test_065_matrix_view(self):
    rt = scrappy.RawTable(self.one_signal)
    rt.trim().scale()
    original = scrappy.calc_post(rt, self.model, log=True)

    all_view = original[:]
    self.assertSequenceEqual(
        original.shape, all_view.shape, 'All view is same shape.')
    small_view = original[100:200]
    self.assertSequenceEqual(
        (100, original.shape[1]), small_view.shape,
        'Slice has correct shape.')
    smaller_view = small_view[10:50]
    self.assertSequenceEqual(
        (40, original.shape[1]), smaller_view.shape,
        'Slice of slice has correct shape.')

    np_original = original.data(as_numpy=True, sloika=False)
    np_smaller = smaller_view.data(as_numpy=True, sloika=False)
    np.testing.assert_allclose(
        np_original[110:150], np_smaller,
        err_msg='Slice contains correct data.')
def get_chunks(fname: str, cfg: EmbeddingCfg) -> List[np.ndarray]:
    with h5py.File(fname, 'r') as f:
        raw_dat = list(f['/Raw/Reads/'].values())[0]
        # [()] replaces the `.value` accessor removed in h5py 3
        raw_dat = np.array(raw_dat['Signal'][()])
        raw_dat_processed = scrappy.RawTable(raw_dat).trim().scale().data(
            as_numpy=True
        )
        # slide a window of cfg.receptive_field samples over the signal,
        # advancing by cfg.stride
        chunks = []
        for i in range(
            0,
            raw_dat_processed.shape[0] - cfg.receptive_field,
            cfg.stride,
        ):
            chunks.append(raw_dat_processed[i:i + cfg.receptive_field])
        return chunks
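# A hedged usage sketch for get_chunks (not from the original source).
# EmbeddingCfg is assumed to expose receptive_field and stride attributes,
# as used above; the constructor call and file path here are hypothetical.
def _demo_get_chunks():
    cfg = EmbeddingCfg(receptive_field=100, stride=10)  # hypothetical values
    chunks = get_chunks('read.fast5', cfg)
    # each chunk is a receptive_field-long window, offset by stride samples
    print(len(chunks), chunks[0].shape if chunks else None)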
def test_020_intermediates(self):
    rt = scrappy.RawTable(self.one_signal)
    self.assertIsInstance(rt._rt, scrappy.ffi.CData)
    rt.trim().scale()
    post = scrappy.calc_post(rt.data(), self.model, log=True)
    self.assertIsInstance(post, scrappy.ffi.CData)

    # Check matrix is formed sanely
    sloika_post = scrappy.scrappie_to_numpy(post, sloika=True)
    self.assertIsInstance(sloika_post, np.ndarray)
    self.assertEqual(sloika_post.shape[1], self.expected_states)

    # check types, shouldn't leak cffi abstraction
    seq, score, pos = scrappy.decode_post(post, self.model)
    self.assertIsInstance(seq, str, 'sequence is str.')
    self.assertIsInstance(score, float, 'score is float.')
    self.assertIsInstance(pos, np.ndarray, 'pos is ndarray.')

    scrappy.free_matrix(post)
def worker(args, noise=None):
    model = 'rgrgr_r94'
    seq, ref, start, end, strand = args
    squiggle = scrappy.sequence_to_squiggle(
        seq, rescale=True).data(as_numpy=True, sloika=False)
    # a Laplace with scale b has stdev b*sqrt(2), so divide the model's
    # stdv by sqrt(2) to sample events with the intended spread
    n = 1 / np.sqrt(2)
    raw_data = np.concatenate([
        np.random.laplace(mean, n * stdv, int(dwell))
        for mean, stdv, dwell in squiggle
    ])
    if noise is not None:
        raw_data += np.random.normal(scale=noise, size=len(raw_data))
    raw = scrappy.RawTable(raw_data)
    raw.scale()
    post = scrappy.calc_post(raw, model, log=True)
    call, score, _ = scrappy.decode_post(post, model)
    return '>call_{}:{}-{}({}) seq_len={} call_len={} score={}\n{}'.format(
        ref, start, end, strand, len(seq), len(call), score, call)
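# A sketch of driving worker over several regions in parallel (an assumed
# harness, not the original one); the (seq, ref, start, end, strand) tuple
# below is a dummy input.
def _demo_worker():
    import multiprocessing
    jobs = [('ACGTACGTACGT', 'chr1', 0, 12, '+')]  # hypothetical region
    with multiprocessing.Pool() as pool:
        for fasta_record in pool.imap(worker, jobs):
            print(fasta_record)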
def test_046_post_viterbi_mapping(self):
    rt = scrappy.RawTable(self.one_signal)
    rt.trim().scale()
    post = scrappy.calc_post(rt, self.model, log=True)

    t0 = now()
    score_band, _ = scrappy.map_post_to_sequence(
        post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
        viterbi=True, path=False, bands=100)
    t1 = now()
    score_no_band, _ = scrappy.map_post_to_sequence(
        post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
        viterbi=True, path=False, bands=None)
    t2 = now()
    self.assertIsInstance(score_no_band, float, 'score is float.')
    # compare banded elapsed time against unbanded elapsed time
    self.assertLess(t1 - t0, t2 - t1, 'banded mapping is faster.')

    score_band, path = scrappy.map_post_to_sequence(
        post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
        viterbi=True, path=True, bands=100)
    self.assertIsInstance(path, np.ndarray, 'path is ndarray.')
def test_045_post_forward_mapping(self):
    rt = scrappy.RawTable(self.one_signal)
    rt.trim().scale()
    post = scrappy.calc_post(rt, self.model, log=True)

    t0 = now()
    score_band, _ = scrappy.map_post_to_sequence(
        post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
        viterbi=False, path=False, bands=100)
    t1 = now()
    score_no_band, _ = scrappy.map_post_to_sequence(
        post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
        viterbi=False, path=False, bands=None)
    t2 = now()
    self.assertIsInstance(score_no_band, float, 'score is float.')
    # compare banded elapsed time against unbanded elapsed time
    self.assertLess(t1 - t0, t2 - t1, 'banded mapping is faster.')

    with self.assertRaises(ValueError):
        # can't calculate a path with the forward algorithm
        score_no_band = scrappy.map_post_to_sequence(
            post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
            viterbi=False, path=True, bands=None)
def test_010_raw_table_type(self):
    # type should be correct
    for t in (np.float32, np.float64):
        rt = scrappy.RawTable(self.one_signal.astype(t))
        self.assertEqual(
            rt._data.dtype, scrappy.ftype, 'Raw table internal type.')
def produce_datapoints(cfg: InputFeederCfg, fnames: List[str], repeat=True):
    """
    Yields single instances of the form signal [None,], labels [None,],
    that is, 1D numpy arrays. Malformed instances are yielded as
    ValueError objects for the caller to handle.

    :param cfg: input feeder configuration
    :param fnames: files to read datapoints from
    :param repeat: cycle over the files indefinitely if True
    """
    random.seed(os.urandom(20))
    for cnt in itertools.count(1):
        random.shuffle(fnames)
        for x in fnames:
            with gzip.open(x, "r") as f:
                dp = dataset_pb2.DataPoint()
                dp.ParseFromString(f.read())
                signal = np.array(dp.signal, dtype=np.float32)
                signal = scrappy.RawTable(signal).scale().data(as_numpy=True)
                assert len(signal) == len(dp.signal), "Trimming occurred"

                if len(signal) < cfg.min_signal_size:
                    yield ValueError(
                        f"Signal too short {len(dp.signal)} < {cfg.min_signal_size}"
                    )
                    continue

                label_idx = 0
                for start in range(0, len(signal), cfg.seq_length):
                    buff = []
                    # skip labels that end before this window starts
                    while (label_idx < len(dp.labels)
                           and dp.labels[label_idx].upper < start):
                        label_idx += 1
                    # collect labels that begin within this window
                    while (label_idx < len(dp.labels)
                           and dp.labels[label_idx].lower < start + cfg.seq_length):
                        buff.append(dp.labels[label_idx].pair)
                        # Sanity check
                        assert start <= dp.labels[label_idx].lower
                        assert start <= dp.labels[label_idx].upper
                        assert dp.labels[label_idx].lower <= start + cfg.seq_length
                        label_idx += 1

                    signal_segment = signal[start:start + cfg.seq_length]
                    if len(buff) == 0:
                        yield ValueError("Empty labels")
                    elif len(signal_segment) / cfg.ratio < len(buff):
                        yield ValueError(
                            f"max possible labels {len(signal_segment)/cfg.ratio}, "
                            f"have {len(buff)} labels.\n"
                            f"Signal len: {len(signal_segment)}, ratio: {cfg.ratio}"
                        )
                    elif len(buff) > cfg.max_label_size:
                        yield ValueError(
                            f"More labels {len(buff)} than the allowed limit "
                            f"{cfg.max_label_size}"
                        )
                    else:
                        logging.debug("produce_datapoints: yielding datapoint")
                        yield [
                            signal_segment,
                            np.array(buff, dtype=np.int32),
                        ]
        if not repeat:
            logging.info("Repeat is false, quitting")
            break
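# A consumption sketch (an assumption, not original code): since the
# generator yields either [signal, labels] pairs or ValueError instances,
# the caller has to check each item's type. How InputFeederCfg is
# constructed is left to the caller.
def _demo_produce_datapoints(cfg: InputFeederCfg, fnames: List[str]):
    for item in produce_datapoints(cfg, fnames, repeat=False):
        if isinstance(item, ValueError):
            logging.warning(f"skipping datapoint: {item}")
            continue
        signal_segment, labels = item
        print(signal_segment.shape, labels.shape)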