예제 #1
0
 def test_012_bad_trimming(self):
     """A trim that leaves no usable window should yield an empty table."""
     table = scrappy.RawTable(self.one_signal)
     # NOTE(review): this window appears to be treated as invalid by
     # trim(), collapsing the table to empty — confirm against RawTable.
     table.trim(start=200, end=len(table._data) - 5)
     self.assertEqual(table.start, 0, 'Empty gives start=0.')
     self.assertEqual(table.end, 0, 'Empty gives end=0.')
     self.assertEqual(len(table.data(as_numpy=True)), 0,
                      'Empty give len=0 array')
예제 #2
0
def read_fast5_signal(fname: str) -> np.ndarray:
    """Read, trim and scale the raw signal from a single-read fast5 file.

    :param fname: path to a fast5 (HDF5) file containing one read under
        ``Raw/Reads/``.
    :return: 1D numpy array of the trimmed, scaled raw signal.
    """
    with h5py.File(fname, 'r') as input_data:
        raw_attr = input_data['Raw/Reads/']
        # A single-read fast5 has exactly one read group; take the first key.
        read_name = next(iter(raw_attr.keys()))
        # BUG FIX: Dataset.value was deprecated and removed in h5py 3.0;
        # read the whole dataset with [()] instead.
        raw_signal = np.array(raw_attr[read_name + "/Signal"][()])
        raw_signal = scrappy.RawTable(raw_signal).trim().scale().data(
            as_numpy=True)
        logger.debug(f"Read {fname} size: {len(raw_signal)}")
        return raw_signal
예제 #3
0
    def test_060_matrix_conversions(self):
        """Round-trip a posterior matrix through numpy and back to C."""
        raw = scrappy.RawTable(self.one_signal)
        raw.trim().scale()
        post = scrappy.calc_post(raw, self.model, log=True)
        round_trip = scrappy.ScrappyMatrix(
            post.data(as_numpy=True, sloika=False))

        # Decode both matrices as a check on the underlying C structure.
        before, after = (scrappy.decode_post(m)[1] for m in (post, round_trip))
        np.testing.assert_almost_equal(before, after, err_msg='Scores equal after round trip.')
예제 #4
0
 def test_011_raw_table_methods(self):
     """Exercise trim(), scale() and data() on a RawTable."""
     table = scrappy.RawTable(self.one_signal)
     # Trimming should move the start index forward.
     table.trim(start=200)
     self.assertEqual(table.start, 200, 'Trimming applied.')
     # Scaling should centre the retained samples on zero.
     table.scale()
     median = np.median(table._data[table.start:table.end])
     self.assertEqual(median, 0.0,
                      'Scaling shifts median to 0.0.')
     # The exported numpy array must own its buffer, not alias C memory.
     exported = table.data(as_numpy=True)
     self.assertTrue(exported.flags.owndata)
예제 #5
0
    def test_065_matrix_view(self):
        """Slicing a ScrappyMatrix yields views with consistent shapes/data."""
        raw = scrappy.RawTable(self.one_signal)
        raw.trim().scale()
        post = scrappy.calc_post(raw, self.model, log=True)

        full_view = post[:]
        self.assertSequenceEqual(post.shape, full_view.shape, 'All view is same shape.')
        window = post[100:200]
        self.assertSequenceEqual((100, post.shape[1]), window.shape, 'Slice has correct shape.')
        nested = window[10:50]
        self.assertSequenceEqual((40, post.shape[1]), nested.shape, 'Slice of slice has correct shape.')

        # A slice of a slice addresses rows 110:150 of the parent matrix.
        full_np = post.data(as_numpy=True, sloika=False)
        nested_np = nested.data(as_numpy=True, sloika=False)
        np.testing.assert_allclose(full_np[110:150,], nested_np, err_msg='Slice contains correct data.')
예제 #6
0
def get_chunks(fname: str, cfg: EmbeddingCfg) -> List[np.ndarray]:
    """Load a read's raw signal and cut it into fixed-size strided chunks.

    :param fname: path to a fast5 (HDF5) file containing one read.
    :param cfg: embedding config supplying ``receptive_field`` (chunk length)
        and ``stride`` (step between chunk starts).
    :return: list of 1D arrays, each ``cfg.receptive_field`` samples long.
    """
    with h5py.File(fname, 'r') as f:
        raw_dat = list(f['/Raw/Reads/'].values())[0]
        # BUG FIX: Dataset.value was deprecated and removed in h5py 3.0;
        # read the whole dataset with [()] instead.
        raw_dat = np.array(raw_dat['Signal'][()])
        raw_dat_processed = scrappy.RawTable(raw_dat).trim().scale().data(
            as_numpy=True
        )

        # One chunk per stride; any tail shorter than a full receptive
        # field is dropped because range() stops before it.
        return [
            raw_dat_processed[i:i + cfg.receptive_field]
            for i in range(
                0,
                raw_dat_processed.shape[0] - cfg.receptive_field,
                cfg.stride,
            )
        ]
예제 #7
0
    def test_020_intermediates(self):
        """Check the intermediate cffi objects and their numpy conversions."""
        raw = scrappy.RawTable(self.one_signal)
        self.assertIsInstance(raw._rt, scrappy.ffi.CData)
        raw.trim().scale()
        post = scrappy.calc_post(raw.data(), self.model, log=True)
        self.assertIsInstance(post, scrappy.ffi.CData)

        # The raw posterior should convert to a sanely shaped numpy matrix.
        sloika_post = scrappy.scrappie_to_numpy(post, sloika=True)
        self.assertIsInstance(sloika_post, np.ndarray)
        self.assertEqual(sloika_post.shape[1], self.expected_states)

        # Decoding must not leak the cffi abstraction into return types.
        seq, score, pos = scrappy.decode_post(post, self.model)
        self.assertIsInstance(seq, str, 'sequence is str.')
        self.assertIsInstance(score, float, 'score is float.')
        self.assertIsInstance(pos, np.ndarray, 'pos is ndarray.')

        # The bare C matrix is not garbage collected; free it explicitly.
        scrappy.free_matrix(post)
예제 #8
0
def worker(args, noise=None):
    """Simulate a raw squiggle for one sequence and basecall it.

    :param args: tuple ``(seq, ref, start, end, strand)`` describing the
        sequence to simulate and where it came from.
    :param noise: optional stdev of extra Gaussian noise added to the signal.
    :return: a FASTA-formatted string with the basecall and its metadata.
    """
    model = 'rgrgr_r94'
    seq, ref, start, end, strand = args
    squiggle = scrappy.sequence_to_squiggle(seq,
                                            rescale=True).data(as_numpy=True,
                                                               sloika=False)

    # Laplace scale b = stdv / sqrt(2) gives a draw with the requested stdv.
    b_factor = 1 / np.sqrt(2)
    per_event = [
        np.random.laplace(mean, b_factor * stdv, int(dwell))
        for mean, stdv, dwell in squiggle
    ]
    raw_data = np.concatenate(per_event)
    if noise is not None:
        raw_data += np.random.normal(scale=noise, size=len(raw_data))

    raw = scrappy.RawTable(raw_data)
    raw.scale()
    post = scrappy.calc_post(raw, model, log=True)
    call, score, _ = scrappy.decode_post(post, model)
    return '>call_{}:{}-{}({}) seq_len={} call_len={} score={}\n{}'.format(
        ref, start, end, strand, len(seq), len(call), score, call)
예제 #9
0
    def test_046_post_viterbi_mapping(self):
        """Viterbi mapping of a posterior onto a reference, banded vs full."""
        rt = scrappy.RawTable(self.one_signal)
        rt.trim().scale()
        post = scrappy.calc_post(rt, self.model, log=True)

        t0 = now()
        score_band, _ = scrappy.map_post_to_sequence(
            post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
            viterbi=True, path=False, bands=100)
        t1 = now()
        score_no_band, _ = scrappy.map_post_to_sequence(
            post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
            viterbi=True, path=False, bands=None)
        t2 = now()
        self.assertIsInstance(score_no_band, float, 'score is float.')
        # BUG FIX: the unbanded run took t2 - t1. The old comparison against
        # t2 - t0 was vacuously true (t2 - t0 >= t1 - t0 always holds).
        self.assertLess(t1 - t0, t2 - t1, 'banded mapping is faster.')

        # With path=True, Viterbi mapping also returns the alignment path.
        score_band, path = scrappy.map_post_to_sequence(
            post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
            viterbi=True, path=True, bands=100)
        self.assertIsInstance(path, np.ndarray, 'path is ndarray.')
예제 #10
0
    def test_045_post_forward_mapping(self):
        """Forward mapping scores, banded vs full; path=True must raise."""
        rt = scrappy.RawTable(self.one_signal)
        rt.trim().scale()
        post = scrappy.calc_post(rt, self.model, log=True)

        t0 = now()
        score_band, _ = scrappy.map_post_to_sequence(
            post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
            viterbi=False, path=False, bands=100)
        t1 = now()
        score_no_band, _ = scrappy.map_post_to_sequence(
            post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
            viterbi=False, path=False, bands=None)
        t2 = now()
        self.assertIsInstance(score_no_band, float, 'score is float.')
        # BUG FIX: the unbanded run took t2 - t1. The old comparison against
        # t2 - t0 was vacuously true (t2 - t0 >= t1 - t0 always holds).
        self.assertLess(t1 - t0, t2 - t1, 'banded mapping is faster.')

        with self.assertRaises(ValueError):
            # can't calculate path with Forward
            score_no_band = scrappy.map_post_to_sequence(
                post, self.one_ref, stay_pen=0, skip_pen=0, local_pen=4.0,
                viterbi=False, path=True, bands=None)
예제 #11
0
 def test_010_raw_table_type(self):
     """RawTable should coerce both float widths to the library dtype."""
     for dtype in (np.float32, np.float64):
         table = scrappy.RawTable(self.one_signal.astype(dtype))
         self.assertEqual(table._data.dtype, scrappy.ftype,
                          'Raw table internal type.')
예제 #12
0
def produce_datapoints(cfg: InputFeederCfg, fnames: List[str], repeat=True):
    """Yield training examples (or ValueError markers) from gzipped DataPoints.

    Pushes single instances to the queue of the form:
        signal[None,], labels [None,]
    That is 1D numpy array

    Good segments are yielded as ``[signal_segment, labels]`` (two 1D numpy
    arrays); segments failing validation are yielded as ``ValueError``
    instances so the consumer can count/skip them without killing the
    generator.

    :param cfg: feeder config (seq_length, ratio, min/max sizes, ...)
    :param fnames: paths to gzipped serialized ``dataset_pb2.DataPoint``
    :param repeat: when True, reshuffle and loop over the files forever
    :return:
    """
    random.seed(os.urandom(20))
    for _epoch in itertools.count(1):
        random.shuffle(fnames)
        for x in fnames:
            with gzip.open(x, "r") as f:
                dp = dataset_pb2.DataPoint()
                dp.ParseFromString(f.read())
                signal = np.array(dp.signal, dtype=np.float32)
                # Scale only (no trim) so label coordinates stay aligned
                # with signal indices; the assert guards that invariant.
                signal = scrappy.RawTable(signal).scale().data(as_numpy=True)
                assert len(signal) == len(dp.signal), "Trimming occured"
                if len(signal) < cfg.min_signal_size:
                    yield ValueError(
                        f"Signal too short {len(dp.signal)} < {cfg.min_signal_size}"
                    )
                    continue

                label_idx = 0
                for start in range(0, len(signal), cfg.seq_length):
                    buff = []
                    # Skip labels that ended before this window begins.
                    while label_idx < len(
                        dp.labels
                    ) and dp.labels[label_idx].upper < start:
                        label_idx += 1
                    # Collect labels starting inside [start, start+seq_length).
                    while label_idx < len(
                        dp.labels
                    ) and dp.labels[label_idx].lower < start + cfg.seq_length:
                        buff.append(dp.labels[label_idx].pair)

                        # Sanity check
                        assert start <= dp.labels[label_idx].lower
                        assert start <= dp.labels[label_idx].upper
                        assert dp.labels[label_idx
                                        ].lower <= start + cfg.seq_length

                        label_idx += 1

                    signal_segment = signal[start:start + cfg.seq_length]
                    if len(buff) == 0:
                        yield ValueError("Empty labels")
                    elif len(signal_segment) / cfg.ratio < len(buff):
                        yield ValueError(
                            f"max possible labels {len(signal_segment)/cfg.ratio}, have {len(buff)} labels.\n"
                            f"Signal len: {len(signal_segment)}, ratio: {cfg.ratio}"
                        )
                    elif len(buff) > cfg.max_label_size:
                        yield ValueError(
                            f"More labels {len(buff)} then allowed limit {cfg.max_label_size}"
                        )
                    else:
                        # BUG FIX: needless f-string with no placeholders;
                        # plain string keeps logging lazy and output identical.
                        logging.debug("produce_datapoints: yielding datapoint")
                        yield [
                            signal_segment,
                            np.array(buff, dtype=np.int32),
                        ]
        if not repeat:
            logging.info("Repeat is false, quiting")
            break