def test_shuffled_ignore_missing(temp_file_1_name, temp_file_2_name):
    """Check that ``ignore_missing=True`` skips keys absent from a table.

    Six keys ("0" through "5") are requested. The token archive only holds
    keys "1", "3" and "4"; the boolean archive only holds "0", "1", "2" and
    "4". With one table the test expects three samples, with both tables it
    expects only the two keys present in both ("1" and "4").

    The expected values are listed in reverse key order, which is the order
    this test expects when ``NonRandomState`` is used as the rng.
    """
    token_rspec = 'ark:' + temp_file_1_name
    bool_rspec = 'ark:' + temp_file_2_name
    keys = [str(i) for i in range(6)]

    # --- single table: only keys 1, 3 and 4 exist ---
    with io_open(token_rspec, 't', mode='w') as token_f:
        token_f.write('1', 'cool')
        token_f.write('3', 'bean')
        token_f.write('4', 'casserole')
    data = corpus.ShuffledData(
        (token_rspec, 't'),
        key_list=keys, ignore_missing=True, rng=NonRandomState())
    assert len(data) == 3
    ex_samples = ['casserole', 'bean', 'cool']
    act_samples = list(data)
    # zip() stops at the shorter sequence, so check the number of samples
    # actually yielded before comparing them pairwise.
    assert len(act_samples) == len(ex_samples)
    assert all(ex == act for ex, act in zip(ex_samples, act_samples))

    # --- two tables: only keys 1 and 4 are present in both ---
    with io_open(bool_rspec, 'B', mode='w') as bool_f:
        bool_f.write('0', True)
        bool_f.write('1', False)
        bool_f.write('2', True)
        bool_f.write('4', False)
    data = corpus.ShuffledData(
        (token_rspec, 't'), (bool_rspec, 'B'),
        key_list=keys, ignore_missing=True, rng=NonRandomState())
    assert len(data) == 2
    # Transpose the (token, bool) samples into one tuple per table.
    act_tok_samples, act_bool_samples = list(zip(*iter(data)))
    ex_tok_samples = ['casserole', 'cool']
    # Same truncation concern as above: pin the yielded sample count.
    assert len(act_tok_samples) == len(ex_tok_samples)
    assert len(act_bool_samples) == len(ex_tok_samples)
    assert all(
        ex == act for ex, act in zip(ex_tok_samples, act_tok_samples))
    # Both surviving keys were written with the value False.
    assert all(not act for act in act_bool_samples)
def test_seeded_shuffled_is_predictable(temp_file_1_name, seed):
    """Two ``ShuffledData`` objects given the same ``rng`` yield equal batches.

    One thousand float32 vectors of length 100 are written to a single
    archive. Two datasets are then built over it with identical arguments
    and iterated in lockstep for two passes; every pair of batches must
    be numerically close.
    """
    rspec = 'ark:' + temp_file_1_name
    vectors = np.arange(100000).reshape((1000, 100)).astype(np.float32)
    with io_open(rspec, 'fv', mode='w') as writer:
        for row_num, row in enumerate(vectors):
            writer.write(str(row_num), row)

    shared_kwargs = {'batch_size': 13, 'rng': seed}
    first = corpus.ShuffledData((rspec, 'fv'), **shared_kwargs)
    second = corpus.ShuffledData((rspec, 'fv'), **shared_kwargs)

    # Iterate more than once so later passes are covered as well.
    passes_left = 2
    while passes_left:
        for left_batch, right_batch in zip(first, second):
            assert np.allclose(left_batch, right_batch)
        passes_left -= 1
def test_shuffled_data_basic(temp_file_1_name):
    """Exercise ``ShuffledData`` over a single matrix archive.

    Ten 200x50 matrices are written under keys "0" through "9". The test
    checks that the rng passed in is kept, that the length equals the
    number of size-3 batches, that the key list matches what was written,
    and that over two passes the samples come back in reverse write order
    (the order this test expects with ``NonRandomState``).
    """
    rspec = 'ark:' + temp_file_1_name
    # Match the precision of the base matrix type reported by the library.
    if KaldiDataType.BaseMatrix.is_double:
        dtype = np.float64
    else:
        dtype = np.float32
    samples = np.arange(100000).reshape((10, 200, 50)).astype(dtype)
    keys = tuple(str(i) for i in range(10))

    with io_open(rspec, 'bm', mode='w') as writer:
        for key, matrix in zip(keys, samples):
            writer.write(key, matrix)

    data = corpus.ShuffledData(rspec, batch_size=3, rng=NonRandomState())
    assert isinstance(data.rng, NonRandomState)
    assert len(data) == int(np.ceil(len(keys) / 3))
    assert keys == tuple(data.key_list)

    num_passes = 2
    for _ in range(num_passes):
        # Walk the expected samples backwards while flattening the batches.
        ex_samp_idx = len(samples)
        flattened = (sample for batch in data for sample in batch)
        for act_sample in flattened:
            ex_samp_idx -= 1
            assert np.allclose(samples[ex_samp_idx], act_sample)
def test_shuffled_data_tups(temp_file_1_name, temp_file_2_name):
    """Exercise ``ShuffledData`` over two tables with padding and lengths.

    Four ragged integer features (2 rows, varying columns) and four double
    label matrices (2 rows, varying columns) are written under keys "0"
    through "3". Two configurations are checked, each over two passes:

    * ``axis_lengths=1``: every batch unpacks to (features, labels,
      lengths). Each length must equal the feature's size along axis 1,
      and everything past that length must be zero padding.
    * ``axis_lengths=((1, 1), (0, 1))``: every batch unpacks to (features,
      labels, label lengths, feature lengths). Both tables are checked for
      the correct length and zero padding.

    Expected samples are walked in reverse write order, the order this test
    expects with ``NonRandomState``.
    """
    feats = [
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        [[9, 10], [11, 12]],
        [[13, 14, 15], [16, 17, 18]],
        [[19], [20]],
    ]
    labels = [
        np.array([[1, 2], [3, 4]], dtype=np.float64),
        np.array([[5, 6, 7, 8], [9, 10, 11, 12]], dtype=np.float64),
        np.array([[13], [14]], dtype=np.float64),
        np.array([[15, 16, 17], [18, 19, 20]], dtype=np.float64),
    ]
    keys = tuple(str(i) for i in range(4))
    feat_rspec = 'ark:' + temp_file_1_name
    label_rspec = 'ark:' + temp_file_2_name
    with io_open(feat_rspec, 'ivv', mode='w') as feat_f, \
            io_open(label_rspec, 'dm', mode='w') as lab_f:
        for key, feat, label in zip(keys, feats, labels):
            feat_f.write(key, feat)
            lab_f.write(key, label)

    # --- one length per sample: axis 1 of the features ---
    data = corpus.ShuffledData(
        (feat_rspec, 'ivv'), (label_rspec, 'dm'),
        batch_size=2, batch_pad_mode='constant', key_list=keys,
        axis_lengths=1, rng=NonRandomState(),
        batch_cast_to_array=(np.int32, None, None))
    for _ in range(2):
        ex_samp_idx = len(feats)
        # The label batch is not inspected in this configuration.
        for feat_batch, _label_batch, len_batch in data:
            for act_feat, act_len in zip(feat_batch, len_batch):
                ex_samp_idx -= 1
                # np.asarray rather than np.array(..., copy=False): building
                # an array from a nested list always copies, and NumPy >= 2.0
                # raises ValueError for copy=False when a copy is required.
                ex_feat = np.asarray(feats[ex_samp_idx])
                ex_len = ex_feat.shape[1]
                assert ex_len == act_len
                assert np.allclose(ex_feat, act_feat[:, :ex_len])
                # Everything past the true length must be zero padding.
                assert np.allclose(act_feat[:, ex_len:], 0)

    # --- two lengths per sample: label lengths, then feature lengths ---
    data = corpus.ShuffledData(
        (feat_rspec, 'ivv'), (label_rspec, 'dm'),
        batch_size=3, batch_pad_mode='constant', key_list=keys,
        axis_lengths=((1, 1), (0, 1)), rng=NonRandomState(),
        batch_cast_to_array=(np.int32, None, None, None))
    for _ in range(2):
        ex_samp_idx = len(feats)
        for feat_batch, label_batch, lablen_batch, featlen_batch in data:
            for act_feat, act_label, act_lablen, act_featlen in zip(
                    feat_batch, label_batch, lablen_batch, featlen_batch):
                ex_samp_idx -= 1
                ex_feat = np.asarray(feats[ex_samp_idx])
                ex_label = labels[ex_samp_idx]
                ex_featlen = ex_feat.shape[1]
                ex_lablen = ex_label.shape[1]
                assert ex_featlen == act_featlen
                assert ex_lablen == act_lablen
                assert np.allclose(ex_feat, act_feat[:, :ex_featlen])
                assert np.allclose(act_feat[:, ex_featlen:], 0)
                assert np.allclose(ex_label, act_label[:, :ex_lablen])
                assert np.allclose(act_label[:, ex_lablen:], 0)