def test_text_format(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False)
    )), randomize=False)
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 samples, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.end_of_sweep
    assert features.num_sequences == 2
    assert features.num_samples == 7
    assert features.is_sparse

    labels = mb[labels_si]
    # 2 samples, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert labels.end_of_sweep
    assert labels.num_sequences == 2
    assert labels.num_samples == 2
    assert not labels.is_sparse

    label_data = labels.asarray()
    assert np.allclose(label_data,
                       np.asarray([
                           [[1., 0., 0., 0., 0.]],
                           [[0., 1., 0., 0., 0.]]
                       ]))

    mb = mb_source.next_minibatch(1)
    features = mb[features_si]
    labels = mb[labels_si]

    assert not features.end_of_sweep
    assert not labels.end_of_sweep
    assert features.num_samples < 7
    assert labels.num_samples == 1

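# test_text_format above (and several tests below) rely on module-level fixtures that are
# not part of this excerpt. The following is a hypothetical reconstruction, stated as an
# assumption: MBDATA_SPARSE mirrors the inline data used by the older test_text_format
# variant further down (same shapes and label asserts), and _write_data is a minimal
# tmpdir writer with the signature these tests call.
MBDATA_SPARSE = r'''0	|x 560:1	|y 1 0 0 0 0
0	|x 0:1
0	|x 0:1
1	|x 560:1	|y 0 1 0 0 0
1	|x 0:1
1	|x 0:1
1	|x 424:1
'''


def _write_data(tmpdir, data, filename='mbdata.txt'):
    # Write CTF-formatted text into the pytest tmpdir and return the file path.
    tmpfile = str(tmpdir / filename)
    with open(tmpfile, 'w') as f:
        f.write(data)
    return tmpfile
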
def test_max_samples(tmpdir):
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_samples=1)

    input_map = {'features': mb_source['features']}
    mb = mb_source.next_minibatch(10, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 1
    assert not mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(10, input_map)

    assert not mb

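# test_max_samples, test_max_samples_over_several_sweeps and test_max_sweeps all read from
# create_ctf_deserializer, which is not shown in this excerpt. The sketch below is an
# assumption: the sweep must contain four single-sample sequences for the arithmetic in
# those tests to hold (max_samples=11 -> 5 + 5 + 1 samples, max_sweeps=3 -> 12 samples),
# so a hypothetical four-line CTF fixture is used here; the real helper may differ.
MBDATA_DENSE_SMALL = r'''0	|features 0
1	|features 1
2	|features 2
3	|features 3
'''


def create_ctf_deserializer(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_SMALL)
    return CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='features', shape=1)))
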
def test_large_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_2)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0)
    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    features = mb[features_si]
    labels = mb[labels_si]

    # Actually, the minibatch spans over multiple sweeps,
    # not sure if this is an artificial situation, but
    # maybe instead of a boolean flag we should indicate
    # the largest sweep index the data was taken from.
    assert features.end_of_sweep
    assert labels.end_of_sweep

    assert features.num_samples == 1000 - 1000 % 7
    assert labels.num_samples == 5 * (1000 // 7)

    assert mb[features_si].num_sequences == (1000 // 7)
    assert mb[labels_si].num_sequences == (1000 // 7)

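# MBDATA_DENSE_2 is another fixture that is not shown here. The asserts in
# test_large_minibatch require a sweep of 7 feature samples, 5 label samples and a single
# sequence (1000 // 7 sequences are returned for 1000 // 7 sweeps), so a plausible
# reconstruction is the MBDATA_DENSE_1 data shown further down with one sequence id
# throughout. This is an assumption, not a verbatim copy of the original fixture.
MBDATA_DENSE_2 = r'''0 |S0 0 |S1 0
0 |S0 1 |S1 1
0 |S0 2
0 |S0 3 |S1 3
0 |S0 4
0 |S0 5 |S1 1
0 |S0 6 |S1 2
'''
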
def test_MinibatchData_and_Value_as_input(tmpdir):
    mbdata = r'''0 |S0 100'''

    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    defs = StreamDefs(f1=StreamDef(field='S0', shape=1))

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, defs),
                                randomize=False)

    f1_si = mb_source.stream_info('f1')

    mb = mb_source.next_minibatch(1)

    f1 = input_variable(shape=(1,),
                        needs_gradient=True,
                        name='f')
    res = f1 * 2

    assert res.eval({f1: mb[f1_si]}) == [[200]]

    # Test MinibatchData
    assert res.eval(mb[f1_si]) == [[200]]

    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]

    # Test NumPy (converted back from MinibatchData)
    assert res.eval(mb[f1_si].value) == [[200]]

    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]

def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import sequence, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True)
    )), randomize=False, epoch_size=2)

    raw_input = sequence.input(shape=input_vocab_dim,
                               sequence_axis=Axis('inputAxis'),
                               name='raw_input',
                               is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features},
                                  device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [[3, 4, 5, 4, 7, 12, 1], [60, 61]]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d])
            for d in one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data,
                         num_classes=input_vocab_dim,
                         device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])

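# test_eval_sparse_dense uses three helpers defined elsewhere in the test module: csr,
# Value and cntk_device. The definitions below are a hypothetical sketch of what they are
# assumed to be (a csr_matrix alias, the CNTK Value class, and the usual device-id mapping
# used by the CNTK test suite); the real module may define them differently.
from scipy.sparse import csr_matrix as csr
from cntk import Value


def cntk_device(device_id):
    # Map a pytest device id to a CNTK device: negative ids mean CPU, otherwise GPU.
    return C.device.cpu() if device_id < 0 else C.device.gpu(device_id)
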
def test_htk_deserializers():
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]

    feature_dim = 33
    num_classes = 132
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim,
                                   context=(context, context),
                                   scp=features_file)))

    ld = HTKMLFDeserializer(label_mapping_file, StreamDefs(
        awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))

    reader = MinibatchSource([fd, ld])

    features = C.input_variable(((2 * context + 1) * feature_dim))
    labels = C.input_variable((num_classes))

    model = Sequential([For(range(3), lambda: Recurrence(LSTM(256))),
                        Dense(num_classes)])
    z = model(features)
    ce = C.cross_entropy_with_softmax(z, labels)
    errs = C.classification_error(z, labels)

    learner = C.adam_sgd(z.parameters,
                         lr=C.learning_rate_schedule(lr, C.UnitType.sample, epoch_size),
                         momentum=C.momentum_as_time_constant_schedule(1000),
                         low_memory=True,
                         gradient_clipping_threshold_per_sample=15,
                         gradient_clipping_with_truncation=True)
    trainer = C.Trainer(z, (ce, errs), learner)

    input_map = {
        features: reader.streams.amazing_features,
        labels: reader.streams.awesome_labels
    }

    pp = C.ProgressPrinter(freq=0)
    # just run and verify it doesn't crash
    for i in range(3):
        mb_data = reader.next_minibatch(mbsize, input_map=input_map)
        trainer.train_minibatch(mb_data)
        pp.update_with_trainer(trainer, with_metric=True)
    assert True
    os.chdir(abs_path)

def compare_cbf_and_ctf(num_mbs, mb_size, randomize):
    ctf = MinibatchSource(CTFDeserializer(tmpfile, streams), randomize=randomize)
    cbf = MinibatchSource(CBFDeserializer(tmpfile + '.bin', streams), randomize=randomize)

    ctf_stream_names = sorted([x.m_name for x in ctf.stream_infos()])
    cbf_stream_names = sorted([x.m_name for x in cbf.stream_infos()])

    assert ctf_stream_names == cbf_stream_names

    for _ in range(num_mbs):
        ctf_mb = ctf.next_minibatch(mb_size, device=device)
        cbf_mb = cbf.next_minibatch(mb_size, device=device)

        for name in cbf_stream_names:
            ctf_data = ctf_mb[ctf[name]]
            cbf_data = cbf_mb[cbf[name]]

            assert ctf_data.num_samples == cbf_data.num_samples
            assert ctf_data.num_sequences == cbf_data.num_sequences
            assert ctf_data.shape == cbf_data.shape
            assert ctf_data.end_of_sweep == cbf_data.end_of_sweep
            assert ctf_data.is_sparse == cbf_data.is_sparse
            assert ctf_data.data.masked_count() == cbf_data.data.masked_count()

            # XXX:
            # assert (ctf_data.asarray() == cbf_data.asarray()).all()
            # not using asarray because for sparse values it fails with
            # some strange exception "sum of the rank of the mask and Variable
            # rank does not equal the Value's rank".

            assert C.cntk_py.are_equal(ctf_data.data.data, cbf_data.data.data)

            if ctf_data.data.masked_count() > 0:
                assert (ctf_data.data.mask == cbf_data.data.mask).all()
            # XXX: if mask_count is zero, mb_data.data.mask fails with
            # "AttributeError: 'Value' object has no attribute 'mask'"!

            # XXX: without invoking erase, next_minibatch will fail with:
            # "Resize: Cannot resize the matrix because it is a view."
            ctf_data.data.erase()
            cbf_data.data.erase()

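# compare_cbf_and_ctf reads the module-level names tmpfile, streams and device, and expects
# a binary (CBF) copy of the CTF file at tmpfile + '.bin'. None of that setup is shown in
# this excerpt; the sketch below only illustrates the assumed shape of those fixtures and
# is not the original test's setup (the binary file would have to be produced separately,
# e.g. with CNTK's CTF-to-binary conversion script).
device = C.device.cpu()
streams = StreamDefs(features=StreamDef(field='S0', shape=1),
                     labels=StreamDef(field='S1', shape=1))
tmpfile = 'mbdata.ctf'  # 'mbdata.ctf.bin' is expected to exist next to it
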
def test_mlf_binary_files():
    os.chdir(data_path)

    feature_dim = 33
    num_classes = 132
    context = 2

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim,
                                   context=(context, context),
                                   scp=features_file)))

    ld = HTKMLFBinaryDeserializer(StreamDefs(
        awesome_labels=StreamDef(shape=num_classes, mlf=e2e_data_path + "mlf2.bin")))

    # Make sure we can read at least one minibatch.
    mbsource = MinibatchSource([fd, ld])
    mbsource.next_minibatch(1)

    os.chdir(abs_path)

def test_minibatch(tmpdir):
    mbdata = r'''0 |S0 0 |S1 0
0 |S0 1 |S1 1
0 |S0 2
0 |S0 3 |S1 3
1 |S0 4
1 |S0 5 |S1 1
1 |S0 6 |S1 2
'''

    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))))

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert len(features.value) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.value, expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert len(labels.value) == 2
    expected_labels = \
        [
            [[0], [1], [3]],
            [[1], [2]]
        ]
    for res, exp in zip(labels.value, expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])

def train():
    global sentences, vocabulary, reverse_vocabulary
    # function will create the trainer and train it for the specified number of epochs

    # Print loss 50 times while training
    print_frequency = 50
    pp = ProgressPrinter(print_frequency)

    # get the trainer
    word_one_hot, context_one_hots, negative_one_hots, targets, trainer, word_negative_context_product, embedding_layer = create_trainer()

    # Create a CTF reader which reads the sparse inputs
    print("reader started")
    reader = CTFDeserializer(G.CTF_input_file)
    reader.map_input(G.word_input_field, dim=G.embedding_vocab_size, format="sparse")
    # context inputs
    for i in range(context_size):
        reader.map_input(G.context_input_field.format(i), dim=G.embedding_vocab_size, format="sparse")
    # negative inputs
    for i in range(G.negative):
        reader.map_input(G.negative_input_field.format(i), dim=G.embedding_vocab_size, format="sparse")
    # targets
    reader.map_input(G.target_input_field, dim=(G.negative + 1), format="dense")
    print("reader done")

    # Get minibatch source from reader
    is_training = True
    minibatch_source = MinibatchSource(reader, randomize=is_training,
                                       epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
    minibatch_source.streams[targets] = minibatch_source.streams[G.target_input_field]
    del minibatch_source.streams[G.target_input_field]
    print("minibatch source done")

    total_minibatches = total_training_instances // G.minibatch_size
    print("training started")
    print("Total minibatches to train =", total_minibatches)

    for i in range(total_minibatches):
        # Collect minibatch
        # start_batch_collection = time.time()
        mb = minibatch_source.next_minibatch(G.minibatch_size, input_map=minibatch_source.streams)
        # end_batch_collection = time.time()
        # print("Batch collection time = %.6fsecs" % (end_batch_collection - start_batch_collection))
        # print("Time taken to collect one training_instance = %.6fsecs" % ((end_batch_collection - start_batch_collection)/G.minibatch_size))

        # Train minibatch
        # start_train = time.time()
        trainer.train_minibatch(mb)
        # end_train = time.time()
        # print("minibatch train time = %.6fsecs" % (end_train - start_train))
        # print("Time per training instance = %.6fsecs" % ((end_train - start_train)/G.minibatch_size))

        # Update progress printer
        pp.update_with_trainer(trainer)
        # start_batch_collection = time.time()

    print("Total training instances =", total_training_instances)
    return word_negative_context_product

def test_max_samples_over_several_sweeps(tmpdir):
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_samples=11)

    input_map = {'features': mb_source['features']}

    for i in range(2):
        mb = mb_source.next_minibatch(5, input_map)

        assert 'features' in mb
        assert mb['features'].num_samples == 5
        assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(5, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 1
    assert not mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(1, input_map)

    assert not mb

def test_max_sweeps(tmpdir):
    # set max sweeps to 3 (12 samples altogether).
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_sweeps=3)

    input_map = {'features': mb_source['features']}

    for i in range(2):
        mb = mb_source.next_minibatch(5, input_map)

        assert 'features' in mb
        assert mb['features'].num_samples == 5
        assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(5, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 2
    assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(1, input_map)

    assert not mb

def test_text_format(tmpdir):
    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs

    mbdata = r'''0	|x 560:1	|y 1 0 0 0 0
0	|x 0:1
0	|x 0:1
1	|x 560:1	|y 0 1 0 0 0
1	|x 0:1
1	|x 0:1
1	|x 424:1
'''
    tmpfile = str(tmpdir / 'mbdata.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False)
    )))
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 samples, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.is_sparse
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called
    # for NDArrayView objects with dense storage format
    # 2 samples, max seq len 4, 1000 dim
    # assert features.data().shape().dimensions() == (2, 4, input_dim)
    # assert features.data().is_sparse()

    labels = mb[labels_si]
    # 2 samples, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert not labels.is_sparse

    label_data = np.asarray(labels)
    assert np.allclose(label_data,
                       np.asarray([
                           [[1., 0., 0., 0., 0.]],
                           [[0., 1., 0., 0., 0.]]
                       ]))

def test_prefetch_with_unpacking(tmpdir):
    data = r'''0 |S0 1 1 1 1 |S1 1000
1 |S0 2 2 2 2 |S1 100
2 |S0 3 3 3 3 |S1 100
3 |S0 1 1 1 1 |S1 10
4 |S0 2 2 2 2 |S1 1
5 |S0 3 3 3 3 |S1 2000
6 |S0 1 1 1 1 |S1 200
7 |S0 2 2 2 2 |S1 200
8 |S0 3 3 3 3 |S1 20
9 |S0 1 1 1 1 |S1 2
'''
    import time
    tmpfile = _write_data(tmpdir, data)

    input_dim = 4
    num_output_classes = 1

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='S1', shape=num_output_classes, is_sparse=False)
    )), randomize=False, max_samples=FULL_DATA_SWEEP)

    input_map = {'S0': mb_source.streams.features,
                 'S1': mb_source.streams.labels}

    empty = False
    mb_size = 3
    # On the last minibatch there will be resize called,
    # due to 10 % 3 = 1 sample in the minibatch
    while not empty:
        mb = mb_source.next_minibatch(mb_size, input_map=input_map)
        time.sleep(1)  # make sure the prefetch kicks in
        if mb:
            # Force unpacking to check that we do
            # not break prefetch
            actual_size = mb['S0'].shape[0]
            assert (mb['S0'].asarray() == np.array([[[1, 1, 1, 1]],
                                                    [[2, 2, 2, 2]],
                                                    [[3, 3, 3, 3]]],
                                                   dtype=np.float32)[0:actual_size]).all()
        else:
            empty = True

def test_multiple_mlf_files():
    os.chdir(data_path)

    feature_dim = 33
    num_classes = 132
    context = 2

    test_mlf_path = e2e_data_path + "glob_00001.mlf"

    features_file = "glob_0000.scp"
    label_files = ["glob_0000.mlf", test_mlf_path]
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim,
                                   context=(context, context),
                                   scp=features_file)))

    ld = HTKMLFDeserializer(label_mapping_file, StreamDefs(
        awesome_labels=StreamDef(shape=num_classes, mlf=label_files)))

    # Make sure we can read at least one minibatch.
    mbsource = MinibatchSource([fd, ld])
    mbsource.next_minibatch(1)

    os.chdir(abs_path)

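# The HTK/MLF tests (test_htk_deserializers, test_mlf_binary_files, test_multiple_mlf_files,
# test_multiple_streams_in_htk) chdir into a shared speech data directory and use several
# module-level paths and imports that are not shown here. The block below is a hypothetical
# sketch of that setup; the concrete paths are assumptions.
import os
import cntk as C
from cntk.io import (MinibatchSource, StreamDef, StreamDefs,
                     HTKFeatureDeserializer, HTKMLFDeserializer,
                     HTKMLFBinaryDeserializer)

abs_path = os.path.dirname(os.path.abspath(__file__))
# Directory containing glob_0000.scp, glob_0000.mlf and state.list (assumed location).
data_path = os.path.join(abs_path, 'data')
# Relative to data_path; assumed to point at the end-to-end speech test data directory.
e2e_data_path = "../../../../Tests/EndToEndTests/Speech/Data/"
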
def test_crop_dimensionality(tmpdir):
    import io
    from PIL import Image

    np.random.seed(1)

    file_mapping_path = str(tmpdir / 'file_mapping.txt')
    with open(file_mapping_path, 'w') as file_mapping:
        for i in range(5):
            data = np.random.randint(0, 2**8, (20, 40, 3))
            image = Image.fromarray(data.astype('uint8'), "RGB")
            buf = io.BytesIO()
            image.save(buf, format='PNG')
            assert image.width == 40 and image.height == 20

            label = str(i)
            # save to mapping + png file
            file_name = label + '.png'
            with open(str(tmpdir / file_name), 'wb') as f:
                f.write(buf.getvalue())
            file_mapping.write('.../%s\t%s\n' % (file_name, label))

    transforms1 = [
        xforms.scale(width=40, height=20, channels=3),
        xforms.crop(crop_type='randomside',
                    crop_size=(20, 10), side_ratio=(0.2, 0.5),
                    jitter_type='uniratio')]

    transforms2 = [
        xforms.crop(crop_type='randomside',
                    crop_size=(20, 10), side_ratio=(0.2, 0.5),
                    jitter_type='uniratio')]

    d1 = ImageDeserializer(file_mapping_path, StreamDefs(
        images1=StreamDef(field='image', transforms=transforms1),
        labels1=StreamDef(field='label', shape=10)))

    d2 = ImageDeserializer(file_mapping_path, StreamDefs(
        images2=StreamDef(field='image', transforms=transforms2),
        labels2=StreamDef(field='label', shape=10)))

    mbs = MinibatchSource([d1, d2])
    for j in range(5):
        mb = mbs.next_minibatch(1)
        images1 = mb[mbs.streams.images1].asarray()
        images2 = mb[mbs.streams.images2].asarray()
        assert images1.shape == (1, 1, 3, 10, 20)
        assert (images1 == images2).all()

def test_base64_is_equal_image(tmpdir):
    import io
    import base64
    from PIL import Image

    np.random.seed(1)

    file_mapping_path = str(tmpdir / 'file_mapping.txt')
    base64_mapping_path = str(tmpdir / 'base64_mapping.txt')

    with open(file_mapping_path, 'w') as file_mapping:
        with open(base64_mapping_path, 'w') as base64_mapping:
            for i in range(10):
                data = np.random.randint(0, 2**8, (5, 7, 3))
                image = Image.fromarray(data.astype('uint8'), "RGB")
                buf = io.BytesIO()
                image.save(buf, format='PNG')
                assert image.width == 7 and image.height == 5

                label = str(i)

                # save to base 64 mapping file
                encoded = base64.b64encode(buf.getvalue()).decode('ascii')
                base64_mapping.write('%s\t%s\n' % (label, encoded))

                # save to mapping + png file
                file_name = label + '.png'
                with open(str(tmpdir / file_name), 'wb') as f:
                    f.write(buf.getvalue())
                file_mapping.write('.../%s\t%s\n' % (file_name, label))

    transforms = [xforms.scale(width=7, height=5, channels=3)]

    b64_deserializer = Base64ImageDeserializer(base64_mapping_path, StreamDefs(
        images1=StreamDef(field='image', transforms=transforms),
        labels1=StreamDef(field='label', shape=10)))

    file_image_deserializer = ImageDeserializer(file_mapping_path, StreamDefs(
        images2=StreamDef(field='image', transforms=transforms),
        labels2=StreamDef(field='label', shape=10)))

    mb_source = MinibatchSource([b64_deserializer, file_image_deserializer])
    for j in range(20):
        mb = mb_source.next_minibatch(1)

        images1_stream = mb_source.streams['images1']
        images1 = mb[images1_stream].asarray()
        images2_stream = mb_source.streams['images2']
        images2 = mb[images2_stream].asarray()
        assert (images1 == images2).all()

def test_multiple_streams_in_htk():
    feature_dim = 33
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim,
                                   context=(context, context),
                                   scp=features_file),
        amazing_features2=StreamDef(shape=feature_dim,
                                    context=(context, context),
                                    scp=features_file)))

    mbs = MinibatchSource([fd])
    mb = mbs.next_minibatch(1)
    assert (mb[mbs.streams.amazing_features].asarray() ==
            mb[mbs.streams.amazing_features2].asarray()).all()

    os.chdir(abs_path)

def test_full_sweep_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_1)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0, max_sweeps=1)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert features.end_of_sweep
    assert len(features.as_sequences()) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.as_sequences(), expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.data.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert labels.end_of_sweep
    assert len(labels.as_sequences()) == 2
    expected_labels = \
        [
            [[0], [1], [3]],
            [[1], [2]]
        ]
    for res, exp in zip(labels.as_sequences(), expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.data.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])

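# MBDATA_DENSE_1 is not shown in this excerpt. Since test_full_sweep_minibatch asserts the
# same sequences and masks as test_minibatch above, it is assumed here to be the same
# two-sequence CTF text that test_minibatch writes inline.
MBDATA_DENSE_1 = r'''0 |S0 0 |S1 0
0 |S0 1 |S1 1
0 |S0 2
0 |S0 3 |S1 3
1 |S0 4
1 |S0 5 |S1 1
1 |S0 6 |S1 2
'''
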
def decode_model(use_gpu=True, gpu_id=0):
    # use GPU or CPU according to parameters
    try_set_default_device(gpu(gpu_id) if use_gpu else cpu())

    model_dnn = load_model("./model/speech_enhancement.model")

    features_file = "./test_normed.scp"
    feature_dim = 257
    test_reader = MinibatchSource(HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim,
                                   context=(3, 3),
                                   scp=features_file))),
        randomize=False, frame_mode=False)
    eval_input_map = {input: test_reader.streams.amazing_features}

    f = open(features_file)
    line = f.readline()
    while line:
        temp_input_path = line.split(']')[0]
        mb_size = temp_input_path.split(',')[-1]
        mb_size = int(mb_size) + 1
        noisy_fea = test_reader.next_minibatch(
            mb_size, input_map=eval_input_map)
        real_noisy_fea = noisy_fea[input].data

        node_in_graph = model_dnn.find_by_name('irm')
        output_nodes = combine([node_in_graph.owner])

        out_noisy_fea = output_nodes.eval(real_noisy_fea)
        # out_noisy_fea = as_composite(model_dnn.output1[0].owner).eval(
        #     real_noisy_fea)

        out_SE_noisy_fea = np.concatenate((out_noisy_fea), axis=0)

        out_file_path = line.split('=')[0]
        out_file_name = os.path.join('./enhanced_norm_fea_mat', out_file_path)
        out_file_fullpath = os.path.split(out_file_name)[0]
        # print(out_file_fullpath)
        if not os.path.exists(out_file_fullpath):
            os.makedirs(out_file_fullpath)
        sio.savemat(out_file_name, {'SE': out_SE_noisy_fea})
        line = f.readline()

    f.close()

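# decode_model keys its input_map on a name called `input` that is not defined inside the
# function; in the original script it is presumably a module-level CNTK input variable bound
# to the model's feature input (the Python builtin of the same name could not work here).
# A hypothetical sketch of such a definition, given 257-dim features and a (3, 3) context
# window (7 frames per sample):
input = C.sequence.input_variable(shape=(3 + 1 + 3) * 257)  # deliberately shadows the builtin, as the script does
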
def test_base64_image_deserializer(tmpdir):
    import io
    import base64
    import uuid
    from PIL import Image

    images, b64_images = [], []

    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2**8, (5, 7, 3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i, data in enumerate(b64_images):
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index ' + str(i).encode('ascii') + b'\n'
            f.write(line)

    transforms = [xforms.scale(width=7, height=5, channels=3)]

    b64_deserializer = Base64ImageDeserializer(image_data, StreamDefs(
        images=StreamDef(field='image', transforms=transforms),
        labels=StreamDef(field='label', shape=10)))

    ctf_deserializer = CTFDeserializer(ctf_data, StreamDefs(
        index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()

        image_stream = mb_source.streams['images']
        results = mb[image_stream].asarray()
        for i in range(10):
            # original images are RGB, openCV produces BGR images,
            # reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:, :, ::-1]
            # transposing to get CHW representation
            bgrImage = np.transpose(bgrImage, (2, 0, 1))
            assert (bgrImage == results[i][0]).all()

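# The image-deserializer tests above assume module-level imports that are not shown in this
# excerpt. The names below are the standard CNTK ones; the xforms alias for the transforms
# module is the convention used in the CNTK test suite and is an assumption here.
import numpy as np
from cntk.io import (MinibatchSource, StreamDef, StreamDefs, CTFDeserializer,
                     ImageDeserializer, Base64ImageDeserializer)
import cntk.io.transforms as xforms
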
def simple_mnist():
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)

    # z = Sequential([
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(num_output_classes)])(scaled_input)

    with default_options(activation=relu, init=C.glorot_uniform()):
        z = Sequential([For(range(num_hidden_layers),
                            lambda i: Dense(hidden_layers_dim)),
                        Dense(num_output_classes, activation=None)])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    # setup the data
    path = os.path.join(abs_path, "Train-28x28_cntk_text.txt")

    reader_train = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    # Instantiate the trainer object to drive the model training
    lr = learning_rate_schedule(1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), [adadelta(z.parameters, lr)], progress_writers)

    training_session(
        trainer=trainer,
        mb_source=reader_train,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        max_samples=num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    path = os.path.join(abs_path, "Test-28x28_cntk_text.txt")

    reader_test = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test

def generate_visualization(use_brain_script_model, testing=False):
    num_objects_to_eval = 5

    if (use_brain_script_model):
        model_file_name = "07_Deconvolution_BS.model"
        encoder_output_file_name = "encoder_output_BS.txt"
        decoder_output_file_name = "decoder_output_BS.txt"
        enc_node_name = "z.pool1"
        input_node_name = "f2"
        output_node_name = "z"
    else:
        model_file_name = "07_Deconvolution_PY.model"
        encoder_output_file_name = "encoder_output_PY.txt"
        decoder_output_file_name = "decoder_output_PY.txt"
        enc_node_name = "pooling_node"
        input_node_name = "input_node"
        output_node_name = "output_node"

    # define location of output, model and data and check existence
    output_path = os.path.join(abs_path, "Output")
    model_file = os.path.join(model_path, model_file_name)
    data_file = os.path.join(data_path, "Test-28x28_cntk_text.txt")
    if not (os.path.exists(model_file) and os.path.exists(data_file)):
        print("Cannot find required data or model. "
              "Please get the MNIST data set and run 'cntk configFile=07_Deconvolution_BS.cntk' "
              "or 'python 07_Deconvolution_PY.py' to create the model.")
        exit(0)

    # create minibatch source
    minibatch_source = MinibatchSource(CTFDeserializer(data_file, StreamDefs(
        features=StreamDef(field='features', shape=(28 * 28)),
        labels=StreamDef(field='labels', shape=10)
    )), randomize=False, max_sweeps=1)

    # use this to print all node names in the model
    # print_all_node_names(model_file, use_brain_script_model)

    # load model and pick desired nodes as output
    loaded_model = load_model(model_file)
    output_nodes = combine([
        loaded_model.find_by_name(input_node_name).owner,
        loaded_model.find_by_name(enc_node_name).owner,
        loaded_model.find_by_name(output_node_name).owner])

    # evaluate model save output
    features_si = minibatch_source['features']
    with open(os.path.join(output_path, decoder_output_file_name), 'wb') as decoder_text_file:
        with open(os.path.join(output_path, encoder_output_file_name), 'wb') as encoder_text_file:
            for i in range(0, num_objects_to_eval):
                mb = minibatch_source.next_minibatch(1)
                raw_dict = output_nodes.eval(mb[features_si])
                output_dict = {}
                for key in raw_dict.keys():
                    output_dict[key.name] = raw_dict[key]

                encoder_input = output_dict[input_node_name]
                encoder_output = output_dict[enc_node_name]
                decoder_output = output_dict[output_node_name]
                in_values = (encoder_input[0, 0].flatten())[np.newaxis]
                enc_values = (encoder_output[0, 0].flatten())[np.newaxis]
                out_values = (decoder_output[0, 0].flatten())[np.newaxis]

                if not testing:
                    # write results as text and png
                    np.savetxt(decoder_text_file, out_values, fmt="%.6f")
                    np.savetxt(encoder_text_file, enc_values, fmt="%.6f")
                    save_as_png(in_values,
                                os.path.join(output_path, "imageAutoEncoder_%s__input.png" % i))
                    save_as_png(out_values,
                                os.path.join(output_path, "imageAutoEncoder_%s_output.png" % i))

                    # visualizing the encoding is only possible and meaningful with a single conv filter
                    enc_dim = 7
                    if (enc_values.size == enc_dim * enc_dim):
                        save_as_png(enc_values,
                                    os.path.join(output_path, "imageAutoEncoder_%s_encoding.png" % i),
                                    dim=enc_dim)

    print("Done. Wrote output to %s" % output_path)

def test_distributed_mb_source(tmpdir):
    input_dim = 69

    ctf_data = '''\
0 |S0 3:1 |# <s> |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
2 |S0 61:1 |# A |S1 32:1 |# ~AH
3 |S0 60:1 |# <s> |S1 3:1 |# <s>
3 |S0 61:1 |# A |S1 32:1 |# ~AH
3 |S0 61:1 |# A |S1 32:1 |# ~AH
3 |S0 61:1 |# A |S1 32:1 |# ~AH
4 |S0 60:1 |# <s> |S1 3:1 |# <s>
5 |S0 60:1 |# <s> |S1 3:1 |# <s>
5 |S0 61:1 |# A |S1 32:1 |# ~AH
6 |S0 60:1 |# <s> |S1 3:1 |# <s>
6 |S0 61:1 |# A |S1 32:1 |# ~AH
7 |S0 60:1 |# <s> |S1 3:1 |# <s>
8 |S0 60:1 |# <s> |S1 3:1 |# <s>
8 |S0 61:1 |# A |S1 32:1 |# ~AH
9 |S0 60:1 |# <s> |S1 3:1 |# <s>
9 |S0 61:1 |# A |S1 32:1 |# ~AH
10 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    # No randomization
    mb0 = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )), randomize=False, max_samples=36)  # A bit more than a sweep
    mb1 = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )), randomize=False, max_samples=36)  # A bit more than a sweep

    input = sequence.input_variable(shape=(input_dim,))
    label = sequence.input_variable(shape=(input_dim,))
    input_map = {input: mb0.streams.features, label: mb0.streams.labels}

    # Because we are emulating two workers here, minibatch_size_in_samples is split in two,
    # so below we expect 5 samples per worker.
    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 7  # Sequence 0

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 4  # Sequence 3

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 5  # Sequences 5, 7, 9

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 7  # Sequence 0

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 4  # Sequence 3

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert len(data) == 0  # No data

    data = mb1.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=1)
    assert data[input].num_samples == 4  # Sequences 2, 4

    data = mb1.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=1)
    assert data[input].num_samples == 5  # Sequences 6, 8, 10

    data = mb1.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=1)
    assert data[input].num_samples == 3  # Sequence 2

    data = mb1.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=1)
    assert len(data) == 0  # No data

    # Randomization
    mb3 = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )), max_sweeps=1)
    mb4 = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )), max_sweeps=1)

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 5

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 4

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 4

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 5

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=0)
    assert data[input].num_samples == 7

    data = mb4.next_minibatch(minibatch_size_in_samples=10, input_map=input_map,
                              num_data_partitions=2, partition_index=1)
    assert len(data) == 0  # Due to chunking we do not expect any data for rank 1
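# A minimal sketch of the two-worker partitioned reading pattern the test above exercises.
# Each emulated worker passes its own partition_index to next_minibatch and receives whole
# sequences for its partition only; worker_rank and num_workers are hypothetical names
# introduced here for illustration, not part of the original source.
def read_partition(mb_source, input_map, worker_rank, num_workers, mb_size=10):
    minibatches = []
    while True:
        data = mb_source.next_minibatch(minibatch_size_in_samples=mb_size,
                                        input_map=input_map,
                                        num_data_partitions=num_workers,
                                        partition_index=worker_rank)
        if not data:
            break  # this worker's share of the sweep is exhausted
        minibatches.append(data)
    return minibatches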
def test_sweep_based_schedule(tmpdir, device_id):
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk import cross_entropy_with_softmax, classification_error, plus, reduce_sum, sequence
    from cntk import Trainer

    input_dim = 69

    ctf_data = '''\
0 |S0 3:1 |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )), randomize=False)

    in1 = sequence.input_variable(shape=(input_dim,))
    labels = sequence.input_variable(shape=(input_dim,))
    p = parameter(shape=(input_dim,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])

    input_map = {
        in1: mbs.streams.features,
        labels: mbs.streams.labels
    }

    # fetch minibatch (first sequence)
    data = mbs.next_minibatch(1, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.3

    # fetch minibatch (second sequence, sweep ends at this point)
    data = mbs.next_minibatch(1, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.2

    # fetch minibatch (both sequences -- entire sweep in one go)
    data = mbs.next_minibatch(9, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.1

    # fetch minibatch (multiple sweeps)
    data = mbs.next_minibatch(30, input_map=input_map)
    trainer.train_minibatch(data, outputs=[z.output])
    assert learner.learning_rate() == 0.0
# Load the model and pick the desired nodes as output.
loaded_model = load_model(model_file)
output_nodes = combine([
    loaded_model.find_by_name('f1').owner,
    loaded_model.find_by_name('z.p1').owner,
    loaded_model.find_by_name('z').owner])

# Evaluate the model and save the output.
features_si = minibatch_source['features']

with open(os.path.join(output_path, "decoder_output_py.txt"), 'wb') as decoder_text_file:
    with open(os.path.join(output_path, "encoder_output_py.txt"), 'wb') as encoder_text_file:
        for i in range(0, num_objects_to_eval):
            mb = minibatch_source.next_minibatch(1)
            raw_dict = output_nodes.eval(mb[features_si])
            output_dict = {}
            for key in raw_dict.keys():
                output_dict[key.name] = raw_dict[key]

            encoder_input = output_dict['f1']
            encoder_output = output_dict['z.p1']
            decoder_output = output_dict['z']
            in_values = (encoder_input[0, 0].flatten())[np.newaxis]
            enc_values = (encoder_output[0, 0].flatten())[np.newaxis]
            out_values = (decoder_output[0, 0].flatten())[np.newaxis]

            # Write the results as text and png.
            np.savetxt(decoder_text_file, out_values, fmt="%.6f")
            np.savetxt(encoder_text_file, enc_values, fmt="%.6f")
    max_samples=numberOfSamples * numberOfSweepsForTraining,
    progress_frequency=numberOfSamples
)
trainingSession.train()

# Testing time
# testPath = "test.txt"
ctfdResultTest = CTFDeserializer(testPath, StreamDefs(
    features=StreamDef(field='features', shape=featuresShapeValue),
    labels=StreamDef(field='labels', shape=labelsShapeValue)))
readerTest = MinibatchSource(ctfdResultTest)

inputMapTest = {
    featuresShape: readerTest.streams.features,
    labelsShape: readerTest.streams.labels
}

minibatchSizeTest = 25
numberOfSamplesTest = 312
minibatchesToTest = numberOfSamplesTest / minibatchSizeTest

testResult = 0.0
for i in range(0, int(minibatchesToTest)):
    mb = readerTest.next_minibatch(minibatch_size_in_samples=minibatchSizeTest,
                                   input_map=inputMapTest)
    evalError = trainer.test_minibatch(mb)
    testResult = testResult + evalError

averageClassificationError = testResult / minibatchesToTest
print(averageClassificationError)
def __train_cntk(self, path_to_folder: str, model_definition, epochs: int,
                 output_model_path: str, classes, minibatch_size: int):
    import cntk
    from cntk.learners import learning_parameter_schedule
    from cntk.ops import input_variable
    from cntk.io import MinibatchSource, ImageDeserializer, StreamDefs, StreamDef, MinibatchData, UserDeserializer
    import cntk.io.transforms as xforms
    from cntk.layers import default_options, Dense, Sequential, Activation, Embedding, Convolution2D, MaxPooling, Stabilizer, Convolution, Dropout, BatchNormalization
    from cntk.ops.functions import CloneMethod
    from cntk.logging import ProgressPrinter
    from cntk.losses import cross_entropy_with_softmax
    from cntk import classification_error, softmax, relu, ModelFormat, element_times, momentum_schedule, momentum_sgd
    import pandas as pd

    path_to_folder = path_to_folder.rstrip('/')
    map_file_train = path_to_folder + "/train_map.txt"
    map_file_test = path_to_folder + "/test_map.txt"

    classes_set = set()
    num_train = 0
    num_test = 0
    num_channels = 3

    class TrackDataset(UserDeserializer):
        def __init__(self, map_file, streams, chunksize=100):
            super(TrackDataset, self).__init__()
            self._batch_size = chunksize
            self.dataframes = pd.read_csv(map_file, sep='\t', dtype=str,
                                          header=None, names=["features", "labels"])
            self._streams = [
                cntk.io.StreamInformation(s['name'], i, 'dense', np.float32, s['shape'])
                for i, s in enumerate(streams)
            ]
            self._num_chunks = int(math.ceil(len(self.dataframes) / chunksize))

        def _scale_image(self, image, width=224, height=168):
            try:
                return image.resize((width, height), Image.LINEAR)
            except:
                raise Exception('scale_image error')

        def stream_infos(self):
            return self._streams

        def num_chunks(self):
            return self._num_chunks

        def get_chunk(self, chunk_id):
            images = []
            labels = []
            maximum = (chunk_id + 1) * self._batch_size
            if maximum > len(self.dataframes):
                maximum = len(self.dataframes)
            for i in range(chunk_id * self._batch_size, maximum):
                img_name = self.dataframes.iloc[i, 0]
                image = Image.open(img_name)
                cl = self.dataframes.iloc[i, 1:].values[0]
                image = self._scale_image(image)
                image = np.moveaxis(np.array(image).astype('float32'), -1, 0)
                image -= np.mean(image, keepdims=True)
                image /= (np.std(image, keepdims=True) + 1e-6)
                images.append(image)
                yv = np.zeros(num_classes)
                yv[classes.index(cl)] = 1
                labels.append(yv)
            result = {}
            features = np.array(images)
            lab = np.array(labels).astype('float32')
            result[self._streams[0].m_name] = features
            result[self._streams[1].m_name] = lab
            return result

    try:
        with open(map_file_train) as f:
            csv_reader = csv.reader(f, delimiter='\t')
            for row in csv_reader:
                cmd = row[1]
                classes_set.add(cmd)
                num_train = num_train + 1
    except Exception as e:
        raise Exception("No train_map.txt file found in path " + path_to_folder +
                        ". Did you create a dataset using create_balanced_dataset()?")
    num_classes = len(classes)

    with open(map_file_test) as f:
        for num_test, l in enumerate(f):
            pass

    # transforms = [
    #     xforms.scale(width=self.__image_width, height=self.__image_height,
    #                  channels=num_channels, interpolations='linear'),
    #     xforms.mean(mean_file)
    # ]

    dataset_train = TrackDataset(map_file=map_file_train, streams=[
        dict(name='features', shape=(num_channels, self.__image_height, self.__image_width)),
        dict(name='labels', shape=(num_classes,))
    ])
    reader_train = MinibatchSource([dataset_train], randomize=True)
    # a = dataset_train.num_chunks()

    dataset_test = TrackDataset(map_file=map_file_test, streams=[
        dict(name='features', shape=(num_channels, self.__image_height, self.__image_width)),
        dict(name='labels', shape=(num_classes,))
    ])
    reader_test = MinibatchSource([dataset_test], randomize=True)

    # ImageDeserializer loads images in the BGR format, not RGB.
    # reader_train = MinibatchSource(ImageDeserializer(map_file_train, StreamDefs(
    #     features=StreamDef(field='image', transforms=transforms),
    #     labels=StreamDef(field='label', shape=num_classes)
    # )))
    # reader_test = MinibatchSource(ImageDeserializer(map_file_test, StreamDefs(
    #     features=StreamDef(field='image', transforms=transforms),
    #     labels=StreamDef(field='label', shape=num_classes)
    # )))
    # mb = reader_train.next_minibatch(10)

    input_var = input_variable((num_channels, self.__image_height, self.__image_width))
    label_var = input_variable(num_classes)

    model = model_definition(input_var)
    ce = cross_entropy_with_softmax(model, label_var)
    pe = classification_error(model, label_var)

    epoch_size = num_train
    lr_per_minibatch = learning_parameter_schedule(
        [0.01] * 10 + [0.003] * 10 + [0.001], epoch_size=epoch_size)
    momentums = momentum_schedule(0.9, minibatch_size=minibatch_size)
    l2_reg_weight = 0.001

    learner = momentum_sgd(model.parameters,
                           lr=lr_per_minibatch,
                           momentum=momentums,
                           l2_regularization_weight=l2_reg_weight)
    progress_printer = ProgressPrinter(tag='Training', num_epochs=epochs)
    trainer = cntk.train.Trainer(model, (ce, pe), [learner], [progress_printer])

    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    print("Training started")
    batch_index = 0
    plot_data = {'batchindex': [], 'loss': [], 'error': []}
    for epoch in range(epochs):
        sample_count = 0
        while sample_count < epoch_size:
            data = reader_train.next_minibatch(
                min(minibatch_size, epoch_size - sample_count), input_map=input_map)
            trainer.train_minibatch(data)
            sample_count += data[label_var].num_samples
            batch_index += 1
            plot_data['batchindex'].append(batch_index)
            plot_data['loss'].append(trainer.previous_minibatch_loss_average)
            plot_data['error'].append(trainer.previous_minibatch_evaluation_average)
        trainer.summarize_training_progress()

    # Evaluate on the test reader; its own streams are used for the evaluation input map.
    input_map_test = {
        input_var: reader_test.streams.features,
        label_var: reader_test.streams.labels
    }

    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0
    epoch_size = num_test
    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map_test)
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        sample_count += data[label_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.1f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom, metric_denom))
    print("")

    model.save(output_model_path, format=ModelFormat.ONNX)
def get_minibatch(bmuf, working_dir, mb_source):
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

    if mb_source == "numpy":
        for i in range(num_batches):
            features = []
            labels = []
            for j in range(batch_size):
                seq_len_j = [seq_len, seq_len + 5, seq_len - 5][j % 3]
                x = np.random.rand(seq_len_j, feat_dim).astype(np.float32)
                y = np.random.rand(seq_len_j, label_dim).astype(np.float32)
                features.append(x)
                labels.append(y)
            yield {bmuf.feat: features, bmuf.label: labels}

    if mb_source in ("ctf_utterance", "ctf_frame", "ctf_bptt"):
        if mb_source == "ctf_frame":
            # Frame-mode data without sequence ids.
            ctf_data = '''\
|S0 0.49 0.18 0.84 0.7 0.59 |S1 0.12 0.24 0.14
|S0 0.69 0.63 0.47 0.93 0.69 |S1 0.34 0.85 0.17
|S0 0.04 0.5 0.39 0.86 0.28 |S1 0.62 0.36 0.53
|S0 0.71 0.9 0.15 0.83 0.18 |S1 0.2 0.74 0.04
|S0 0.38 0.67 0.46 0.53 0.75 |S1 0.6 0.14 0.35
|S0 0.94 0.54 0.09 0.55 0.08 |S1 0.07 0.53 0.47
|S0 0.11 0.24 0.17 0.72 0.72 |S1 0.9 0.98 0.18
|S0 0.3 1. 0.34 0.06 0.78 |S1 0.15 0.69 0.63
|S0 0.69 0.86 0.59 0.49 0.99 |S1 0.13 0.6 0.21
'''
        else:
            # Sequence-mode data with sequence ids.
            ctf_data = '''\
0 |S0 0.49 0.18 0.84 0.7 0.59 |S1 0.12 0.24 0.14
0 |S0 0.69 0.63 0.47 0.93 0.69 |S1 0.34 0.85 0.17
0 |S0 0.04 0.5 0.39 0.86 0.28 |S1 0.62 0.36 0.53
0 |S0 0.71 0.9 0.15 0.83 0.18 |S1 0.2 0.74 0.04
0 |S0 0.38 0.67 0.46 0.53 0.75 |S1 0.6 0.14 0.35
0 |S0 0.94 0.54 0.09 0.55 0.08 |S1 0.07 0.53 0.47
0 |S0 0.11 0.24 0.17 0.72 0.72 |S1 0.9 0.98 0.18
2 |S0 0.3 1. 0.34 0.06 0.78 |S1 0.15 0.69 0.63
2 |S0 0.69 0.86 0.59 0.49 0.99 |S1 0.13 0.6 0.21
'''
        ctf_file = os.path.join(working_dir, '2seqtest.txt')
        with open(ctf_file, 'w') as f:
            f.write(ctf_data)

        # ctf_utterance mode is the default: no frame mode, no truncation.
        frame_mode = False
        truncation_length = 0
        if mb_source == "ctf_frame":
            frame_mode = True
        elif mb_source == "ctf_bptt":
            truncation_length = 2

        mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
            features=StreamDef(field='S0', shape=feat_dim, is_sparse=False),
            labels=StreamDef(field='S1', shape=label_dim, is_sparse=False)
        )), randomize=False, max_samples=batch_size * num_batches,
            frame_mode=frame_mode, truncation_length=truncation_length)

        for i in range(num_batches):
            minibatch = mbs.next_minibatch(batch_size, {bmuf.feat: mbs.streams.features,
                                                        bmuf.label: mbs.streams.labels})
            if not minibatch:
                break
            yield minibatch
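# A hedged sketch of how a minibatch generator like get_minibatch() could be consumed in a
# training loop. The attribute name bmuf.trainer is a hypothetical stand-in for however the
# surrounding code reaches its Trainer instance; it is not taken from the original source.
def drive_training(bmuf, working_dir, mb_source="ctf_utterance"):
    for minibatch in get_minibatch(bmuf, working_dir, mb_source):
        # train_minibatch accepts a mapping of variables to data, whether the values are
        # lists of numpy arrays (the "numpy" branch) or MinibatchData (the CTF branches).
        bmuf.trainer.train_minibatch(minibatch)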
def driver(self):
    np.random.seed(0)

    # Define the data dimensions.
    image_shape = (1, 28, 28)
    input_dim = int(np.prod(image_shape, dtype=int))
    output_dim = 10
    num_train_samples = 60000
    num_test_samples = 10000

    # The local paths where the training and test data might be found or will be downloaded to.
    training_data_path = os.path.join(os.getcwd(), "MNIST_data", "Train-28x28_cntk_text.txt")
    testing_data_path = os.path.join(os.getcwd(), "MNIST_data", "Test-28x28_cntk_text.txt")

    # Download the data if it doesn't already exist.
    if not os.path.exists(training_data_path):
        url_train_image = "train-images-idx3-ubyte.gz"
        url_train_labels = "train-labels-idx1-ubyte.gz"
        print("Loading training data")
        saved_data_dir = os.path.join(os.getcwd(), "MNIST_data")
        train = self.load_mnist_data(url_train_image, url_train_labels,
                                     num_train_samples, local_data_dir=saved_data_dir)
        print("Writing training data text file...")
        self.save_as_txt(training_data_path, train)
        print("[Done]")

    if not os.path.exists(testing_data_path):
        url_test_image = "t10k-images-idx3-ubyte.gz"
        url_test_labels = "t10k-labels-idx1-ubyte.gz"
        print("Loading testing data")
        saved_data_dir = os.path.join(os.getcwd(), "MNIST_data2")
        test = self.load_mnist_data(url_test_image, url_test_labels,
                                    num_test_samples, saved_data_dir)
        print("Writing testing data text file...")
        self.save_as_txt(testing_data_path, test)
        print("[Done]")

    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    # Convert to a CNTK MinibatchSource.
    # The original (deprecated) API was:
    # train_minibatch_source = cntk.text_format_minibatch_source(training_data_path, [
    #     cntk.StreamConfiguration(feature_stream_name, input_dim),
    #     cntk.StreamConfiguration(labels_stream_name, output_dim)])
    train_minibatch_source = MinibatchSource(CTFDeserializer(
        training_data_path,
        StreamDefs(features=StreamDef(field='features', shape=input_dim, is_sparse=False),
                   labels=StreamDef(field='labels', shape=output_dim, is_sparse=False))))
    training_features = train_minibatch_source[feature_stream_name]
    training_labels = train_minibatch_source[labels_stream_name]

    print("Training data from file %s successfully read." % training_data_path)

    # test_minibatch_source = cntk.text_format_minibatch_source(testing_data_path, [
    #     cntk.StreamConfiguration(feature_stream_name, input_dim),
    #     cntk.StreamConfiguration(labels_stream_name, output_dim)])
    test_minibatch_source = MinibatchSource(CTFDeserializer(
        testing_data_path,
        StreamDefs(features=StreamDef(field='features', shape=input_dim, is_sparse=False),
                   labels=StreamDef(field='labels', shape=output_dim, is_sparse=False))))
    test_features = test_minibatch_source[feature_stream_name]
    test_labels = test_minibatch_source[labels_stream_name]

    print("Test data from file %s successfully read." % testing_data_path)
    # Define the input to the neural network.
    input_vars = cntk.ops.input_variable(image_shape, np.float32)

    # Create the convolutional neural network.
    output = self.create_convolutional_neural_network(input_vars, output_dim, dropout_prob=0.5)

    # ----------------------
    # Setting up the trainer
    # ----------------------

    # Define the label as the other input parameter of the trainer.
    labels = cntk.ops.input_variable(output_dim, np.float32)

    # Initialize the parameters for the trainer.
    train_minibatch_size = 50
    learning_rate = 1e-4
    momentum = 0.9

    # Define the loss function.
    # loss = cntk.ops.cross_entropy_with_softmax(output, labels)
    loss = cntk.cross_entropy_with_softmax(output, labels)

    # Define the function that calculates classification error.
    # label_error = cntk.ops.classification_error(output, labels)
    label_error = cntk.classification_error(output, labels)

    # Instantiate the trainer object to drive the model training.
    # learner = cntk.adam_sgd(output.parameters, learning_rate, momentum)
    learner = cntk.adam(output.parameters,
                        learning_rate_schedule(learning_rate, UnitType.sample),
                        momentum_schedule(momentum))
    trainer = cntk.Trainer(output, (loss, label_error), [learner])

    # -----------------------------------------
    # Training the Convolutional Neural Network
    # -----------------------------------------
    num_training_epoch = 1
    training_progress_output_freq = 100

    for epoch in range(num_training_epoch):
        sample_count = 0
        num_minibatch = 0

        # Loop over minibatches in the epoch.
        while sample_count < num_train_samples:
            minibatch = train_minibatch_source.next_minibatch(
                min(train_minibatch_size, num_train_samples - sample_count))

            # Specify the mapping of input variables in the model to actual minibatch data to be trained with.
            data = {
                input_vars: minibatch[training_features],
                labels: minibatch[training_labels]
            }
            trainer.train_minibatch(data)
            sample_count += data[labels].num_samples
            num_minibatch += 1

            # Print the training progress data.
            if num_minibatch % training_progress_output_freq == 0:
                # training_loss = cntk.get_train_loss(trainer)
                training_loss = trainer.previous_minibatch_loss_average
                # eval_error = cntk.get_train_eval_criterion(trainer)
                eval_error = trainer.previous_minibatch_evaluation_average
                print("Epoch %d | # of Samples: %6d | Loss: %.6f | Error: %.6f"
                      % (epoch, sample_count, training_loss, eval_error))

    print("Training Completed.", end="\n\n")

    # --------------------
    # Classification Test
    # --------------------
    test_minibatch_size = 1000
    sample_count = 0
    test_results = []

    while sample_count < num_test_samples:
        minibatch = test_minibatch_source.next_minibatch(
            min(test_minibatch_size, num_test_samples - sample_count))

        # Specify the mapping of input variables in the model to actual minibatch data to be tested with.
        data = {
            input_vars: minibatch[test_features],
            labels: minibatch[test_labels]
        }
        eval_error = trainer.test_minibatch(data)
        test_results.append(eval_error)
        sample_count += data[labels].num_samples

    # Print the average of evaluation errors of all test minibatches.
    print("Average errors of all test minibatches: %.3f%%"
          % (float(np.mean(test_results, dtype=float)) * 100))
#############################
#        Prediction         #
#############################

sample_count = 0
output = np.zeros((number_images, numLabels, ImageH, ImageW), dtype=np.float32)

print("##################################################")
print("############## Start Prediction ##############")
print("##################################################\n")
print("Using model of epoch %d\n" % best_model)
print("Prediction: 0 %% (% 5.1f samples/s)" % 0, end='', flush=True)

while sample_count < number_images:
    t_start_mb = time.time()
    currentMBsize = min(minibatchSize, number_images - sample_count)
    data = reader.next_minibatch(currentMBsize, input_map=input_map)
    output_mb = model.eval(data)
    output[sample_count:sample_count + currentMBsize, ] = np.squeeze(output_mb)
    sample_count += currentMBsize

    # Overwrite the progress line in place.
    sys.stdout.write('\b' * 23)
    print("% 3d %% (% 5.1f samples/s)"
          % (math.floor(100 * sample_count / number_images),
             currentMBsize / (time.time() - t_start_mb)),
          end='', flush=True)

sys.stdout.write('\b' * 18)
print(" ")

print("\nSaving file...", end='', flush=True)
sio.savemat(OutputFile_PathAbs, {'pred': np.transpose(output)})
print("Finished!\n")