def test_session_cv_callback_with_cross_validation_3_times(tmpdir, device_id):
    device = cntk_device(device_id)
    t, feature, label = create_sample_model(device)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    cv_mbs = mb_source(tmpdir, "cv")

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    def cv_callback(index, average_error, num_samples, num_mb):
        initial_position = cv_mbs.current_position
        total_error = 0
        while True:
            mb = cv_mbs.next_minibatch(2, input_map=input_map)
            if not mb:
                break
            mb_error = t.test_minibatch(mb, device=device)
            total_error += mb_error * mb[label].num_samples

        total_samples = 25  # Please see input data
        assert (total_error * 100) / total_samples == 92

        cv_mbs.current_position = initial_position
        return True

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        cv_config=C.CrossValidationConfig(frequency=20, callback=cv_callback)
    ).train(device)

    assert t.total_number_of_samples_seen == 61
def train_and_test(network, trainer, train_source, test_source, minibatch_size,
                   epoch_size, restore, profiling=False):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    # Train all minibatches
    if profiling:
        cntk.start_profiler(sync_gpu=True)

    training_session(
        trainer=trainer,
        mb_source=train_source,
        var_to_stream=input_map,
        mb_size=minibatch_size,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig(
            frequency=epoch_size,
            filename=os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
            restore=restore),
        cv_config=CrossValidationConfig(source=test_source, mb_size=minibatch_size)
    ).train()

    if profiling:
        cntk.stop_profiler()
def test_session_cv_callback_early_exit(tmpdir, device_id):
    device = cntk_device(device_id)
    t, feature, label = create_sample_model(device)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    counter = [0]

    def cv_callback(index, average_error, num_samples, num_mb):
        assert counter[0] == index
        assert average_error == 0
        assert num_samples == 0
        assert num_mb == 0
        counter[0] += 1
        return counter[0] < 1

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        cv_config=C.CrossValidationConfig(frequency=20, callback=cv_callback)
    ).train(device)

    assert counter == [1]
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    # use a higher learning rate: the model does not need to converge,
    # it just needs to go through all the samples
    t, feature, label = create_sample_model(
        device, writer,
        lr_per_sample=C.learning_rate_schedule(0.3, C.UnitType.sample))
    mbs = mb_source(tmpdir, "training",
                    #max_samples=INFINITELY_REPEAT,
                    max_sweeps=4)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=input_map,
        max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep)
    ).train(device)

    # 4 sweeps of 25 samples = 100 samples
    assert t.total_number_of_samples_seen == 100
    # output every 2 sweeps; with 4 sweeps in total, 2 outputs are written by the end:
    assert writer.training_summary_counter == 2
def run_simple_training(tmpdir, device_id, test_config_factory):
    device = cntk_device(device_id)
    writer = MockProgressWriter(expected_test_summary=[[92, 25]])
    t, feature, label = create_sample_model(device, writer)

    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    mbs1 = mb_source(tmpdir, "test", ctf=ctf_data2, streams=['S4', 'S5'])

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }
    input_map1 = {
        feature: mbs1.streams.features,
        label: mbs1.streams.labels
    }

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        test_config=test_config_factory(mbs1, input_map)
    ).train(device)

    assert t.total_number_of_samples_seen == 61
    assert writer.test_summary_counter == 1
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    # use a higher learning rate: the model does not need to converge,
    # it just needs to go through all the samples
    t, feature, label = create_sample_model(
        device, writer,
        lr_per_sample=C.learning_parameter_schedule_per_sample(0.3))
    mbs = mb_source(tmpdir, "training",
                    #max_samples=INFINITELY_REPEAT,
                    max_sweeps=4)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=input_map,
        max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep)
    ).train(device)

    # 4 sweeps of 25 samples = 100 samples
    assert t.total_number_of_samples_seen == 100
    # output every 2 sweeps; with 4 sweeps in total, 2 outputs are written by the end:
    assert writer.training_summary_counter == 2
def train_and_test(network, trainer, train_source, test_source, minibatch_size,
                   epoch_size, restore, model_path=_MODEL_PATH, cv_config=None):
    """ Train and test """

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    cntk.training_session(
        trainer=trainer,
        mb_source=train_source,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        checkpoint_config=cntk.CheckpointConfig(
            filename=os.path.join(model_path, _MODEL_NAME),
            restore=restore),
        progress_frequency=epoch_size,
        cv_config=cv_config
    ).train()
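# A hedged usage sketch for the function above: the caller constructs the optional
# cv_config itself. The frequency value and the reuse of test_source as the
# validation reader are assumptions for illustration, not taken from the source:
cv_config = cntk.CrossValidationConfig(
    minibatch_source=test_source,   # must be a reader with a limited sample budget
    minibatch_size=minibatch_size,
    frequency=epoch_size)           # validate once per epoch
train_and_test(network, trainer, train_source, test_source,
               minibatch_size, epoch_size, restore=False, cv_config=cv_config)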
def finalize_network(reader, model_details, max_amount_of_epochs,
                     samples_per_epoch, samples_per_minibatch,
                     pixel_dimensions, classes, learning_rate):
    features = input_variable(shape=(pixel_dimensions['depth'],
                                     pixel_dimensions['height'],
                                     pixel_dimensions['width']))
    label = input_variable(shape=len(classes))

    # speeds up training
    normalized_features = element_times(1.0 / 256.0, features)

    model = create_tf_model(model_details,
                            num_classes=len(classes),
                            input_features=normalized_features,
                            freeze=True)

    loss = cross_entropy_with_softmax(model, label)
    metric = classification_error(model, label)

    learner = momentum_sgd(parameters=model.parameters,
                           lr=learning_rate_schedule(learning_rate, UnitType.minibatch),
                           momentum=0.9,
                           l2_regularization_weight=0.0005)
    reporter = ProgressPrinter(tag='training', num_epochs=max_amount_of_epochs)
    trainer = Trainer(model=model,
                      criterion=(loss, metric),
                      parameter_learners=[learner],
                      progress_writers=[reporter])

    log_number_of_parameters(model)

    map_input_to_streams_train = {
        features: reader.streams.features,
        label: reader.streams.labels
    }

    training_session(trainer=trainer,
                     mb_source=reader,
                     model_inputs_to_streams=map_input_to_streams_train,
                     mb_size=samples_per_minibatch,
                     progress_frequency=samples_per_epoch,
                     checkpoint_config=CheckpointConfig(
                         frequency=samples_per_epoch,
                         filename=os.path.join("./checkpoints", "ConvNet_Lego_VisiOn"),
                         restore=True)).train()

    network = {'features': features, 'label': label, 'model': softmax(model)}
    model_name = "CNN-3200-224-resnet-18.model"
    export_path = os.path.abspath(
        os.path.join("..", "..", "Final models", "CNN", model_name))
    model.save(export_path)
    return network
def test_session_sanity_check(tmpdir, device_id):
    device = cntk_device(device_id)
    t, feature, label = create_sample_model(device)
    mbs = mb_source(tmpdir, "training")

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    C.training_session(
        trainer=t,
        mb_source=mbs,
        model_inputs_to_streams=input_map,
        mb_size=4
    ).train(device)
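# The tests in this corpus rely on shared helpers (cntk_device, create_sample_model,
# mb_source, MockProgressWriter, INFINITELY_REPEAT, FULL_DATA_SWEEP) defined in the
# surrounding test module. A minimal sketch of what the two central helpers could
# look like -- the model shape, stream fields, and CTF layout are assumptions for
# illustration, not the actual fixtures:
import cntk as C
from cntk.io import MinibatchSource, CTFDeserializer, StreamDefs, StreamDef

def create_sample_model_sketch(device, writer=None, lr_per_sample=None):
    # a one-layer softmax model over 2-dimensional sequence input (assumed shape)
    feature = C.sequence.input_variable(shape=(2,))
    label = C.input_variable(shape=(2,))
    w = C.parameter(shape=(2, 2), init=C.glorot_uniform(), device=device)
    z = C.times(C.sequence.reduce_sum(feature), w)
    ce = C.cross_entropy_with_softmax(z, label)
    errs = C.classification_error(z, label)
    if lr_per_sample is None:
        lr_per_sample = C.learning_parameter_schedule_per_sample(0.02)
    learner = C.sgd(z.parameters, lr_per_sample)
    writers = [writer] if writer is not None else []
    return C.Trainer(z, (ce, errs), [learner], writers), feature, label

def mb_source_sketch(tmpdir, name, max_samples=C.io.FULL_DATA_SWEEP):
    # the tests assume a CTF file with 25 samples; field names are illustrative
    ctf_file = str(tmpdir / (name + ".ctf"))
    return MinibatchSource(
        CTFDeserializer(ctf_file, StreamDefs(
            features=StreamDef(field='S0', shape=2),
            labels=StreamDef(field='S1', shape=2))),
        max_samples=max_samples)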
def test_session_restart_from_end_checkpoint(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=20,
        checkpoint_config=C.CheckpointConfig(
            frequency=20,
            filename=str(tmpdir / "restart_from_checkpoint"))
    ).train(device)

    candidates = [f for f in listdir(test_dir)
                  if isfile(join(test_dir, f)) and
                  f.startswith("restart_from_checkpoint")]
    assert len(candidates) == 2
    assert "restart_from_checkpoint" in candidates
    assert "restart_from_checkpoint.ckp" in candidates

    # remove information from the mock printer
    writer.minibatch_info = []
    writer.training_summary_counter = 0
    writer.testing_summary_counter = 0

    # restoring from a particular checkpoint should not cause any training
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=20,
        checkpoint_config=C.CheckpointConfig(
            frequency=35,
            restore=True,
            filename=str(tmpdir / "restart_from_checkpoint"))
    ).train(device)

    assert len(writer.minibatch_info) == 0
    assert writer.training_summary_counter == 0
    assert writer.testing_summary_counter == 0
def test_session_max_samples(tmpdir, device_id):
    device = cntk_device(device_id)
    t, feature, label = create_sample_model(device)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    C.training_session(
        trainer=t,
        mb_source=mbs,
        model_inputs_to_streams=input_map,
        mb_size=4,
        max_samples=20
    ).train(device)

    assert t.total_number_of_samples_seen == 21
def train_and_test(network, trainer, train_source, test_source, progress_printer,
                   minibatch_size, epoch_size, profiling=False):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        model_inputs_to_mb_source_mapping=input_map,
        checkpoint_frequency=epoch_size,
        checkpoint_filename="ResNet_CIFAR10_DataAug",
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)

    if profiling:
        start_profiler(sync_gpu=True)

    training_session.train()

    if profiling:
        stop_profiler()
def train_and_test(network, trainer, train_source, test_source, progress_writers,
                   minibatch_size, epoch_size, restore, profiling=False):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=input_map,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_writers,
        checkpoint_frequency=epoch_size,
        checkpoint_filename=os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
        # save_all_checkpoints = False,
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore=restore)

    # Train all minibatches
    if profiling:
        cntk.start_profiler(sync_gpu=True)

    training_session.train()

    if profiling:
        cntk.stop_profiler()
def train_and_test(network, trainer, train_source, test_source, progress_printer,
                   minibatch_size, epoch_size, restore):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=input_map,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename=os.path.join(model_path, model_name),
        # save_all_checkpoints = True,
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore=restore)

    # Train all minibatches
    training_session.train()
def test_usermbsource_training(tmpdir):
    input_dim = 1000
    num_output_classes = 5

    mbs = MyDataSource(input_dim, num_output_classes)

    from cntk import sequence, parameter, plus, cross_entropy_with_softmax, \
        classification_error, learning_rate_schedule, sgd, Trainer, \
        training_session, times, UnitType

    feature = sequence.input_variable(shape=(input_dim,))
    label = C.input_variable(shape=(num_output_classes,))
    p = parameter(shape=(input_dim, num_output_classes), init=10)
    z = times(sequence.reduce_sum(feature), p, name='z')
    ce = cross_entropy_with_softmax(z, label)
    errs = classification_error(z, label)

    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])
    input_map = {
        feature: mbs.fsi,
        label: mbs.lsi
    }

    session = training_session(
        trainer=trainer, mb_source=mbs,
        model_inputs_to_streams=input_map,
        mb_size=4, max_samples=20
    )
    session.train()

    assert trainer.total_number_of_samples_seen == 20
def train_and_test(network, trainer, train_source, test_source, minibatch_size,
                   epoch_size, restore):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    # Train all minibatches
    training_session(
        trainer=trainer,
        mb_source=train_source,
        var_to_stream=input_map,
        mb_size=minibatch_size,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig(
            filename=os.path.join(model_path, model_name),
            restore=restore),
        cv_config=CrossValidationConfig(source=test_source, mb_size=minibatch_size)
    ).train()
def train_and_test(network, trainer, train_source, test_source, minibatch_size,
                   epoch_size):
    input_map = {
        network['feature']: train_source.streams.amazing_features,
        network['label']: train_source.streams.awesome_labels
    }

    training_session(
        trainer=trainer,
        mb_source=train_source,
        var_to_stream=input_map,
        mb_size=minibatch_size,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig(
            frequency=epoch_size,
            filename=os.path.join(model_path, "HKT_LSTM_Truncated"),
            restore=False),
        cv_config=CrossValidationConfig(source=test_source, mb_size=minibatch_size)
    ).train()
def test_session_cross_validation_at_end(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter(expected_test_summary=[[92, 25]])
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    mbs1 = mb_source(tmpdir, "cv")

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=20,
        cv_config=C.CrossValidationConfig(mbs1)
    ).train(device)

    assert t.total_number_of_samples_seen == 21
    assert writer.test_summary_counter == 1
def test_session_progress_print(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=10  # by default, the frequency is in samples
    ).train(device)

    assert writer.training_summary_counter == 6
def train_and_test(network, trainer, train_reader, test_reader, progress_printer,
                   epoch_size, minibatch_size):
    train_bind = {
        network['raw_input']: train_reader.streams.features,
        network['raw_labels']: train_reader.streams.labels
    }

    training_session(
        mb_source=train_reader,
        trainer=trainer,
        var_to_stream=train_bind,
        mb_size=minibatch_size,
        progress_printer=progress_printer,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig(
            frequency=epoch_size,
            filename=os.path.join(model_path, "SequenceToSequence"),
            restore=False),
        cv_config=CrossValidationConfig(source=test_reader, mb_size=minibatch_size)
    ).train()
def test_session_progress_print_on_minibatch_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=(5, C.train.DataUnit.minibatch)
    ).train(device)

    # mb size = 4; num_of_mb = 60 / 4 = 15; output every 5 minibatches,
    # so 3 outputs are written by the end:
    assert writer.training_summary_counter == 3
def test_session_with_test(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter(expected_test_summary=[[92, 25]])
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    mbs1 = mb_source(tmpdir, "test")

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        test_config=C.TestConfig(mbs1, minibatch_size=2),
    ).train(device)

    assert t.total_number_of_samples_seen == 61
    assert writer.test_summary_counter == 1
def test_session_cross_validation_3_times_on_sweep_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter(
        expected_test_summary=[[92, 25], [92, 25], [92, 25]])
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    mbs1 = mb_source(tmpdir, "cv")

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=75,
        cv_config=C.CrossValidationConfig(
            mbs1, frequency=(1, C.train.DataUnit.sweep), minibatch_size=2),
    ).train(device)

    assert t.total_number_of_samples_seen == 75
    assert writer.test_summary_counter == 3
def test_session_cross_validation_3_times_checkpoints_2_save_all(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter(
        expected_test_summary=[[92, 25], [92, 25], [92, 25]])
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    mbs1 = mb_source(tmpdir, "cv")

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        checkpoint_config=C.CheckpointConfig(
            frequency=35,
            preserve_all=True,
            filename=str(tmpdir / "checkpoint_save_all")),
        cv_config=C.CrossValidationConfig(mbs1, frequency=20)
    ).train(device)

    candidates = [f for f in listdir(test_dir)
                  if isfile(join(test_dir, f)) and
                  f.startswith("checkpoint_save_all")]

    assert "checkpoint_save_all0" in candidates
    assert "checkpoint_save_all0.ckp" in candidates
    assert "checkpoint_save_all1" in candidates
    assert "checkpoint_save_all1.ckp" in candidates
    assert "checkpoint_save_all" in candidates
    assert "checkpoint_save_all.ckp" in candidates
    assert writer.test_summary_counter == 3
def test_session_cross_validation_3_times_on_minibatch_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter(
        expected_test_summary=[[92, 25], [92, 25], [92, 25]])
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    mbs1 = mb_source(tmpdir, "cv")

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        cv_config=C.CrossValidationConfig(
            mbs1, frequency=(5, C.train.DataUnit.minibatch), minibatch_size=2),
    ).train(device)

    assert t.total_number_of_samples_seen == 61
    assert writer.test_summary_counter == 3
def test_session_cross_validation_3_times_checkpoints_2_save_all_on_sweep_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter(
        expected_test_summary=[[92, 25], [92, 25], [92, 25]])
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    mbs1 = mb_source(tmpdir, "cv")

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=75,
        checkpoint_config=C.CheckpointConfig(
            frequency=(1, C.train.DataUnit.sweep),
            preserve_all=True,
            filename=str(tmpdir / "checkpoint_save_all")),
        cv_config=C.CrossValidationConfig(mbs1, frequency=25)
    ).train(device)

    candidates = [f for f in listdir(test_dir)
                  if isfile(join(test_dir, f)) and
                  f.startswith("checkpoint_save_all")]

    assert "checkpoint_save_all0" in candidates
    assert "checkpoint_save_all0.ckp" in candidates
    assert "checkpoint_save_all1" in candidates
    assert "checkpoint_save_all1.ckp" in candidates
    assert "checkpoint_save_all" in candidates
    assert "checkpoint_save_all.ckp" in candidates
    assert writer.test_summary_counter == 3
def test_usermbsource_training(tmpdir, with_checkpoint_impl):
    input_dim = 1000
    num_output_classes = 5

    mbs = MyDataSource(input_dim, num_output_classes)
    # Using this for testing the UserMinibatchSource checkpointing
    if with_checkpoint_impl:
        MBS_CV_CLASS = MyDataSourceWithCheckpoint
    else:
        MBS_CV_CLASS = MyDataSource

    mbs_cv = MBS_CV_CLASS(input_dim, num_output_classes)

    from cntk import sequence, parameter, plus, cross_entropy_with_softmax, \
        classification_error, learning_rate_schedule, sgd, Trainer, \
        training_session, times, UnitType

    feature = sequence.input_variable(shape=(input_dim,))
    label = C.input_variable(shape=(num_output_classes,))
    p = parameter(shape=(input_dim, num_output_classes), init=10)
    z = times(sequence.reduce_sum(feature), p, name='z')
    ce = cross_entropy_with_softmax(z, label)
    errs = classification_error(z, label)

    # use a large learning rate so the model does not converge before all the
    # intended samples are fed; note that the training session can end early
    # if there are no parameter updates
    lr_per_sample = learning_rate_schedule(0.3, UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])
    input_map = {
        feature: mbs.fsi,
        label: mbs.lsi
    }

    session = training_session(
        trainer=trainer, mb_source=mbs,
        model_inputs_to_streams=input_map,
        mb_size=4, max_samples=20,
        cv_config=C.CrossValidationConfig(
            minibatch_source=mbs_cv, max_samples=10, minibatch_size=2)
    )
    session.train()

    assert trainer.total_number_of_samples_seen == 20
    if with_checkpoint_impl:
        assert mbs_cv._restore_from_checkpoint_calls == 1
def test_usermbsource_training(tmpdir, with_checkpoint_impl):
    input_dim = 1000
    num_output_classes = 5

    mbs = MyDataSource(input_dim, num_output_classes)
    # Using this for testing the UserMinibatchSource checkpointing
    if with_checkpoint_impl:
        MBS_CV_CLASS = MyDataSourceWithCheckpoint
    else:
        MBS_CV_CLASS = MyDataSource

    mbs_cv = MBS_CV_CLASS(input_dim, num_output_classes)

    from cntk import sequence, parameter, plus, cross_entropy_with_softmax, \
        classification_error, learning_parameter_schedule_per_sample, sgd, Trainer, \
        training_session, times

    feature = sequence.input_variable(shape=(input_dim,))
    label = C.input_variable(shape=(num_output_classes,))
    p = parameter(shape=(input_dim, num_output_classes), init=10)
    z = times(sequence.reduce_sum(feature), p, name='z')
    ce = cross_entropy_with_softmax(z, label)
    errs = classification_error(z, label)

    # use a large learning rate so the model does not converge before all the
    # intended samples are fed; note that the training session can end early
    # if there are no parameter updates
    lr_per_sample = learning_parameter_schedule_per_sample(0.3)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])
    input_map = {
        feature: mbs.fsi,
        label: mbs.lsi
    }

    session = training_session(
        trainer=trainer, mb_source=mbs,
        model_inputs_to_streams=input_map,
        mb_size=4, max_samples=20,
        cv_config=C.CrossValidationConfig(
            minibatch_source=mbs_cv, max_samples=10, minibatch_size=2)
    )
    session.train()

    assert trainer.total_number_of_samples_seen == 20
    if with_checkpoint_impl:
        assert mbs_cv._restore_from_checkpoint_calls == 1
def train_and_test(network, trainer, train_source, test_source, progress_printer,
                   epoch_size):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        train_source, trainer, cntk.minibatch_size_schedule(64),
        progress_printer, input_map, "ConvNet_CIFAR10_DataAug_", epoch_size)
    training_session.train()

    ### TODO: Stay tuned for an upcoming simpler EvalSession API for test/validation.
    ### Evaluation action
    minibatch_size = 16

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    while True:
        data = test_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data:
            break

        local_mb_samples = data[network['label']].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    fin_msg = "Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom, metric_denom)
    progress_printer.end_progress_print(fin_msg)

    print("")
    print(fin_msg)
    print("")

    return metric_numer / metric_denom
def train_and_test(network, trainer, train_source, test_source, progress_printer,
                   minibatch_size, epoch_size):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        train_source, trainer, cntk.minibatch_size_schedule(minibatch_size),
        progress_printer, input_map, os.path.join(model_path, "AlexNet_"),
        epoch_size)
    training_session.train()

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    while True:
        data = test_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data:
            break

        local_mb_samples = data[network['label']].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    fin_msg = "Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom, metric_denom)
    progress_printer.end_progress_print(fin_msg)

    print("")
    print(fin_msg)
    print("")

    return metric_numer / metric_denom
def train_and_test(network, trainer, train_source, test_source, progress_printer,
                   epoch_size):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=input_map,
        mb_size_schedule=cntk.minibatch_size_schedule(64),
        progress_printer=progress_printer,
        checkpoint_filename=os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)

    # Train all minibatches
    training_session.train()
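# The preceding train_and_test variants use the pre-release training_session
# signature (training_minibatch_source, mb_size_schedule, cv_source, positional
# checkpoint arguments) and, in two cases, a manual evaluation loop. A sketch of
# the roughly equivalent call against the released keyword API, with TestConfig
# standing in for the manual loop; argument values are carried over for
# illustration only, not taken from any one example:
cntk.training_session(
    trainer=trainer,
    mb_source=train_source,
    mb_size=cntk.minibatch_size_schedule(64),
    model_inputs_to_streams=input_map,
    progress_frequency=epoch_size,
    checkpoint_config=cntk.CheckpointConfig(
        frequency=epoch_size,
        filename=os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
        restore=False),
    test_config=cntk.TestConfig(test_source, minibatch_size=16)
).train()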
def test_training_session_with_infinite_samples(tmpdir, device_id):
    import pytest
    device = cntk_device(device_id)
    t, feature, label = create_sample_model(device)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    with pytest.raises(ValueError) as info1:
        C.training_session(
            trainer=t,
            mb_source=mbs,
            mb_size=4,
            model_inputs_to_streams=input_map
        ).train(device)
    assert 'Train minibatch source must have a limited number of samples or sweeps' in str(info1.value)

    with pytest.raises(ValueError) as info2:
        mbs1 = mb_source(tmpdir, "test", max_samples=INFINITELY_REPEAT)
        C.training_session(
            trainer=t,
            mb_source=mbs,
            mb_size=4,
            model_inputs_to_streams=input_map,
            max_samples=10,
            test_config=C.TestConfig(mbs1, minibatch_size=2),
        ).train(device)
    assert 'Test minibatch source must have a limited number of samples or sweeps' in str(info2.value)

    with pytest.raises(ValueError) as info3:
        mbs2 = mb_source(tmpdir, "cv", max_samples=INFINITELY_REPEAT)
        C.training_session(
            trainer=t,
            mb_source=mbs,
            mb_size=4,
            model_inputs_to_streams=input_map,
            max_samples=20,
            cv_config=C.CrossValidationConfig(mbs2)
        ).train(device)
    assert 'Cross validation minibatch source must have a limited number of samples or sweeps' in str(info3.value)
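# The ValueError paths above are triggered by INFINITELY_REPEAT sources with no
# sample budget anywhere. A minimal sketch of a bounded reader that
# training_session accepts; the file name, stream fields, and shapes are
# assumptions for illustration:
from cntk.io import MinibatchSource, CTFDeserializer, StreamDefs, StreamDef

bounded_mbs = MinibatchSource(
    CTFDeserializer("train.ctf", StreamDefs(
        features=StreamDef(field='S0', shape=2),
        labels=StreamDef(field='S1', shape=2))),
    max_sweeps=1)  # a finite sweep (or max_samples=N) budget on the reader avoids the error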
def train(reader_train, reader_test, samples_per_epoch, max_amount_of_epochs,
          samples_per_minibatch, dimensions, classes, learning_rate,
          output_directory, with_tf):
    features = input_variable(shape=(dimensions['depth'], dimensions['height'],
                                     dimensions['width']))
    label = input_variable(shape=len(classes))

    # speeds up training
    normalized_features = element_times(1.0 / 256.0, features)

    if with_tf:
        base_model = {
            'model_file': os.path.join("..", "..", "Pretrained Models/ResNet_18.model"),
            'feature_node_name': 'features',
            'last_hidden_node_name': 'z.x',
            'image_dims': (3, 224, 224)
        }
        model = create_tf_model(base_model,
                                num_classes=len(classes),
                                input_features=normalized_features,
                                freeze=True)
    else:
        model = create_model(feature_dimensions=normalized_features, classes=classes)

    loss = cross_entropy_with_softmax(model, label)
    metric = classification_error(model, label)

    learner = momentum_sgd(parameters=model.parameters,
                           lr=learning_rate_schedule(learning_rate, UnitType.minibatch),
                           momentum=0.9,
                           l2_regularization_weight=0.0005)
    reporter = ProgressPrinter(tag='training', num_epochs=max_amount_of_epochs)
    trainer = Trainer(model=model,
                      criterion=(loss, metric),
                      parameter_learners=[learner],
                      progress_writers=[reporter])

    log_number_of_parameters(model)

    map_input_to_streams_train = {
        features: reader_train.streams.features,
        label: reader_train.streams.labels
    }
    map_input_to_streams_test = {
        features: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    training_session(
        trainer=trainer,
        mb_source=reader_train,
        model_inputs_to_streams=map_input_to_streams_train,
        mb_size=samples_per_minibatch,
        progress_frequency=samples_per_epoch,
        checkpoint_config=CheckpointConfig(
            frequency=samples_per_epoch,
            filename=os.path.join(output_directory, "ConvNet_Lego_VisiOn"),
            restore=False),
        test_config=TestConfig(
            reader_test,
            minibatch_size=samples_per_minibatch,
            model_inputs_to_streams=map_input_to_streams_test)).train()

    network = {'features': features, 'label': label, 'model': softmax(model)}
    return network
def test_session_restart_from_checkpoint_preserve_all(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=20,
        checkpoint_config=C.CheckpointConfig(
            frequency=20,
            preserve_all=True,
            filename=str(tmpdir / "restart_from_checkpoint"))
    ).train(device)

    candidates = [f for f in listdir(test_dir)
                  if isfile(join(test_dir, f)) and
                  f.startswith("restart_from_checkpoint")]
    assert "restart_from_checkpoint0" in candidates
    assert "restart_from_checkpoint0.ckp" in candidates
    assert "restart_from_checkpoint1" in candidates
    assert "restart_from_checkpoint1.ckp" in candidates
    assert "restart_from_checkpoint2" in candidates
    assert "restart_from_checkpoint2.ckp" in candidates
    assert "restart_from_checkpoint" in candidates
    assert "restart_from_checkpoint.ckp" in candidates

    # remove everything except for checkpoint 1
    for f in candidates:
        if f != "restart_from_checkpoint1" and f != "restart_from_checkpoint1.ckp":
            os.remove(str(tmpdir / f))

    # remove information about epochs 1 and 2 from the mock printer
    first_run_minibatch_info = [i for i in writer.minibatch_info
                                if i[0] != 0 and i[0] != 1]
    writer.minibatch_info = []
    writer.training_summary_counter = 2

    # restore from a particular checkpoint and again save everything from epoch 3
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=20,
        checkpoint_config=C.CheckpointConfig(
            frequency=20,
            restore=True,
            preserve_all=True,
            filename=str(tmpdir / "restart_from_checkpoint"))
    ).train(device)

    candidates = [f for f in listdir(test_dir)
                  if isfile(join(test_dir, f)) and
                  f.startswith("restart_from_checkpoint")]
    assert "restart_from_checkpoint1" in candidates
    assert "restart_from_checkpoint1.ckp" in candidates
    assert "restart_from_checkpoint2" in candidates
    assert "restart_from_checkpoint2.ckp" in candidates
    assert "restart_from_checkpoint" in candidates
    assert "restart_from_checkpoint.ckp" in candidates
    assert len(candidates) == 6
    assert first_run_minibatch_info == writer.minibatch_info

    # remove everything except for checkpoint 1
    for f in candidates:
        if f != "restart_from_checkpoint1" and f != "restart_from_checkpoint1.ckp":
            os.remove(str(tmpdir / f))

    # remove information about epochs 1 and 2 from the mock printer
    writer.minibatch_info = []
    writer.training_summary_counter = 2

    # rename checkpoint 1 to the generic name
    os.rename(str(tmpdir / "restart_from_checkpoint1"),
              str(tmpdir / "restart_from_checkpoint"))
    os.rename(str(tmpdir / "restart_from_checkpoint1.ckp"),
              str(tmpdir / "restart_from_checkpoint.ckp"))

    # restore from a particular checkpoint and again save everything from epoch 3
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=4,
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=20,
        checkpoint_config=C.CheckpointConfig(
            frequency=20,
            restore=True,
            preserve_all=True,
            filename=str(tmpdir / "restart_from_checkpoint"))
    ).train(device)

    candidates = [f for f in listdir(test_dir)
                  if isfile(join(test_dir, f)) and
                  f.startswith("restart_from_checkpoint")]
    assert "restart_from_checkpoint2" in candidates
    assert "restart_from_checkpoint2.ckp" in candidates
    assert "restart_from_checkpoint" in candidates
    assert "restart_from_checkpoint.ckp" in candidates
    assert len(candidates) == 4
    assert first_run_minibatch_info == writer.minibatch_info
def simple_mnist(tensorboard_logdir=None):
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    input = input_variable(input_dim, np.float32)
    label = input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), input)
    z = fully_connected_classifier_net(
        scaled_input, num_output_classes, hidden_layers_dim, num_hidden_layers, relu)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST")
    path = os.path.normpath(os.path.join(data_dir, "Train-28x28_cntk_text.txt"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        input: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    #training_progress_output_freq = 100
    progress_writers = [ProgressPrinter(
        #freq=training_progress_output_freq,
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    if tensorboard_logdir is not None:
        progress_writers.append(TensorBoardProgressWriter(
            freq=10, log_dir=tensorboard_logdir, model=z))

    # Instantiate the trainer object to drive the model training
    lr_per_minibatch = learning_rate_schedule(0.2, UnitType.minibatch)
    trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch),
                      progress_writers)

    training_session(
        trainer=trainer,
        mb_source=reader_train,
        mb_size=minibatch_size,
        var_to_stream=input_map,
        max_samples=num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        input: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
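# simple_mnist above uses the deprecated learning_rate_schedule/UnitType pair and
# the old var_to_stream keyword. Against CNTK 2.2+, the learner setup would
# presumably migrate to something like this sketch (z, ce, pe, progress_writers
# as defined above; the default minibatch-size semantics of
# learning_parameter_schedule are assumed here):
lr_per_minibatch = learning_parameter_schedule(0.2)
trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch),
                  progress_writers)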