def train_and_test(network,
                   trainer,
                   train_source,
                   test_source,
                   progress_printer,
                   minibatch_size,
                   epoch_size,
                   profiling=False):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        model_inputs_to_mb_source_mapping=input_map,
        checkpoint_frequency=epoch_size,
        checkpoint_filename="ResNet_CIFAR10_DataAug",
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)

    if profiling:
        start_profiler(sync_gpu=True)

    training_session.train()

    if profiling:
        stop_profiler()
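
# A rough sketch of the same routine written against the configuration-object API that
# Example #12 below uses (CheckpointConfig / TestConfig). The function name is illustrative,
# the argument names are assumed to match that example and vary between CNTK releases;
# progress writers attach to the Trainer rather than to the session in this API.
def train_and_test_v2(network, trainer, train_source, test_source,
                      minibatch_size, epoch_size, profiling=False):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    session = cntk.training_session(
        trainer=trainer,
        mb_source=train_source,
        mb_size=cntk.minibatch_size_schedule(minibatch_size),
        model_inputs_to_streams=input_map,
        progress_frequency=epoch_size,
        checkpoint_config=cntk.CheckpointConfig(
            filename="ResNet_CIFAR10_DataAug", frequency=epoch_size, restore=False),
        test_config=cntk.TestConfig(source=test_source, mb_size=16))

    if profiling:
        cntk.start_profiler(sync_gpu=True)

    session.train()

    if profiling:
        cntk.stop_profiler()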
Example #2
def train_and_test(network, trainer, train_source, test_source,
                   progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=input_map,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        #        checkpoint_frequency = epoch_size,
        checkpoint_filename=os.path.join(model_path, model_name),
        #        save_all_checkpoints = True,
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        #        cv_frequency = epoch_size,
        restore=restore)

    # Train all minibatches
    training_session.train()
Example #3
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source = train_source, 
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map, 
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size), 
        progress_printer = progress_printer, 
#        checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, model_name), 
#        save_all_checkpoints = True,
        progress_frequency = epoch_size, 
        cv_source = test_source, 
        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
#        cv_frequency = epoch_size,
        restore = restore)

    # Train all minibatches 
    training_session.train()
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, profiling=False):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source = train_source, 
        trainer = trainer,
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        model_inputs_to_mb_source_mapping = input_map, 
        checkpoint_frequency = epoch_size,
        checkpoint_filename="ResNet_CIFAR10_DataAug", 
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)
	
    if profiling:
        start_profiler(sync_gpu=True)
        
    training_session.train()
    
    if profiling:
        stop_profiler()
def train_and_test(network, trainer, train_source, test_source, progress_writers, minibatch_size, epoch_size, restore, profiling=False):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_writers,
        checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
#        save_all_checkpoints = False,
        progress_frequency=epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
#        cv_frequency = epoch_size,
        restore=restore)

    # Train all minibatches
    if profiling:
        cntk.start_profiler(sync_gpu=True)

    training_session.train()

    if profiling:
        cntk.stop_profiler()
Example #6
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    # set a higher learning rate, since we don't need convergence, just to go through all the samples
    t, feature, label = create_sample_model(device, writer, lr_per_sample=C.learning_parameter_schedule_per_sample(0.3))
    mbs = mb_source(tmpdir, "training",
                    #max_samples=INFINITELY_REPEAT,
                    max_sweeps = 4)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)

    C.training_session(
        trainer=t, mb_source=mbs,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=input_map, max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep)
    ).train(device)
    #4 sweeps of 25 samples = 100 samples
    assert(t.total_number_of_samples_seen == 100)
    # output every 2 sweeps; with 4 sweeps in total, 2 summaries are written by the end:
    assert(writer.training_summary_counter == 2)
Example #7
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    # set a higher learning rate, since we don't need convergence, just to go through all the samples
    t, feature, label = create_sample_model(
        device,
        writer,
        lr_per_sample=C.learning_rate_schedule(0.3, C.UnitType.sample))
    mbs = mb_source(
        tmpdir,
        "training",
        #max_samples=INFINITELY_REPEAT,
        max_sweeps=4)

    input_map = {feature: mbs.streams.features, label: mbs.streams.labels}

    test_dir = str(tmpdir)

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=input_map,
        max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep)).train(device)
    #4 sweeps of 25 samples = 100 samples
    assert (t.total_number_of_samples_seen == 100)
    # output every 2 sweeps; with 4 sweeps in total, 2 summaries are written by the end:
    assert (writer.training_summary_counter == 2)
Example #8
def train_and_test(network, trainer, train_source, test_source,
                   progress_printer, epoch_size):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=input_map,
        mb_size_schedule=cntk.minibatch_size_schedule(64),
        progress_printer=progress_printer,
        checkpoint_filename=os.path.join(model_path,
                                         "ConvNet_CIFAR10_DataAug"),
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)

    # Train all minibatches
    training_session.train()
Example #9
def test_session_progress_print(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {feature: mbs.streams.features, label: mbs.streams.labels}

    test_dir = str(tmpdir)

    C.training_session(trainer=t,
                       mb_source=mbs,
                       mb_size=C.minibatch_size_schedule(4),
                       model_inputs_to_streams=input_map,
                       max_samples=60,
                       progress_frequency=10).train(device)

    assert (writer.training_summary_counter == 6)
def train_and_test(network, trainer, train_source, test_source,
                   progress_printer, epoch_size):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(train_source, trainer,
                                             cntk.minibatch_size_schedule(64),
                                             progress_printer, input_map,
                                             "ConvNet_CIFAR10_DataAug_",
                                             epoch_size)
    training_session.train()

    ### TODO: Stay tuned for an upcoming simpler EvalSession API for test/validation.

    ### Evaluation action
    minibatch_size = 16

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    while True:
        data = test_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data: break
        local_mb_samples = data[network['label']].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    fin_msg = "Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom,
        metric_denom)
    progress_printer.end_progress_print(fin_msg)

    print("")
    print(fin_msg)
    print("")

    return metric_numer / metric_denom
Example #11
def test_session_progress_print_on_minibatch_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {feature: mbs.streams.features, label: mbs.streams.labels}

    test_dir = str(tmpdir)

    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=(5, C.train.DataUnit.minibatch)).train(device)
    #mb size = 4; num_of_mb = 60/4 = 15; output every 5 mb; at the end, 3 outputs are written:
    assert (writer.training_summary_counter == 3)
Example #12
def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size)

    # Train all minibatches 
    training_session(
        trainer=trainer, mb_source = train_source,
        model_inputs_to_streams = input_map,
        mb_size_schedule = mb_size_schedule,
        progress_frequency=epoch_size,
        checkpoint_config = CheckpointConfig(filename = os.path.join(model_path, model_name), restore=restore),
        test_config = TestConfig(source=test_source, mb_size=mb_size_schedule)
    ).train()
Example #13
def test_session_progress_print(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)

    C.training_session(
        trainer=t, mb_source=mbs, 
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map, max_samples=60,
        progress_frequency=10  # by default, the frequency is in samples
    ).train(device)

    assert(writer.training_summary_counter == 6)
Example #14
def test_session_progress_print_on_minibatch_unit(tmpdir, device_id):
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    t, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)

    C.training_session(
        trainer=t, mb_source=mbs,
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map, max_samples=60,
        progress_frequency=(5, C.train.DataUnit.minibatch)
    ).train(device)
    #mb size = 4; num_of_mb = 60/4 = 15; output every 5 mb; at the end, 3 outputs are written:
    assert(writer.training_summary_counter == 3)
Example #15
def train_and_test(network, trainer, train_source, test_source,
                   progress_printer, minibatch_size, epoch_size):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        train_source, trainer, cntk.minibatch_size_schedule(minibatch_size),
        progress_printer, input_map, os.path.join(model_path,
                                                  "AlexNet_"), epoch_size)
    training_session.train()

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    while True:
        data = test_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data: break
        local_mb_samples = data[network['label']].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    fin_msg = "Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom,
        metric_denom)
    progress_printer.end_progress_print(fin_msg)

    print("")
    print(fin_msg)
    print("")

    return metric_numer / metric_denom
Example #16
def simple_mnist():
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    input = input_variable(input_dim, np.float32)
    label = input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), input)
    z = fully_connected_classifier_net(scaled_input, num_output_classes,
                                       hidden_layers_dim, num_hidden_layers,
                                       relu)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST")

    path = os.path.normpath(os.path.join(data_dir,
                                         "Train-28x28_cntk_text.txt"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        input: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    lr_per_minibatch = learning_rate_schedule(0.2, UnitType.minibatch)
    # Instantiate the trainer object to drive the model training
    trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch))

    # Get minibatches of images to train with and perform model training
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10
    #training_progress_output_freq = 100

    progress_printer = ProgressPrinter(
        #freq=training_progress_output_freq,
        tag='Training',
        num_epochs=num_sweeps_to_train_with)

    session = training_session(
        training_minibatch_source=reader_train,
        trainer=trainer,
        mb_size_schedule=minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        model_inputs_to_mb_source_mapping=input_map,
        progress_frequency=num_samples_per_sweep,
        max_training_samples=num_samples_per_sweep * num_sweeps_to_train_with)

    session.train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        input: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size,
                                        input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
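
# A hedged alternative to the loop above, which reads int(10000 / 1024) = 9 full minibatches,
# so the last 784 test samples are never evaluated. The sketch below instead drains the reader
# and weights each minibatch's error by its sample count, as the train_and_test examples on
# this page do; the helper's name and parameters are illustrative.
def evaluate_all_test_samples(trainer, reader_test, input_map, label, mb_size=1024):
    metric_numer = 0.0
    metric_denom = 0
    while True:
        mb = reader_test.next_minibatch(mb_size, input_map=input_map)
        if not mb:
            break  # the single test sweep is exhausted
        mb_samples = mb[label].num_samples
        metric_numer += trainer.test_minibatch(mb) * mb_samples
        metric_denom += mb_samples
    return metric_numer / metric_denom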
Example #17
# Configure distributed training.
# For this, we wrap the learner in a distributed_learner object.
# The call below performs plain data-parallel SGD; the BlockMomentum method is available
# through block_momentum_distributed_learner instead. The Python script must be run with
# mpiexec for the distribution to take effect. For example, under Windows, the command is:
#   mpiexec -n 4 -lines python -u MNIST_Complex_Training.py
learner = C.train.distributed.data_parallel_distributed_learner(learner)
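
# A hedged sketch of the BlockMomentum alternative mentioned above (the values are
# illustrative, not taken from the original script):
#   learner = C.train.distributed.block_momentum_distributed_learner(
#       learner, block_size=3200, block_momentum_as_time_constant=4096)
# Either way, a distributed script should call C.train.distributed.Communicator.finalize()
# before exiting, so that all MPI worker processes shut down cleanly.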

# For distributed training, we must maximize the minibatch size, so as to minimize
# communication cost and GPU underutilization. Hence, we use a "schedule"
# that increases the minibatch size after a few epochs. By specifying the learning rate
# as UnitType.sample, the contribution per sample maintains the same scale without
# having to fix up the learning rate.
# For this MNIST model, larger minibatch sizes make it faster, because the
# model is too small to utilize a full GPU. Hence data-parallel training cannot
# be expected to lead to speed-ups.
minibatch_size_schedule = C.minibatch_size_schedule([256]*6 + [512]*9 + [1024]*7 + [2048]*8 + [4096], epoch_size=epoch_size)
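
# The comments above rely on the learning rate being specified per sample; the learner's
# construction is not shown in this snippet. An illustrative sketch only (the rates, the reuse
# of epoch_size, and the model function name `z` are assumptions, not from the original script):
#   lr_per_sample = C.learning_rate_schedule([0.001] * 12 + [0.0005] * 6 + [0.00025],
#                                            unit=C.UnitType.sample, epoch_size=epoch_size)
#   learner = C.sgd(z.parameters, lr=lr_per_sample)  # then wrapped by the distributed learner above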

# Train and test, with checkpointing and learning-rate adjustment.
progress = criterion.train((X_train, Y_train), minibatch_size=minibatch_size_schedule,
                           max_epochs=50, parameter_learners=[learner],
                           callbacks=[progress_writer, checkpoint_callback_config, cv_callback_config, test_callback_config])

# Get progress statistics.
final_loss    = progress.epoch_summaries[-1].loss
final_metric  = progress.epoch_summaries[-1].metric
final_samples = progress.epoch_summaries[-1].samples
test_metric   = progress.test_summary.metric

# Inspect predictions on one minibatch, for illustration.
# For evaluation, we map the network outputs to the range 0-1 and convert them into
# probabilities for the two classes, using a softmax function to get the probability of each class.
# Configure distributed training by wrapping the learner in a distributed_learner object.
# The call below performs plain data-parallel SGD; the BlockMomentum method is available
# through block_momentum_distributed_learner instead. The Python script must be run with
# mpiexec for the distribution to take effect. For example, under Windows, the command is:
#   mpiexec -n 4 -lines python -u MNIST_Complex_Training.py
learner = C.train.distributed.data_parallel_distributed_learner(learner)

# For distributed training, we must maximize the minibatch size, so as to minimize
# communication cost and GPU underutilization. Hence, we use a "schedule"
# that increases the minibatch size after a few epochs. By specifying the learning rate
# as per sample, the contribution per sample maintains the same scale without
# having to fix up the learning rate.
# For this MNIST model, larger minibatch sizes make it faster, because the
# model is too small to utilize a full GPU. Hence data-parallel training cannot
# be expected to lead to speed-ups.
minibatch_size_schedule = C.minibatch_size_schedule(
    [256] * 6 + [512] * 9 + [1024] * 7 + [2048] * 8 + [4096],
    epoch_size=epoch_size)

# Train and test, with checkpointing and learning-rate adjustment.
progress = criterion.train((X_train, Y_train),
                           minibatch_size=minibatch_size_schedule,
                           max_epochs=50,
                           parameter_learners=[learner],
                           callbacks=[
                               progress_writer, checkpoint_callback_config,
                               cv_callback_config, test_callback_config
                           ])

# Get progress statistics.
final_loss = progress.epoch_summaries[-1].loss
final_metric = progress.epoch_summaries[-1].metric