def test_cifar_resnet_error(device_id):
    """Regression test for the CIFAR-10 ResNet example (GPU only).

    Skips unless ``device_id`` maps to a GPU device. Otherwise it locates the
    CIFAR-10 data, trains for 5 epochs through ``train_and_evaluate`` and
    asserts that the returned test error is within ``TOLERANCE_ABSOLUTE`` of
    the expected value of 0.282.
    """
    target_device = cntk_device(device_id)
    if target_device.type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(target_device)

    try:
        external_root = os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY']
    except KeyError:
        # No external test data configured: use the data set directory that
        # sits at a fixed location relative to this file.
        this_dir = os.path.dirname(os.path.abspath(__file__))
        base_path = os.path.join(
            this_dir,
            *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))
    else:
        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
        #      and CIFAR-10_mean.xml in the base_path.
        base_path = os.path.join(
            external_root,
            *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))

    base_path = os.path.normpath(base_path)
    # Run from the parent of the data directory.
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import (
        set_computation_network_trace_level,
        set_fixed_random_seed,
        force_deterministic_algorithms,
    )
    set_computation_network_trace_level(1)
    # BUGBUG: has no effect at present
    # TODO: remove debugging facilities once this all works
    set_fixed_random_seed(1)
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    mean_file = os.path.join(base_path, 'CIFAR-10_mean.xml')
    reader_train = create_reader(
        os.path.join(base_path, 'train_map.txt'), mean_file, True)
    reader_test = create_reader(
        os.path.join(base_path, 'test_map.txt'), mean_file, False)

    expected_test_error = 0.282
    test_error = train_and_evaluate(reader_train, reader_test, max_epochs=5)

    assert np.allclose(test_error, expected_test_error,
                       atol=TOLERANCE_ABSOLUTE)
def test_cifar_resnet_error(device_id):
    """Regression test for the CIFAR-10 ResNet example (GPU only).

    Skips unless ``device_id`` maps to a GPU device. Otherwise it locates the
    CIFAR-10 data, trains for 5 epochs through ``train_and_evaluate`` and
    asserts that the returned test error is within ``TOLERANCE_ABSOLUTE`` of
    the expected value of 0.384.
    """
    target_device = cntk_device(device_id)
    if target_device.type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(target_device)

    try:
        external_root = os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY']
    except KeyError:
        # No external test data configured: use the data set directory that
        # sits at a fixed location relative to this file.
        this_dir = os.path.dirname(os.path.abspath(__file__))
        base_path = os.path.join(
            this_dir,
            *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))
    else:
        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
        #      and CIFAR-10_mean.xml in the base_path.
        base_path = os.path.join(
            external_root,
            *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))

    base_path = os.path.normpath(base_path)
    # Run from the parent of the data directory.
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import (
        set_computation_network_trace_level,
        set_fixed_random_seed,
        force_deterministic_algorithms,
    )
    set_computation_network_trace_level(1)
    # BUGBUG: has no effect at present
    # TODO: remove debugging facilities once this all works
    set_fixed_random_seed(1)
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    mean_file = os.path.join(base_path, 'CIFAR-10_mean.xml')
    reader_train = create_reader(
        os.path.join(base_path, 'train_map.txt'), mean_file, True)
    reader_test = create_reader(
        os.path.join(base_path, 'test_map.txt'), mean_file, False)

    expected_test_error = 0.384
    test_error = train_and_evaluate(reader_train, reader_test, max_epochs=5)

    assert np.allclose(test_error, expected_test_error,
                       atol=TOLERANCE_ABSOLUTE)
def cifar_resnet_distributed(data_path, run_test, num_epochs, communicator=None, save_model_filename=None, load_model_filename=None, debug_output=False):
    """Train a ResNet classifier on CIFAR-10, optionally with a distributed trainer.

    Args:
        data_path: Directory containing ``train_map.txt``, ``test_map.txt``
            and ``CIFAR-10_mean.xml``.
        run_test: If true, evaluate the trained model on the test reader and
            return the average error; otherwise return 0.
        num_epochs: Number of training epochs; each epoch is 100 minibatches
            of 128 samples.
        communicator: Optional distributed communicator. When given, it is
            passed to the training reader and used to build a data-parallel
            distributed trainer.
        save_model_filename: If set, the model is saved to this file after
            training.
        load_model_filename: If set, the model is loaded from this file
            instead of being created with ``create_resnet_model``.
        debug_output: If true, training progress is printed four times as
            often.

    Returns:
        The mean of ``trainer.test_minibatch`` over 100 test minibatches when
        ``run_test`` is true, else 0.
    """
    image_height = 32
    image_width = 32
    num_channels = 3
    num_classes = 10

    feats_stream_name = 'features'
    labels_stream_name = 'labels'

    mean_file = os.path.join(data_path, 'CIFAR-10_mean.xml')

    # The third positional argument presumably selects training mode --
    # confirm against create_reader.
    minibatch_source = create_reader(os.path.join(data_path, 'train_map.txt'), mean_file, True,
                                     distributed_communicator=communicator)
    features_si = minibatch_source[feats_stream_name]
    labels_si = minibatch_source[labels_stream_name]

    # Instantiate the resnet classification model, or load from file
    if load_model_filename:
        print("Loading model:", load_model_filename)
        classifier_output = persist.load_model(load_model_filename)
        image_input = classifier_output.arguments[0]
    else:
        image_input = input_variable(
            (num_channels, image_height, image_width), features_si.m_element_type)
        classifier_output = create_resnet_model(image_input, num_classes)

    # Input variables denoting the features and label data
    label_var = input_variable((num_classes), features_si.m_element_type)

    ce = cross_entropy_with_softmax(classifier_output, label_var)
    pe = classification_error(classifier_output, label_var)

    mb_size = 128
    num_mb_per_epoch = 100
    num_mbs = num_mb_per_epoch * num_epochs

    # Float literals throughout: with integer division (Python 2 without
    # `from __future__ import division`) 1/mb_size would truncate to 0 and
    # silently disable learning for the first 80 schedule entries.
    lr_per_sample = [1.0 / mb_size] * 80 + [0.1 / mb_size] * 40 + [0.01 / mb_size]
    lr_schedule = learning_rate_schedule(lr_per_sample, units=mb_size * num_mb_per_epoch)
    momentum_time_constant = -mb_size / np.log(0.9)

    # create data parallel distributed trainer if needed
    dist_trainer = distributed.data_parallel_distributed_trainer(communicator, False) if communicator else None

    # Instantiate the trainer object to drive the model training
    trainer = Trainer(classifier_output, ce, pe,
                      [momentum_sgd(classifier_output.parameters, lr_schedule, momentum_time_constant, l2_regularization_weight=0.0001)],
                      distributed_trainer=dist_trainer)

    # Get minibatches of images to train with and perform model training
    training_progress_output_freq = 100 if communicator else 20

    if debug_output:
        # Floor division keeps the frequency an integer on every Python version.
        training_progress_output_freq = training_progress_output_freq // 4

    for i in range(num_mbs):
        # NOTE: depends on network, the mb_size can be changed dynamically here
        mb = minibatch_source.next_minibatch(mb_size)

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {
            image_input: mb[features_si],
            label_var: mb[labels_si]
        }
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)

    if save_model_filename:
        print("Saving model:", save_model_filename)
        persist.save_model(classifier_output, save_model_filename)

    if not run_test:
        return 0

    test_minibatch_source = create_reader(os.path.join(data_path, 'test_map.txt'), mean_file, False)
    test_features_si = test_minibatch_source[feats_stream_name]
    test_labels_si = test_minibatch_source[labels_stream_name]

    test_mb_size = 128
    num_test_mbs = 100

    total_error = 0.0
    for _ in range(num_test_mbs):
        mb = test_minibatch_source.next_minibatch(test_mb_size)

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {
            image_input: mb[test_features_si],
            label_var: mb[test_labels_si]
        }
        total_error += trainer.test_minibatch(arguments)

    return total_error / num_test_mbs
def cifar_resnet_distributed(data_path, run_test, num_epochs, communicator=None, save_model_filename=None, load_model_filename=None, debug_output=False):
    """Train a ResNet classifier on CIFAR-10, optionally with a distributed trainer.

    Args:
        data_path: Directory containing ``train_map.txt``, ``test_map.txt``
            and ``CIFAR-10_mean.xml``.
        run_test: If true, evaluate the trained model on the test reader and
            return the average error; otherwise return 0.
        num_epochs: Number of training epochs; each epoch is 100 minibatches
            of 128 samples.
        communicator: Optional distributed communicator. When given, it is
            passed to the training reader and used to build a data-parallel
            distributed trainer.
        save_model_filename: If set, the model is saved to this file after
            training.
        load_model_filename: If set, the model is loaded from this file
            instead of being created with ``create_resnet_model``.
        debug_output: If true, training progress is printed four times as
            often.

    Returns:
        The mean of ``trainer.test_minibatch`` over 100 test minibatches when
        ``run_test`` is true, else 0.
    """
    image_height = 32
    image_width = 32
    num_channels = 3
    num_classes = 10

    feats_stream_name = 'features'
    labels_stream_name = 'labels'

    mean_file = os.path.join(data_path, 'CIFAR-10_mean.xml')

    # The third positional argument presumably selects training mode --
    # confirm against create_reader.
    minibatch_source = create_reader(os.path.join(data_path, 'train_map.txt'), mean_file, True,
                                     distributed_communicator=communicator)
    features_si = minibatch_source[feats_stream_name]
    labels_si = minibatch_source[labels_stream_name]

    # Instantiate the resnet classification model, or load from file
    if load_model_filename:
        print("Loading model:", load_model_filename)
        classifier_output = persist.load_model(load_model_filename)
        image_input = classifier_output.arguments[0]
    else:
        image_input = input_variable(
            (num_channels, image_height, image_width), features_si.m_element_type)
        classifier_output = create_resnet_model(image_input, num_classes)

    # Input variables denoting the features and label data
    label_var = input_variable((num_classes), features_si.m_element_type)

    ce = cross_entropy_with_softmax(classifier_output, label_var)
    pe = classification_error(classifier_output, label_var)

    mb_size = 128
    num_mb_per_epoch = 100
    num_mbs = num_mb_per_epoch * num_epochs

    # Per-minibatch learning rates; each schedule entry lasts one epoch's
    # worth of samples (mb_size * num_mb_per_epoch).
    lr_per_minibatch = learning_rate_schedule([1]*80 + [0.1]*40 + [0.01], mb_size * num_mb_per_epoch, UnitType.minibatch)
    momentum_time_constant = momentum_as_time_constant_schedule(-mb_size / np.log(0.9))

    # create data parallel distributed trainer if needed
    dist_trainer = distributed.data_parallel_distributed_trainer(communicator, False) if communicator else None

    # Instantiate the trainer object to drive the model training
    trainer = Trainer(classifier_output, ce, pe,
                      [momentum_sgd(classifier_output.parameters, lr=lr_per_minibatch, momentum=momentum_time_constant, l2_regularization_weight=0.0001)],
                      distributed_trainer=dist_trainer)

    # Get minibatches of images to train with and perform model training
    training_progress_output_freq = 100 if communicator else 20

    if debug_output:
        # Floor division keeps the frequency an integer on every Python version.
        training_progress_output_freq = training_progress_output_freq // 4

    for i in range(num_mbs):
        # NOTE: depends on network, the mb_size can be changed dynamically here
        mb = minibatch_source.next_minibatch(mb_size)

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {
            image_input: mb[features_si],
            label_var: mb[labels_si]
        }
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)

    if save_model_filename:
        print("Saving model:", save_model_filename)
        persist.save_model(classifier_output, save_model_filename)

    if not run_test:
        return 0

    test_minibatch_source = create_reader(os.path.join(data_path, 'test_map.txt'), mean_file, False)
    test_features_si = test_minibatch_source[feats_stream_name]
    test_labels_si = test_minibatch_source[labels_stream_name]

    test_mb_size = 128
    num_test_mbs = 100

    total_error = 0.0
    for _ in range(num_test_mbs):
        mb = test_minibatch_source.next_minibatch(test_mb_size)

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {
            image_input: mb[test_features_si],
            label_var: mb[test_labels_si]
        }
        total_error += trainer.test_minibatch(arguments)

    return total_error / num_test_mbs