# Assumes module-level context: import cntk as C; import numpy as np;
# cross_entropy_with_softmax / classification_error imported from cntk.ops;
# num_channels, image_height, image_width, num_classes defined elsewhere.
def create_resnet_network(network_name, fp16):
    # Input variables denoting the features and label data
    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    dtype = np.float16 if fp16 else np.float32
    if fp16:
        graph_input = C.cast(input_var, dtype=np.float16)
        graph_label = C.cast(label_var, dtype=np.float16)
    else:
        graph_input = input_var
        graph_label = label_var

    with C.default_options(dtype=dtype):
        stride1x1 = (1, 1)
        stride3x3 = (2, 2)

        # create model, and configure learning parameters
        if network_name == 'resnet18':
            z = create_imagenet_model_basic(graph_input, [2, 1, 1, 2], num_classes)
        elif network_name == 'resnet34':
            z = create_imagenet_model_basic(graph_input, [3, 3, 5, 2], num_classes)
        elif network_name == 'resnet50':
            z = create_imagenet_model_bottleneck(graph_input, [2, 3, 5, 2], num_classes, stride1x1, stride3x3)
        elif network_name == 'resnet101':
            z = create_imagenet_model_bottleneck(graph_input, [2, 3, 22, 2], num_classes, stride1x1, stride3x3)
        elif network_name == 'resnet152':
            z = create_imagenet_model_bottleneck(graph_input, [2, 7, 35, 2], num_classes, stride1x1, stride3x3)
        else:
            raise RuntimeError("Unknown model name!")  # raise, not return, so the error propagates

        # loss and metric
        ce = cross_entropy_with_softmax(z, graph_label)
        errs = classification_error(z, graph_label, topN=1)
        top5Errs = classification_error(z, graph_label, topN=5)  # the top-5 metric needs topN=5

    if fp16:
        # cast reductions back to float32 for stable accumulation and reporting
        ce = C.cast(ce, dtype=np.float32)
        errs = C.cast(errs, dtype=np.float32)
        top5Errs = C.cast(top5Errs, dtype=np.float32)

    return {
        'name': network_name,
        'feature': input_var,
        'label': label_var,
        'ce': ce,
        'errs': errs,
        'top5Errs': top5Errs,
        'output': z
    }
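The ImageNet builder above follows the mixed-precision pattern shared by every snippet in this listing: cast the float32 input variables down to float16, build the graph under a float16 default, then cast the loss and metric back to float32 so accumulation and reporting stay numerically stable. A minimal self-contained sketch of just that pattern (the Dense layer and shapes are illustrative stand-ins, not from the original code):

import numpy as np
import cntk as C

features = C.input_variable((10,))
labels = C.input_variable((2,))

# graph-side float16 copies; the original float32 variables are kept for feeding
f16_features = C.cast(features, dtype=np.float16)
f16_labels = C.cast(labels, dtype=np.float16)

with C.default_options(dtype=np.float16):
    z = C.layers.Dense(2)(f16_features)  # stand-in for the ResNet body

# cast the reductions back to float32 before aggregation
loss = C.cast(C.cross_entropy_with_softmax(z, f16_labels), dtype=np.float32)
metric = C.cast(C.classification_error(z, f16_labels), dtype=np.float32)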
def test_cast():
    i = C.input_variable((3))
    i2 = C.input_variable((1), needs_gradient=True)
    i_data = [[1, 20, 300], [2000, 3000, 5000], [3, 4, 5]]
    i2_data = [[7], [8], [9]]
    # AA is the test utilities' alias for np.asarray
    f = C.combine(C.cast(i, dtype=np.float16), C.cast(i2, dtype=np.float16))
    feed_dict = {
        i: AA(i_data).astype(np.float32),
        i2: AA(i2_data).astype(np.float32)
    }
    data = f.eval(feed_dict)
    assert np.array_equal(data[f[0]], i_data)
    assert np.array_equal(data[f[1]], i2_data)

    s = f[0] * f[1]
    data = s.grad(feed_dict)
    # d(s)/d(i2) sums f[0] per sample: 1+20+300, 2000+3000+5000, 3+4+5
    assert np.array_equal(data, [[321], [10000], [12]])
def test_Cast(tmpdir, from_type, to_type):
    test_name = "cast_" + from_type.__name__ + "_to_" + to_type.__name__
    shape = (3, 10, 15)
    input_var = C.input_variable(shape, dtype=from_type, name='features')
    model = C.cast(input_var, dtype=to_type)
    data = np.random.rand(*shape).astype(from_type)
    verify_one_input(model, data, tmpdir, test_name)
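test_Cast expects from_type and to_type to be injected by the test runner; in a pytest suite that is typically done with parametrize. A plausible sketch of the driver (the exact type grid used by the original suite is an assumption):

import numpy as np
import pytest

# hypothetical parameter grid; the original suite's list may differ
CAST_TYPES = (np.float16, np.float32, np.float64)

@pytest.mark.parametrize("from_type", CAST_TYPES)
@pytest.mark.parametrize("to_type", CAST_TYPES)
def test_Cast(tmpdir, from_type, to_type):
    ...  # body as above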
def test_rnn(device_id):
    if device_id == -1:
        pytest.skip('Test only runs on GPU')  # optimized_rnnstack is cuDNN-backed

    batch_size = 8
    sequence_len = 100
    vocab_dim = 20
    embed_dim = 10
    hidden_dim = 7

    input = C.cast(C.sequence.input_variable(()), np.float16)
    with C.default_options(dtype=np.float16):
        embed = C.layers.Embedding(embed_dim)(C.one_hot(input, num_classes=vocab_dim, sparse_output=False))
        z = C.layers.Recurrence(C.layers.LSTM(hidden_dim))(embed)
    feed = np.floor(np.random.rand(batch_size, sequence_len).astype(np.float32) * (vocab_dim - 1))
    z.grad(feed, wrt=z.parameters)

    num_layers = 2
    W = C.parameter((C.InferredDimension, embed_dim), init=C.glorot_uniform(), dtype=np.float16)
    with C.default_options(dtype=np.float16):
        z = C.optimized_rnnstack(embed, W, hidden_dim, num_layers)
    feed = np.floor(np.random.rand(batch_size, sequence_len).astype(np.float32) * (vocab_dim - 1))
    z.grad(feed, wrt=z.parameters)
def create_resnet_network(network_name, fp16):
    # Input variables denoting the features and label data
    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    dtype = np.float16 if fp16 else np.float32
    if fp16:
        graph_input = C.cast(input_var, dtype=np.float16)
        graph_label = C.cast(label_var, dtype=np.float16)
    else:
        graph_input = input_var
        graph_label = label_var

    with C.default_options(dtype=dtype):
        # create model, and configure learning parameters
        if network_name == 'resnet20':
            z = create_cifar10_model(graph_input, 3, num_classes)
        elif network_name == 'resnet110':
            z = create_cifar10_model(graph_input, 18, num_classes)
        else:
            raise RuntimeError("Unknown model name!")  # raise, not return

        # loss and metric
        ce = cross_entropy_with_softmax(z, graph_label)
        pe = classification_error(z, graph_label)

    if fp16:
        ce = C.cast(ce, dtype=np.float32)
        pe = C.cast(pe, dtype=np.float32)

    return {
        'name': network_name,
        'feature': input_var,
        'label': label_var,
        'ce': ce,
        'pe': pe,
        'output': z
    }
def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_epochs, profiler_dir=None,
                       model_dir=None, log_dir=None, tensorboard_logdir=None, gen_heartbeat=False, fp16=False):

    set_computation_network_trace_level(0)

    # Input variables denoting the features and label data
    input_var = C.input_variable((num_channels, image_height, image_width), name='features')
    label_var = C.input_variable((num_classes))

    dtype = np.float16 if fp16 else np.float32
    if fp16:
        graph_input = C.cast(input_var, dtype=np.float16)
        graph_label = C.cast(label_var, dtype=np.float16)
    else:
        graph_input = input_var
        graph_label = label_var

    with C.default_options(dtype=dtype):
        # create model, and configure learning parameters
        if network_name == 'resnet20':
            z = create_cifar10_model(graph_input, 3, num_classes)
            lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
        elif network_name == 'resnet110':
            z = create_cifar10_model(graph_input, 18, num_classes)
            lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
        else:
            raise RuntimeError("Unknown model name!")

        # loss and metric
        ce = cross_entropy_with_softmax(z, graph_label)
        pe = classification_error(z, graph_label)

    if fp16:
        ce = C.cast(ce, dtype=np.float32)
        pe = C.cast(pe, dtype=np.float32)

    # shared training parameters
    minibatch_size = 128
    l2_reg_weight = 0.0001

    # Set learning parameters
    lr_per_sample = [lr/minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size)
    mm_schedule = momentum_schedule(0.9, minibatch_size)

    # progress writers
    progress_writers = [ProgressPrinter(tag='Training', log_to_file=log_dir, num_epochs=max_epochs, gen_heartbeat=gen_heartbeat)]
    tensorboard_writer = None
    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z)
        progress_writers.append(tensorboard_writer)

    # trainer object
    learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight)
    trainer = Trainer(z, (ce, pe), learner, progress_writers)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    log_number_of_parameters(z)
    print()

    # perform model training
    if profiler_dir:
        start_profiler(profiler_dir, True)

    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map)  # fetch minibatch
            trainer.train_minibatch(data)  # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far
        trainer.summarize_training_progress()

        # Log mean of each parameter tensor, so that we can confirm that the parameters change indeed.
        if tensorboard_writer:
            for parameter in z.parameters:
                tensorboard_writer.write_value(parameter.uid + "/mean", reduce_mean(parameter).eval(), epoch)

        if model_dir:
            z.save(os.path.join(model_dir, network_name + "_{}.dnn".format(epoch)))
        enable_profiler()  # begin to collect profiler data after first epoch

    if profiler_dir:
        stop_profiler()

    # Evaluation parameters
    test_epoch_size = 10000
    minibatch_size = 16

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0

    while sample_count < test_epoch_size:
        current_minibatch = min(minibatch_size, test_epoch_size - sample_count)
        # Fetch and evaluate the next test minibatch.
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far.
        sample_count += data[label_var].num_samples

    print("")
    trainer.summarize_test_progress()
    print("")

    return metric_numer / metric_denom
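For context, train_and_evaluate is driven by a pair of minibatch sources. A hypothetical invocation, assuming a create_image_mb_source helper along the lines of CNTK's image examples (names and values here are illustrative, not from the original script):

reader_train = create_image_mb_source(train_map_file, is_training=True,
                                      total_number_of_samples=C.io.INFINITELY_REPEAT)
reader_test = create_image_mb_source(test_map_file, is_training=False,
                                     total_number_of_samples=C.io.FULL_DATA_SWEEP)
epoch_size = 50000  # CIFAR-10 training set size
error = train_and_evaluate(reader_train, reader_test, 'resnet20',
                           epoch_size, max_epochs=160, fp16=True)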
def trainNet(args):

    # The crash doesn't seem to occur with this flag;
    # unfortunately, it reduces training speed by about 35%.
    #os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    # Instantiate generators for both training and
    # validation datasets. Grab their generator functions.
    # TODO: Command line args
    # TODO: Better system for using files testing/validation than ranges?
    tFileShp = (1, 598)
    vFileShp = (0, 1)
    gen = Generator(featurePath, labelPath, tFileShp, batchSize, loadSize=3)
    valGen = Generator(featurePath, labelPath, vFileShp, batchSize, loadSize=1)
    g = gen.generator()
    vg = valGen.generator()

    inputVar = cntk.ops.input_variable((BoardDepth, BoardLength, BoardLength), name='features')
    policyVar = cntk.ops.input_variable((BoardSize))
    valueVar = cntk.ops.input_variable((2))

    # cast() returns a new graph node that must be used when building the
    # network; the original code discarded these results, so the fp16 casts
    # were no-ops. The *Var variables stay float32 for feeding minibatches.
    netInput, policyLabel, valueLabel = inputVar, policyVar, valueVar
    if args.fp16:
        netInput = cntk.cast(inputVar, dtype=np.float16)
        policyLabel = cntk.cast(policyVar, dtype=np.float16)
        valueLabel = cntk.cast(valueVar, dtype=np.float16)

    net, epochOffset = loadModel(args, netInput, netFilters, resBlockCount)

    # Show a heatmap of network outputs
    # over an input board state
    if args.heatMap:
        hmap = NetHeatMap(net, g)
        hmap.genHeatmap(args.heatMap)

    # Loss and accuracy
    policyLoss = cntk.cross_entropy_with_softmax(net.outputs[0], policyLabel)
    valueLoss = cntk.cross_entropy_with_softmax(net.outputs[1], valueLabel)
    loss = policyLoss + valueLoss

    # TODO: Figure out how to display/report both errors
    policyError = cntk.element_not(cntk.classification_error(net.outputs[0], policyLabel))
    valueError = cntk.element_not(cntk.classification_error(net.outputs[1], valueLabel))
    #error = (valueError + policyError) / 2
    #error = valueError
    error = policyError

    if args.fp16:
        loss = cntk.cast(loss, dtype=np.float32)
        error = cntk.cast(error, dtype=np.float32)

    lrc = args.lr
    if args.cycleLr[0]:
        lrc = learningRateCycles(*args.cycleLr, gen.stepsPerEpoch, args.cycleMax)
        lrc = lrc * maxEpochs
    elif args.optLr:
        lrc = findOptLr(maxEpochs, *args.optLr, gen.stepsPerEpoch)

    lrc = cntk.learners.learning_parameter_schedule(lrc, batchSize, batchSize)
    learner = cntk.adam(net.parameters, lrc, momentum=0.9, minibatch_size=batchSize, l2_regularization_weight=0.0001)
    #learner = cntk.adadelta(net.parameters, lrc, l2_regularization_weight=0.0001)  # Test adadelta out!

    # TODO: Figure out how to write multiple 'metrics'
    tbWriter = cntk.logging.TensorBoardProgressWriter(freq=1, log_dir='./TensorBoard/', model=net)
    progressPrinter = cntk.logging.ProgressPrinter(tag='Training', num_epochs=maxEpochs)
    trainer = cntk.Trainer(net, (loss, error), learner, [progressPrinter, tbWriter])

    # TODO: Replace model load with loading/saving checkpoints,
    # so we can store the learner's state et al.
    #trainer.restore_from_checkpoint(findLatestModel('latest'))
    #checkpointFreq = gen.stepsPerEpoch // checkpointFreq

    ls = []
    losses = []
    #valueAccs = []
    #policyAccs = []

    for epoch in range(maxEpochs):
        miniBatches = 0
        while miniBatches < gen.stepsPerEpoch:
            X, Y, W = next(g)
            miniBatches += 1
            trainer.train_minibatch({net.arguments[0]: X, policyVar: Y, valueVar: W})
            ls.append(trainer.previous_minibatch_loss_average)

        trainer.summarize_training_progress()
        policyAcc, valueAcc = printAccuracy(net, 'Validation Acc %', vg, valGen.stepsPerEpoch)

        losses.append([epoch, sum(ls) / gen.stepsPerEpoch])
        ls.clear()
        #policyAccs.append([epoch, policyAcc])
        #valueAccs.append([epoch, valueAcc])

        net.save(saveDir + netName + '_{}_{}_{}_{:.3f}.dnn'.format(
            epoch + 1 + epochOffset, policyAcc, valueAcc, losses[epoch][1]))
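learningRateCycles and findOptLr are project-specific helpers that return a per-minibatch learning-rate list for learning_parameter_schedule. A minimal triangular-cycle sketch of what such a helper might compute; the name, signature, and shape of the schedule are entirely hypothetical:

def learningRateCycles(minLr, maxLr, stepsPerEpoch, cycleLen):
    """Hypothetical triangular cyclical schedule: one rate per minibatch for one epoch."""
    half = max(1, cycleLen // 2)
    rates = []
    for step in range(stepsPerEpoch):
        pos = step % cycleLen
        # rise for the first half of the cycle, fall for the second
        frac = pos / half if pos < half else (cycleLen - pos) / half
        rates.append(minLr + (maxLr - minLr) * frac)
    return rates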