def test_multi_optimizer(backend_default):
    """Check that MultiOptimizer routes layers to optimizers by class/name.

    Builds one layer of each interesting kind, maps optimizers onto the
    flattened layer list, and asserts each optimizer picked up exactly the
    layer classes named in the mapping (with 'default' catching the rest).
    """
    opt_gdm = GradientDescentMomentum(learning_rate=0.001, momentum_coef=0.9,
                                      wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)

    init_one = Gaussian(scale=0.01)
    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one, bias=Constant(1),
                activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one,
              activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one,
             activation=Logistic(), gate_activation=Tanh())

    # Compound containers (Conv/Affine) may expand to lists of sub-layers;
    # flatten everything into a single list.
    layer_list = []
    for entry in [l1, l2, l3, l4]:
        layer_list.extend(entry if isinstance(entry, list) else [entry])

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})
    map_list = opt._map_optimizers(layer_list)

    assert map_list[opt_adam][0].__class__.__name__ == 'Convolution'
    assert map_list[opt_ada][0].__class__.__name__ == 'Bias'
    assert map_list[opt_rms][0].__class__.__name__ == 'Linear'
    assert map_list[opt_gdm][0].__class__.__name__ == 'Activation'
    assert map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
def test_multi_optimizer(backend):
    """Check that MultiOptimizer routes layers to optimizers by class/name.

    Fix: the mapping previously used the keys "LstmLayer"/"GruLayer", which
    never match the actual layer class names ("LSTM"/"GRU") asserted below —
    those layers silently fell through to the 'default' optimizer and
    ``map_list[opt_rms_1]`` was empty. The keys must be the layer class
    names, consistent with the other variants of this test.
    """
    opt_gdm = GradientDescentMomentum(learning_rate=0.001, momentum_coef=0.9,
                                      wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(clip_gradients=True)

    init_one = Gaussian(scale=0.01)
    l1 = Conv((11, 11, 64), strides=4, pad=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one, bias=Constant(1),
                activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one,
              activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one,
             activation=Logistic(), gate_activation=Tanh())

    # Flatten compound layers (Conv/Affine may expand to sub-layer lists).
    layers = [l1, l2, l3, l4]
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)

    opt = MultiOptimizer(
        {
            "default": opt_gdm,
            "Bias": opt_ada,
            "Convolution": opt_adam,
            "Linear": opt_rms,
            "LSTM": opt_rms_1,  # was "LstmLayer": never matched any layer
            "GRU": opt_rms_1,   # was "GruLayer": never matched any layer
        }
    )
    map_list = opt.map_optimizers(layer_list)

    assert map_list[opt_adam][0].__class__.__name__ == "Convolution"
    assert map_list[opt_ada][0].__class__.__name__ == "Bias"
    assert map_list[opt_rms][0].__class__.__name__ == "Linear"
    assert map_list[opt_gdm][0].__class__.__name__ == "Activation"
    assert map_list[opt_rms_1][0].__class__.__name__ == "LSTM"
    assert map_list[opt_rms_1][1].__class__.__name__ == "GRU"
def test_multi_optimizer(backend_default):
    """Verify that MultiOptimizer.map_optimizers assigns each optimizer to
    the layer classes named in its mapping, with 'default' as the fallback.
    """
    opt_gdm = GradientDescentMomentum(
        learning_rate=0.001, momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)

    init_one = Gaussian(scale=0.01)
    conv_layer = Conv((11, 11, 64), strides=4, padding=3,
                      init=init_one, bias=Constant(0), activation=Rectlin())
    affine_layer = Affine(nout=4096, init=init_one, bias=Constant(1),
                          activation=Rectlin())
    lstm_layer = LSTM(output_size=1000, init=init_one,
                      activation=Logistic(), gate_activation=Tanh())
    gru_layer = GRU(output_size=100, init=init_one,
                    activation=Logistic(), gate_activation=Tanh())

    # Flatten any compound layers into a single flat list.
    layer_list = []
    for lyr in (conv_layer, affine_layer, lstm_layer, gru_layer):
        if isinstance(lyr, list):
            layer_list += lyr
        else:
            layer_list.append(lyr)

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})
    map_list = opt.map_optimizers(layer_list)

    assert map_list[opt_adam][0].__class__.__name__ == 'Convolution'
    assert map_list[opt_ada][0].__class__.__name__ == 'Bias'
    assert map_list[opt_rms][0].__class__.__name__ == 'Linear'
    assert map_list[opt_gdm][0].__class__.__name__ == 'Activation'
    assert map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
def test_multi_optimizer(backend_default_mkl):
    """A test for MultiOptimizer.

    Builds one layer of each kind, allocates them, and runs optimize() on
    two sub-lists (feed-forward layers, then recurrent layers), checking
    that opt.map_list assigns each optimizer the expected layer classes.

    Fix: the first assertion compared ``__class__.__name__ is
    'Convolution_bias'`` — ``is`` tests object identity, and identity of a
    runtime string against a literal is implementation-dependent (CPython
    emits a SyntaxWarning for it). Equality (``==``) is the correct check,
    matching the other assertions in this test.
    """
    opt_gdm = GradientDescentMomentum(
        learning_rate=0.001, momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)

    init_one = Gaussian(scale=0.01)
    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one, bias=Constant(1),
                activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one,
              activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one,
             activation=Logistic(), gate_activation=Tanh())

    # Flatten compound layers into a single list.
    layers = [l1, l2, l3, l4]
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)

    # Layers must be configured/allocated before optimize() can touch params.
    for l in layer_list:
        l.configure(in_obj=(16, 28, 28))
        l.allocate()

    # separate layer_list into two, the last two recurrent layers and the rest
    layer_list1, layer_list2 = layer_list[:-2], layer_list[-2:]

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Convolution_bias': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    layers_to_optimize1 = [l for l in layer_list1 if isinstance(l, ParameterLayer)]
    layers_to_optimize2 = [l for l in layer_list2 if isinstance(l, ParameterLayer)]

    opt.optimize(layers_to_optimize1, 0)
    # was: ... is 'Convolution_bias'  (identity check on a string literal)
    assert opt.map_list[opt_adam][0].__class__.__name__ == 'Convolution_bias'
    assert opt.map_list[opt_rms][0].__class__.__name__ == 'Linear'

    opt.optimize(layers_to_optimize2, 0)
    assert opt.map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert opt.map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
# weight initialization
init_norm = Gaussian(loc=0.0, scale=0.01)

# initialize model: two Affine layers; the second is named so it can be
# targeted individually in the optimizer mapping below.
layers = [
    Affine(nout=100, init=init_norm, bias=Constant(0), activation=Rectlin()),
    Affine(nout=10, init=init_norm, bias=Constant(0),
           activation=Logistic(shortcut=True), name='special_linear'),
]

cost = GeneralizedCost(costfunc=CrossEntropyBinary())
mlp = Model(layers=layers)

# fit and validate
optimizer_one = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)
optimizer_two = RMSProp()

# all bias layers and the last linear layer will use optimizer_two;
# every other layer falls back to optimizer_one.
opt = MultiOptimizer({'default': optimizer_one,
                      'Bias': optimizer_two,
                      'special_linear': optimizer_two})

# configure callbacks
callbacks = Callbacks(mlp, eval_set=valid_set, **args.callback_args)

mlp.fit(train_set, optimizer=opt, num_epochs=args.epochs,
        cost=cost, callbacks=callbacks)
parser = NeonArgparser(__doc__)
args = parser.parse_args()

NervanaObject.be.enable_winograd = 4

# setup data provider: one macrobatch of random images/labels is enough
# for benchmarking.
X_train = np.random.uniform(-1, 1, (128, 3 * 224 * 224))
y_train = np.random.uniform(-1, 1, (128, 1000))
train = ArrayIterator(X_train, y_train, nclass=1000, lshape=(3, 224, 224))

# AlexNet-style topology
layers = []
layers.append(Conv((11, 11, 64), init=Gaussian(scale=0.01),
                   activation=Rectlin(), padding=3, strides=4))
layers.append(Pooling(3, strides=2))
layers.append(Conv((5, 5, 192), init=Gaussian(scale=0.01),
                   activation=Rectlin(), padding=2))
layers.append(Pooling(3, strides=2))
layers.append(Conv((3, 3, 384), init=Gaussian(scale=0.03),
                   activation=Rectlin(), padding=1))
layers.append(Conv((3, 3, 256), init=Gaussian(scale=0.03),
                   activation=Rectlin(), padding=1))
layers.append(Conv((3, 3, 256), init=Gaussian(scale=0.03),
                   activation=Rectlin(), padding=1))
layers.append(Pooling(3, strides=2))
layers.append(Affine(nout=4096, init=Gaussian(scale=0.01), activation=Rectlin()))
layers.append(Affine(nout=4096, init=Gaussian(scale=0.01), activation=Rectlin()))
layers.append(Affine(nout=1000, init=Gaussian(scale=0.01), activation=Softmax()))

model = Model(layers=layers)

# step the learning rate down by (1/250)^(1/3) at the listed epochs
weight_sched = Schedule([22, 44, 65], (1 / 250.) ** (1 / 3.))
opt_gdm = GradientDescentMomentum(0.01, 0.0, wdecay=0.0005,
                                  schedule=weight_sched)
opt = MultiOptimizer({'default': opt_gdm})
cost = GeneralizedCost(costfunc=CrossEntropyMulti())

model.benchmark(train, cost=cost, optimizer=opt, niterations=10, nskip=5)
Affine(nout=1000, init=init_g1, bias=Constant(0.0), activation=Softmax(), name='fc8')) model = Model(layers=layers) # scale LR by 0.1 every 20 epochs (this assumes batch_size = 256) weight_sched = Schedule(20, 0.1) opt_gdm = GradientDescentMomentum(0.01, 0.9, wdecay=0.0005, schedule=weight_sched) opt_biases = GradientDescentMomentum(0.02, 0.9, schedule=weight_sched) opt = MultiOptimizer({'default': opt_gdm, 'Bias': opt_biases}) # configure callbacks valmetric = TopKMisclassification(k=5) callbacks = Callbacks(model, eval_set=test, metric=valmetric, **args.callback_args) if args.model_file is not None: model.load_params(args.model_file) if not args.test_only: cost = GeneralizedCost(costfunc=CrossEntropyMulti()) model.fit(train, optimizer=opt, num_epochs=args.epochs,
rois_random_sample=True, add_flipped=False, subset_pct=args.subset_pct) test_set = PASCALVOCTrain('test', '2007', path=args.data_dir, n_mb=n_mb, img_per_batch=img_per_batch, rois_per_img=rois_per_img, rois_random_sample=True, add_flipped=False) # setup model model = create_frcn_model(frcn_fine_tune) # setup optimizer opt_w = GradientDescentMomentum( 0.001 * learning_rate_scale, 0.9, wdecay=0.0005) opt_b = GradientDescentMomentum(0.002 * learning_rate_scale, 0.9) optimizer = MultiOptimizer({'default': opt_w, 'Bias': opt_b}) # if training a new model, seed the image model conv layers with pre-trained weights # otherwise, just load the model file if args.model_file is None: load_vgg_weights(model, args.data_dir) cost = Multicost(costs=[GeneralizedCostMask(costfunc=CrossEntropyMulti()), GeneralizedCostMask(costfunc=SmoothL1Loss())], weights=[1, 1]) callbacks = Callbacks(model, eval_set=test_set, **args.callback_args) model.fit(train_set, optimizer=optimizer, num_epochs=num_epochs, cost=cost, callbacks=callbacks)
# setup cost function as Square Hinge Loss cost = GeneralizedCost(costfunc=SquareHingeLoss()) # setup optimizer LR_start = 1.65e-2 def ShiftAdaMax_with_Scale(LR=1): return ShiftAdaMax(learning_rate=LR_start * LR, schedule=ShiftSchedule(2, shift_size=1)) optimizer = MultiOptimizer({ 'default': ShiftAdaMax_with_Scale(), 'BinaryLinear_0': ShiftAdaMax_with_Scale(57.038), 'BinaryLinear_1': ShiftAdaMax_with_Scale(73.9008), 'BinaryLinear_2': ShiftAdaMax_with_Scale(73.9008), 'BinaryLinear_3': ShiftAdaMax_with_Scale(52.3195) }) # initialize model object bnn = Model(layers=layers) # configure callbacks callbacks = Callbacks(bnn, eval_set=valid_set, **args.callback_args) # run fit bnn.fit(train_set, optimizer=optimizer, num_epochs=args.epochs, cost=cost,
def main():
    """Train an AlexNet-style network with per-layer-type optimizers.

    Parses neon CLI args, builds train/val/test image batch providers,
    constructs the network, fits it with a MultiOptimizer (separate SGD
    settings for biases), then reports top-1/top-5 metrics on the test set.

    NOTE: Python 2 code (print statements).
    """
    # parse the command line arguments
    parser = NeonArgparser(__doc__)
    args = parser.parse_args()

    logger = logging.getLogger()
    logger.setLevel(args.log_thresh)

    # Set up batch iterators; val reuses the training repo (with transforms
    # disabled), test reads a separate repo.
    print "Setting up data batch loaders..."
    train = ImgMaster(repo_dir='dataTmp', set_name='train',
                      inner_size=120, subset_pct=100)
    val = ImgMaster(repo_dir='dataTmp', set_name='train', inner_size=120,
                    subset_pct=100, do_transforms=False)
    test = ImgMaster(repo_dir='dataTestTmp', set_name='train',
                     inner_size=120, subset_pct=100, do_transforms=False)

    # Each provider must be started before use and shut down at the end.
    train.init_batch_provider()
    val.init_batch_provider()
    test.init_batch_provider()

    print "Constructing network..."
    # NOTE(review): 'constuct_network' looks like a typo for
    # 'construct_network' — the function is defined elsewhere; confirm name.
    model = constuct_network()

    # model.load_weights(args.model_file)

    # drop weights LR by 1/250**(1/3) at the scheduled epochs; biases get
    # their own (higher) rate with a single late drop.
    weight_sched = Schedule([22, 44, 65, 90, 97], (1 / 250.)**(1 / 3.))
    opt_gdm = GradientDescentMomentum(0.01, 0.9, wdecay=0.005,
                                      schedule=weight_sched)
    opt_biases = GradientDescentMomentum(0.04, 1.0,
                                         schedule=Schedule([130], .1))
    opt = MultiOptimizer({'default': opt_gdm, 'Bias': opt_biases})

    # configure callbacks: evaluate top-5 misclassification on val
    valmetric = TopKMisclassification(k=5)
    callbacks = Callbacks(model, train, eval_set=val, metric=valmetric,
                          **args.callback_args)

    cost = GeneralizedCost(costfunc=CrossEntropyMulti())

    # flag = input("Press Enter if you want to begin training process.")
    print "Training network..."
    model.fit(train, optimizer=opt, num_epochs=args.epochs, cost=cost,
              callbacks=callbacks)

    # final evaluation on the held-out test repo
    mets = model.eval(test, metric=valmetric)
    print 'Validation set metrics:'
    print 'LogLoss: %.2f, Accuracy: %.1f %%0 (Top-1), %.1f %% (Top-5)' % (
        mets[0], (1.0 - mets[1]) * 100, (1.0 - mets[2]) * 100)

    # shut down batch providers
    test.exit_batch_provider()
    val.exit_batch_provider()
    train.exit_batch_provider()
GeneralizedCostMask(costfunc=SmoothL1Loss(sigma=3.0), weights=weights), frcn_tree_cost, ], weights=[1, 1, 1]) # setup optimizer schedule_w = StepSchedule(step_config=[10], change=[0.001 / 10]) schedule_b = StepSchedule(step_config=[10], change=[0.002 / 10]) opt_w = GradientDescentMomentum(0.001, 0.9, wdecay=0.0005, schedule=schedule_w) opt_b = GradientDescentMomentum(0.002, 0.9, wdecay=0.0005, schedule=schedule_b) opt_skip = GradientDescentMomentum(0.0, 0.0) optimizer = MultiOptimizer({ 'default': opt_w, 'Bias': opt_b, 'skip': opt_skip, 'skip_bias': opt_skip }) # if training a new model, seed the image model conv layers with pre-trained weights # otherwise, just load the model file if args.model_file is None: util.load_vgg_all_weights(model, cache_dir) callbacks = Callbacks(model, eval_set=train_set, **args.callback_args) model.fit(train_set, optimizer=optimizer, cost=cost, num_epochs=args.epochs, callbacks=callbacks)
optHead = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

# When fine-tuning, the pretrained (ResNet) sections get a slower Adam;
# otherwise every layer shares the head optimizer.
if PRETRAINED:
    optPretrained = Adam(learning_rate=0.0003, beta_1=0.9, beta_2=0.999)
else:
    optPretrained = optHead

mapping = {}
mapping['default'] = optPretrained   # default optimizer applied to the pretrained sections
mapping['Input Layer'] = optHead     # The layer named 'Input Layer'
mapping['Custom Head 1'] = optHead
mapping['Custom Head 2'] = optHead
mapping['Affine'] = optHead          # all layers from the Affine class

# use multiple optimizers
opt = MultiOptimizer(mapping)

# configure callbacks
if args.callback_args['eval_freq'] is None:
    args.callback_args['eval_freq'] = 1

callbacks = Callbacks(lunaModel, eval_set=valid_set, **args.callback_args)

# add a callback that saves the best model state
callbacks.add_save_best_state_callback(modelFileName)

lunaModel.fit(train_set, optimizer=opt, num_epochs=num_epochs,
              cost=cost, callbacks=callbacks)
# final binary classification layer, named so the optimizer mapping below
# can target it (and its bias) individually
vgg_layers.append(Affine(nout=1, init=GlorotUniform(), bias=Constant(0),
                         activation=Logistic(), name="class_layer"))

# Fine-tuning rates: the freshly-initialized class_layer trains 10x faster
# than the pretrained VGG body (momentum 0.9, weight decay 0.0005).
opt_vgg = GradientDescentMomentum(0.001, 0.9, wdecay=0.0005)
opt_class_layer = GradientDescentMomentum(0.01, 0.9, wdecay=0.0005)

# Bias parameters get their own (higher) learning rates and no weight decay.
opt_bias = GradientDescentMomentum(0.002, 0.9)
opt_bias_class = GradientDescentMomentum(0.02, 0.9)

# set up the mapping of layers to optimizers
opt = MultiOptimizer({'default': opt_vgg,
                      'Bias': opt_bias,
                      'class_layer': opt_class_layer,
                      'class_layer_bias': opt_bias_class})

# use cross-entropy cost to train the network
cost = GeneralizedCost(costfunc=CrossEntropyMulti())

lunaModel = Model(layers=vgg_layers)

# optionally resume from a saved parameter file
if args.model_file:
    import os
    assert os.path.exists(args.model_file), '%s not found' % args.model_file
    lunaModel.load_params(args.model_file)

# configure callbacks
#callbacks = Callbacks(lunaModel, eval_set=valid_set, **args.callback_args)
callbacks = Callbacks(lunaModel, eval_set=valid_set,
                      metric=Misclassification(), **args.callback_args)
def main():
    """Train (or benchmark) a SegNet-style model on pixel-wise image labels.

    Parses CLI args, validates image dimensions, builds the upsampling-capable
    GPU backend and pixel-wise data loaders, then either benchmarks the model
    (--bench) or trains it and pickles the validation-set outputs.
    """
    # larger batch sizes may not fit on GPU
    parser = NeonArgparser(__doc__, default_overrides={'batch_size': 4})
    parser.add_argument("--bench", action="store_true",
                        help="run benchmark instead of training")
    parser.add_argument("--num_classes", type=int, default=12,
                        help="number of classes in the annotation")
    parser.add_argument("--height", type=int, default=256, help="image height")
    parser.add_argument("--width", type=int, default=512, help="image width")
    args = parser.parse_args(gen_be=False)

    # check that image dimensions are powers of 2 (required by the
    # down/upsampling stages)
    # NOTE(review): TypeError is odd for a bad value — ValueError would be
    # conventional; confirm before changing, callers may catch TypeError.
    if((args.height & (args.height - 1)) != 0):
        raise TypeError("Height must be a power of 2.")
    if((args.width & (args.width - 1)) != 0):
        raise TypeError("Width must be a power of 2.")

    (c, h, w) = (args.num_classes, args.height, args.width)

    # need to use the backend with the new upsampling layer implementation
    be = NervanaGPU_Upsample(rng_seed=args.rng_seed,
                             device_id=args.device_id)
    # set batch size
    be.bsz = args.batch_size

    # couple backend to global neon object
    NervanaObject.be = be

    shape = dict(channel_count=3, height=h, width=w, subtract_mean=False)
    train_params = ImageParams(center=True, flip=False,
                               scale_min=min(h, w), scale_max=min(h, w),
                               aspect_ratio=0, **shape)
    test_params = ImageParams(center=True, flip=False,
                              scale_min=min(h, w), scale_max=min(h, w),
                              aspect_ratio=0, **shape)
    # targets are raw uint8 class-index images, one byte per pixel
    common = dict(target_size=h*w, target_conversion='read_contents',
                  onehot=False, target_dtype=np.uint8,
                  nclasses=args.num_classes)

    train_set = PixelWiseImageLoader(set_name='train', repo_dir=args.data_dir,
                                     media_params=train_params,
                                     shuffle=False, subset_percent=100,
                                     index_file=os.path.join(
                                         args.data_dir, 'train_images.csv'),
                                     **common)
    val_set = PixelWiseImageLoader(set_name='val', repo_dir=args.data_dir,
                                   media_params=test_params,
                                   index_file=os.path.join(
                                       args.data_dir, 'val_images.csv'),
                                   **common)

    # initialize model object
    layers = gen_model(c, h, w)
    segnet_model = Model(layers=layers)

    # configure callbacks
    callbacks = Callbacks(segnet_model, eval_set=val_set,
                          **args.callback_args)

    # separate SGD settings for weights, biases, and batch-norm params
    opt_gdm = GradientDescentMomentum(1.0e-6, 0.9, wdecay=0.0005,
                                      schedule=Schedule())
    opt_biases = GradientDescentMomentum(2.0e-6, 0.9, schedule=Schedule())
    opt_bn = GradientDescentMomentum(1.0e-6, 0.9, schedule=Schedule())
    opt = MultiOptimizer({'default': opt_gdm, 'Bias': opt_biases,
                          'BatchNorm': opt_bn})

    cost = GeneralizedCost(costfunc=CrossEntropyMulti())

    if args.bench:
        segnet_model.initialize(train_set, cost=cost)
        segnet_model.benchmark(train_set, cost=cost, optimizer=opt)
        sys.exit(0)
    else:
        segnet_model.fit(train_set, optimizer=opt, num_epochs=args.epochs,
                         cost=cost, callbacks=callbacks)

    # get the trained segnet model outputs for the validation set
    outs_val = segnet_model.get_outputs(val_set)

    # NOTE(review): text mode 'w' only works for pickle on Python 2;
    # Python 3 would need 'wb' — confirm target interpreter.
    with open('outputs.pkl', 'w') as fid:
        pickle.dump(outs_val, fid, -1)
train.nmacrobatches), change=args.rate_change) opt_gdm = GradientDescentMomentum(args.rate_init[0], args.momentum[0], wdecay=args.weight_decay, schedule=weight_sched, stochastic_round=args.rounding) opt_biases = GradientDescentMomentum(args.rate_init[1], args.momentum[1], schedule=weight_sched, stochastic_round=args.rounding) opt_fixed = GradientDescentMomentum(0.0, 1.0, wdecay=0.0) opt = MultiOptimizer({ 'default': opt_gdm, 'Bias': opt_biases, 'DOG': opt_fixed }) # configure cost and test metrics cost = GeneralizedCost(costfunc=(CrossEntropyBinary() \ if train.parser.independent_labels else CrossEntropyMulti())) metric = EMMetric( oshape=test.parser.oshape, use_softmax=not train.parser.independent_labels) if test else None # configure callbacks if not args.neon_progress: args.callback_args['progress_bar'] = False callbacks = Callbacks(model, eval_set=test,