num_channels=args.C, wd=args.wd, wd1=args.wd1,
                  pixel_k=args.pixel_k, lp_k=args.lp_k, bp_ks=args.bp_ks)

# ######################################################################
# Build the optimizer - use separate parameter groups for the gain
# and convolutional layers: the default (conv) parameters train with the
# base lr/momentum, while the wavelet parameters get their own optimizer
# so they can use args.lr1 / args.mom1 instead.
default_params = model.parameters()
wave_params = model.wave_parameters()
# NOTE(review): wd=0 is passed here even though args.wd/args.wd1 exist
# (they go to the model constructor above) — confirm weight decay is
# applied inside the model rather than by the optimizer.
optim, sched = get_optim('sgd', default_params, init_lr=args.lr,
                         steps=args.steps, wd=0, gamma=args.gamma,
                         momentum=args.mom, max_epochs=args.epochs)

if len(wave_params) > 0:
    # Fall back to the base hyperparameters when the wavelet-specific
    # ones were not supplied.
    if args.lr1 is None:
        args.lr1 = args.lr
    if args.mom1 is None:
        args.mom1 = args.mom
    # Second optimizer/scheduler pair for the wavelet parameters
    # (call continues past the visible chunk).
    optim2, sched2 = get_optim('sgd', wave_params, init_lr=args.lr1,
                               steps=args.steps, wd=0, gamma=args.gamma,
# Script-level driver: load data, build model and optimizer from `args`,
# then hand everything to a TeacherExperiment.
train_loader, eval_loader = get_data(args)
data_id = get_data_id(args)

###################
## Specify model ##
###################

model = get_model(args)
model_id = get_model_id(args)

#######################
## Specify optimizer ##
#######################

# Optimizes all model parameters; get_optim also returns per-iteration
# and per-epoch LR schedulers, both forwarded to the experiment.
optimizer, scheduler_iter, scheduler_epoch = get_optim(args, model.parameters())
optim_id = get_optim_id(args)

##############
## Training ##
##############

exp = TeacherExperiment(args=args,
                        data_id=data_id,
                        model_id=model_id,
                        optim_id=optim_id,
                        train_loader=train_loader,
                        eval_loader=eval_loader,
                        model=model,
                        optimizer=optimizer,
                        scheduler_iter=scheduler_iter,
def main(cf):
    """Run one supervised predictive-coding experiment on MNIST.

    Builds train/test dataloaders, a PCModel and its optimizer from the
    config object ``cf``, trains for ``cf.n_epochs`` epochs, evaluates
    accuracy every ``cf.test_every`` epochs, and writes the config and
    the accuracy history as JSON under ``cf.logdir``.

    Args:
        cf: config namespace/dict-like exposing the hyperparameters read
            below (seed, logdir, batch_size, nodes, lr, n_epochs, ...).
    """
    print(
        f"\nStarting divisive normalization experiment {cf.logdir}: --seed {cf.seed} --device {utils.DEVICE}"
    )
    pprint.pprint(cf)

    os.makedirs(cf.logdir, exist_ok=True)
    utils.seed(cf.seed)
    # os.path.join is robust to cf.logdir with or without a trailing
    # separator (the original `cf.logdir + "config.json"` silently wrote
    # "<dir>config.json" when the trailing "/" was missing).
    utils.save_json({k: str(v) for (k, v) in cf.items()},
                    os.path.join(cf.logdir, "config.json"))

    train_dataset = datasets.MNIST(train=True,
                                   scale=cf.label_scale,
                                   size=cf.train_size,
                                   normalize=cf.normalize)
    test_dataset = datasets.MNIST(train=False,
                                  scale=cf.label_scale,
                                  size=cf.test_size,
                                  normalize=cf.normalize)
    train_loader = datasets.get_dataloader(train_dataset, cf.batch_size)
    test_loader = datasets.get_dataloader(test_dataset, cf.batch_size)
    print(
        f"Loaded data [train batches: {len(train_loader)} test batches: {len(test_loader)}]"
    )

    model = PCModel(nodes=cf.nodes,
                    mu_dt=cf.mu_dt,
                    act_fn=cf.act_fn,
                    use_bias=cf.use_bias,
                    kaiming_init=cf.kaiming_init,
                    pe_fn=cf.pe_fn,
                    pe_fn_inverse=cf.pe_fn_inverse)
    optimizer = optim.get_optim(
        model.params,
        cf.optim,
        cf.lr,
        batch_scale=cf.batch_scale,
        grad_clip=cf.grad_clip,
        weight_decay=cf.weight_decay,
    )

    # Predictive-coding weight updates are computed by the model itself,
    # not through autograd, so the whole loop runs under no_grad.
    # NOTE(review): confirm optimizer.step never relies on autograd state.
    with torch.no_grad():
        metrics = {"acc": []}
        for epoch in range(1, cf.n_epochs + 1):
            print(f"\nTrain @ epoch {epoch} ({len(train_loader)} batches)")
            for batch_id, (img_batch, label_batch) in enumerate(train_loader):
                model.train_batch_supervised(img_batch,
                                             label_batch,
                                             cf.n_train_iters,
                                             fixed_preds=cf.fixed_preds_train)
                optimizer.step(
                    curr_epoch=epoch,
                    curr_batch=batch_id,
                    n_batches=len(train_loader),
                    batch_size=img_batch.size(0),
                )

            if epoch % cf.test_every == 0:
                acc = 0
                for img_batch, label_batch in test_loader:
                    label_preds = model.test_batch_supervised(img_batch)
                    acc += datasets.accuracy(label_preds, label_batch)
                # Mean accuracy over test batches.
                metrics["acc"].append(acc / len(test_loader))
                print("\nTest @ epoch {} / Accuracy: {:.4f}".format(
                    epoch, acc / len(test_loader)))

        utils.save_json(metrics, os.path.join(cf.logdir, "metrics.json"))
def train(options, data, load_params=False, start_epoc=0):
    """Train a Theano model with the settings in `options` (Python 2).

    Parameters
    ----------
    options : dict-like
        Hyperparameters; completed by `set_defaults` below.
    data : sequence
        data[0] is the training set and data[-1] the test set, in the
        format expected by `list_update`.
    load_params : bool
        When True, resume from parameters saved by a previous run.
    start_epoc : int
        Starting epoch; overwritten by `resume_epoc` when resuming.

    Returns
    -------
    (best_perf, best_par)
        `(train, test)` costs and the parameter dict of the best model
        seen, selected by training cost (early stopping on train).
    """
    print "OPTIONS: ", options
    print 'Setting up model with options:'
    options = set_defaults(options)
    for kk, vv in options.iteritems():
        print kk, vv
    print "model seed: ", options['model_seed']
    print "fold: ", options['fold']
    print 'seed: ', options['seed']
    # Deterministic RNG derived from model seed, fold and data seed so
    # each (seed, fold) combination is reproducible.
    rng = numpy.random.RandomState(options['model_seed'] +
                                   100 * options.get('fold', 99) +
                                   options.get('seed', 99))
    params, operators = init_params(options, rng)
    print 'done...'
    if load_params:
        loaded = load_par(options)
        start_epoc = resume_epoc(options)
        # Check that we've loaded the correct parameters...
        for kk, vv in loaded.iteritems():
            assert params[kk].shape == vv.shape
            assert type(params[kk]) == type(vv)
        params = loaded

    tparams = init_tparams(params)
    trng, use_noise, inps, out = build_model(tparams, options, rng)
    y = tensor.imatrix('y')
    cost = nll(out, y)
    # Evaluation function with noise disabled via the `use_noise`
    # shared variable.
    f_eval = theano.function([inps, y], cost,
                             givens={use_noise: numpy.float32(0.)},
                             on_unused_input='ignore')
    # L1/L2 regularization restricted to 'hidden*' and '*W_h' weights.
    reg = 0.
    for k, v in tparams.iteritems():
        if k[:6] == 'hidden' or k[-3:] == 'W_h':
            reg += options['l1'] * tensor.sum(abs(v))
            reg += options['l2'] * tensor.sum((v)**2)
    cost += reg
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    lr = tensor.scalar(name='lr', dtype=theano.config.floatX)
    opt = get_optim(options['opt'])
    print 'Compiling functions'
    f_grad_shared, f_update, gshared = opt(lr, tparams, grads, [inps, y],
                                           cost, use_noise)
    # Prediction function, used at the end to sanity-check that outputs
    # are probability distributions.
    f_out = theano.function([inps], out, givens={use_noise: numpy.float32(0.)},
                            on_unused_input='ignore',
                            allow_input_downcast=True)
    best = numpy.inf
    print 'Starting training'
    train = list_update(data[0], f_eval, options['batch_size'], rng=rng)
    test = list_update(data[-1], f_eval, options['batch_size'], rng=rng)
    starting = (train, test)  # NOTE(review): never read afterwards
    print 'Pre-training. test: %f, train: %f' % (test, train)
    print 'Training'
    lr = options['lr']  # rebinds `lr` from the symbolic scalar to its value
    max_itr = options['max_itr']
    grad_norm = 0.  # NOTE(review): never updated below
    train_scores = 50 * [0.]  # window for the running divergence average
    try:
        # NOTE(review): the loop always starts at 0; `start_epoc` loaded
        # above is never used here — confirm resume behavior is intended.
        for epoch in xrange(max_itr):
            start_time = time.time()
            for g in gshared:
                # manually set gradients to 0 because we accumulate in list update
                g.set_value(0.0 * g.get_value())
            use_noise.set_value(1.)
            # Full pass over the training data, accumulating gradients
            # into the `gshared` shared variables.
            train_cost, n_obs = list_update(data[0], f_grad_shared,
                                            batchsize=options['batch_size'],
                                            rng=rng, return_n_obs=True)
            use_noise.set_value(0.)
            # Average the accumulated gradients over all observations
            # before taking a single update step.
            for g in gshared:
                g.set_value(floatx(g.get_value() / float(n_obs)))
            f_update(lr)
            apply_proximity(tparams, operators)
            train = list_update(data[0], f_eval, options['batch_size'], rng=rng)
            elapsed_time = time.time() - start_time
            if train < best:
                # early stopping on training set
                test = list_update(data[-1], f_eval)
                best_par = unzip(tparams)
                best_perf = (train, test)
                best = train
            # NOTE(review): test cost recomputed unconditionally, even
            # when it was just computed in the branch above.
            test = list_update(data[-1], f_eval)
            if (epoch % 50) == 0:
                # Save progress....
                save_progress(options, tparams, epoch, best_perf)
            print 'Epoch: %d, cost: %f, train: %f, test: %f, lr:%f, time: %f' % (
                epoch, train_cost, train, test, lr, elapsed_time)
            # Check if we're diverging...
            train_ave = running_ave(train_scores, train, epoch)
            if epoch > 1000:
                # Only exit if we're diverging after 1000 iterations
                if train_ave > 1.03 * best_perf[0]:
                    print "Diverged..."
                    break
    except KeyboardInterrupt:
        print "Interrupted"

    # check that we're outputing prob distributions: each output row
    # should sum to 1, so the total equals the number of examples.
    # NOTE(review): input shape (n, 2, 3, 3) is hard-coded here.
    X = data[0][(3, 3)][0]
    assert abs(
        f_out(X.reshape(X.shape[0], 2, 3, 3)).sum() - float(X.shape[0])) < 1e-4
    print "Best performance:"
    print "train, test"
    print "%f,%f" % best_perf
    return best_perf, best_par
# Script-level driver: load data, build model and optimizer, then hand
# everything to a FlowExperiment.
train_loader, eval_loader, data_shape = get_data(args)
data_id = get_data_id(args)

###################
## Specify model ##
###################

model = get_model(args, data_shape=data_shape)
model_id = get_model_id(args)

#######################
## Specify optimizer ##
#######################

# The per-iteration/per-epoch schedulers returned by get_optim are
# deliberately discarded (scheduler_iter=None below), and the optim id is
# prefixed with "more_". NOTE(review): this looks like a continued /
# extra-training run of an existing model — confirm against the caller.
optimizer, _, _ = get_optim(args, model)
optim_id = f"more_{get_optim_id(args)}"

##############
## Training ##
##############

exp = FlowExperiment(args=args,
                     data_id=data_id,
                     model_id=model_id,
                     optim_id=optim_id,
                     train_loader=train_loader,
                     eval_loader=eval_loader,
                     model=model,
                     optimizer=optimizer,
                     scheduler_iter=None,
# Script-level driver: load data, build model and optimizer, then hand
# everything to a FlowExperiment.
train_loader, eval_loader, data_shape = get_data(args)
data_id = get_data_id(args)

###################
## Specify model ##
###################

model = get_model(args, data_shape=data_shape)
model_id = get_model_id(args)

#######################
## Specify optimizer ##
#######################

# get_optim returns the optimizer plus per-iteration and per-epoch LR
# schedulers; all are forwarded to the experiment.
optimizer, scheduler_iter, scheduler_epoch = get_optim(args, model)
optim_id = get_optim_id(args)

##############
## Training ##
##############

exp = FlowExperiment(args=args,
                     data_id=data_id,
                     model_id=model_id,
                     optim_id=optim_id,
                     train_loader=train_loader,
                     eval_loader=eval_loader,
                     model=model,
                     optimizer=optimizer,
                     scheduler_iter=scheduler_iter,
## Load args ## ############### with open(path_args, 'rb') as f: args = pickle.load(f) ################ ## Experiment ## ################ if eval_args.model_type == "flow": student, teacher, data_id = get_model(args) model_id = get_model_id(args) args.dataset = data_id optimizer, scheduler_iter, scheduler_epoch = get_optim( args, student.parameters()) optim_id = get_optim_id(args) exp = StudentExperiment(args=args, data_id=data_id, model_id=model_id, optim_id=optim_id, model=student, teacher=teacher, optimizer=optimizer, scheduler_iter=scheduler_iter, scheduler_epoch=scheduler_epoch) else: student, teacher, data_id = get_baseline(args) model_id = get_model_id(args) args.dataset = data_id