def train(batch_size, lr, epochs, period):
    assert period >= batch_size and period % batch_size == 0
    params, sqrs, vs = adam_init_params()
    w = params[0]
    b = params[1]
    total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
    t = 0
    for epoch in range(1, epochs + 1):
        # decay the learning rate after the second epoch
        if epoch > 2:
            lr *= 0.1
        for batch_i, data, label in data_iter(batch_size):
            with autograd.record():
                output = net(data, w, b)
                loss = square_loss(output, label)
            loss.backward()
            t += 1
            optimizer.adam(params, sqrs, vs, batch_size, lr, t)
            if batch_i * batch_size % period == 0:
                total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" % (
            batch_size, lr, epoch, total_loss[-1]))
    print("w:", np.reshape(w.asnumpy(), (1, -1)), "b:", b.asnumpy()[0])
    print("Total loss length:", len(total_loss))
    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
    plt.semilogy(x_axis, total_loss)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()
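# The helpers adam_init_params() and optimizer.adam(params, sqrs, vs, batch_size, lr, t)
# are not shown above. Purely as a point of reference, a minimal NumPy sketch of the
# standard bias-corrected Adam step such a helper presumably performs (gradients passed
# explicitly here, which is an assumption about the interface) could look like this:
import numpy as np

def adam_step_sketch(params, grads, sqrs, vs, batch_size, lr, t,
                     beta1=0.9, beta2=0.999, eps=1e-8):
    """Hypothetical in-place Adam update over lists of NumPy parameter arrays."""
    for param, grad, sqr, v in zip(params, grads, sqrs, vs):
        g = grad / batch_size                          # average gradient over the batch
        v[:] = beta1 * v + (1. - beta1) * g            # first moment estimate
        sqr[:] = beta2 * sqr + (1. - beta2) * g ** 2   # second moment estimate
        v_hat = v / (1. - beta1 ** t)                  # bias correction
        sqr_hat = sqr / (1. - beta2 ** t)
        param[:] = param - lr * v_hat / (np.sqrt(sqr_hat) + eps)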
def __init__(self, x, y, l, window, opt, lr, init_emb, dim_emb, dim_hidden, n_vocab,
             L2_reg, unit, sim='cos', n_layers=1, activation=tanh):
    self.tr_inputs = [x, y, l]
    self.pr_inputs = [x, y, l]

    self.x = x  # 1D: batch_size * l * 2, 2D: window; elem=word_id
    self.y = y  # 1D: batch_size; elem=label
    self.l = l  # scalar: elem=sentence length

    batch_size = y.shape[0]
    n_cands = x.shape[0] / batch_size / l

    self.pad = build_shared_zeros((1, dim_emb))
    if init_emb is None:
        self.emb = theano.shared(sample_weights(n_vocab - 1, dim_emb))
    else:
        self.emb = theano.shared(init_emb)
    self.E = T.concatenate([self.pad, self.emb], 0)
    self.W_out = theano.shared(sample_weights(dim_hidden, dim_hidden))
    self.params = [self.emb, self.W_out]

    """ Input Layer """
    e = self.E[x]  # e: 1D: batch_size * l * 2, 2D: window, 3D: dim_emb
    x_in = e.reshape((batch_size * n_cands, l, -1))

    """ Intermediate Layer """
    # h: 1D: n_batch * n_cands, 2D: dim_emb
    h, params = cnn.layers(x_in, window, dim_emb, dim_hidden, n_layers, activation)
    self.params.extend(params)

    """ Output Layer """
    h = h.reshape((batch_size, n_cands, -1))
    h_1 = h[T.arange(batch_size), 0]
    h_2 = h[T.arange(batch_size), 1:]
    if sim == 'cos':
        y_score = cosign_similarity(h_1, h_2)
    else:
        y_score = T.batched_dot(T.dot(h_1, self.W_out), h_2.dimshuffle(0, 2, 1))
    y_score_hat = T.max(y_score, 1)

    """ Objective Function """
    self.nll = max_margin_loss(y_score_hat, y_score[T.arange(batch_size), y])
    self.L2_sqr = regularization(self.params)
    self.cost = self.nll + L2_reg * self.L2_sqr / 2.

    """ Optimization """
    if opt == 'adagrad':
        self.update = ada_grad(cost=self.cost, params=self.params, lr=lr)
    elif opt == 'ada_delta':
        self.update = ada_delta(cost=self.cost, params=self.params)
    elif opt == 'adam':
        self.update = adam(cost=self.cost, params=self.params, lr=lr)
    else:
        self.update = sgd(cost=self.cost, params=self.params, lr=lr)

    """ Predicts """
    y_hat = T.argmax(y_score, 1)

    """ Check Accuracies """
    self.correct = T.eq(y_hat, y)
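# cosign_similarity() and max_margin_loss() are defined elsewhere in that code base.
# As an illustration only, a Theano sketch of a cosine score between one vector per
# example (h_1: batch x dim) and a stack of candidates (h_2: batch x n_cands x dim)
# might look as follows; the broadcasting layout is an assumption, not the original:
import theano.tensor as T

def cosine_similarity_sketch(h_1, h_2, eps=1e-8):
    """Hypothetical: returns a (batch, n_cands) matrix of cosine scores."""
    dot = T.sum(h_1[:, None, :] * h_2, axis=2)           # batch x n_cands
    norm1 = T.sqrt(T.sum(h_1 ** 2, axis=1))[:, None]     # batch x 1
    norm2 = T.sqrt(T.sum(h_2 ** 2, axis=2))              # batch x n_cands
    return dot / (norm1 * norm2 + eps)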
def run_bm(bm_fname, optz_cfg=None, verbose=True, misc=None):
    # load bm_fname
    bm = importlib.import_module(bm_fname.rsplit('.', 1)[0])
    e = bm.e
    decorate_stind(e)
    thts_init = bm.thts_init
    compare = bm.compare
    if optz_cfg is None:
        optz_cfg = bm.optz_cfg

    # optz_detail
    optz_detail = get_optz_detail(bm_fname, optz_cfg)
    print('\n===== OPTZ: %s =====' % optz_detail)

    # run experiments
    for alg_str in compare:
        print('[%s] ' % alg_str, end='')
        alg = importlib.import_module(compare[alg_str].rsplit('.', 1)[0])
        alg.init(e)
        misc_arg = {'misc': misc} if alg_str == 'ours2' else {}

        # run adam
        grad_func = lambda thts, e=e: alg.elbo_grad(e, thts, **misc_arg)
        thts_res = optimizer.adam(grad_func, thts_init,
                                  iter_n=optz_cfg['iter_n'],
                                  lr=optz_cfg['lr'],
                                  sample_n_grad=optz_cfg['sample_n_grad'],
                                  sample_n_var=optz_cfg['sample_n_var'],
                                  verbose=verbose,
                                  **misc_arg)

        # save res to file
        save_res(optz_detail, alg_str, thts_res)
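# Unlike the Theano-style adam(cost=..., params=...) used elsewhere in this collection,
# optimizer.adam here takes a gradient *function* and an initial parameter vector. A
# minimal NumPy sketch of that style of interface (keyword names beyond lr/iter_n, the
# ascent direction on the ELBO, and the flat-vector layout are all assumptions):
import numpy as np

def adam_functional_sketch(grad_func, thts_init, iter_n=1000, lr=0.01,
                           beta1=0.9, beta2=0.999, eps=1e-8):
    """Hypothetical: follow grad_func(thts) for iter_n Adam steps and return thts."""
    thts = np.array(thts_init, dtype=float)
    m = np.zeros_like(thts)
    v = np.zeros_like(thts)
    for t in range(1, iter_n + 1):
        g = np.asarray(grad_func(thts))
        m = beta1 * m + (1. - beta1) * g
        v = beta2 * v + (1. - beta2) * g ** 2
        m_hat = m / (1. - beta1 ** t)
        v_hat = v / (1. - beta2 ** t)
        thts = thts + lr * m_hat / (np.sqrt(v_hat) + eps)  # '+' assumes ascent on the ELBO
    return thts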
def ScbowTrain(file):
    corpus, word_to_id, id_to_word = preprocess(file)
    contexts, target = create_context_target(corpus, window_size=1)
    vocab_size = len(word_to_id)

    target = one_hot_v(target, vocab_size)
    contexts = one_hot_v(contexts, vocab_size)

    model = scbow(vocab_size, hidden_size)
    optimizer = adam()
    train = Trainer(model, optimizer)

    train.fit(contexts, target, max_epoch, batch_size)
    train.plot()

    word_vecs = model.word_vecs
    for word_id, word in id_to_word.items():
        print(word, word_vecs[word_id])

    C = co_mat(corpus, vocab_size, window_size=1)
    ms('彼女', word_to_id, id_to_word, C, top=10)
def build_model(self, lr=0.001, dropout=None):
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.matrix('x', dtype='int32')               # step * samples
    x_mask = T.matrix('x_mask', dtype='float32')   # step * samples
    y = T.matrix('y', dtype='int32')               # sample * emb
    ctx = T.tensor3('ctx', dtype='float32')        # sample * annotation * dim

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    emb = self.W_emb[x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, self.dim_word])

    ctx0 = ctx
    ctx_mean = ctx0.mean(1)
    init_state = T.dot(ctx_mean, self.W_hidden_init) + self.b_hidden_init
    init_memory = T.dot(ctx_mean, self.W_memory_init) + self.b_memory_init

    # proj: list of LSTM hidden states
    proj = self.lstm_layer(emb, mask=x_mask, context=ctx,
                           init_state=init_state, init_memory=init_memory)
    proj_h = proj[0]

    # mean of the hidden states
    proj_h = (proj_h * x_mask[:, :, None]).sum(axis=0)
    proj_h = proj_h / x_mask.sum(axis=0)[:, None]  # sample * dim

    # last hidden state
    #proj_h = proj_h[-1]  # sample * dim

    if dropout is not None:
        proj_h = dropout_layer(proj_h, use_noise, trng, dropout)

    output = T.dot(proj_h, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    ## avoid NaN
    epsilon = 1.0e-9
    probs = T.clip(probs, epsilon, 1.0 - epsilon)
    probs /= probs.sum(axis=-1, keepdims=True)
    ## avoid NaN

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
def update(self, lr=0.00001, weight_decay=0.0004):
    '''
    # mini-batch SGD
    self.weights *= (1 - weight_decay)
    self.bias *= (1 - weight_decay)
    self.weights -= lr * self.d_weights
    self.bias -= lr * self.d_bias
    '''
    # adam optimizer
    self.weights, self.config_w = adam(self.weights * (1 - weight_decay),
                                       self.d_weights, config=self.config_w)
    self.bias, self.config_b = adam(self.bias * (1 - weight_decay),
                                    self.d_bias, config=self.config_b)

    # clear gradients
    self.d_weights = np.zeros(self.weights.shape)
    self.d_bias = np.zeros(self.bias.shape)
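# Here adam(x, dx, config) returns both the updated parameter and its per-parameter
# state (moments and step counter kept in config). The actual helper is not shown; a
# minimal NumPy sketch of that style of interface, assuming the conventional keys:
import numpy as np

def adam_sketch(x, dx, config=None):
    """Hypothetical: returns (next_x, config) with all Adam state kept in config."""
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(x))
    config.setdefault('v', np.zeros_like(x))
    config.setdefault('t', 0)

    config['t'] += 1
    config['m'] = config['beta1'] * config['m'] + (1 - config['beta1']) * dx
    config['v'] = config['beta2'] * config['v'] + (1 - config['beta2']) * dx ** 2
    m_hat = config['m'] / (1 - config['beta1'] ** config['t'])
    v_hat = config['v'] / (1 - config['beta2'] ** config['t'])
    next_x = x - config['learning_rate'] * m_hat / (np.sqrt(v_hat) + config['epsilon'])
    return next_x, config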
def build_model(self, lr=0.001):
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.matrix('x', dtype='int32')
    x_mask = T.matrix('x_mask', dtype='float32')
    y = T.matrix('y', dtype='int32')
    img = T.matrix('img', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    init_state = T.dot(img, self.W_img_emb) + self.b_img_emb

    emb = self.W_emb[x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, self.dim_word])

    # proj: list of GRU hidden states
    proj = self.gru_layer(emb, init_state, mask=x_mask)

    # mean of the hidden states
    proj = (proj * x_mask[:, :, None]).sum(axis=0)
    proj = proj / x_mask.sum(axis=0)[:, None]  # sample * dim

    # last hidden state
    #proj = proj[-1]  # sample * dim

    output = T.dot(proj, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    ## avoid NaN
    epsilon = 1.0e-9
    probs = T.clip(probs, epsilon, 1.0 - epsilon)
    probs /= probs.sum(axis=-1, keepdims=True)
    ## avoid NaN

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return trng, use_noise, x, x_mask, img, y, cost, updates, prediction
def build_model(self, lr=0.001):
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.matrix('x', dtype='int32')
    x_mask = T.matrix('x_mask', dtype='float32')
    y = T.matrix('y', dtype='int32')
    img = T.matrix('img', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    init_state = T.dot(img, self.W_img_emb) + self.b_img_emb

    emb = self.W_emb[x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, self.dim_word])

    # proj: list of GRU hidden states
    proj = self.gru_layer(emb, init_state, mask=x_mask)

    # mean of the hidden states
    #proj = (proj * x_mask[:, :, None]).sum(axis=0)
    #proj = proj / x_mask.sum(axis=0)[:, None]  # sample * dim

    # last hidden state
    proj = proj[-1]  # sample * dim

    output = T.dot(proj, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return x, x_mask, img, y, cost, updates, prediction
def build_model(self, lr=0.001, dropout=None):
    def concatenate(tensor_list, axis=0):
        concat_size = sum(tt.shape[axis] for tt in tensor_list)
        output_shape = ()
        for k in range(axis):
            output_shape += (tensor_list[0].shape[k],)
        output_shape += (concat_size,)
        for k in range(axis + 1, tensor_list[0].ndim):
            output_shape += (tensor_list[0].shape[k],)

        out = T.zeros(output_shape)
        offset = 0
        for tt in tensor_list:
            indices = ()
            for k in range(axis):
                indices += (slice(None),)
            indices += (slice(offset, offset + tt.shape[axis]),)
            for k in range(axis + 1, tensor_list[0].ndim):
                indices += (slice(None),)
            out = T.set_subtensor(out[indices], tt)
            offset += tt.shape[axis]
        return out

    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.matrix('x', dtype='int32')               # step * samples
    x_mask = T.matrix('x_mask', dtype='float32')   # step * samples
    y = T.matrix('y', dtype='int32')               # sample * emb
    ctx = T.tensor3('ctx', dtype='float32')        # sample * annotation * dim

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    xr = x[::-1]
    xr_mask = x_mask[::-1]

    emb = self.W_emb[x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
    embr = self.W_emb[xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, self.dim_word])

    ctx0 = ctx
    ctx_mean = ctx0.mean(1)
    init_state = T.dot(ctx_mean, self.W_ctx_init) + self.b_ctx_init

    # proj: list of GRU hidden states
    proj = self.gru_layer(emb, mask=x_mask, context=ctx, init_state=init_state)
    proj_h = proj[0]
    projr = self.gru_layer(embr, mask=xr_mask, context=ctx, init_state=init_state)
    projr_h = projr[0]

    # step_ctx: step * samples * (dim*2)
    concat_proj_h = concatenate([proj_h, projr_h[::-1]], axis=proj_h.ndim - 1)
    # step_ctx_mean: samples * (dim*2)
    concat_proj_h = (concat_proj_h * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    if dropout is not None:
        concat_proj_h = dropout_layer(concat_proj_h, use_noise, trng, dropout)

    output = T.dot(concat_proj_h, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    ## avoid NaN
    epsilon = 1.0e-9
    probs = T.clip(probs, epsilon, 1.0 - epsilon)
    probs /= probs.sum(axis=-1, keepdims=True)
    ## avoid NaN

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
def build_model(self, lr=0.001, dropout=None):
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.matrix('x', dtype='int32')               # step * samples
    x_mask = T.matrix('x_mask', dtype='float32')   # step * samples
    y = T.matrix('y', dtype='int32')               # sample * emb
    ctx = T.tensor3('ctx', dtype='float32')        # sample * annotation * dim

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    emb = self.W_emb[x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, self.dim_word])

    ctx0 = ctx
    ctx_mean = ctx0.mean(1)
    init_state = T.dot(ctx_mean, self.W_hidden_init) + self.b_hidden_init
    init_memory = T.dot(ctx_mean, self.W_memory_init) + self.b_memory_init

    # proj: list of LSTM hidden states
    proj = self.lstm_layer(emb, mask=x_mask, context=ctx,
                           init_state=init_state, init_memory=init_memory)
    proj_h = proj[0]

    # mean of the hidden states
    proj_h = (proj_h * x_mask[:, :, None]).sum(axis=0)
    proj_h = proj_h / x_mask.sum(axis=0)[:, None]  # sample * dim

    # last hidden state
    #proj_h = proj_h[-1]  # sample * dim

    if dropout is not None:
        proj_h = dropout_layer(proj_h, use_noise, trng, dropout)

    output = T.dot(proj_h, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    ## avoid NaN
    epsilon = 1.0e-9
    probs = T.clip(probs, epsilon, 1.0 - epsilon)
    probs /= probs.sum(axis=-1, keepdims=True)
    ## avoid NaN

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
def build_model(self, lr=0.001, dropout=None):
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.tensor3('x', dtype='float32')      # step * sample * 5555
    y = T.matrix('y', dtype='int32')
    img = T.tensor3('img', dtype='float32')  # 1 * sample * 4096

    # T.set_subtensor(img_t3[0], img)
    # emb = theano.tensor.concatenate([img_t3, x])
    emb = x
    embr = x[::-1]

    # proj: list of GRU hidden states
    proj = self.gru_layer(emb, img)
    projr = self.gru_cond_layer(embr, img)
    proj = concatenate([proj, projr[::-1]], axis=proj.ndim - 1)

    # mean of the hidden states
    proj = proj.mean(axis=0)

    # last hidden state
    #proj = proj[-1]  # sample * dim

    if dropout is not None:
        proj = dropout_layer(proj, use_noise, trng, dropout)

    output = T.dot(proj, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    ## avoid NaN
    epsilon = 1.0e-9
    probs = T.clip(probs, epsilon, 1.0 - epsilon)
    probs /= probs.sum(axis=-1, keepdims=True)
    ## avoid NaN

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    '''
    decay_c = 0.000001
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(np.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for vv in self.params:
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay
    '''

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return trng, use_noise, x, img, y, cost, updates, prediction, probs
def build_model(self, lr=0.001, dropout=None):
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.tensor3('x', dtype='float32')
    y = T.matrix('y', dtype='int32')
    img = T.matrix('img', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    init_state = T.dot(img, self.W_hidden_init) + self.b_hidden_init
    init_memory = T.dot(img, self.W_memory_init) + self.b_memory_init

    emb = x
    embr = x.swapaxes(0, 1)[::-1].swapaxes(0, 1)

    # proj: list of LSTM hidden states
    proj = self.lstm_layer(emb, init_state=init_state, init_memory=init_memory)[0]
    projr = self.lstm_layer(embr, init_state=init_state, init_memory=init_memory)[0]
    proj = concatenate([proj, projr[::-1]], axis=proj.ndim - 1)

    # mean of the hidden states
    proj = proj.mean(axis=0)

    # last hidden state
    #proj = proj[-1]  # sample * dim

    if dropout is not None:
        proj = dropout_layer(proj, use_noise, trng, dropout)

    output = T.dot(proj, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    ## avoid NaN
    epsilon = 1.0e-8
    probs = T.clip(probs, epsilon, 1.0 - epsilon)
    probs /= probs.sum(axis=-1, keepdims=True)
    ## avoid NaN

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    '''
    decay_c = 0.000001
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(np.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for vv in self.params:
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay
    '''

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return trng, use_noise, x, img, y, cost, updates, prediction
b_h3 = shared_normal((2, n_h3), sigma=0)
b_h4 = shared_normal((2, n_h4), sigma=0)

X = binomial(X)  # internal binarization

# model calls
[dout_prob_y, dout_dual_recon_err] = model_NG_ACE(X, batch_size, gaussian_err, 0.2, 0.5)  # with dropout
[prob_y, dual_recon_err] = model_NG_ACE(X, batch_size, gaussian_err, 0., 0.)  # without dropout
y_model = T.argmax(prob_y, axis=1)  # model labels

# dropout classification err
dout_class_err = T.nnet.categorical_crossentropy(dout_prob_y, Y).sum()

# optimizer call
cost = dout_class_err + dout_dual_recon_err
params = [W_h, W_h2, W_h3, W_h4, W_o, b_h, b_h2, b_h3, b_h4]
updates, norm_grad = adam(cost, params, lr=learning_rate, data_part=float(batch_size) / P)

# givens
s_trX, s_teX, s_trY, s_teY = shared(trX), shared(teX), shared(trY), shared(teY)
tr_batch_X = s_trX[start:end]
tr_batch_Y = s_trY[start:end]
te_batch_X = s_teX[start:end]
te_batch_Y = s_teY[start:end]

# train & test functions
mode = theano.compile.get_default_mode()
train = theano.function(inputs=[start, end, learning_rate],
                        outputs=[dout_class_err, dual_recon_err, y_model, norm_grad],
                        updates=updates,
                        givens={X: tr_batch_X, Y: tr_batch_Y},
                        allow_input_downcast=True, mode=mode)
test = theano.function(inputs=[start, end],
                       outputs=[dual_recon_err, y_model],
                       givens={X: te_batch_X},
                       allow_input_downcast=True, mode=mode)

# main loop over epochs
tr_len = len(trY)
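# adam() above returns a Theano updates list plus the gradient norm, so the compiled
# train function can monitor it. A minimal Theano sketch of an adam that also exposes
# the gradient norm (the lr scaling by data_part and the shared-variable params are
# assumptions about that code base, not its actual implementation):
import numpy as np
import theano
import theano.tensor as T

def adam_with_grad_norm_sketch(cost, params, lr=0.001, data_part=1.0,
                               beta1=0.9, beta2=0.999, eps=1e-8):
    """Hypothetical: returns (updates, norm_grad) for theano.function."""
    grads = T.grad(cost, params)
    norm_grad = T.sqrt(sum((g ** 2).sum() for g in grads))
    t_prev = theano.shared(np.float32(0.))
    t = t_prev + 1
    a_t = lr * data_part * T.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)
    updates = [(t_prev, t)]
    for p, g in zip(params, grads):
        m = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
        v = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
        m_t = beta1 * m + (1. - beta1) * g
        v_t = beta2 * v + (1. - beta2) * g ** 2
        updates += [(m, m_t), (v, v_t), (p, p - a_t * m_t / (T.sqrt(v_t) + eps))]
    return updates, norm_grad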
def main():
    logging.info("start loading setting data.")
    settings = LearningDataSettings(args.train_setting_file)
    logging.info("☑ loading setting data complete.")

    vector_size = settings.input_unit
    hidden_unit_num = settings.hidden_unit
    class_num = settings.class_unit
    hidden_layer_value_num = args.division_num
    logging.info(
        "input_vector(n):%d, hidden_unit(m):%d, class_num(K):%d, div_num:%d" %
        (vector_size, hidden_unit_num, class_num, hidden_layer_value_num))

    drbm = DRBM.load_from_json(settings.initial_model, args.division_num, args.sparse,
                               sparse_learning_rate=args.sparse_learning_rate,
                               sparse_adamax=args.sparse_adamax)
    logging.info("initial model: {}".format(str(drbm)))

    if args.datasize_limit != 0 and not args.generative_model:
        settings.training_data = settings.training_data.restore_minibatch(
            args.datasize_limit, random=False)

    gen_drbm = None
    if args.kl_divergence:
        gen_drbm = DRBM.load_from_json(args.kl_divergence)
        logging.info("generative model: {}".format(str(gen_drbm)))
    elif args.generative_model:
        gen_drbm = DRBM(settings.gen_input, settings.gen_hidden, settings.gen_class,
                        0, random_bias=True)
        logging.info("generated generative model: {}".format(str(gen_drbm)))
        value, target = gen_drbm.stick_break(args.datasize_limit)
        settings.training_data = Categorical(value, target, class_num)
        settings.test_data = Categorical(np.array([]), np.array([]), class_num)

    opt = None
    if args.optimizer == "momentum":
        logging.info("optimize method: momentum")
        opt = optimizer.momentum(vector_size, hidden_unit_num, class_num)
    elif args.optimizer == "adam":
        logging.info("optimize method: adam")
        opt = optimizer.adam(vector_size, hidden_unit_num, class_num)
    else:
        logging.info("optimize method: adamax")
        opt = optimizer.adamax(vector_size, hidden_unit_num, class_num)

    logging.info("train started.")
    start_time = time.time()

    learning_result = drbm.train(
        settings.training_data, settings.test_data,
        args.learning_num, args.minibatch_size, opt,
        test_interval=args.test_interval,
        correct_rate=args.correct_rate,
        gen_drbm=gen_drbm,
    )

    end_time = time.time()
    logging.info("☑ train complete. time: {} sec".format(end_time - start_time))

    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    hidden_layer = "s" if args.sparse else "d"
    filename_template = "{}_{}_{}{}_v{}h{}c{}_%s.json".format(
        now, args.filename_prefix, hidden_layer, drbm.div_num,
        drbm.num_visible, drbm.num_hidden, drbm.num_class)

    learning_result.save(
        os.path.join(args.result_directory, filename_template % "log"))
    drbm.save(os.path.join(args.result_directory, filename_template % "params"))
    logging.info("☑ parameters dumped.")
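# optimizer.momentum / optimizer.adam / optimizer.adamax above are constructed from the
# DRBM dimensions, so they presumably hold per-parameter moment buffers and are asked
# for update deltas given gradients. A rough sketch of such a stateful object; the
# method name, parameter keys, and shapes are assumptions, not the real optimizer module:
import numpy as np

class AdamSketch:
    """Hypothetical stateful Adam keyed by parameter name."""
    def __init__(self, num_visible, num_hidden, num_class,
                 lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        shapes = {"weight_v": (num_visible, num_hidden),
                  "weight_w": (num_class, num_hidden),
                  "bias_c": (num_hidden,),
                  "bias_b": (num_class,)}
        self.m = {k: np.zeros(s) for k, s in shapes.items()}
        self.v = {k: np.zeros(s) for k, s in shapes.items()}
        self.t = 0
        self.lr, self.beta1, self.beta2, self.eps = lr, beta1, beta2, eps

    def update(self, grads):
        """grads: dict of gradients with the same keys; returns a dict of deltas."""
        self.t += 1
        deltas = {}
        for k, g in grads.items():
            self.m[k] = self.beta1 * self.m[k] + (1 - self.beta1) * g
            self.v[k] = self.beta2 * self.v[k] + (1 - self.beta2) * g ** 2
            m_hat = self.m[k] / (1 - self.beta1 ** self.t)
            v_hat = self.v[k] / (1 - self.beta2 ** self.t)
            deltas[k] = self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
        return deltas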
def main(method, LR_start, Binarize_weight_only): # BN parameters name = "mnist" print("dataset = " + str(name)) print("Binarize_weight_only=" + str(Binarize_weight_only)) print("Method = " + str(method)) # alpha is the exponential moving average factor alpha = .1 print("alpha = " + str(alpha)) epsilon = 1e-4 print("epsilon = " + str(epsilon)) batch_size = 100 print("batch_size = " + str(batch_size)) num_epochs = 50 print("num_epochs = " + str(num_epochs)) # network structure num_units = 2048 print("num_units = " + str(num_units)) n_hidden_layers = 3 print("n_hidden_layers = " + str(n_hidden_layers)) print("LR_start = " + str(LR_start)) LR_decay = 0.1 print("LR_decay=" + str(LR_decay)) if Binarize_weight_only == "w": activation = lasagne.nonlinearities.rectify else: activation = lab.binary_tanh_unit print("activation = " + str(activation)) print('Loading MNIST dataset...') train_set = MNIST(which_set='train', start=0, stop=50000, center=True) valid_set = MNIST(which_set='train', start=50000, stop=60000, center=True) test_set = MNIST(which_set='test', center=True) # bc01 format train_set.X = train_set.X.reshape(-1, 1, 28, 28) valid_set.X = valid_set.X.reshape(-1, 1, 28, 28) test_set.X = test_set.X.reshape(-1, 1, 28, 28) # flatten targets train_set.y = np.hstack(train_set.y) valid_set.y = np.hstack(valid_set.y) test_set.y = np.hstack(test_set.y) # Onehot the targets train_set.y = np.float32(np.eye(10)[train_set.y]) valid_set.y = np.float32(np.eye(10)[valid_set.y]) test_set.y = np.float32(np.eye(10)[test_set.y]) # for hinge loss train_set.y = 2 * train_set.y - 1. valid_set.y = 2 * valid_set.y - 1. test_set.y = 2 * test_set.y - 1. print('Building the MLP...') # Prepare Theano variables for inputs and targets input = T.tensor4('inputs') target = T.matrix('targets') LR = T.scalar('LR', dtype=theano.config.floatX) mlp = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input) for k in range(n_hidden_layers): mlp = lab.DenseLayer(mlp, nonlinearity=lasagne.nonlinearities.identity, num_units=num_units, method=method) mlp = batch_norm.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha) mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation) mlp = lab.DenseLayer(mlp, nonlinearity=lasagne.nonlinearities.identity, num_units=10, method=method) mlp = batch_norm.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha) train_output = lasagne.layers.get_output(mlp, deterministic=False) # squared hinge loss loss = T.mean(T.sqr(T.maximum(0., 1. 
- target * train_output))) if method != "FPN": # W updates W = lasagne.layers.get_all_params(mlp, binary=True) W_grads = lab.compute_grads(loss, mlp) updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR) updates = lab.clipping_scaling(updates, mlp) # other parameters updates params = lasagne.layers.get_all_params(mlp, trainable=True, binary=False) updates = OrderedDict(updates.items() + optimizer.adam( loss_or_grads=loss, params=params, learning_rate=LR).items()) ## update 2nd moment, can get from the adam optimizer also updates3 = OrderedDict() acc_tag = lasagne.layers.get_all_params(mlp, acc=True) idx = 0 beta2 = 0.999 for acc_tag_temp in acc_tag: updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[ idx] * W_grads[idx] * (1 - beta2) idx = idx + 1 updates = OrderedDict(updates.items() + updates3.items()) else: params = lasagne.layers.get_all_params(mlp, trainable=True) updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR) test_output = lasagne.layers.get_output(mlp, deterministic=True) test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output))) test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving the updates dictionary) # and returning the corresponding training loss: train_fn = theano.function([input, target, LR], loss, updates=updates) val_fn = theano.function([input, target], [test_loss, test_err]) print('Training...') lab.train(name, method, train_fn, val_fn, batch_size, LR_start, LR_decay, num_epochs, train_set.X, train_set.y, valid_set.X, valid_set.y, test_set.X, test_set.y)
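# Note that OrderedDict(updates.items() + other.items()) relies on Python 2, where
# items() returns a list; in Python 3, dict views cannot be concatenated with '+'.
# If this pattern were ported, the update dictionaries would need to be merged
# differently, e.g. (a sketch, not part of the original script):
from collections import OrderedDict

def merge_updates(*update_dicts):
    """Merge several Theano/Lasagne update OrderedDicts; later dicts win on conflicts."""
    merged = OrderedDict()
    for ud in update_dicts:
        merged.update(ud)
    return merged

# e.g. updates = merge_updates(updates, updates3)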
def main(method,LR_start, SEQ_LENGTH): lasagne.random.set_rng(np.random.RandomState(1)) name = "linux" print("dataset = "+str(name)) print("Method = "+str(method)) # Sequence Length SEQ_LENGTH = SEQ_LENGTH # SEQ_LENGTH = 100 #can have diffvalues 50, 100, 200 print("SEQ_LENGTH = "+str(SEQ_LENGTH)) # Number of units in the two hidden (LSTM) layers N_HIDDEN = 512 print("N_HIDDEN = "+str(N_HIDDEN)) # All gradients above this will be clipped GRAD_CLIP=5. #### this clip the gradients at every time step, while T.clip clips the sum of gradients as a whole print("GRAD_CLIP ="+str(GRAD_CLIP)) # Number of epochs to train the net num_epochs = 200 print("num_epochs = "+str(num_epochs)) # Batch Size batch_size = 100 print("batch_size = "+str(batch_size)) print("LR_start = "+str(LR_start)) LR_decay = 0.98 print("LR_decay="+str(LR_decay)) activation = lasagne.nonlinearities.tanh ## load data, change data file dir with open('data/linux_input.txt', 'r') as f: in_text = f.read() generation_phrase = "Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar\n *\n * This file contains the interrupt probing code and driver APIs.\n */\n\n#include" #This snippet loads the text file and creates dictionaries to #encode characters into a vector-space representation and vice-versa. chars = list(set(in_text)) data_size, vocab_size = len(in_text), len(chars) char_to_ix = { ch:i for i,ch in enumerate(chars) } ix_to_char = { i:ch for i,ch in enumerate(chars) } num_splits = [0.9, 0.05, 0.05] num_splits_all = np.floor(data_size/batch_size/SEQ_LENGTH) num_train = np.floor(num_splits_all*num_splits[0]) num_val = np.floor(num_splits_all*num_splits[1]) num_test = num_splits_all - num_train - num_val train_X = in_text[0:(num_train*batch_size*SEQ_LENGTH+1).astype('int32')] val_X = in_text[(num_train*batch_size*SEQ_LENGTH).astype('int32'):((num_train+num_val)*batch_size*SEQ_LENGTH+1).astype('int32')] test_X = in_text[((num_train+num_val)*batch_size*SEQ_LENGTH).astype('int32'):(num_splits_all*batch_size*SEQ_LENGTH+1).astype('int32')] ## build model print('Building the model...') # input = T.tensor3('inputs') target = T.imatrix('target') LR = T.scalar('LR', dtype=theano.config.floatX) # (batch size, SEQ_LENGTH, num_features) l_in = lasagne.layers.InputLayer(shape=(None, None, vocab_size)) l_forward_2 = laq.LSTMLayer( l_in, num_units=N_HIDDEN, grad_clipping=GRAD_CLIP, peepholes=False, nonlinearity=activation, ### change this activation can change the hidden layer to binary method=method) ### batch_size*SEQ_LENGTH*N_HIDDEN l_shp = lasagne.layers.ReshapeLayer(l_forward_2, (-1, N_HIDDEN)) ## (batch_size*SEQ_LENGTH, N_HIDDEN) l_out = lasagne.layers.DenseLayer(l_shp, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax) batchsize, seqlen, _ = l_in.input_var.shape train_output = lasagne.layers.get_output(l_out, deterministic=False) loss = T.nnet.categorical_crossentropy(train_output,target.flatten()).mean() if method!= "FPN": # W updates W = lasagne.layers.get_all_params(l_out, quantized=True) W_grads = laq.compute_grads(loss,l_out) updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR, epsilon=1e-8) updates = laq.clipping_scaling(updates,l_out) # other parameters updates params = lasagne.layers.get_all_params(l_out, trainable=True, quantized=False) updates = OrderedDict(updates.items() + optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR, epsilon = 1e-8).items()) ## update the ternary matrix ternary_weights = laq.get_quantized_weights(loss, l_out) updates2 = 
OrderedDict() idx = 0 tt_tag = lasagne.layers.get_all_params(l_out, tt=True) for tt_tag_temp in tt_tag: updates2[tt_tag_temp]= ternary_weights[idx] idx = idx+1 updates = OrderedDict(updates.items() + updates2.items()) ## update 2nd momentum updates3 = OrderedDict() acc_tag = lasagne.layers.get_all_params(l_out, acc=True) idx = 0 beta2 = 0.999 for acc_tag_temp in acc_tag: updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2) idx = idx+1 updates = OrderedDict(updates.items() + updates3.items()) else: params_other = lasagne.layers.get_all_params(l_out, trainable=True) W_grads = [theano.grad(loss, wrt=l_forward_2.W_in_to_ingate), theano.grad(loss, wrt=l_forward_2.W_hid_to_ingate), theano.grad(loss, wrt=l_forward_2.W_in_to_forgetgate),theano.grad(loss, wrt=l_forward_2.W_hid_to_forgetgate), theano.grad(loss, wrt=l_forward_2.W_in_to_cell),theano.grad(loss, wrt=l_forward_2.W_hid_to_cell), theano.grad(loss, wrt=l_forward_2.W_in_to_outgate),theano.grad(loss, wrt=l_forward_2.W_hid_to_outgate)] updates = optimizer.adam(loss_or_grads=loss, params=params_other, learning_rate=LR) test_output = lasagne.layers.get_output(l_out, deterministic=True) test_loss = T.nnet.categorical_crossentropy(test_output,target.flatten()).mean() train_fn = theano.function([l_in.input_var, target, LR], loss, updates=updates, allow_input_downcast=True) val_fn = theano.function([l_in.input_var, target], test_loss, allow_input_downcast=True) print('Training...') X_train = train_X X_val = val_X X_test = test_X def gen_data(pp, batch_size,SEQ_LENGTH, data, return_target=True): x = np.zeros((batch_size,SEQ_LENGTH,vocab_size)) ###### 128*100*85 y = np.zeros((batch_size, SEQ_LENGTH)) for n in range(batch_size): # ptr = n for i in range(SEQ_LENGTH): x[n,i,char_to_ix[data[pp[n]*SEQ_LENGTH+i]]] = 1. 
y[n,i] = char_to_ix[data[pp[n]*SEQ_LENGTH+i+1]] return x, np.array(y,dtype='int32') in_text = X_train+X_val+X_test chars = list(set(in_text)) data_size, vocab_size = len(in_text), len(chars) char_to_ix = { ch:i for i,ch in enumerate(chars) } ix_to_char = { i:ch for i,ch in enumerate(chars) } def train_epoch(X,LR): loss = 0 batches = len(X)/batch_size/SEQ_LENGTH num_seq = len(X)/SEQ_LENGTH shuffled_ind = range(num_seq) np.random.shuffle(shuffled_ind) for i in range(batches): tmp_ind = shuffled_ind[i*batch_size:(i+1)*batch_size] xx,yy = gen_data(tmp_ind,batch_size,SEQ_LENGTH, X) new_loss = train_fn(xx,yy,LR) loss+=new_loss loss=loss/batches return loss # This function tests the model a full epoch (on the whole dataset) def val_epoch(X): # err = 0 loss = 0 batches = len(X)/batch_size/SEQ_LENGTH num_seq = len(X)/SEQ_LENGTH ind = range(num_seq) for i in range(batches): tmp_ind = ind[i*batch_size:(i+1)*batch_size] xx, yy = gen_data(tmp_ind, batch_size, SEQ_LENGTH, X) new_loss = val_fn(xx,yy) loss += new_loss loss = loss/batches return loss best_val_loss=100 best_epoch = 1 LR = LR_start # iterate over epochs: for epoch in range(1,num_epochs+1): start_time = time.time() train_loss = train_epoch(X_train, LR) val_loss = val_epoch(X_val) # test if validation error went down if val_loss <= best_val_loss: best_val_loss = val_loss best_epoch = epoch test_loss = val_epoch(X_test) # all_params = lasagne.layers.get_all_params(l_out) # np.savez("{0}/{1}_seq{2}_lr{3}_hid{4}_{5}.npz".format(method, name, SEQ_LENGTH, LR_start, N_HIDDEN, method), *all_params) epoch_duration = time.time() - start_time # Then we print the results for this epoch: print(" Epoch "+str(epoch)+" of "+str(num_epochs)+" took "+str(epoch_duration)+"s") print(" LR: "+str(LR)) print(" training loss: "+str(train_loss)) print(" validation loss: "+str(val_loss)) print(" best epoch: "+str(best_epoch)) print(" test loss: "+str(test_loss)) with open("{0}/{1}_seq{2}_lr{3}_hid{4}_{5}.txt".format(method, name, SEQ_LENGTH, LR_start, N_HIDDEN, method), "a") as myfile: myfile.write("{0} {1:.3f} {2:.3f} {3:.3f} {4:.3f}\n".format(epoch, train_loss, val_loss, test_loss, epoch_duration)) # learning rate update scheme if epoch>10: LR *= LR_decay
def main(method,LR_start,Binarize_weight_only, SEQ_LENGTH): lasagne.random.set_rng(np.random.RandomState(1)) name = "linux" print("dataset = "+str(name)) print("Binarize_weight_only="+str(Binarize_weight_only)) print("Method = "+str(method)) # Sequence Length SEQ_LENGTH = SEQ_LENGTH # SEQ_LENGTH = 100 #can have diffvalues 50, 100, 200 print("SEQ_LENGTH = "+str(SEQ_LENGTH)) # Number of units in the two hidden (LSTM) layers N_HIDDEN = 512 print("N_HIDDEN = "+str(N_HIDDEN)) # All gradients above this will be clipped GRAD_CLIP=5. #### this clip the gradients at every time step, while T.clip clips the sum of gradients as a whole print("GRAD_CLIP ="+str(GRAD_CLIP)) # Number of epochs to train the net num_epochs = 200 print("num_epochs = "+str(num_epochs)) # Batch Size batch_size = 100 print("batch_size = "+str(batch_size)) print("LR_start = "+str(LR_start)) LR_decay = 0.98 print("LR_decay="+str(LR_decay)) if Binarize_weight_only =="w": activation = lasagne.nonlinearities.tanh else: activation = lab.binary_tanh_unit print("activation = "+ str(activation)) name = name+"_"+Binarize_weight_only ## load data, change data file dir with open('data/linux_input.txt', 'r') as f: in_text = f.read() generation_phrase = "Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar\n *\n * This file contains the interrupt probing code and driver APIs.\n */\n\n#include" #This snippet loads the text file and creates dictionaries to #encode characters into a vector-space representation and vice-versa. chars = list(set(in_text)) data_size, vocab_size = len(in_text), len(chars) char_to_ix = { ch:i for i,ch in enumerate(chars) } ix_to_char = { i:ch for i,ch in enumerate(chars) } num_splits = [0.9, 0.05, 0.05] num_splits_all = np.floor(data_size/batch_size/SEQ_LENGTH) num_train = np.floor(num_splits_all*num_splits[0]) num_val = np.floor(num_splits_all*num_splits[1]) num_test = num_splits_all - num_train - num_val train_X = in_text[0:(num_train*batch_size*SEQ_LENGTH+1).astype('int32')] val_X = in_text[(num_train*batch_size*SEQ_LENGTH).astype('int32'):((num_train+num_val)*batch_size*SEQ_LENGTH+1).astype('int32')] test_X = in_text[((num_train+num_val)*batch_size*SEQ_LENGTH).astype('int32'):(num_splits_all*batch_size*SEQ_LENGTH+1).astype('int32')] ## build model print('Building the model...') # input = T.tensor3('inputs') target = T.imatrix('target') LR = T.scalar('LR', dtype=theano.config.floatX) # (batch size, SEQ_LENGTH, num_features) l_in = lasagne.layers.InputLayer(shape=(None, None, vocab_size)) l_forward_2 = lab.LSTMLayer( l_in, num_units=N_HIDDEN, grad_clipping=GRAD_CLIP, peepholes=False, nonlinearity=activation, ### change this activation can change the hidden layer to binary method=method) ### batch_size*SEQ_LENGTH*N_HIDDEN l_shp = lasagne.layers.ReshapeLayer(l_forward_2, (-1, N_HIDDEN)) ## (batch_size*SEQ_LENGTH, N_HIDDEN) l_out = lasagne.layers.DenseLayer(l_shp, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax) batchsize, seqlen, _ = l_in.input_var.shape l_shp1 = lasagne.layers.ReshapeLayer(l_out, (batchsize, seqlen, vocab_size)) l_out1 = lasagne.layers.SliceLayer(l_shp1, -1, 1) train_output = lasagne.layers.get_output(l_out, deterministic=False) loss = T.nnet.categorical_crossentropy(train_output,target.flatten()).mean() if method!= "FPN": # W updates W = lasagne.layers.get_all_params(l_out, binary=True) W_grads = lab.compute_grads(loss,l_out) updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR, epsilon = 1e-8) ### can choose different methods 
to update updates = lab.clipping_scaling(updates,l_out) # other parameters updates params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False) updates = OrderedDict(updates.items() + optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR, epsilon = 1e-8).items()) ## update 2 momentum updates3 = OrderedDict() acc_tag = lasagne.layers.get_all_params(l_out, acc=True) idx = 0 beta2 = 0.999 for acc_tag_temp in acc_tag: # updates3[acc_tag_temp]=updates.keys()[idx] updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2) idx = idx+1 updates = OrderedDict(updates.items() + updates3.items()) else: params_other = lasagne.layers.get_all_params(l_out, trainable=True) W_grads = [theano.grad(loss, wrt=l_forward_2.W_in_to_ingate), theano.grad(loss, wrt=l_forward_2.W_hid_to_ingate), theano.grad(loss, wrt=l_forward_2.W_in_to_fotgetgate),theano.grad(loss, wrt=l_forward_2.W_hid_to_forgetgate), theano.grad(loss, wrt=l_forward_2.W_in_to_cell),theano.grad(loss, wrt=l_forward_2.W_hid_to_cell), theano.grad(loss, wrt=l_forward_2.W_in_to_outgate),theano.grad(loss, wrt=l_forward_2.W_hid_to_outgate)] updates = optimizer.adam(loss_or_grads=loss, params=params_other, learning_rate=LR) test_output = lasagne.layers.get_output(l_out, deterministic=True) test_loss = T.nnet.categorical_crossentropy(test_output,target.flatten()).mean() train_fn = theano.function([l_in.input_var, target, LR], [loss, W_grads[5]], updates=updates, allow_input_downcast=True) val_fn = theano.function([l_in.input_var, target], test_loss, allow_input_downcast=True) probs = theano.function([l_in.input_var],lasagne.layers.get_output(l_out1), allow_input_downcast=True) print('Training...') lab.train( name, method, train_fn,val_fn, batch_size, SEQ_LENGTH, N_HIDDEN, LR_start,LR_decay, num_epochs, train_X, val_X, test_X)
def build_model(self, lr=0.001, dropout=None):
    def concatenate(tensor_list, axis=0):
        concat_size = sum(tt.shape[axis] for tt in tensor_list)
        output_shape = ()
        for k in range(axis):
            output_shape += (tensor_list[0].shape[k],)
        output_shape += (concat_size,)
        for k in range(axis + 1, tensor_list[0].ndim):
            output_shape += (tensor_list[0].shape[k],)

        out = T.zeros(output_shape)
        offset = 0
        for tt in tensor_list:
            indices = ()
            for k in range(axis):
                indices += (slice(None),)
            indices += (slice(offset, offset + tt.shape[axis]),)
            for k in range(axis + 1, tensor_list[0].ndim):
                indices += (slice(None),)
            out = T.set_subtensor(out[indices], tt)
            offset += tt.shape[axis]
        return out

    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.matrix('x', dtype='int32')
    x_mask = T.matrix('x_mask', dtype='float32')
    y = T.matrix('y', dtype='int32')
    img = T.matrix('img', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    init_state = T.dot(img, self.W_img_emb) + self.b_img_emb

    emb = self.W_emb[x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, self.dim_word])

    xr = x[::-1]
    xr_mask = x_mask[::-1]
    embr = self.W_emb[xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, self.dim_word])

    proj = self.gru_layer(emb, init_state, mask=x_mask)
    projr = self.gru_layer(embr, init_state, mask=xr_mask)
    proj = concatenate([proj, projr[::-1]], axis=proj.ndim - 1)

    # mean of the hidden states
    proj = (proj * x_mask[:, :, None]).sum(axis=0)
    proj = proj / x_mask.sum(axis=0)[:, None]  # sample * dim

    # last hidden state
    #proj = proj[-1]  # sample * dim

    if dropout is not None:
        proj = dropout_layer(proj, use_noise, trng, dropout)

    output = T.dot(proj, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    ## avoid NaN
    epsilon = 1.0e-9
    probs = T.clip(probs, epsilon, 1.0 - epsilon)
    probs /= probs.sum(axis=-1, keepdims=True)
    ## avoid NaN

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return trng, use_noise, x, x_mask, img, y, cost, updates, prediction
def main(method, LR_start, Binarize_weight_only): name = "svhn" print("dataset = " + str(name)) print("Binarize_weight_only=" + str(Binarize_weight_only)) print("Method = " + str(method)) # alpha is the exponential moving average factor alpha = .1 print("alpha = " + str(alpha)) epsilon = 1e-4 print("epsilon = " + str(epsilon)) # Training parameters batch_size = 50 print("batch_size = " + str(batch_size)) num_epochs = 50 print("num_epochs = " + str(num_epochs)) print("LR_start = " + str(LR_start)) LR_decay = 0.1 print("LR_decay=" + str(LR_decay)) # BTW, LR decay might good for the BN moving average... if Binarize_weight_only == "w": activation = lasagne.nonlinearities.rectify else: activation = lab.binary_tanh_unit print("activation = " + str(activation)) ## number of filters in the first convolutional layer K = 64 print("K=" + str(K)) print('Building the CNN...') # Prepare Theano variables for inputs and targets input = T.tensor4('inputs') target = T.matrix('targets') LR = T.scalar('LR', dtype=theano.config.floatX) l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input) # 128C3-128C3-P2 l_cnn1 = lab.Conv2DLayer(l_in, num_filters=K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha) l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation) l_cnn2 = lab.Conv2DLayer(l_nl1, num_filters=K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2)) l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha) l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation) # 256C3-256C3-P2 l_cnn3 = lab.Conv2DLayer(l_nl2, num_filters=2 * K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha) l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation) l_cnn4 = lab.Conv2DLayer(l_nl3, num_filters=2 * K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2)) l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha) l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation) # 512C3-512C3-P2 l_cnn5 = lab.Conv2DLayer(l_nl4, num_filters=4 * K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha) l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation) l_cnn6 = lab.Conv2DLayer(l_nl5, num_filters=4 * K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2)) l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha) l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation) # print(cnn.output_shape) # 1024FP-1024FP-10FP l_dn1 = lab.DenseLayer(l_nl6, nonlinearity=lasagne.nonlinearities.identity, num_units=1024, method=method) l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha) l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation) l_dn2 = lab.DenseLayer(l_nl7, nonlinearity=lasagne.nonlinearities.identity, num_units=1024, method=method) l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha) l_nl8 = 
lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation) l_dn3 = lab.DenseLayer(l_nl8, nonlinearity=lasagne.nonlinearities.identity, num_units=10, method=method) l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha) train_output = lasagne.layers.get_output(l_out, deterministic=False) # squared hinge loss loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output))) if method != "FPN": # W updates W = lasagne.layers.get_all_params(l_out, binary=True) W_grads = lab.compute_grads(loss, l_out) updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR) updates = lab.clipping_scaling(updates, l_out) # other parameters updates params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False) updates = OrderedDict(updates.items() + optimizer.adam( loss_or_grads=loss, params=params, learning_rate=LR).items()) ## update 2nd moment, can get from the adam optimizer also updates3 = OrderedDict() acc_tag = lasagne.layers.get_all_params(l_out, acc=True) idx = 0 beta2 = 0.999 for acc_tag_temp in acc_tag: updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[ idx] * W_grads[idx] * (1 - beta2) idx = idx + 1 updates = OrderedDict(updates.items() + updates3.items()) else: params = lasagne.layers.get_all_params(l_out, trainable=True) updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR) test_output = lasagne.layers.get_output(l_out, deterministic=True) test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output))) test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving the updates dictionary) # and returning the corresponding training loss: train_fn = theano.function([input, target, LR], loss, updates=updates) val_fn = theano.function([input, target], [test_loss, test_err]) ## load data print('Loading SVHN dataset') train_set = SVHN( which_set='splitted_train', # which_set= 'valid', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) valid_set = SVHN(which_set='valid', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) test_set = SVHN(which_set='test', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) # bc01 format # print train_set.X.shape train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32)) valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32)) test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32)) train_set.y = np.array(train_set.y).flatten() valid_set.y = np.array(valid_set.y).flatten() test_set.y = np.array(test_set.y).flatten() # Onehot the targets train_set.y = np.float32(np.eye(10)[train_set.y]) valid_set.y = np.float32(np.eye(10)[valid_set.y]) test_set.y = np.float32(np.eye(10)[test_set.y]) # for hinge loss train_set.y = 2 * train_set.y - 1. valid_set.y = 2 * valid_set.y - 1. test_set.y = 2 * test_set.y - 1. print('Training...') # ipdb.set_trace() lab.train(name, method, train_fn, val_fn, batch_size, LR_start, LR_decay, num_epochs, train_set.X, train_set.y, valid_set.X, valid_set.y, test_set.X, test_set.y)
def main(method, LR_start): name = "svhn" print("dataset = " + str(name)) print("Method = " + str(method)) # alpha is the exponential moving average factor alpha = .1 print("alpha = " + str(alpha)) epsilon = 1e-4 print("epsilon = " + str(epsilon)) # Training parameters batch_size = 50 print("batch_size = " + str(batch_size)) num_epochs = 50 print("num_epochs = " + str(num_epochs)) print("LR_start = " + str(LR_start)) LR_decay = 0.1 print("LR_decay=" + str(LR_decay)) # BTW, LR decay might good for the BN moving average... activation = lasagne.nonlinearities.rectify # number of filters in the first convolutional layer K = 64 print("K=" + str(K)) print('Building the CNN...') # Prepare Theano variables for inputs and targets input = T.tensor4('inputs') target = T.matrix('targets') LR = T.scalar('LR', dtype=theano.config.floatX) l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input) # 128C3-128C3-P2 l_cnn1 = laq.Conv2DLayer(l_in, num_filters=K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha) l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation) l_cnn2 = laq.Conv2DLayer(l_nl1, num_filters=K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2)) l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha) l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation) # 256C3-256C3-P2 l_cnn3 = laq.Conv2DLayer(l_nl2, num_filters=2 * K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha) l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation) l_cnn4 = laq.Conv2DLayer(l_nl3, num_filters=2 * K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2)) l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha) l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation) # 512C3-512C3-P2 l_cnn5 = laq.Conv2DLayer(l_nl4, num_filters=4 * K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha) l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation) l_cnn6 = laq.Conv2DLayer(l_nl5, num_filters=4 * K, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method=method) l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2)) l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha) l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation) # print(cnn.output_shape) # 1024FP-1024FP-10FP l_dn1 = laq.DenseLayer(l_nl6, nonlinearity=lasagne.nonlinearities.identity, num_units=1024, method=method) l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha) l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation) l_dn2 = laq.DenseLayer(l_nl7, nonlinearity=lasagne.nonlinearities.identity, num_units=1024, method=method) l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha) l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation) l_dn3 = laq.DenseLayer(l_nl8, nonlinearity=lasagne.nonlinearities.identity, num_units=10, method=method) l_out = batch_norm.BatchNormLayer(l_dn3, 
epsilon=epsilon, alpha=alpha) train_output = lasagne.layers.get_output(l_out, deterministic=False) # squared hinge loss loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output))) if method != "FPN": # W updates W = lasagne.layers.get_all_params(l_out, quantized=True) W_grads = laq.compute_grads(loss, l_out) updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR) updates = laq.clipping_scaling(updates, l_out) # other parameters updates params = lasagne.layers.get_all_params(l_out, trainable=True, quantized=False) updates = OrderedDict(updates.items() + optimizer.adam( loss_or_grads=loss, params=params, learning_rate=LR).items()) ## update 2nd moment, can get from the adam optimizer also ternary_weights = laq.get_quantized_weights(loss, l_out) updates2 = OrderedDict() idx = 0 tt_tag = lasagne.layers.get_all_params(l_out, tt=True) for tt_tag_temp in tt_tag: updates2[tt_tag_temp] = ternary_weights[idx] idx = idx + 1 updates = OrderedDict(updates.items() + updates2.items()) ## update 2nd momentum updates3 = OrderedDict() acc_tag = lasagne.layers.get_all_params(l_out, acc=True) idx = 0 beta2 = 0.999 for acc_tag_temp in acc_tag: updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[ idx] * W_grads[idx] * (1 - beta2) idx = idx + 1 updates = OrderedDict(updates.items() + updates3.items()) else: params = lasagne.layers.get_all_params(l_out, trainable=True) updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR) test_output = lasagne.layers.get_output(l_out, deterministic=True) test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output))) test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX) train_fn = theano.function([input, target, LR], loss, updates=updates) val_fn = theano.function([input, target], [test_loss, test_err]) ## load data print('Loading SVHN dataset') train_set = SVHN( which_set='splitted_train', # which_set= 'valid', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) valid_set = SVHN(which_set='valid', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) test_set = SVHN(which_set='test', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) # bc01 format # print train_set.X.shape train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32)) valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32)) test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32)) train_set.y = np.array(train_set.y).flatten() valid_set.y = np.array(valid_set.y).flatten() test_set.y = np.array(test_set.y).flatten() # Onehot the targets train_set.y = np.float32(np.eye(10)[train_set.y]) valid_set.y = np.float32(np.eye(10)[valid_set.y]) test_set.y = np.float32(np.eye(10)[test_set.y]) # for hinge loss train_set.y = 2 * train_set.y - 1. valid_set.y = 2 * valid_set.y - 1. test_set.y = 2 * test_set.y - 1. 
print('Training...') X_train = train_set.X y_train = train_set.y X_val = valid_set.X y_val = valid_set.y X_test = test_set.X y_test = test_set.y # This function trains the model a full epoch (on the whole dataset) def train_epoch(X, y, LR): loss = 0 batches = len(X) / batch_size # move shuffle here to save memory # k = 5 # batches = int(batches/k)*k shuffled_range = range(len(X)) np.random.shuffle(shuffled_range) for i in range(batches): tmp_ind = shuffled_range[i * batch_size:(i + 1) * batch_size] newloss = train_fn(X[tmp_ind], y[tmp_ind], LR) loss += newloss loss /= batches return loss # This function tests the model a full epoch (on the whole dataset) def val_epoch(X, y): err = 0 loss = 0 batches = len(X) / batch_size for i in range(batches): new_loss, new_err = val_fn(X[i * batch_size:(i + 1) * batch_size], y[i * batch_size:(i + 1) * batch_size]) err += new_err loss += new_loss err = err / batches * 100 loss /= batches return err, loss best_val_err = 100 best_epoch = 1 LR = LR_start # We iterate over epochs: for epoch in range(1, num_epochs + 1): start_time = time.time() train_loss = train_epoch(X_train, y_train, LR) val_err, val_loss = val_epoch(X_val, y_val) # test if validation error went down if val_err <= best_val_err: best_val_err = val_err best_epoch = epoch test_err, test_loss = val_epoch(X_test, y_test) epoch_duration = time.time() - start_time # Then we print the results for this epoch: print("Epoch " + str(epoch) + " of " + str(num_epochs) + " took " + str(epoch_duration) + "s") print(" LR: " + str(LR)) print(" training loss: " + str(train_loss)) print(" validation loss: " + str(val_loss)) print(" validation error rate: " + str(val_err) + "%") print(" best epoch: " + str(best_epoch)) print(" best validation error rate: " + str(best_val_err) + "%") print(" test loss: " + str(test_loss)) print(" test error rate: " + str(test_err) + "%") with open( "{0}/{1}_lr{2}_{3}.txt".format(method, name, LR_start, method), "a") as myfile: myfile.write( "{0} {1:.5f} {2:.5f} {3:.5f} {4:.5f} {5:.5f} {6:.5f} {7:.5f}\n" .format(epoch, train_loss, val_loss, test_loss, val_err, test_err, epoch_duration, LR)) ## Learning rate update scheme if epoch == 15 or epoch == 25: LR *= LR_decay
def main(method,LR_start): name = "cifar100" print("dataset = "+str(name)) print("Method = "+str(method)) # alpha is the exponential moving average factor alpha = .1 print("alpha = "+str(alpha)) epsilon = 1e-4 print("epsilon = "+str(epsilon)) # Training parameters batch_size = 100 print("batch_size = "+str(batch_size)) num_epochs = 200 print("num_epochs = "+str(num_epochs)) print("LR_start = "+str(LR_start)) LR_decay = 0.5 print("LR_decay="+str(LR_decay)) activation = lasagne.nonlinearities.rectify train_set_size = 45000 print("train_set_size = "+str(train_set_size)) print('Loading CIFAR-100 dataset...') preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened/preprocessor.pkl") train_set = ZCA_Dataset( preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened/train.pkl"), preprocessor = preprocessor, start=0, stop = train_set_size) valid_set = ZCA_Dataset( preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened/train.pkl"), preprocessor = preprocessor, start=45000, stop = 50000) test_set = ZCA_Dataset( preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened/test.pkl"), preprocessor = preprocessor) # bc01 format train_set.X = train_set.X.reshape(-1,3,32,32) valid_set.X = valid_set.X.reshape(-1,3,32,32) test_set.X = test_set.X.reshape(-1,3,32,32) # flatten targets train_set.y = np.int32(np.hstack(train_set.y)) valid_set.y = np.int32(np.hstack(valid_set.y)) test_set.y = np.int32(np.hstack(test_set.y)) print('Building the CNN...') # Prepare Theano variables for inputs and targets input = T.tensor4('inputs') target = T.ivector('targets') LR = T.scalar('LR', dtype=theano.config.floatX) l_in = lasagne.layers.InputLayer( shape=(None, 3, 32, 32), input_var=input) # 128C3-128C3-P2 l_cnn1 = laq.Conv2DLayer( l_in, num_filters=128, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method = method) l_bn1 = batch_norm.BatchNormLayer( l_cnn1, epsilon=epsilon, alpha=alpha) l_nl1 = lasagne.layers.NonlinearityLayer( l_bn1, nonlinearity = activation) l_cnn2 = laq.Conv2DLayer( l_nl1, num_filters=128, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method = method) l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2)) l_bn2 = batch_norm.BatchNormLayer( l_mp1, epsilon=epsilon, alpha=alpha) l_nl2 = lasagne.layers.NonlinearityLayer( l_bn2, nonlinearity = activation) # 256C3-256C3-P2 l_cnn3 = laq.Conv2DLayer( l_nl2, num_filters=256, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method = method) l_bn3 = batch_norm.BatchNormLayer( l_cnn3, epsilon=epsilon, alpha=alpha) l_nl3 = lasagne.layers.NonlinearityLayer( l_bn3, nonlinearity = activation) l_cnn4 = laq.Conv2DLayer( l_nl3, num_filters=256, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method = method) l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2)) l_bn4 = batch_norm.BatchNormLayer( l_mp2, epsilon=epsilon, alpha=alpha) l_nl4 = lasagne.layers.NonlinearityLayer( l_bn4, nonlinearity = activation) # 512C3-512C3-P2 l_cnn5 = laq.Conv2DLayer( l_nl4, num_filters=512, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method = method) l_bn5 = batch_norm.BatchNormLayer( l_cnn5, epsilon=epsilon, alpha=alpha) l_nl5 = lasagne.layers.NonlinearityLayer( l_bn5, nonlinearity = activation) l_cnn6 = laq.Conv2DLayer( l_nl5, num_filters=512, filter_size=(3, 3), pad=1, nonlinearity=lasagne.nonlinearities.identity, method = 
method) l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2)) l_bn6 = batch_norm.BatchNormLayer( l_mp3, epsilon=epsilon, alpha=alpha) l_nl6 = lasagne.layers.NonlinearityLayer( l_bn6, nonlinearity = activation) # print(cnn.output_shape) # 1024FP-1024FP-10FP l_dn1 = laq.DenseLayer( l_nl6, nonlinearity=lasagne.nonlinearities.identity, num_units=1024, method = method) l_bn7 = batch_norm.BatchNormLayer( l_dn1, epsilon=epsilon, alpha=alpha) l_nl7 = lasagne.layers.NonlinearityLayer( l_bn7, nonlinearity = activation) l_dn2 = laq.DenseLayer( l_nl7, nonlinearity=lasagne.nonlinearities.identity, num_units=1024, method = method) l_bn8 = batch_norm.BatchNormLayer( l_dn2, epsilon=epsilon, alpha=alpha) l_nl8 = lasagne.layers.NonlinearityLayer( l_bn8, nonlinearity = activation) l_dn3 = laq.DenseLayer( l_nl8, nonlinearity=lasagne.nonlinearities.identity, num_units=100, method = method) l_out = lasagne.layers.NonlinearityLayer(l_dn3, nonlinearity=lasagne.nonlinearities.softmax) train_output = lasagne.layers.get_output(l_out, deterministic=False) loss = categorical_crossentropy(train_output, target).mean() if method!="FPN": # W updates W = lasagne.layers.get_all_params(l_out, quantized=True) W_grads = laq.compute_grads(loss,l_out) updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR) updates = laq.clipping_scaling(updates,l_out) # other parameters updates params = lasagne.layers.get_all_params(l_out, trainable=True, quantized=False) updates = OrderedDict(updates.items() + optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR).items()) ## update 2nd moment, can get from the adam optimizer also ternary_weights = laq.get_quantized_weights(loss, l_out) updates2 = OrderedDict() idx = 0 tt_tag = lasagne.layers.get_all_params(l_out, tt=True) for tt_tag_temp in tt_tag: updates2[tt_tag_temp]= ternary_weights[idx] idx = idx+1 updates = OrderedDict(updates.items() + updates2.items()) ## update 2nd momentum updates3 = OrderedDict() acc_tag = lasagne.layers.get_all_params(l_out, acc=True) idx = 0 beta2 = 0.999 for acc_tag_temp in acc_tag: updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2) idx = idx+1 updates = OrderedDict(updates.items() + updates3.items()) else: params = lasagne.layers.get_all_params(l_out, trainable=True) updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR) test_output = lasagne.layers.get_output(l_out, deterministic=True) test_loss = categorical_crossentropy(test_output, target).mean() test_err = T.mean(T.neq(T.argmax(test_output, axis=1), target),dtype=theano.config.floatX) train_fn = theano.function([input, target, LR], loss, updates=updates) val_fn = theano.function([input, target], [test_loss, test_err]) print('Training...') X_train = train_set.X y_train = train_set.y X_val = valid_set.X y_val = valid_set.y X_test = test_set.X y_test = test_set.y # This function trains the model a full epoch (on the whole dataset) def train_epoch(X,y,LR): loss = 0 batches = len(X)/batch_size shuffled_range = range(len(X)) np.random.shuffle(shuffled_range) for i in range(batches): tmp_ind = shuffled_range[i*batch_size:(i+1)*batch_size] newloss = train_fn(X[tmp_ind],y[tmp_ind],LR) loss +=newloss loss/=batches return loss # This function tests the model a full epoch (on the whole dataset) def val_epoch(X,y): err = 0 loss = 0 batches = len(X)/batch_size for i in range(batches): new_loss, new_err = val_fn(X[i*batch_size:(i+1)*batch_size], y[i*batch_size:(i+1)*batch_size]) err += new_err loss += new_loss err = err / 
batches * 100 loss /= batches return err, loss best_val_err = 100 best_epoch = 1 LR = LR_start # We iterate over epochs: for epoch in range(1, num_epochs+1): start_time = time.time() train_loss = train_epoch(X_train,y_train,LR) val_err, val_loss = val_epoch(X_val,y_val) # test if validation error went down if val_err <= best_val_err: best_val_err = val_err best_epoch = epoch test_err, test_loss = val_epoch(X_test,y_test) epoch_duration = time.time() - start_time # Then we print the results for this epoch: print("Epoch "+str(epoch)+" of "+str(num_epochs)+" took "+str(epoch_duration)+"s") print(" LR: "+str(LR)) print(" training loss: "+str(train_loss)) print(" validation loss: "+str(val_loss)) print(" validation error rate: "+str(val_err)+"%") print(" best epoch: "+str(best_epoch)) print(" best validation error rate: "+str(best_val_err)+"%") print(" test loss: "+str(test_loss)) print(" test error rate: "+str(test_err)+"%") with open("{0}/{1}_lr{2}_{3}.txt".format(method, name, LR_start, method), "a") as myfile: myfile.write("{0} {1:.5f} {2:.5f} {3:.5f} {4:.5f} {5:.5f} {6:.5f} {7:.5f}\n".format(epoch, train_loss, val_loss, test_loss, val_err, test_err, epoch_duration, LR)) if epoch % 15 ==0: LR*=LR_decay
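# For reference: a minimal NumPy sketch of the standard Adam update rule that the custom
# optimizer.adam used above is assumed to implement. The manual
# acc*beta2 + grad*grad*(1-beta2) rule kept in updates3 mirrors the second-moment estimate v below.
# All names here (adam_step, m, v) are illustrative and not part of the repository.
import numpy as np

def adam_step(param, grad, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    # first moment: exponential moving average of the gradient
    m = beta1 * m + (1 - beta1) * grad
    # second moment: exponential moving average of the squared gradient
    v = beta2 * v + (1 - beta2) * grad * grad
    # bias-corrected estimates (t is the 1-based step count)
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    # parameter update
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v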
def main(method, LR_start):

    # BN parameters
    name = "mnist"
    print("dataset = " + str(name))
    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    batch_size = 100
    print("batch_size = " + str(batch_size))
    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    # network structure
    num_units = 2048
    print("num_units = " + str(num_units))
    n_hidden_layers = 3
    print("n_hidden_layers = " + str(n_hidden_layers))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay=" + str(LR_decay))

    activation = lasagne.nonlinearities.rectify

    print('Loading MNIST dataset...')

    train_set = MNIST(which_set='train', start=0, stop=50000, center=True)
    valid_set = MNIST(which_set='train', start=50000, stop=60000, center=True)
    test_set = MNIST(which_set='test', center=True)

    # bc01 format
    train_set.X = train_set.X.reshape(-1, 1, 28, 28)
    valid_set.X = valid_set.X.reshape(-1, 1, 28, 28)
    test_set.X = test_set.X.reshape(-1, 1, 28, 28)

    # flatten targets
    train_set.y = np.hstack(train_set.y)
    valid_set.y = np.hstack(valid_set.y)
    test_set.y = np.hstack(test_set.y)

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input)

    for k in range(n_hidden_layers):
        mlp = laq.DenseLayer(
            mlp, nonlinearity=lasagne.nonlinearities.identity,
            num_units=num_units, method=method)
        mlp = batch_norm.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

    mlp = laq.DenseLayer(
        mlp, nonlinearity=lasagne.nonlinearities.identity,
        num_units=10, method=method)
    mlp = batch_norm.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(mlp, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(mlp, quantized=True)
        W_grads = laq.compute_grads(loss, mlp)
        updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = laq.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp, trainable=True, quantized=False)
        updates = OrderedDict(updates.items() + optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR, epsilon=1e-8).items())

        ## update the ternary matrix
        ternary_weights = laq.get_quantized_weights(loss, mlp)
        updates2 = OrderedDict()
        idx = 0
        tt_tag = lasagne.layers.get_all_params(mlp, tt=True)
        for tt_tag_temp in tt_tag:
            updates2[tt_tag_temp] = ternary_weights[idx]
            idx = idx + 1
        updates = OrderedDict(updates.items() + updates2.items())

        ## update 2nd momentum
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(mlp, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[idx] * W_grads[idx] * (1 - beta2)
            idx = idx + 1
        updates = OrderedDict(updates.items() + updates3.items())
    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR)

    test_output = lasagne.layers.get_output(mlp, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    X_train = train_set.X
    y_train = train_set.y
    X_val = valid_set.X
    y_val = valid_set.y
    X_test = test_set.X
    y_test = test_set.y

    # This function trains the model a full epoch (on the whole dataset)
    def train_epoch(X, y, LR):
        loss = 0
        batches = len(X) / batch_size
        shuffled_range = range(len(X))
        np.random.shuffle(shuffled_range)
        for i in range(batches):
            tmp_ind = shuffled_range[i * batch_size:(i + 1) * batch_size]
            newloss = train_fn(X[tmp_ind], y[tmp_ind], LR)
            loss += newloss
        loss /= batches
        return loss

    # This function tests the model a full epoch (on the whole dataset)
    def val_epoch(X, y):
        err = 0
        loss = 0
        batches = len(X) / batch_size
        for i in range(batches):
            new_loss, new_err = val_fn(X[i * batch_size:(i + 1) * batch_size],
                                       y[i * batch_size:(i + 1) * batch_size])
            err += new_err
            loss += new_loss
        err = err / batches * 100
        loss /= batches
        return err, loss

    best_val_err = 100
    best_epoch = 1
    LR = LR_start

    # We iterate over epochs:
    for epoch in range(1, num_epochs + 1):
        start_time = time.time()
        train_loss = train_epoch(X_train, y_train, LR)
        val_err, val_loss = val_epoch(X_val, y_val)

        # test if validation error went down
        if val_err <= best_val_err:
            best_val_err = val_err
            best_epoch = epoch
            test_err, test_loss = val_epoch(X_test, y_test)
            all_params = lasagne.layers.get_all_params(mlp)
            np.savez('{0}/{1}_lr{2}_{3}.npz'.format(method, name, LR_start, method), *all_params)

        epoch_duration = time.time() - start_time

        # Then we print the results for this epoch:
        print("Epoch " + str(epoch) + " of " + str(num_epochs) + " took " + str(epoch_duration) + "s")
        print(" LR: " + str(LR))
        print(" training loss: " + str(train_loss))
        print(" validation loss: " + str(val_loss))
        print(" validation error rate: " + str(val_err) + "%")
        print(" best epoch: " + str(best_epoch))
        print(" best validation error rate: " + str(best_val_err) + "%")
        print(" test loss: " + str(test_loss))
        print(" test error rate: " + str(test_err) + "%")

        with open("{0}/{1}_lr{2}_{3}.txt".format(method, name, LR_start, method), "a") as myfile:
            myfile.write("{0} {1:.5f} {2:.5f} {3:.5f} {4:.5f} {5:.5f} {6:.5f} {7:.5f}\n".format(
                epoch, train_loss, val_loss, test_loss, val_err, test_err, epoch_duration, LR))

        # Learning rate update scheme
        if epoch == 15 or epoch == 25:
            LR *= LR_decay
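# A small, self-contained NumPy check of the squared hinge loss used above, assuming the
# same encoding as in the script: one-hot labels mapped to +/-1. The arrays below are toy
# stand-ins, not data from the repository.
import numpy as np

toy_targets = 2 * np.float32(np.eye(10)[np.array([3, 7])]) - 1.   # shape (2, 10), entries in {-1, +1}
toy_outputs = np.random.randn(2, 10).astype(np.float32)           # stand-in for the MLP output
toy_loss = np.mean(np.square(np.maximum(0., 1. - toy_targets * toy_outputs)))
print("squared hinge loss on the toy batch:", toy_loss)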
def main(method, LR_start, Binarize_weight_only):

    name = "cifar"
    print("dataset = " + str(name))
    print("Binarize_weight_only=" + str(Binarize_weight_only))
    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    batch_size = 50
    print("batch_size = " + str(batch_size))
    num_epochs = 200
    print("num_epochs = " + str(num_epochs))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.5
    print("LR_decay=" + str(LR_decay))

    if Binarize_weight_only == "w":
        activation = lasagne.nonlinearities.rectify
    else:
        activation = lab.binary_tanh_unit
    print("activation = " + str(activation))

    train_set_size = 45000
    print("train_set_size = " + str(train_set_size))

    print('Loading CIFAR-10 dataset...')

    preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl")
    train_set = ZCA_Dataset(
        preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
        preprocessor=preprocessor,
        start=0, stop=train_set_size)
    valid_set = ZCA_Dataset(
        preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
        preprocessor=preprocessor,
        start=45000, stop=50000)
    test_set = ZCA_Dataset(
        preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/test.pkl"),
        preprocessor=preprocessor)

    # bc01 format
    train_set.X = train_set.X.reshape(-1, 3, 32, 32)
    valid_set.X = valid_set.X.reshape(-1, 3, 32, 32)
    test_set.X = test_set.X.reshape(-1, 3, 32, 32)

    # flatten targets
    train_set.y = np.hstack(train_set.y)
    valid_set.y = np.hstack(valid_set.y)
    test_set.y = np.hstack(test_set.y)

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    # 128C3-128C3-P2
    l_cnn1 = lab.Conv2DLayer(
        l_in, num_filters=128, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha)
    l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation)

    l_cnn2 = lab.Conv2DLayer(
        l_nl1, num_filters=128, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))
    l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha)
    l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation)

    # 256C3-256C3-P2
    l_cnn3 = lab.Conv2DLayer(
        l_nl2, num_filters=256, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha)
    l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation)

    l_cnn4 = lab.Conv2DLayer(
        l_nl3, num_filters=256, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))
    l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha)
    l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation)

    # 512C3-512C3-P2
    l_cnn5 = lab.Conv2DLayer(
        l_nl4, num_filters=512, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha)
    l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation)

    l_cnn6 = lab.Conv2DLayer(
        l_nl5, num_filters=512, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))
    l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha)
    l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation)

    # print(cnn.output_shape)

    # 1024FP-1024FP-10FP
    l_dn1 = lab.DenseLayer(
        l_nl6, nonlinearity=lasagne.nonlinearities.identity,
        num_units=1024, method=method)
    l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha)
    l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation)

    l_dn2 = lab.DenseLayer(
        l_nl7, nonlinearity=lasagne.nonlinearities.identity,
        num_units=1024, method=method)
    l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha)
    l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation)

    l_dn3 = lab.DenseLayer(
        l_nl8, nonlinearity=lasagne.nonlinearities.identity,
        num_units=10, method=method)
    l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(l_out, binary=True)
        W_grads = lab.compute_grads(loss, l_out)
        updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = lab.clipping_scaling(updates, l_out)

        # other parameters updates
        params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False)
        updates = OrderedDict(updates.items() + optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items())

        ## update 2nd moment, can get from the adam optimizer also
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(l_out, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[idx] * W_grads[idx] * (1 - beta2)
            idx = idx + 1
        updates = OrderedDict(updates.items() + updates3.items())
    else:
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR)

    test_output = lasagne.layers.get_output(l_out, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    lab.train(
        name, method,
        train_fn, val_fn,
        batch_size,
        LR_start, LR_decay,
        num_epochs,
        train_set.X, train_set.y,
        valid_set.X, valid_set.y,
        test_set.X, test_set.y)
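# Note: OrderedDict(d1.items() + d2.items()), as used in the scripts above, relies on Python 2,
# where dict.items() returns a list. A sketch of an equivalent merge that also works under
# Python 3 (the helper name merge_updates is illustrative, not part of the repository):
from collections import OrderedDict

def merge_updates(d1, d2):
    merged = OrderedDict(d1)
    merged.update(d2)   # entries of d2 win on duplicate keys, matching the list concatenation
    return merged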
def build_model(self, lr=0.001, dropout=None):

    def concatenate(tensor_list, axis=0):
        concat_size = sum(tt.shape[axis] for tt in tensor_list)
        output_shape = ()
        for k in range(axis):
            output_shape += (tensor_list[0].shape[k], )
        output_shape += (concat_size, )
        for k in range(axis + 1, tensor_list[0].ndim):
            output_shape += (tensor_list[0].shape[k], )
        out = T.zeros(output_shape)
        offset = 0
        for tt in tensor_list:
            indices = ()
            for k in range(axis):
                indices += (slice(None), )
            indices += (slice(offset, offset + tt.shape[axis]), )
            for k in range(axis + 1, tensor_list[0].ndim):
                indices += (slice(None), )
            out = T.set_subtensor(out[indices], tt)
            offset += tt.shape[axis]
        return out

    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # description string: #words x #samples
    x = T.matrix('x', dtype='int32')               # step * samples
    x_mask = T.matrix('x_mask', dtype='float32')   # step * samples
    y = T.matrix('y', dtype='int32')               # sample * emb
    ctx = T.tensor3('ctx', dtype='float32')        # sample * annotation * dim

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    xr = x[::-1]
    xr_mask = x_mask[::-1]

    emb = self.W_emb[x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
    embr = self.W_emb[xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, self.dim_word])

    ctx0 = ctx
    ctx_mean = ctx0.mean(1)
    init_state = T.dot(ctx_mean, self.W_ctx_init) + self.b_ctx_init

    # proj: list of GRU hidden states
    proj = self.gru_layer(emb, mask=x_mask, context=ctx, init_state=init_state)
    proj_h = proj[0]
    projr = self.gru_layer(embr, mask=xr_mask, context=ctx, init_state=init_state)
    projr_h = projr[0]

    # step_ctx: step * samples * (dim*2)
    concat_proj_h = concatenate([proj_h, projr_h[::-1]], axis=proj_h.ndim - 1)
    # step_ctx_mean: samples * (dim*2)
    concat_proj_h = (concat_proj_h * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    if dropout is not None:
        concat_proj_h = dropout_layer(concat_proj_h, use_noise, trng, dropout)

    output = T.dot(concat_proj_h, self.W_pred) + self.b_pred
    probs = T.nnet.softmax(output)
    prediction = probs.argmax(axis=1)

    ## avoid NaN
    epsilon = 1.0e-9
    probs = T.clip(probs, epsilon, 1.0 - epsilon)
    probs /= probs.sum(axis=-1, keepdims=True)
    ## avoid NaN

    cost = T.nnet.categorical_crossentropy(probs, y)
    cost = T.mean(cost)

    updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

    return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
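# Why the clipping marked "avoid NaN" above helps: if a softmax probability underflows to
# exactly 0, the cross-entropy evaluates log(0) and the cost becomes non-finite. A
# self-contained NumPy illustration with toy values (not data from the repository):
import numpy as np

eps = 1.0e-9
probs = np.array([[1.0, 0.0, 0.0]])              # degenerate softmax output
label = 2                                        # true class received probability 0
naive_nll = -np.log(probs[0, label])             # inf
probs = np.clip(probs, eps, 1.0 - eps)
probs /= probs.sum(axis=-1, keepdims=True)
clipped_nll = -np.log(probs[0, label])           # large but finite
print(naive_nll, clipped_nll)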