import time

import numpy as np


def train(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_acc_batch = []  # per-batch accuracy
    train_loss = []
    val_loss = []
    val_acc = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    time_since_improvement = 0
    for epoch in range(cfg.max_epochs):
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        for batch_num in range(tot_batches):
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            feed_dict = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.lr: cfg.learning_rate,
                model.use_past_bt: False,
                model.input_past_bt: np.zeros((len(batch_inds),
                                               model.cfg.input_height,
                                               model.cfg.input_width,
                                               model.cfg.input_nchannels)),
                model.fc4_past_bt: np.zeros((len(batch_inds), 1000))
            }
            loss, acc, _ = model.sess.run(
                [model.loss, model.accuracy, model.train_op],
                feed_dict=feed_dict)
            train_loss_batch.append(loss)
            train_acc_batch.append(acc)
            print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f} train_acc:{:.3f}'.format(
                epoch + 1, batch_num + 1, train_loss_batch[-1],
                train_acc_batch[-1])
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        ## early stopping on the training loss
        if train_loss[-1] == min(train_loss):
            time_since_improvement = 0
        else:
            time_since_improvement += 1
        if time_since_improvement >= cfg.early_stopping:
            print 'early stopping. no improvement since {} epochs.'.format(
                cfg.early_stopping)
            break
        ## validation
        vl, va = validate(model, dataset)
        val_loss.append(vl)
        val_acc.append(va)
        save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
    return train_loss_batch, train_acc_batch, train_loss, val_loss, val_acc
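## `save_loss` is not defined in this file. A minimal sketch consistent with
## how it is called throughout (a list of values, a directory, a filename,
## and a `first_use` flag that starts a fresh file); the on-disk format used
## by the original helper is an assumption.
import os


def save_loss(values, save_dir, fname, first_use=False):
    mode = 'w' if first_use else 'a'  # truncate on first use, else append
    with open(os.path.join(save_dir, fname), mode) as f:
        for v in values:
            f.write('{}\n'.format(v))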
def train_autoencoder_vanilla(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_loss = []
    val_loss = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    ## sanity check on the input range
    print np.min(dataset.data['train_images'])
    print np.max(dataset.data['train_images'])
    print '=' * 100
    time_since_improvement = 0  # early stopping
    train_time = 0.0
    for epoch in range(cfg.max_epochs):
        st = time.time()
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        for batch_num in range(tot_batches):
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            feed_dict = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.lr: cfg.learning_rate,
            }
            loss, _ = model.sess.run([model.loss, model.train_op],
                                     feed_dict=feed_dict)
            train_loss_batch.append(loss)
            print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f}'.format(
                epoch + 1, batch_num + 1, train_loss_batch[-1])
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        ## train time
        train_time += time.time() - st
        print 'Total Train Time:', train_time
        ## validation
        vl, va = validate_autoencoder(model, dataset)
        val_loss.append(vl)
        save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
        ## early stopping
        time_since_improvement, early_stop = early_stopping(
            val_loss, time_since_improvement, cfg.early_stopping)
        if early_stop:
            break
    return train_loss, val_loss
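## `early_stopping` is also defined elsewhere. A minimal sketch matching its
## call signature here and the inline patience logic used in `train` above;
## the assumed behavior is: reset the counter on a new best validation loss,
## otherwise increment it, and signal a stop once it reaches the patience.
def early_stopping(val_loss, time_since_improvement, patience):
    if val_loss[-1] == min(val_loss):
        time_since_improvement = 0
    else:
        time_since_improvement += 1
    return time_since_improvement, time_since_improvement >= patience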
def train(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_acc_batch = []  # per-batch accuracy
    train_loss = []
    val_loss = []
    val_acc = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    save_loss([], save_dir, 'max_learning_rates.txt', first_use=True)
    save_loss([], save_dir, 'learning_rates.txt', first_use=True)
    alpha = 1e-2
    for epoch in range(cfg.max_epochs):
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        max_lr_epoch = []
        lr_epoch = []
        est = time.time()
        times = [[] for i in range(10)]  # per-stage timing buckets
        for batch_num in range(tot_batches):
            bst = time.time()
            st = time.time()
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            ## get f(x) and gradients
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: False,
                model.input_past_bt: np.zeros((len(batch_inds),
                                               model.cfg.input_height,
                                               model.cfg.input_width,
                                               model.cfg.input_nchannels)),
                model.fc4_past_bt: np.zeros((len(batch_inds), 1000))
            }
            loss, acc, grads, input_bt, fc4_bt = model.sess.run(
                [model.loss, model.accuracy, model.grads,
                 model.input_binary_tensor, model.fc4_binary_tensor],
                feed_dict=fd)
            fx = loss
            train_loss_batch.append(loss)
            train_acc_batch.append(acc)
            research_fd = {
                model.conv1_W_grad: grads[0],
                model.conv1_b_grad: grads[1],
                model.conv2_W_grad: grads[2],
                model.conv2_b_grad: grads[3],
                model.conv3_W_grad: grads[4],
                model.conv3_b_grad: grads[5],
                model.fc4_W_grad: grads[6],
                model.fc4_b_grad: grads[7],
                model.fc5_W_grad: grads[8],
                model.fc5_b_grad: grads[9],
            }
            times[0].append(time.time() - st)
            print 'fx and grads: ', time.time() - st
            st = time.time()
            gT_g = np.sum([np.sum(np.square(g)) for g in grads])
            times[1].append(time.time() - st)
            print 'gT_g: ', time.time() - st
            ## set fd to use old binary tensors
            st = time.time()
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: True,
                model.input_past_bt: input_bt,
                model.fc4_past_bt: fc4_bt
            }
            times[2].append(time.time() - st)
            print 'change fd: ', time.time() - st
            ## get f(x+alpha*g); change_weights_op applies w <- w - lr*grad
            st = time.time()
            research_fd[model.lr] = -alpha
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            times[3].append(time.time() - st)
            print 'change_weights_op: ', time.time() - st
            st = time.time()
            fx_plus_ag = model.sess.run(model.loss, feed_dict=fd)
            times[4].append(time.time() - st)
            print 'fx+: ', time.time() - st
            ## get f(x-alpha*g)
            st = time.time()
            research_fd[model.lr] = 2 * alpha
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            times[5].append(time.time() - st)
            print 'change_weights_op: ', time.time() - st
            st = time.time()
            fx_minus_ag = model.sess.run(model.loss, feed_dict=fd)
            times[6].append(time.time() - st)
            print 'fx-: ', time.time() - st
            ## choose learning rate
            st = time.time()
            gT_H_g = (fx_plus_ag + fx_minus_ag - 2 * fx) / (alpha**2)
            if not cfg.magic_2nd_order:
                max_lr = 2 * gT_g / np.abs(gT_H_g)
                lr = min(fx / gT_g, max_lr)
            else:
                ## 2nd order magic: root of 0.5*lr^2*gT_H_g - lr*gT_g + fx = 0
                if gT_g**2 - 2 * gT_H_g * fx > 0:
                    max_lr = lr = -(-gT_g + np.sqrt(
                        gT_g**2 - 2 * gT_H_g * fx)) / gT_H_g
                else:
                    max_lr = lr = -(-gT_g / gT_H_g)
            max_lr_epoch.append(max_lr)
            lr_epoch.append(lr)
            times[7].append(time.time() - st)
            print 'choose lr: ', time.time() - st
            ## print
            st = time.time()
            if True:
                print ''
                print 'alpha             : ', alpha
                print 'f(x)              : ', fx
                print 'f(x+alpha*g)      : ', fx_plus_ag
                print 'f(x-alpha*g)      : ', fx_minus_ag
                print 'f(x+)+f(x-)-2f(x) : ', fx_plus_ag + fx_minus_ag - 2 * fx
                print 'estimated (g.T)Hg : ', gT_H_g
                print '(g.T)g            : ', gT_g
                print 'max lr            : ', max_lr
                print 'lr                : ', lr
                print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f} train_acc:{:.3f}'.format(
                    epoch + 1, batch_num + 1, train_loss_batch[-1],
                    train_acc_batch[-1])
            times[8].append(time.time() - st)
            print 'printing: ', time.time() - st
            ## quit?
            st = time.time()
            if gT_H_g == 0.0:
                print 'gT_H_g==0.0, exiting'
                exit()
            ## update step: reset to x, then take the chosen step
            research_fd[model.lr] = -alpha + lr
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            ## update alpha
            alpha = min(lr / 2, 1e-1)
            times[9].append(time.time() - st)
            print 'quit? final update, alpha: ', time.time() - st
            print 'batch_time: ', time.time() - bst
            print '_' * 100
        print 'avg_batch_time: ', (time.time() - est) / tot_batches
        for i in range(len(times)):
            print 'i: ', i, ' ', np.mean(times[i])
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(max_lr_epoch, save_dir, 'max_learning_rates.txt')
        save_loss(lr_epoch, save_dir, 'learning_rates.txt')
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        #vl, va = validate(model, dataset)
        #val_loss.append(vl)
        #val_acc.append(va)
        #save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
    return train_loss_batch, train_acc_batch, train_loss, val_loss, val_acc
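## A standalone check of the step-size rule above on a known quadratic,
## f(x) = 0.5*x.T*A*x, where every quantity is exact. Illustrative only;
## the names here (A, x, f, grad) are local to the demo.
def _demo_curvature_lr():
    np.random.seed(0)
    Q = np.random.randn(5, 5)
    A = Q.dot(Q.T) + 5 * np.eye(5)  # symmetric positive definite Hessian
    f = lambda x: 0.5 * x.dot(A).dot(x)
    grad = lambda x: A.dot(x)

    x = np.ones(5)
    alpha = 1e-2
    fx, g = f(x), grad(x)
    gT_g = np.sum(g * g)
    ## same finite-difference curvature estimate as the training loop:
    ## (f(x+alpha*g) + f(x-alpha*g) - 2*f(x)) / alpha^2 ~= (g.T)Hg
    gT_H_g = (f(x + alpha * g) + f(x - alpha * g) - 2 * fx) / alpha**2
    print 'exact (g.T)Hg     : ', g.dot(A).dot(g)
    print 'estimated (g.T)Hg : ', gT_H_g
    ## first-order rule with the curvature cap
    max_lr = 2 * gT_g / np.abs(gT_H_g)
    lr = min(fx / gT_g, max_lr)
    ## "2nd order magic": smallest root of 0.5*lr^2*(g.T)Hg - lr*(g.T)g + f(x) = 0,
    ## i.e. the step that would drive the local quadratic model of f to zero;
    ## when no real root exists, fall back to the exact line minimizer along g
    disc = gT_g**2 - 2 * gT_H_g * fx
    magic_lr = (gT_g - np.sqrt(disc)) / gT_H_g if disc > 0 else gT_g / gT_H_g
    print 'capped lr: ', lr, ' f after step: ', f(x - lr * g)
    print 'magic  lr: ', magic_lr, ' f after step: ', f(x - magic_lr * g)

# _demo_curvature_lr()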
def train(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_acc_batch = []  # per-batch accuracy
    train_loss = []
    val_loss = []
    val_acc = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    save_loss([], save_dir, 'max_learning_rates.txt', first_use=True)
    save_loss([], save_dir, 'learning_rates.txt', first_use=True)
    alpha = 1e-2
    db = []  # d_biased (only used by the commented-out Adam-style variant)
    gg2 = []
    d = []  # bias-corrected running average of the gradients
    timestep = 0
    count = 0  # how often the chosen lr equals max_lr
    for epoch in range(cfg.max_epochs):
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        max_lr_epoch = []
        lr_epoch = []
        est = time.time()
        for batch_num in range(tot_batches):
            timestep += 1
            bst = time.time()
            st = time.time()
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            ## get f(x) and gradients
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: False,
                model.h1_past_bt: np.zeros((len(batch_inds), model.cfg.h1_dim)),
                model.h2_past_bt: np.zeros((len(batch_inds), model.cfg.h2_dim))
            }
            loss, acc, grads, h1_bt, h2_bt = model.sess.run(
                [model.loss, model.accuracy, model.grads,
                 model.h1_binary_tensor, model.h2_binary_tensor],
                feed_dict=fd)
            fx = loss
            train_loss_batch.append(loss)
            train_acc_batch.append(acc)
            ## bias-corrected exponential moving average of the gradients; the
            ## two multipliers fold the 1/(1-eps^t) correction into a single
            ## recursion (Adam-style variants left commented out)
            if timestep == 1:
                #db = [(1-cfg.eps)*grads[i] for i in range(len(grads))]
                d = [grads[i] for i in range(len(grads))]
                #gg2 = [(1-cfg.eps2)*grads[i]*grads[i] for i in range(len(grads))]
            else:
                #db = [cfg.eps*db[i] + (1-cfg.eps)*grads[i] for i in range(len(grads))]
                mult1 = cfg.eps * (1 - cfg.eps**(timestep - 1)) / (1 - cfg.eps**timestep)
                mult2 = (1 - cfg.eps) / (1 - cfg.eps**timestep)
                d = [mult1 * d[i] + mult2 * grads[i] for i in range(len(grads))]
                #gg2 = [cfg.eps*gg2[i] + (1-cfg.eps2)*grads[i]*grads[i] for i in range(len(grads))]
                #gg = [gg2[i]/(1-cfg.eps2**timestep) for i in range(len(grads))]
                #d = [d[i]/(np.sqrt(gg[i])+cfg.epsilon) for i in range(len(grads))]
            research_fd = {
                model.h1_W_grad: d[0],
                model.h1_b_grad: d[1],
                model.h2_W_grad: d[2],
                model.h2_b_grad: d[3],
                model.preds_W_grad: d[4],
                model.preds_b_grad: d[5],
            }
            print 'fx and grads: ', time.time() - st
            st = time.time()
            gT_d = np.sum([np.sum(grads[i] * d[i]) for i in range(len(grads))])
            print 'gT_d: ', time.time() - st
            ## set fd to use old binary tensors
            st = time.time()
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: True,
                model.h1_past_bt: h1_bt,
                model.h2_past_bt: h2_bt
            }
            print 'change fd: ', time.time() - st
            ## get f(x+alpha*d)
            st = time.time()
            research_fd[model.lr] = -alpha
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            fx_plus_ad, grads2 = model.sess.run([model.loss, model.grads],
                                                feed_dict=fd)
            print 'fx+: ', time.time() - st
            ## get f(x-alpha*d)
            st = time.time()
            research_fd[model.lr] = 2 * alpha
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            #fx_minus_ad = model.sess.run(model.loss, feed_dict=fd)
            fx_minus_ad, grads3 = model.sess.run([model.loss, model.grads],
                                                 feed_dict=fd)
            print 'fx-: ', time.time() - st
            ## estimate dT_H_d three ways: forward gradient difference,
            ## loss-based central difference, central gradient difference
            Hd = [grads2[i] / alpha - grads[i] / alpha for i in range(len(grads))]
            dT_H_d = np.sum([np.sum(d[i] * Hd[i]) for i in range(len(grads))])
            ## choose learning rate
            st = time.time()
            dT_H_d_2 = (fx_plus_ad + fx_minus_ad - 2 * fx) / (alpha**2)
            Hd_23 = [grads2[i] / (2.0 * alpha) - grads3[i] / (2.0 * alpha)
                     for i in range(len(grads))]
            dT_H_d_23 = np.sum([np.sum(d[i] * Hd_23[i]) for i in range(len(grads))])
            print 'dT_H_d_2', dT_H_d, dT_H_d_2, dT_H_d_23
            ## use the forward gradient-difference estimate
            if not cfg.magic_2nd_order:
                if dT_H_d == 0.0:
                    max_lr = lr = 0.0
                else:
                    max_lr = 2 * gT_d / np.abs(dT_H_d)  # max_lr is a magnitude
                    lr = max(min(fx / gT_d, np.abs(max_lr)), -np.abs(max_lr))
            else:
                ## 2nd order magic
                if dT_H_d == 0.0:
                    max_lr = lr = 0.0
                else:
                    delta_f = fx
                    if gT_d**2 - 2 * dT_H_d * delta_f >= 0:
                        if gT_d > 0:  # choose the smaller of the two roots
                            max_lr = lr = -(-gT_d + np.sqrt(
                                gT_d**2 - 2 * dT_H_d * delta_f)) / dT_H_d
                        else:
                            max_lr = lr = -(-gT_d - np.sqrt(
                                gT_d**2 - 2 * dT_H_d * delta_f)) / dT_H_d
                    else:
                        max_lr = lr = -(-gT_d / dT_H_d)
            max_lr_epoch.append(max_lr)
            lr_epoch.append(lr)
            print 'choose lr: ', time.time() - st
            if max_lr == lr:
                count += 1
            ## print
            st = time.time()
            if True:
                print ''
                print 'alpha             : ', alpha
                print 'f(x)              : ', fx
                print 'f(x+alpha*d)      : ', fx_plus_ad
                #print 'f(x-alpha*d)      : ', fx_minus_ad
                #print 'f(x+)+f(x-)-2f(x) : ', fx_plus_ad + fx_minus_ad - 2*fx
                print 'estimated (d.T)Hd : ', dT_H_d
                print '(g.T)d            : ', gT_d
                print 'max lr            : ', max_lr
                print 'lr                : ', lr
                print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f} train_acc:{:.3f}'.format(
                    epoch + 1, batch_num + 1, train_loss_batch[-1],
                    train_acc_batch[-1])
            print 'printing: ', time.time() - st
            ## quit?
            st = time.time()
            if dT_H_d == 0.0:
                print 'dT_H_d==0.0, exiting'
                exit()
            ## update step: reset to x, then take the chosen step
            research_fd[model.lr] = -alpha + lr
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            ## update alpha (the fixed value below overrides the adaptive one)
            alpha = min(lr / 2, 1e-1)
            alpha = 0.1
            print 'quit? final update, alpha: ', time.time() - st
            print 'batch_time: ', time.time() - bst
            print '_' * 100
        print 'avg_batch_time: ', (time.time() - est) / tot_batches
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(max_lr_epoch, save_dir, 'max_learning_rates.txt')
        save_loss(lr_epoch, save_dir, 'learning_rates.txt')
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        #vl, va = validate(model, dataset)
        #val_loss.append(vl)
        #val_acc.append(va)
        #save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
        print '#max_lr==lr: ', count, '/', timestep
        exit()  # debugging exit after the first epoch; the return is unreachable
    return train_loss_batch, train_acc_batch, train_loss, val_loss, val_acc
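## Sanity check for the Hessian-vector estimates above: on a quadratic,
## (grad(x+alpha*d) - grad(x))/alpha and the central-difference variant both
## recover H*d exactly. Illustrative only; all names are local to the demo.
def _demo_hvp_estimates():
    np.random.seed(1)
    Q = np.random.randn(4, 4)
    H = Q.dot(Q.T)  # Hessian of f(x) = 0.5*x.T*H*x
    grad = lambda x: H.dot(x)

    x = np.random.randn(4)
    d = np.random.randn(4)
    alpha = 1e-3
    g1 = grad(x + alpha * d)
    g3 = grad(x - alpha * d)
    Hd_fwd = (g1 - grad(x)) / alpha      # forward difference (Hd above)
    Hd_ctr = (g1 - g3) / (2.0 * alpha)   # central difference (Hd_23 above)
    print 'exact   d.T H d: ', d.dot(H).dot(d)
    print 'forward d.T H d: ', d.dot(Hd_fwd)
    print 'central d.T H d: ', d.dot(Hd_ctr)

# _demo_hvp_estimates()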
def train_autoencoder_kalpit(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_loss = []
    val_loss = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    save_loss([], save_dir, 'max_learning_rates.txt', first_use=True)
    save_loss([], save_dir, 'learning_rates.txt', first_use=True)
    alpha = 1e-1
    moms = []  # momentum accumulators
    converged = False
    for epoch in range(cfg.max_epochs):
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        max_lr_epoch = []
        lr_epoch = []
        est = time.time()
        for batch_num in range(tot_batches):
            alpha = 1e0  # overrides the adaptive alpha update below
            bst = time.time()
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            ## get f(x) and gradients
            fd = {model.input_images: dataset.data['train_images'][batch_inds, :]}
            loss, grads = model.sess.run([model.loss, model.grads], feed_dict=fd)
            train_loss_batch.append(loss)
            print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f}'.format(
                epoch + 1, batch_num + 1, train_loss_batch[-1])
            ## set research_fd
            research_fd = {
                model.enc1_W_grad: grads[0],
                model.enc1_b_grad: grads[1],
                model.enc2_W_grad: grads[2],
                model.enc2_b_grad: grads[3],
                model.dec2_W_grad: grads[4],
                model.dec2_b_grad: grads[5],
                model.dec1_W_grad: grads[6],
                model.dec1_b_grad: grads[7]
            }
            fd = {model.input_images: dataset.data['train_images'][batch_inds, :]}
            ## get kalpit learning_rate
            max_lr, lr = get_kalpit_lr(model, cfg, research_fd, fd, loss,
                                       alpha, grads)
            max_lr_epoch.append(max_lr)
            lr_epoch.append(lr)
            ## update step, with momentum
            if moms == []:
                moms = grads[:]
            else:
                moms = [model.cfg.momentum * moms[i] + grads[i]
                        for i in range(len(moms))]
            research_fd = {
                model.enc1_W_grad: moms[0],
                model.enc1_b_grad: moms[1],
                model.enc2_W_grad: moms[2],
                model.enc2_b_grad: moms[3],
                model.dec2_W_grad: moms[4],
                model.dec2_b_grad: moms[5],
                model.dec1_W_grad: moms[6],
                model.dec1_b_grad: moms[7]
            }
            research_fd[model.lr] = lr
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            ## update alpha
            alpha = min(lr / 2, 1e-1)
            print 'batch_time: ', time.time() - bst
            print '_' * 100
        print 'avg_batch_time: ', (time.time() - est) / tot_batches
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(max_lr_epoch, save_dir, 'max_learning_rates.txt')
        save_loss(lr_epoch, save_dir, 'learning_rates.txt')
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        if converged:
            break
        ## validation
        vl, va = validate_autoencoder(model, dataset)
        val_loss.append(vl)
        save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
    return train_loss, val_loss
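## `get_kalpit_lr` lives elsewhere in the repo. A sketch reconstructed from
## the inline version in the conv `train` above: probe the loss at x+alpha*g
## and x-alpha*g via change_weights_op, estimate the curvature along g,
## reset the weights to x, and return the capped step. The actual helper may
## differ in details.
def get_kalpit_lr(model, cfg, research_fd, fd, fx, alpha, grads):
    gT_g = np.sum([np.sum(np.square(g)) for g in grads])
    ## f(x+alpha*g): change_weights_op applies w <- w - lr*grad, so lr=-alpha
    research_fd[model.lr] = -alpha
    model.sess.run(model.change_weights_op, feed_dict=research_fd)
    fx_plus = model.sess.run(model.loss, feed_dict=fd)
    ## f(x-alpha*g)
    research_fd[model.lr] = 2 * alpha
    model.sess.run(model.change_weights_op, feed_dict=research_fd)
    fx_minus = model.sess.run(model.loss, feed_dict=fd)
    ## reset the weights to x
    research_fd[model.lr] = -alpha
    model.sess.run(model.change_weights_op, feed_dict=research_fd)
    ## curvature along g, then the capped first-order step
    gT_H_g = (fx_plus + fx_minus - 2 * fx) / (alpha**2)
    if gT_H_g == 0.0:
        return 0.0, 0.0
    max_lr = 2 * gT_g / np.abs(gT_H_g)
    lr = min(fx / gT_g, max_lr)
    return max_lr, lr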
def train_conv_kalpit(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_acc_batch = []  # per-batch accuracy
    train_loss = []
    val_loss = []
    val_acc = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    save_loss([], save_dir, 'max_learning_rates.txt', first_use=True)
    save_loss([], save_dir, 'learning_rates.txt', first_use=True)
    alpha = 1e-2
    moms = []
    for epoch in range(cfg.max_epochs):
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        max_lr_epoch = []
        lr_epoch = []
        est = time.time()
        for batch_num in range(tot_batches):
            bst = time.time()
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            ## get f(x) and gradients
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: False,
                model.input_past_bt: np.zeros((len(batch_inds), cfg.input_height,
                                               cfg.input_width,
                                               cfg.input_nchannels)),
                model.fc4_past_bt: np.zeros((len(batch_inds), 1000))
            }
            loss, acc, grads, input_bt, fc4_bt = model.sess.run(
                [model.loss, model.accuracy, model.grads,
                 model.input_binary_tensor, model.fc4_binary_tensor],
                feed_dict=fd)
            train_loss_batch.append(loss)
            train_acc_batch.append(acc)
            print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f} train_acc:{:.3f}'.format(
                epoch + 1, batch_num + 1, train_loss_batch[-1],
                train_acc_batch[-1])
            ## set research_fd. set fd to use old binary tensors.
            research_fd = {
                model.conv1_W_grad: grads[0],
                model.conv1_b_grad: grads[1],
                model.conv2_W_grad: grads[2],
                model.conv2_b_grad: grads[3],
                model.conv3_W_grad: grads[4],
                model.conv3_b_grad: grads[5],
                model.fc4_W_grad: grads[6],
                model.fc4_b_grad: grads[7],
                model.fc5_W_grad: grads[8],
                model.fc5_b_grad: grads[9],
                model.input_past_bt: input_bt,
                model.fc4_past_bt: fc4_bt
            }
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: True,
                model.input_past_bt: input_bt,
                model.fc4_past_bt: fc4_bt
            }
            ## momentum
            if moms == []:
                moms = grads[:]
            else:
                moms = [model.cfg.momentum * moms[i] + grads[i]
                        for i in range(len(moms))]
            moms_fd = {
                model.conv1_W_grad: moms[0],
                model.conv1_b_grad: moms[1],
                model.conv2_W_grad: moms[2],
                model.conv2_b_grad: moms[3],
                model.conv3_W_grad: moms[4],
                model.conv3_b_grad: moms[5],
                model.fc4_W_grad: moms[6],
                model.fc4_b_grad: moms[7],
                model.fc5_W_grad: moms[8],
                model.fc5_b_grad: moms[9]
            }
            ## get kalpit learning_rate
            print 'USING DIXIT LR'
            max_lr, lr = get_dixit_lr(loss, grads, moms, cfg)
            #max_lr, lr = get_kalpit_lr(model, cfg, grads_fd, fd, loss, alpha, grads)
            max_lr_epoch.append(max_lr)
            lr_epoch.append(lr)
            ## update step
            moms_fd[model.lr] = lr
            model.sess.run(model.change_weights_op, feed_dict=moms_fd)
            ## update alpha
            alpha = min(lr / 2, 1e-1)
            print 'batch_time: ', time.time() - bst
            print '_' * 100
        print 'avg_batch_time: ', (time.time() - est) / tot_batches
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(max_lr_epoch, save_dir, 'max_learning_rates.txt')
        save_loss(lr_epoch, save_dir, 'learning_rates.txt')
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        ## validation
        vl, va = validate_conv(model, dataset)
        val_loss.append(vl)
        val_acc.append(va)
        save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
    return train_loss, val_loss, val_acc
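## `get_dixit_lr` is defined elsewhere. A sketch under the assumption that it
## is the probe-free variant of the rule used throughout this file: take the
## step fx/(g.T m) that a first-order model predicts would zero the loss
## along the momentum direction, clipped to the configured cap. The real
## helper may differ.
def get_dixit_lr(fx, grads, moms, cfg):
    gT_m = np.sum([np.sum(grads[i] * moms[i]) for i in range(len(grads))])
    if gT_m <= 0.0:
        ## momentum is not a descent direction for this batch; take no step
        return cfg.max_lr, 0.0
    lr = min(fx / gT_m, cfg.max_lr)
    return cfg.max_lr, lr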
def train_ff_kalpit(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_acc_batch = []  # per-batch accuracy
    train_loss = []
    val_loss = []
    val_acc = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    save_loss([], save_dir, 'max_learning_rates.txt', first_use=True)
    save_loss([], save_dir, 'learning_rates.txt', first_use=True)
    count1 = 0  # batches where lr hit the max_lr cap
    count2 = 0  # total batches
    for epoch in range(cfg.max_epochs):
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        max_lr_epoch = []
        lr_epoch = []
        est = time.time()
        for batch_num in range(tot_batches):
            bst = time.time()
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            ## get f(x) and gradients; the learning rate is chosen in-graph
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: False,
                model.h1_past_bt: np.zeros((len(batch_inds), model.cfg.h1_dim)),
                model.h2_past_bt: np.zeros((len(batch_inds), model.cfg.h2_dim)),
                model.max_lr: cfg.max_lr
            }
            loss, acc, lr, _ = model.sess.run(
                [model.loss, model.accuracy, model.lr, model.dixit_train_op],
                feed_dict=fd)
            train_loss_batch.append(loss)
            train_acc_batch.append(acc)
            print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f} train_acc:{:.3f}'.format(
                epoch + 1, batch_num + 1, train_loss_batch[-1],
                train_acc_batch[-1])
            ## record the in-graph (dixit) learning rate
            print 'USING DIXIT LR'
            max_lr_epoch.append(cfg.max_lr)
            lr_epoch.append(lr)
            if lr > 0.999 * cfg.max_lr:
                count1 += 1.0
            count2 += 1.0
            print 100.0 * count1 / count2
            print 'batch_time: ', time.time() - bst
            print '_' * 100
        print 'avg_batch_time: ', (time.time() - est) / tot_batches
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(max_lr_epoch, save_dir, 'max_learning_rates.txt')
        save_loss(lr_epoch, save_dir, 'learning_rates.txt')
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        ## validation
        vl, va = validate_ff(model, dataset)
        val_loss.append(vl)
        val_acc.append(va)
        save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
    return train_loss, val_loss, val_acc
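## `model.dixit_train_op` is built in the model code, not shown here. A
## TensorFlow-1.x sketch of how such an op could be assembled from this
## file's step-size rule (lr = min(loss/(g.T g), max_lr), applied as
## w <- w - lr*g); the function and tensor names are assumptions, not the
## repo's actual graph.
import tensorflow as tf


def build_dixit_train_op(loss, variables, max_lr):
    grads = tf.gradients(loss, variables)
    gT_g = tf.add_n([tf.reduce_sum(tf.square(g)) for g in grads])
    lr = tf.minimum(loss / gT_g, max_lr)  # first-order zero-crossing step, capped
    updates = [tf.assign_sub(v, lr * g) for v, g in zip(variables, grads)]
    return lr, tf.group(*updates)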
def train_ff_vanilla(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_acc_batch = []  # per-batch accuracy
    train_loss = []
    val_loss = []
    val_acc = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    save_loss(val_acc, save_dir, 'validation_accuracy.txt', first_use=True)
    time_since_improvement = 0  # early stopping
    train_time = 0.0
    for epoch in range(cfg.max_epochs):
        st = time.time()
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        for batch_num in range(tot_batches):
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            feed_dict = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.lr: cfg.learning_rate,
                model.keep_prob: cfg.keep_prob,
                model.use_past_bt: False,
                model.h1_past_bt: np.zeros((len(batch_inds), model.cfg.h1_dim)),
                model.h2_past_bt: np.zeros((len(batch_inds), model.cfg.h2_dim))
            }
            loss, acc, _ = model.sess.run(
                [model.loss, model.accuracy, model.train_op],
                feed_dict=feed_dict)
            train_loss_batch.append(loss)
            train_acc_batch.append(acc)
            print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f} train_acc:{:.3f}'.format(
                epoch + 1, batch_num + 1, train_loss_batch[-1],
                train_acc_batch[-1])
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        ## train time
        train_time += time.time() - st
        print 'Total Train Time:', train_time
        ## validation
        vl, va = validate_ff(model, dataset)
        val_loss.append(vl)
        val_acc.append(va)
        save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
        save_loss(val_acc[-1:], save_dir, 'validation_accuracy.txt')
        ## early stopping
        time_since_improvement, early_stop = early_stopping(
            val_loss, time_since_improvement, cfg.early_stopping)
        if early_stop:
            break
    return train_loss, val_loss, val_acc
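## The `validate_*` helpers are defined alongside the models. A sketch of
## `validate_ff` consistent with how it is used here (returns mean validation
## loss and accuracy); the feeds mirror the training feed_dict above, with
## keep_prob=1.0 for evaluation. The `val_images`/`val_labels` field names
## and `dataset.n_val` are assumptions.
def validate_ff(model, dataset):
    fd = {
        model.input_images: dataset.data['val_images'],
        model.labels: dataset.data['val_labels'],
        model.keep_prob: 1.0,
        model.use_past_bt: False,
        model.h1_past_bt: np.zeros((dataset.n_val, model.cfg.h1_dim)),
        model.h2_past_bt: np.zeros((dataset.n_val, model.cfg.h2_dim))
    }
    vl, va = model.sess.run([model.loss, model.accuracy], feed_dict=fd)
    return vl, va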
def train_conv_kalpit(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_acc_batch = []  # per-batch accuracy
    train_loss = []
    val_loss = []
    val_acc = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    save_loss(val_acc, save_dir, 'validation_accuracy.txt', first_use=True)
    save_loss([], save_dir, 'max_learning_rates.txt', first_use=True)
    save_loss([], save_dir, 'learning_rates.txt', first_use=True)
    count1 = 0
    count2 = 0
    time_since_improvement = 0  # early stopping
    train_time = 0.0
    for epoch in range(cfg.max_epochs):
        st = time.time()
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        max_lr_epoch = []
        lr_epoch = []
        count1 = 0.0  # per-epoch cap statistics
        count2 = 0.0
        for batch_num in range(tot_batches):
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            ## get f(x) and gradients; the learning rate is chosen in-graph
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.keep_prob: cfg.keep_prob,
                model.use_past_bt: False,
                model.input_past_bt: np.zeros((len(batch_inds), cfg.input_height,
                                               cfg.input_width,
                                               cfg.input_nchannels)),
                model.fc4_past_bt: np.zeros((len(batch_inds), 1000)),
                model.max_lr: cfg.max_lr
            }
            loss, acc, lr, _, gT_d = model.sess.run(
                [model.loss, model.accuracy, model.lr, model.dixit_train_op,
                 model.gT_d],
                feed_dict=fd)
            train_loss_batch.append(loss)
            train_acc_batch.append(acc)
            print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f} train_acc:{:.3f} learning_rate:{:.3f} gT_d:{:.3f}'.format(
                epoch + 1, batch_num + 1, train_loss_batch[-1],
                train_acc_batch[-1], lr, gT_d)
            ## record the in-graph (dixit) learning rate
            print 'USING DIXIT LR'
            max_lr_epoch.append(cfg.max_lr)
            lr_epoch.append(lr)
            if lr > 0.999 * cfg.max_lr:
                count1 += 1.0
            count2 += 1.0
        print '_' * 100
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(max_lr_epoch, save_dir, 'max_learning_rates.txt')
        save_loss(lr_epoch, save_dir, 'learning_rates.txt')
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        print 'Percentage of lr==max_lr: {:.3f}'.format(100.0 * count1 / count2)
        ## train time
        train_time += time.time() - st
        print 'Total Train Time:', train_time
        ## validation
        vl, va = validate_conv(model, dataset)
        val_loss.append(vl)
        val_acc.append(va)
        save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
        save_loss(val_acc[-1:], save_dir, 'validation_accuracy.txt')
        ## early stopping
        time_since_improvement, early_stop = early_stopping(
            val_loss, time_since_improvement, cfg.early_stopping)
        if early_stop:
            break
    return train_loss, val_loss, val_acc
def train(model, dataset, cfg, save_dir):
    train_loss_batch = []  # per-batch loss
    train_acc_batch = []  # per-batch accuracy
    train_loss = []
    val_loss = []
    val_acc = []
    save_loss(train_loss, save_dir, 'training_cost.txt', first_use=True)
    save_loss(val_loss, save_dir, 'validation_cost.txt', first_use=True)
    save_loss([], save_dir, 'max_learning_rates.txt', first_use=True)
    save_loss([], save_dir, 'learning_rates.txt', first_use=True)
    alpha = 1e-2
    moms = []
    for epoch in range(cfg.max_epochs):
        inds = range(dataset.n_train)
        np.random.shuffle(inds)
        tot_batches = int(np.ceil(1.0 * dataset.n_train / cfg.batch_size))
        max_lr_epoch = []
        lr_epoch = []
        est = time.time()
        for batch_num in range(tot_batches):
            bst = time.time()
            st = time.time()
            batch_inds = inds[batch_num * cfg.batch_size:min(
                (batch_num + 1) * cfg.batch_size, dataset.n_train)]
            ## get f(x) and gradients
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: False,
                model.h1_past_bt: np.zeros((len(batch_inds), model.cfg.h1_dim)),
                model.h2_past_bt: np.zeros((len(batch_inds), model.cfg.h2_dim))
            }
            loss, acc, grads, h1_bt, h2_bt = model.sess.run(
                [model.loss, model.accuracy, model.grads,
                 model.h1_binary_tensor, model.h2_binary_tensor],
                feed_dict=fd)
            fx = loss
            train_loss_batch.append(loss)
            train_acc_batch.append(acc)
            ## momentum
            if moms == []:
                moms = grads[:]
            else:
                moms = [model.cfg.momentum * moms[i] + grads[i]
                        for i in range(len(moms))]
            research_fd = {
                model.h1_W_grad: moms[0],
                model.h1_b_grad: moms[1],
                model.h2_W_grad: moms[2],
                model.h2_b_grad: moms[3],
                model.preds_W_grad: moms[4],
                model.preds_b_grad: moms[5],
            }
            print 'fx and grads: ', time.time() - st
            st = time.time()
            gT_m = np.sum([np.sum(grads[i] * moms[i]) for i in range(len(moms))])
            print 'gT_m: ', time.time() - st
            ## set fd to use old binary tensors
            st = time.time()
            fd = {
                model.input_images: dataset.data['train_images'][batch_inds, :],
                model.labels: dataset.data['train_labels'][batch_inds],
                model.use_past_bt: True,
                model.h1_past_bt: h1_bt,
                model.h2_past_bt: h2_bt
            }
            print 'change fd: ', time.time() - st
            ## get f(x+alpha*m)
            st = time.time()
            research_fd[model.lr] = -alpha
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            fx_plus_am, grads2 = model.sess.run([model.loss, model.grads],
                                                feed_dict=fd)
            print 'fx+: ', time.time() - st
            ### get f(x-alpha*m)
            #st = time.time()
            #research_fd[model.lr] = 2*alpha
            #model.sess.run(model.change_weights_op, feed_dict=research_fd)
            #fx_minus_am = model.sess.run(model.loss, feed_dict=fd)
            #print 'fx-: ', time.time()-st
            ## choose learning rate
            st = time.time()
            #mT_H_m = (fx_plus_am + fx_minus_am - 2*fx)/(alpha**2)
            H_m = [grads2[i] / alpha - grads[i] / alpha for i in range(len(grads))]
            mT_H_m = np.sum([np.sum(moms[i] * H_m[i]) for i in range(len(grads))])
            if not cfg.magic_2nd_order:
                max_lr = 2 * gT_m / np.abs(mT_H_m)
                lr = min(fx / gT_m, max_lr)
            else:
                ## 2nd order magic
                if mT_H_m == 0.0:
                    max_lr = lr = 0.0
                else:
                    delta_f = fx
                    if gT_m**2 - 2 * mT_H_m * delta_f >= 0:
                        max_lr = lr = -(-gT_m + np.sqrt(
                            gT_m**2 - 2 * mT_H_m * delta_f)) / mT_H_m
                    else:
                        max_lr = lr = -(-gT_m / mT_H_m)
            max_lr_epoch.append(max_lr)
            lr_epoch.append(lr)
            print 'choose lr: ', time.time() - st
            ## print
            st = time.time()
            if True:
                print ''
                print 'alpha             : ', alpha
                print 'f(x)              : ', fx
                print 'f(x+alpha*m)      : ', fx_plus_am
                #print 'f(x-alpha*m)      : ', fx_minus_am
                #print 'f(x+)+f(x-)-2f(x) : ', fx_plus_am + fx_minus_am - 2*fx
                print 'estimated (m.T)Hm : ', mT_H_m
                print '(g.T)m            : ', gT_m
                print 'max lr            : ', max_lr
                print 'lr                : ', lr
                print 'Epoch-Batch: {:3d}-{:3d} train_loss: {:.3f} train_acc:{:.3f}'.format(
                    epoch + 1, batch_num + 1, train_loss_batch[-1],
                    train_acc_batch[-1])
            print 'printing: ', time.time() - st
            ## quit?
            st = time.time()
            if mT_H_m == 0.0:
                print 'mT_H_m==0.0, exiting'
                exit()
            ## update step: reset to x, then take the chosen step
            research_fd[model.lr] = -alpha + lr
            model.sess.run(model.change_weights_op, feed_dict=research_fd)
            ## update alpha
            alpha = min(lr / 2, 1e-1)
            print 'quit? final update, alpha: ', time.time() - st
            print 'batch_time: ', time.time() - bst
            print '_' * 100
        print 'avg_batch_time: ', (time.time() - est) / tot_batches
        train_loss.append(np.mean(train_loss_batch[-tot_batches:]))
        save_loss(max_lr_epoch, save_dir, 'max_learning_rates.txt')
        save_loss(lr_epoch, save_dir, 'learning_rates.txt')
        save_loss(train_loss[-1:], save_dir, 'training_cost.txt')
        print 'Epoch {} - Average Training Cost: {:.3f}'.format(
            epoch + 1, train_loss[-1])
        #vl, va = validate(model, dataset)
        #val_loss.append(vl)
        #val_acc.append(va)
        #save_loss(val_loss[-1:], save_dir, 'validation_cost.txt')
    return train_loss_batch, train_acc_batch, train_loss, val_loss, val_acc