def train(hparams):
    # This won't crash: if no exp number is there, it'll be None.
    exp_version_from_slurm_script = hparams.hpc_exp_number

    # Init exp and track all the parameters from the HyperOptArgumentParser.
    # The experiment version is optional, but using the one from SLURM means
    # the exp will not collide with other versions if SLURM runs multiple at once.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        version=exp_version_from_slurm_script,
        autosave=False,
    )
    exp.argparse(hparams)

    # Pretend to train.
    x = hparams.x_val
    for train_step in range(0, 100):
        y = hparams.y_val
        out = x * y
        exp.log({'fake_err': out})  # out is a plain number here, not a tensor

    # Save exp when we're done.
    exp.save()

def train(hparams, *args): """Train your awesome model. :param hparams: The arguments to run the model with. """ # Initialize experiments and track all the hyperparameters exp = Experiment( name=hparams.test_tube_exp_name, # Location to save the metrics. save_dir=hparams.log_path, # The experiment version is optional, but using the one # from SLURM means the exp will not collide with other # versions if SLURM runs multiple at once. version=hparams.hpc_exp_number, autosave=False, ) exp.argparse(hparams) # Pretend to train. x = hparams.x_val for train_step in range(0, 100): y = hparams.y_val out = x * y exp.log({'fake_err': out.item()}) # Log metrics. # Save exp when done. exp.save()
def train(hparams, *args): """Train your awesome model. :param hparams: The arguments to run the model with. """ # Initialize experiments and track all the hyperparameters # if hparams.disease_model: # save_model_path = hparams.save_model_dir+'/disease' # else: # save_model_path = hparams.save_model_dir+'/synthetic' # Set seeds SEED = hparams.seed torch.manual_seed(SEED) np.random.seed(SEED) print(hparams) print(args) exp = Experiment( name=hparams.test_tube_exp_name, # Location to save the metrics. save_dir=hparams.log_path, autosave=False, ) exp.argparse(hparams) # checkpoint_callback = ModelCheckpoint( # filepath=save_model_path+'/'+hparams.cage_nr + # '/version_'+str(cluster.hpc_exp_number)+'/checkpoints', # verbose=True, # monitor='val_loss', # mode='min', # prefix='' # ) # # Pretend to train. # x = torch.rand((1, hparams.x_val)) # for train_step in range(0, 100): # y = torch.rand((hparams.x_val, 1)) # out = x.mm(y) # exp.log({'fake_err': out.item()}) dsl, \ trainedmodels,\ validatedmodels,\ losses,\ lossdf,\ knnres = runevaler("opsitu", hparams.epochs, [ESNNSystem], [TorchEvaler], [eval_dual_ann], networklayers=[hparams.c_layers, hparams.g_layers], lrs=[hparams.lr], dropoutrates=[hparams.dropout], validate_on_k=10, n=1, filenamepostfixes=["esnn"]) stats = stat(lossdf, hparams.epochs, "esnn") print(f"type : {type(stats)}") print(f"innertype : {type(stats[0])}") print(f"stats : {stats}") print(f"stats0 : {stats[0]}") exp.log({'loss': stats[0]}) #exp.log('tng_err': tng_err) #exp.log({"loss", stats[0]}) # Save exp when . exp.save()
def train(hparams):
    # Init exp and track all the parameters from the HyperOptArgumentParser.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # Pretend to train.
    x = torch.rand((1, hparams.x_val))
    for train_step in range(0, 100):
        y = torch.rand((hparams.x_val, 1))
        out = x.mm(y)
        exp.log({'fake_err': out.item()})

    # Save exp when we're done.
    exp.save()

def train(hparams, *args): """Train your awesome model. :param hparams: The arguments to run the model with. """ # Initialize experiments and track all the hyperparameters exp = Experiment( name=hparams.test_tube_exp_name, # Location to save the metrics. save_dir=hparams.log_path, autosave=False, ) exp.argparse(hparams) # Pretend to train. x = torch.rand((1, hparams.x_val)) for train_step in range(0, 100): y = torch.rand((hparams.x_val, 1)) out = x.mm(y) exp.log({'fake_err': out.item()}) # Save exp when . exp.save()
def train(hparams):
    # Init exp and track all the parameters from the HyperOptArgumentParser.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        save_dir=hparams.log_path,
        autosave=False,
    )
    exp.argparse(hparams)

    # Define the TensorFlow graph.
    x = tf.placeholder(dtype=tf.int32, name='x')
    y = tf.placeholder(dtype=tf.int32, name='y')
    out = x * y

    sess = tf.Session()

    # Run the TF op.
    for train_step in range(0, 100):
        output = sess.run(out, feed_dict={x: hparams.x_val, y: hparams.y_val})
        exp.log({'fake_err': output})

    # Save exp when we're done.
    exp.save()

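# The train() snippets above expect a driver that builds `hparams` and submits one
# SLURM job per hyperparameter combination (which is what sets hparams.hpc_exp_number).
# Below is a minimal sketch of such a driver, assuming test_tube's
# HyperOptArgumentParser and SlurmCluster; the flag names (--test_tube_exp_name,
# --log_path, --x_val, --y_val) mirror the snippets above, while the log path,
# trial count, and job name are illustrative placeholders, not values from the source.
from test_tube import HyperOptArgumentParser, SlurmCluster

parser = HyperOptArgumentParser(strategy='grid_search')
parser.add_argument('--test_tube_exp_name', default='demo_exp')
parser.add_argument('--log_path', default='/tmp/test_tube_logs')
# Register tunable hyperparameters; each combination becomes one trial.
parser.opt_list('--x_val', default=12.0, type=float, options=[10.0, 20.0, 30.0], tunable=True)
parser.opt_list('--y_val', default=5.0, type=float, options=[1.0, 5.0, 10.0], tunable=True)
hyperparams = parser.parse_args()

cluster = SlurmCluster(
    hyperparam_optimizer=hyperparams,
    log_path=hyperparams.log_path,
    python_cmd='python3',
)
cluster.per_experiment_nb_gpus = 1
cluster.per_experiment_nb_nodes = 1

# Submit one SLURM job per trial; each job calls train(hparams, *args) with
# hparams.hpc_exp_number set to that job's version number.
cluster.optimize_parallel_cluster_gpu(train, nb_trials=9, job_name='demo_sweep')
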
def run(args):
    device = torch.device("cuda" if (not args.cpu) and torch.cuda.is_available() else "cpu")
    print("Using device", device)

    train_data, val_data, test_data, src, trg = loader.load_data(args)

    src_padding_idx = src.vocab.stoi['<pad>']
    trg_padding_idx = trg.vocab.stoi['<pad>']
    for i in range(5):
        print(i, src.vocab.itos[i])
        print(i, trg.vocab.itos[i])
    assert src_padding_idx == config.PAD_TOKEN
    assert trg_padding_idx == config.PAD_TOKEN
    # src_unk_idx = src.vocab.stoi['<unk>']
    # trg_unk_idx = trg.vocab.stoi['<unk>']
    src_vocab_size = len(src.vocab)
    trg_vocab_size = len(trg.vocab)

    encoder = models.CnnEncoder(args, src_padding_idx, src_vocab_size).to(device)
    if args.attention:
        assert args.bidirectional, "if using the attention model, bidirectional must be true"
        decoder = models.LuongAttnDecoderRNN(args, trg_padding_idx, trg_vocab_size).to(device)
    else:
        assert not args.bidirectional, "if not using the attention model, bidirectional must be false"
        decoder = models.RnnDecoder(args, trg_padding_idx, trg_vocab_size).to(device)

    # Initialize biases to zero and weights with Xavier normal init
    # (the paper used a zero-mean Gaussian with std 0.01; see the commented-out line).
    # TODO: better initialization?
    for net in [encoder, decoder]:
        for name, param in net.named_parameters():
            # print(name, type(param), param)
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
                # nn.init.normal_(param, std=0.01)

    if args.encoder_word_embedding is not None:
        encoder_embedding_dict = torch.load(args.encoder_word_embedding)
        encoder.word_embedding.load_state_dict({'weight': encoder_embedding_dict['weight']})
        if args.freeze_all_words:
            encoder.word_embedding.requires_grad = False
    else:
        encoder_embedding_dict = None

    if args.decoder_word_embedding is not None:
        decoder_embedding_dict = torch.load(args.decoder_word_embedding)
        decoder.embedding.load_state_dict({'weight': decoder_embedding_dict['weight']})
        if args.freeze_all_words:
            decoder.embedding.requires_grad = False
    else:
        decoder_embedding_dict = None

    # TODO: other optimizers
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=args.lr, weight_decay=args.l2_penalty)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=args.lr, weight_decay=args.l2_penalty)

    # TODO: use a different loss?
    loss_function = nn.NLLLoss()

    # TODO: save/load weights
    # TODO: early stopping
    loss_history = defaultdict(list)
    bleu_history = defaultdict(list)

    # Initiate the test-tube experiment object.
    if not args.test:
        exp = Experiment(
            name=args.name,
            save_dir=args.logs_path,
            autosave=True,
        )
        exp.argparse(args)

        model_path = os.path.join(args.model_weights_path, exp.name)
        model_path = os.path.join(model_path, str(exp.version))
        pathlib.Path(model_path).mkdir(parents=True, exist_ok=True)
        print(model_path)

    if args.test:
        encoder.load_state_dict(
            torch.load(os.path.join(args.model_weights_path, 'encoder_weights.pt')))
        decoder.load_state_dict(
            torch.load(os.path.join(args.model_weights_path, 'decoder_weights.pt')))
        return test(args, encoder, decoder, encoder_optimizer, decoder_optimizer,
                    loss_function, device, i, test_data, trg,
                    encoder_embedding_dict, decoder_embedding_dict)
    else:
        for i in range(args.epoch):
            train_loss, val_loss, val_bleu = train_and_val(
                args, encoder, decoder, encoder_optimizer, decoder_optimizer,
                loss_function, device, i, train_data, val_data, trg,
                encoder_embedding_dict, decoder_embedding_dict)
            loss_history["train"].append(train_loss)
            loss_history["val"].append(val_loss)
            bleu_history["val"].append(val_bleu)

            # Update best models.
            if val_bleu == np.max(bleu_history["val"]):
                # Save model weights of the best models.
                torch.save(encoder.state_dict(), os.path.join(model_path, 'encoder_weights.pt'))
                torch.save(decoder.state_dict(), os.path.join(model_path, 'decoder_weights.pt'))

            if args.save_all_epoch:
                model_path_current_epoch = os.path.join(model_path, str(i))
                pathlib.Path(model_path_current_epoch).mkdir(parents=True, exist_ok=True)
                torch.save(encoder.state_dict(),
                           os.path.join(model_path_current_epoch, 'encoder_weights.pt'))
                torch.save(decoder.state_dict(),
                           os.path.join(model_path_current_epoch, 'decoder_weights.pt'))

            # Add logs.
            exp.log({
                'train epoch loss': train_loss,
                'val epoch loss': val_loss,
                'val epoch bleu': val_bleu
            })

            if early_stop(bleu_history["val"], args.early_stopping, max):
                print("Early stopped.")
                break

def train_VI_classification(net, name, save_dir, batch_size, nb_epochs, trainset, valset,
                            cuda, flat_ims=False, nb_its_dev=1, early_stop=None,
                            load_path=None, save_freq=20, stop_criteria='test_ELBO',
                            tags=None, show=False):
    exp = Experiment(name=name, debug=False, save_dir=save_dir, autosave=True)

    if load_path is not None:
        net.load(load_path)

    exp_version = exp.version
    media_dir = exp.get_media_path(name, exp_version)
    models_dir = exp.get_data_path(name, exp_version) + '/models'
    mkdir(models_dir)

    exp.tag({
        'n_layers': net.model.n_layers,
        'batch_size': batch_size,
        'init_lr': net.lr,
        'lr_schedule': net.schedule,
        'nb_epochs': nb_epochs,
        'early_stop': early_stop,
        'stop_criteria': stop_criteria,
        'nb_its_dev': nb_its_dev,
        'model_loaded': load_path,
        'cuda': cuda,
    })
    if net.model.__class__.__name__ == 'arq_uncert_conv2d_resnet':
        exp.tag({
            'outer_width': net.model.outer_width,
            'inner_width': net.model.inner_width
        })
    else:
        exp.tag({'width': net.model.width})
    exp.tag({
        'prob_model': net.model.prob_model.name,
        'prob_model_summary': net.model.prob_model.summary
    })
    if tags is not None:
        exp.tag(tags)

    if cuda:
        trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True,
                                                  pin_memory=True, num_workers=3)
        valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False,
                                                pin_memory=True, num_workers=3)
    else:
        trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True,
                                                  pin_memory=False, num_workers=3)
        valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False,
                                                pin_memory=False, num_workers=3)

    ## ------------------------------------------------------------------------------------------
    # net dims
    cprint('c', '\nNetwork:')
    epoch = 0

    ## ------------------------------------------------------------------------------------------
    # train
    cprint('c', '\nTrain:')
    print('  init cost variables:')
    mloglike_train = np.zeros(nb_epochs)
    KL_train = np.zeros(nb_epochs)
    ELBO_train = np.zeros(nb_epochs)
    ELBO_test = np.zeros(nb_epochs)
    err_train = np.zeros(nb_epochs)
    mloglike_dev = np.zeros(nb_epochs)
    err_dev = np.zeros(nb_epochs)
    best_epoch = 0
    best_train_ELBO = -np.inf
    best_test_ELBO = -np.inf
    best_dev_ll = -np.inf

    tic0 = time.time()
    for i in range(epoch, nb_epochs):
        net.set_mode_train(True)
        tic = time.time()
        nb_samples = 0
        for x, y in trainloader:
            if flat_ims:
                x = x.view(x.shape[0], -1)

            KL, minus_loglike, err = net.fit(x, y)
            err_train[i] += err
            mloglike_train[i] += minus_loglike / len(trainloader)
            KL_train[i] += KL / len(trainloader)
            nb_samples += len(x)

        # mloglike_train[i] *= nb_samples
        # KL_train[i] *= nb_samples
        ELBO_train[i] = (-KL_train[i] - mloglike_train[i]) * nb_samples
        err_train[i] /= nb_samples

        toc = time.time()

        # ---- print
        print("it %d/%d, sample minus loglike = %f, sample KL = %.10f, err = %f, ELBO = %f" %
              (i, nb_epochs, mloglike_train[i], KL_train[i], err_train[i], ELBO_train[i]), end="")
        exp.log({
            'epoch': i,
            'MLL': mloglike_train[i],
            'KLD': KL_train[i],
            'err': err_train[i],
            'ELBO': ELBO_train[i]
        })
        cprint('r', '   time: %f seconds\n' % (toc - tic))
        net.update_lr(i, 0.1)

        # ---- dev
        if i % nb_its_dev == 0:
            tic = time.time()
            nb_samples = 0
            for j, (x, y) in enumerate(valloader):
                if flat_ims:
                    x = x.view(x.shape[0], -1)

                minus_loglike, err = net.eval(x, y)
                mloglike_dev[i] += minus_loglike / len(valloader)
                err_dev[i] += err
                nb_samples += len(x)

            ELBO_test[i] = (-KL_train[i] - mloglike_dev[i]) * nb_samples
            err_dev[i] /= nb_samples
            toc = time.time()

            cprint('g', '    sample minus loglike = %f, err = %f, ELBO = %f\n' %
                   (mloglike_dev[i], err_dev[i], ELBO_test[i]), end="")
            cprint('g', '    (prev best it = %i, sample minus loglike = %f, ELBO = %f)\n' %
                   (best_epoch, best_dev_ll, best_test_ELBO), end="")
            cprint('g', '    time: %f seconds\n' % (toc - tic))
            exp.log({
                'epoch': i,
                'MLL_val': mloglike_dev[i],
                'err_val': err_dev[i],
                'ELBO_val': ELBO_test[i]
            })

            if stop_criteria == 'test_LL' and -mloglike_dev[i] > best_dev_ll:
                best_dev_ll = -mloglike_dev[i]
                best_epoch = i
                cprint('b', 'best test loglike: %f' % best_dev_ll)
                net.save(models_dir + '/theta_best.dat')
                probs = net.model.prob_model.get_q_probs().data.cpu().numpy()
                cuttoff = np.max(probs) * 0.95
                exp.tag({
                    "q_vec": net.model.get_q_vector().cpu().detach().numpy(),
                    "q_probs": net.model.prob_model.get_q_probs().cpu().detach().numpy(),
                    "expected_depth": np.sum(probs * np.arange(net.model.n_layers + 1)),
                    "95th_depth": np.argmax(probs > cuttoff),
                    "best_epoch": best_epoch,
                    "best_dev_ll": best_dev_ll
                })

            if stop_criteria == 'test_ELBO' and ELBO_test[i] > best_test_ELBO:
                best_test_ELBO = ELBO_test[i]
                best_epoch = i
                cprint('b', 'best test ELBO: %f' % best_test_ELBO)
                net.save(models_dir + '/theta_best.dat')
                probs = net.model.prob_model.get_q_probs().data.cpu().numpy()
                cuttoff = np.max(probs) * 0.95
                exp.tag({
                    "q_vec": net.model.get_q_vector().cpu().detach().numpy(),
                    "q_probs": net.model.prob_model.get_q_probs().cpu().detach().numpy(),
                    "expected_depth": np.sum(probs * np.arange(net.model.n_layers + 1)),
                    "95th_depth": np.argmax(probs > cuttoff),
                    "best_epoch": best_epoch,
                    "best_test_ELBO": best_test_ELBO
                })

        if stop_criteria == 'train_ELBO' and ELBO_train[i] > best_train_ELBO:
            best_train_ELBO = ELBO_train[i]
            best_epoch = i
            cprint('b', 'best train ELBO: %f' % best_train_ELBO)
            net.save(models_dir + '/theta_best.dat')
            probs = net.model.prob_model.get_q_probs().data.cpu().numpy()
            cuttoff = np.max(probs) * 0.95
            exp.tag({
                "q_vec": net.model.get_q_vector().cpu().detach().numpy(),
                "q_probs": net.model.prob_model.get_q_probs().cpu().detach().numpy(),
                "expected_depth": np.sum(probs * np.arange(net.model.n_layers + 1)),
                "95th_depth": np.argmax(probs > cuttoff),
                "best_epoch": best_epoch,
                "best_train_ELBO": best_train_ELBO
            })

        if save_freq is not None and i % save_freq == 0:
            exp.tag({
                "final_q_vec": net.model.get_q_vector().cpu().detach().numpy(),
                "final_q_probs": net.model.prob_model.get_q_probs().cpu().detach().numpy(),
                "final_expected_depth": np.sum(net.model.prob_model.get_q_probs().data.cpu().numpy() *
                                               np.arange(net.model.n_layers + 1))
            })
            net.save(models_dir + '/theta_last.dat')

        if early_stop is not None and (i - best_epoch) > early_stop:
            exp.tag({"early_stop_epoch": i})
            cprint('r', '   stopped early!\n')
            break

    toc0 = time.time()
    runtime_per_it = (toc0 - tic0) / float(i + 1)
    cprint('r', '   average time: %f seconds\n' % runtime_per_it)

    ## ------------------------------------------------------------------------------------------
    # fig cost vs its
    textsize = 15
    marker = 5

    plt.figure(dpi=100)
    fig, ax1 = plt.subplots()
    ax1.plot(range(0, i, nb_its_dev), np.clip(mloglike_dev[:i:nb_its_dev], a_min=-5, a_max=5), 'b-')
    ax1.plot(np.clip(mloglike_train[:i], a_min=-5, a_max=5), 'r--')
    ax1.set_ylabel('Cross Entropy')
    plt.xlabel('epoch')
    plt.grid(b=True, which='major', color='k', linestyle='-')
    plt.grid(b=True, which='minor', color='k', linestyle='--')
    lgd = plt.legend(['test', 'train'], markerscale=marker,
                     prop={'size': textsize, 'weight': 'normal'})
    ax = plt.gca()
    plt.title('classification costs')
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(textsize)
        item.set_weight('normal')
    plt.savefig(media_dir + '/cost.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    if show:
        plt.show()

    plt.figure(dpi=100)
    fig, ax1 = plt.subplots()
    ax1.plot(range(0, i), KL_train[:i], 'b-')
    ax1.set_ylabel('KL')
    plt.xlabel('epoch')
    plt.grid(b=True, which='major', color='k', linestyle='-')
    plt.grid(b=True, which='minor', color='k', linestyle='--')
    lgd = plt.legend(['KL'], markerscale=marker,
                     prop={'size': textsize, 'weight': 'normal'})
    ax = plt.gca()
    plt.title('KL divided by number of samples')
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(textsize)
        item.set_weight('normal')
    plt.savefig(media_dir + '/KL.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    if show:
        plt.show()

    plt.figure(dpi=100)
    fig, ax1 = plt.subplots()
    ax1.plot(range(0, i), ELBO_train[:i], 'b-')
    ax1.set_ylabel('nats')
    plt.xlabel('epoch')
    plt.grid(b=True, which='major', color='k', linestyle='-')
    plt.grid(b=True, which='minor', color='k', linestyle='--')
    lgd = plt.legend(['ELBO'], markerscale=marker,
                     prop={'size': textsize, 'weight': 'normal'})
    ax = plt.gca()
    plt.title('ELBO')
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(textsize)
        item.set_weight('normal')
    plt.savefig(media_dir + '/ELBO.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    if show:
        plt.show()

    plt.figure(dpi=100)
    fig, ax2 = plt.subplots()
    ax2.set_ylabel('% error')
    ax2.semilogy(range(0, i, nb_its_dev), err_dev[:i:nb_its_dev], 'b-')
    ax2.semilogy(err_train[:i], 'r--')
    ax2.set_ylim(top=1, bottom=1e-3)
    plt.xlabel('epoch')
    plt.grid(b=True, which='major', color='k', linestyle='-')
    plt.grid(b=True, which='minor', color='k', linestyle='--')
    ax2.get_yaxis().set_minor_formatter(matplotlib.ticker.ScalarFormatter())
    ax2.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
    lgd = plt.legend(['test error', 'train error'], markerscale=marker,
                     prop={'size': textsize, 'weight': 'normal'})
    ax = plt.gca()
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(textsize)
        item.set_weight('normal')
    plt.savefig(media_dir + '/err.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    if show:
        plt.show()

    return exp, mloglike_train, KL_train, ELBO_train, err_train, mloglike_dev, err_dev