def train_regression_sequences():
    """Train an RNN on the index sequences, either seq2seq or seq2vec.

    Reads the raw data, builds the cleaned input sequences and targets,
    then trains one of two architectures selected by the ``seq2vec`` flag:
      * seq2vec: simple RNN predicting only the last year (y[:, -1])
      * seq2seq: LSTM predicting the full target sequence (default)
    Training progress is logged to TensorBoard under logs/fit_rnn/.
    """
    data = read_data()
    cleaned_sequences, y2s = build_sequences(data)
    # False for seq2seq - all years are included in y, seq2vec = only the last one
    seq2vec = False
    # Alternative target definitions kept for experimentation:
    # x_train, x_test, y_train, y_test = split_transform(only_index_2018)
    x_train, x_test, y_train, y_test = split_transform(only_index_all_years,
                                                       cleaned_sequences, y2s)
    # x_train, x_test, y_train, y_test = split_transform(only_index_all_years_multiplied_100)
    EPOCHS = 50
    # Timestamped run directory so TensorBoard runs never collide.
    log_dir = "logs/fit_rnn/" + datetime.datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          histogram_freq=1)
    if seq2vec:
        model = build_simple_rnn_seq2vector()
        # Only the final year of each target sequence is predicted.
        history_rnn = model.fit(
            x_train,
            y_train[:, -1],
            epochs=EPOCHS + 10,
            validation_data=(x_test, y_test[:, -1]),
            callbacks=[checkpoint(False, seq=True), tensorboard_callback])
    else:
        model = build_lstm_seq2seq()
        # Full sequence-to-sequence training on all years.
        history_seq2seq_rnn = model.fit(
            x_train,
            y_train,
            epochs=80,
            validation_data=(x_test, y_test),
            verbose=1,
            shuffle=True,
            callbacks=[checkpoint(False, seq=True), tensorboard_callback])
def train(generator,
          discriminator,
          discriminator_loss=None,
          generator_loss=None,
          gen_opt=None,
          dis_opt=None,
          device='cpu',
          ini_epoch=0,
          max_epoch=1000,
          it_val=100):
    """Adversarial training loop for a conditional GAN (pix2pix-style).

    Args:
        generator: generator network mapping input image -> output image.
        discriminator: discriminator taking (image, conditioning input).
        discriminator_loss: callable(out_trg_discr, out_gen_discr) -> loss.
            NOTE(review): defaults to None but is called unconditionally —
            passing no loss/optimizer raises TypeError; presumably callers
            always supply them.
        generator_loss: callable(out_gen_discr, output_image, targ) -> loss.
        gen_opt / dis_opt: optimizers for generator / discriminator.
        device: torch device string to move batches to.
        ini_epoch: first epoch index (supports resuming).
        max_epoch: train until this epoch (exclusive).
        it_val: checkpoint + validation-image dump every this many epochs.

    Uses module-level ``train_loader`` and ``validation_loader``.
    """
    for epoch in range(ini_epoch, max_epoch):
        generator.train()
        discriminator.train()
        for i, (inp, targ) in enumerate(train_loader):
            inp, targ = inp.to(device), targ.to(device)
            output_image = generator(inp)
            # Discriminator sees both the generated and the real image,
            # each conditioned on the input.
            out_gen_discr = discriminator(output_image, inp)
            out_trg_discr = discriminator(targ, inp)
            discr_loss = discriminator_loss(out_trg_discr, out_gen_discr)
            gen_loss = generator_loss(out_gen_discr, output_image, targ)
            gen_opt.zero_grad()
            dis_opt.zero_grad()
            # retain_graph=True keeps the graph alive so gen_loss.backward()
            # can reuse the generator forward pass.
            # NOTE(review): discr_loss.backward() also deposits gradients into
            # the generator (via output_image) before gen_opt.step(); confirm
            # this mixing is intended — pix2pix usually detaches the fake
            # image for the discriminator update.
            discr_loss.backward(retain_graph=True)
            gen_loss.backward()
            gen_opt.step()
            dis_opt.step()
        if epoch % it_val == 0:
            checkpoint('./', generator, discriminator, epoch)
            generator.eval()
            # Evaluate some images.
            fig, ax = plt.subplots(4, 3)
            inp, targ = next(iter(validation_loader))
            inp, targ = inp.to(device), targ.to(device)
            output_image = generator(inp)
            # Columns: input | generated | target, for 4 validation samples.
            for i in range(4):
                ax[i, 0].imshow(inp[i].data.cpu().numpy().transpose(1, 2, 0))
                ax[i, 1].imshow(output_image[i].data.cpu().numpy().transpose(
                    1, 2, 0))
                ax[i, 2].imshow(targ[i].data.cpu().numpy().transpose(1, 2, 0))
            fig.savefig('val_{}.png'.format(epoch))
    # Save the last state of the network.
    checkpoint('./', generator, discriminator, epoch)
def train_regression():
    """Train the feed-forward regression model on one- or two-year features.

    Loads data via ``get_data``, builds a model sized to the feature count
    (9 inputs for one year, 18 for two), and fits it with TensorBoard and
    checkpoint callbacks. Logs go to logs/fit_regression/<timestamp>.
    """
    oneyear = True  # when False, two years are considered
    # Renamed from `type` to avoid shadowing the builtin.
    data_type = 0  # 0 selects the regression interpretation in get_data
    x_train, x_test, y_train, y_test = get_data(data_type, oneyear)
    # Input width: 9 features per year, 18 when two years are concatenated.
    if oneyear:
        model = build_model(9)
    else:
        model = build_model(18)
    model.summary()
    # Timestamped run directory so TensorBoard runs never collide.
    log_dir = "logs/fit_regression/" + datetime.datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          histogram_freq=1)
    # The last input column is dropped; column 0 of y is the regression target.
    history = model.fit(x_train[:, :-1],
                        y_train[:, 0],
                        epochs=90,
                        batch_size=20,
                        verbose=2,
                        validation_data=(x_test[:, :-1], y_test[:, 0]),
                        callbacks=[checkpoint(False), tensorboard_callback])
def train_classification():
    """Train the feed-forward classification model on one- or two-year features.

    Loads data via ``get_data``, builds a classification model sized to the
    feature count (9 inputs for one year, 18 for two), and fits it with
    TensorBoard and checkpoint callbacks. Logs go to
    logs/fit_classification/<timestamp>.
    """
    oneyear = False  # two years are considered
    # Renamed from `type` to avoid shadowing the builtin.
    # 1 and 2 - classification with/without threshold for increase/decrease
    label_type = 2
    x_train, x_test, y_train, y_test = get_data(label_type, oneyear)
    # Input width: 9 features per year, 18 when two years are concatenated.
    if oneyear:
        model = build_model_classification(9)
    else:
        model = build_model_classification(18)
    model.summary()
    # Timestamped run directory so TensorBoard runs never collide.
    log_dir = "logs/fit_classification/" + datetime.datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          histogram_freq=1)
    # The value of label_type determines which interpretation of the index is
    # considered (it selects the target column of y).
    history_classification = model.fit(
        x_train[:, :-1],
        y_train[:, label_type],
        epochs=250,
        verbose=2,
        validation_data=(x_test[:, :-1], y_test[:, label_type]),
        callbacks=[checkpoint(True), tensorboard_callback])
# Data pipeline: shuffle only the training split.
train_loader = DataLoader(train_data,
                          batch_size=BATCH_SIZE,
                          num_workers=NUM_WORKERS,
                          shuffle=True)
val_loader = DataLoader(val_data,
                        batch_size=BATCH_SIZE,
                        num_workers=NUM_WORKERS)

# Model, loss and optimizer setup (CUDA assumed available).
device = torch.device('cuda')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

train_losses = []
val_losses = []
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader, criterion, optimizer,
                             device)
    train_losses.append(train_loss)
    report = [f'Epoch: {epoch}\tTrainLoss: {train_loss}']
    # only run validation epoch if the validation dataset is not empty
    if len(val_data) > 0:
        val_loss, val_acc = val_epoch(model, val_loader, criterion, device)
        val_losses.append(val_loss)
        report.append(f'\tValLoss: {val_loss}\tValAcc: {val_acc}')
    print(''.join(report))
    # Periodically persist the model weights.
    if epoch % CHECKPOINT_RATE == 0:
        print('Checkpointing model...')
        checkpoint(model, os.path.join(MODEL_DIR, f'fer_model_{epoch}.pt'))
# Report validation metrics and write TensorBoard scalar summaries.
print('[%s]. WER: %.3f, CER: %.3f, KS: %s, KE: %s' % info)
writer = summary_info['writer']
# FIX: dict.iteritems() is Python 2 only and raises AttributeError on
# Python 3 — use items() instead.
for key, value in summary_info['measures'].items():
    # Skip aggregate ('all') measures; only per-category scalars are logged.
    if 'all' not in key:
        s = summary.scalar(key, value)
        writer.add_summary(s, global_step=global_step)
# save best model
if val_score > val_score_best:
    val_score_best = val_score
    global_step = len(loader_train) * (epoch - 1) + step
    sdir = "logdir/%s/models/best" % args.model_dir
    mkpath(sdir)
    fname_model = 'gazeNET_%04d_%08d_K%.4f.pth.tar' % (epoch, global_step,
                                                       val_score)
    file_path = '%s/%s' % (sdir, fname_model)
    torch.save(model_func.checkpoint(model, step, epoch), file_path)
# switch back to train mode
model.train()
#%% on epoch done
# save model
#global_step = len(train_loader)*(epoch-1) + step
#config = model_func.save(model, args.model_dir, epoch, global_step, config)
#config['learning_rate'] = config['learning_rate']/config['learning_rate_anneal']
#model_func.anneal_learning_rate(optimizer, config['learning_rate'])
#configuration.save_params(config)
def train_model(
    model,
    criterion,
    optimizer,
    LR,
    scheduler,
    num_epochs,
    dataloaders,
    dataset_sizes,
    PATH_TO_IMAGES,
    data_transforms,
    opt,
):
    """
    Fine tunes torchvision model to NIH CXR data.

    Args:
        model: torchvision model to be finetuned (densenet-121 in this case)
        criterion: loss criterion (binary cross entropy loss, BCELoss)
        optimizer: optimizer to use in training (SGD)
        LR: learning rate
        num_epochs: continue training up to this many epochs
        dataloaders: pytorch train and val dataloaders
        dataset_sizes: length of train and val datasets
        weight_decay: weight decay parameter we use in SGD with momentum
    Returns:
        model: trained torchvision model
        best_epoch: epoch on which best model val loss was obtained
    """
    since = time.time()
    start_epoch = 1
    best_auc = -1
    best_epoch = -1
    last_train_loss = -1

    # iterate over epochs
    for epoch in range(start_epoch, num_epochs + 1):
        print('Epoch {}/{}(max)'.format(epoch, num_epochs))
        print('-' * 10)

        # set model to train or eval mode based on whether we are in train or val
        # necessary to get correct predictions given batchnorm
        for phase in ['train', 'val']:
            print('Epoch %03d, ' % epoch, phase)
            if phase == 'train':
                model.train(True)
            else:
                model.train(False)
            running_loss = 0.0
            i = 0
            total_done = 0

            # iterate over all data in train/val dataloader:
            data_length = len(dataloaders[phase])
            for data_idx, data in enumerate(dataloaders[phase]):
                inputs, labels, _ = data
                batch_size = inputs.shape[0]
                if phase == 'val':
                    # Validation forward pass under no_grad to skip autograd
                    # bookkeeping.
                    with torch.no_grad():
                        inputs = inputs.cuda(opt.gpu_ids[0])
                        labels = labels.cuda(opt.gpu_ids[0]).float()
                        outputs = model(inputs)
                        if isinstance(outputs, tuple):
                            # has dot product
                            outputs, dp = outputs
                        else:
                            dp = None
                        # calculate gradient and update parameters in train phase
                        optimizer.zero_grad()
                        loss = criterion(outputs, labels)
                else:
                    inputs = inputs.cuda(opt.gpu_ids[0])
                    labels = labels.cuda(opt.gpu_ids[0]).float()
                    outputs = model(inputs)
                    if isinstance(outputs, tuple):
                        # has dot product
                        outputs, dp = outputs
                    else:
                        dp = None
                    # calculate gradient and update parameters in train phase
                    optimizer.zero_grad()
                    loss = criterion(outputs, labels)
                # Orthogonality penalty on the dot-product head, when present.
                # NOTE(review): source formatting was mangled — this block may
                # originally have been inside the train-only branch; confirm
                # whether val loss should include dp_loss.
                if dp is not None:
                    dp_loss = opt.orth_loss_lambda * torch.abs(dp.mean())
                    loss = loss + dp_loss
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
                    # Periodic scalar logging to wandb.
                    if data_idx % 20 == 0:
                        wandb.log({
                            'epoch':
                            epoch + data_idx / float(len(dataloaders[phase])),
                            'loss':
                            loss.cpu(),
                            'lr':
                            list(optimizer.param_groups)[0]['lr']
                        })
                # Log the first batch's images once per phase per epoch.
                if data_idx == 0:
                    log_images = []
                    for image in list(inputs[:10].cpu()):
                        log_images.append(wandb.Image(
                            np.transpose(image.numpy(), (1, 2, 0)),
                            caption='{}_image'.format(phase)
                        ))
                    wandb.log({'{}_image'.format(phase): log_images})
                running_loss += loss.data.item() * batch_size
                if data_idx % 100 == 0:
                    print("{} / {} ".format(data_idx, data_length),
                          end="\r",
                          flush=True)

            epoch_loss = running_loss / dataset_sizes[phase]
            if phase == 'train':
                last_train_loss = epoch_loss

            print(phase + ' epoch {}:loss {:.4f} with data size {}'.format(
                epoch, epoch_loss, dataset_sizes[phase]))

            # decay learning rate if no val loss improvement in this epoch
            if phase == 'val':
                pred, auc = E.make_pred_multilabel(
                    data_transforms,
                    model,
                    PATH_TO_IMAGES,
                    fold="val",
                    opt=opt,
                )
                wandb.log({
                    'epoch': epoch + 1,
                    'performance': np.average(list(auc.auc))
                })
                epoch_auc = np.average(list(auc.auc))
                # ReduceLROnPlateau-style scheduler keyed on val AUC.
                scheduler.step(epoch_auc)

            # checkpoint model if phase == 'val' and epoch_auc > best_auc:
            if phase == 'val' and epoch_auc > best_auc:
                # best_loss = epoch_loss
                best_auc = epoch_auc
                best_epoch = epoch
                checkpoint(model, best_auc, epoch, LR, opt)

            # log training and validation loss over each epoch
            if phase == 'val':
                with open(os.path.join(opt.run_path, "log_train"),
                          'a') as logfile:
                    logwriter = csv.writer(logfile, delimiter=',')
                    if (epoch == 1):
                        logwriter.writerow(["epoch", "train_loss", "val_loss"])
                    logwriter.writerow([epoch, last_train_loss, epoch_loss])

            # NOTE(review): this uses the batch_size of the LAST batch and sits
            # outside the data loop in the recovered formatting — presumably it
            # belonged inside the batch loop; confirm against upstream.
            total_done += batch_size
            if (total_done % (100 * batch_size) == 0):
                print("completed " + str(total_done) + " so far in epoch")

        # break if no val loss improvement in 3 epochs
        # (i.e. once the LR has been dropped opt.num_lr_drops times)
        if np.round(list(optimizer.param_groups)[0]['lr'], 5) <= np.round(
                LR * (opt.lr_decay_ratio ** opt.num_lr_drops), 5):
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model weights to return
    checkpoint_best = torch.load(os.path.join(opt.run_path, 'checkpoint'))
    model = load_model(N_LABELS=14, opt=opt)
    model.load_state_dict(checkpoint_best['state_dict'])
    return model, best_epoch
# only allow the first convolution layer in the gray model to train for param in gray_model.parameters(): param.requires_grad = False gray_model.conv1.weight.requires_grad = True # prepare each model for the fine-tuning torch_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') color_model = color_model.to(torch_device) gray_model = gray_model.to(torch_device) criterion = nn.MSELoss() optimizer = optim.Adam(gray_model.parameters(), lr=learning_rate) train_losses = [] val_losses = [] for epoch in range(NUM_EPOCHS): if epoch % DECAY_RATE == 0: learning_rate *= LR_DECAY optimizer = optim.Adam(gray_model.parameters(), lr=learning_rate) print(f'New learning rate: {learning_rate}') train_loss = train_epoch_gray(gray_model, color_model, train_loader, criterion, optimizer, torch_device) val_loss = val_epoch_gray(gray_model, color_model, val_loader, criterion, torch_device) train_losses.append(train_loss) val_losses.append(val_loss) print(f'Epoch: {epoch}\tTrainLoss: {train_loss}\tValLoss: {val_loss}') if epoch % CHECKPOINT_RATE == 0: print('Checkpointing model...') checkpoint(gray_model, os.path.join(MODEL_DIR, f'gray_{epoch}.pt'))