def train(model, loss_fn, optimizer, sampler, val_sampler=None, last_iter=0,
          train_writer=None, val_writer=None, monitor=None, **params):
    """ Generalized training function """

    assert params_defined(params), "Params under-specified"

    if monitor is None:
        monitor = utils.LearningMonitor()

    # Determine the names of inputs, labels, masks
    sample_spec = utils.SampleSpec(sampler().keys())
    mask_names = sample_spec.get_masks()

    print("======= BEGIN TRAINING LOOP ========")
    for i in range(last_iter, params['max_iter']):
        start = time.time()

        # Make sure no mask is empty (data for all tasks)
        sample = fetch_nonempty_sample(sampler, mask_names, params['batch_size'])
        inputs, labels, masks = group_sample(sample, sample_spec, "train")

        # Run the forward pass
        preds = model(*inputs)
        losses, nmsks = eval_error(preds, labels, masks, loss_fn, sample_spec)

        update_model(optimizer, losses)
        log_errors(monitor, losses, nmsks, i)

        # Elapsed time
        elapsed = time.time() - start
        log_elapsed_time(monitor, elapsed, i, "train")

        if val_sampler is not None and i % params["test_intv"] == 0:
            run_validation(model, val_sampler, params["test_iter"], loss_fn,
                           sample_spec, monitor, val_writer, i)

        if i % params["avgs_intv"] == 0 or i < last_iter + params["warm_up"] - 1:
            monitor.compute_avgs(i, "train")

            # Display stats (both to console and TensorBoard)
            avg_losses = {k: round(monitor.get_last_value(k, "train"), 5)
                          for k in losses.keys()}
            avg_time = round(monitor.get_last_value("iter_time", "train"), 5)

            write_averages(train_writer, avg_losses, avg_time, i)
            print("iter: {}; avg losses = {} (iter_time = {} s on avg)".format(
                i, avg_losses, avg_time))

        if i % params["chkpt_intv"] == 0 and i != last_iter:
            print("SAVE CHECKPOINT: {} iters.".format(i))
            utils.save_chkpt(model, monitor, i, params["model_dir"],
                             params["log_dir"])
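# `params_defined` is provided elsewhere in this codebase; below is a minimal
# sketch, assuming it only checks that every key the loop reads was passed as a
# keyword argument. The key list is taken from the loop above; the helper body
# itself is an assumption, not the original implementation.
REQUIRED_PARAMS = ("max_iter", "batch_size", "test_intv", "test_iter",
                   "avgs_intv", "warm_up", "chkpt_intv", "model_dir", "log_dir")

def params_defined(params):
    # True only if all required training parameters are present.
    return all(k in params for k in REQUIRED_PARAMS)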
def _train_SimSiam(device, trainloader, bankloader, queryloader,
                   model, optimizer, scheduler, num_epochs, base_dir,
                   saved_epoch=0, loss_hist=[], knn_hist=[], best_acc=80):
    model.to(device)
    scaler = torch.cuda.amp.GradScaler()
    start_time = time.time()

    for epoch in range(saved_epoch, num_epochs):
        model.train()
        epoch_time = time.time()

        for b, (x1, x2, label, _) in enumerate(trainloader):
            x1 = x1.to(device)
            x2 = x2.to(device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                z1, z2, p1, p2 = model(x1, x2)
                loss = D(p1, z2) / 2 + D(p2, z1) / 2
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            if b % 40 == 0:
                loss_hist.append(loss.item())
                now = datetime.datetime.now()
                print("epoch:{} batch:{} loss:{:.4f} lr:{:.2e} time:{}".format(
                    epoch, b, loss.item(), get_lr(optimizer),
                    now.strftime('%Y-%m-%d %H:%M:%S')))

        # Adjust lr by cosine annealing
        scheduler.step()

        correct, total = knn_monitor(device, model, bankloader, queryloader)
        accuracy = correct / total * 100
        knn_hist.append(accuracy)
        print("epoch:{} knn_acc:{:.2f}% ({}/{}) epoch_time:{:.2f}\n".format(
            epoch, accuracy, correct, total, time.time() - epoch_time))

        if accuracy > best_acc:
            # Relative filename; base_dir is prepended exactly once below
            save_name = "SimSiam_{}.chkpt".format(int(accuracy * 100))
            save_chkpt(model, optimizer, scheduler, epoch, loss_hist, knn_hist,
                       base_dir + save_name)
            best_acc = accuracy
            print("---------------- model saved: {:.2f}% ----------------".format(accuracy))

    print("training finished!\ntotal time:{:.2f}, best_acc:{:.2f}".format(
        time.time() - start_time, best_acc))
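# The SimSiam loop above relies on helpers that are not shown here (`D`,
# `get_lr`, `knn_monitor`, `save_chkpt`). As a minimal sketch, `D` is typically
# the negative cosine similarity with a stop-gradient on the target branch, and
# `get_lr` reads the current learning rate from the optimizer; both bodies
# below are assumptions about those helpers, not the original code.
import torch
import torch.nn.functional as F

def D(p, z):
    # Negative cosine similarity; z is detached to apply SimSiam's stop-gradient.
    return -F.cosine_similarity(p, z.detach(), dim=-1).mean()

def get_lr(optimizer):
    # Learning rate of the first parameter group (sufficient for logging).
    return optimizer.param_groups[0]['lr']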
def train(model, loss_fn, optimizer, sampler, val_sampler=None, last_iter=0,
          monitor=None, **params):
    """ Generalized training fn """

    assert params_defined(params), "Params under-specified"

    if monitor is None:
        monitor = utils.LearningMonitor()

    # Determine the names of inputs, labels, masks
    sample_spec = utils.SampleSpec(sampler.get().keys())
    mask_names = sample_spec.get_masks()

    start = time.time()
    print("======= BEGIN TRAINING LOOP ========")
    for i in range(last_iter, params['max_iter']):

        # Make sure no mask is empty (data for all tasks)
        sample = fetch_nonempty_sample(sampler, mask_names, params['batch_size'])
        inputs, labels, masks = make_variables(sample, sample_spec, "train")

        # Run the forward pass
        preds = model(*inputs)
        if params["resize"] != 1:
            print("Type of Preds[0]:", type(preds[0]))
            #preds = misc.imresize(preds, 1.0*params["resize"], interp="bilinear")

        losses, nmsks = eval_error(preds, labels, masks, loss_fn, sample_spec)

        update_model(optimizer, losses)
        log_errors(monitor, losses, nmsks)

        # Elapsed time
        elapsed = time.time() - start
        log_elapsed_time(monitor, elapsed, "train")
        start = time.time()

        if val_sampler is not None and i % params["test_intv"] == 0:
            run_validation(model, val_sampler, params["test_iter"], loss_fn,
                           sample_spec, monitor, i)
            start = time.time()  # ignore validation time

        if i % params["avgs_intv"] == 0 or i < last_iter + params["warm_up"] - 1:
            monitor.compute_avgs(i, "train")

            # Display stats
            avg_losses = {k: round(monitor.get_last_value(k, "train"), 5)
                          for k in losses.keys()}
            avg_time = round(monitor.get_last_value("iter_time", "train"), 5)
            print("iter: {}; avg losses = {} (iter_time = {} s on avg)".format(
                i, avg_losses, avg_time))

        if i % params["chkpt_intv"] == 0 and i != last_iter:
            print("SAVE CHECKPOINT: {} iters.".format(i))
            utils.save_chkpt(model, monitor, i, params["model_dir"],
                             params["log_dir"])
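# `fetch_nonempty_sample` is shared by both training functions above but is not
# shown here. A rough sketch of the behavior implied by the "make sure no mask
# is empty" comment: keep drawing samples until every task mask has at least
# one active element. The `sampler.get()` call mirrors this variant's
# interface; the rest (including ignoring `batch_size`) is an assumption.
def fetch_nonempty_sample(sampler, mask_names, batch_size):
    # Draw candidate samples until every task mask is non-empty.
    while True:
        sample = sampler.get()  # one sample; batching/stacking omitted here
        if all(sample[name].sum() > 0 for name in mask_names):
            return sample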
                              weight_decay=args.weightDecay)

# Learning
for epoch_num in range(args.initEpochNum, args.initEpochNum + args.nEpochs):
    trn_metrics = runModel(trn_data_gen, model, optimizer, class_wts, 'trn',
                           args.batchSize, trn_num_batches, loss_wts=loss_wts)
    utils.log_metrics(epoch_num, trn_metrics, 'trn', log_file, args.savename)
    torch.save(model.state_dict(), args.savename + '.pt')

    val_metrics = runModel(val_data_gen, model, optimizer, class_wts, 'val',
                           args.batchSize, val_num_batches, None)
    utils.log_metrics(epoch_num, val_metrics, 'val', log_file, args.savename)
    if best_val_record and val_metrics.AUROC > best_val:
        best_val = utils.save_chkpt(best_val_record, best_val, val_metrics,
                                    model, args.savename)

tst_metrics = runModel(tst_data_gen, model, optimizer, class_wts, 'tst',
                       args.batchSize, tst_num_batches, None)
utils.log_metrics(0, tst_metrics, 'tst', log_file, args.savename)

# val_aggregator = Aggregator('val', task, val_data_loader)
# val_aggregator.aggregate()
# tst_aggregator = Aggregator('tst', task, tst_data_loader)
# tst_aggregator.aggregate()