import os
import pickle
from datetime import datetime

import torch
from torch.utils.tensorboard import SummaryWriter


class Analyser:
    def __init__(self, **kwargs):
        self.options = kwargs
        LOGS_DIR_TIMESTAMP = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.options["logdir"] = os.path.join(
            (self.options["logdir"] if "logdir" in self.options else "results"),
            LOGS_DIR_TIMESTAMP +
            (self.options["comment"] if "comment" in self.options else ""))
        if not os.path.exists(self.options["logdir"]):
            os.makedirs(self.options["logdir"])
        self.writer = SummaryWriter(
            log_dir=self.options["logdir"],
            flush_secs=self.options["flush_secs"] if "flush_secs" in self.options else 120)
        self.scalars = {}

    def refresh(self):
        self.scalars = {}

    def add_to_scalars(self, **kwargs):
        for k in kwargs.keys():
            self.scalars[k] = (self.scalars[k] + kwargs[k]) if k in self.scalars else kwargs[k]

    def write_scalars(self, epoch):
        for k in self.scalars.keys():
            self.writer.add_scalar(k, self.scalars[k], epoch)

    def save_weights(self, model, epoch):
        torch.save(
            model.state_dict(),
            self.options["logdir"] + "/model_weights_" + str(epoch) + ".pk")

    def update_str(self, epoch, epoch_time=None):
        losses_str = (": {:4.5f}\t\t".join(self.scalars.keys()) +
                      ": {:4.5f}").format(*[x for x in self.scalars.values()])
        epoch_str = "Epoch:{:3d}\t[time={:3.2f}s]\t\t".format(
            epoch, epoch_time) if epoch_time is not None else "Epoch:{:3d}\t\t".format(epoch)
        return epoch_str + losses_str

    def write_scalars_to_file(self, epoch, filename=None):
        filename = "results" if filename is None else filename
        with open(self.writer.get_logdir() + "/" + filename + ".txt", 'a') as f:
            print(self.update_str(epoch), file=f)
        try:
            with open(self.writer.get_logdir() + "/" + filename + ".pk", 'rb') as f:
                current_dict: dict = pickle.load(f)
            current_dict.update({epoch: self.scalars})
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):
            # First write (or unreadable file): start a fresh history dict.
            current_dict = {epoch: self.scalars}
        finally:
            with open(self.writer.get_logdir() + "/" + filename + ".pk", 'wb') as f:
                pickle.dump(current_dict, f)
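A minimal usage sketch for the Analyser above; logdir, comment, and flush_secs are the only kwargs the constructor inspects, and the linear model is a stand-in:

import torch.nn as nn

analyser = Analyser(logdir="results", comment="_baseline", flush_secs=60)
model = nn.Linear(4, 2)  # stand-in for a real model

for epoch in range(3):
    analyser.refresh()                                        # clear running sums
    analyser.add_to_scalars(loss=0.5 / (epoch + 1), acc=0.8)  # accumulate per-batch values
    analyser.write_scalars(epoch)                             # to TensorBoard
    analyser.write_scalars_to_file(epoch)                     # to results.txt / results.pk
    print(analyser.update_str(epoch))
analyser.save_weights(model, epoch)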
def main(opt):
    writer = SummaryWriter()
    log_dir = writer.get_logdir()
    os.makedirs(os.path.join(log_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(log_dir, "test"), exist_ok=True)
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Initialize generator and discriminator
    generator = UNet(opt.sample_num, opt.channels, opt.batch_size, opt.alpha)
    discriminator = Discriminator(opt.batch_size, opt.alpha)
    generator.to(device=device)
    discriminator.to(device=device)

    # Optimizers
    optimizer_G = torch.optim.Adam(generator.parameters(),
                                   lr=opt.lr_g,
                                   betas=(opt.b1, opt.b2))
    optimizer_D = torch.optim.Adam(discriminator.parameters(),
                                   lr=opt.lr_d,
                                   betas=(opt.b1, opt.b2))

    if opt.mode == 'train':
        generator = train(writer, log_dir, device, generator, discriminator,
                          optimizer_G, optimizer_D, opt)
        test(opt, log_dir, generator=generator)
    if opt.mode == 'test':
        test(opt, log_dir)
        test_moving(opt, log_dir)
def vedo_data(writer: SummaryWriter, image_densities, image_samples,
              image_warps, epoch, image_idx, max_number_saved_points=1000):
    logdir = os.path.join(writer.get_logdir(), "vedo_data")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    if len(image_densities) < max_number_saved_points:
        max_number_saved_points = len(image_densities)

    # Subsample points with probability proportional to their density.
    if image_densities.sum() == 0:
        indices_densities = np.arange(len(image_densities))
    else:
        densities_distribution = image_densities / image_densities.sum()
        indices_densities = np.random.choice(np.arange(len(image_densities)),
                                             max_number_saved_points,
                                             p=densities_distribution)
    image_densities = image_densities[indices_densities]
    samples_densities = image_samples[indices_densities]

    if image_warps is not None:
        # Subsample warps with probability growing exponentially in the warp
        # magnitude, so large deformations are kept preferentially.
        warp_magnitude = np.linalg.norm(image_warps, axis=-1)
        if warp_magnitude.sum() == 0:
            indices_warps = np.arange(max_number_saved_points)
        else:
            warp_magnitude_exp = np.exp(10 * warp_magnitude)
            warp_distribution = warp_magnitude_exp / (warp_magnitude_exp.sum(axis=-1))
            indices_warps = np.random.choice(np.arange(len(image_warps)),
                                             max_number_saved_points,
                                             p=warp_distribution)
        image_warps = image_warps[indices_warps]
        samples_warps = image_samples[indices_warps]
    else:
        image_warps = []
        samples_warps = []

    np.savez(os.path.join(
        logdir, "densities_samples_warps_epoch_{}_image_{}".format(
            epoch, image_idx)) + '.npz',
        densities=image_densities,
        samples_density=samples_densities,
        samples_warp=samples_warps,
        warps=image_warps)
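The .npz file written by vedo_data can be read back for point-cloud inspection; a small sketch, with the run directory and epoch/image indices as placeholders:

import numpy as np

data = np.load("runs/exp/vedo_data/densities_samples_warps_epoch_0_image_0.npz")
densities = data["densities"]        # sampled density values
points = data["samples_density"]     # their sample coordinates
warps = data["warps"]                # magnitude-weighted subsample of warp vectors
print(densities.shape, points.shape, warps.shape)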
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    # Opening YAML cfg config file
    with open(args.cfg_file, 'r') as stream:
        try:
            cfg_file = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

    # Retrieving cfg
    train_cfg = cfg_file['training']
    model_cfg = cfg_file['model']
    data_cfg = cfg_file['dataset']

    # Setting device
    device = torch.device(model_cfg['device'])

    # Not possible to set checkpoint and pre-trained model at the same time
    if train_cfg['checkpoint'] and train_cfg['pretrained_model']:
        print("You can't set checkpoint and pretrained-model at the same time")
        exit(1)

    # Creating tensorboard writer
    if train_cfg['checkpoint']:
        checkpoint = torch.load(train_cfg['checkpoint'])
        writer = SummaryWriter(log_dir=checkpoint['tensorboard_working_dir'])
    else:
        writer = SummaryWriter(comment="_" + train_cfg['tensorboard_filename'])

    # Saving cfg file in the same folder
    copyfile(
        args.cfg_file,
        os.path.join(writer.get_logdir(), os.path.basename(args.cfg_file)))

    #######################
    # Creating model
    #######################
    print("Creating model")
    load_custom_model = False
    if train_cfg['checkpoint'] or train_cfg['pretrained_model']:
        load_custom_model = True
    model, backbone = get_model_detection(num_classes=1,
                                          cfg=model_cfg,
                                          load_custom_model=load_custom_model)

    # Putting model to device and setting eval mode
    model.to(device)
    model.train()

    # Freeze the backbone parameters, if needed
    if backbone is not None and model_cfg['freeze_backbone']:
        for param in backbone.parameters():
            param.requires_grad = False
        print('Backbone is frozen!')

    #####################################
    # Creating datasets and dataloaders
    #####################################
    data_root = data_cfg['root']

    ################################
    # Creating training datasets and dataloaders
    print("Loading training data")
    train_datasets_names = data_cfg['train']
    if train_cfg['mixed_batch']:
        assert train_cfg['tgt_images_in_batch'] > 0, \
            "Using mixed training. You need to specify the tgt_images_in_batch parameter!"
        assert len(train_datasets_names) == 2, \
            "Using mixed training, you need to specify two datasets, " \
            "the first one as the source while the second as the target"
        source_dataset = CustomYoloAnnotatedDataset(
            data_root, {
                list(train_datasets_names.keys())[0]:
                list(train_datasets_names.values())[0]
            },
            transforms=get_transform(train=True),
            phase='train')
        target_dataset = CustomYoloAnnotatedDataset(
            data_root, {
                list(train_datasets_names.keys())[1]:
                list(train_datasets_names.values())[1]
            },
            transforms=get_transform(train=True),
            phase='train')
        train_dataset = DatasetsEnsemble(source_dataset=source_dataset,
                                         target_dataset=target_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=train_dataset.source_dataset.standard_collate_fn,
            num_workers=train_cfg['num_workers'],
            batch_sampler=EnsembleBatchSampler(
                train_dataset,
                batch_size=train_cfg['batch_size'],
                shuffle=True,
                tgt_imgs_in_batch=train_cfg['tgt_images_in_batch']))
        print('Using mixed training datasets. Source: {}, Target: {}. '
              'In every batch, {}/{} are from {}'.format(
                  list(train_datasets_names.keys())[0],
                  list(train_datasets_names.keys())[1],
                  train_cfg['tgt_images_in_batch'], train_cfg['batch_size'],
                  list(train_datasets_names.keys())[1]))
    else:
        train_dataset = CustomYoloAnnotatedDataset(
            data_root,
            train_datasets_names,
            transforms=get_transform(train=True),
            phase='train')
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=train_cfg['batch_size'],
            shuffle=False,
            num_workers=train_cfg['num_workers'],
            collate_fn=train_dataset.standard_collate_fn)

    ###############################
    # Creating validation datasets
    print("Loading validation data")
    val_datasets_names = data_cfg['val']

    # Creating dataset(s) and dataloader(s)
    val_dataloaders = dict()
    best_validation_ap = defaultdict(float)
    for dataset_name, dataset_cfg in val_datasets_names.items():
        val_dataset = CustomYoloAnnotatedDataset(
            data_root, {dataset_name: dataset_cfg},
            transforms=get_transform(),
            phase="val",
            percentage=train_cfg["percentage_val"])
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=train_cfg['batch_size'],
                                    shuffle=False,
                                    num_workers=train_cfg['num_workers'],
                                    collate_fn=val_dataset.standard_collate_fn)
        # Adding created dataloader
        val_dataloaders[dataset_name] = val_dataloader
        # Initializing best validation ap value
        best_validation_ap[dataset_name] = 0.0

    #######################################
    # Defining optimizer and LR scheduler
    #######################################

    ##########################
    # Constructing an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=train_cfg['lr'],
        momentum=train_cfg['momentum'],
        weight_decay=train_cfg['weight_decay'],
    )

    # and a learning rate scheduler
    if model_cfg['coco_model_pretrained']:
        lr_step_size = min(25000, len(train_dataset))
    else:
        lr_step_size = min(40000, 2 * len(train_dataset))
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=lr_step_size,
                                                   gamma=train_cfg['lr_gamma'])

    # Defining a warm-up lr scheduler
    warmup_iters = min(1000, len(train_dataloader) - 1)
    warmup_factor = 1. / 1000
    warmup_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                    warmup_factor)

    #############################
    # Resuming a model
    #############################
    start_epoch = 0
    train_step = -1

    # Eventually resuming a pre-trained model
    if train_cfg['pretrained_model']:
        print("Resuming pre-trained model")
        if train_cfg['pretrained_model'].startswith('http://') or \
                train_cfg['pretrained_model'].startswith('https://'):
            pre_trained_model = torch.hub.load_state_dict_from_url(
                train_cfg['pretrained_model'],
                map_location='cpu',
                model_dir=model_cfg["cache_folder"])
        else:
            pre_trained_model = torch.load(train_cfg['pretrained_model'],
                                           map_location='cpu')
        model.load_state_dict(pre_trained_model['model'])

    # Eventually resuming from a saved checkpoint
    if train_cfg['checkpoint']:
        print("Resuming from a checkpoint")
        checkpoint = torch.load(train_cfg['checkpoint'])
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        warmup_lr_scheduler.load_state_dict(checkpoint['warmup_lr_scheduler'])
        start_epoch = checkpoint['epoch']
        train_step = checkpoint['iteration']
        for elem_name, elem in checkpoint.items():
            if elem_name.startswith("best_"):
                d_name = elem_name.split("_")[1]
                if d_name in best_validation_ap:
                    best_validation_ap[d_name] = elem
                else:
                    warnings.warn(
                        "The dataset {} was not used in the previous training"
                        .format(d_name))
                    best_validation_ap[d_name] = 0.0

    ################
    ################
    # Training
    print("Start training")
    for epoch in range(start_epoch, train_cfg['epochs']):
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)
        for images, targets in metric_logger.log_every(
                train_dataloader,
                print_freq=train_cfg['print_freq'],
                header=header):
            train_step += 1
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                for target in targets:
                    image_id = target['image_id'].item()
                    print(train_dataset.images[image_id])
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            # clip norm
            torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
            optimizer.step()

            if epoch == 0 and train_step < warmup_iters:
                warmup_lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if train_step % train_cfg['log_loss'] == 0:
                writer.add_scalar('Training/Learning Rate',
                                  optimizer.param_groups[0]["lr"], train_step)
                writer.add_scalar('Training/Reduced Sum Losses',
                                  losses_reduced, train_step)
                writer.add_scalars('Training/All Losses', loss_dict, train_step)

            if (train_step % train_cfg['save_freq'] == 0 and train_step != 0) \
                    or ((train_cfg['pretrained_model'] or model_cfg['coco_model_pretrained'])
                        and train_step < 6 * train_cfg['save_freq']
                        and train_step % 200 == 0 and train_step != 0):
                # Validation
                for val_name, val_dataloader in val_dataloaders.items():
                    print("Validation on {}".format(val_name))
                    coco_evaluator = evaluate(
                        model,
                        val_dataloader,
                        device=device,
                        max_dets=model_cfg["max_dets_per_image"])
                    ap = None
                    for iou_type, coco_eval in coco_evaluator.coco_eval.items():
                        ap = coco_eval.stats[1]
                    writer.add_scalar(
                        'COCO mAP Validation/{}'.format(val_name), ap,
                        train_step)

                    # Eventually saving best model
                    if ap > best_validation_ap[val_name]:
                        best_validation_ap[val_name] = ap
                        save_checkpoint(
                            {
                                'model': model.state_dict(),
                                'optimizer': optimizer.state_dict(),
                                'lr_scheduler': lr_scheduler.state_dict(),
                                'warmup_lr_scheduler':
                                    warmup_lr_scheduler.state_dict()
                                    if warmup_lr_scheduler is not None else None,
                                'epoch': epoch,
                                'iteration': train_step,
                                'best_{}_ap'.format(val_name):
                                    best_validation_ap[val_name],
                            },
                            writer.get_logdir(),
                            best_model=val_name)

                # Saving last model
                checkpoint_dict = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'warmup_lr_scheduler':
                        warmup_lr_scheduler.state_dict()
                        if warmup_lr_scheduler is not None else None,
                    'epoch': epoch,
                    'iteration': train_step,
                    'tensorboard_working_dir': writer.get_logdir(),
                }
                for d_name, _ in val_dataloaders.items():
                    checkpoint_dict["best_{}_ap".format(d_name)] = \
                        best_validation_ap[d_name]
                save_checkpoint(checkpoint_dict, writer.get_logdir())

                # Setting again to train mode
                model.train()

        # Updating lr scheduler
        lr_scheduler.step()
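The script above expects a YAML file with training, model, and dataset sections; a minimal sketch covering just the keys it reads (all values are illustrative, not from the original repo):

import yaml

minimal_cfg = yaml.safe_load("""
training:
  checkpoint: null            # resume path (mutually exclusive with pretrained_model)
  pretrained_model: null
  tensorboard_filename: demo
  mixed_batch: false
  tgt_images_in_batch: 0
  batch_size: 4
  num_workers: 2
  percentage_val: 100
  lr: 0.005
  momentum: 0.9
  weight_decay: 0.0005
  lr_gamma: 0.1
  epochs: 10
  print_freq: 50
  log_loss: 10
  save_freq: 1000
model:
  device: cuda
  freeze_backbone: false
  coco_model_pretrained: true
  cache_folder: ./cache
  max_dets_per_image: 100
dataset:
  root: /path/to/data
  train: {source_dataset: train.txt}
  val: {val_dataset: val.txt}
""")
assert not (minimal_cfg['training']['checkpoint']
            and minimal_cfg['training']['pretrained_model'])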
def q1_1_cnn():
    # prepare tensorboard
    writer = SummaryWriter(comment="-q1.1-CNN")
    output_path = writer.get_logdir()

    net = CNNModel(3, len(CLASS_LABELS))
    # define loss and optim
    criterion = nn.CrossEntropyLoss()

    # get one batch of training data
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=64,
        shuffle=False,  # make the first batch of data consistent between executions
        num_workers=2)
    train_data, train_label = next(iter(train_loader))

    # use gpu if available
    if torch.cuda.is_available():
        print("using gpu")
        net = net.cuda()
        criterion = criterion.cuda()
        train_data = train_data.cuda()
        train_label = train_label.cuda()

    optimizer = optim.Adam(net.parameters())

    # training code
    epoch_num = 40
    train_loss = []
    val_loss = []
    for epoch_idx in range(epoch_num):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(train_data)
        loss = criterion(outputs, train_label)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.cpu().detach().numpy().item())
        writer.add_scalar("Loss/train", loss.cpu().detach(), epoch_idx)

        # validation
        with torch.no_grad():
            tmp = 0
            for i, (val_data, val_label) in enumerate(valid_loader):
                val_data = val_data.to(train_data.device)
                val_label = val_label.to(train_label.device)
                outputs = net(val_data)
                loss = criterion(outputs, val_label)
                tmp += loss.cpu().detach()
            writer.add_scalar("Loss/val", tmp / len(valid_loader), epoch_idx)
            val_loss.append(tmp.numpy().item() / len(valid_loader))
        print(
            f"Train Epoch({epoch_idx + 1} / {epoch_num}), train_loss: {train_loss[-1]}, val_loss: {val_loss[-1]}"
        )

    print("save loss")
    torch.save(val_loss, os.path.join(output_path, "cnn_val_loss"))
    torch.save(train_loss, os.path.join(output_path, "cnn_train_loss"))
    writer.close()
import argparse
import logging
import os
import sys

import torch
from torch.utils.tensorboard import SummaryWriter

from sgan.data.loader import data_loader
from sgan.losses import gan_g_loss, gan_d_loss, l2_loss
from sgan.losses import displacement_error, final_displacement_error
from sgan.models import TrajectoryGenerator, TrajectoryDiscriminator
from sgan.utils import int_tuple, bool_flag, get_total_norm
from sgan.utils import relative_to_abs, get_dset_path

torch.backends.cudnn.benchmark = True

writer = SummaryWriter()
time_str = "_".join(writer.get_logdir().split("/")[1].split("_")[:2])
# output_dir = "/media/felicia/Data/sgan_results/{}".format(time_str)
output_dir = "/scratch/sz2257/sgan/sgan_results/{}".format(time_str)
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
# data_dir = '/media/felicia/Data/basketball-partial'
data_dir = '/scratch/sz2257/sgan/basketball-partial'

parser = argparse.ArgumentParser()
FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
logger = logging.getLogger(__name__)

# Dataset options
class Logger(object):
    def __init__(self, log_dir, comment=''):
        self.writer = SummaryWriter(log_dir=log_dir, comment=comment)
        self.imgs_dict = {}

    def scalar_summary(self, tag, value, step):
        self.writer.add_scalar(tag, value, global_step=step)
        self.writer.flush()

    def combined_scalars_summary(self, main_tag, tag_scalar_dict, step):
        self.writer.add_scalars(main_tag, tag_scalar_dict, step)
        self.writer.flush()

    def log(self, tag, text_string, step=0):
        self.writer.add_text(tag, text_string, step)
        self.writer.flush()

    def log_model(self, model, inputs):
        self.writer.add_graph(model, inputs)
        self.writer.flush()

    def get_dir(self):
        return self.writer.get_logdir()

    def log_model_state(self, model, name='tmp'):
        path = os.path.join(self.writer.get_logdir(),
                            type(model).__name__ + '_%s.pt' % name)
        torch.save(model.state_dict(), path)

    def log_video(self, tag, global_step=None, img_tns=None,
                  finished_video=False, video_tns=None, debug=False):
        '''
        Logs video to tensorboard.
        If given image tensors, they are accumulated per tag; when
        finished_video=True, the accumulated frames are written as one video.
        If video_tns is not None, it is logged directly as the video and the
        other arguments are ignored.
        '''
        if debug:
            import pdb
            pdb.set_trace()
        if img_tns is None and video_tns is None:
            if not finished_video or tag not in self.imgs_dict.keys():
                return None
            lst_img_tns = self.imgs_dict[tag]
            self.writer.add_video(tag, torch.tensor(lst_img_tns),
                                  global_step=global_step, fps=4)
            self.writer.flush()
            self.imgs_dict[tag] = []
            return None
        elif video_tns is not None:
            self.writer.add_video(tag, video_tns, global_step=global_step, fps=4)
            self.writer.flush()
            return None

        if tag in self.imgs_dict.keys():
            lst_img_tns = self.imgs_dict[tag]
        else:
            lst_img_tns = []
            self.imgs_dict[tag] = lst_img_tns
        lst_img_tns.append(img_tns)
        if finished_video:
            self.writer.add_video(tag, torch.tensor(lst_img_tns),
                                  global_step=global_step, fps=4)
            self.writer.flush()
            self.imgs_dict[tag].clear()

    def close(self):
        self.writer.close()
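A short sketch of driving the Logger above through its direct-video path (the log directory and tag are placeholders; TensorBoard's add_video expects an N,T,C,H,W float tensor and needs moviepy installed, while the per-frame accumulation path expects array-likes that torch.tensor can stack into that layout):

import torch

logger = Logger(log_dir="runs/video_demo")  # placeholder directory
clip = torch.rand(1, 16, 3, 64, 64)         # N,T,C,H,W in [0, 1]
logger.log_video("rollout", global_step=0, video_tns=clip)  # logged directly
logger.scalar_summary("reward", 1.0, step=0)
logger.close()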
import argparse
import logging
import sys

import torch
from torch.utils.tensorboard import SummaryWriter

# from sgan.models import TrajectoryGenerator as GeneratorBaseline, TrajectoryDiscriminator as DiscriminatorBaseline
# from sgan.models_teampos import TrajectoryGenerator as TeamPosGenerator, TrajectoryDiscriminator as TeamPosDiscriminator
from sgan.models_old import TrajectoryGenerator, TrajectoryDiscriminator

# MODELS = {
#     "baseline": (GeneratorBaseline, DiscriminatorBaseline),
#     "team_pos": (TeamPosGenerator, TeamPosDiscriminator)
# }

from sgan.utils import int_tuple, bool_flag, get_total_norm
from sgan.utils import relative_to_abs, get_dset_path

torch.backends.cudnn.benchmark = True

writer = SummaryWriter()
time_str = "_".join(writer.get_logdir().split("/")[1].split("_")[:2])
# output_dir = "/media/felicia/Data/sgan_results/{}".format(time_str)
output_dir = "/scratch/sz2257/sgan_results/{}".format(time_str)
# data_dir = '/media/felicia/Data/basketball-partial'
data_dir = '/scratch/sz2257/basketball-partial'

parser = argparse.ArgumentParser()
FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
logger = logging.getLogger(__name__)

# Dataset options
parser.add_argument('--dataset_name', default='01.02.2016.PHX.at.SAC.new',
class Logger:
    def __init__(self, exp_ID, log_dir):
        """Log the training process of DeePyMoD.

        Args:
            exp_ID (str): name or ID of this experiment
            log_dir (str): directory to save the log files to disk.
        """
        self.writer = SummaryWriter(comment=exp_ID, log_dir=log_dir,
                                    max_queue=5, flush_secs=10)
        self.log_dir = self.writer.get_logdir()

    def __call__(
        self,
        iteration,
        loss,
        MSE,
        Reg,
        constraint_coeffs,
        unscaled_constraint_coeffs,
        estimator_coeffs,
        **kwargs,
    ):
        l1_norm = torch.sum(torch.abs(torch.cat(constraint_coeffs, dim=1)), dim=0)
        self.update_tensorboard(
            iteration,
            loss,
            MSE,
            Reg,
            l1_norm,
            constraint_coeffs,
            unscaled_constraint_coeffs,
            estimator_coeffs,
            **kwargs,
        )
        self.update_terminal(iteration, MSE, Reg, l1_norm)

    def update_tensorboard(
        self,
        iteration,
        loss,
        loss_mse,
        loss_reg,
        loss_l1,
        constraint_coeff_vectors,
        unscaled_constraint_coeff_vectors,
        estimator_coeff_vectors,
        **kwargs,
    ):
        """Write the current state of training to Tensorboard.

        Args:
            iteration (int): iteration number
            loss (float): loss value
            loss_mse (float): loss of the Mean Squared Error term
            loss_reg (float): loss of the regularization term
            loss_l1 (float): loss of the L1 penalty term
            constraint_coeff_vectors (np.array): vector with constraint coefficients
            unscaled_constraint_coeff_vectors (np.array): unscaled vector with constraint coefficients
            estimator_coeff_vectors (np.array): coefficients as computed by the estimator.
        """
        # Costs and coeff vectors
        self.writer.add_scalar("loss/loss", loss, iteration)
        self.writer.add_scalars(
            "loss/mse",
            {f"output_{idx}": val for idx, val in enumerate(loss_mse)},
            iteration,
        )
        self.writer.add_scalars(
            "loss/reg",
            {f"output_{idx}": val for idx, val in enumerate(loss_reg)},
            iteration,
        )
        self.writer.add_scalars(
            "loss/l1",
            {f"output_{idx}": val for idx, val in enumerate(loss_l1)},
            iteration,
        )
        for output_idx, (coeffs, unscaled_coeffs, estimator_coeffs) in enumerate(
                zip(
                    constraint_coeff_vectors,
                    unscaled_constraint_coeff_vectors,
                    estimator_coeff_vectors,
                )):
            self.writer.add_scalars(
                f"coeffs/output_{output_idx}",
                {
                    f"coeff_{idx}": val
                    for idx, val in enumerate(coeffs.squeeze())
                },
                iteration,
            )
            self.writer.add_scalars(
                f"unscaled_coeffs/output_{output_idx}",
                {
                    f"coeff_{idx}": val
                    for idx, val in enumerate(unscaled_coeffs.squeeze())
                },
                iteration,
            )
            self.writer.add_scalars(
                f"estimator_coeffs/output_{output_idx}",
                {
                    f"coeff_{idx}": val
                    for idx, val in enumerate(estimator_coeffs.squeeze())
                },
                iteration,
            )
        # Writing remaining kwargs
        for key, value in kwargs.items():
            if value.numel() == 1:
                self.writer.add_scalar(f"remaining/{key}", value, iteration)
            else:
                self.writer.add_scalars(
                    f"remaining/{key}",
                    {
                        f"val_{idx}": val.squeeze()
                        for idx, val in enumerate(value.squeeze())
                    },
                    iteration,
                )

    def update_terminal(self, iteration, MSE, Reg, L1):
        """Prints and updates progress of training cycle in command line."""
        sys.stdout.write(
            f"\r{iteration:>6}  MSE: {torch.sum(MSE).item():>8.2e}  "
            f"Reg: {torch.sum(Reg).item():>8.2e}  L1: {torch.sum(L1).item():>8.2e} "
        )
        sys.stdout.flush()

    def close(self, model):
        """Close the Tensorboard writer and save the model to disk."""
        print("Algorithm converged. Writing model to disk.")
        self.writer.flush()  # flush remaining stuff to disk
        self.writer.close()  # close writer
        # Save model; the logdir string carries no trailing separator,
        # so join the path explicitly.
        model_path = os.path.join(self.log_dir, "model.pt")
        torch.save(model.state_dict(), model_path)
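A minimal invocation sketch for the DeePyMoD-style Logger above, with dummy tensors shaped for a single output and a four-term library:

import torch

logger = Logger(exp_ID="demo", log_dir="runs/deepymod_demo")
coeffs = [torch.rand(4, 1)]           # constraint coefficients, one output
logger(iteration=10,
       loss=torch.tensor(0.1),
       MSE=torch.tensor([0.05]),      # one entry per output
       Reg=torch.tensor([0.05]),
       constraint_coeffs=coeffs,
       unscaled_constraint_coeffs=coeffs,
       estimator_coeffs=[torch.rand(4, 1)])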
def train(opt, config, val_fold=0):
    # torch.cuda.set_enabled_lms(True)
    # if (torch.cuda.get_enabled_lms()):
    #     torch.cuda.set_limit_lms(11000 * 1024 * 1024)
    #     print('[LMS=On limit=' + str(torch.cuda.get_limit_lms()) + ']')

    if 'task' not in config['dataset']:
        config['dataset']['task'] = 3  # for backward compatibility
        print('Manually assigning: task 3')

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger = SummaryWriter(log_dir=opt.logger_name, comment='')
    experiment_path = tb_logger.get_logdir()

    # Dump configuration to experiment path
    copyfile(opt.config, os.path.join(experiment_path, 'config.json'))

    # Load Vocabulary Wrapper

    # Load data loaders
    test_transforms = T.Compose([
        T.Resize(256),
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    train_transforms = T.Compose([
        T.Resize(256),
        T.RandomCrop(224),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = SemEvalDataset(config, split='train',
                                   transforms=train_transforms,
                                   val_fold=val_fold)
    val_dataset = SemEvalDataset(config, split='val',
                                 transforms=test_transforms,
                                 val_fold=val_fold)
    # The train and validation folds must not share any meme id
    id_intersection = set([x['id'] for x in train_dataset.targets]).intersection(
        [x['id'] for x in val_dataset.targets])
    assert len(id_intersection) == 0

    if config['dataset']['task'] == 3:
        classes = read_classes('techniques_list_task3.txt')
    elif config['dataset']['task'] == 1:
        classes = read_classes('techniques_list_task1-2.txt')

    collate_fn = Collate(config, classes)
    if 'balanced-sampling' in config['training'] and config['training']['balanced-sampling']:
        classes_ids = [[train_dataset.class_list.index(x) for x in info['labels']]
                       for info in train_dataset.targets]
        labels = np.zeros((len(classes_ids), len(train_dataset.class_list)))
        for l, c in zip(labels, classes_ids):
            l[c] = 1
        sampler = MultilabelBalancedRandomSampler(labels)
    else:
        sampler = None
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config['training']['bs'],
                                  shuffle=True if sampler is None else False,
                                  num_workers=opt.workers,
                                  collate_fn=collate_fn,
                                  sampler=sampler)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config['training']['bs'],
                                shuffle=False,
                                num_workers=opt.workers,
                                collate_fn=collate_fn)

    # Construct the model
    model = MemeMultiLabelClassifier(config, labels=classes)
    if torch.cuda.is_available() and not (opt.resume or opt.load_model):
        model.cuda()

    # Construct the optimizer
    if not config['text-model']['fine-tune'] and not config['image-model']['fine-tune']:
        optimizer = torch.optim.Adam(
            [p for n, p in model.named_parameters()
             if 'textual_module' not in n and 'visual_module' not in n],
            lr=config['training']['lr'])
    else:
        if config['dataset']['task'] == 3:
            optimizer = torch.optim.Adam(
                [{'params': [p for n, p in model.named_parameters()
                             if 'textual_module' not in n and 'visual_module' not in n]},
                 {'params': model.textual_module.parameters(),
                  'lr': config['training']['pretrained-modules-lr']},
                 {'params': model.visual_module.parameters(),
                  'lr': config['training']['pretrained-modules-lr']}],
                lr=config['training']['lr'])
        elif config['dataset']['task'] == 1:
            optimizer = torch.optim.Adam(
                [{'params': [p for n, p in model.named_parameters()
                             if 'textual_module' not in n and 'visual_module' not in n]},
                 {'params': model.textual_module.parameters(),
                  'lr': config['training']['pretrained-modules-lr']}],
                lr=config['training']['lr'])

    # LR scheduler
    scheduler_name = config['training']['scheduler']
    if scheduler_name == 'steplr':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            gamma=config['training']['gamma'],
            milestones=config['training']['milestones'])
    elif scheduler_name is None:
        scheduler = None
    else:
        raise ValueError('{} scheduler is not available'.format(scheduler_name))

    # # optionally resume from a checkpoint
    start_epoch = 0
    # if opt.resume or opt.load_model:
    #     filename = opt.resume if opt.resume else opt.load_model
    #     if os.path.isfile(filename):
    #         print("=> loading checkpoint '{}'".format(filename))
    #         checkpoint = torch.load(filename, map_location='cpu')
    #         model.load_state_dict(checkpoint['model'], strict=False)
    #         if torch.cuda.is_available():
    #             model.cuda()
    #         if opt.resume:
    #             start_epoch = checkpoint['epoch']
    #             # best_rsum = checkpoint['best_rsum']
    #             optimizer.load_state_dict(checkpoint['optimizer'])
    #             if checkpoint['scheduler'] is not None and not opt.reinitialize_scheduler:
    #                 scheduler.load_state_dict(checkpoint['scheduler'])
    #             # Eiters is used to show logs as the continuation of another
    #             # training
    #             model.Eiters = checkpoint['Eiters']
    #             print("=> loaded checkpoint '{}' (epoch {})"
    #                   .format(opt.resume, start_epoch))
    #         else:
    #             print("=> loaded only model from checkpoint '{}'"
    #                   .format(opt.load_model))
    #     else:
    #         print("=> no checkpoint found at '{}'".format(opt.resume))

    model.train()

    # Train loop
    mean_loss = 0
    progress_bar = tqdm.trange(start_epoch, opt.num_epochs)
    progress_bar.set_description('Train')
    best_f1 = 0.0
    for epoch in progress_bar:
        for it, (image, text, text_len, labels, ids) in enumerate(train_dataloader):
            global_iteration = epoch * len(train_dataloader) + it

            if torch.cuda.is_available():
                image = image.cuda() if image is not None else None
                text = text.cuda()
                labels = labels.cuda()

            # forward the model
            optimizer.zero_grad()
            loss = model(image, text, text_len, labels)
            loss.backward()
            optimizer.step()
            mean_loss += loss.item()

            if global_iteration % opt.log_step == 0:
                mean_loss /= opt.log_step
                progress_bar.set_postfix(dict(loss='{:.2}'.format(mean_loss)))
                mean_loss = 0

            tb_logger.add_scalar("Training/Epoch", epoch, global_iteration)
            tb_logger.add_scalar("Training/Loss", loss.item(), global_iteration)
            tb_logger.add_scalar("Training/Learning_Rate",
                                 optimizer.param_groups[0]['lr'],
                                 global_iteration)

            if global_iteration % opt.val_step == 0:
                # validate (using different thresholds)
                metrics = validate(val_dataloader, model, classes,
                                   thresholds=[0.3, 0.5, 0.8])
                tb_logger.add_scalars("Validation/F1", metrics, global_iteration)
                print(metrics)
                # progress_bar.set_postfix(dict(macroF1='{:.2}'.format(metrics['macroF1_thr=0.5']), microF1='{:.2}'.format(metrics['microF1_thr=0.5'])))

                # save best model
                if metrics['macroF1_thr=0.3'] + metrics['microF1_thr=0.3'] > best_f1:
                    print('Saving best model...')
                    checkpoint = {
                        'cfg': config,
                        'epoch': epoch,
                        'model': model.joint_processing_module.state_dict()
                        if not config['text-model']['fine-tune']
                        and not config['image-model']['fine-tune']
                        else model.state_dict()}
                    # 'optimizer': optimizer.state_dict(),
                    # 'scheduler': scheduler.state_dict()}
                    latest = os.path.join(experiment_path,
                                          'model_best_fold{}.pt'.format(val_fold))
                    torch.save(checkpoint, latest)
                    best_f1 = metrics['macroF1_thr=0.3'] + metrics['microF1_thr=0.3']

        # note: assumes a scheduler was configured ('steplr'); raises if None
        scheduler.step()
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Creating tensorboard writer
    if not args.resume:
        writer = SummaryWriter(comment=TENSORBOARD_RESULT_FILE_NAME)
    else:
        writer = SummaryWriter("")

    ######################
    # Creating test data #
    ######################
    print("Loading test data")
    viped_dataset_test = get_dataset("viped",
                                     get_transform(train=False, aug=args.aug),
                                     percentage=5,
                                     val=True)
    mot19_dataset_test = get_dataset("mot19", get_transform(train=False), val=True)
    mot17_dataset_test = get_dataset("mot17", get_transform(train=False), val=True)
    crowd_human_dataset_test = get_dataset("crowd_human", get_transform(train=False), val=True)
    city_persons_dataset_test = get_dataset("city_persons", get_transform(train=False), val=True)
    coco_persons_dataset_test = get_dataset("COCO_persons", get_transform(train=False), val=True)

    ##########################
    # Creating training data #
    ##########################
    print("Loading training data")
    train_datasets_dict = {
        'viped': lambda: get_dataset("viped", get_transform(train=True, aug=args.aug)),
        'mot19': lambda: get_dataset("mot19", get_transform(train=True)),
        'mot17': lambda: get_dataset("mot17", get_transform(train=True)),
        'crowd_human': lambda: get_dataset("crowd_human", get_transform(train=True)),
        'city_persons': lambda: get_dataset("city_persons", get_transform(train=True)),
        'COCO_persons': lambda: get_dataset("COCO_persons", get_transform(train=True)),
    }

    #################################
    # Preparing training dataloader #
    #################################
    if args.train_on in train_datasets_dict:
        # the train dataset is a normal single dataset
        train_dataset = train_datasets_dict[args.train_on]()
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True,
                                      num_workers=args.workers,
                                      collate_fn=train_dataset.standard_collate_fn)
        print('Using training dataset: {}'.format(args.train_on))
    elif ',' in args.train_on:
        assert args.tgt_images_in_batch > 0, \
            "Using mixed training. You need to specify the args.tgt_images_in_batch parameter!"
        # the train dataset is an ensemble of datasets
        source_dataset_name, target_dataset_name = args.train_on.split(',')
        train_dataset = DatasetsEnsemble(
            train_datasets_dict[source_dataset_name](),
            train_datasets_dict[target_dataset_name]())
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=train_dataset.source_dataset.standard_collate_fn,
            num_workers=args.workers,
            batch_sampler=EnsembleBatchSampler(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                tgt_imgs_in_batch=args.tgt_images_in_batch))
        print('Using mixed training datasets. Source: {}, Target: {}. '
              'In every batch, {}/{} are from {}'.format(
                  source_dataset_name, target_dataset_name,
                  args.tgt_images_in_batch, args.batch_size,
                  target_dataset_name))
    else:
        raise ValueError('Dataset not known!')

    ##############################
    # Preparing test dataloaders #
    ##############################
    data_loader_viped_test = DataLoader(
        viped_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=viped_dataset_test.standard_collate_fn)
    data_loader_mot19_test = DataLoader(
        mot19_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=mot19_dataset_test.standard_collate_fn)
    data_loader_mot17_test = DataLoader(
        mot17_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=mot17_dataset_test.standard_collate_fn)
    data_loader_crowd_human_test = DataLoader(
        crowd_human_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=crowd_human_dataset_test.standard_collate_fn)
    data_loader_city_persons_test = DataLoader(
        city_persons_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=city_persons_dataset_test.standard_collate_fn)
    # note: this loader keeps DataLoader's default batch_size of 1
    data_loader_coco_persons_test = DataLoader(
        coco_persons_dataset_test,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=coco_persons_dataset_test.standard_collate_fn)

    # Creating model
    print("Creating model")
    model, backbone = get_model_detection(num_classes=1,
                                          model=args.model,
                                          pretrained=args.pretrained)

    # Putting model to device and setting eval mode
    model.to(device)
    model.train()

    # freeze the backbone parameters, if needed
    if backbone is not None and args.freeze_backbone:
        for param in backbone.parameters():
            param.requires_grad = False
        print('Backbone is frozen!')

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    if args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == "adam":
        optimizer = torch.optim.Adam(params=params, lr=args.lr)
    else:
        print("Optimizer not available")
        exit(1)

    # and a learning rate scheduler
    if args.lr_scheduler == "step_lr":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    elif args.lr_scheduler == "plateau":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', patience=args.lr_patience, verbose=True)
    else:
        print("LR scheduler not available")
        exit(1)

    # Defining a warm-up lr scheduler
    warmup_iters = min(1000, len(train_dataloader) - 1)
    warmup_factor = 1. / 1000
    warmup_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                    warmup_factor)

    # Loading checkpoint
    start_epoch = 0
    train_step = -1
    best_viped_ap, best_mot19_ap, best_mot17_ap, best_crowdhuman_ap, \
        best_citypersons_ap, best_cocopersons_ap = 0, 0, 0, 0, 0, 0
    if args.resume:
        print("Resuming from checkpoint")
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        warmup_lr_scheduler.load_state_dict(checkpoint['warmup_lr_scheduler'])
        start_epoch = checkpoint['epoch']
        train_step = checkpoint['iteration']
        best_viped_ap = checkpoint['best_viped_ap']
        best_mot19_ap = checkpoint['best_mot19_ap']
        best_mot17_ap = checkpoint['best_mot17_ap']
        best_crowdhuman_ap = checkpoint['best_crowdhuman_ap']
        best_citypersons_ap = checkpoint['best_citypersons_ap']
        best_cocopersons_ap = checkpoint['best_cocopersons_ap']

    # Cross-check that the backbone has really been frozen
    if backbone is not None and args.freeze_backbone:
        for param in backbone.parameters():
            assert not param.requires_grad, "Backbone seems to be not frozen correctly!"

    # Train
    print("Start training")
    for epoch in range(start_epoch, args.epochs):
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)
        for images, targets in metric_logger.log_every(
                train_dataloader, print_freq=args.print_freq, header=header):
            train_step += 1
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            # clip norm (the in-place, underscored variant; plain clip_grad_norm is deprecated)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
            optimizer.step()

            if epoch == 0 and train_step < warmup_iters:
                warmup_lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if train_step % args.log_loss == 0:
                writer.add_scalar('Training/Learning Rate',
                                  optimizer.param_groups[0]["lr"], train_step)
                writer.add_scalar('Training/Reduced Sum Losses',
                                  losses_reduced, train_step)
                writer.add_scalars('Training/All Losses', loss_dict, train_step)

            if (train_step % args.save_freq == 0 and train_step != 0) or \
                    (args.pretrained and train_step < 5 * args.save_freq
                     and train_step % 200 == 0 and train_step != 0) \
                    or train_step == 100:
                # evaluate on the test datasets
                print("Validation viped Dataset")
                viped_coco_evaluator = evaluate(model, data_loader_viped_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation mot19 Dataset")
                mot19_coco_evaluator = evaluate(model, data_loader_mot19_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation mot17 Dataset")
                mot17_coco_evaluator = evaluate(model, data_loader_mot17_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation crowdhuman Dataset")
                crowdhuman_coco_evaluator = evaluate(
                    model, data_loader_crowd_human_test,
                    device=device, max_dets=args.max_dets)
                print("Validation citypersons Dataset")
                citypersons_coco_evaluator = evaluate(
                    model, data_loader_city_persons_test,
                    device=device, max_dets=args.max_dets)
                print("Validation COCO Persons Dataset")
                cocopersons_coco_evaluator = evaluate(
                    model, data_loader_coco_persons_test,
                    device=device, max_dets=args.max_dets)

                # save using tensorboard
                viped_ap, mot19_ap, mot17_ap, crowdhuman_ap, citypersons_ap, cocopersons_ap = \
                    None, None, None, None, None, None
                for iou_type, coco_eval in viped_coco_evaluator.coco_eval.items():
                    viped_ap = coco_eval.stats[1]
                for iou_type, coco_eval in mot19_coco_evaluator.coco_eval.items():
                    mot19_ap = coco_eval.stats[1]
                for iou_type, coco_eval in mot17_coco_evaluator.coco_eval.items():
                    mot17_ap = coco_eval.stats[1]
                for iou_type, coco_eval in crowdhuman_coco_evaluator.coco_eval.items():
                    crowdhuman_ap = coco_eval.stats[1]
                for iou_type, coco_eval in citypersons_coco_evaluator.coco_eval.items():
                    citypersons_ap = coco_eval.stats[1]
                for iou_type, coco_eval in cocopersons_coco_evaluator.coco_eval.items():
                    cocopersons_ap = coco_eval.stats[1]
                writer.add_scalar('COCO mAP Validation/ViPeD', viped_ap, train_step)
                writer.add_scalar('COCO mAP Validation/MOT19', mot19_ap, train_step)
                writer.add_scalar('COCO mAP Validation/MOT17', mot17_ap, train_step)
                writer.add_scalar('COCO mAP Validation/CrowdHuman', crowdhuman_ap, train_step)
                writer.add_scalar('COCO mAP Validation/CityPersons', citypersons_ap, train_step)
                writer.add_scalar('COCO mAP Validation/COCOPersons', cocopersons_ap, train_step)

                # Every checkpoint shares the same payload, so build it once
                def build_checkpoint():
                    return {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'warmup_lr_scheduler':
                            warmup_lr_scheduler.state_dict()
                            if warmup_lr_scheduler is not None else None,
                        'epoch': epoch,
                        'iteration': train_step,
                        'best_viped_ap': best_viped_ap,
                        'best_mot19_ap': best_mot19_ap,
                        'best_mot17_ap': best_mot17_ap,
                        'best_crowdhuman_ap': best_crowdhuman_ap,
                        'best_citypersons_ap': best_citypersons_ap,
                        'best_cocopersons_ap': best_cocopersons_ap,
                    }

                # Eventually saving best models
                if viped_ap > best_viped_ap:
                    best_viped_ap = viped_ap
                    save_checkpoint(build_checkpoint(), writer.get_logdir(),
                                    best_model="viped")
                if mot19_ap > best_mot19_ap:
                    best_mot19_ap = mot19_ap
                    save_checkpoint(build_checkpoint(), writer.get_logdir(),
                                    best_model="mot19")
                if mot17_ap > best_mot17_ap:
                    best_mot17_ap = mot17_ap
                    save_checkpoint(build_checkpoint(), writer.get_logdir(),
                                    best_model="mot17")
                if crowdhuman_ap > best_crowdhuman_ap:
                    best_crowdhuman_ap = crowdhuman_ap
                    save_checkpoint(build_checkpoint(), writer.get_logdir(),
                                    best_model="crowdhuman")
                if citypersons_ap > best_citypersons_ap:
                    best_citypersons_ap = citypersons_ap
                    save_checkpoint(build_checkpoint(), writer.get_logdir(),
                                    best_model="citypersons")

                # Saving last model
                save_checkpoint(build_checkpoint(), writer.get_logdir())

                # Setting again to train mode
                model.train()

        lr_scheduler.step()
def main():
    args = config_parser(base_parser()).parse_args()
    if args.detect_anomalies:
        torch.autograd.set_detect_anomaly(True)
    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # load graph pdists
    g_pdists, g = load_pdists(args)
    g_pdists = torch.Tensor(g_pdists)
    n = g_pdists.shape[0]
    d = args.manifold_dim

    # masks used to get the upper diagonal part of (i) pairwise distances
    # matrices, and (ii) the embedding matrices themselves
    mask = torch.triu_indices(n, n, 1)
    e_mask = torch.triu_indices(d, d)

    # we are only using the upper diagonal part
    g_pdists = g_pdists[mask[0], mask[1]]
    # scale if needed
    if args.min_max_scale:
        g_pdists = min_max_scale(g_pdists, 1, 10)
    g_sq_pdists = g_pdists.pow(2)
    # keep a numpy copy for computing metrics
    g_pdists_np = g_pdists.cpu().numpy()

    # embedding initializations
    X_init = sample_init_points(n, d)
    # put them on GPU if available
    if torch.cuda.is_available():
        with Timer('copying data to GPU'):
            X_init = X_init.pin_memory().cuda()
            g_sq_pdists = g_sq_pdists.pin_memory().cuda()

    # the embedding parameters we optimize for
    spd = geoopt.SymmetricPositiveDefinite(wmin=1e-8, wmax=1e8)
    X = geoopt.ManifoldParameter(X_init, manifold=spd)
    # the distance function
    dist_fn = manifold_sq_pdists_stein if args.stein_div else manifold_sq_pdists

    # setup the optimizer
    # TODO(ccruceru): Investigate the momentum issues.
    optim = geoopt.optim.RiemannianSGD([X], lr=0.5)
    lr_scheduler = ReduceLROnPlateau(optim, patience=20, factor=0.5,
                                     min_lr=1e-8, verbose=args.verbose)

    # training settings
    writer = SummaryWriter()
    n_epochs = 1500
    save_every_epochs = 10

    def criterion(epoch):
        mdists = dist_fn(X)
        l1 = (mdists / g_sq_pdists - 1.0).abs().sum()
        eps = 1.0 / (epoch + 1)
        l2 = (g_sq_pdists / (mdists + eps) - 1.0).abs().sum()
        return (l1 + l2) / n

    def run_epoch(epoch):
        optim.zero_grad()
        loss = criterion(epoch)
        loss.backward()
        optim.step()
        lr_scheduler.step(loss)
        return loss

    def compute_metrics():
        with torch.no_grad():
            man_pdists_np = dist_fn(X).sqrt().cpu().numpy()
        ad = average_distortion(g_pdists_np, man_pdists_np)
        if g is None:
            return ad, None
        # TODO(ccruceru): Make sure this is correct. Try to reproduce the
        # result from the ref. paper on 10D Euclidean manifold.
        man_pdists_sym = pdists_vec_to_sym(man_pdists_np, n)
        mean_ap = mean_average_precision(g, man_pdists_sym)
        return ad, mean_ap

    with Timer('training'):
        for epoch in range(n_epochs):
            # early break if we reached the minimum learning rate
            if optim.param_groups[0]['lr'] <= 2 * lr_scheduler.min_lrs[0]:
                break
            start = time.time()
            loss = run_epoch(epoch)
            stop = time.time()

            if epoch % save_every_epochs != 0:
                continue
            # show it
            if args.verbose:
                print('epoch {:5}, loss {:.10f}, time {}'.format(
                    epoch, loss.item(), stop - start))
            # monitoring
            with torch.no_grad():
                logw = eig(X).log()
                ks = logw.max(1).values - logw.min(1).values
                ad, mean_ap = compute_metrics()
            writer.add_scalar('loss', loss, epoch)
            writer.add_histogram('log_lambda', logw.flatten(), epoch)
            writer.add_histogram('log_k_X', ks, epoch)
            writer.add_embedding(X[:, e_mask[0], e_mask[1]], global_step=epoch)
            # metrics
            writer.add_scalar('avg_distortion', ad, epoch)
            if mean_ap:
                writer.add_scalar('mAP', mean_ap, epoch)
            torch.save(X, os.path.join(writer.get_logdir(), 'x_opt.pt'))

    # final metrics
    ad, mean_ap = compute_metrics()
    print('Average distortion: ', ad)
    if mean_ap:
        print('mAP: ', mean_ap)
def main():
    from config import config_enhanced

    writer = SummaryWriter(os.path.join('runs', name_dir(config_enhanced)))
    torch.multiprocessing.freeze_support()
    print("Current config_enhanced is:")
    pprint(config_enhanced)
    writer.add_text("config", str(config_enhanced))

    save_path = str(writer.get_logdir())
    try:
        os.makedirs(save_path)
    except OSError:
        pass
    # with open(os.path.join(save_path, "config.json"), 'w') as outfile:
    #     json.dump(config_enhanced, outfile)

    torch.manual_seed(config_enhanced['seed'])
    torch.cuda.manual_seed_all(config_enhanced['seed'])
    use_cuda = torch.cuda.is_available()
    if torch.cuda.is_available() and config_enhanced['cuda_deterministic']:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # torch.set_num_threads(1)
    if use_cuda:
        device = torch.device('cuda')
        print("using GPU")
    else:
        device = torch.device('cpu')
        print("using CPU")

    if config_enhanced['num_processes'] == "num_cpu":
        num_processes = multiprocessing.cpu_count() - 1
    else:
        num_processes = config_enhanced['num_processes']

    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    #     model = torch.nn.DataParallel(model)

    env = CholeskyTaskGraph(**config_enhanced['env_settings'])
    envs = VectorEnv(env, num_processes)
    envs.reset()

    model = SimpleNet(**config_enhanced["network_parameters"])
    if config_enhanced["model_path"]:
        model.load_state_dict(torch.load(config_enhanced['model_path']))

    actor_critic = Policy(model, envs.action_space, config_enhanced)
    actor_critic = actor_critic.to(device)

    if config_enhanced['agent'] == 'PPO':
        print("using PPO")
        agent_settings = config_enhanced['PPO_settings']
        agent = PPO(actor_critic, **agent_settings)
    elif config_enhanced['agent'] == 'A2C':
        print("using A2C")
        agent_settings = config_enhanced['A2C_settings']
        agent = A2C_ACKTR(actor_critic, **agent_settings)

    rollouts = RolloutStorage(config_enhanced['trajectory_length'],
                              num_processes,
                              env.observation_space.shape,
                              env.action_space)

    obs = envs.reset()
    obs = torch.tensor(obs, device=device)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(config_enhanced['num_env_steps']) \
        // config_enhanced['trajectory_length'] // num_processes

    for j in range(num_updates):
        if config_enhanced['use_linear_lr_decay']:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         config_enhanced['network']['lr'])

        for step in tqdm(range(config_enhanced['trajectory_length'])):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(rollouts.obs[step])
            actions = action.squeeze(-1).detach().cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions)
            obs = torch.tensor(obs, device=device)
            reward = torch.tensor(reward, device=device).unsqueeze(-1)
            done = torch.tensor(done, device=device)
            n_step = (j * config_enhanced['trajectory_length'] + step) * num_processes

            for info in infos:
                if 'episode' in info.keys():
                    reward_episode = info['episode']['r']
                    episode_rewards.append(reward_episode)
                    writer.add_scalar('reward', reward_episode, n_step)
                    writer.add_scalar(
                        'solved',
                        int(info['episode']['length'] == envs.envs[0].max_steps),
                        n_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action, action_log_prob, value, reward,
                            masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1]).detach()

        rollouts.compute_returns(next_value, config_enhanced["use_gae"],
                                 config_enhanced["gamma"],
                                 config_enhanced['gae_lambda'],
                                 config_enhanced['use_proper_time_limits'])

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        writer.add_scalar('value loss', value_loss, n_step)
        writer.add_scalar('action loss', action_loss, n_step)
        writer.add_scalar('dist_entropy', dist_entropy, n_step)
        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % config_enhanced['save_interval'] == 0 or j == num_updates - 1):
            save_path = str(writer.get_logdir())
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save(actor_critic, os.path.join(save_path, "model.pth"))

        if j % config_enhanced['log_interval'] == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n"
                " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " min/max reward {:.1f}/{:.1f}\n".format(
                    j, n_step, int(n_step / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards)))

        if (config_enhanced['evaluate_every'] is not None
                and len(episode_rewards) > 1
                and j % config_enhanced['evaluate_every'] == 0):
            eval_reward = evaluate(actor_critic, boxworld, config_enhanced, device)
            writer.add_scalar("eval reward", eval_reward, n_step)
import logging
import sys
from datetime import datetime
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import torch
import yaml
from torch.utils.tensorboard import SummaryWriter

# Assumed handler format; the original module defines its own formatter.
formatter = logging.Formatter('%(asctime)s %(message)s')


class MainLogger:
    def __init__(self, log_dir=None, comment='', name='pkgpl', level=logging.DEBUG):
        self.start = timer()
        # pytorch tensorboard writer - master only
        self.writer = SummaryWriter(log_dir=log_dir, comment=comment)
        self.get_logdir = self.writer.get_logdir
        self.add_hparams = self.writer.add_hparams
        self.add_text = self.writer.add_text
        self.add_scalar = self.writer.add_scalar
        self.add_scalars = self.writer.add_scalars
        self.add_figure = self.writer.add_figure
        self.flush = self.writer.flush
        self.loss0 = dict()
        self.add_text('Name', name)

        # python logger
        self.logger = logging.getLogger(name)
        self.stream_logger = logging.getLogger('tty')

        # file logger
        fileHandler = logging.FileHandler("%s/log.txt" % (self.get_logdir()))
        fileHandler.setFormatter(formatter)
        self.logger.addHandler(fileHandler)
        self.logger.setLevel(level)
        self.write = self.logger.debug
        self.debug = self.logger.debug
        self.info = self.logger.info
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical

        # stream logger
        streamHandler = logging.StreamHandler()
        streamHandler.setFormatter(formatter)
        self.stream_logger.addHandler(streamHandler)
        self.stream_logger.setLevel(level)

    def print(self, msg):
        self.logger.debug(msg)
        self.stream_logger.debug(msg)

    # shadowed by the instance attribute bound in __init__; kept for clarity
    def get_logdir(self):
        return self.writer.get_logdir()

    def log_hparams(self, name, hparams_dict):
        self.print("%s:\n%s" % (name, yaml.dump(hparams_dict)))
        self.add_text(name, str(hparams_dict))

    def log_loss(self, loss, epoch, name='loss', filename=None,
                 add_figure=True, log_norm=True):
        if filename is None:
            filename = name + '.txt'
        # file output
        strftime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open("%s/%s" % (self.get_logdir(), filename), 'a') as fl:
            fl.write("%d %9.3e [%s]\n" % (epoch, loss, strftime))
        # tensorboard
        if add_figure:
            self.add_scalar(name, loss, epoch)
            if log_norm:
                if self.loss0.get(name) is None:
                    self.loss0[name] = loss
                self.add_scalar('normalized_%s' % name,
                                loss / self.loss0[name], epoch)

    def log_gradient(self, grad, epoch, h, filename='grad.', add_figure=True,
                     figurename='gradient', perc=99., figsize=[15, 4]):
        # file output
        g = grad.to('cpu').numpy()
        g.tofile("%s/%s%04d" % (self.get_logdir(), filename, epoch))
        # tensorboard
        if add_figure:
            fig = plot_mig(perc_clip(g, perc), h, figsize=figsize)
            self.add_figure(figurename, fig, epoch)
            plt.close(fig)

    def log_velocity(self, vel, epoch, h, filename='vel.', add_figure=True,
                     figurename='velocity', vmin=None, vmax=None, figsize=[15, 4]):
        # file output
        v = vel.to('cpu').detach().numpy()
        v.tofile("%s/%s%04d" % (self.get_logdir(), filename, epoch))
        # tensorboard
        if add_figure:
            fig = plot_vel(v, h, vmin, vmax, figsize=figsize)
            self.add_figure(figurename, fig, epoch)
            plt.close(fig)

    def output(self, model, epoch, loss, log_gradient=True):
        self.log_loss(loss, epoch)
        if log_gradient:
            grad = model.gradient()
            grad_norm = grad.norm(float('inf')).item()
            self.write("epoch %d, loss %9.3e, gnorm %9.3e" % (epoch, loss, grad_norm))
        else:
            self.write("epoch %d, loss %9.3e" % (epoch, loss))
        if epoch < model.hparams.skip_output or epoch % model.hparams.skip_output == 0:
            if log_gradient:
                self.log_gradient(grad, epoch, model.h)
            self.log_velocity(model.velocity(), epoch, model.h,
                              vmin=model.hparams.vmin, vmax=model.hparams.vmax)
        self.flush()

    # def final(self, args, loss):
    #     #hparam_dict={'lr':args.lr,'grad_norm':args.grad_norm,
    #     #    'optimizer':args.optimizer,'momentum':args.momentum,
    #     #    'max_epochs':args.max_epochs}
    #     hparam_dict = vars(args)
    #     metric_dict = {'final_loss': loss}
    #     self.add_hparams(hparam_dict, metric_dict)

    def progress_bar(self, count, total, status=''):
        # from https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
        tavg = (timer() - self.start) / (count + 1)
        bar_len = 60
        frac = count / (total - 1)
        filled_len = int(round(bar_len * frac))
        percents = round(100 * frac, 1)
        bar = '=' * filled_len + '-' * (bar_len - filled_len)
        sys.stdout.write('[%s] %s%% (%s/%s,%7.2fs/it) %s\r' %
                         (bar, percents, count, total, tavg, status))
        sys.stdout.flush()
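A quick usage sketch for MainLogger; values are dummies, and the gradient/velocity helpers from the original package are not exercised here:

log = MainLogger(log_dir="runs/pkgpl_demo", name="pkgpl")
log.log_hparams("hparams", {"lr": 0.01, "max_epochs": 100})
for epoch in range(100):
    loss = 1.0 / (epoch + 1)
    log.log_loss(loss, epoch)                 # appends to loss.txt, logs scalars
    log.progress_bar(epoch, 100, status="training")
log.print("done")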
def train_net(self, train_loader, n_epoc, checkpoint_factor, arg, if_continue,
              checkpoint_epoc=None, log_dir=None):
    # TODO: arg is not good to pass here
    if log_dir is None:
        if self.if_condition:
            writer = SummaryWriter(comment=f'_CWGAN_GP_{self.data_name}')  # TODO: add hyper parameters
        else:
            writer = SummaryWriter(comment=f'_WGAN_GP_{self.data_name}')
    else:
        writer = SummaryWriter(log_dir=log_dir)

    if self.if_condition:
        # fixed labels: 8 samples per class for visual inspection
        test_lbl = torch.arange(self.n_class, device=DEVICE).reshape(-1, 1)
        test_lbl = test_lbl.repeat(1, 8)
        test_lbl = test_lbl.reshape(-1)
        test_noise = self.generate_noise(test_lbl.shape[0])
    else:
        test_noise = self.generate_noise(64)

    n_sample = len(train_loader.dataset)
    start_epoch = checkpoint_epoc + 1 if if_continue else 1
    for i in range(start_epoch, n_epoc + 1):
        epoc_l_d, epoc_l_g, epoc_score_p, epoc_score_f1, epoc_score_f2 = 0., 0., 0., 0., 0.
        self.conv_gen.train(), self.conv_dis.train()
        with tqdm(total=len(train_loader), desc=f"epoc: {i}") as pbar:
            for k, (real_img, real_lbl) in enumerate(train_loader):
                if IF_CUDA:
                    real_img = real_img.cuda()
                    real_lbl = real_lbl.cuda()
                if self.if_condition:
                    d_loss, p_score, f_score1 = self.train_d_step(real_img, real_lbl)
                    g_loss, f_score2 = self.train_g_step(real_img.shape[0], real_lbl)
                else:
                    d_loss, p_score, f_score1 = self.train_d_step(real_img, None)
                    g_loss, f_score2 = self.train_g_step(real_img.shape[0], None)
                batch_size = real_img.shape[0]
                epoc_l_d += d_loss * batch_size
                epoc_l_g += g_loss * batch_size
                epoc_score_p += p_score * batch_size
                epoc_score_f1 += f_score1 * batch_size
                epoc_score_f2 += f_score2 * batch_size
                pbar.set_postfix({
                    "d_loss": d_loss,
                    "g_loss": g_loss,
                    "p_score": p_score,
                    "f_score D": f_score1,
                    'G': f_score2
                })
                pbar.update()
            epoc_l_d /= n_sample
            epoc_l_g /= n_sample
            epoc_score_p /= n_sample
            epoc_score_f1 /= n_sample
            epoc_score_f2 /= n_sample
            pbar.set_postfix({
                "epoch: d_loss": epoc_l_d,
                "g_loss": epoc_l_g,
                "p_score": epoc_score_p,
                "f_score D": epoc_score_f1,
                'G': epoc_score_f2
            })
        writer.add_scalar('loss/generator', epoc_l_g, i)
        writer.add_scalar('loss/discriminator', epoc_l_d, i)
        writer.add_scalar('score/real', epoc_score_p, i)
        writer.add_scalar('score/fake_D', epoc_score_f1, i)
        writer.add_scalar('score/fake_G', epoc_score_f2, i)

        self.conv_gen.eval(), self.conv_dis.eval()
        if self.if_condition:
            test_img = self.conv_gen(test_noise, test_lbl)
        else:
            test_img = self.conv_gen(test_noise, None)
        test_img = (test_img + 1.0) / 2.0  # Note that this is important to recover the range
        test_img = test_img.reshape(test_noise.shape[0], *self.img_shape)
        writer.add_images('img', test_img, i)
        writer.flush()

        if i % checkpoint_factor == 0:
            checkpoint_dict = {
                'arg': arg.__dict__,
                'G': self.conv_gen.state_dict(),
                'D': self.conv_dis.state_dict(),
                'epoch': i,
                'torch_seed': torch.initial_seed(),
                'log_dir': writer.get_logdir(),
                'opt_D': self.opt_D.state_dict(),
                'opt_G': self.opt_G.state_dict()
            }
            save_path = os.path.join(writer.get_logdir(), f'ckpt{i}.pth')
            torch.save(checkpoint_dict, save_path)
    writer.close()
    return
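The checkpoint dict saved above carries everything needed to resume; a sketch, where trainer (an instance of the surrounding class), train_loader, and arg are assumed to exist and the path is a placeholder:

import torch

ckpt = torch.load("runs/demo/ckpt10.pth", map_location="cpu")
trainer.conv_gen.load_state_dict(ckpt['G'])
trainer.conv_dis.load_state_dict(ckpt['D'])
trainer.opt_G.load_state_dict(ckpt['opt_G'])
trainer.opt_D.load_state_dict(ckpt['opt_D'])
# continue in the same TensorBoard run directory
trainer.train_net(train_loader, n_epoc=20, checkpoint_factor=5, arg=arg,
                  if_continue=True, checkpoint_epoc=ckpt['epoch'],
                  log_dir=ckpt['log_dir'])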
""" # Extract all feature maps # Hint: use conv_layer_indices to access with torch.no_grad(): feature_maps = [] x = input for layer in model.features: x = layer(x) if isinstance(layer, torch.nn.ReLU): feature_maps.append(x) return feature_maps from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter() output_path = writer.get_logdir() import os, math, random from torchvision import utils with torch.no_grad(): # visualize weight selected_channel = [-1, -1, -1, -1, -1] for layer_idx, channel in zip(conv_layer_indices, selected_channel): weight = extract_filter(layer_idx, model) n, c, h, w = weight.shape if channel == -1: channel = random.randint(0, c - 1) weight = weight[:, channel, :, :].unsqueeze(1) nrow = int(math.sqrt(n)) print(h, w) grid = utils.make_grid(weight, nrow=nrow, normalize=True, scale_each=True)
import logging

import torch
from torch.utils.tensorboard import SummaryWriter


def setup_logger(level=logging.DEBUG):
    formatter = logging.Formatter(
        fmt='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    _logger = logging.getLogger(__name__)
    _logger.setLevel(level)
    _logger.addHandler(handler)
    return _logger


if __name__ == '__main__':
    writer = SummaryWriter("/tmp/runs/test_logger")
    print(writer.get_logdir())
    video = torch.rand((4, 20, 3, 100, 100), dtype=torch.float)  # N, T, C, H, W
    video = torch.clamp(video, 0., 1.)
    x = range(100)
    for i in x:
        writer.add_scalar('y=2x', i * 2, i)
    writer.add_video('videos', video, fps=20)  # add_video requires the moviepy package
    writer.close()
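Since setup_logger only attaches a stream handler, a natural extension is to also write the text log next to the event files. A minimal sketch using only standard logging and the writer above:

import os

logger = setup_logger()
writer = SummaryWriter("/tmp/runs/test_logger")
fh = logging.FileHandler(os.path.join(writer.get_logdir(), "run.log"))
fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(fh)
logger.info("events and text logs both live under %s", writer.get_logdir())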
class DefaultWriter:
    """
    Default writer to be used by the agents, optionally wraps an instance
    of tensorboard.SummaryWriter.

    Can be used in the fit() method of the agents, so that training data
    can be handled by AgentManager and RemoteAgentManager.

    Parameters
    ----------
    name : str
        Name of the writer.
    log_interval : int
        Minimum number of seconds between consecutive logs (with logging module).
    tensorboard_kwargs : Optional[dict]
        Parameters for tensorboard SummaryWriter. If provided, DefaultWriter
        will behave as tensorboard.SummaryWriter, and will keep utilities to
        handle data added with the add_scalar method.
    execution_metadata : metadata_utils.ExecutionMetadata
        Execution metadata about the object that is using the writer.
    maxlen : Optional[int], default: None
        If given, data stored by self._data (accessed through the property
        self.data) is limited to `maxlen` entries.
    """

    def __init__(
        self,
        name: str,
        log_interval: int = 3,
        tensorboard_kwargs: Optional[dict] = None,
        execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None,
        maxlen: Optional[int] = None,
    ):
        self._name = name
        self._log_interval = log_interval
        self._execution_metadata = execution_metadata
        self._data = None
        self._time_last_log = None
        self._log_time = True
        self._maxlen = maxlen
        self.reset()

        # initialize tensorboard
        if (tensorboard_kwargs is not None) and (
            not check_packages.TENSORBOARD_INSTALLED
        ):
            logger.warning(
                "[DefaultWriter]: received tensorboard_kwargs, but tensorboard is not installed."
            )
        self._tensorboard_kwargs = tensorboard_kwargs
        self._tensorboard_logdir = None
        self._summary_writer = None
        if (tensorboard_kwargs is not None) and check_packages.TENSORBOARD_INSTALLED:
            self._summary_writer = SummaryWriter(**self._tensorboard_kwargs)
            self._tensorboard_logdir = self._summary_writer.get_logdir()

    def reset(self):
        """Clear data."""
        self._data = dict()
        self._initial_time = timer()
        self._time_last_log = timer()

    @property
    def summary_writer(self):
        return self._summary_writer

    @property
    def data(self):
        df = pd.DataFrame(columns=("name", "tag", "value", "global_step"))
        for tag in self._data:
            df = pd.concat([df, pd.DataFrame(self._data[tag])], ignore_index=True)
        return df

    def add_scalar(
        self,
        tag: str,
        scalar_value: float,
        global_step: Optional[int] = None,
        walltime=None,
        new_style=False,
    ):
        """
        Behaves as SummaryWriter.add_scalar().

        Note: the tag 'dw_time_elapsed' is reserved and updated internally.
        It automatically logs the number of seconds elapsed.

        Parameters
        ----------
        tag : str
            Tag for the scalar.
        scalar_value : float
            Value of the scalar.
        global_step : int
            Step where scalar was added. If None, global steps will no
            longer be stored for the current tag.
        walltime : float
            Optionally override the default walltime (time.time()) with
            seconds after epoch of event.
        new_style : bool
            Whether to use new style (tensor field) or old style
            (simple_value field). New style could lead to faster data loading.
        """
        if self._summary_writer:
            self._summary_writer.add_scalar(
                tag, scalar_value, global_step, walltime, new_style
            )
        self._add_scalar(tag, scalar_value, global_step)

    def _add_scalar(
        self, tag: str, scalar_value: float, global_step: Optional[int] = None
    ):
        """
        Store scalar value in self._data.
""" # Update data structures if tag not in self._data: self._data[tag] = dict() self._data[tag]["name"] = deque(maxlen=self._maxlen) self._data[tag]["tag"] = deque(maxlen=self._maxlen) self._data[tag]["value"] = deque(maxlen=self._maxlen) self._data[tag]["global_step"] = deque(maxlen=self._maxlen) self._data[tag]["name"].append( self._name ) # used in plots, when aggregating several writers self._data[tag]["tag"].append( tag ) # useful to convert all data to a single DataFrame self._data[tag]["value"].append(scalar_value) if global_step is None: self._data[tag]["global_step"].append(np.nan) else: self._data[tag]["global_step"].append(global_step) # Append time interval corresponding to global_step if global_step is not None and self._log_time: assert tag != "dw_time_elapsed", "The tag dw_time_elapsed is reserved." self._log_time = False self._add_scalar( tag="dw_time_elapsed", scalar_value=timer() - self._initial_time, global_step=global_step, ) self._log_time = True # Log if not self._log_time: self._log() def _log(self): # time since last log t_now = timer() time_elapsed = t_now - self._time_last_log # log if enough time has passed since the last log max_global_step = 0 if time_elapsed > self._log_interval: self._time_last_log = t_now message = "" for tag in self._data: val = self._data[tag]["value"][-1] gstep = self._data[tag]["global_step"][-1] message += f"{tag} = {val} | " if not np.isnan(gstep): max_global_step = max(max_global_step, gstep) header = self._name if self._execution_metadata: header += f"[worker: {self._execution_metadata.obj_worker_id}]" message = f"[{header}] | max_global_step = {max_global_step} | " + message logger.info(message) def __getattr__(self, attr): """ Calls SummaryWriter methods, if self._summary_writer is not None. Otherwise, does nothing. """ if attr[:2] == "__": raise AttributeError(attr) if attr in self.__dict__: return getattr(self, attr) if self._summary_writer: return getattr(self._summary_writer, attr) def method(*args, **kwargs): pass return method # # For pickle # def __getstate__(self): if self._summary_writer: self._summary_writer.close() state = self.__dict__.copy() return state def __setstate__(self, newstate): # Re-create summary writer with the same logdir if newstate["_summary_writer"]: newstate["_tensorboard_kwargs"].update( dict(log_dir=newstate["_tensorboard_logdir"]) ) newstate["_summary_writer"] = SummaryWriter( **newstate["_tensorboard_kwargs"] ) self.__dict__.update(newstate)
def run_train(args):
    # show tensorboard graphs with the following command: tensorboard --logdir=src/runs
    writer = SummaryWriter(comment=args.comment)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize((160, 160)),
        np.float32,
        transforms.ToTensor(),
        fixed_image_standardization
    ])
    # transform = transforms.Compose([
    #     transforms.Resize((224, 224)),
    #     transforms.ToTensor(),
    #     transforms.Normalize((0.485, 0.456, 0.406),
    #                          (0.229, 0.224, 0.225))
    # ])

    train_dataset, val_dataset, test_dataset = read_training_dataset(args, transform)
    tqdm.write('train data size: {}, validation data size: {}'.format(
        len(train_dataset), len(val_dataset)))

    # Build data loaders
    train_loader = get_loader(train_dataset, args.batch_size, shuffle=True,
                              num_workers=args.num_workers)
    val_loader = get_loader(val_dataset, args.batch_size, shuffle=False,
                            num_workers=args.num_workers)

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('training on', device)

    # Build the model
    model = FaceRecognitionCNN().to(device)
    if args.freeze_first_epoch:
        for m in model.resnet.parameters():
            m.requires_grad_(False)

    input_shape = next(iter(train_loader))[2].shape
    print('input shape', input_shape)
    # the model must be in eval mode before calling summary()
    model.eval()
    # summary(model, input_shape[1:], batch_size=input_shape[0], device=device)
    print('model params (trainable, total):', count_parameters(model))

    # Loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.regularization)
    # decrease the learning rate if validation accuracy has not increased
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=1 / 4,
        patience=args.patience,
        verbose=True,
    )

    writer.add_hparams(args.__dict__, {})
    writer.add_text('model', str(model))

    # Train the model
    total_step = len(train_loader)
    step = 1
    best_val_acc = 0.5
    for epoch in range(args.num_epochs):
        for i, (video_ids, frame_ids, images, targets) in \
                tqdm(enumerate(train_loader), desc=f'training epoch {epoch}',
                     total=len(train_loader)):
            model.train()

            # Set mini-batch dataset
            images = images.to(device)
            targets = targets.to(device)

            # Forward, backward and optimize
            outputs = model(images)
            loss = criterion(outputs, targets)
            model.zero_grad()
            loss.backward()
            optimizer.step()

            batch_accuracy = float((outputs > 0.0).eq(targets).sum()) / len(targets)

            # Print log info
            step += 1
            if (i + 1) % args.log_step == 0:
                print_training_info(batch_accuracy, loss, step, writer)

            if (i + 1) % args.val_step == 0:
                val_acc, pr_acc, tmp_acc = print_validation_info(
                    args, criterion, device, model, val_loader, writer, epoch, step)
                if val_acc > best_val_acc:
                    save_model_checkpoint(args, epoch, model,
                                          (val_acc, pr_acc, tmp_acc),
                                          writer.get_logdir())
                    best_val_acc = val_acc

        # validation step after the full epoch
        val_acc, pr_acc, tmp_acc = print_validation_info(
            args, criterion, device, model, val_loader, writer, epoch, step)
        lr_scheduler.step(val_acc)
        if val_acc > best_val_acc:
            save_model_checkpoint(args, epoch, model, (val_acc, pr_acc, tmp_acc),
                                  writer.get_logdir())
            best_val_acc = val_acc

        if args.freeze_first_epoch and epoch == 0:
            for m in model.resnet.parameters():
                m.requires_grad_(True)
            tqdm.write('Fine tuning on')

    writer.close()
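run_train calls helpers that are not shown here. Their signatures are visible from the call sites, so a hypothetical minimal print_training_info might look like the sketch below (the real implementation may log more):

def print_training_info(batch_accuracy, loss, step, writer):
    # hypothetical helper: mirror batch metrics to TensorBoard and the console
    writer.add_scalar('train/loss', loss.item(), step)
    writer.add_scalar('train/accuracy', batch_accuracy, step)
    tqdm.write(f'step {step}: loss={loss.item():.4f}, acc={batch_accuracy:.4f}')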