d_loss.backward() d_opt.step() # 生成器训练 z = torch.randn(imgData.size(0), DDN_SIZE, 1, 1).to(device) fake_img_1 = g_net(z) output = d_net(fake_img_1) g_loss = loss_fn(output, real_label) g_opt.zero_grad() g_loss.backward() g_opt.step() for name, value in d_net.named_parameters(): # print('name: {0},\t grad: {1}'.format(name, value.grad)) writer.add_histogram(name, value, epoch) if i % 30 == 0: real_score = real_score.cpu().data.mean() fake_score = fake_score.cpu().data.mean() print( "Epoch:[{}/{}],d_loss:{:.3f}," "g_loss:{:.3f},real_score:{:.3f},fake_score:{:.3f}".format( i, epoch, d_loss, g_loss, real_score, fake_score)) fake_img = fake_img.cpu().data save_image(fake_img, "./IMG/{}-fake.png".format(epoch), nrow=10, normalize=True,
def deepinversion_improved(self, use_generator = False, \
                           discrete_label = True, \
                           noisify_network = 0.0, \
                           knowledge_distill = 0.0, \
                           mutual_info = 0.0, \
                           batchnorm_transfer = 0.0, \
                           use_discriminator = 0.0, \
                           n_iters = 100):
    """Run an improved DeepInversion-style synthesis loop against self.net.

    Either optimizes a batch of free input tensors directly, or trains a
    conditional generator (``use_generator=True``) so that its samples match
    the pre-trained classifier's batch-norm statistics and label predictions.

    Parameters
    ----------
    use_generator : bool
        If True, optimize the generator ``self.net_gen``; otherwise optimize
        the input tensor ``x`` directly.
    discrete_label : bool
        If True, sample binary targets; otherwise sample uniform [0, 1)
        targets (CUDA-only path via ``torch.cuda.FloatTensor``).
    noisify_network : float
        > 0 enables per-iteration weight noise on the classifier, linearly
        annealed from ``noisify_network`` to 0 over ``n_iters``.
    knowledge_distill : float
        > 0 enables teacher-student distillation regularization (weight).
    mutual_info : float
        > 0 enables the encoder-based mutual-information + diversity terms.
    batchnorm_transfer : float
        > 0 enables the generator-side batch-norm transfer loss (weight).
    use_discriminator : float
        > 0 enables per-BN-layer WGAN-GP feature discriminators (weight).
    n_iters : int
        Number of optimization iterations.

    Side effects: writes TensorBoard logs, mutates ``self.imgname``,
    ``self.optimizer_dis`` (when discriminators are used), and saves/shows a
    matplotlib figure at the end.
    """
    tb = SummaryWriter()
    if use_generator == True:
        # Fixed initial latent/labels used only to build the first sample
        # batch (re-sampled every iteration inside the loop below).
        z = torch.randn((self.n_samples, self.latent_dim),
                        requires_grad=False,
                        device=self.device,
                        dtype=torch.float)
        if discrete_label == True:
            y_gt = torch.randint(0, 2, (self.n_samples, self.label_dim),
                                 dtype=torch.float,
                                 device=self.device)
        else:
            # NOTE(review): torch.cuda.FloatTensor assumes a CUDA device —
            # this branch fails on CPU; confirm intended.
            y_gt = torch.cuda.FloatTensor(self.n_samples,
                                          self.label_dim).uniform_(0, 1)
        x = self.net_gen(z, y_gt)
        if mutual_info > 0.0:
            ''' declare the optimizer for the encoder network '''
            optimizer = torch.optim.Adam(list(self.net_gen.parameters()) +
                                         list(self.net_enc.parameters()),
                                         lr=self.lr)
        else:
            optimizer = torch.optim.Adam(self.net_gen.parameters(),
                                         lr=self.lr)
    else:
        # Directly optimize the inputs themselves (classic DeepInversion).
        x = torch.randn((self.n_samples, 2),
                        requires_grad=True,
                        device=self.device,
                        dtype=torch.float)
        if discrete_label == True:
            y_gt = torch.randint(0, 2, (self.n_samples, self.label_dim),
                                 dtype=torch.float,
                                 device=self.device)
        else:
            y_gt = torch.cuda.FloatTensor(self.n_samples,
                                          self.label_dim).uniform_(0, 1)
        optimizer = torch.optim.Adam([x], lr=self.lr)

    #update name of output
    self.imgname = self.imgname + "_gen%d" % (use_generator)

    ''' declare the optimizer for the student network '''
    optimizer_std = torch.optim.Adam(self.net_std.parameters(),
                                     lr=self.classifier_lr)

    if self.device == 'cuda':
        x_np = x.cpu().detach().clone().numpy()
    else:
        x_np = x.detach().clone().numpy()
    fig, ax = self.setup_plot_progress(x_np)

    total_loss = []
    # set for testing with batchnorm
    self.net.eval()

    ## Create hooks for feature statistics
    loss_bn_feature_layers = []
    if use_generator == True and use_discriminator > 0.0:
        nets_dis = []
        nets_dis_params = []
    for module in self.net.modules():
        if isinstance(module, nn.BatchNorm1d):
            # One forward hook per BatchNorm1d layer collects the feature
            # statistics used for the batch-norm regularization loss.
            loss_bn_feature_layers.append(bn1dfeathook(module))
            if use_generator == True and use_discriminator > 0.0:
                # One small discriminator per BN layer, operating on that
                # layer's features (WGAN-style critic).
                net_dis = netdis(module.running_mean.shape[0],
                                 self.n_hidden, 1).cuda()
                net_dis.apply(weights_init)
                nets_dis.append(net_dis)
                nets_dis_params += list(net_dis.parameters())
    if use_generator == True and use_discriminator > 0.0:
        self.optimizer_dis = torch.optim.Adam(nets_dis_params,
                                              lr=self.lr,
                                              betas=(0.5, 0.9))

    ## Create hooks for feature statistics for generator
    if use_generator == True and batchnorm_transfer > 0.0:
        loss_bn_feature_layers_gen = []
        self.compute_loss_bn_gen(loss_bn_feature_layers_gen)

    for it in range(n_iters):
        self.net.zero_grad()
        self.net_gen.zero_grad()
        self.net_std.zero_grad()
        self.net_enc.zero_grad()
        optimizer.zero_grad()
        optimizer_std.zero_grad()

        if use_generator == True:
            ''' randomly sampling latent and labels '''
            z = torch.randn((self.n_samples, self.latent_dim),
                            requires_grad=False,
                            device=self.device,
                            dtype=torch.float)
            y_gt = torch.randint(0, 2, (self.n_samples, self.label_dim),
                                 dtype=torch.float,
                                 device=self.device)
        if use_generator == True:
            ''' generating samples with generator '''
            x = self.net_gen(z, y_gt)
        '''
        **********************************************************************
        To optimize the generated samples or training the generator
        **********************************************************************
        '''
        if noisify_network > 0.0:
            ''' adding noise into the pre-trained classifier '''
            # Noise amplitude anneals linearly to zero over the run.
            weight = noisify_network * (n_iters - it) / n_iters
            self.net, orig_params = add_noise_to_net(self.net,
                                                     weight=weight,
                                                     noise_type='uniform')
            if it == 0:
                self.imgname = self.imgname + "_nosify%0.3f" % (
                    noisify_network)

        y_pd = self.net(x)
        ''' main loss (cross-entropy loss) '''
        loss_main = self.loss_func(y_pd, y_gt)
        ''' l2 regularization '''
        loss_l2 = torch.norm(x.view(-1, self.n_input_dim), dim=1).mean()
        ''' batch-norm regularization '''
        rescale = [1. for _ in range(len(loss_bn_feature_layers))]
        loss_bn = sum([
            mod.r_feature * rescale[idx]
            for (idx, mod) in enumerate(loss_bn_feature_layers)
        ])
        ''' total loss '''
        # BN term is down-weighted when feature discriminators are active.
        if use_generator == True and use_discriminator > 0.0:
            bn_w = 0.05
        else:
            bn_w = 1.0
        loss = loss_main + 0.005 * loss_l2 + bn_w * loss_bn

        if knowledge_distill > 0.0:
            ''' knowledge distillation (teacher-student) based regularization '''
            y_st = self.net_std(x)
            #loss_kd = 1 - self.loss_func(y_st, y_pd.detach())
            loss_kd = knowledge_distill_loss(y_pd.detach(), y_st)
            loss = loss + knowledge_distill * loss_kd
            if it == 0:
                self.imgname = self.imgname + "_kdistill%0.3f" % (
                    knowledge_distill)

        if mutual_info > 0.0:
            ''' mutual information constraint '''
            # Encoder should recover the latent from the generated sample.
            ze = self.net_enc(x)
            loss_mi = ((z - ze)**2).mean()
            zdiv = torch.randn((self.n_samples, self.latent_dim),
                               requires_grad=False,
                               device=self.device,
                               dtype=torch.float)
            xdiv = self.net_gen(zdiv, y_gt)
            loss_div = diveristy_loss(z, x, zdiv, xdiv)
            loss = loss + mutual_info * loss_mi + 0.1 * mutual_info * loss_div
            if it == 0:
                self.imgname = self.imgname + "_minfo%0.3f" % (mutual_info)

        if use_generator == True and batchnorm_transfer > 0.0:
            ''' batch-norm transfer loss '''
            rescale_gen = [
                1. for _ in range(len(loss_bn_feature_layers_gen))
            ]
            loss_bn_gen = sum([
                mod.r_feature * rescale_gen[idx]
                for (idx, mod) in enumerate(loss_bn_feature_layers_gen)
            ])
            loss = loss + batchnorm_transfer * loss_bn_gen
            if it == 0:
                self.imgname = self.imgname + "_btransfer%0.3f" % (
                    batchnorm_transfer)

        if use_generator == True and use_discriminator > 0.0:
            # train the generator on features
            loss_g = 0
            # traing the generator on features
            for (idx, mod) in enumerate(loss_bn_feature_layers):
                nets_dis[idx].zero_grad()
                # frozen the gradient for the discriminator
                for p in nets_dis[idx].parameters():
                    p.requires_grad = False  # to avoid computation
                feat_fake = mod.feat_fake.cuda()
                d_fake = nets_dis[idx](feat_fake)
                loss_g = loss_g - d_fake.mean()
            loss = loss + use_discriminator * loss_g
            if use_generator == True and it == 0:
                self.imgname = self.imgname + "_discriminator%0.3f" % (
                    use_discriminator)

        loss.backward()
        optimizer.step()

        if it % 100 == 0:
            tb.add_scalar("Total loss: ", loss, it)
            tb.add_scalar("Loss batchnorm", loss_bn, it)
            tb.add_histogram("Input", x, it)
            # tb.add_histogram("Input/gradients", x.grad, it)
            for name, param in self.net_gen.named_parameters():
                tb.add_histogram(name, param.data, it)
                tb.add_histogram(name + "/gradients", param.grad, it)

        if noisify_network > 0.0:
            ''' reset the network's parameters '''
            reset_params(self.net, orig_params)

        if knowledge_distill > 0.0:
            '''
            **********************************************************************
            To update the student network
            **********************************************************************
            '''
            if use_generator == True:
                ''' generating samples with generator '''
                x = self.net_gen(z, y_gt)
            y_pd = self.net(x)
            y_st = self.net_std(x)
            #loss_kd = self.loss_func(y_st, y_pd.detach())
            loss_kd = 1. - knowledge_distill_loss(y_pd.detach(), y_st)
            loss_kd.backward()
            optimizer_std.step()

        ''' store the main loss to plot on the figure '''
        total_loss.append(loss.item())

        if use_generator == True and use_discriminator > 0.0:
            # traing the discriminator on features
            # 5 critic updates per generator update (WGAN-style schedule).
            for _ in range(5):
                loss_d = 0
                x = self.net_gen(z, y_gt)
                # Forward pass refreshes the hooked feat_real/feat_fake.
                self.net(x)
                for (idx, mod) in enumerate(loss_bn_feature_layers):
                    nets_dis[idx].zero_grad()
                    for p in nets_dis[idx].parameters(
                    ):  # reset requires_grad
                        p.requires_grad = True
                    feat_real = mod.feat_real.cuda()
                    feat_fake = mod.feat_fake.cuda()
                    d_real = nets_dis[idx](feat_real)
                    d_fake = nets_dis[idx](feat_fake)
                    penalty = calc_gradient_penalty(nets_dis[idx],
                                                    feat_real,
                                                    feat_fake,
                                                    LAMBDA=1.0)
                    loss_d = loss_d + use_discriminator * (
                        d_fake.mean() - d_real.mean() + penalty)
                loss_d.backward()
                self.optimizer_dis.step()

        if it % 10 == 0:
            print('-- iter %d --' % (it))
            print('target loss: %f' % (loss_main.item()))
            print('l2-norm loss: %f' % (loss_l2.item()))
            print('batchnorm loss: %f' % (loss_bn.item()))
            if knowledge_distill > 0.0:
                # NOTE(review): this prints loss_bn, not loss_kd — likely a
                # copy-paste bug; confirm before relying on this log line.
                print('distillation loss: %f' % (loss_bn.item()))
            if mutual_info > 0.0:
                print('mutual information / diversity losses: %f / %f' %
                      (loss_mi.item(), loss_div.item()))
            if batchnorm_transfer > 0.0:
                print('batch-norm transfer loss: %f ' % (loss_bn_gen.item()))
            if use_generator == True and use_discriminator > 0.0:
                print('loss d / loss g: %f / %f' %
                      (loss_d.item(), loss_g.item()))
            print('total loss: %f' % (loss.item()))
            ''' realtime plot '''
            ax[0].plot(total_loss, c='b')
            fig.canvas.draw()

    if self.device == 'cuda':
        x_np = x.cpu().detach().numpy()
    else:
        x_np = x.detach().numpy()
    tb.close()
    ax[1].scatter(x_np[:, 0], x_np[:, 1], c='b', cmap=plt.cm.Accent)
    plt.savefig(self.basedir + "%s.png" % (self.imgname))
    plt.show()
def main(args):
    """Train (or evaluate) a CoMA-style mesh VAE from a config file.

    Reads hyperparameters from ``args.conf``, builds mesh down/up-sampling
    transforms, datasets and loaders, optionally restores a checkpoint, and
    either runs evaluation only (``config['eval']``) or the full training
    loop with TensorBoard logging and periodic checkpointing.

    :param args: parsed CLI namespace; uses ``conf``, ``checkpoint_dir``,
        ``data_dir``, ``split`` and ``split_term``.
    :raises FileNotFoundError: if the config file does not exist.
    """
    # Fail fast with a clear error instead of printing and continuing into
    # read_config with a missing file (the original fell through here).
    if not os.path.exists(args.conf):
        raise FileNotFoundError('Config not found: ' + args.conf)
    config = read_config(args.conf)

    print('Initializing parameters')
    template_file_path = config['template_fname']
    template_mesh = Mesh(filename=template_file_path)

    # CLI flag overrides the config-file checkpoint directory.
    if args.checkpoint_dir:
        checkpoint_dir = args.checkpoint_dir
    else:
        checkpoint_dir = config['checkpoint_dir']
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    visualize = config['visualize']
    output_dir = config['visual_output_dir']
    if visualize is True and not output_dir:
        print(
            'No visual output directory is provided. Checkpoint directory will be used to store the visual results'
        )
        output_dir = checkpoint_dir
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    eval_flag = config['eval']
    lr = config['learning_rate']
    lr_decay = config['learning_rate_decay']
    weight_decay = config['weight_decay']
    total_epochs = config['epoch']
    workers_thread = config['workers_thread']
    opt = config['optimizer']
    batch_size = config['batch_size']
    val_losses, accs, durations = [], [], []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print('Generating transforms')
    # M: mesh hierarchy; A: adjacency; D/U: down/up-sampling matrices.
    M, A, D, U = mesh_operations.generate_transform_matrices(
        template_mesh, config['downsampling_factors'])
    D_t = [scipy_to_torch_sparse(d).to(device) for d in D]
    U_t = [scipy_to_torch_sparse(u).to(device) for u in U]
    A_t = [scipy_to_torch_sparse(a).to(device) for a in A]
    num_nodes = [len(M[i].v) for i in range(len(M))]

    print('Loading Dataset')
    if args.data_dir:
        data_dir = args.data_dir
    else:
        data_dir = config['data_dir']

    normalize_transform = Normalize()
    dataset = ComaDataset(data_dir,
                          dtype='train',
                          split=args.split,
                          split_term=args.split_term,
                          pre_transform=normalize_transform)
    dataset_val = ComaDataset(data_dir,
                              dtype='val',
                              split=args.split,
                              split_term=args.split_term,
                              pre_transform=normalize_transform)
    dataset_test = ComaDataset(data_dir,
                               dtype='test',
                               split=args.split,
                               split_term=args.split_term,
                               pre_transform=normalize_transform)
    train_loader = DataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=workers_thread)
    val_loader = DataLoader(dataset_val,
                            batch_size=1,
                            shuffle=True,
                            num_workers=workers_thread)
    test_loader = DataLoader(dataset_test,
                             batch_size=1,
                             shuffle=False,
                             num_workers=workers_thread)

    print('Loading model')
    start_epoch = 1
    coma = Coma(dataset, config, D_t, U_t, A_t, num_nodes)
    if opt == 'adam':
        optimizer = torch.optim.Adam(coma.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)
    elif opt == 'sgd':
        optimizer = torch.optim.SGD(coma.parameters(),
                                    lr=lr,
                                    weight_decay=weight_decay,
                                    momentum=0.9)
    else:
        raise Exception('No optimizer provided')

    checkpoint_file = config['checkpoint_file']
    print(checkpoint_file)
    if checkpoint_file:
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch_num']
        coma.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # Move optimizer state to the target device after restoring.
        #To find if this is fixed in pytorch
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)
    coma.to(device)

    if eval_flag:
        # Evaluation-only mode: score the test set and exit.
        val_loss = evaluate(coma, output_dir, test_loader, dataset_test,
                            template_mesh, device, visualize)
        print('val loss', val_loss)
        return

    best_val_loss = float('inf')
    val_loss_history = []

    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    log_dir = os.path.join('runs/cvae_dx', current_time)
    writer = SummaryWriter(log_dir + 'ds2_lr0.04_z2_kld_v5')
    print(coma.z)  # latent dimensionality (debug)

    for epoch in range(start_epoch, total_epochs + 1):
        print("Training for epoch ", epoch)
        recon_loss, kld_loss, mu, var, kld_weight = train(
            coma, train_loader, len(dataset), optimizer, device, epoch)
        val_loss = evaluate(coma,
                            output_dir,
                            val_loader,
                            dataset_val,
                            template_mesh,
                            device,
                            visualize=visualize)
        train_loss = recon_loss + kld_loss
        writer.add_scalar('loss/train_loss', recon_loss + kld_loss, epoch)
        writer.add_scalar('train_loss/recon_loss', recon_loss, epoch)
        writer.add_scalar('train_loss/kld_loss', kld_loss, epoch)
        writer.add_scalar('loss/val_loss', val_loss, epoch)
        writer.add_histogram('hist/mean', mu, epoch)
        writer.add_histogram('hist/variance', var, epoch)
        print('epoch ', epoch, ' Recon loss ', recon_loss, ' KLD loss ',
              kld_loss, ' Val loss ', val_loss)
        print('kld weight ', kld_weight)

        # Checkpoint on new best validation loss, and periodically.
        if val_loss < best_val_loss:
            save_model(coma, optimizer, epoch, train_loss, val_loss,
                       checkpoint_dir)
            best_val_loss = val_loss
        if epoch == total_epochs or epoch % 100 == 0:
            save_model(coma, optimizer, epoch, train_loss, val_loss,
                       checkpoint_dir)

        val_loss_history.append(val_loss)
        val_losses.append(best_val_loss)

        if opt == 'sgd':
            adjust_learning_rate(optimizer, lr_decay)

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    writer.close()
def train(self):
    """Run the full training loop over ``self.num_epoch`` epochs.

    Optionally resumes model/optimizer state from ``<log_root>/torch_model``
    when ``self.resume`` is set. Iterates every phase in
    ``self.data_loaders`` (only 'train' back-propagates), logs running and
    per-epoch losses to TensorBoard, saves per-epoch figures, and — when
    ``self.save`` is set — writes model/optimizer state dicts to disk.

    Returns a dict of metrics for hyperparameter-tuning frameworks:
    ``{"val_loss_avg": <last computed epoch_loss_avg>}``.

    NOTE(review): epoch/iteration counters are stored on the model itself
    (``self.model.epoch`` / ``self.model.iteration``) so they persist in the
    saved state dict — confirm the model defines these as buffers/params.
    """
    if self.resume:
        print('Resuming training ...')
        checkpoint = torch.load(os.path.join(self.log_root, 'torch_model'))
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        print('Starting training ...')
    writer = SummaryWriter(self.log_root)
    self.model = self.model.to(self.device)
    # Resume counters from the (possibly restored) model state.
    epoch = int(self.model.epoch) + 1
    it = int(self.model.iteration)
    for epoch in range(epoch, epoch + self.num_epoch):
        epoch_root = 'epoch_{:02d}'.format(epoch)
        if not os.path.exists(os.path.join(self.log_root, epoch_root)):
            os.makedirs(os.path.join(self.log_root, epoch_root))
        for phase in self.data_loaders.keys():
            epoch_loss = 0
            # Toggle batch-norm/dropout behavior per phase.
            if phase == 'train':
                self.model.train(True)
            else:
                self.model.train(False)
            running_loss = 0.0
            for i, (data, index) in enumerate(self.data_loaders[phase]):
                it += 1
                # copy input and targets to the device object
                inputs = data['input'].to(self.device)
                targets = data['target'].to(self.device)
                # zero the parameter gradients
                self.optimizer.zero_grad()
                # forward + backward + optimize
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets)
                if phase == 'train':
                    loss.backward()
                    self.optimizer.step()
                # print statistics
                running_loss += loss.item()
                epoch_loss += loss.item()
                if (i + 1) % self.log_int == 0:
                    running_loss_avg = running_loss / self.log_int
                    print('Phase: ' + phase +
                          ', epoch: {}, batch {}: running loss: {:0.3f}'.
                          format(self.model.epoch, i + 1, running_loss_avg))
                    writer.add_scalars('running_loss',
                                       {phase: running_loss_avg}, it)
                    running_loss = 0.0
            if phase in ['train', 'val']:
                epoch_loss_avg = epoch_loss / self.data_lengths[phase]
                print('Phase: ' + phase +
                      ', epoch: {}: epoch loss: {:0.3f}'.format(
                          epoch, epoch_loss_avg))
                writer.add_scalars('epoch_loss', {phase: epoch_loss_avg},
                                   epoch)
                # Log histograms / figures from the LAST batch of the phase.
                writer.add_histogram(
                    'input histogram',
                    inputs.cpu().data.numpy()[0, 0].flatten(), epoch)
                writer.add_histogram(
                    'output histogram',
                    outputs.cpu().data.numpy()[0, 0].flatten(), epoch)
                # Show at most 4 examples per figure.
                figure_inds = list(range(inputs.shape[0]))
                figure_inds = figure_inds if len(
                    figure_inds) < 4 else list(range(4))
                fig = Trainer.show_imgs(inputs, outputs, figure_inds)
                fig.savefig(
                    os.path.join(self.log_root, epoch_root, phase + '.png'))
                writer.add_figure('images ' + phase, fig, epoch)
            # NOTE(review): bitwise '&' on bools — works, but 'and' is the
            # idiomatic form; confirm self.save is always a bool.
            if self.save & (phase == 'train'):
                print('Writing model graph...')
                writer.add_graph(self.model, inputs)
                print('Saving model state...')
                # Persist counters inside the model so they survive in the
                # saved state dict (non-trainable parameters).
                self.model.epoch = torch.nn.Parameter(torch.tensor(epoch),
                                                      requires_grad=False)
                self.model.iteration = torch.nn.Parameter(
                    torch.tensor(it), requires_grad=False)
                torch.save({
                    'model_state_dict': self.model.state_dict(),
                }, os.path.join(self.log_root, epoch_root,
                                'model_state_dict'))
                torch.save(
                    {'optimizer_state_dict': self.optimizer.state_dict()},
                    os.path.join(self.log_root, 'optimizer_state_dict'))
    print('Finished training ...')
    writer.close()
    print('Writer closed ...')
    # dictionary of accuracy metrics for tune hyperparameter optimization
    return {"val_loss_avg": epoch_loss_avg}
def train_net(net,
              device,
              epochs=5,
              batch_size=1,
              lr=0.001,
              val_percent=0.1,
              save_cp=True,
              img_scale=0.5):
    """Train a UNet-style segmentation network.

    Splits the dataset into train/val, trains with RMSprop + gradient value
    clipping, validates roughly 10 times per epoch (scoring with cross
    entropy for multi-class or Dice for binary), logs everything to
    TensorBoard, and optionally saves a checkpoint after every epoch.

    :param net: network exposing ``n_channels`` and ``n_classes``
    :param device: torch device to train on
    :param epochs: number of training epochs
    :param batch_size: mini-batch size
    :param lr: initial learning rate
    :param val_percent: fraction of the dataset held out for validation
    :param save_cp: save a checkpoint after each epoch when True
    :param img_scale: image downscaling factor passed to the dataset
    """
    dataset = BasicDataset(dir_img, dir_mask, img_scale)
    n_val = int(len(dataset) * val_percent)
    n_train = len(dataset) - n_val
    train, val = random_split(dataset, [n_train, n_val])
    train_loader = DataLoader(train,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)
    val_loader = DataLoader(val,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=8,
                            pin_memory=True,
                            drop_last=True)

    writer = SummaryWriter(
        comment=f'LR_{lr}_BS_{batch_size}_SCALE_{img_scale}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
    ''')

    optimizer = optim.RMSprop(net.parameters(),
                              lr=lr,
                              weight_decay=1e-8,
                              momentum=0.9)
    # Plateau scheduler: minimize CE for multi-class, maximize Dice for binary.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min' if net.n_classes > 1 else 'max', patience=2)
    if net.n_classes > 1:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.BCEWithLogitsLoss()

    # Validate ~10 times per epoch; max(1, ...) prevents a ZeroDivisionError
    # when the training set is smaller than 10 * batch_size.
    val_interval = max(1, n_train // (10 * batch_size))

    for epoch in range(epochs):
        net.train()
        epoch_loss = 0
        with tqdm(total=n_train,
                  desc=f'Epoch {epoch + 1}/{epochs}',
                  unit='img') as pbar:
            for batch in train_loader:
                imgs = batch['image']
                true_masks = batch['mask']
                assert imgs.shape[1] == net.n_channels, \
                    f'Network has been defined with {net.n_channels} input channels, ' \
                    f'but loaded images have {imgs.shape[1]} channels. Please check that ' \
                    'the images are loaded correctly.'

                imgs = imgs.to(device=device, dtype=torch.float32)
                # CrossEntropyLoss needs long targets; BCE needs float.
                mask_type = torch.float32 if net.n_classes == 1 else torch.long
                true_masks = true_masks.to(device=device, dtype=mask_type)

                masks_pred = net(imgs)
                loss = criterion(masks_pred, true_masks)
                epoch_loss += loss.item()
                writer.add_scalar('Loss/train', loss.item(), global_step)
                pbar.set_postfix(**{'loss (batch)': loss.item()})

                optimizer.zero_grad()
                loss.backward()
                # Clip gradient values to stabilize training.
                nn.utils.clip_grad_value_(net.parameters(), 0.1)
                optimizer.step()

                pbar.update(imgs.shape[0])
                global_step += 1
                if global_step % val_interval == 0:
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        writer.add_histogram('weights/' + tag,
                                             value.data.cpu().numpy(),
                                             global_step)
                        writer.add_histogram('grads/' + tag,
                                             value.grad.data.cpu().numpy(),
                                             global_step)
                    val_score = eval_net(net, val_loader, device)
                    scheduler.step(val_score)
                    writer.add_scalar('learning_rate',
                                      optimizer.param_groups[0]['lr'],
                                      global_step)

                    if net.n_classes > 1:
                        logging.info(
                            'Validation cross entropy: {}'.format(val_score))
                        writer.add_scalar('Loss/test', val_score, global_step)
                    else:
                        logging.info(
                            'Validation Dice Coeff: {}'.format(val_score))
                        writer.add_scalar('Dice/test', val_score, global_step)

                    writer.add_images('images', imgs, global_step)
                    if net.n_classes == 1:
                        writer.add_images('masks/true', true_masks,
                                          global_step)
                        writer.add_images('masks/pred',
                                          torch.sigmoid(masks_pred) > 0.5,
                                          global_step)

        if save_cp:
            try:
                os.mkdir(dir_checkpoint)
                logging.info('Created checkpoint directory')
            except OSError:
                # Directory already exists.
                pass
            torch.save(net.state_dict(),
                       dir_checkpoint + f'CP_epoch{epoch + 1}.pth')
            logging.info(f'Checkpoint {epoch + 1} saved !')

    writer.close()
"xsinx": x * np.sin(x), "xcosx": x * np.cos(x) }, x) writer.close() # ----------------------------------- 2 histogram ----------------------------------- # flag = 0 flag = 1 if flag: writer = SummaryWriter(comment='test_comment', filename_suffix="test_suffix") for x in range(2): np.random.seed(x) data_union = np.arange(100) data_normal = np.random.normal(size=1000) writer.add_histogram('distribution union', data_union, x) writer.add_histogram('distribution normal', data_normal, x) plt.subplot(121).hist(data_union, label="union") plt.subplot(122).hist(data_normal, label="normal") plt.legend() plt.show() writer.close()
elif opt.prune == 1: CBL_idx, _, prune_idx, shortcut_idx, _ = parse_moudle_defs1( model.module_defs) # TODO 剪枝策略3 print('shortcut sparse training') # tensorboard tb_writer = SummaryWriter() for epoch in range(opt.epochs): model.train() if opt.sr: # TODO bn可视化 for idx in prune_idx: bn_weights = gather_bn_weights(model.module_list, [idx]) tb_writer.add_histogram('bn_weight/hist', bn_weights.numpy(), epoch, bins='doane') start_time = time.time() for batch_i, (paths, imgs, targets) in enumerate(dataloader): batches_done = len(dataloader) * epoch + batch_i # TODO plot images 100次保存一次结果 if batches_done == 0: fname = 'train_batch%g.jpg' % batch_i plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname) tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1],
def main():
    """
    Train and test a YOLO-style detector, with optional knowledge
    distillation from a teacher model, sparsity (BN-L1) training for later
    pruning, mixed precision (apex amp), and distributed (DDP) training.

    Reads all settings from the global ``opt`` produced by ``parse()`` plus
    the hyperparameter/data YAML files; logs to TensorBoard on rank -1/0.

    :param opt: args
    :param writer: tensorboard
    :return: last ``results`` tuple (P, R, mAP, F1, val GIoU, val
        Objectness, val Classification)
    """
    global opt
    opt = parse()
    arc = opt.arc
    cfg = opt.cfg
    teacher_cfg = opt.teacher_cfg
    img_size = opt.img_size
    epochs = opt.epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights
    teacher_weights = opt.teacher_weights
    multi_scale = opt.multi_scale
    sparsity_training = opt.st

    # Resume from the 'last' checkpoint path when requested.
    opt.weights = last if opt.resume else opt.weights

    # Initial logging — only rank -1/0 logs at INFO level.
    logging.basicConfig(
        format="%(message)s",
        level=logging.INFO if opt.local_rank in [-1, 0] else logging.WARN)

    # Train
    logger.info(opt)
    if opt.local_rank in [-1, 0]:
        logger.info('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
        writer = SummaryWriter()

    # Hyperparameters
    with open(opt.hyp) as f_hyp:
        hyp = yaml.safe_load(f_hyp)
    # data dict
    with open(opt.data) as f_data:
        data = yaml.safe_load(f_data)

    # Distributed training initialize
    device = select_device(opt.device)
    if opt.local_rank != -1:
        dist.init_process_group(init_method="env://", backend='nccl')
        torch.cuda.set_device(opt.local_rank)
        device = torch.device(f"cuda:{opt.local_rank}")
        # world_size = torch.distributed.get_world_size()

    init_seeds()
    cuda = device.type != 'cpu'
    torch.backends.cudnn.benchmark = True

    if multi_scale:
        # Multi-scale range is ±50% of the nominal size, in 32-px strides.
        img_size_min = round(img_size / 32 / 1.5) + 1
        img_size_max = round(img_size / 32 * 1.5) - 1
        img_size = img_size_max * 32  # initiate with maximum multi_scale size
        logger.info(f'Using multi-scale {img_size_min * 32} - {img_size}')

    train_path = data['train']
    num_classes = int(data['num_classes'])  # number of classes

    # Load dataset
    dataset = LoadImagesAndLabels(train_path,
                                  img_size,
                                  batch_size,
                                  augment=True,
                                  hyp=hyp,
                                  rect=opt.rect)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset) if opt.local_rank != -1 else None
    num_worker = os.cpu_count() // torch.cuda.device_count()
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=min([num_worker, batch_size, 8]),
        shuffle=not (opt.rect or train_sampler),
        sampler=train_sampler,
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Load model
    model = Model(cfg, img_size, arc=arc).to(device)

    # Load teacher model
    if teacher_cfg:
        teacher_model = Model(teacher_cfg, img_size, arc).to(device)

    # optimizer parameter groups: conv weights get weight decay, rest don't.
    param_group0, param_group1 = [], []
    for key, value in model.named_parameters():
        if 'Conv2d.weight' in key:
            param_group1.append(value)
        else:
            param_group0.append(value)

    if opt.adam:
        optimizer = optim.Adam(param_group0, lr=hyp['lr0'])
    else:
        optimizer = optim.SGD(param_group0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    # add param_group1 with weight_decay
    optimizer.add_param_group({
        'params': param_group1,
        'weight_decay': hyp['weight_decay']
    })
    logger.info(f'Optimizer groups: {len(param_group1)} conv.weight, {len(param_group0)} other')
    del param_group0, param_group1

    start_epoch = 0
    best_fitness = 0.
    if weights.endswith('.pt'):
        # PyTorch checkpoint: restore matching weights (shape-intersected).
        checkpoint = torch.load(weights, map_location=device)
        state_dict = intersect_dicts(checkpoint['model'], model.state_dict())
        model.load_state_dict(state_dict, strict=False)
        print('loaded weights from', weights, '\n')
        # load optimizer
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_fitness = checkpoint['best_fitness']
        # load results
        if checkpoint.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(checkpoint['training_results'])
        # resume
        if opt.resume:
            start_epoch = checkpoint['epoch'] + 1
        del checkpoint
    elif len(weights) > 0:
        # weights are 'yolov4.weights', 'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)
        logger.info(f'loaded weights from {weights}\n')

    # Load teacher weights
    if teacher_cfg:
        if teacher_weights.endswith('.pt'):
            teacher_model.load_state_dict(
                torch.load(teacher_weights, map_location=device)['model'])
        elif teacher_weights.endswith('.weights'):
            load_darknet_weights(teacher_model, teacher_weights)
        else:
            raise Exception('pls provide proper teacher weights for knowledge distillation')
        if not mixed_precision:
            teacher_model.eval()
        logger.info('<......................using knowledge distillation....................>')
        logger.info(f'teacher model: {teacher_weights}\n')

    # Sparsity training
    # NOTE(review): prune_index is only assigned when opt.prune == 0 —
    # later uses would raise NameError otherwise; confirm upstream invariant.
    if opt.prune == 0:
        _, _, prune_index = parse_module_index(model.module_dicts)
        if sparsity_training:
            logger.info('normal sparse training')

    if mixed_precision:
        if teacher_cfg:
            [model, teacher_model], optimizer = amp.initialize(
                [model, teacher_model],
                optimizer,
                opt_level='O1',
                verbosity=1)
        else:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level='O1',
                                              verbosity=1)

    # SyncBatchNorm and distributed training
    if cuda and opt.local_rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        model = model.to(device)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[opt.local_rank])
        # Re-expose attributes hidden behind the DDP wrapper.
        model.module_list = model.module.module_list
        model.yolo_layers = model.module.yolo_layers

    # Log pre-training BN weight distributions for the prunable layers.
    for index in prune_index:
        bn_weights = gather_bn_weights(model.module_list, [index])
        if opt.local_rank == 0:
            writer.add_histogram('before_train_per_layer_bn_weights/hist',
                                 bn_weights.numpy(),
                                 index,
                                 bins='doane')

    # Start training
    model.num_classes = num_classes
    model.arc = opt.arc
    model.hyp = hyp
    num_batch_size = len(dataloader)
    # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    results = (0, 0, 0, 0, 0, 0, 0)
    start_train_time = time.time()
    logger.info('Image sizes %d \n Starting training for %d epochs...',
                img_size, epochs)

    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()
        mean_losses = torch.zeros(4).to(device)
        mean_soft_target = torch.zeros(1).to(device)
        pbar = enumerate(dataloader)
        logger.info(('\n %10s %10s %10s %10s %10s %10s %10s %10s'), 'Epoch',
                    'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets',
                    'img_size')
        if opt.local_rank in [-1, 0]:
            pbar = tqdm(pbar, total=num_batch_size)
        optimizer.zero_grad()

        for i, (imgs, targets, _, _) in pbar:  # batch -------------------------------------------------------------
            # Global batch counter across epochs.
            num_integrated_batches = i + num_batch_size * epoch

            # Adjust the learning rate
            learning_rate = adjust_learning_rate(optimizer,
                                                 num_integrated_batches,
                                                 num_batch_size, hyp, epoch,
                                                 epochs)
            if i == 0 and opt.local_rank in [-1, 0]:
                logger.info(f'learning rate: {learning_rate}')
            imgs = imgs.to(device) / 255.0
            targets = targets.to(device)

            # Multi-Scale training: re-pick a size every 10 accumulations.
            if multi_scale:
                if num_integrated_batches / accumulate % 10 == 0:
                    img_size = random.randrange(img_size_min,
                                                img_size_max + 1) * 32
                scale_factor = img_size / max(imgs.shape[2:])
                if scale_factor != 1:
                    # Keep spatial dims multiples of 32 (stride constraint).
                    new_shape = [
                        math.ceil(x * scale_factor / 32.) * 32
                        for x in imgs.shape[2:]
                    ]
                    imgs = F.interpolate(imgs,
                                         size=new_shape,
                                         mode='bilinear',
                                         align_corners=False)

            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)

            # knowledge distillation
            soft_target = 0
            if teacher_cfg:
                if mixed_precision:
                    with torch.no_grad():
                        output_teacher = teacher_model(imgs)
                else:
                    _, output_teacher = teacher_model(imgs)
                soft_target = distillation_loss(pred, output_teacher,
                                                model.num_classes,
                                                imgs.size(0))
                loss += soft_target

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Sparse the BN layer that needs pruning
            if sparsity_training:
                # bn_l1_regularization(model.module_list, opt.penalty_factor, cba_index, epoch, epochs)
                bn_l1_regularization(model.module_list, opt.penalty_factor,
                                     prune_index, epoch, epochs)

            # Accumulate gradient for x batches before optimizing
            if num_integrated_batches % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            if opt.local_rank in [-1, 0]:
                # Running means of the loss components for the progress bar.
                mean_losses = (mean_losses * i + loss_items) / (i + 1)
                mean_soft_target = (mean_soft_target * i + soft_target) / (
                    i + 1)
                memory = torch.cuda.memory_reserved(
                ) / 1E9 if torch.cuda.is_available() else 0  # (GB)
                description = ('%10s' * 2 + '%10.3g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), '%.3gG' % memory,
                    *mean_losses, mean_soft_target, img_size)
                pbar.set_description(description)
            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        # scheduler.step()

        if opt.local_rank in [-1, 0]:
            final_epoch = epoch + 1 == epochs
            # Calculate mAP
            if not (opt.notest or opt.nosave) or final_epoch:
                with torch.no_grad():
                    results, _ = test(
                        cfg,
                        data,
                        batch_size=batch_size,
                        img_size=opt.img_size,
                        model=model,
                        conf_thres=0.001
                        if final_epoch and epoch > 0 else 0.1,  # 0.1 for speed
                        save_json=final_epoch and epoch > 0)

            # Write epoch results
            with open(results_file, 'a') as file:
                # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
                file.write(description + '%10.3g' * 7 % results + '\n')

            # Write Tensorboard results
            if writer:
                outputs = list(mean_losses) + list(results)
                titles = [
                    'GIoU', 'Objectness', 'Classification', 'Train loss',
                    'Precision', 'Recall', 'mAP', 'F1', 'val GIoU',
                    'val Objectness', 'val Classification'
                ]
                for output, title in zip(outputs, titles):
                    writer.add_scalar(title, output, epoch)
                bn_weights = gather_bn_weights(model.module_list, prune_index)
                writer.add_histogram('bn_weights/hist',
                                     bn_weights.numpy(),
                                     epoch,
                                     bins='doane')

            # Update best mAP
            fitness = results[2]
            if fitness > best_fitness:
                best_fitness = fitness

            # Save training results
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save and opt.local_rank == 0:
                with open(results_file, 'r') as file:
                    # Create checkpoint
                    checkpoint = {
                        'epoch': epoch,
                        'best_fitness': best_fitness,
                        'training_results': file.read(),
                        'model': model.module.state_dict() if isinstance(
                            model, nn.parallel.DistributedDataParallel)
                        else model.state_dict(),
                        'optimizer': None
                        if final_epoch else optimizer.state_dict()
                    }
                # Save last checkpoint
                torch.save(checkpoint, last)
                # Save best checkpoint
                if best_fitness == fitness:
                    torch.save(checkpoint, best)
                # Delete checkpoint
                del checkpoint
        # end epoch -----------------------------------------------------------------------------------------------

    # end training
    if opt.local_rank in [-1, 0]:
        if len(opt.name):
            os.rename('results.txt', 'results_%s.txt' % opt.name)
        plot_results()  # save as results.png
        print(f'{epoch - start_epoch + 1} epochs completed in {(time.time() - start_train_time) / 3600:.3f} hours.\n')
    if torch.cuda.device_count() > 1:
        dist.destroy_process_group()
    torch.cuda.empty_cache()
    return results
class DDPG:
    """DDPG agent for continuous steering control.

    input : discrete actions, state space, type of learning(Straight/left/right)

    Args:
        action_space: number of action dimensions.
        state_space: number of physical-state features.
        radar_dim: number of radar features.
        type: type of learning (Straight/left/right); used to tag saved weights.
    """

    def __init__(self, action_space, state_space, radar_dim, type):
        self.action_space = action_space
        self.state_space = state_space
        self.radar_space = radar_dim
        # Action bounds; upper bound is the steer limit.
        self.lower_bound = 0.0
        self.upper_bound = 0.6
        # Exploration / learning hyper-parameters.
        self.epsilon = 0.8
        self.gamma = .99
        self.batch_size = 128
        self.epsilon_min = .1
        self.epsilon_decay = .997
        self.critic_lr = 0.006
        self.actor_lr = 0.006

        # Custom tensorboard object
        now = time.localtime()
        self.tensorboard = ModifiedTensorBoard(log_dir=f"logs/{MODEL_NAME}-Feb_{now.tm_mday}_{now.tm_min}_{now.tm_hour}_{self.radar_space}_{self.actor_lr}_{self.batch_size}")
        self.type = type

        # Online networks. Actor and critic share some 'layer' weights; those
        # are soft-mixed after every update inside replay().
        self.actor = FeedForwardNN(self.radar_space, self.state_space, self.action_space, "actor")
        self.critic = FeedForwardNN(self.radar_space, self.state_space, 1, "critic")

        # Target networks used to compute stable TD targets.
        # FIX: the original aliased them (`self.target_actor = self.actor`),
        # which made update_target() a no-op and let the TD target chase the
        # online network. Build separate nets and copy the initial weights.
        self.target_update_counter = 0
        self.target_actor = FeedForwardNN(self.radar_space, self.state_space, self.action_space, "actor")
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic = FeedForwardNN(self.radar_space, self.state_space, 1, "critic")
        self.target_critic.load_state_dict(self.critic.state_dict())

        # FIX: build the optimizers once. The original re-created Adam on every
        # replay() call, throwing away its moment estimates each step.
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)

        # Replay memory: one np.array per tuple element.
        self.buffer_capacity = 50_000
        self.buffer_counter = 0
        self.state_buffer = np.zeros((self.buffer_capacity, self.state_space))
        self.action_buffer = np.zeros((self.buffer_capacity, self.action_space))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.state_space))
        self.radar_buffer = np.zeros((self.buffer_capacity, self.radar_space))
        self.next_radar_buffer = np.zeros((self.buffer_capacity, self.radar_space))

        self.t_so_far = 0
        self.writer = SummaryWriter(log_dir=f"runs/Feb_{now.tm_mday}_{now.tm_min}_{now.tm_hour}_{self.radar_space}_{self.actor_lr}_{self.batch_size}")

    def remember(self, radar_state, radar_state_next, state, action, reward, next_state, done=None):
        """Store one (s, a, r, s') observation tuple in the replay buffers.

        The index wraps around when buffer_capacity is exceeded, replacing
        old records.
        """
        index = self.buffer_counter % self.buffer_capacity
        self.radar_buffer[index] = radar_state
        self.next_radar_buffer[index] = radar_state_next
        self.state_buffer[index] = state
        self.action_buffer[index] = action
        self.reward_buffer[index] = reward
        self.next_state_buffer[index] = next_state
        self.buffer_counter += 1

    def policy(self, radar_state, physical_state):
        """Return an action: epsilon-greedy between random and actor output,
        clipped to the legal [lower_bound, upper_bound] range."""
        if np.random.rand() <= self.epsilon:
            # Explore with a random action.
            sampled_actions = torch.rand(1)
        else:
            sampled_actions = self.actor(radar_state, physical_state, None)
        sampled_actions = sampled_actions.detach().numpy()
        # sampled_actions = np.array([(x+1)/2 for x in sampled_actions])

        # Anneal exploration on every call.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # Clip (limit) the values in an array between lower and upper bound
        # so the action is always legal.
        legal_action = np.clip(sampled_actions, self.lower_bound, self.upper_bound)
        return np.squeeze(legal_action)

    def replay(self):
        """Sample a minibatch and update critic and actor.

        Returns:
            (actor_loss, critic_loss) as numpy scalars.
        """
        # Get sampling range and draw a random batch of indices.
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert the sampled transitions to tensors.
        state_batch = torch.tensor(self.state_buffer[batch_indices], dtype=torch.float)
        action_batch = torch.tensor(self.action_buffer[batch_indices], dtype=torch.float)
        reward_batch = torch.tensor(self.reward_buffer[batch_indices], dtype=torch.float)
        next_state_batch = torch.tensor(self.next_state_buffer[batch_indices], dtype=torch.float)
        radar_batch = torch.tensor(self.radar_buffer[batch_indices], dtype=torch.float)
        next_radar_batch = torch.tensor(self.next_radar_buffer[batch_indices], dtype=torch.float)

        # Soft-mix the shared 'layer' weights between actor and critic.
        tau = 0.01
        new_dict = dict(self.critic.named_parameters())
        for name, param in self.actor.named_parameters():
            if 'layer' in name:
                new_dict[name] = (tau * param.data + (1 - tau) * new_dict[name])
        self.critic.load_state_dict(new_dict)
        new_dict = dict(self.actor.named_parameters())
        for name, param in self.critic.named_parameters():
            if 'layer' in name:
                new_dict[name] = (tau * param.data + (1 - tau) * new_dict[name])
        self.actor.load_state_dict(new_dict)

        # ---- Critic update: minimise TD error against the target networks.
        target_actions = self.target_actor(next_radar_batch, next_state_batch, None)
        # FIX: detach the TD target so gradients do not flow into the target
        # networks.
        y = (reward_batch + self.gamma * self.target_critic(next_radar_batch, next_state_batch, target_actions)).detach()
        critic_value = self.critic(radar_batch, state_batch, action_batch)
        critic_loss = torch.mean((y - critic_value) ** 2)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---- Actor update: maximise the critic's value of the actor's actions.
        actions = self.actor(radar_batch, state_batch, None)
        # FIX: evaluate the critic on the actor's *own* actions. The original
        # re-used the replayed `action_batch`, so actor_loss had no gradient
        # path through the actor and the policy never learned.
        critic_value = self.critic(radar_batch, state_batch, actions)
        actor_loss = -torch.mean(critic_value)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.detach().numpy(), critic_loss.detach().numpy()

    def update_target(self, tau, val):
        """Polyak-update target parameters toward the online ones.

        tau << 1 gives a slow soft update; tau >= 1 does a hard copy.
        Also logs weight histograms every 10th call (val % 10 == 0).
        """
        if (tau < 1):
            new_dict = dict(self.target_critic.named_parameters())
            for name, param in self.critic.named_parameters():
                new_dict[name].data = (param.data * tau + new_dict[name].data * (1 - tau))
            self.target_critic.load_state_dict(new_dict)
            new_dict = dict(self.target_actor.named_parameters())
            for name, param in self.actor.named_parameters():
                new_dict[name].data = (param.data * tau + new_dict[name].data * (1 - tau))
            self.target_actor.load_state_dict(new_dict)
        else:
            # Hard update: copy the online weights verbatim.
            self.target_critic.load_state_dict(self.critic.state_dict())
            self.target_actor.load_state_dict(self.actor.state_dict())

        # Log the histogram data of the Actor/Critic Network.
        if (val % 10 == 0):
            for name, param in self.actor.named_parameters():
                if 'weight' in name:
                    self.writer.add_histogram("actor" + name, param.detach().numpy(), self.t_so_far)
            for name, param in self.critic.named_parameters():
                if 'weight' in name:
                    self.writer.add_histogram("critic" + name, param.detach().numpy(), self.t_so_far)
            self.t_so_far += 1

    def save_model(self):
        """Serialize all four networks' weights to disk."""
        print("---Saved modelweights to disk---")
        # Save the weights
        torch.save(self.actor.state_dict(), str(self.type) + "_DDPGactor.pth")
        torch.save(self.critic.state_dict(), str(self.type) + "_DDPGcritic.pth")
        torch.save(self.target_actor.state_dict(), str(self.type) + "_target_actor.pth")
        torch.save(self.target_critic.state_dict(), str(self.type) + "_target_critic.pth")
optimizer.zero_grad() loss.backward() optimizer.step() # Calculate 'running' training accuracy features = data.reshape(data.shape[0], -1) img_grid = torchvision.utils.make_grid(data) _, predictions = scores.max(1) num_correct = (predictions == targets).sum() running_train_acc = float(num_correct) / float(data.shape[0]) accuracies.append(running_train_acc) # Plot things to tensorboard class_labels = [classes[label] for label in predictions] writer.add_image("mnist_images", img_grid) writer.add_histogram("fc1", model.fc1.weight) writer.add_scalar("Training loss", loss, global_step=step) writer.add_scalar("Training Accuracy", running_train_acc, global_step=step) if batch_idx == 230: writer.add_embedding( features, metadata=class_labels, label_img=data, global_step=batch_idx, ) step += 1 writer.add_hparams(
def train(appliance_name, model, mains, appliance, epochs, batch_size, pretrain, checkpoint_interval=None, train_patience=3):
    """Train a NILM model with early stopping, best-model saving and
    TensorBoard logging.

    Args:
        appliance_name: prefix used to name the saved state-dict / checkpoint files.
        model: network to train (moved to GPU when USE_CUDA is set).
        mains, appliance: numpy arrays of aggregate / appliance readings.
        epochs: maximum number of epochs.
        batch_size: mini-batch size.
        pretrain: when False, (re)initialise the model weights via `initialize`.
        checkpoint_interval: save a full checkpoint every N epochs (None = never).
        train_patience: stop after this many epochs without validation improvement.
    """
    # Model configuration
    if USE_CUDA:
        model = model.cuda()
    if not pretrain:
        model.apply(initialize)
    # summary(model, (1, mains.shape[1]))  # Wrong with torchsummary API

    # Split the train and validation set
    train_mains, valid_mains, train_appliance, valid_appliance = train_test_split(
        mains, appliance, test_size=.2, random_state=random_seed)

    # Create optimizer, loss function, and dataloaders.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.MSELoss(reduction='mean')

    # Inputs are permuted to (batch, channels, sequence) for the network.
    train_dataset = TensorDataset(
        torch.from_numpy(train_mains).float().permute(0, 2, 1),
        torch.from_numpy(train_appliance).float())
    train_loader = tud.DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=0,
                                  drop_last=True)
    valid_dataset = TensorDataset(
        torch.from_numpy(valid_mains).float().permute(0, 2, 1),
        torch.from_numpy(valid_appliance).float())
    valid_loader = tud.DataLoader(valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=0,
                                  drop_last=True)

    writer = SummaryWriter(comment='train_visual')
    try:
        patience, best_loss = 0, None
        for epoch in range(epochs):
            # Earlystopping: >= guards against patience ever jumping past the
            # threshold (the original used ==).
            if patience >= train_patience:
                print(
                    "val_loss did not improve after {} Epochs, thus Earlystopping is calling"
                    .format(train_patience))
                break

            # Train the model
            model.train()
            st = time.time()
            for i, (batch_mains, batch_appliance) in enumerate(train_loader):
                if USE_CUDA:
                    batch_mains = batch_mains.cuda()
                    batch_appliance = batch_appliance.cuda()
                batch_pred = model(batch_mains)
                loss = loss_fn(batch_appliance, batch_pred)

                # FIX: zero grads through the optimizer (idiomatic and
                # equivalent here, since the optimizer owns all model params).
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            ed = time.time()

            # Evaluate the model
            model.eval()
            with torch.no_grad():
                cnt, loss_sum = 0, 0
                for i, (batch_mains, batch_appliance) in enumerate(valid_loader):
                    if USE_CUDA:
                        batch_mains = batch_mains.cuda()
                        batch_appliance = batch_appliance.cuda()
                    batch_pred = model(batch_mains)
                    loss = loss_fn(batch_appliance, batch_pred)
                    # FIX: accumulate a python float instead of a (GPU) tensor.
                    loss_sum += loss.item()
                    cnt += 1

            final_loss = loss_sum / cnt
            # Save best only
            if best_loss is None or final_loss < best_loss:
                best_loss = final_loss
                patience = 0
                net_state_dict = model.state_dict()
                path_state_dict = "./" + appliance_name + "_bilstm_best_state_dict.pt"
                torch.save(net_state_dict, path_state_dict)
            else:
                patience = patience + 1

            print("Epoch: {}, Valid_Loss: {}, Time consumption: {}s.".format(
                epoch, final_loss, ed - st))

            # For the visualization of training process
            for name, param in model.named_parameters():
                # FIX: frozen / unused parameters have grad None; skip them
                # instead of crashing add_histogram.
                if param.grad is not None:
                    writer.add_histogram(name + '_grad', param.grad, epoch)
                writer.add_histogram(name + '_data', param, epoch)
            writer.add_scalars("MSELoss", {"Valid": final_loss}, epoch)

            # Save checkpoint
            if (checkpoint_interval is not None) and ((epoch + 1) % checkpoint_interval == 0):
                checkpoint = {
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "epoch": epoch
                }
                path_checkpoint = "./" + appliance_name + "_bilstm_checkpoint_{}_epoch.pkl".format(
                    epoch)
                torch.save(checkpoint, path_checkpoint)
    finally:
        # FIX: the writer was never closed, so pending events could be lost.
        writer.close()
class RunManager:
    """ RunManager class, keeping track of overall training progress. """

    def __init__(self):
        self.epoch = Epoch()
        self.run = Run()
        self.net = None
        self.images = None
        self.noisy_images = None
        self.tb = None
        # Best validation loss seen so far in the current run.
        self.min_val_loss = float('inf')

    def begin_run(self, hparams, net, test_images, test_noisy_images):
        """Begin next run with new hyperparameters: set up the network, the
        fixed test images and a fresh SummaryWriter."""
        self.run.begin(hparams)

        # Setup network, data and SummaryWriter
        self.net = net
        self.images = test_images
        self.noisy_images = test_noisy_images
        self.tb = SummaryWriter(comment=f'-{hparams}')

        # Add test images and graph to TensorBoard
        grid = make_grid(to_img(self.images), nrow=10)
        noisy_grid = make_grid(to_img(self.noisy_images), nrow=10)
        self.tb.add_image('original images', grid)
        self.tb.add_image('noisy images', noisy_grid)
        self.tb.add_graph(self.net, to_img(self.images))
        self.save_img(grid, 'original_images.png')
        self.save_img(noisy_grid, 'noisy_images.png')

    def end_run(self):
        """Flush and close the writer and reset per-run state."""
        self.tb.flush()
        self.tb.close()
        self.net = None
        self.images = None
        self.noisy_images = None
        self.tb = None
        self.min_val_loss = float('inf')
        self.run.end(self.epoch)

    def begin_epoch(self):
        assert self.run.active, "Run is not active, cannot initialise epoch"
        self.epoch.begin()

    def end_epoch(self):
        """Close the epoch: log losses, reconstructions and parameter
        histograms; checkpoint the model when validation loss improves."""
        run_duration = self.run.duration()
        epoch_duration, train_loss, val_loss = self.epoch.end()

        self.tb.add_scalar('Training loss', train_loss, self.epoch.count)
        self.tb.add_scalar('Validation loss', val_loss, self.epoch.count)

        # Log reconstructions of the fixed noisy test batch.
        with torch.no_grad():
            preds = self.net(self.noisy_images)
        pred_imgs = to_img(preds)
        grid = make_grid(pred_imgs, nrow=10)
        self.tb.add_image('reconstructed images', grid, self.epoch.count)

        for name, param in self.net.named_parameters():
            self.tb.add_histogram(name, param, self.epoch.count)
            # FIX: parameters that never received a gradient have grad None;
            # guard instead of crashing add_histogram.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch.count)

        # Checkpoint on validation improvement.
        if val_loss < self.min_val_loss:
            torch.save(self.net, './models/best_' + str(self.run.hparams) + '.pth')
            self.min_val_loss = val_loss

        self.save_img(grid, 'epoch{0}.png'.format(self.epoch.count))

        results = OrderedDict()
        results['run'] = self.run.count
        results['epoch'] = self.epoch.count
        results['train loss'] = train_loss
        results['validation loss'] = val_loss
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        for k, v in self.run.hparams._asdict().items():
            results[k] = v
        self.run.append_and_display_data(results)

    def track_loss(self, loss, batch_size, mode='train'):
        self.epoch.add_loss(loss, batch_size, mode)

    def save(self, filename):
        self.run.save(filename)

    # Save image to local directory
    def save_img(self, grid, filename):
        # FIX: np.transpose(tensor, ...) returns an ndarray, so the original
        # `.numpy()` call on its result raised AttributeError. Convert the
        # tensor to numpy first.
        os.makedirs('./gif', exist_ok=True)
        plt.figure(figsize=(15, 15))
        plt.imsave('./gif/' + filename,
                   np.transpose(grid.detach().cpu().numpy(), (1, 2, 0)))
class Runner(object):
    """A2C training runner: drives rollouts through the environment and
    updates the network, with optional TensorBoard logging."""

    def __init__(self, net, env, num_envs, n_stack, rollout_size=5, num_updates=2500000, max_grad_norm=0.5,
                 value_coeff=0.5, entropy_coeff=0.02, tensorboard_log=False, log_path="./log", is_cuda=True, seed=42):
        super().__init__()

        # constants
        self.num_envs = num_envs
        self.rollout_size = rollout_size
        self.num_updates = num_updates
        self.n_stack = n_stack
        self.seed = seed
        self.max_grad_norm = max_grad_norm

        # loss scaling coefficients
        self.is_cuda = torch.cuda.is_available() and is_cuda

        # objects
        """Tensorboard logger"""
        self.writer = SummaryWriter(comment="statistics", log_dir=log_path) if tensorboard_log else None

        """Environment"""
        self.env = env

        self.storage = RolloutStorage(self.rollout_size,
                                      self.num_envs,
                                      self.env.observation_space.shape[0:-1],
                                      self.n_stack,
                                      is_cuda=self.is_cuda,
                                      value_coeff=value_coeff,
                                      entropy_coeff=entropy_coeff,
                                      writer=self.writer)

        """Network"""
        self.net = net
        self.net.a2c.writer = self.writer

        if self.is_cuda:
            self.net = self.net.cuda()

        # self.writer.add_graph(self.net, input_to_model=(self.storage.states[0],)) --> not working for LSTMCEll

    def train(self):
        """Run `num_updates` A2C updates, saving the best-loss model and a
        periodic checkpoint."""

        """Environment reset"""
        obs = self.env.reset()
        self.storage.states[0].copy_(self.storage.obs2tensor(obs))
        best_loss = np.inf

        for num_update in range(self.num_updates):
            final_value, entropy = self.episode_rollout()

            self.net.optimizer.zero_grad()

            """Assemble loss"""
            loss = self.storage.a2c_loss(final_value, entropy)
            loss.backward(retain_graph=False)

            # gradient clipping
            nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)

            if self.writer is not None:
                # FIX: pass the update index as global_step; without it every
                # point was written at the same step.
                self.writer.add_scalar("loss", loss.item(), num_update)

            self.net.optimizer.step()

            # it stores a lot of data which let's the graph
            # grow out of memory, so it is crucial to reset
            self.storage.after_update()

            if loss < best_loss:
                best_loss = loss.item()
                print("model saved with best loss: ", best_loss, " at update #", num_update)
                torch.save(self.net.state_dict(), "a2c_best_loss")
            elif num_update % 10 == 0:
                print("current loss: ", loss.item(), " at update #", num_update)
                self.storage.print_reward_stats()

            # FIX: this periodic save was the last branch of the elif chain, but
            # every multiple of 100 is also a multiple of 10, so it was
            # unreachable and the time-log checkpoint was never written.
            if num_update % 100 == 0:
                torch.save(self.net.state_dict(), "a2c_time_log_no_norm")

            if self.writer is not None and len(self.storage.episode_rewards) > 1:
                self.writer.add_histogram("episode_rewards", torch.tensor(self.storage.episode_rewards))

        self.env.close()

    def episode_rollout(self):
        """Collect one rollout of `rollout_size` steps.

        Returns:
            (final_value, episode_entropy): the critic's bootstrap value for
            the state after the rollout, and the summed policy entropy.
        """
        episode_entropy = 0
        for step in range(self.rollout_size):
            """Interact with the environments """
            # call A2C
            a_t, log_p_a_t, entropy, value, a2c_features = self.net.a2c.get_action(self.storage.get_state(step))
            # accumulate episode entropy
            episode_entropy += entropy

            # interact
            obs, rewards, dones, infos = self.env.step(a_t.cpu().numpy())

            # save episode reward
            self.storage.log_episode_rewards(infos)

            self.storage.insert(step, rewards, obs, a_t, log_p_a_t, value, dones)
            self.net.a2c.reset_recurrent_buffers(reset_indices=dones)

        # Note:
        # get the estimate of the final reward
        # that's why we have the CRITIC --> estimate final reward
        # detach, as the final value will only be used as a constant target
        with torch.no_grad():
            _, _, _, final_value, final_features = self.net.a2c.get_action(self.storage.get_state(step + 1))

        return final_value, episode_entropy
collector_reconstruction_loss.mean(), iteration, ) writer.add_scalar("imq_mmd_average_20_obs", collector_imq_mmd.mean(), iteration) writer.add_scalar("codes_min_over_20_obs", collector_codes_min.min(), iteration) writer.add_scalar("codes_max_over_20_obs", collector_codes_max.max(), iteration) if iteration % (knobs["time_to_collect"] * 4) == 0: it_encoder_parameters = encoder.parameters() for k, v in encoder.state_dict().items(): if k.find("bias") != -1 or k.find("weight") != -1: writer.add_histogram("encoder/" + k.replace(".", "/"), v, iteration) writer.add_histogram( "encoder/" + k.replace(".", "/") + "/grad", next(it_encoder_parameters).grad, iteration, ) it_decoder_parameters = decoder.parameters() for k, v in decoder.state_dict().items(): if k.find("bias") != -1 or k.find("weight") != -1: writer.add_histogram("decoder/" + k.replace(".", "/"), v, iteration) writer.add_histogram( "decoder/" + k.replace(".", "/") + "/grad", next(it_decoder_parameters).grad, iteration, )
comment = f' batch_size={batch_size} lr={lr}' tb = SummaryWriter(comment=comment) tb.add_image('images', grid) tb.add_graph(network, images) for epoch in range(1): total_loss = 0 total_correct = 0 for batch in train_loader: images, labels = batch # Get Batch preds = network(images) # Pass Batch loss = F.cross_entropy(preds, labels) # Calculate Loss optimizer.zero_grad() # Zero Gradients loss.backward() # Calculate Gradients optimizer.step() # Update Weights total_loss += loss.item() * batch_size total_correct += get_num_correct(preds, labels) tb.add_scalar('Loss', total_loss, epoch) tb.add_scalar('Number Correct', total_correct, epoch) tb.add_scalar('Accuracy', total_correct / len(train_set), epoch) for name, param in network.named_parameters(): tb.add_histogram(name, param, epoch) tb.add_histogram(f'{name}.grad', param.grad, epoch) print("epoch", epoch, "total_correct:", total_correct, "loss:", total_loss) tb.close()
class MRTR():
    """Mask-refine / inpainting trainer: wraps a MaskInpaintModel with
    training, evaluation, testing and sampling loops plus TensorBoard and
    file logging. Behaviour is driven entirely by the `config` object."""

    def __init__(self, config):
        self.config = config
        self.iteration = 0
        self.debug = False
        self.maskpreinpaint_model = MaskInpaintModel(config).to(config.DEVICE)

        # Metrics used during eval: PSNR on [0,255] images, SSIM, MSE, and
        # precision/recall on predicted masks (EdgeAccuracy reused for masks).
        self.psnr = PSNR(255.0).to(config.DEVICE)
        self.ssim = SSIM(5, reduction='mean')
        self.mse = torch.nn.MSELoss()
        self.maskacc = EdgeAccuracy(config.EDGE_THRESHOLD).to(config.DEVICE)

        # test mode
        if self.config.MODE == 2 or self.config.MODE == 4:
            self.test_dataset = Dataset(config, config.TEST_DATA, augment=False, training=False)
        else:
            # Create tfboard summary writer
            self.val_info = None
            self.is_best = True
            self.writer = SummaryWriter(self.config.LOG_DIR)
            self.train_dataset = Dataset(config, config.TRAIN_DATA, augment=True, training=True)
            self.val_dataset = Dataset(config, config.VAL_DATA, augment=False, training=True)
            self.sample_iterator = self.val_dataset.create_iterator(config.SAMPLE_SIZE)
            self.samples_path = os.path.join(config.MODEL_DIR, 'samples')

        self.results_path = config.TEST_DIR
        if config.RESULTS is not None:
            self.results_path = os.path.join(config.RESULTS)
        if config.DEBUG is not None and config.DEBUG != 0:
            self.debug = True
        self.log_file = os.path.join(config.PATH, 'log.dat')

    def load(self):
        """Load model weights from the checkpoint directory."""
        self.maskpreinpaint_model.load()

    def save(self):
        """Persist current model weights."""
        self.maskpreinpaint_model.save()

    def train(self):
        """Main training loop: runs until MAX_ITERS, logging scalars,
        histograms and image grids at configured intervals."""
        train_loader = DataLoader(dataset=self.train_dataset,
                                  batch_size=self.config.BATCH_SIZE,
                                  num_workers=4,
                                  drop_last=True,
                                  pin_memory=True,
                                  shuffle=True)

        epoch = 0
        keep_training = True
        max_iteration = int(float(self.config.MAX_ITERS))

        if len(self.train_dataset) == 0:
            print(
                'No training data was provided! Check \'TRAIN_DATA\' value in the configuration file.'
            )
            return

        iteration = 0
        while keep_training:
            epoch += 1
            progbar = Progbar(stateful_metrics=['step'])

            for items in train_loader:
                self.maskpreinpaint_model.train()

                images, images_gt, masks, masks_gt, masks_refine_gt = self.get_inputs(
                    items)

                # train
                # Probability of using the *generated* mask grows with the
                # iteration count, capped at MASK_SWITCH_RATIO.
                prob = np.minimum(
                    self.config.MASK_SWITCH_RATIO,
                    np.ceil(iteration / self.config.MASK_SWITCH_STEP) / 10)
                use_gt_mask = False if np.random.binomial(1, prob) else True
                images_gen, pre_images_gen, masks_gen, gen_loss, dis_loss, logs = \
                    self.maskpreinpaint_model.process(images, images_gt, masks, masks_gt, masks_refine_gt,
                                                      use_gt_mask=use_gt_mask)
                masks_cmp = masks_gt if use_gt_mask else masks_gen * masks
                # Composite generated pixels inside the mask with the original
                # pixels outside it.
                images_cmp = self.get_complete_preinpaint(
                    masks_cmp, images, images_gen)
                pre_images_cmp = self.get_complete_preinpaint(
                    masks_cmp, images, pre_images_gen)

                # backward
                self.maskpreinpaint_model.backward(gen_loss, dis_loss)
                iteration = self.maskpreinpaint_model.iteration

                # Tensorboard record: scala
                if iteration % self.config.SAVE_SCALR_AT_STEP == 0:
                    self._write_logs(logs, iteration)

                # Tensorboard record: weight/gradient histograms
                if iteration % self.config.SAVE_HIST_AT_STEP == 0:
                    for name, value in self.maskpreinpaint_model.named_parameters(
                    ):
                        self.writer.add_histogram(
                            'MaskPreinpaint_weight/' + name, value, iteration)
                        if value.grad is not None:
                            self.writer.add_histogram(
                                'MaskPreinpaint_grad/' + name,
                                value.grad.data, iteration)

                # Tensorboard record: image
                if iteration % self.config.SAVE_IMAGE_AT_STEP == 0:
                    image = self.get_tensorboard_image([
                        images, images_gt,
                        self.gray2rgb(masks),
                        self.gray2rgb(masks_refine_gt),
                        self.gray2rgb(masks_gen),
                        self.gray2rgb(masks_cmp), pre_images_gen,
                        pre_images_cmp, images_gen, images_cmp
                    ])
                    self.writer.add_image('Train/', image, iteration)

                if iteration % self.config.PRINT_AT_STEP == 0:
                    logs = [
                        ("step", str(epoch) + "/" + str(iteration)),
                    ] + logs
                    progbar.print_cur(self.config.PRINT_AT_STEP, values=logs)

                self.iteration = iteration
                # Periodic log / sample / eval / save hooks.
                self._run_steps_after_train(logs)
                if iteration >= max_iteration:
                    keep_training = False
                    break

            # One LR-scheduler step per epoch for both generator and
            # discriminator.
            self.maskpreinpaint_model.gen_scheduler.step()
            self.maskpreinpaint_model.dis_scheduler.step()
        self.writer.close()
        print('\nEnd training....')

    def eval(self):
        """Evaluate on the validation set (MODE 1/3) or the test set,
        accumulating PSNR/MAE/SSIM/MSE and mask precision/recall for up to
        N_EVAL batches."""
        if self.config.MODE == 1 or self.config.MODE == 3:
            val_loader = DataLoader(dataset=self.val_dataset,
                                    batch_size=self.config.BATCH_SIZE,
                                    drop_last=True,
                                    shuffle=True)
            total = len(self.val_dataset)
        else:
            val_loader = DataLoader(dataset=self.test_dataset,
                                    batch_size=self.config.BATCH_SIZE,
                                    drop_last=False,
                                    shuffle=False)
            total = len(self.test_dataset)
            # NOTE(review): hard-coded batch cap for this mode — confirm
            # whether this override is still wanted.
            self.config.N_EVAL = 7

        self.maskpreinpaint_model.eval()

        logs = []
        i_logs = []
        progbar = Progbar(total, stateful_metrics=['it'])
        with torch.no_grad():
            for _iteration, items in enumerate(val_loader):
                images, images_gt, masks, masks_gt, masks_refine_gt = self.get_inputs(
                    items)

                # edge model
                images_gen, pre_images_gen, masks_gen, gen_loss, dis_loss, logs = \
                    self.maskpreinpaint_model.process(images, images_gt, masks, masks_gt, masks_refine_gt,
                                                      use_gt_mask=self.config.EVAL_USE_GT_MASK)
                masks_cmp = masks_gen * masks
                images_cmp = self.get_complete_preinpaint(
                    masks_cmp, images, images_gen)
                pre_images_cmp = self.get_complete_preinpaint(
                    masks_cmp, images, pre_images_gen)
                #mask_blur = mask.filter(ImageFilter.GaussianBlur(10))
                #im = Image.composite(im1, im2, mask_blur)

                # metrics for the final inpainting output (raw and composited)
                psnr = self.psnr(self.postprocess(images_gt),
                                 self.postprocess(images_gen))
                mae = (torch.sum(torch.abs(images_gt - images_gen)) /
                       torch.sum(images_gt)).float()
                psnr_cmp = self.psnr(self.postprocess(images_gt),
                                     self.postprocess(images_cmp))
                mae_cmp = (torch.sum(torch.abs(images_gt - images_cmp)) /
                           torch.sum(images_gt)).float()
                logs.append(('psnr', psnr.item()))
                logs.append(('mae', mae.item()))
                logs.append(('psnr_cmp', psnr_cmp.item()))
                logs.append(('mae_cmp', mae_cmp.item()))

                # metrics for the pre-inpainting (coarse) output
                psnr = self.psnr(self.postprocess(images_gt),
                                 self.postprocess(pre_images_gen))
                mae = (torch.sum(torch.abs(images_gt - pre_images_gen)) /
                       torch.sum(images_gt)).float()
                psnr_cmp = self.psnr(self.postprocess(images_gt),
                                     self.postprocess(pre_images_cmp))
                mae_cmp = (torch.sum(torch.abs(images_gt - pre_images_cmp)) /
                           torch.sum(images_gt)).float()
                logs.append(('pre_psnr', psnr.item()))
                logs.append(('pre_mae', mae.item()))
                logs.append(('pre_psnr_cmp', psnr_cmp.item()))
                logs.append(('pre_mae_cmp', mae_cmp.item()))

                # SSIM is logged as a dissimilarity percentage: (1 - ssim)*100.
                ssim = self.ssim(images_gt, images_gen)
                ssim_cmp = self.ssim(images_gt, images_cmp)
                mse = self.mse(images_gt, images_gen)
                mse_cmp = self.mse(images_gt, images_cmp)
                logs.append(('ssim', (1 - ssim.item()) * 100))
                logs.append(('ssim_cmp', (1 - ssim_cmp.item()) * 100))
                logs.append(('mse', mse.item()))
                logs.append(('mse_cmp', mse_cmp.item()))

                ssim = self.ssim(images_gt, pre_images_gen)
                ssim_cmp = self.ssim(images_gt, pre_images_cmp)
                mse = self.mse(images_gt, pre_images_gen)
                mse_cmp = self.mse(images_gt, pre_images_cmp)
                logs.append(('pre_ssim', (1 - ssim.item()) * 100))
                logs.append(('pre_ssim_cmp', (1 - ssim_cmp.item()) * 100))
                logs.append(('pre_mse', mse.item()))
                logs.append(('pre_mse_cmp', mse_cmp.item()))

                # Hack: name of edgeacc
                mask_precision, mask_recall = self.maskacc(
                    masks_refine_gt * masks, masks_cmp)
                logs.append(('M_P', mask_precision.item()))
                logs.append(('M_R', mask_recall.item()))
                logs = logs + i_logs

                # Hide loss ('l_') and diff ('d_') entries unless VERBOSE.
                progbar.add(len(images),
                            values=logs if self.config.VERBOSE else [
                                x for x in logs
                                if not x[0].startswith('l_')
                                and not x[0].startswith('d_')
                            ])
                # print(_iteration)
                if _iteration >= self.config.N_EVAL - 1:
                    break

        # Print the average values
        progbar.print_info()
        images = self.get_tensorboard_image([
            images, images_gt,
            self.gray2rgb(masks),
            self.gray2rgb(masks_refine_gt),
            self.gray2rgb(masks_gen),
            self.gray2rgb(masks_cmp), pre_images_gen, pre_images_cmp,
            images_gen, images_cmp
        ])
        if self.config.MODE == 1 or self.config.MODE == 3:
            # Writing to tfboard summary
            _val_info = {}
            # TODO: ensure following code is correct
            for item, value in progbar.get_average_log_values().items():
                if not item.startswith('l_'):
                    self.writer.add_scalar('Validation/' + item, value,
                                           self.iteration)
                    _val_info[item] = value
            # if self.val_info is None:
            #     self.val_info = _val_info
            #     self.is_best = True
            # else:
            #     if self.config.MODEL !=1 :
            #         if _val_info['psnr_cmp'] > self.val_info['psnr_cmp']:
            #             self.val_info = _val_info
            #             self.is_best = True
            #     elif self.config.MODEL == 1 :
            #         # Hack: only looked at mask recall, this might be ugly
            #         if _val_info['M_R'] > self.val_info['M_R']:
            #             self.val_info = _val_info
            #             self.is_best = True
            #     else:
            #         raise
            # get_tensorboard_image(self, img_list)
            # images = vutils.make_grid(images[0], normalize=True, scale_each=True)
            self.writer.add_image('Validation/', images, self.iteration)

    def test(self):
        """Run inference over the test set and write raw and composited
        results (plus debug intermediates) to `results_path`."""
        self.maskpreinpaint_model.eval()

        create_dir(self.results_path)
        test_loader = DataLoader(
            dataset=self.test_dataset,
            batch_size=1,
        )
        ### !!! FIX TEST
        index = 0
        for items in test_loader:
            name = self.test_dataset.load_name(index)
            images, images_gt, masks, masks_gt, masks_refine_gt = self.get_inputs(
                items)
            index += 1

            output_images, output_pre_images, output_masks = self.maskpreinpaint_model(
                images, masks)
            output_masks_cmp = output_masks
            output_images_cmp = self.get_complete_preinpaint(
                output_masks_cmp, images, output_images)
            output_pre_images_cmp = self.get_complete_preinpaint(
                output_masks_cmp, images, output_pre_images)

            outputs = self.postprocess(output_images)[0]
            outputs_cmp = self.postprocess(output_images_cmp)[0]
            path = os.path.join(self.results_path, name)
            tsplit = name.split('.')
            path_cmp = os.path.join(self.results_path,
                                    '%s_cmp.%s' % (tsplit[0], tsplit[1]))
            print(index, name)

            imsave(outputs, path)
            imsave(outputs_cmp, path_cmp)

            if self.debug:
                # Also dump the raw input, input mask and predicted mask.
                input_mask = self.postprocess(masks)[0]
                output_mask = self.postprocess(output_masks)[0]
                images = self.postprocess(images)[0]
                fname, fext = name.split('.')
                imsave(
                    images,
                    os.path.join(self.results_path,
                                 fname + '_input.' + fext))
                imsave(
                    input_mask,
                    os.path.join(self.results_path,
                                 fname + '_input_mask.' + fext))
                imsave(
                    output_mask,
                    os.path.join(self.results_path,
                                 fname + '_output_mask.' + fext))

        print('\nEnd test....')

    def sample(self, it=None):
        """Save a stitched grid of sample inputs/outputs from the validation
        iterator; `it` overrides the iteration used in the file name."""
        # do not sample when validation set is empty
        if len(self.val_dataset) == 0:
            return

        self.maskpreinpaint_model.eval()

        # NOTE(review): `model` is assigned but unused below.
        model = self.config.MODEL
        items = next(self.sample_iterator)
        images, images_gt, masks, masks_gt, masks_refine_gt = self.get_inputs(
            items)

        image_per_row = 1
        if self.config.SAMPLE_SIZE <= 6:
            image_per_row = 1

        # edge model
        iteration = self.maskpreinpaint_model.iteration
        output_images, output_pre_images, output_masks = self.maskpreinpaint_model(
            images, masks)
        output_masks_cmp = output_masks * masks
        output_images_cmp = self.get_complete_preinpaint(
            output_masks_cmp, images, output_images)
        output_pre_images_cmp = self.get_complete_preinpaint(
            output_masks_cmp, images, output_pre_images)

        images = stitch_images(self.postprocess(images),
                               self.postprocess(masks),
                               self.postprocess(masks_refine_gt),
                               self.postprocess(output_masks),
                               self.postprocess(output_masks_cmp),
                               self.postprocess(output_pre_images),
                               self.postprocess(output_pre_images_cmp),
                               self.postprocess(output_images),
                               self.postprocess(output_images_cmp),
                               img_per_row=image_per_row)

        if it is not None:
            iteration = it

        path = os.path.join(self.samples_path)
        name = os.path.join(path, str(iteration).zfill(5) + ".png")
        create_dir(path)
        print('\nsaving sample ' + name)
        images.save(name)

    def log(self, logs):
        """Append the values of `logs` (list of (name, value)) to log_file."""
        with open(self.log_file, 'a') as f:
            f.write('%s\n' % ' '.join([str(item[1]) for item in logs]))

    def cuda(self, *args):
        """Move each argument to the configured device; returns a generator."""
        return (item.to(self.config.DEVICE) for item in args)

    def postprocess(self, img):
        # [0, 1] => [0, 255], channels-last, integer dtype
        img = img * 255.0
        img = img.permute(0, 2, 3, 1)
        return img.int()

    def gray2rgb(self, img):
        # Replicate a single-channel image to 3 channels.
        return torch.cat([img] * 3, dim=1)

    def _run_steps_after_train(self, logs):
        """Periodic hooks run after every training step: logging, sampling,
        evaluation and checkpointing at their configured intervals.

        Args:
            logs: list of (name, value) pairs for the current step.

        Returns:
            None.
        """
        # log model at checkpoints
        if self.config.LOG_INTERVAL and self.iteration % self.config.LOG_INTERVAL == 0:
            self.log(logs)

        # sample model at checkpoints
        if self.config.SAMPLE_INTERVAL and self.iteration % self.config.SAMPLE_INTERVAL == 0:
            self.sample()

        is_finish_eval = False
        # evaluate model at checkpoints
        if self.config.EVAL_INTERVAL and self.iteration % self.config.EVAL_INTERVAL == 0:
            print('...Eval....\n')
            self.eval()
            print('...\n')
            is_finish_eval = True

        # # save model at checkpoints
        # if self.config.SAVE_INTERVAL and self.iteration % self.config.SAVE_INTERVAL == 0:
        #     if is_finish_eval:
        #         if self.is_best:
        #             print('...Saving model....')
        #             self.save()
        #     else:
        #         print('...Eval....\n')
        #         self.eval()
        #         print('...\n')
        #         if self.is_best:
        #             print('...Saving model....')
        #             self.save()

        # save model at checkpoints (eval first if it has not just run)
        if self.config.SAVE_INTERVAL and self.iteration % self.config.SAVE_INTERVAL == 0:
            if is_finish_eval:
                print('...Saving model....')
                self.save()
            else:
                print('...Eval....\n')
                self.eval()
                print('...\n')
                print('...Saving model....')
                self.save()

    def get_inputs(self, items):
        """Move a dataset batch to the device and unpack it.

        NOTE(review): the same name is unpacked twice below, so the 4th
        element of `items` is discarded and callers receive
        masks_gt == masks_refine_gt — confirm this is intentional.
        """
        # if self.config.WITH_EDGE:
        images, images_gt, masks, masks_refine_gt, masks_refine_gt = self.cuda(
            *items)
        return images, images_gt, masks, masks_refine_gt, masks_refine_gt

    def get_tensorboard_image(self, img_list):
        """Stack the first sample of each tensor in img_list into a single
        grid image (assumes 256x256 inputs)."""
        col = 5
        images = torch.cat(img_list, dim=1)
        images = images[0]
        images = images.view((len(img_list), -1, 256, 256))
        image = vutils.make_grid(images,
                                 nrow=col,
                                 normalize=False,
                                 scale_each=True)
        # import matplotlib.pyplot as plt
        # npgrid = image.cpu().detach().numpy()
        # plt.imshow(np.transpose(npgrid, (1, 2, 0)), interpolation='nearest')
        # plt.savefig('out.png')
        return image

    def _write_logs(self, logs, iteration):
        """Route each (name, value) log entry to its TensorBoard section by
        prefix: 'l_' = losses, 'd_' = diffs, anything else = misc."""
        for item, value in logs:
            if item.startswith("l_"):
                self.writer.add_scalar('Train/loss/' + item, value, iteration)
            elif item.startswith("d_"):
                self.writer.add_scalar('Train/diff/' + item, value, iteration)
            else:
                self.writer.add_scalar('Train/' + item, value, iteration)

    def get_auxiliary_with_groundtruth(self, masks, masks_refine_gt, images,
                                       images_gt):
        # !!! edge, mask order should be edge, mask. Cant be switched
        auxiliary = torch.cat([images, masks], dim=1)
        auxiliary_gt = torch.cat([images_gt, masks_refine_gt], dim=1)
        return auxiliary, auxiliary_gt

    def get_auxiliary(self, masks, images):
        # Channel-concatenate image and mask into one auxiliary input.
        auxiliary = torch.cat([images, masks], dim=1)
        return auxiliary

    def get_complete_preinpaint(self, mask, input, input_gen):
        # Composite: generated pixels inside the mask, originals outside.
        output_cmp = (input_gen * mask) + (input * (1 - mask))
        return output_cmp
writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], total_numsteps) writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], total_numsteps) writer.add_scalar("Scores/total", env.cost()['total'], total_numsteps) # Append the total score/reward to the list score_list.append(env.cost()['total']) reward_list.append(episode_reward) # Log how much storage is utilised by calculating abs sum of actions (CHECK IF WORKS WITH MULTIPLE BUILDINGS!!!) episode_actions = np.array(agent.action_tracker[-8759:]) cooling = sum(abs(episode_actions[:,0])) writer.add_scalar("Action/Cooling", cooling, total_numsteps) if agent.act_size[0] == 2: dhw = sum(abs(episode_actions[:,1])) writer.add_scalar("Action/DHW", dhw, total_numsteps) writer.add_histogram("Action/Tracker", np.array(agent.action_tracker), total_numsteps) print("Episode: {}, total numsteps: {}, total cost: {}, reward: {}".format(i_episode, total_numsteps, round(env.cost()['total'],5), round(episode_reward, 2))) # Save trained Actor and Critic network periodically as a checkpoint if it's the best model achieved if i_episode % args.checkpoint_interval == 0: if env.cost()['total'] < best_reward: best_reward = env.cost()['total'] print("Saving new best model to {}".format(parent_dir)) agent.save_model(parent_dir) # If training episodes completed if i_episode > args.num_episodes - 1: break env.close()
from day02.net import * from torch.utils.tensorboard import SummaryWriter import cv2 net = NetV2() net.load_state_dict(torch.load("./checkpoint/2.t")) summaryWriter = SummaryWriter("./logs") layer1_weight = net.sequential[0].weight layer2_weight = net.sequential[4].weight layer3_weight = net.sequential[8].weight summaryWriter.add_histogram("layer1_weight", layer1_weight) summaryWriter.add_histogram("layer2_weight", layer2_weight) summaryWriter.add_histogram("layer3_weight", layer3_weight) cv2.waitKey(0)
loss.backward() optimizer.step() # calculate 'running' training accuracy _, predictions = scores.max(1) num_correct = (predictions == targets).sum() running_train_acc = float(num_correct) / float(data.shape[0]) accuracies.append(running_train_acc) features = data.reshape(data.shape[0], -1) class_labels = [classes[label] for label in predictions] # visualizing data and weights of fc1 for each batch img_grid = torchvision.utils.make_grid(data) writer.add_image('mnist_images', img_grid) writer.add_histogram('fc1', model.fc1.weight) # data shape is [batch_size, 1, 28, 28] # plot things to tensorboard writer.add_scalar('Training Loss', loss, global_step=step) writer.add_scalar('Training Accuracy', running_train_acc, global_step=step) # PCA for images if batch_idx == 100: writer.add_embedding(features, metadata=class_labels, label_img=data, global_step=batch_idx)
def _gradient_mean(network):
    """Mean of every gradient component currently stored on `network`.

    Walks each submodule's parameters, flattens the available gradients into a
    single list and averages them; parameters without a gradient (frozen or
    not yet used) are skipped.
    """
    flat_grads = []
    for module in network._modules.values():
        for params in module._parameters.values():
            if params.grad is not None:
                flat_grads = flat_grads + list(torch.flatten(params.grad))
    return torch.mean(torch.tensor(flat_grads))


def train(train_x, train_y, validate_x, validate_y, num_epoch,
          encoder_optimizer, decoder_optimizer, learning_rate, num_runs,
          save_to_file=False, write_summary=False):
    """Train the module-level encoder/decoder pair on (train_x, train_y).

    Relies on module-level names: encoder, decoder, utils, x_word2idx_dict,
    y_word2idx_dict, y_idx2word_list.  Every `num_runs` iterations the running
    loss, gradient means and weight histograms are reported (and logged to
    TensorBoard when write_summary is True).

    Fixes over the previous revision:
    - both optimizers are zeroed before each backward pass, so gradients no
      longer accumulate across iterations;
    - the decoder gradient statistic is now computed from `decoder` (the old
      loop was copy-pasted and read encoder gradients);
    - the local `iter` no longer shadows the builtin.
    """
    if write_summary:
        writer = SummaryWriter()
        writer.add_scalar('train/learning_rate', learning_rate)

    running_loss = 0
    for epoch in range(num_epoch):
        # shuffle our training set
        num_train_sentences = len(train_x)
        shuffled_train_indexes = random.sample(range(num_train_sentences),
                                               num_train_sentences)
        for train_idx in range(num_train_sentences):
            pair_idx = shuffled_train_indexes[train_idx]
            # convert both input and output into vocabulary indexes
            input_vector_x = utils.convert_vector_word2idx(
                train_x[pair_idx], x_word2idx_dict)
            target_vector_y = utils.convert_vector_word2idx(
                train_y[pair_idx], y_word2idx_dict)

            # [seq_len, 1], then prepend the batch dimension
            input_tensor_x = torch.tensor(input_vector_x).view(-1, 1)
            input_tensor_x = input_tensor_x.unsqueeze(0).cuda().type(
                torch.cuda.LongTensor)
            target_tensor_y = torch.tensor(target_vector_y).view(-1, 1)
            target_tensor_y = target_tensor_y.unsqueeze(0).cuda().type(
                torch.cuda.LongTensor)

            # Fix: reset gradient buffers before the new backward pass.
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            _, (last_hidden, _) = encoder(input_tensor_x)
            total_loss, output_words = decoder(last_hidden, y=target_tensor_y)

            # We backpropagate the *total* (un-normalised) loss; only the
            # reported running loss below is divided by the target length.
            total_loss.backward()

            running_loss += total_loss.item() / len(target_vector_y)
            step = epoch * num_train_sentences + train_idx + 1
            if step % num_runs == 0:
                avg_loss = running_loss / num_runs
                print(
                    str(step / (num_epoch * num_train_sentences) * 100) +
                    " % it: " + str(step) + " avg loss: " + str(avg_loss))
                running_loss = 0

                # mean gradient over all parameters, for both networks
                encoder_grad_mean = _gradient_mean(encoder)
                decoder_grad_mean = _gradient_mean(decoder)

                print("Input: " + str(train_x[pair_idx]) + " Gt: " +
                      str(train_y[pair_idx]) +
                      " Output word: " +
                      str(utils.convert_vector_idx2word(output_words,
                                                        y_idx2word_list)))

                if write_summary:
                    writer.add_scalar('train/encoder_gradient',
                                      encoder_grad_mean, step)
                    writer.add_scalar('train/decoder_gradient',
                                      decoder_grad_mean, step)
                    writer.add_scalar('train/loss', avg_loss, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_embedding_weights',
                        encoder._modules['embedding'].weight, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_lstm_weights_hh_l0',
                        encoder._modules['lstm'].weight_hh_l0, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_lstm_bias_hh_l0',
                        encoder._modules['lstm'].bias_hh_l0, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_lstm_weights_ih_l0',
                        encoder._modules['lstm'].weight_ih_l0, step)
                    writer.add_histogram(
                        'train_hist_encoder/encoder_lstm_bias_ih_l0',
                        encoder._modules['lstm'].bias_ih_l0, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_embedding_weights',
                        decoder._modules['embedding'].weight, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_out_fc_weights',
                        decoder._modules['out_fc'].weight, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_out_fc_bias',
                        decoder._modules['out_fc'].bias, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_lstm_weights_hh_l0',
                        decoder._modules['lstm'].weight_hh_l0, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_lstm_bias_hh_l0',
                        decoder._modules['lstm'].bias_hh_l0, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_lstm_weights_ih_l0',
                        decoder._modules['lstm'].weight_ih_l0, step)
                    writer.add_histogram(
                        'train_hist_decoder/decoder_lstm_bias_ih_l0',
                        decoder._modules['lstm'].bias_ih_l0, step)

            encoder_optimizer.step()
            decoder_optimizer.step()

    if save_to_file:
        torch.save(encoder.state_dict(),
                   './encoder_100k_iter_lr' + str(learning_rate) + '.pth')
        torch.save(decoder.state_dict(),
                   './decoder_100k_iter_lr' + str(learning_rate) + '.pth')
    if write_summary:
        writer.close()
optim.step() if i % 10 == 0: # keep some parameters for debugging phi_gmm_unpacked = model.unpack_recognition_gmm(phi_gmm) u_mu, u_cov = gaussian.natural_to_standard( phi_gmm_unpacked[0], phi_gmm_unpacked[1]) #wmse = weighted_mse(data, y_reconstruction[0].detach().cpu(), torch.exp(log_z_given_y_phi).detach().cpu()) #print ("Training wmse {}".format(wmse)) #glogliki = diagonal_gaussian_logprob(data.view(-1, 784).to(device), y_reconstruction[0].detach(), y_reconstruction[1].detach(), log_z_given_y_phi.detach()) #print ("Training diagonal gaussian logprob {}".format(glogliki)) #if i % 100 == 0: #plot_grad_flow(model.named_parameters(), global_step) if i % 100 == 0: writer.add_histogram('logz', torch.exp(log_z_given_y_phi), global_step=global_step) for index, (name, kernel) in enumerate(model.named_parameters()): writer.add_histogram('{}_grad'.format(name), kernel.grad, global_step=global_step) writer.add_embedding(u_mu, tag='mu_phi_gmm', global_step=global_step) #writer.add_embedding(u_cov,tag='cov_phi_gmm') writer.add_histogram('pi_phi_gmm', torch.exp(phi_gmm_unpacked[-1]), global_step=global_step) beta_k, m_k, C_k, v_k = niw.natural_to_standard(
class Logger(object):
    """Dispatches training/eval metrics to console meter groups and,
    optionally, TensorBoard.

    Keys must be namespaced 'train*' or 'eval*'; each namespace gets its own
    MetersGroup for console/file output.
    """

    def __init__(self, log_dir, save_tb=False, log_frequency=10000, agent='sac'):
        """
        Args:
            log_dir: root directory for all log output.
            save_tb: when True, (re)create `log_dir`/tb and attach a SummaryWriter.
            log_frequency: default step interval between accepted log calls.
            agent: key into AGENT_TRAIN_FORMAT selecting the train columns.
        """
        self._log_dir = log_dir
        self._log_frequency = log_frequency
        if save_tb:
            tb_dir = os.path.join(log_dir, 'tb')
            if os.path.exists(tb_dir):
                try:
                    shutil.rmtree(tb_dir)
                except OSError:
                    # Fix: was a bare `except:`; only filesystem errors are
                    # expected (and tolerable) here.
                    print("logger.py warning: Unable to remove tb directory")
            self._sw = SummaryWriter(tb_dir)
        else:
            self._sw = None
        # each agent has specific output format for training
        assert agent in AGENT_TRAIN_FORMAT
        train_format = COMMON_TRAIN_FORMAT + AGENT_TRAIN_FORMAT[agent]
        self._train_mg = MetersGroup(os.path.join(log_dir, 'train'),
                                     formating=train_format)
        self._eval_mg = MetersGroup(os.path.join(log_dir, 'eval'),
                                    formating=COMMON_EVAL_FORMAT)

    def _should_log(self, step, log_frequency):
        # fall back to the instance-wide default when no frequency is given
        log_frequency = log_frequency or self._log_frequency
        return step % log_frequency == 0

    def _try_sw_log(self, key, value, step):
        if self._sw is not None:
            self._sw.add_scalar(key, value, step)

    def _try_sw_log_video(self, key, frames, step):
        if self._sw is not None:
            frames = torch.from_numpy(np.array(frames))
            frames = frames.unsqueeze(0)  # batch dimension expected by add_video
            self._sw.add_video(key, frames, step, fps=30)

    def _try_sw_log_histogram(self, key, histogram, step):
        if self._sw is not None:
            self._sw.add_histogram(key, histogram, step)

    def log(self, key, value, step, n=1, log_frequency=1):
        """Log a scalar; `value` may be a 0-d tensor. `n` weights repeated values."""
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        if type(value) == torch.Tensor:
            value = value.item()
        self._try_sw_log(key, value / n, step)
        mg = self._train_mg if key.startswith('train') else self._eval_mg
        mg.log(key, value, n)

    def log_param(self, key, param, step, log_frequency=None):
        """Histogram a module's weight/bias (and their grads when present)."""
        if not self._should_log(step, log_frequency):
            return
        self.log_histogram(key + '_w', param.weight.data, step)
        if hasattr(param.weight, 'grad') and param.weight.grad is not None:
            self.log_histogram(key + '_w_g', param.weight.grad.data, step)
        if hasattr(param, 'bias') and hasattr(param.bias, 'data'):
            self.log_histogram(key + '_b', param.bias.data, step)
            if hasattr(param.bias, 'grad') and param.bias.grad is not None:
                self.log_histogram(key + '_b_g', param.bias.grad.data, step)

    def log_video(self, key, frames, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_video(key, frames, step)

    def log_histogram(self, key, histogram, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_histogram(key, histogram, step)

    def dump(self, step, save=True, ty=None):
        """Flush meters to console/file. ty: None (both), 'train' or 'eval'."""
        if ty is None:
            self._train_mg.dump(step, 'train', save)
            self._eval_mg.dump(step, 'eval', save)
        elif ty == 'eval':
            self._eval_mg.dump(step, 'eval', save)
        elif ty == 'train':
            self._train_mg.dump(step, 'train', save)
        else:
            # Fix: `raise f'...'` raised a bare string, which is a TypeError in
            # Python 3; raise a proper exception instead.
            raise ValueError(f'invalid log type: {ty}')
for xs, ys, xn, yn in dev: xs, xn = model(xs, xn) xs = xs.exp().view(-1, len(labels)) prediction.append(xs.argmax(1).cpu()) prior += xs.sum(dim=0) dev.set_description('Epoch %d Prior %.5f' % (epoch, prior.std().item())) prediction = torch.cat(prediction) prior = (prior / prediction.size(0)).log() / temperature writer.add_histogram('Prediction', prediction[prediction != labels.blank()], epoch) writer.add_histogram('Prior', prior, epoch) for xs, ys, xn, yn in test: xs, xn = model(xs, xn) loss1 = ctc_loss(xs, ys, xn, yn).mean() loss2 = -(xs.exp() * xs).sum(dim=-1).mean() err.update(loss1.item()) ent.update(loss2.item()) xs = xs - prior xs = xs.argmax(2).t().type(torch.int)
def train(model_type=ModelType.LINEAR, batch_size=128, num_epochs=2, learning_rate=0.1):
    """Train the selected MNIST model (class looked up in module "bn" via
    `model_type.value`) and periodically evaluate it on the test set.

    Returns (l1_mean, l1_std, l2_mean, l2_std, loss_arr, model_name) built
    from the layer statistics the model records during training.

    Fixes over the previous revision:
    - the test loss is accumulated as a Python float (.item()) weighted by the
      batch size, so dividing by len(testLoader.dataset) yields a true
      per-sample average (previously a sum of batch means was divided by the
      dataset size, and a tensor was logged);
    - a dead `inputs.view(...)` reshape and a duplicated model.eval() call
      were removed;
    - the SummaryWriter is closed before returning.
    """
    trainset = torchvision.datasets.MNIST(
        root='./data', train=True, download=True, transform=transforms.ToTensor()
    )
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    testset = torchvision.datasets.MNIST(
        root='./data', train=False, download=True, transform=transforms.ToTensor()
    )
    testLoader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('training on %s' % device)

    # model classes live in module "bn"; model_type.value names the class
    module = importlib.import_module("bn")
    class_ = getattr(module, model_type.value)
    model = class_(device)
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    opt = optim.SGD(model.parameters(), lr=learning_rate)
    loss_arr = []
    writer = SummaryWriter(log_dir='runs/%s_%s' % (model_type.value, datetime.now().strftime("%H:%M:%S")))

    for epoch in range(num_epochs):
        for i, data in enumerate(trainloader, 0):
            model.train()  # undo eval() set by the periodic evaluation below
            n_iter = (epoch * len(trainloader)) + i
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            opt.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            opt.step()

            loss_arr.append(loss.item())
            writer.add_scalar('training/loss', loss.item(), n_iter)
            # the model exposes its per-layer inputs for batch-norm inspection
            writer.add_scalar('inputs/layer1/mean', model.l1_inp.cpu().numpy().mean(), n_iter)
            writer.add_scalar('inputs/layer2/mean', model.l2_inp.cpu().numpy().mean(), n_iter)
            writer.add_histogram('inputs/layer1/dist', model.l1_inp.cpu().numpy(), n_iter)
            writer.add_histogram('inputs/layer2/dist', model.l2_inp.cpu().numpy(), n_iter)

            # periodic evaluation on the full test set
            if i % 10 == 0:
                model.eval()
                print('training loss: %0.2f' % loss.item())
                test_loss = 0.0
                correct = 0
                with torch.no_grad():
                    for test_data, test_target in testLoader:
                        test_data = test_data.to(device)
                        test_target = test_target.to(device)
                        output = model(test_data)
                        # sum of per-sample losses (loss_fn returns the batch mean)
                        test_loss += loss_fn(output, test_target).item() * test_data.size(0)
                        pred = output.argmax(dim=1, keepdim=True)
                        correct += pred.eq(test_target.view_as(pred)).sum().item()
                test_loss /= len(testLoader.dataset)
                writer.add_scalar('testing/loss', test_loss, n_iter)
                writer.add_scalar('testing/accuracy', correct/len(testLoader.dataset) * 100., n_iter)

    writer.close()

    # compute summary
    l1_mean = [x[0].cpu() for x in model.l1_dist]
    l1_std = [x[1].cpu() for x in model.l1_dist]
    l2_mean = [x[0].cpu() for x in model.l2_dist]
    l2_std = [x[1].cpu() for x in model.l2_dist]
    return l1_mean, l1_std, l2_mean, l2_std, loss_arr, model_type.value
class runManager():
    """Tracks per-epoch and per-run training statistics, mirrors them to
    TensorBoard, and can persist collected results as CSV/JSON.

    One "run" is one hyper-parameter combination (supplied by a
    RunBuilder-style helper); each run contains several epochs.
    """

    def __init__(self):
        # per-epoch bookkeeping
        self.epoch_count = 0          # epoch index within the current run
        self.epoch_loss = 0           # accumulated loss of the current epoch
        self.epoch_num_correct = 0    # correct predictions this epoch
        self.epoch_start_time = None  # wall-clock start of the current epoch

        # per-run bookkeeping (one run = one hyper-parameter setting)
        self.run_params = None     # hyper-parameter values of the current run
        self.run_count = 0         # number of runs started so far
        self.run_data = []         # one results row per finished epoch
        self.run_start_time = None # wall-clock start of the current run

        self.network = None  # network under training
        self.loader = None   # data loader
        self.tb = None       # TensorBoard SummaryWriter

    def begin_run(self, run, network, loader):
        """Start a new run: remember its hyper-parameters, open a TensorBoard
        writer and log a sample image grid plus the network graph."""
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')

        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)
        self.tb.add_image('images', grid)
        self.tb.add_graph(self.network,
                          images.to(getattr(run, 'device', 'cpu')))

    def end_run(self):
        """Finish the run: close the writer and reset the epoch counter."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Reset the per-epoch accumulators and start the epoch timer."""
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Summarise the finished epoch: log scalars and weight histograms to
        TensorBoard, record a results row and print a one-line summary."""
        epoch_duration = time.time() - self.epoch_start_time
        # note: run duration is re-measured from the run start, i.e. it
        # accumulates over all epochs of the run
        run_duration = time.time() - self.run_start_time

        loss = self.epoch_loss
        accuracy = self.epoch_num_correct / len(self.loader.dataset)

        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            # self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

        # one results row per epoch; the epoch is the smallest reporting unit
        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results['loss'] = loss
        results["accuracy"] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        for k, v in self.run_params._asdict().items():
            results[k] = v
        self.run_data.append(results)
        # Fix: loss was formatted with "%d", which truncated the float loss to
        # an integer (usually 0) in the printed summary.
        print('runs: ' + "%d" % results["run"] + ', ' +
              'epoch: ' + "%d" % results["epoch"] + ', ' +
              'loss: ' + "%f" % results["loss"] + ', ' +
              'accuracy: ' + "%f" % results["accuracy"])
        '''
        df = pd.DataFrame.from_dict(self.run_data, orient = 'columns')
        clear_output(wait=True)
        display(df)
        '''

    def track_loss(self, loss, batch):
        """Accumulate the epoch loss; batch[0].shape[0] is the batch size."""
        self.epoch_loss += loss.item() * batch[0].shape[0]

    def track_num_correct(self, preds, labels):
        """Accumulate the number of correct predictions for this epoch."""
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    def _get_num_correct(self, preds, labels):
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        """Persist the collected results both as <fileName>.csv and .json."""
        pd.DataFrame.from_dict(
            self.run_data, orient='columns').to_csv(f'{fileName}.csv')
        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)
f_net.train() for epoch in range(epochs): running_loss = 0.0 for i, data in enumerate(train_loader): inputs, labels = data optimizer.zero_grad() if is_stochastic: loss = f_net(inputs, labels) else: outputs = f_net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() running_loss += loss.item() # lr_scheduler.step() writer.add_scalar('Loss', running_loss, epoch) for name, weight in f_net.named_parameters(): writer.add_histogram(name, weight, epoch) if epoch % 10 == 0: print("Epoch: ", epoch, "Running loss: ", running_loss) print('Finished Training') writer.close() torch.save(f_net.state_dict(), PATH_f_net)
def train_network():
    """Train an RNN on the configured dataset, validating each epoch, logging
    to TensorBoard, and checkpointing the model with the lowest validation
    loss.

    All configuration is read from the module-level `args`; returns the total
    wall-clock training time in seconds.
    """
    print('')
    print('')

    # Start measuring time - to evaluate performance of the training function
    start = timeit.default_timer()

    # Set seeds
    set_seed(args)

    # Make folders if not yet exist
    try:
        os.makedirs('save')
    except FileExistsError:
        pass

    # Save relevant arguments from a and set hardcoded arguments
    lr = args.lr  # learning rate
    batch_size = args.batch_size  # Mini-batch size
    num_epochs = args.num_epochs  # Number of epochs to train the network
    seq_len = args.seq_len

    # Network architecture:
    rnn_name = args.rnn_name
    inputs_list = args.inputs_list
    outputs_list = args.outputs_list

    load_rnn = args.load_rnn  # If specified this is the name of pretrained RNN which should be loaded
    path_save = args.path_save

    # Create rnn instance and update lists of input, outputs and its name (if pretraind net loaded)
    net, rnn_name, inputs_list, outputs_list \
        = create_rnn_instance(rnn_name, inputs_list, outputs_list, load_rnn, path_save, device)

    # Create log for this RNN and determine its full name
    rnn_full_name = create_log_file(rnn_name, inputs_list, outputs_list, path_save)
    net.rnn_full_name = rnn_full_name

    ########################################################
    # Create Dataset
    ########################################################
    train_dfs, _ = load_data(args, args.train_file_name)
    normalization_info = calculate_normalization_info(train_dfs, args.path_save, rnn_full_name)
    test_dfs, time_axes_dev = load_data(args, args.val_file_name)
    train_dfs_norm = normalize_df(train_dfs, normalization_info)
    test_dfs_norm = normalize_df(test_dfs, normalization_info)
    del train_dfs, test_dfs
    train_set = Dataset(train_dfs_norm, args)
    dev_set = Dataset(test_dfs_norm, args, time_axes=time_axes_dev)
    print('Number of samples in training set: {}'.format(train_set.number_of_samples))
    print('The training sets sizes are: {}'.format(train_set.df_lengths))
    print('Number of samples in validation set: {}'.format(dev_set.number_of_samples))
    print('')

    # Plot the untrained network's predictions as a baseline
    plot_results(net=net, args=args,
                 dataset=dev_set,
                 seq_len=1024,
                 comment='This is the network at the beginning of the training',
                 inputs_list=inputs_list, outputs_list=outputs_list,
                 save=True,
                 closed_loop_enabled=True)

    # Create PyTorch dataloaders for train and dev set
    train_generator = data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, num_workers=args.num_workers)
    dev_generator = data.DataLoader(dataset=dev_set, batch_size=512, shuffle=False, num_workers=args.num_workers)

    # Print parameter count
    print_parameter_count(net)  # Seems not to function well

    # Select Optimizer
    optimizer = optim.Adam(net.parameters(), amsgrad=True, lr=lr)

    # TODO: Verify if scheduler is working. Try tweaking parameters of below scheduler and try cyclic lr scheduler
    # scheduler = lr_scheduler.CyclicLR(optimizer, base_lr=lr, max_lr=0.1)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.5)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)

    # Select Loss Function
    criterion = nn.MSELoss()  # Mean square error loss function

    '''
    Init Tensorboard
    '''
    comment = f' batch_size={batch_size} lr={lr} seq_len={seq_len}'
    tb = SummaryWriter(comment=comment)

    ########################################################
    # Training
    ########################################################
    print("Starting training...")
    print('')
    time.sleep(0.001)

    # Create dictionary to store training history
    # NOTE(review): the 'time' and 'test_loss' lists are never appended below.
    dict_history = {}
    dict_history['epoch'] = []
    dict_history['time'] = []
    dict_history['lr'] = []
    dict_history['train_loss'] = []
    dict_history['dev_loss'] = []
    dict_history['dev_gain'] = []
    dict_history['test_loss'] = []
    dev_gain = 1

    # The epoch_saved variable will indicate from which epoch is the last RNN model,
    # which was good enough to be saved
    epoch_saved = -1
    for epoch in range(num_epochs):

        ###########################################################
        # Training - Iterate batches
        ###########################################################
        # Set RNN in training mode
        net = net.train()
        # Define variables accumulating training loss and counting training batchs
        train_loss = 0
        train_batches = 0

        # Iterate training over available batches
        # tqdm() is just a function which displays the progress bar
        # Otherwise the line below is the same as "for batch, labels in train_generator:"
        for batch, labels in tqdm(train_generator):  # Iterate through batches

            # Reset the network (internal states of hidden layers and output history not the weights!)
            net.reset()

            # Further modifying the input and output form to fit RNN requirements
            # If GPU available we send tensors to GPU (cuda)
            if torch.cuda.is_available():
                batch = batch.float().cuda().transpose(0, 1)
                labels = labels.float().cuda()
            else:
                batch = batch.float().transpose(0, 1)
                labels = labels.float()

            # # Reset memory of gradients
            # optimizer.zero_grad()

            # Warm-up (open loop prediction) to settle the internal state of RNN hidden layers
            net(rnn_input=batch[:args.warm_up_len, :, :])

            # Reset memory of gradients
            optimizer.zero_grad()

            # Forward propagation - These are the results from which we calculate the update to RNN weights
            # GRU Input size must be (seq_len, batch, input_size)
            net(rnn_input=batch[args.warm_up_len:, :, :])
            out = net.return_outputs_history()

            # Get loss
            loss = criterion(out[:, args.warm_up_len:, :],
                             labels[:, args.warm_up_len:, :])

            # Backward propagation
            loss.backward()

            # Gradient clipping - prevent gradient from exploding
            torch.nn.utils.clip_grad_norm_(net.parameters(), 100)

            # Update parameters
            optimizer.step()
            # scheduler.step()

            # Update variables for loss calculation
            batch_loss = loss.detach()
            train_loss += batch_loss  # Accumulate loss
            train_batches += 1  # Accumulate count so we can calculate mean later

        ###########################################################
        # Validation - Iterate batches
        ###########################################################
        # Set the network in evaluation mode
        net = net.eval()

        # Define variables accumulating evaluation loss and counting evaluation batches
        dev_loss = 0
        dev_batches = 0

        for (batch, labels) in tqdm(dev_generator):

            # Reset the network (internal states of hidden layers and output history not the weights!)
            net.reset()

            # Further modifying the input and output form to fit RNN requirements
            # If GPU available we send tensors to GPU (cuda)
            if torch.cuda.is_available():
                batch = batch.float().cuda().transpose(0, 1)
                labels = labels.float().cuda()
            else:
                batch = batch.float().transpose(0, 1)
                labels = labels.float()

            # Warm-up (open loop prediction) to settle the internal state of RNN hidden layers
            net(rnn_input=batch)
            out = net.return_outputs_history()

            # Get loss
            # For evaluation we always calculate loss over the whole maximal prediction period
            # This allow us to compare RNN models from different epochs
            loss = criterion(out[:, args.warm_up_len: args.seq_len],
                             labels[:, args.warm_up_len: args.seq_len])

            # Update variables for loss calculation
            batch_loss = loss.detach()
            dev_loss += batch_loss  # Accumulate loss
            dev_batches += 1  # Accumulate count so we can calculate mean later

        # Reset the network (internal states of hidden layers and output history not the weights!)
        net.reset()

        # Get current learning rate
        # TODO(Fixed. It does changes now): I think now the learning rate do not change during traing, or it is not a right way to get this info.
        for param_group in optimizer.param_groups:
            lr_curr = param_group['lr']
        scheduler.step(dev_loss)

        '''
        Add data for tensorboard
        TODO : Add network graph and I/O to tensorboard
        '''
        # tb.add_graph(net)
        tb.add_scalar('Train Loss', train_loss / train_batches, epoch)
        tb.add_scalar('Dev Loss', dev_loss / dev_batches, epoch)

        # Add the first sample of batch to tensorboard. Prediction is represented by Dotted line
        # TODO: Concatenate such graphs. But they are not continous
        # for i in range(labels.shape[2]):
        #     time_label = np.arange(0, labels.shape[1], 1)
        #     time_out = np.arange(0, out.shape[1], 1)
        #     true_data = labels[1, :, i]
        #     predicted_data = out[1, :, i]
        #     fig_tb = plt.figure(5)
        #     plt.plot(time_label, true_data.detach().cpu())
        #     plt.plot(time_out, predicted_data.detach().cpu(), linestyle='dashed')
        #     tb.add_figure(tag=str(a.outputs_list[i]), figure=fig_tb, global_step=epoch)

        # NOTE(review): param.grad can be None for unused parameters, which
        # would make add_histogram raise -- confirm all parameters get grads.
        for name, param in net.named_parameters():
            tb.add_histogram(name, param, epoch)
            tb.add_histogram(f'{name}.grad', param.grad, epoch)
        # NOTE(review): closing the writer inside the epoch loop forces a new
        # event file per epoch (the writer silently reopens on the next
        # add_scalar) -- presumably this was meant to run once after training.
        tb.close()

        # Write the summary information about the training for the just completed epoch to a dictionary
        dict_history['epoch'].append(epoch)
        dict_history['lr'].append(lr_curr)
        dict_history['train_loss'].append(
            train_loss.detach().cpu().numpy() / train_batches / (args.seq_len - args.warm_up_len))
        dict_history['dev_loss'].append(
            dev_loss.detach().cpu().numpy() / dev_batches / (args.seq_len - args.warm_up_len))

        # Get relative loss gain for network evaluation
        if epoch >= 1:
            dev_gain = (dict_history['dev_loss'][epoch - 1] - dict_history['dev_loss'][epoch]) / \
                       dict_history['dev_loss'][epoch - 1]
        dict_history['dev_gain'].append(dev_gain)

        # Print the summary information about the training for the just completed epoch
        print('\nEpoch: %3d of %3d | '
              'LR: %1.5f | '
              'Train-L: %6.4f | '
              'Val-L: %6.4f | '
              'Val-Gain: %3.2f |' % (dict_history['epoch'][epoch],
                                     num_epochs - 1,
                                     dict_history['lr'][epoch],
                                     dict_history['train_loss'][epoch],
                                     dict_history['dev_loss'][epoch],
                                     dict_history['dev_gain'][epoch] * 100))
        print('')

        # Save the best model with the lowest dev loss
        # Always save the model from epoch 0
        # TODO: this is a bug: you should only save the model from epoch 0 if there is no pretraind network
        if epoch == 0:
            min_dev_loss = dev_loss
        # If current loss smaller equal than minimal till now achieved loss,
        # save the current RNN model and save its loss as minimal ever achieved
        if dev_loss <= min_dev_loss:
            epoch_saved = epoch
            min_dev_loss = dev_loss
            torch.save(net.state_dict(), args.path_save + rnn_full_name + '.pt', _use_new_zipfile_serialization=False)
            print('>>> saving best model from epoch {}'.format(epoch))
            print('')

            plot_string = 'This is the network after {} training epoch'.format(epoch + 1)
            plot_results(net=net, args=args, dataset=dev_set,
                         seq_len=1024,
                         comment=plot_string,
                         inputs_list=inputs_list, outputs_list=outputs_list,
                         save=True,
                         closed_loop_enabled=True)
        else:
            print('>>> We keep model from epoch {}'.format(epoch_saved))
            print('')

    # Evaluate the performance of the current network
    # by checking its predictions on a randomly generated CartPole experiment
    # open_loop_prediction_experiment(net, a, val_file)

    # When finished the training print the final message
    print("Training Completed... ")
    print(" ")

    # Calculate the total time it took to run the function
    stop = timeit.default_timer()
    total_time = stop - start

    # Return the total time it took to run the function
    return total_time
class FlowTrainer(object):
    """Train/validate/test harness for the PyramidUNet optical-flow model.

    Trains on Sintel data, logs losses, end-point error (EPE) and
    qualitative result grids to TensorBoard, and keeps the checkpoint with
    the lowest validation loss in ``./archive/best.pth`` together with its
    metrics in ``./archive/metrics.txt``.
    """

    def __init__(self):
        super(FlowTrainer, self).__init__()
        # not the best model...
        self.model = PyramidUNet()
        self.epoch = 1000  # total number of training epochs
        self.dataloader = SintelLoader()
        self.gpu_ids = GPUS_LIST
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)
        # NOTE(review): T_max is the number of training batches, which hints
        # the cosine schedule was sized for one cycle per epoch — confirm.
        self.scheduler = CosineAnnealingLR(self.optimizer,
                                           len(self.dataloader.train()))
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.writer = SummaryWriter()
        self.global_step = 0
        self.tripletloss = torch.nn.TripletMarginLoss()
        self.load_model_path = "./archive/best.pth"
        self.stat_cache = None  # best-so-far validation metrics (dict) or None

    def initialize(self):
        """Move the model to the device, wrap in DataParallel and, if a best
        checkpoint exists on disk, restore it."""
        self.model.to(self.device)
        self.model = torch.nn.DataParallel(self.model, device_ids=self.gpu_ids)
        if self.load_model_path:
            # LOAD MODEL WEIGHTS HERE
            if os.path.exists(self.load_model_path):
                self.load_old_best()
        self.initialized = True

    def savemodel(self, metrics):
        """Persist the current weights and the metrics that justify them."""
        import json
        with open('./archive/metrics.txt', 'w') as f:
            json.dump(metrics, f)
        torch.save(self.model.module.state_dict(), self.load_model_path)

    def warpframes(self, ff, fb, frame):
        """Warp the frame pair with forward (ff) and backward (fb) flow.

        Returns ``(occlusion, warpframe)`` where each is an (ff, fb) pair.
        """
        ff_ = self.warper(ff, frame, 'ff')
        fb_ = self.warper(fb, frame, 'fb')
        warpframe = (ff_, fb_)
        occlusion = self.occwarper(ff, fb)
        return occlusion, warpframe

    def train(self, nb_epoch):
        """Run one training epoch; finishes via train_epoch_end (logging +
        validation)."""
        trainstream = tqdm(self.dataloader.train())
        self.avg_loss = AverageMeter()
        self.avg_epe = AverageMeter()
        self.model.train()
        for i, data in enumerate(trainstream):
            self.global_step += 1
            trainstream.set_description('TRAINING')
            # GET X and Frame 2
            frame = data['frame'].to(self.device)
            flow = data['flow'].cpu()  # ground-truth flow stays on CPU
            flow.requires_grad = False
            # NOTE : THIS MUST BE ADJUSTED AT DATA LOADER SIDE
            # torch.Size([1, 2, 9, 436, 1024]) -> finalflow size
            # torch.Size([1, 2, 9, 108, 256]) -> pyraflow1 size
            # torch.Size([1, 2, 9, 54, 128])  -> pyraflow2 size
            # torch.Size([1, 2, 9, 27, 64])   -> pyraflow3 size
            pyra1_frame = data['pyra1_frame'].to(self.device)
            pyra2_frame = data['pyra2_frame'].to(self.device)
            laten_frame = data['laten_frame'].to(self.device)
            self.optimizer.zero_grad()
            # forward
            with torch.set_grad_enabled(True):
                finalflow, pyraflow1, pyraflow2, latenflow = self.model(frame)
                occlu_final, frame_final = self.warpframes(*finalflow, frame)
                occlu_pyra1, frame_pyra1 = self.warpframes(*pyraflow1, pyra1_frame)
                occlu_pyra2, frame_pyra2 = self.warpframes(*pyraflow2, pyra2_frame)
                occlu_laten, frame_laten = self.warpframes(*latenflow, laten_frame)
                # photometric/triplet cost at every pyramid level
                cost_final = self.getcost(*frame_final, *occlu_final, frame)
                cost_pyra1 = self.getcost(*frame_pyra1, *occlu_pyra1, pyra1_frame)
                cost_pyra2 = self.getcost(*frame_pyra2, *occlu_pyra2, pyra2_frame)
                cost_laten = self.getcost(*frame_laten, *occlu_laten, laten_frame)
                # EPE of the backward flow vs ground truth (metric only)
                eper_final = self.epe(finalflow[1].cpu().detach(),
                                      flow.cpu().detach())
                loss = cost_final + cost_pyra1 + cost_pyra2 + cost_laten
                self.avg_loss.update(loss.item(), i + 1)
                self.avg_epe.update(eper_final.item(), i + 1)
                loss.backward()
                self.optimizer.step()
                self.writer.add_scalar('Loss/train', self.avg_loss.avg,
                                       self.global_step)
                self.writer.add_scalar('EPE/train', self.avg_epe.avg,
                                       self.global_step)
                trainstream.set_postfix({'epoch': nb_epoch,
                                         'loss': self.avg_loss.avg,
                                         'epe': self.avg_epe.avg})
        # FIX: CosineAnnealingLR.step() takes an optional epoch index, not a
        # loss value; passing the loss tensor corrupted the LR schedule.
        self.scheduler.step()
        trainstream.close()
        # histograms of the last batch of the epoch
        fb_frame_final = frame_final[1]
        fb_final = finalflow[1]
        fb_occlu_final = occlu_final[1]
        self.writer.add_histogram('REAL/flow_u', flow[0, 0].view(-1), nb_epoch)
        self.writer.add_histogram('REAL/flow_v', flow[0, 1].view(-1), nb_epoch)
        self.writer.add_histogram('PRED/flow_u_ff', finalflow[0][0, 0].view(-1), nb_epoch)
        self.writer.add_histogram('PRED/flow_v_ff', finalflow[0][0, 1].view(-1), nb_epoch)
        self.writer.add_histogram('PRED/flow_u_fb', finalflow[1][0, 0].view(-1), nb_epoch)
        self.writer.add_histogram('PRED/flow_v_fb', finalflow[1][0, 1].view(-1), nb_epoch)
        self.writer.add_histogram('REAL/occ', data['occlusion'][0].view(-1), nb_epoch)
        self.writer.add_histogram('PRED/occ_ff', occlu_final[0][0].view(-1), nb_epoch)
        self.writer.add_histogram('PRED/occ_fb', occlu_final[1][0].view(-1), nb_epoch)
        return self.train_epoch_end({'TRloss': self.avg_loss.avg,
                                     'epoch': nb_epoch,
                                     'pred_frame': fb_frame_final[0:4],
                                     'gt_frame': frame[0:4, :3],
                                     'pred_flow': flow2rgb(fb_final[0:4], False),
                                     'gt_flow': flow2rgb(flow[0:4], False),
                                     'pred_occ': 1. - fb_occlu_final[0:4],
                                     'gt_occ': data['occlusion'][0:4]})

    def train_epoch_end(self, metrics):
        """Dump a qualitative grid for the finished epoch, then validate."""
        self.model.eval()
        with torch.no_grad():
            pred_frame = metrics.get('pred_frame')
            gt_frame = metrics.get('gt_frame')
            pred_flow = metrics.get('pred_flow')
            gt_flow = metrics.get('gt_flow')
            pred_occ = replicatechannel(metrics.get('pred_occ'))
            gt_occ = replicatechannel(metrics.get('gt_occ'))
            data = torch.cat([pred_frame.cuda(), gt_frame.cuda(),
                              pred_flow.cuda(), gt_flow.cuda(),
                              pred_occ.cuda(), gt_occ.cuda()], 0)
            data = data.cpu()
            grid = make_grid(data, nrow=4)
            grid = ToTensor()((ToPILImage()(grid)).resize((4106 // 6, 2630 // 4)))
            # FIX: metrics never contained 'n_batch' (global_step was None);
            # use the epoch index instead.
            self.writer.add_images('TRAIN/Results', grid.unsqueeze(0),
                                   metrics.get('epoch'))
        self.val(metrics.get('epoch'))

    def val(self, nb_epoch):
        """Run validation over the whole val split and hand the averages to
        val_end for checkpointing and visualization."""
        self.model.eval()
        # DO VAL STUFF HERE
        valstream = tqdm(self.dataloader.val())
        self.avg_loss = AverageMeter()
        self.avg_epe = AverageMeter()
        valstream.set_description('VALIDATING')
        with torch.no_grad():
            for i, data in enumerate(valstream):
                frame = data['frame'].to(self.device)
                flow = data['flow'].cpu()
                finalflow = self.model(frame)
                occlu_final, frame_final = self.warpframes(*finalflow, frame)
                loss = self.getcost(*frame_final, *occlu_final, frame)
                eper_final = self.epe(flow.cpu().detach(),
                                      finalflow[1].cpu().detach())
                self.avg_loss.update(loss.item(), i + 1)
                self.avg_epe.update(eper_final.item(), i + 1)
                # NOTE(review): global_step is not advanced during validation,
                # so these points land on the last training step — confirm.
                self.writer.add_scalar('Loss/val', self.avg_loss.avg,
                                       self.global_step)
                self.writer.add_scalar('EPE/val', self.avg_epe.avg,
                                       self.global_step)
                fb_frame_final = frame_final[1]
                fb_final = finalflow[1]
                fb_occlu_final = occlu_final[1]
            valstream.close()
            self.val_end({'VLloss': self.avg_loss.avg,
                          'VLepe': self.avg_epe.avg,
                          'epoch': nb_epoch,
                          'pred_frame': fb_frame_final[0:4],
                          'gt_frame': frame[0:4, :3],
                          'pred_flow': flow2rgb(fb_final[0:4], False),
                          'gt_flow': flow2rgb(flow[0:4], False),
                          'pred_occ': 1. - fb_occlu_final[0:4],
                          'gt_occ': data['occlusion'][0:4]})

    def val_end(self, metrics):
        """WRITE STAT FIRST: keep the checkpoint with the lowest val loss."""
        if self.stat_cache is None:
            self.stat_cache = {'VLloss': metrics.get('VLloss'),
                               'VLepe': metrics.get('VLepe')}
            self.savemodel({'VLloss': metrics.get('VLloss'),
                            'VLepe': metrics.get('VLepe')})
        else:
            # FIX: the comparison was inverted — the model was saved when the
            # new validation loss got WORSE and the old weights were reloaded
            # on improvement. Save only when the loss improves (decreases).
            if metrics.get('VLloss') < self.stat_cache.get('VLloss'):
                self.stat_cache.update({'VLloss': metrics.get('VLloss'),
                                        'VLepe': metrics.get('VLepe')})
                self.savemodel(self.stat_cache)
            else:
                self.load_old_best()
        self.model.eval()
        with torch.no_grad():
            pred_frame = metrics.get('pred_frame').cpu()
            gt_frame = metrics.get('gt_frame').cpu()
            pred_flow = metrics.get('pred_flow').cpu()
            gt_flow = metrics.get('gt_flow').cpu()
            pred_occ = replicatechannel(metrics.get('pred_occ')).cpu()
            gt_occ = replicatechannel(metrics.get('gt_occ')).cpu()
            data = torch.cat([pred_frame, gt_frame, pred_flow, gt_flow,
                              pred_occ, gt_occ], 0).cpu()
            grid = make_grid(data, nrow=3)
            grid = ToTensor()((ToPILImage()(grid)).resize((4106 // 6, 2630 // 4)))
            # FIX: 'n_batch' was never present in metrics; use the epoch.
            self.writer.add_images('VAL/Results', grid.unsqueeze(0),
                                   metrics.get('epoch'))

    def load_old_best(self):
        """Restore the best checkpoint and its cached metrics from disk."""
        import json
        with open('./archive/metrics.txt', 'r') as f:
            self.stat_cache = json.load(f)
        self.model.module.load_state_dict(torch.load(self.load_model_path))

    def test(self, nb_epoch):
        """Run the test split (no ground-truth flow) and dump a result grid."""
        self.model.eval()
        teststream = tqdm(self.dataloader.test())
        self.avg_loss = AverageMeter()
        teststream.set_description('TESTING')
        with torch.no_grad():
            for i, data in enumerate(teststream):
                frame = data['frame']
                finalflow = self.model(frame)
                occlu_final, frame_final = self.warpframes(*finalflow, frame)
                loss = self.getcost(*frame_final, *occlu_final, frame)
                self.avg_loss.update(loss.item(), i + 1)
                self.writer.add_scalar('Loss/test', self.avg_loss.avg,
                                       self.global_step)
                fb_frame_final = frame_final[1]
                fb_final = finalflow[1]
                fb_occlu_final = occlu_final[1]
            teststream.close()
            self.test_end({'VLloss': self.avg_loss.avg,
                           'epoch': nb_epoch,
                           'pred_frame': fb_frame_final[0, :, 0:4, :].permute(1, 0, 2, 3),
                           'gt_frame': frame[0, :, 0:4, :].permute(1, 0, 2, 3),
                           'pred_flow': flow2rgb(fb_final[0, :, 0:4, :].permute(1, 0, 2, 3), False),
                           'pred_occ': 1. - fb_occlu_final[0, :, 0:4, :].permute(1, 0, 2, 3),
                           })

    def test_end(self, metrics):
        """Assemble and log the qualitative grid for the test pass."""
        self.model.eval()
        with torch.no_grad():
            pred_frame = metrics.get('pred_frame').cpu()
            gt_frame = metrics.get('gt_frame').cpu()
            pred_flow = metrics.get('pred_flow').cpu()
            pred_occ = replicatechannel(metrics.get('pred_occ')).cpu()
            data = torch.stack([pred_frame, gt_frame, pred_flow, pred_occ], 0)
            data = data.reshape(-1, 3, data.size(3), data.size(4)).cpu()
            grid = make_grid(data, nrow=4)
            # FIX: 'n_batch' was never present in metrics; use the epoch.
            self.writer.add_images('Test/Results', grid.unsqueeze(0),
                                   metrics.get('epoch'))

    def loggings(self, **metrics):
        pass

    def warper(self, flows, frames, mode='ff', scaled=True, nocuda=False):
        """Warp one half of the stacked frame pair with the given flow.

        `frames` stacks two RGB frames along the channel axis; 'ff' warps the
        first three channels, 'fb' the last three.
        """
        if mode == 'ff':
            dframe = frames[:, :3]  # given frame from 0 to n-1 predict frame 1 to n
        elif mode == 'fb':
            dframe = frames[:, 3:]  # given frame from 1 to n predict frame 0 to n-1
        else:
            raise Exception("Mode must be flow-forwad 'ff' or flow-backward 'fb'")
        # FIX: the `scaled` argument was ignored (hard-coded to True);
        # forward it to the module-level warper.
        warped = warper(flows.cuda(), dframe.cuda(), scaled=scaled,
                        nocuda=nocuda).cuda()
        return warped

    def occwarper(self, ff, fb):
        """Derive forward/backward occlusion masks from the two flows."""
        ff_occ, fb_occ = computeocclusion(ff, fb)
        return ff_occ, fb_occ

    def log_triplet_loss(self, anchor, positive, negative, maskp, maskn, q=1e-4):
        """Masked, q-tempered photometric triplet terms.

        Returns (pos, neg): the mask-normalized mean of |anchor-positive|^q
        under maskp and of |anchor-negative|^q under maskn.
        """
        pos = torch.mul(torch.pow((torch.abs(anchor - positive) + 1e-2), q), maskp)
        neg = torch.mul(torch.pow((torch.abs(anchor - negative) + 1e-2), q), maskn)
        pos = pos.sum() / (maskp.sum() + 1e-10)
        neg = neg.sum() / (maskn.sum() + 1e-10)
        return pos, neg

    def getcost(self, ff_frame, fb_frame, ff_occlu, fb_occlu, frame):
        """Total cost: triplet losses on both warp directions plus the
        occlusion-masked log-triplet photometric terms."""
        ff_frame = ff_frame.cuda()
        fb_frame = fb_frame.cuda()
        frame = frame.cuda()
        ff_truth, fb_truth = frame[:, 3:], frame[:, :3]
        ff_tloss = self.tripletloss(ff_truth, ff_frame, fb_frame)
        # FIX: the backward-direction triplet previously duplicated the
        # forward one; anchor it on the backward ground truth instead.
        fb_tloss = self.tripletloss(fb_truth, fb_frame, ff_frame)
        f_ploss, b_ploss = self.log_triplet_loss(ff_truth, ff_frame, fb_frame,
                                                 ff_occlu, fb_occlu)
        total = f_ploss + b_ploss + ff_tloss + fb_tloss
        return total

    def epe(self, source, target):
        """Average end-point error: mean L2 norm of the per-sample flattened
        difference between two flow tensors of shape (B, C, H, W)."""
        with torch.no_grad():
            source = source.cpu().detach()
            target = target.cpu().detach()
            B, C, H, W = source.size()
            diff = (source - target).reshape(-1, C * H * W)
            return torch.norm(diff, p=2, dim=1).mean()

    def run(self):
        """Initialize the model, then train for the configured epoch count."""
        self.initialize()
        for i in range(self.epoch):
            self.train(i)
        self.writer.close()
class network:
    """Training harness for RandLA-Net on S3DIS: builds datasets/loaders,
    runs per-epoch train/eval loops, logs to TensorBoard and a text log,
    and checkpoints after every epoch."""

    def __init__(self, FLAGS):
        self.writer = SummaryWriter('output/s3dis_tensorboard')
        self.f_out = self.mkdir_log(FLAGS.log_dir)
        self.train_dataset = S3DIS('training')
        self.test_dataset = S3DIS('validation')
        self.train_dataloader = DataLoaderX(
            self.train_dataset,
            batch_size=FLAGS.batch_size,
            shuffle=True,
            num_workers=20,
            worker_init_fn=self.worker_init,
            collate_fn=self.train_dataset.collate_fn,
            pin_memory=True)
        self.test_dataloader = DataLoaderX(
            self.test_dataset,
            batch_size=FLAGS.batch_size,
            shuffle=True,
            num_workers=20,
            worker_init_fn=self.worker_init,
            collate_fn=self.test_dataset.collate_fn,
            pin_memory=True)
        print('train dataset length:{}'.format(len(self.train_dataset)))
        print('test dataset length:{}'.format(len(self.test_dataset)))
        print('train datalodaer length:{}'.format(len(self.train_dataloader)))
        print('test dataloader length:{}'.format(len(self.test_dataloader)))
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        self.config = ConfigS3DIS
        self.net = RandLANET('S3DIS', self.config)
        self.net.to(self.device)
        self.optimizer = optimizer.Adam(self.net.parameters(),
                                        lr=self.config.learning_rate)
        self.end_points = {}
        self.FLAGS = FLAGS

    def mkdir_log(self, out_path):
        """Create the log directory if needed; return an append-mode handle
        to the text log file."""
        if not os.path.exists(out_path):
            os.mkdir(out_path)
        f_out = open(os.path.join(out_path, 'log_s3dis_train.txt'), 'a')
        return f_out

    def worker_init(self, worker_id):
        """Give each dataloader worker a distinct, reproducible numpy seed."""
        np.random.seed(np.random.get_state()[1][0] + worker_id)

    def adjust_learning_rate(self, epoch):
        """Apply the per-epoch decay factor from the config to the LR."""
        lr = self.optimizer.param_groups[0]['lr']
        lr = lr * self.config.lr_decays[epoch]
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        self.writer.add_scalar('learning rate', lr, epoch)

    def train_one_epoch(self, epoch_count):
        """One full pass over the training loader with loss/acc/IoU logging."""
        self.stat_dict = {}  # collect statistics
        self.adjust_learning_rate(epoch_count)
        self.net.train()  # set model to training mode
        iou_calc = IoUCalculator(self.config)
        for batch_idx, batch_data in enumerate(self.train_dataloader):
            t_start = time.time()
            # move every tensor (or list of tensors) in the batch to the GPU
            for key in batch_data:
                if type(batch_data[key]) is list:
                    for i in range(len(batch_data[key])):
                        batch_data[key][i] = batch_data[key][i].cuda()
                else:
                    batch_data[key] = batch_data[key].cuda()
            xyz = batch_data['xyz']  # (batch,N,3)
            neigh_idx = batch_data['neigh_idx']  # (batch,N,16)
            sub_idx = batch_data['sub_idx']  # (batch,N/4,16)
            interp_idx = batch_data['interp_idx']  # (batch,N,1)
            features = batch_data['features']  # (batch, 3, N)
            labels = batch_data['labels']  # (batch, N)
            input_inds = batch_data['input_inds']  # (batch, N)
            cloud_inds = batch_data['cloud_inds']  # (batch, 1)
            # Forward pass
            self.optimizer.zero_grad()
            self.out = self.net(xyz, neigh_idx, sub_idx, interp_idx, features,
                                labels, input_inds, cloud_inds)
            self.loss, self.end_points['valid_logits'], self.end_points[
                'valid_labels'] = compute_loss(self.out, labels, self.config)
            self.end_points['loss'] = self.loss
            self.writer.add_scalar(
                'training loss', self.loss,
                (epoch_count * len(self.train_dataloader) + batch_idx))
            self.loss.backward()
            self.optimizer.step()
            self.acc = compute_acc(self.end_points['valid_logits'],
                                   self.end_points['valid_labels'])
            self.end_points['acc'] = self.acc
            self.writer.add_scalar(
                'training accuracy', self.acc,
                (epoch_count * len(self.train_dataloader) + batch_idx))
            iou_calc.add_data(self.end_points['valid_logits'],
                              self.end_points['valid_labels'])
            # accumulate running statistics over the logging interval
            for key in self.end_points:
                if 'loss' in key or 'acc' in key or 'iou' in key:
                    if key not in self.stat_dict:
                        self.stat_dict[key] = 0
                    self.stat_dict[key] += self.end_points[key].item()
            t_end = time.time()
            batch_interval = 10
            if (batch_idx + 1) % batch_interval == 0:
                log_out(
                    ' ----step %08d batch: %08d ----' %
                    (epoch_count * len(self.train_dataloader) + batch_idx + 1,
                     (batch_idx + 1)), self.f_out)
                for key in sorted(self.stat_dict.keys()):
                    log_out(
                        'mean %s: %f---%f ms' %
                        (key, self.stat_dict[key] / batch_interval,
                         1000 * (t_end - t_start)), self.f_out)
                    self.writer.add_scalar(
                        'training mean {}'.format(key),
                        self.stat_dict[key] / batch_interval,
                        (epoch_count * len(self.train_dataloader) + batch_idx))
                    self.stat_dict[key] = 0  # reset for the next interval
                for name, param in self.net.named_parameters():
                    self.writer.add_histogram(
                        name + '_grad', param.grad,
                        (epoch_count * len(self.train_dataloader) + batch_idx))
                    self.writer.add_histogram(
                        name + '_data', param,
                        (epoch_count * len(self.train_dataloader) + batch_idx))
        mean_iou, iou_list = iou_calc.compute_iou()
        self.writer.add_scalar('training mean iou', mean_iou,
                               (epoch_count * len(self.train_dataloader)))
        log_out('training mean IoU:{:.1f}'.format(mean_iou * 100), self.f_out)
        s = 'training IoU:'
        for iou_tmp in iou_list:
            s += '{:5.2f} '.format(100 * iou_tmp)
        log_out(s, self.f_out)
        # FIX: closing the SummaryWriter here broke logging for every later
        # epoch; flush instead and keep the writer open.
        self.writer.flush()

    def evaluate_one_epoch(self, epoch_count):
        """One pass over the validation loader; returns the mean eval loss."""
        self.current_loss = None
        # FIX: start from fresh statistics instead of inheriting leftover
        # training accumulators from train_one_epoch.
        self.stat_dict = {}
        self.net.eval()  # set model to eval mode (for bn and dp)
        iou_calc = IoUCalculator(self.config)
        for batch_idx, batch_data in enumerate(self.test_dataloader):
            t_start = time.time()
            for key in batch_data:
                if type(batch_data[key]) is list:
                    for i in range(len(batch_data[key])):
                        batch_data[key][i] = batch_data[key][i].cuda()
                else:
                    batch_data[key] = batch_data[key].cuda()
            xyz = batch_data['xyz']  # (batch,N,3)
            neigh_idx = batch_data['neigh_idx']  # (batch,N,16)
            sub_idx = batch_data['sub_idx']  # (batch,N/4,16)
            interp_idx = batch_data['interp_idx']  # (batch,N,1)
            features = batch_data['features']  # (batch, 3, N)
            labels = batch_data['labels']  # (batch, N)
            input_inds = batch_data['input_inds']  # (batch, N)
            cloud_inds = batch_data['cloud_inds']  # (batch, 1)
            # Forward pass (no gradients needed for evaluation)
            with torch.no_grad():
                self.out = self.net(xyz, neigh_idx, sub_idx, interp_idx,
                                    features, labels, input_inds, cloud_inds)
                self.loss, self.end_points['valid_logits'], self.end_points[
                    'valid_labels'] = compute_loss(self.out, labels, self.config)
            self.end_points['loss'] = self.loss
            self.acc = compute_acc(self.end_points['valid_logits'],
                                   self.end_points['valid_labels'])
            self.end_points['acc'] = self.acc
            iou_calc.add_data(self.end_points['valid_logits'],
                              self.end_points['valid_labels'])
            # Accumulate statistics and print out
            for key in self.end_points:
                if 'loss' in key or 'acc' in key or 'iou' in key:
                    if key not in self.stat_dict:
                        self.stat_dict[key] = 0
                    self.stat_dict[key] += self.end_points[key].item()
            t_end = time.time()
            batch_interval = 10
            if (batch_idx + 1) % batch_interval == 0:
                log_out(
                    ' ----step %08d batch: %08d ----' %
                    (epoch_count * len(self.test_dataloader) + batch_idx + 1,
                     (batch_idx + 1)), self.f_out)
                for key in sorted(self.stat_dict.keys()):
                    # NOTE(review): the text log divides by batch_interval but
                    # the keys are never zeroed during eval, so the printed
                    # "mean" is a running sum / 10 — confirm intent.
                    log_out(
                        'mean %s: %f---%f ms' %
                        (key, self.stat_dict[key] / batch_interval,
                         1000 * (t_end - t_start)), self.f_out)
                    self.writer.add_scalar(
                        'eval mean {}'.format(key),
                        self.stat_dict[key] / (float(batch_idx + 1)),
                        (epoch_count * len(self.test_dataloader)))
        mean_iou, iou_list = iou_calc.compute_iou()
        self.writer.add_scalar('eval mean iou', mean_iou,
                               (epoch_count * len(self.test_dataloader)))
        log_out('eval mean IoU:{:.1f}'.format(mean_iou * 100), self.f_out)
        s = 'eval IoU:'
        for iou_tmp in iou_list:
            s += '{:5.2f} '.format(100 * iou_tmp)
        log_out(s, self.f_out)
        # FIX: flush instead of close so later epochs can keep logging.
        self.writer.flush()
        current_loss = self.stat_dict['loss'] / (float(batch_idx + 1))
        return current_loss

    def train(self, start_epoch):
        """Train until FLAGS.max_epoch, evaluating on epoch 0 and every 10th
        epoch, and checkpointing after every epoch."""
        loss = 0
        min_loss = 100
        current_loss = None
        for epoch in range(start_epoch, self.FLAGS.max_epoch):
            log_out('**************** EPOCH %03d ****************' % (epoch),
                    self.f_out)
            log_out(str(datetime.datetime.now()), self.f_out)
            np.random.seed()
            self.train_one_epoch(epoch)
            if epoch == 0 or epoch % 10 == 9:
                log_out('**** EVAL EPOCH %03d START****' % (epoch), self.f_out)
                current_loss = self.evaluate_one_epoch(epoch)
                log_out('**** EVAL EPOCH %03d END****' % (epoch), self.f_out)
            save_dict = {
                'epoch': epoch + 1,  # after training one epoch, the start_epoch should be epoch+1
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': loss,
            }
            # FIX: narrow the bare except — only an AttributeError (no
            # DataParallel `.module` wrapper) should fall through here.
            try:
                save_dict['model_state_dict'] = self.net.module.state_dict()
            except AttributeError:
                save_dict['model_state_dict'] = self.net.state_dict()
            torch.save(
                save_dict,
                os.path.join(self.FLAGS.log_dir, 's3dis_checkpoint.tar'))

    def run(self):
        """Optionally resume from FLAGS.checkpoint_path, then train."""
        start_epoch = 0
        checkpoint_path = self.FLAGS.checkpoint_path
        if checkpoint_path is not None and os.path.isfile(checkpoint_path):
            checkpoint = torch.load(checkpoint_path)
            self.net.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch']
            log_out(
                "-> loaded checkpoint %s (epoch: %d)" %
                (checkpoint_path, start_epoch), self.f_out)
        self.train(start_epoch)
class Logger(object):
    """Unified metrics logger: mirrors scalar/image/video/histogram data to
    TensorBoard (optional) and to per-split MetersGroup CSV/console output.

    Steps passed in are environment steps; they are multiplied by
    `action_repeat` before being written.
    """

    def __init__(self,
                 log_dir,
                 save_tb=False,
                 log_frequency=10000,
                 action_repeat=1,
                 agent='drq'):
        self._log_dir = log_dir
        self._log_frequency = log_frequency
        self._action_repeat = action_repeat
        if save_tb:
            tb_dir = os.path.join(log_dir, 'tb')
            if os.path.exists(tb_dir):
                # best-effort cleanup of a stale tb dir; removal can fail on
                # some filesystems, in which case we just log into it
                try:
                    shutil.rmtree(tb_dir)
                except OSError:
                    print("logger.py warning: Unable to remove tb directory")
            self._sw = SummaryWriter(tb_dir)
        else:
            self._sw = None
        # each agent has specific output format for training
        assert agent in AGENT_TRAIN_FORMAT
        train_format = COMMON_TRAIN_FORMAT + AGENT_TRAIN_FORMAT[agent]
        self._train_mg = MetersGroup(os.path.join(log_dir, 'train'),
                                     formating=train_format)
        self._eval_mg = MetersGroup(os.path.join(log_dir, 'eval'),
                                    formating=COMMON_EVAL_FORMAT)

    def _should_log(self, step, log_frequency):
        """True when `step` falls on the (possibly overridden) log period."""
        log_frequency = log_frequency or self._log_frequency
        return step % log_frequency == 0

    def _update_step(self, step):
        """Convert an environment step into a raw step count."""
        return step * self._action_repeat

    def _try_sw_log(self, key, value, step):
        step = self._update_step(step)
        if self._sw is not None:
            self._sw.add_scalar(key, value, step)

    def _try_sw_log_image(self, key, image, step):
        step = self._update_step(step)
        if self._sw is not None:
            assert image.dim() == 3
            grid = torchvision.utils.make_grid(image.unsqueeze(1))
            self._sw.add_image(key, grid, step)

    def _try_sw_log_video(self, key, frames, step):
        step = self._update_step(step)
        if self._sw is not None:
            frames = torch.from_numpy(np.array(frames))
            frames = frames.unsqueeze(0)
            self._sw.add_video(key, frames, step, fps=30)

    def _try_sw_log_histogram(self, key, histogram, step):
        step = self._update_step(step)
        if self._sw is not None:
            self._sw.add_histogram(key, histogram, step)

    def log(self, key, value, step, n=1, log_frequency=1):
        """Log a train/eval scalar, rate-limited by _should_log."""
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        if type(value) == torch.Tensor:
            value = value.item()
        self._try_sw_log(key, value / n, step)
        mg = self._train_mg if key.startswith('train') else self._eval_mg
        mg.log(key, value, n)

    def eval_log(self, key, value, step, n=1, log_frequency=1):
        """Same as self.log(), except we don't call self._should_log().
        In other words, we always log."""
        assert key.startswith('train') or key.startswith('eval')
        if type(value) == torch.Tensor:
            value = value.item()
        self._try_sw_log(key, value / n, step)
        mg = self._train_mg if key.startswith('train') else self._eval_mg
        mg.log(key, value, n)

    def test_log(self, key, value, step, n=1, log_frequency=1):
        """Just writes to TensorBoard. We handle CSV writing separately."""
        assert key.startswith('test')
        if type(value) == torch.Tensor:
            value = value.item()
        self._try_sw_log(key, value / n, step)

    def log_param(self, key, param, step, log_frequency=None):
        """Log weight/bias (and grad) histograms for a module."""
        if not self._should_log(step, log_frequency):
            return
        self.log_histogram(key + '_w', param.weight.data, step)
        if hasattr(param.weight, 'grad') and param.weight.grad is not None:
            self.log_histogram(key + '_w_g', param.weight.grad.data, step)
        if hasattr(param, 'bias') and hasattr(param.bias, 'data'):
            self.log_histogram(key + '_b', param.bias.data, step)
            if hasattr(param.bias, 'grad') and param.bias.grad is not None:
                self.log_histogram(key + '_b_g', param.bias.grad.data, step)

    def log_image(self, key, image, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_image(key, image, step)

    def log_video(self, key, frames, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_video(key, frames, step)

    def log_histogram(self, key, histogram, step, log_frequency=None):
        if not self._should_log(step, log_frequency):
            return
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_histogram(key, histogram, step)

    def dump(self, step, save=True, ty=None):
        """Flush accumulated meters for the chosen split(s).

        Raises ValueError for an unknown `ty`.
        """
        step = self._update_step(step)
        if ty is None:
            self._train_mg.dump(step, 'train', save)
            self._eval_mg.dump(step, 'eval', save)
        elif ty == 'eval':
            self._eval_mg.dump(step, 'eval', save)
        elif ty == 'train':
            self._train_mg.dump(step, 'train', save)
        else:
            # FIX: `raise f'...'` raised a bare string, which is itself a
            # TypeError in Python 3; raise a proper exception instead.
            raise ValueError(f'invalid log type: {ty}')