def validate(val_dataloader, model, configs):
    """Compute the average loss of ``model`` over ``val_dataloader``.

    Args:
        val_dataloader: Iterable yielding ``(metadatas, imgs, targets)``
            batches. ``targets`` is a mapping whose values are moved to
            ``configs.device`` in place.
        model: Network invoked as ``model(imgs)``.
        configs: Settings object. ``device``, ``distributed`` and ``gpu_idx``
            are read, plus ``world_size`` when ``distributed`` is truthy.

    Returns:
        The ``avg`` attribute of the loss ``AverageMeter`` after all batches.
    """
    loss_meter = AverageMeter('Loss', ':.4e')
    loss_fn = Compute_Loss(device=configs.device)

    # Put the model in evaluation mode; no gradients are needed below.
    model.eval()
    with torch.no_grad():
        for metadatas, imgs, targets in tqdm(val_dataloader):
            n_samples = imgs.size(0)

            # Move every target entry and the images to the compute device.
            for name, value in targets.items():
                targets[name] = value.to(configs.device, non_blocking=True)
            imgs = imgs.to(configs.device, non_blocking=True).float()

            total_loss, loss_stats = loss_fn(model(imgs), targets)

            if configs.distributed:
                batch_loss = reduce_tensor(total_loss.data, configs.world_size)
            else:
                # For torch.nn.DataParallel case: average the loss first.
                if configs.gpu_idx is None:
                    total_loss = torch.mean(total_loss)
                batch_loss = total_loss.data

            loss_meter.update(to_python_float(batch_loss), n_samples)

    return loss_meter.avg
def train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch, configs, logger, tb_writer):
    """Train ``model`` for one pass over ``train_dataloader``.

    Gradients are accumulated on every batch; the optimizer is stepped and
    its gradients zeroed whenever the global step is a multiple of
    ``configs.subdivisions``.

    Args:
        train_dataloader: Sized iterable yielding ``(metadatas, imgs, targets)``.
        model: Network invoked as ``model(imgs)``.
        optimizer: Optimizer exposing ``step()`` and ``zero_grad()``.
        lr_scheduler: Accepted for interface compatibility; per-step LR
            scheduling is commented out in this version, so it is not used.
        epoch: Epoch number; the global step assumes it starts at 1.
        configs: Settings object (``device``, ``num_epochs``, ``distributed``,
            ``gpu_idx``, ``subdivisions``, ``tensorboard_freq``,
            ``print_freq`` and, when distributed, ``world_size``).
        logger: Logger with ``info``; ``None`` disables text logging.
        tb_writer: Writer with ``add_scalars``; ``None`` disables it.
    """
    iters_per_epoch = len(train_dataloader)

    time_meter = AverageMeter('Time', ':6.3f')
    load_meter = AverageMeter('Data', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    progress = ProgressMeter(iters_per_epoch, [time_meter, load_meter, loss_meter],
                             prefix="Train - Epoch: [{}/{}]".format(epoch, configs.num_epochs))
    loss_fn = Compute_Loss(device=configs.device)

    # Global step of the first batch of this epoch.
    first_step = iters_per_epoch * (epoch - 1) + 1

    # Put the model in training mode.
    model.train()
    tick = time.time()
    for batch_idx, (metadatas, imgs, targets) in enumerate(tqdm(train_dataloader)):
        # Time spent waiting for the data loader.
        load_meter.update(time.time() - tick)

        n_samples = imgs.size(0)
        global_step = first_step + batch_idx

        for name, value in targets.items():
            targets[name] = value.to(configs.device, non_blocking=True)
        imgs = imgs.to(configs.device, non_blocking=True).float()

        total_loss, loss_stats = loss_fn(model(imgs), targets)

        # For torch.nn.DataParallel case: average the loss first.
        if not configs.distributed:
            if configs.gpu_idx is None:
                total_loss = torch.mean(total_loss)

        # Back-propagate; weights are only updated every `subdivisions` steps.
        total_loss.backward()
        if global_step % configs.subdivisions == 0:
            optimizer.step()
            optimizer.zero_grad()
            # NOTE: per-step LR scheduling (and its 'LR' TensorBoard scalar)
            # is disabled in this version of the loop.

        if configs.distributed:
            batch_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            batch_loss = total_loss.data
        loss_meter.update(to_python_float(batch_loss), n_samples)

        # Elapsed time for the whole iteration (no CUDA synchronisation).
        time_meter.update(time.time() - tick)

        if tb_writer is not None and (global_step % configs.tensorboard_freq) == 0:
            loss_stats['avg_loss'] = loss_meter.avg
            tb_writer.add_scalars('Train', loss_stats, global_step)

        if logger is not None and (global_step % configs.print_freq) == 0:
            logger.info(progress.get_message(batch_idx))

        tick = time.time()
def validate(val_dataloader, model, configs):
    """Compute the average loss of the voxel-input model over ``val_dataloader``.

    Batches whose forward pass runs out of memory are skipped (after freeing
    the CUDA cache) instead of being scored with missing or stale outputs.

    Args:
        val_dataloader: Iterable yielding ``(metadatas, targets)`` batches.
            ``metadatas`` is indexed with ``'img_path'``, ``'voxels'``,
            ``'coors'`` and ``'num_points'``; ``targets`` is a mapping whose
            values are moved to ``configs.device`` in place.
        model: Network invoked as ``model(voxels, coors, num_points)``.
        configs: Settings object. ``device``, ``distributed`` and ``gpu_idx``
            are read, plus ``world_size`` when ``distributed`` is truthy.

    Returns:
        The ``avg`` attribute of the loss ``AverageMeter`` after all batches.

    Raises:
        RuntimeError: Any forward-pass ``RuntimeError`` whose message does not
            contain "out of memory" is re-raised.
    """
    losses = AverageMeter('Loss', ':.4e')
    criterion = Compute_Loss(device=configs.device)

    # Put the model in evaluation mode; no gradients are needed below.
    model.eval()
    with torch.no_grad():
        for metadatas, targets in tqdm(val_dataloader):
            batch_size = len(metadatas['img_path'])

            for key, value in targets.items():
                targets[key] = value.to(configs.device, non_blocking=True)

            # Build the three model inputs directly on the compute device.
            voxels = torch.tensor(metadatas['voxels'], dtype=torch.float32, device=configs.device)
            coors = torch.tensor(metadatas['coors'], dtype=torch.int32, device=configs.device)
            num_points = torch.tensor(metadatas['num_points'], dtype=torch.int32, device=configs.device)

            try:
                outputs = model(voxels, coors, num_points)
            except RuntimeError as exception:
                if "out of memory" not in str(exception):
                    print('###############################3')
                    raise
                print("WARNING: out of memory")
                print('###############################3')
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
                # No outputs exist for this batch, so it cannot be scored.
                continue

            total_loss, loss_stats = criterion(outputs, targets)

            # For torch.nn.DataParallel case: average the loss first.
            if (not configs.distributed) and (configs.gpu_idx is None):
                total_loss = torch.mean(total_loss)

            if configs.distributed:
                reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
            else:
                reduced_loss = total_loss.data
            losses.update(to_python_float(reduced_loss), batch_size)

    return losses.avg
def train_epoch(model, dataloader, solver, rtm3d_loss, configs, tb_writer, epoch):
    """Train ``model`` for one epoch and report a smoothed per-component loss.

    Args:
        model: Network invoked as ``model(imgs)``.
        dataloader: Pair ``(train_dataloader, train_sampler)``. The sampler's
            ``set_epoch`` is called only when ``configs.distributed`` is truthy.
        solver: Object whose ``step(loss)`` performs the optimisation step and
            whose ``learn_rate`` is shown in the progress line.
        rtm3d_loss: Callable returning ``(loss, loss_items)``; ``loss_items``
            is unpacked into five values for display and logging.
        configs: Settings object (``SOLVER.MAX_EPOCH``, ``distributed``,
            ``is_master_node``, ``DEVICE`` and, when distributed,
            ``world_size``).
        tb_writer: Writer with ``add_scalar``; ``None`` disables logging.
        epoch: Current epoch index, used for display and the logging step.

    Returns:
        None. Returns early, before the optimisation step, as soon as a
        non-finite loss is encountered.
    """
    train_dataloader, train_sampler = dataloader
    nb = len(train_dataloader)  # number of batches
    epochs = configs.SOLVER.MAX_EPOCH
    model.train()
    if configs.distributed:
        train_sampler.set_epoch(epoch)

    if configs.is_master_node:
        print(('\n' + '%10s' * 10) % (
            'Epoch', 'gpu_mem', 'MKF', 'VFM', 'M_OFF', 'V_OFF', 'total', 'targets', 'lr', 'time'))
        pbar = tqdm.tqdm(enumerate(train_dataloader), total=nb)  # progress bar
    else:
        pbar = enumerate(train_dataloader)

    mloss = torch.zeros((5,), dtype=torch.float32, device=configs.DEVICE)
    tags = ['MKF', 'VFM', 'M_OFF', 'V_OFF', 'total']
    time1 = time.time()
    for i, (imgs, targets, paths, _) in pbar:
        imgs = imgs.to(configs.DEVICE)
        targets = targets.to(configs.DEVICE)

        pred = model(imgs)
        time2 = time.time()
        loss, loss_items = rtm3d_loss(pred, targets)
        time3 = time.time()

        if not torch.isfinite(loss):
            print('WARNING: non-finite loss, ending training ', loss_items)
            return

        # Smoothed loss: the first batch seeds it, later batches are averaged
        # 50/50 with the running value.
        mloss = (mloss + loss_items) / 2 if i else loss_items

        solver.step(loss)

        if configs.distributed:
            # The reduced value is not consumed below; the call is kept so the
            # behaviour across ranks is unchanged (presumably a collective op).
            torch_utils.reduce_tensor(loss.data, configs.world_size)

        if configs.is_master_node:
            if torch.cuda.is_available():
                # memory_cached() was renamed to memory_reserved(); fall back
                # to the old name on PyTorch builds that predate the rename.
                reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
                mem_gb = reserved_bytes() / 1E9
            else:
                mem_gb = 0
            mem = '%.3gG' % mem_gb  # (GB)
            mask = targets.get_field('mask')
            s = ('%10s' * 2 + '%10.4g' * 7 + '%10s') % (
                '%g/%g' % (epoch, epochs - 1), mem, *mloss, mask.shape[0], solver.learn_rate,
                '%.1g/%.3g' % (float(time2) - float(time1), float(time3) - float(time2)))
            pbar.set_description(s)

        # Write the smoothed loss components to TensorBoard.
        if tb_writer is not None:
            for x, tag in zip(list(mloss), tags):
                tb_writer.add_scalar('loss/' + tag, x, epoch * nb + i)

        time1 = time.time()
def train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch, configs, logger, tb_writer):
    """Train the voxel-input ``model`` for one pass over ``train_dataloader``.

    Gradients are accumulated on every batch; the optimizer is stepped and
    its gradients zeroed whenever the global step is a multiple of
    ``configs.subdivisions``. When ``configs.step_lr_in_epoch`` is truthy the
    LR scheduler is stepped right after each optimizer step.

    Args:
        train_dataloader: Sized iterable yielding ``(metadatas, targets)``.
            ``metadatas`` is indexed with ``'img_path'``, ``'voxels'``,
            ``'coors'`` and ``'num_points'``; ``targets`` is a mapping whose
            values are moved to ``configs.device`` in place.
        model: Network invoked as ``model(voxels, coors, num_points)``.
        optimizer: Optimizer exposing ``step()`` and ``zero_grad()``.
        lr_scheduler: Scheduler exposing ``step()`` and ``get_last_lr()``
            (or only ``get_lr()`` on older PyTorch versions).
        epoch: Epoch number; the global step assumes it starts at 1.
        configs: Settings object (``device``, ``num_epochs``, ``distributed``,
            ``gpu_idx``, ``subdivisions``, ``step_lr_in_epoch``,
            ``tensorboard_freq``, ``print_freq`` and, when distributed,
            ``world_size``).
        logger: Logger with ``info``; ``None`` disables text logging.
        tb_writer: Writer with ``add_scalar``/``add_scalars``; ``None``
            disables TensorBoard logging.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    progress = ProgressMeter(len(train_dataloader), [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}/{}]".format(epoch, configs.num_epochs))

    criterion = Compute_Loss(device=configs.device)
    num_iters_per_epoch = len(train_dataloader)

    # Put the model in training mode.
    model.train()
    start_time = time.time()
    for batch_idx, (metadatas, targets) in enumerate(tqdm(train_dataloader)):
        # Time spent waiting for the data loader.
        data_time.update(time.time() - start_time)

        batch_size = len(metadatas['img_path'])
        global_step = num_iters_per_epoch * (epoch - 1) + batch_idx + 1

        for key, value in targets.items():
            targets[key] = value.to(configs.device, non_blocking=True)

        # Build the three model inputs directly on the compute device.
        voxels = torch.tensor(metadatas['voxels'], dtype=torch.float32, device=configs.device)
        coors = torch.tensor(metadatas['coors'], dtype=torch.int32, device=configs.device)
        num_points = torch.tensor(metadatas['num_points'], dtype=torch.int32, device=configs.device)

        outputs = model(voxels, coors, num_points)
        total_loss, loss_stats = criterion(outputs, targets)

        # For torch.nn.DataParallel case: average the loss first.
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)

        # Back-propagate; weights are only updated every `subdivisions` steps.
        total_loss.backward()
        if global_step % configs.subdivisions == 0:
            optimizer.step()
            # zero the parameter gradients
            optimizer.zero_grad()
            # Adjust learning rate
            if configs.step_lr_in_epoch:
                lr_scheduler.step()
                if tb_writer is not None:
                    # get_lr() is not meant to be called outside step();
                    # get_last_lr() reports the LR actually in effect. Fall
                    # back only for schedulers that lack it.
                    if hasattr(lr_scheduler, 'get_last_lr'):
                        current_lr = lr_scheduler.get_last_lr()[0]
                    else:
                        current_lr = lr_scheduler.get_lr()[0]
                    tb_writer.add_scalar('LR', current_lr, global_step)

        if configs.distributed:
            reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            reduced_loss = total_loss.data
        losses.update(to_python_float(reduced_loss), batch_size)

        # Elapsed time for the whole iteration (no CUDA synchronisation).
        batch_time.update(time.time() - start_time)

        if tb_writer is not None and (global_step % configs.tensorboard_freq) == 0:
            loss_stats['avg_loss'] = losses.avg
            tb_writer.add_scalars('Train', loss_stats, global_step)

        # Log message
        if logger is not None and (global_step % configs.print_freq) == 0:
            logger.info(progress.get_message(batch_idx))

        start_time = time.time()