def _test_gather(self, output_device):
    inputs = (
        torch.randn(2, 4, device='cuda:0', requires_grad=True, dtype=torch.double),
        torch.randn(2, 4, device='cuda:1', requires_grad=True, dtype=torch.double),
    )
    result = dp.gather(inputs, output_device)
    self.assertEqual(result.size(), torch.Size([4, 4]))
    self.assertEqual(result[:2], inputs[0])
    self.assertEqual(result[2:], inputs[1])
    if output_device != -1:
        self.assertEqual(result.get_device(), output_device)
    else:
        self.assertFalse(result.is_cuda)
    grad = torch.randn((4, 4), dtype=torch.double)
    if output_device != -1:
        grad = grad.cuda(output_device)
    result.backward(grad)
    self.assertEqual(inputs[0].grad.data, grad[:2])
    self.assertEqual(inputs[1].grad.data, grad[2:])
    _assertGradAndGradgradChecks(
        self, lambda x, y: dp.gather((x, y), output_device), inputs)

    # test scalar inputs, should stack into a vector in this case
    inputs = (
        torch.randn((), device='cuda:0', requires_grad=True, dtype=torch.double),
        torch.randn((), device='cuda:1', requires_grad=True, dtype=torch.double),
    )
    result = dp.gather(inputs, output_device)
    self.assertEqual(result.size(), torch.Size([2]))
    self.assertEqual(result[0], inputs[0])
    self.assertEqual(result[1], inputs[1])
    if output_device != -1:
        self.assertEqual(result.get_device(), output_device)
    else:
        self.assertFalse(result.is_cuda)
    grad = torch.randn(2, dtype=torch.double)
    if output_device != -1:
        grad = grad.cuda(output_device)
    result.backward(grad)
    self.assertEqual(inputs[0].grad, grad[0])
    self.assertEqual(inputs[1].grad, grad[1])
    _assertGradAndGradgradChecks(
        self, lambda x, y: dp.gather((x, y), output_device), inputs)
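# A small illustrative sketch (not part of the test suite above) of the two gather
# behaviours the test exercises. It assumes two visible GPUs; `dp` is
# torch.nn.parallel, as in the test.
import torch
import torch.nn.parallel as dp

a = torch.randn(2, 4, device='cuda:0')
b = torch.randn(2, 4, device='cuda:1')
# Non-scalar tensors are concatenated along dim 0 on the target device.
assert dp.gather((a, b), 0).shape == torch.Size([4, 4])

s0 = torch.randn((), device='cuda:0')
s1 = torch.randn((), device='cuda:1')
# 0-dim tensors are stacked into a 1-D vector instead.
assert dp.gather((s0, s1), 0).shape == torch.Size([2])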
def calc_distill_loss(self):
    losses = []
    for i, netA in enumerate(self.netAs):
        assert isinstance(netA, SuperConv2d)
        n = self.mapping_layers[i]
        netA_replicas = replicate(netA, self.gpu_ids)
        kwargs = tuple([{'config': {'channel': netA.out_channels}} for idx in self.gpu_ids])
        Sacts = parallel_apply(
            netA_replicas,
            tuple([self.Sacts[key] for key in sorted(self.Sacts.keys()) if n in key]),
            kwargs)
        Tacts = [self.Tacts[key] for key in sorted(self.Tacts.keys()) if n in key]
        loss = [F.mse_loss(Sact, Tact) for Sact, Tact in zip(Sacts, Tacts)]
        loss = gather(loss, self.gpu_ids[0]).sum()
        setattr(self, 'loss_G_distill%d' % i, loss)
        losses.append(loss)
    return sum(losses)
def train_epoch(model, dataloader, optimizer, args):
    model.train()
    epoch_loss = 0
    for idx, (data, target) in tqdm(enumerate(dataloader)):
        data, target = data.to(args.device, non_blocking=args.non_blocking), \
            target.to(args.device, non_blocking=args.non_blocking)
        output = model(data)
        loss = args.loss(output, target)
        optimizer.zero_grad()
        if args.multi_gpu:
            with torch.no_grad():
                loss_resized = [x.unsqueeze(0) for x in loss]
                gathered_loss = gather(loss_resized, args.device)
                total_loss = torch.mean(gathered_loss)
                epoch_loss += total_loss.item()
            for item in loss:
                item.backward()
        else:
            epoch_loss += loss.item()
            loss.backward()
        optimizer.step()
    return epoch_loss / (idx + 1)
def data_parallel(f, input, params, stats, mode, device_ids, output_device=None):
    if output_device is None:
        output_device = device_ids[0]
    if len(device_ids) == 1:
        # only 1 device
        return f(input, params, stats, mode)

    # helper defined inside data_parallel
    def replicate(param_dict, g):
        replicas = [{} for d in device_ids]  # replicas: list of n_devices dicts
        for k, v in param_dict.items():  # v is a parameter
            for i, u in enumerate(g(v)):
                replicas[i][k] = u
        return replicas

    # broadcast parameters
    params_replicas = replicate(params, lambda x: Broadcast(device_ids)(x))
    # broadcast stats
    stats_replicas = replicate(stats, lambda x: comm.broadcast(x, device_ids))

    replicas = [
        lambda x, p=p, s=s, mode=mode: f(x, p, s, mode)
        for i, (p, s) in enumerate(zip(params_replicas, stats_replicas))
    ]
    inputs = scatter(input, device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
def data_parallel(f, input, params, stats, mode, device_ids, output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, stats, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j * len(params)]
                        for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]
    stats_replicas = [dict(zip(stats.keys(), p))
                      for p in comm.broadcast_coalesced(list(stats.values()), device_ids)]

    replicas = [partial(f, params=p, stats=s, mode=mode)
                for p, s in zip(params_replicas, stats_replicas)]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
def data_parallel(f, input, params, stats, mode, device_ids, output_device=None):
    if output_device is None:
        output_device = device_ids[0]
    if len(device_ids) == 1:
        return f(input, params, stats, mode)

    def replicate(param_dict, g):
        replicas = [{} for d in device_ids]
        for k, v in param_dict.items():
            for i, u in enumerate(g(v)):
                replicas[i][k] = u
        return replicas

    params_replicas = replicate(params, lambda x: Broadcast(device_ids)(x))
    stats_replicas = replicate(stats, lambda x: comm.broadcast(x, device_ids))

    replicas = [
        lambda x, p=p, s=s, mode=mode: f(x, p, s, mode)
        for i, (p, s) in enumerate(zip(params_replicas, stats_replicas))
    ]
    inputs = scatter(input, device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
def allen_data_parallel(batch_group: List[TensorDict],
                        model: Model,
                        cuda_devices: List) -> Dict[str, torch.Tensor]:
    """
    Performs a forward pass using multiple GPUs. This is a simplification
    of torch.nn.parallel.data_parallel to support the allennlp model interface.
    """
    assert len(batch_group) <= len(cuda_devices)
    moved = [
        move_to_device(batch, device)
        for batch, device in zip(batch_group, cuda_devices)
    ]
    used_device_ids = cuda_devices[:len(moved)]
    # Counterintuitively, it appears replicate expects the source device id to be
    # the first element in the device id list. See torch.cuda.comm.broadcast_coalesced,
    # which is called indirectly.
    replicas = nnP.replicate(model, used_device_ids)

    # We pass all our arguments as kwargs. Create a list of empty tuples of the
    # correct shape to serve as (non-existent) positional arguments.
    inputs = [()] * len(batch_group)
    outputs = nnP.parallel_apply(replicas, inputs, moved, used_device_ids)

    # Only the 'loss' is needed:
    # a (num_gpu,) tensor with the loss from each GPU.
    if LOSS_KEY in outputs[0]:
        result = {
            LOSS_KEY: nnP.gather([output[LOSS_KEY].unsqueeze(0) for output in outputs],
                                 target_device=used_device_ids[0],
                                 dim=0).mean()
        }
    else:
        result = {}
    for key in outputs[0]:
        if key == 'tags':
            result[key] = list(chain([output[key] for output in outputs]))
        elif key != LOSS_KEY:
            result[key] = [
                nnP.gather([output[key]], target_device=used_device_ids[0], dim=0)
                for output in outputs
            ]
    return result
def val_seg_latency(model, dataset_loader, criterion=None, num_classes=21, device='cuda'):
    model.eval()
    inter_meter = AverageMeter()
    union_meter = AverageMeter()
    batch_time = AverageMeter()
    end = time.time()
    total_batches = 100
    miou_class = MIOU(num_classes=num_classes)
    if criterion:
        losses = AverageMeter()
    with torch.no_grad():
        for i, (inputs, target) in enumerate(dataset_loader):
            inputs = inputs.to(device=device)
            target = target.to(device=device)

            end = time.time()
            outputs = model(inputs)
            batch_time.update(time.time() - end)

            if criterion:
                if device == 'cuda':
                    loss = criterion(outputs, target).mean()
                    if isinstance(outputs, (list, tuple)):
                        target_dev = outputs[0].device
                        outputs = gather(outputs, target_device=target_dev)
                else:
                    loss = criterion(outputs, target)
                losses.update(loss.item(), inputs.size(0))

            inter, union = miou_class.get_iou(outputs, target)
            inter_meter.update(inter)
            union_meter.update(union)

            if i % 10 == 0:  # print after every 10 batches
                iou = inter_meter.sum / (union_meter.sum + 1e-10)
                miou = iou.mean() * 100
                loss_ = losses.avg if criterion is not None else 0
                print_log_message(
                    "[%d/%d]\t\tBatch Time:%.4f\t\tLoss:%.4f\t\tmiou:%.4f" %
                    (i, len(dataset_loader), batch_time.avg, loss_, miou))
            if i >= total_batches:
                break

    iou = inter_meter.sum / (union_meter.sum + 1e-10)
    miou = iou.mean() * 100
    print_info_message('Mean IoU: {0:.2f}'.format(miou))
    if criterion:
        return miou, losses.avg
    else:
        return miou, 0
def test_gather_different_len_dicts(self):
    inputs = (
        {'a': torch.randn(1, 2, requires_grad=True, device="cuda:0")},
        {
            'b': torch.randn(1, 2, requires_grad=True, device="cuda:1"),
            'a': torch.randn(1, 2, requires_grad=True, device="cuda:1"),
        },
    )
    with self.assertRaises(ValueError):
        _ = dp.gather(inputs, target_device=0)
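# Sketch of the dict behaviour the test above relies on: gather works key-by-key
# when every per-GPU dict has the same keys, and raises ValueError otherwise.
# Assumes two visible GPUs; `dp` is torch.nn.parallel, as in the test.
import torch
import torch.nn.parallel as dp

same_keys = (
    {'a': torch.randn(1, 2, device='cuda:0')},
    {'a': torch.randn(1, 2, device='cuda:1')},
)
out = dp.gather(same_keys, target_device=0)
# Values under each key are concatenated along dim 0 on the target device.
assert out['a'].shape == torch.Size([2, 2])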
def train_seg(model, dataset_loader, optimizer, criterion, num_classes, epoch, device='cuda'):
    losses = AverageMeter()
    batch_time = AverageMeter()
    inter_meter = AverageMeter()
    union_meter = AverageMeter()
    end = time.time()
    model.train()
    miou_class = MIOU(num_classes=num_classes)

    for i, (inputs, target) in enumerate(dataset_loader):
        inputs = inputs.to(device=device)
        target = target.to(device=device)

        outputs = model(inputs)

        if device == 'cuda':
            loss = criterion(outputs, target).mean()
            if isinstance(outputs, (list, tuple)):
                target_dev = outputs[0].device
                outputs = gather(outputs, target_device=target_dev)
        else:
            loss = criterion(outputs, target)

        inter, union = miou_class.get_iou(outputs, target)
        inter_meter.update(inter)
        union_meter.update(union)

        losses.update(loss.item(), inputs.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:  # print after every 10 batches
            iou = inter_meter.sum / (union_meter.sum + 1e-10)
            miou = iou.mean() * 100
            print_log_message(
                "Epoch: %d[%d/%d]\t\tBatch Time:%.4f\t\tLoss:%.4f\t\tmiou:%.4f" %
                (epoch, i, len(dataset_loader), batch_time.avg, losses.avg, miou))

    iou = inter_meter.sum / (union_meter.sum + 1e-10)
    miou = iou.mean() * 100
    return miou, losses.avg
def gather(self, outputs, output_device):
    outputs = list(zip(*outputs))
    for idx, data in enumerate(outputs):
        if idx == 0:
            # gather the first return item
            outputs[idx] = gather(outputs[idx], output_device, dim=self.dim)
        else:
            # for the other items, return the items on GPU:0 (do not gather)
            outputs[idx] = outputs[idx][0]
    return outputs
def _test_gather(self, output_device):
    if not TEST_MULTIGPU:
        raise unittest.SkipTest("Only one GPU detected")
    inputs = (Variable(torch.randn(2, 4).cuda(0)),
              Variable(torch.randn(2, 4).cuda(1)))
    result = dp.gather(inputs, output_device)
    self.assertEqual(result.size(), torch.Size([4, 4]))
    self.assertEqual(result[:2], inputs[0])
    self.assertEqual(result[2:], inputs[1])
    if output_device != -1:
        self.assertEqual(result.get_device(), output_device)
    else:
        self.assertFalse(result.is_cuda)
def data_parallel(f, input, params, mode, device_ids, output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j * len(params)]
                        for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]

    replicas = [partial(f, params=p, mode=mode) for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
def data_parallel(f, input, params, mode, device_ids, output_device=None):
    device_ids = list(device_ids)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j * len(params)]
                        for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]

    replicas = [partial(f, params=p, mode=mode) for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
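# A minimal usage sketch for the functional data_parallel above, assuming two
# visible GPUs. `f_linear`, its parameter dict, and the batch are illustrative
# only (they are not part of the snippets above): the helper splits the batch
# across GPUs, broadcasts the parameters, runs the replicas in parallel, and
# gathers the outputs back on device_ids[0].
import torch
import torch.nn.functional as F

def f_linear(input, params, mode):
    # `mode` would toggle train/eval behaviour (e.g. dropout); unused here.
    return F.linear(input, params['weight'], params['bias'])

params = {
    'weight': torch.randn(8, 4, device='cuda:0', requires_grad=True),
    'bias': torch.zeros(8, device='cuda:0', requires_grad=True),
}
x = torch.randn(16, 4, device='cuda:0')
# Batch of 16 is scattered as two chunks of 8; the result is gathered on cuda:0.
y = data_parallel(f_linear, x, params, 'train', device_ids=[0, 1])
assert y.shape == (16, 8) and y.device == torch.device('cuda:0')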
def parallel_chain_loss(model, inputs, den_graph):
    """
    inputs: list of input tuples ((mfcc, inputs), supervision) on different gpus
    """
    from torch.nn.parallel import replicate, parallel_apply, gather
    model = ForwardParallelChain(model, den_graph, args)
    device_ids = list(range(torch.cuda.device_count()))
    assert len(inputs) == len(device_ids)
    output_device = device_ids[0]
    used_device_ids = device_ids[:len(inputs)]
    replicas = replicate(model, used_device_ids)
    model_kwargs = None
    outputs = parallel_apply(replicas, inputs, model_kwargs, used_device_ids)
    dim = 0
    ret = gather(outputs, output_device, dim)
    loss = ret[:, 0]
    weights = ret[:, -1]
    numerator = loss * weights
    results = ChainResults()
    results.data = ret[:, 1:].sum(dim=0)
    return numerator.sum() / weights.sum(), results
def test_epoch(model, dataloader, args):
    model.eval()
    with torch.no_grad():
        len_pred = len(dataloader) * dataloader.batch_size
        out_pred = torch.FloatTensor(len_pred, args.output_size).fill_(0).to(args.device)
        loss_val = 0
        for idx, (data, target) in enumerate(dataloader):
            data, target = data.to(args.device, non_blocking=args.non_blocking), \
                target.to(args.device, non_blocking=args.non_blocking)
            output = model(data)
            loss = args.loss(output, target)
            if args.multi_gpu:
                loss_resized = [x.unsqueeze(0) for x in loss]
                gathered_loss = gather(loss_resized, args.device)
                loss = torch.mean(gathered_loss)
            loss_val += loss.item()
            # out_pred[output.size(0)*idx:output.size(0)*(1+idx)] = output.data
        loss_mean = loss_val / (idx + 1)
    return loss_mean
def forward(self, x, label, **kwargs):
    if self.gpus is None:
        # cpu mode, normal fc layer
        x = classify(x, self.weight, label, simple_output=True, **kwargs)
        with torch.no_grad():
            acc = accuracy(x, label)
        x = F.log_softmax(x, dim=1)
        label = label.unsqueeze(-1)
        loss = torch.gather(x, 1, label)
        loss = -loss.mean()
        return loss, acc
    else:
        weight_scattered = (w.to(i) for w, i in zip(self.weights, self.gpus))
        feat_copies = [x.to(i) for i in self.gpus]
        labels_scattered = []
        for i in range(len(self.weights)):
            labels_new = label.clone()
            labels_new[(labels_new >= self.weight_idx[i + 1]) |
                       (labels_new < self.weight_idx[i])] = -1
            labels_new = labels_new - self.weight_idx[i]
            labels_scattered.append(labels_new)
        kwargs_scattered = scatter(kwargs, self.gpus)
        input_scattered = list(zip(feat_copies, weight_scattered, labels_scattered))
        modules = [classify] * len(self.weights)
        results_scattered = parallel_apply(modules, input_scattered,
                                           kwargs_scattered, self.gpus)
        logits = [i[0] for i in results_scattered]
        xexps = [i[1] for i in results_scattered]
        sums = [i[2] for i in results_scattered]
        argmaxs = [i[3] for i in results_scattered]
        maxs = [i[4] for i in results_scattered]

        sums = gather(sums, 0, dim=1)
        sums = sums.sum(dim=1, keepdim=True)
        sums_scattered = [sums.to(i) for i in self.gpus]
        loss_input_scattered = list(zip(logits, xexps, labels_scattered, sums_scattered))
        loss_results_scattered = parallel_apply([nllDistributed] * len(self.gpus),
                                                loss_input_scattered, None, self.gpus)
        loss_results_scattered = [i.sum() for i in loss_results_scattered]
        loss_results_scattered = [i.to(0) for i in loss_results_scattered]
        loss = sum(loss_results_scattered)
        loss = loss / x.shape[0]

        for i in range(len(argmaxs)):
            argmaxs[i] = argmaxs[i] + self.weight_idx[i]
        maxs = [i.to(0) for i in maxs]
        maxs = torch.stack(maxs, dim=1)
        _, max_split = torch.max(maxs, dim=1)
        idx = torch.arange(0, maxs.size(0), dtype=torch.long)
        argmaxs = [i.to(0) for i in argmaxs]
        argmaxs = torch.stack(argmaxs, dim=1)
        predicted = argmaxs[idx, max_split]
        total = label.size(0)
        predicted = predicted.cpu()
        label = label.cpu()
        correct = (predicted == label).sum().item()
        acc = correct / total
        return loss, acc
def val_seg_ue(model, dataset_loader, criterion=None, num_classes=21, device='cuda',
               use_depth=False, add_criterion=None, greenhouse_use_trav=False):
    model.eval()
    inter_meter = AverageMeter()
    union_meter = AverageMeter()
    batch_time = AverageMeter()
    end = time.time()

    miou_class = MIOU(num_classes=num_classes - 1)

    if criterion:
        losses = AverageMeter()

    with torch.no_grad():
        for i, batch in enumerate(dataset_loader):
            inputs = batch[0].to(device=device)
            target = batch[1].to(device=device)

            if use_depth:
                depth = batch[2].to(device=device)
                outputs = model(inputs, depth)
            else:
                outputs = model(inputs)

            if isinstance(outputs, OrderedDict):
                out_aux = outputs['aux']
                outputs = outputs['out']
            else:
                out_aux = outputs[1]
                outputs = outputs[0]

            outputs = outputs + 0.5 * out_aux

            if criterion:
                if device == 'cuda':
                    loss = criterion(outputs, target).mean()
                    if add_criterion is not None:
                        loss += add_criterion(inputs, outputs)
                    if isinstance(outputs, (list, tuple)):
                        target_dev = outputs[0].device
                        outputs = gather(outputs, target_device=target_dev)
                else:
                    loss = criterion(outputs, target)
                    if add_criterion is not None:
                        loss += add_criterion(inputs, outputs)
                losses.update(loss.item(), inputs.size(0))

            inter, union = miou_class.get_iou(outputs, target)
            inter_meter.update(inter)
            union_meter.update(union)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 10 == 0:  # print after every 10 batches
                iou = inter_meter.sum / (union_meter.sum + 1e-10)
                miou = iou.mean() * 100
                loss_ = losses.avg if criterion is not None else 0
                print_log_message(
                    "[%d/%d]\t\tBatch Time:%.4f\t\tLoss:%.4f\t\tmiou:%.4f" %
                    (i, len(dataset_loader), batch_time.avg, loss_, miou))

    iou = inter_meter.sum / (union_meter.sum + 1e-10)
    if greenhouse_use_trav:
        miou = iou.mean() * 100
    else:
        # miou = np.array(iou)[1, 2, 3].mean() * 100
        miou = iou[[1, 2, 3]].mean() * 100
        # miou = iou.mean() * 100
    print_info_message('Mean IoU: {0:.2f}'.format(miou))
    if criterion:
        return iou, losses.avg
    else:
        return iou, 0
def main(args):
    def log_string(str):
        logger.info(str)
        print(str)

    '''HYPER PARAMETER'''
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    '''CREATE DIR'''
    timestr = str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))
    experiment_dir = Path('./log/')
    experiment_dir.mkdir(exist_ok=True)
    experiment_dir = experiment_dir.joinpath('part_seg')
    experiment_dir.mkdir(exist_ok=True)
    if args.log_dir is None:
        experiment_dir = experiment_dir.joinpath(timestr)
    else:
        experiment_dir = experiment_dir.joinpath(args.log_dir)
    experiment_dir.mkdir(exist_ok=True)
    checkpoints_dir = experiment_dir.joinpath('checkpoints/')
    checkpoints_dir.mkdir(exist_ok=True)
    log_dir = experiment_dir.joinpath('logs/')
    log_dir.mkdir(exist_ok=True)

    '''LOG'''
    args = parse_args()
    logger = logging.getLogger("Model")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler = logging.FileHandler('%s/%s.txt' % (log_dir, args.model))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    log_string('PARAMETER ...')
    log_string(args)

    root = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/'
    file_list = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/train2.list'
    val_list = '/media/feihu/Storage/kitti_point_cloud/semantic_kitti/val2.list'

    TRAIN_DATASET = KittiDataset(root=root, file_list=file_list, npoints=args.npoint,
                                 training=True, augment=True)
    trainDataLoader = torch.utils.data.DataLoader(TRAIN_DATASET, batch_size=args.batch_size,
                                                  shuffle=True, drop_last=True, num_workers=2)
    TEST_DATASET = KittiDataset(root=root, file_list=val_list, npoints=args.npoint,
                                training=False, augment=False)
    testDataLoader = torch.utils.data.DataLoader(TEST_DATASET, batch_size=args.batch_size,
                                                 shuffle=False, drop_last=True, num_workers=2)
    log_string("The number of training data is: %d" % len(TRAIN_DATASET))
    log_string("The number of test data is: %d" % len(TEST_DATASET))
    # num_classes = 16

    '''MODEL LOADING'''
    shutil.copy('models/%s.py' % args.model, str(experiment_dir))
    shutil.copy('models/pointnet_util.py', str(experiment_dir))

    num_devices = args.num_gpus  # torch.cuda.device_count()
    # assert num_devices > 1, "Cannot detect more than 1 GPU."
    # print(num_devices)
    devices = list(range(num_devices))
    target_device = devices[0]

    # MODEL = importlib.import_module(args.model)
    net = FusionNet(args.npoint, 4, 20, nPlanes)
    # net = MODEL.get_model(num_classes, normal_channel=args.normal)
    net = net.to(target_device)

    def weights_init(m):
        classname = m.__class__.__name__
        if classname.find('Conv2d') != -1:
            if m.weight is not None:
                torch.nn.init.xavier_normal_(m.weight.data)
            if m.bias is not None:
                torch.nn.init.constant_(m.bias.data, 0.0)
        elif classname.find('Linear') != -1:
            if m.weight is not None:
                torch.nn.init.xavier_normal_(m.weight.data)
            if m.bias is not None:
                torch.nn.init.constant_(m.bias.data, 0.0)

    try:
        checkpoint = torch.load(str(experiment_dir) + '/checkpoints/best_model.pth')
        start_epoch = checkpoint['epoch']
        net.load_state_dict(checkpoint['model_state_dict'])
        log_string('Use pretrain model')
    except:
        log_string('No existing model, starting training from scratch...')
        start_epoch = 0
        net = net.apply(weights_init)

    if args.optimizer == 'Adam':
        optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate,
                                     betas=(0.9, 0.999), eps=1e-08,
                                     weight_decay=args.decay_rate)
    else:
        optimizer = torch.optim.SGD(net.parameters(), lr=1e-1, momentum=0.9,
                                    weight_decay=1e-4, nesterov=True)
        # optimizer = torch.optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9)

    def bn_momentum_adjust(m, momentum):
        if isinstance(m, torch.nn.BatchNorm2d) or isinstance(m, torch.nn.BatchNorm1d):
            m.momentum = momentum

    LEARNING_RATE_CLIP = 1e-5
    MOMENTUM_ORIGINAL = 0.1
    MOMENTUM_DECAY = 0.5
    MOMENTUM_DECAY_STEP = 20 / 2  # args.step_size

    best_acc = 0
    global_epoch = 0
    best_class_avg_iou = 0
    best_instance_avg_iou = 0

    # criterion = MODEL.get_loss()
    criterion = nn.CrossEntropyLoss()
    criterions = parallel.replicate(criterion, devices)
    # The raw version of the parallel_apply
    # replicas = parallel.replicate(net, devices)
    # input_coding = scn.InputLayer(dimension, torch.LongTensor(spatialSize), mode=4)

    for epoch in range(start_epoch, args.epoch):
        log_string('Epoch %d (%d/%s):' % (global_epoch + 1, epoch + 1, args.epoch))

        '''Adjust learning rate and BN momentum'''
        # lr = max(args.learning_rate * (args.lr_decay ** (epoch // args.step_size)), LEARNING_RATE_CLIP)
        # lr = args.learning_rate * math.exp((1 - epoch) * args.lr_decay)
        # log_string('Learning rate:%f' % lr)
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = lr
        mean_correct = []
        if 1:
            momentum = MOMENTUM_ORIGINAL * (MOMENTUM_DECAY ** (epoch // MOMENTUM_DECAY_STEP))
            if momentum < 0.01:
                momentum = 0.01
            print('BN momentum updated to: %f' % momentum)
            net = net.apply(lambda x: bn_momentum_adjust(x, momentum))

        '''learning one epoch'''
        net.train()
        # for iteration, data in tqdm(enumerate(trainDataLoader), total=len(trainDataLoader), smoothing=0.9):
        for iteration, data in enumerate(trainDataLoader):
            # adjust learning rate
            if iteration % 320 == 0:
                lr_count = epoch * 6 + iteration / 320
                lr = args.learning_rate * math.exp((1 - lr_count) * args.lr_decay)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                log_string('Learning rate:%f' % lr)
            optimizer.zero_grad()
            if iteration > 1920:
                break

            points, target, ins, mask = data
            # print(torch.max(points[:, :, :3], 1)[0])
            # print(torch.min(points[:, :, :3], 1)[0])
            valid = mask > 0
            total_points = valid.sum()
            orgs = points
            points = points.data.numpy()
            # print(total_points)
            inputs, targets, masks = [], [], []
            coords = []
            for i in range(num_devices):
                start = int(i * (args.batch_size / num_devices))
                end = int((i + 1) * (args.batch_size / num_devices))
                batch = provider.transform_for_sparse(
                    points[start:end, :, :3], points[start:end, :, 3:],
                    target[start:end, :].data.numpy(),
                    mask[start:end, :].data.numpy(), scale, spatialSize)
                batch['x'][1] = batch['x'][1].type(torch.FloatTensor)
                batch['x'][0] = batch['x'][0].type(torch.IntTensor)
                batch['y'] = batch['y'].type(torch.LongTensor)
                org_xyz = orgs[start:end, :, :3].transpose(1, 2).contiguous()
                org_feas = orgs[start:end, :, 3:].transpose(1, 2).contiguous()
                label = Variable(batch['y'], requires_grad=False)
                maski = batch['mask'].type(torch.IntTensor)
                # print(torch.max(batch['x'][0], 0)[0])
                # print(torch.min(batch['x'][0], 0)[0])
                # locs, feas = input_layer(batch['x'][0].to(devices[i]), batch['x'][1].to(devices[i]))
                locs, feas = input_layer(batch['x'][0].cuda(), batch['x'][1].cuda())
                # print(locs.size(), feas.size(), batch['x'][0].size())
                # print(inputi.size(), batch['x'][1].size())
                with torch.cuda.device(devices[i]):
                    org_coords = batch['x'][0].to(devices[i])
                    inputi = ME.SparseTensor(feas.cpu(), locs).to(devices[i])  # input_coding(batch['x'])
                    org_xyz = org_xyz.to(devices[i])
                    org_feas = org_feas.to(devices[i])
                    maski = maski.to(devices[i])
                    inputs.append([inputi, org_coords, org_xyz, org_feas, maski])
                    targets.append(label.to(devices[i]))
                    # masks.append(maski.contiguous().to(devices[i]))

            replicas = parallel.replicate(net, devices)
            predictions = parallel.parallel_apply(replicas, inputs, devices=devices)

            count = 0
            # print("end ...")
            results = []
            labels = []
            match = 0
            for i in range(num_devices):
                # temp = predictions[i]['output1'].F  # .view(-1, num_classes)
                temp = predictions[i]
                # temp = output_layer(locs, predictions[i]['output1'].F, coords[i])
                temp = temp[targets[i] > 0, :]
                results.append(temp)
                temp = targets[i]
                temp = temp[targets[i] > 0]
                labels.append(temp)
                # print(prediction2[i].size(), prediction1[i].size(), targets[i].size())
                outputi = results[i]  # prediction2[i].contiguous().view(-1, num_classes)
                num_points = labels[i].size(0)
                count += num_points
                _, pred_choice = outputi.data.max(1)  # [1]
                # print(pred_choice)
                correct = pred_choice.eq(labels[i].data).cpu().sum()
                match += correct.item()
                mean_correct.append(correct.item() / num_points)

            # print(prediction2, labels)
            losses = parallel.parallel_apply(criterions, tuple(zip(results, labels)),
                                             devices=devices)
            loss = parallel.gather(losses, target_device, dim=0).mean()
            loss.backward()
            optimizer.step()
            # assert(count1 == count2 and total_points == count1)
            log_string(
                "===> Epoch[{}]({}/{}) Valid points:{}/{} Loss: {:.4f} Accuracy: {:.4f}"
                .format(epoch, iteration, len(trainDataLoader), count, total_points,
                        loss.item(), match / count))
            # sys.stdout.flush()

        train_instance_acc = np.mean(mean_correct)
        log_string('Train accuracy is: %.5f' % train_instance_acc)
        # continue

        with torch.no_grad():
            net.eval()
            evaluator = iouEval(num_classes, ignore)
            evaluator.reset()
            for iteration, (points, target, ins, mask) in tqdm(enumerate(testDataLoader),
                                                               total=len(testDataLoader),
                                                               smoothing=0.9):
                cur_batch_size, NUM_POINT, _ = points.size()
                # points, label, target, mask = points.float().cuda(), label.long().cuda(), target.long().cuda(), mask.float().cuda()
                if iteration > 192:
                    break
                if 0:
                    points = points.data.numpy()
                    points[:, :, 0:3], norm = provider.pc_normalize(points[:, :, :3],
                                                                    mask.data.numpy())
                    points = torch.Tensor(points)
                orgs = points
                points = points.data.numpy()
                inputs, targets, masks = [], [], []
                coords = []
                for i in range(num_devices):
                    start = int(i * (cur_batch_size / num_devices))
                    end = int((i + 1) * (cur_batch_size / num_devices))
                    batch = provider.transform_for_test(
                        points[start:end, :, :3], points[start:end, :, 3:],
                        target[start:end, :].data.numpy(),
                        mask[start:end, :].data.numpy(), scale, spatialSize)
                    batch['x'][1] = batch['x'][1].type(torch.FloatTensor)
                    batch['x'][0] = batch['x'][0].type(torch.IntTensor)
                    batch['y'] = batch['y'].type(torch.LongTensor)
                    org_xyz = orgs[start:end, :, :3].transpose(1, 2).contiguous()
                    org_feas = orgs[start:end, :, 3:].transpose(1, 2).contiguous()
                    label = Variable(batch['y'], requires_grad=False)
                    maski = batch['mask'].type(torch.IntTensor)
                    locs, feas = input_layer(batch['x'][0].cuda(), batch['x'][1].cuda())
                    # print(locs.size(), feas.size(), batch['x'][0].size())
                    # print(inputi.size(), batch['x'][1].size())
                    with torch.cuda.device(devices[i]):
                        org_coords = batch['x'][0].to(devices[i])
                        inputi = ME.SparseTensor(feas.cpu(), locs).to(devices[i])  # input_coding(batch['x'])
                        org_xyz = org_xyz.to(devices[i])
                        org_feas = org_feas.to(devices[i])
                        maski = maski.to(devices[i])
                        inputs.append([inputi, org_coords, org_xyz, org_feas, maski])
                        targets.append(label.to(devices[i]))
                        # masks.append(maski.contiguous().to(devices[i]))

                replicas = parallel.replicate(net, devices)
                outputs = parallel.parallel_apply(replicas, inputs, devices=devices)

                # net = net.eval()
                # seg_pred = classifier(points, to_categorical(label, num_classes))
                seg_pred = outputs[0].cpu()
                # mask = masks[0].cpu()
                target = targets[0].cpu()
                loc = locs[0].cpu()
                for i in range(1, num_devices):
                    seg_pred = torch.cat((seg_pred, outputs[i].cpu()), 0)
                    # mask = torch.cat((mask, masks[i].cpu()), 0)
                    target = torch.cat((target, targets[i].cpu()), 0)
                seg_pred = seg_pred[target > 0, :]
                target = target[target > 0]
                _, seg_pred = seg_pred.data.max(1)  # [1]
                target = target.data.numpy()
                evaluator.addBatch(seg_pred, target)

            # when I am done, print the evaluation
            m_accuracy = evaluator.getacc()
            m_jaccard, class_jaccard = evaluator.getIoU()
            log_string('Validation set:\n'
                       'Acc avg {m_accuracy:.3f}\n'
                       'IoU avg {m_jaccard:.3f}'.format(m_accuracy=m_accuracy,
                                                        m_jaccard=m_jaccard))
            # print also classwise
            for i, jacc in enumerate(class_jaccard):
                if i not in ignore:
                    log_string('IoU class {i:} [{class_str:}] = {jacc:.3f}'.format(
                        i=i, class_str=class_strings[class_inv_remap[i]], jacc=jacc))
            log_string('Epoch %d test Accuracy: %f mean avg mIOU: %f' %
                       (epoch + 1, m_accuracy, m_jaccard))

            if m_jaccard >= best_class_avg_iou:
                # logger.info('Save model...')
                log_string('Saving model...')
                savepath = str(checkpoints_dir) + '/best_model.pth'
                log_string('Saving at %s' % savepath)
                state = {
                    'epoch': epoch,
                    'train_acc': train_instance_acc,
                    'test_acc': m_accuracy,
                    'class_avg_iou': m_jaccard,
                    'model_state_dict': net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }
                torch.save(state, savepath)
                # log_string('Saving model....')

            if m_accuracy > best_acc:
                best_acc = m_accuracy
            if m_jaccard > best_class_avg_iou:
                best_class_avg_iou = m_jaccard
            log_string('Best accuracy is: %.5f' % best_acc)
            log_string('Best class avg mIOU is: %.5f' % best_class_avg_iou)
        global_epoch += 1
def val_seg_cls(model, dataset_loader, criterion_seg=None, criterion_cls=None,
                num_classes=21, cls_loss_weight=1.0, device='cuda', use_depth=False):
    model.eval()
    inter_meter = AverageMeter()
    union_meter = AverageMeter()
    batch_time = AverageMeter()
    end = time.time()
    miou_class = MIOU(num_classes=num_classes)

    if criterion_seg:
        losses = AverageMeter()
        cls_losses = AverageMeter()
        seg_losses = AverageMeter()

    with torch.no_grad():
        for i, batch in enumerate(dataset_loader):
            inputs = batch[0].to(device=device)
            target = batch[1].to(device=device)

            if use_depth:
                depth = batch[2].to(device=device)
                outputs_seg, outputs_cls = model(inputs, depth)
            else:
                outputs_seg, outputs_cls = model(inputs)

            cls_ids = batch[3].to(device=device)

            if criterion_seg and criterion_cls:
                if device == 'cuda':
                    loss_seg = criterion_seg(outputs_seg, target).mean()
                    loss_cls = criterion_cls(outputs_cls, cls_ids).mean()
                    loss = loss_seg + cls_loss_weight * loss_cls
                    if isinstance(outputs_seg, (list, tuple)):
                        target_dev = outputs_seg[0].device
                        outputs_seg = gather(outputs_seg, target_device=target_dev)
                else:
                    loss_seg = criterion_seg(outputs_seg, target)
                    loss_cls = criterion_cls(outputs_cls, cls_ids)
                    loss = loss_seg + cls_loss_weight * loss_cls
                losses.update(loss.item(), inputs.size(0))
                seg_losses.update(loss_seg.item(), inputs.size(0))
                cls_losses.update(loss_cls.item(), inputs.size(0))

            inter, union = miou_class.get_iou(outputs_seg, target)
            inter_meter.update(inter)
            union_meter.update(union)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 10 == 0:  # print after every 10 batches
                iou = inter_meter.sum / (union_meter.sum + 1e-10)
                miou = iou.mean() * 100
                loss_ = losses.avg if criterion_seg is not None else 0
                print_log_message(
                    "[%d/%d]\t\tBatch Time:%.4f\t\tLoss:%.4f\t\tmiou:%.4f" %
                    (i, len(dataset_loader), batch_time.avg, loss_, miou))

    iou = inter_meter.sum / (union_meter.sum + 1e-10)
    miou = iou.mean() * 100
    print_info_message('Mean IoU: {0:.2f}'.format(miou))
    if criterion_seg and criterion_cls:
        return miou, losses.avg, seg_losses.avg, cls_losses.avg
    else:
        return miou, 0, 0, 0
def train(pipeline_model, data_loader, val_data_loader, config):
    # Set up the train flag for batch normalization
    pipeline_model.train()

    num_devices = torch.cuda.device_count()
    num_devices = min(config.max_ngpu, num_devices)
    devices = list(range(num_devices))
    target_device = devices[0]
    pipeline_model.to(target_device)
    if num_devices > 1:
        pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
            pipeline_model, devices)

    # Configuration
    writer = SummaryWriter(logdir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    meters = collections.defaultdict(AverageMeter)
    hists = pipeline_model.initialize_hists()

    optimizer = pipeline_model.initialize_optimizer(config)
    scheduler = pipeline_model.initialize_scheduler(optimizer, config)

    # Train the network
    logging.info('===> Start training')
    best_val, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        if osp.isfile(config.resume):
            logging.info("=> loading checkpoint '{}'".format(config.resume))
            state = torch.load(config.resume)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            pipeline_model.load_state_dict(state['state_dict'])
            if config.resume_optimizer:
                curr_iter = state['iteration'] + 1
                scheduler = pipeline_model.initialize_scheduler(optimizer, config,
                                                                last_step=curr_iter)
                pipeline_model.load_optimizer(optimizer, state['optimizer'])
            if 'best_val' in state:
                best_val = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                config.resume, state['epoch']))
        else:
            logging.info("=> no checkpoint found at '{}'".format(config.resume))

    data_iter = data_loader.__iter__()
    while is_training:
        for iteration in range(len(data_loader)):
            pipeline_model.reset_gradient(optimizer)
            iter_timer.tic()

            pipelines = parallel.replicate(pipeline_model, devices)

            # Get training data
            data_timer.tic()
            inputs = []
            for pipeline, device in zip(pipelines, devices):
                with torch.cuda.device(device):
                    while True:
                        datum = pipeline.load_datum(data_iter, has_gt=True)
                        num_boxes = sum(box.shape[0] for box in datum['bboxes_coords'])
                        if config.skip_empty_boxes and num_boxes == 0:
                            continue
                        break
                    inputs.append(datum)
            data_time_avg.update(data_timer.toc(False))

            outputs = parallel.parallel_apply(pipelines, [(x, True) for x in inputs],
                                              devices=devices)
            losses = parallel.parallel_apply(
                [pipeline.loss for pipeline in pipelines],
                tuple(zip(inputs, outputs)), devices=devices)
            losses = parallel.gather(losses, target_device)
            losses = dict([(k, v.mean()) for k, v in losses.items()])

            meters, hists = pipeline_model.update_meters(meters, hists, losses)

            # Compute and accumulate gradient
            losses['loss'].backward()

            # Update number of steps
            pipeline_model.step_optimizer(losses, optimizer, scheduler, iteration)

            iter_time_avg.update(iter_timer.toc(False))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(['{:.3e}'.format(x) for x in scheduler['default'].get_lr()])
                debug_str = "===> Epoch[{}]({}/{}): LR: {}\n".format(
                    epoch, curr_iter, len(data_loader), lrs)
                debug_str += log_meters(meters, log_perclass_meters=False)
                debug_str += f"\n    data time: {data_time_avg.avg:.3f}"
                debug_str += f"    iter time: {iter_time_avg.avg:.3f}"
                logging.info(debug_str)

                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()

                # Write logs
                update_writer(writer, meters, curr_iter, 'training')
                writer.add_scalar('training/learning_rate',
                                  scheduler['default'].get_lr()[0], curr_iter)

                # Reset meters
                reset_meters(meters, hists)

            # Save current status; save before val to prevent occasional mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(pipeline_model, optimizer, epoch, curr_iter, config,
                           best_val, best_val_iter)

            if config.heldout_save_freq > 0 and curr_iter % config.heldout_save_freq == 0:
                checkpoint(pipeline_model, optimizer, epoch, curr_iter, config,
                           best_val, best_val_iter, heldout_save=True)

            # Validation
            if curr_iter % config.val_freq == 0:
                if num_devices > 1:
                    unconvert_sync_batchnorm(pipeline_model)
                best_val, best_val_iter = validate(pipeline_model, val_data_loader,
                                                   config, writer, curr_iter, best_val,
                                                   best_val_iter, optimizer, epoch)
                if num_devices > 1:
                    pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
                        pipeline_model, devices)

            if curr_iter % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            # End of iteration
            curr_iter += 1

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    if num_devices > 1:
        unconvert_sync_batchnorm(pipeline_model)
    validate(pipeline_model, val_data_loader, config, writer, curr_iter, best_val,
             best_val_iter, optimizer, epoch)
    if num_devices > 1:
        pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
            pipeline_model, devices)
    checkpoint(pipeline_model, optimizer, epoch, curr_iter, config, best_val,
               best_val_iter)
def gather(self, outputs, output_device):
    return gather(outputs, output_device, dim=self.gather_dim)
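# A minimal sketch of how a gather override like the one above is typically wired
# into a torch.nn.DataParallel subclass. The enclosing class is not shown in the
# snippet, so this subclass and its `gather_dim` attribute are assumptions that
# mirror the method; nn.DataParallel.forward calls self.gather() on the per-GPU
# outputs, which is why overriding it changes the concatenation dimension.
import torch
from torch import nn
from torch.nn.parallel import gather

class DataParallelDim(nn.DataParallel):
    def __init__(self, module, gather_dim=0, **kwargs):
        super().__init__(module, **kwargs)
        self.gather_dim = gather_dim

    def gather(self, outputs, output_device):
        # Concatenate the per-GPU outputs along `gather_dim` on `output_device`.
        return gather(outputs, output_device, dim=self.gather_dim)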
# Get new data
inputs, all_labels = [], []
for i in range(num_devices):
    coordinates, features, labels = generate_input(config.file_name, voxel_size=0.05)
    with torch.cuda.device(devices[i]):
        inputs.append(ME.SparseTensor(features - 0.5, coords=coordinates).to(devices[i]))
        all_labels.append(labels.long().to(devices[i]))

# The raw version of the parallel_apply
st = time()
replicas = parallel.replicate(net, devices)
outputs = parallel.parallel_apply(replicas, inputs, devices=devices)

# Extract features from the sparse tensors to use a pytorch criterion
out_features = [output.F for output in outputs]
losses = parallel.parallel_apply(criterions, tuple(zip(out_features, all_labels)),
                                 devices=devices)
loss = parallel.gather(losses, target_device, dim=0).mean()
t = time() - st
min_time = min(t, min_time)

print('Iteration: ', iteration, ', Loss: ', loss.item(),
      ', Time: ', t, ', Min time: ', min_time)

# Gradient
loss.backward()
optimizer.step()
def train_seg_cls(model, dataset_loader, optimizer, criterion_seg, num_classes, epoch,
                  criterion_cls, cls_loss_weight=1.0, device='cuda', use_depth=False):
    losses = AverageMeter()
    cls_losses = AverageMeter()
    seg_losses = AverageMeter()
    batch_time = AverageMeter()
    inter_meter = AverageMeter()
    union_meter = AverageMeter()
    end = time.time()
    model.train()
    miou_class = MIOU(num_classes=num_classes)

    for i, batch in enumerate(dataset_loader):
        inputs = batch[0].to(device=device)
        target = batch[1].to(device=device)

        if use_depth:
            depth = batch[2].to(device=device)
            outputs_seg, outputs_cls = model(inputs, depth)
        else:
            outputs_seg, outputs_cls = model(inputs)

        cls_ids = batch[3].to(device=device)

        if device == 'cuda':
            loss_seg = criterion_seg(outputs_seg, target).mean()
            loss_cls = criterion_cls(outputs_cls, cls_ids).mean()
            loss = loss_seg + cls_loss_weight * loss_cls
            if isinstance(outputs_seg, (list, tuple)):
                target_dev = outputs_seg[0].device
                outputs_seg = gather(outputs_seg, target_device=target_dev)
        else:
            loss_seg = criterion_seg(outputs_seg, target)
            loss_cls = criterion_cls(outputs_cls, cls_ids)
            loss = loss_seg + cls_loss_weight * loss_cls

        inter, union = miou_class.get_iou(outputs_seg, target)
        inter_meter.update(inter)
        union_meter.update(union)

        losses.update(loss.item(), inputs.size(0))
        seg_losses.update(loss_seg.item(), inputs.size(0))
        cls_losses.update(loss_cls.item(), inputs.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:  # print after every 10 batches
            iou = inter_meter.sum / (union_meter.sum + 1e-10)
            miou = iou.mean() * 100
            print_log_message(
                "Epoch: %d[%d/%d]\t\tBatch Time:%.4f\t\tLoss:%.4f\t\tmiou:%.4f" %
                (epoch, i, len(dataset_loader), batch_time.avg, losses.avg, miou))

    iou = inter_meter.sum / (union_meter.sum + 1e-10)
    miou = iou.mean() * 100
    return miou, losses.avg, seg_losses.avg, cls_losses.avg
def train_seg_ue(model, dataset_loader, optimizer, criterion, num_classes, epoch,
                 device='cuda', use_depth=False, add_criterion=None, weight=1.0,
                 greenhouse_use_trav=False):
    losses = AverageMeter()
    ce_losses = AverageMeter()
    nid_losses = AverageMeter()
    batch_time = AverageMeter()
    inter_meter = AverageMeter()
    union_meter = AverageMeter()
    end = time.time()
    model.train()

    miou_class = MIOU(num_classes=num_classes - 1)
    kld_layer = PixelwiseKLD()
    print("train_seg_ue()")
    b = 0.015
    for i, batch in enumerate(dataset_loader):
        inputs = batch[0].to(device=device)
        target = batch[1].to(device=device)

        if use_depth:
            depth = batch[2].to(device=device)
            outputs = model(inputs, depth)
        else:
            outputs = model(inputs)

        if isinstance(outputs, OrderedDict):
            out_aux = outputs['aux']
            outputs = outputs['out']
        else:
            out_aux = outputs[1]
            outputs = outputs[0]

        kld = kld_layer(outputs, out_aux)
        outputs = outputs + 0.5 * out_aux

        if device == 'cuda':
            # print("Target size {}".format(target.size()))
            loss = criterion(outputs, target).mean()  # + kld.mean()
            if add_criterion is not None:
                loss2 = add_criterion(inputs, outputs.to(device)) * weight
                loss += loss2
            if isinstance(outputs, (list, tuple)):
                target_dev = outputs[0].device
                outputs = gather(outputs, target_device=target_dev)
        else:
            loss = criterion(outputs, target)  # + kld.mean()
            if add_criterion is not None:
                loss2 = add_criterion(inputs, outputs) * weight
                loss += loss2

        inter, union = miou_class.get_iou(outputs, target)
        inter_meter.update(inter)
        union_meter.update(union)

        loss = (loss - b).abs() + b
        losses.update(loss.item(), inputs.size(0))
        if add_criterion is not None:
            nid_losses.update(loss2.item(), 1)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:  # print after every 10 batches
            iou = inter_meter.sum / (union_meter.sum + 1e-10)
            miou = iou.mean() * 100
            print_log_message(
                "Epoch: %d[%d/%d]\t\tBatch Time:%.4f\t\tLoss:%.4f\t\tmiou:%.4f\t\tNID loss:%.4f"
                % (epoch, i, len(dataset_loader), batch_time.avg, losses.avg, miou,
                   nid_losses.avg))

    iou = inter_meter.sum / (union_meter.sum + 1e-10)
    if greenhouse_use_trav:
        miou = iou.mean() * 100
    else:
        miou = iou[[1, 2, 3]].mean() * 100
        # miou = iou.mean() * 100

    return iou, losses.avg
def parallel_tensor_dict(tensor_dicts: List[Mapping],
                         model: Model,
                         device_ids: List,
                         loss_key='loss',
                         atom_types=(str, )) -> Dict[str, torch.Tensor]:
    """
    Performs a forward pass using multiple GPUs. This is a simplification
    of torch.nn.parallel.data_parallel to support the allennlp model interface.
    """
    if len(tensor_dicts) > len(device_ids):
        raise ValueError(
            "the number of tensor dicts must not exceed the number of device ids")

    # region 1 - copy data and model to multiple GPUs
    # NOTE: there can be fewer tensor dicts than devices, in which case the number
    # of used device ids is less than the number of provided device ids
    moved = [
        move_tensor_dict_to_device(tensor_dict, device_id)
        for tensor_dict, device_id in zip(tensor_dicts, device_ids)
    ]
    used_device_ids = device_ids[:len(moved)]
    # must replicate the model to the GPUs every time, because its parameters have been updated
    replicas = nnP.replicate(model, used_device_ids)
    # endregion

    # region 2 - get the outputs
    # the outputs must be a dictionary of results returned by each GPU
    outputs = nnP.parallel_apply(
        replicas,
        [()] * len(tensor_dicts),  # no positional arguments
        moved,                     # the tensor dicts as named arguments
        used_device_ids)
    # endregion

    # region 3 - gather the results on the first GPU
    result = {}
    for k, v in outputs[0].items():
        if k == loss_key:
            # special treatment for the loss key
            result[k] = nnP.gather([output[k].unsqueeze(0) for output in outputs],
                                   target_device=used_device_ids[0],
                                   dim=0).mean()
        else:
            if isinstance(v, torch.Tensor):
                result[k] = [
                    nnP.gather([output[k]], target_device=used_device_ids[0], dim=0)
                    for output in outputs
                ]
            elif gx.iterable__(v, atom_types=atom_types):
                result[k] = tuple(chain([output[k] for output in outputs]))
            else:
                result[k] = tuple(output[k] for output in outputs)
    # endregion
    return result
def train(
        self,
        train_dataset,
        *,
        progress_bar=True,
        resume=False,
        device=None,
):
    """
    A simplified training loop::

        for epoch in range(1, ...):
            for example in train_iterator:
                model_out = self.model(example)
                review = self.model.review(example, model_out)
                review = maybe_add_loss_from_losses(review)
                review['loss'].backward()
                self.optimizer.step()
                add_review_to_tensorboardX(review)

    The remaining code takes care of calling validation and saving the result
    to tensorboard (if a validation_hook is registered), saving checkpoints,
    cleaning up checkpoints that are stale (not best according to the metric
    and not last) and displaying a progress bar.

    The code is designed so that many aspects can be customized.
    (e.g. see test_runtime_tests.py DictTrainer for a multi-model trainer)

    Args:
        train_dataset:
            A python iterable (e.g. tuple, list, ...) that can be consumed
            multiple times (i.e. not a generator). Usually it will be a
            paderbox.database.BaseIterator that is returned from a database
            in paderbox.database.
        progress_bar: flag whether to show a progress bar or not.
        resume: Whether to resume a training or start a fresh one.
        device:
            Defines the device which shall be used ('cpu', 0, 1, ...).
            If None, it selects device 0 if CUDA is available and 'cpu'
            if CUDA is not available.
    """
    if torch.cuda.is_available():
        if device is None:
            device = 0
    else:
        if device is None:
            warnings.warn(
                'CUDA is not available in this environment! The training '
                'will run on the CPU! This might be caused by a damaged '
                'installation or a version mismatch between PyTorch and '
                'your CUDA installation.')
            device = 'cpu'
        elif device != 'cpu':
            raise RuntimeError(
                'CUDA is not available in this environment, but you set '
                'device to use a GPU! This might be caused by a damaged '
                'installation or a version mismatch between PyTorch and '
                'your CUDA installation.')

    if resume:
        assert resume is True, resume
        self.load_checkpoint()
    else:
        assert not self.checkpoint_dir.exists(), \
            f'A checkpoint directory already exists. If you want to ' \
            f'restart the training set resume to True.'
        self.iteration = 0
        self.epoch = 0

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    # Change model to train mode (e.g. activate dropout)
    self.model.train()

    if isinstance(device, (tuple, list)):
        assert all([isinstance(d, int) for d in device]), device
        # multiple devices, e.g. [0, 1], [0, 1, 2, 3], ...
        # torch.nn.parallel.DataParallel moves everything to the first gpu.
        # We do the same thing here.
        self.to(device[0])
        device = list(device)
    else:
        self.to(device)
        device = [device]

    # Reset all gradients
    self.optimizer_zero_grad()

    self.writer = self.writer_cls(str(self.storage_dir))
    hooks = [*self.hooks]
    if progress_bar:
        try:
            max_it_len = len(train_dataset)
        except TypeError:
            # TypeError: object of type '...' has no len()
            max_it_len = None
        hooks.append(ProgressBarHook(self._stop_trigger, max_it_len))
    hooks = sorted(hooks, key=lambda h: h.priority, reverse=True)

    if len(device) >= 2:
        import textwrap
        print(
            'WARNING: You called padertorch.Trainer.train with multiple\n'
            + textwrap.indent(
                'devices. With this the trainer will use data parallel to\n'
                'utilize the multiple GPUs to speed up your training.\n'
                'We observed some problems with some versions of pytorch.\n'
                'In 1.4 the performance on an NN was quite bad and according to\n'
                'https://github.com/pytorch/pytorch/issues/33552\n'
                'this was because the RNNs get no gradients.\n'
                'In 1.5 the training got stuck; the reason is unclear at the moment.\n'
                'With Pytorch <= 1.3 we have not tested the code.\n'
                f'Your pytorch version is: {torch.__version__}',
                ' ' * len('WARNING: ')))
        assert self.virtual_minibatch_size % len(device) == 0, (
            self.virtual_minibatch_size, device)
        assert len(device) > 0, (self.virtual_minibatch_size, device)

    # ================ MAIN TRAINING LOOP! ===================
    try:
        train_iterable = None
        while True:
            new_epoch = False
            if train_iterable is None:
                new_epoch = True
                # Call pre_step between the epochs.
                # We call it here, so it is done before the iteration
                # over the train_dataset starts.
                for hook in hooks:
                    hook.pre_step(self)
                train_iterable = iter(train_dataset)

            optimize = True
            with self.train_timer['time_per_iteration'] as timer:
                for minibatch_index in range(self.virtual_minibatch_size // len(device)):
                    with self.train_timer['time_per_data_loading']:
                        example = list(itertools.islice(train_iterable, len(device)))
                        if len(example) == 0:
                            train_iterable = None
                            self.epoch += 1
                            if minibatch_index == 0:
                                optimize = False
                            break  # end minibatch loop

                    if new_epoch:
                        new_epoch = False
                    elif minibatch_index == 0:
                        # Call pre_step after getting the next example,
                        # to correctly detect the next epoch
                        with timer.pause():
                            for hook in hooks:
                                hook.pre_step(self)

                    if len(device) == 1:
                        assert len(example) == 1, (len(example), example)
                        example = example[0]
                        loss, example, model_output, review = \
                            self.train_step(self.model, example, device[0])

                        with timer.pause():
                            for hook in hooks:
                                hook.post_step(self, example, model_output, review)

                        # Release pytorch objects to reduce memory footprint
                        del example
                        del model_output
                        del review

                        with self.train_timer['time_per_backward']:
                            loss.backward(retain_graph=False)
                        del loss
                    else:
                        # The data parallel idea here follows the idea from
                        # torch.nn.parallel.DataParallel.
                        # We also use the same functions
                        # (i.e. replicate, parallel_apply and gather).
                        #
                        # The difference is that we need no scatter, because we
                        # simply use multiple examples, and gather must only be
                        # applied to the loss.

                        # Move copies of the model to each GPU
                        with self.train_timer['time_per_replicate']:
                            replicas = replicate(self.model, device[:len(example)])

                        # Use threads to call train_step. Each thread
                        # processes one example on one GPU.
                        with self.train_timer['time_per_parallel_apply']:
                            outputs = parallel_apply(
                                [self.train_step] * len(example),
                                list(zip(
                                    replicas,
                                    example,
                                    device[:len(example)],
                                )),
                            )
                        del replicas

                        # Take the sum of all losses. Since they are on
                        # different GPUs, use gather.
                        with self.train_timer['time_per_gather']:
                            loss = gather([
                                loss.view(1) for loss, _, _, _ in outputs
                            ], device[0]).sum()

                        with timer.pause():
                            for _, example, model_output, review in outputs:
                                for hook in hooks:
                                    hook.post_step(self, example, model_output, review)
                            # Release pytorch objects to reduce memory footprint
                            del example
                            del model_output
                            del review

                        with self.train_timer['time_per_backward']:
                            loss.backward(retain_graph=False)
                        del loss

                # Only the summary hook will use optimizer_review
                if optimize:
                    with self.train_timer['time_per_optimize']:
                        optimizer_summary = self.optimizer_step()
                        for hook in hooks:
                            hook.post_optimize(self, optimizer_summary)
                        del optimizer_summary

                    self.iteration += 1
    except StopTraining:
        pass
    finally:
        try:
            for hook in hooks:
                hook.close(self)
        except Exception:
            print('Exception in finally. May hide actual exception!!!\n'
                  'You may comment this finally block for debugging.')
            raise
        self.writer.close()
        self.writer = None
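# A minimal standalone sketch of the loss-only data-parallel pattern the trainer
# above describes: no scatter, one example per GPU, and gather applied only to
# the loss. Unlike the trainer, the model replica is called directly here and is
# assumed to return a scalar loss; `model`, `examples`, and `devices` are
# placeholders, and each example must already live on its matching device.
import torch
from torch.nn.parallel import replicate, parallel_apply, gather

def run_parallel_loss(model, examples, devices):
    devices = devices[:len(examples)]
    # One replica per GPU.
    replicas = replicate(model, devices)
    # Each replica is called as replicas[i](examples[i]) in its own thread.
    losses = parallel_apply(replicas, examples, devices=devices)
    # Gather the per-GPU scalar losses on the first device and sum them.
    return gather([loss.view(1) for loss in losses], devices[0]).sum()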