def train(self, t, train_loader, model, optimizer, epoch, Tasks):
    """Train for one epoch on the training set"""
    batch_time = AverageMeter()
    losses = AverageMeter()
    accuracy = AverageMeter()

    # switch to train mode
    model.train()

    world_size = dist.get_world_size()
    rank = dist.get_rank()

    end = time.time()
    batch_cnt = int(len(train_loader))
    for i, (input, target) in enumerate(train_loader):
        target = target.cuda(non_blocking=True)
        input = input.cuda()
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        output = torch.sigmoid(output)
        loss = self.criterion(output[:, Tasks[t]['subset']], target_var) / world_size

        # measure accuracy and record loss
        (accu, accus) = self.cleba_accuracy(t, output.data, target, Tasks)
        reduced_loss = loss.data.clone()
        reduced_accu = accu.clone() / world_size
        dist.all_reduce_multigpu([reduced_loss])
        dist.all_reduce_multigpu([reduced_accu])
        losses.update(reduced_loss[0], input.size(0))
        accuracy.update(reduced_accu[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        average_gradients(model)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % self.print_freq == 0 and rank == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Accuracy {accuracy.val:.3f} ({accuracy.avg:.3f})'.format(
                      epoch, i, batch_cnt,
                      batch_time=batch_time, loss=losses, accuracy=accuracy))
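# `AverageMeter` is used by the training loops in this section but is not
# defined here. A minimal sketch, assuming the conventional helper from the
# PyTorch ImageNet example; it only needs the .val / .avg attributes and the
# .update(val, n) method relied on above.
class AverageMeter(object):
    """Compute and store the current value and the running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count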
def compute_pre_param(self, t, memory_cache, epoch, Tasks):
    # if self.rank == 0:
    #     print("== BEGIN: compute grad for pre observed tasks: {task}".format(task=t))
    # end = time.time()
    self.optimizer.zero_grad()
    mem_batch_cnt = int(len(memory_cache))
    for input, target in memory_cache:
        # target = target.cuda(non_blocking=True)
        # input = input.cuda(non_blocking=True)
        # input and target are already loaded onto the GPU
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = self.model(input_var)
        output = torch.sigmoid(output)

        # loss is divided by world_size and mem_batch_cnt
        loss = self.criterion(
            output[:, Tasks[t]['subset']], target_var) / (self.world_size * mem_batch_cnt)

        # compute the gradient for each memory batch and accumulate
        loss.backward()

    average_gradients(self.model)

    # if self.rank == 0:
    #     print("== END: compute grad for pre observed task: {task} | TIME: {time}".
    #           format(task=t, time=(time.time() - end)))
    return self.model.parameters
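# `self.memory_caches[pre_t]` is only described below as a list of (input,
# target) batches already resident on the GPU. A hypothetical helper (the name
# and the max_batches parameter are illustrative, not from this repo) showing
# how such a cache could be built from a task's memory_loader:
def build_memory_cache(memory_loader, max_batches=4):
    """Preload a few memory batches onto the GPU for repeated replay."""
    cache = []
    for i, (input, target) in enumerate(memory_loader):
        if i >= max_batches:
            break
        cache.append((input.cuda(), target.cuda()))
    return cache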
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    world_size = dist.get_world_size()
    rank = dist.get_rank()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(input.cuda())
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)

        # measure accuracy and record loss
        loss = criterion(output, target_var) / world_size
        prec1 = accuracy(output.data, target, topk=(1, 1))
        reduced_loss = loss.data.clone()
        reduced_prec1 = prec1.clone() / world_size
        dist.all_reduce_multigpu([reduced_loss])
        dist.all_reduce_multigpu([reduced_prec1])
        losses.update(reduced_loss[0], input.size(0))
        top1.update(reduced_prec1[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        average_gradients(model)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0 and rank == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, i, len(train_loader),
                      batch_time=batch_time, data_time=data_time,
                      loss=losses, top1=top1))
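# `average_gradients` is called after every backward pass but is not defined in
# this section. A plausible sketch, assuming `torch.distributed` is imported as
# `dist` as in the loops above: because the loss is already divided by
# world_size before backward(), an all_reduce sum of each parameter gradient
# yields the cross-worker average.
def average_gradients(model):
    """All-reduce (sum) every parameter gradient across workers."""
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data)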
def train(self, t, train_loader, epoch, Tasks):
    """Train for one epoch on the training set"""
    batch_time = AverageMeter()
    losses = AverageMeter()
    accuracy = AverageMeter()

    # switch to train mode
    self.model.train()

    end = time.time()
    batch_cnt = int(len(train_loader))
    for i, (input, target) in enumerate(train_loader):
        # ================================================================= #
        # compute grad for data at previous tasks
        if len(self.solved_tasks) > 0:
            if self.rank == 0:
                print("====== compute grad for pre observed tasks: {tasks}".format(
                    tasks=self.solved_tasks))
            # compute grad for pre observed tasks
            for pre_t in self.solved_tasks:
                ## sample a few examples from previous tasks
                # memory_sampler = Tasks[pre_t]['memory_sampler']
                # memory_sampler.set_epoch(epoch)  # random or fixed sample?
                # memory_loader = Tasks[pre_t]['memory_loader']
                memory_cache = self.memory_caches[pre_t]  # a list of (input, target) batches already on the GPU

                ## compute gradient for the few samples from previous tasks
                if self.rank == 0:
                    print("== BEGIN: compute grad for pre observed tasks: {task}".format(task=pre_t))
                    end_pre = time.time()
                # pre_param = self.compute_pre_param(pre_t, memory_loader, epoch, Tasks)
                pre_param = self.compute_pre_param(pre_t, memory_cache, epoch, Tasks)
                if self.rank == 0:
                    print("== END: compute grad for pre observed task: {task} | TIME: {time}".format(
                        task=pre_t, time=(time.time() - end_pre)))

                ## copy previous grad to tensor
                store_grad(pre_param, self.grads, self.grad_dims, pre_t)

        # ================================================================= #
        # compute grad for data at current task
        target = target.cuda(non_blocking=True)
        input = input.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = self.model(input_var)
        output = torch.sigmoid(output)
        loss = self.criterion(output[:, Tasks[t]['subset']], target_var) / self.world_size

        # compute gradient within constraints and backprop errors
        self.optimizer.zero_grad()
        loss.backward()
        average_gradients(self.model)

        # ================================================================= #
        # check grad and get new grad
        if len(self.solved_tasks) > 0:
            if self.rank == 0:
                print("== BEGIN: check constraints; if violated, get surrogate grad.")
                end_opt = time.time()

            ## copy the gradient for the current task to a tensor and clear grad
            store_grad(self.model.parameters, self.grads, self.grad_dims, t)

            ## check whether the current step gradient violates the constraints
            indx = torch.cuda.LongTensor(self.solved_tasks)
            dotp = torch.mm(self.grads[:, t].unsqueeze(0),
                            self.grads.index_select(1, indx))
            if (dotp < 0).sum() != 0:
                violate_constr = True
            else:
                violate_constr = False

            ## use convex quadratic programming to get the surrogate grad
            if violate_constr:
                # if violated, use quadprog to get a new grad
                self.optimizer.zero_grad()
                project2cone2(self.grads[:, t].unsqueeze(1),
                              self.grads.index_select(1, indx),
                              self.margin)
                ## copy the surrogate grad back into the model's gradient parameters
                overwrite_grad(self.model.parameters, self.grads[:, t], self.grad_dims)

            if self.rank == 0:
                print("== END: violate constraints?: {vio_constr} | TIME: {time}".format(
                    vio_constr=violate_constr, time=(time.time() - end_opt)))

        # ================================================================= #
        # then do SGD step
        self.optimizer.step()

        # measure accuracy and record loss
        accu, _ = self.cleba_accuracy(t, output.data, target, Tasks)
        reduced_loss = loss.data.clone()
        reduced_accu = accu.clone() / self.world_size
        dist.all_reduce_multigpu([reduced_loss])
        dist.all_reduce_multigpu([reduced_accu])
        losses.update(reduced_loss[0], input.size(0))
        accuracy.update(reduced_accu[0], input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)

        if i % self.print_freq == 0 and self.rank == 0:
            print('Training Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Accuracy {accuracy.val:.3f} ({accuracy.avg:.3f})'.format(
                      epoch, i, batch_cnt,
                      batch_time=batch_time, loss=losses, accuracy=accuracy))
        end = time.time()
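# `store_grad`, `overwrite_grad` and `project2cone2` follow the Gradient
# Episodic Memory (GEM) recipe but are not defined in this section. The sketches
# below assume `grads` is a (sum(grad_dims) x n_tasks) tensor, `grad_dims` holds
# the number of elements of each parameter, and `pp` is the model.parameters
# method, matching how they are called above. `project2cone2` would additionally
# solve the dual quadratic program (e.g. with the quadprog package) and write
# the projected gradient back into `grads[:, t]`.
def store_grad(pp, grads, grad_dims, tid):
    """Flatten the current per-parameter gradients into column `tid` of `grads`."""
    grads[:, tid].fill_(0.0)
    cnt = 0
    for param in pp():
        if param.grad is not None:
            beg = 0 if cnt == 0 else sum(grad_dims[:cnt])
            en = sum(grad_dims[:cnt + 1])
            grads[beg:en, tid].copy_(param.grad.data.view(-1))
        cnt += 1


def overwrite_grad(pp, newgrad, grad_dims):
    """Copy a flat (possibly projected) gradient back into the parameter grads."""
    cnt = 0
    for param in pp():
        if param.grad is not None:
            beg = 0 if cnt == 0 else sum(grad_dims[:cnt])
            en = sum(grad_dims[:cnt + 1])
            param.grad.data.copy_(
                newgrad[beg:en].contiguous().view(param.grad.data.size()))
        cnt += 1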
def train(epoch, op_explore):
    """Train the model for one epoch on the training set."""
    global trainloader
    global testloader
    global net
    global criterion
    global optimizer
    global rank, world_size

    if rank == 0:
        logger.debug("Epoch: %d", epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    optimizer = op_explore

    f11 = open('/root/log', 'a+')
    f11.write('### ready to train \n')
    f11.close()

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        f11 = open('/root/log', 'a+')
        f11.write('### loop to train \n')
        f11.close()

        targets = targets.cuda(non_blocking=True)
        # inputs, targets = inputs.to(device), targets.to(device)
        input_var = torch.autograd.Variable(inputs.cuda())
        target_var = torch.autograd.Variable(targets)

        optimizer.zero_grad()
        outputs = net(input_var)
        loss = criterion(outputs, target_var) / world_size
        loss.backward()
        average_gradients(net)
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.data.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        # Because of the cost of all_reduce, we do not run it every batch to compute accuracy.
        """
        if rank == 0:
            logger.debug(
                "Loss: %.3f | Acc: %.3f%% (%d/%d)",
                train_loss / (batch_idx + 1),
                100.0 * tmp_correct / tmp_total,
                tmp_correct,
                tmp_total,
            )
        """

    reduced_total = torch.Tensor([total])
    reduced_correct = torch.Tensor([correct])
    reduced_total = reduced_total.cuda()
    reduced_correct = reduced_correct.cuda()
    dist.all_reduce(reduced_total)
    dist.all_reduce(reduced_correct)
    tmp_total = int(reduced_total[0])
    tmp_correct = int(reduced_correct[0])
    acc = 100.0 * tmp_correct / tmp_total
    return acc
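# All of the loops above assume that torch.distributed is already initialised
# and that the data loaders use a DistributedSampler. A minimal, hypothetical
# driver for the function above (the backend, init_method and epoch count are
# placeholders, not taken from this repo):
def run_distributed_training(rank, world_size, epochs=10):
    dist.init_process_group(backend='nccl', init_method='env://',
                            rank=rank, world_size=world_size)
    for epoch in range(epochs):
        # keep shuffling consistent across workers yet different every epoch
        trainloader.sampler.set_epoch(epoch)
        acc = train(epoch, optimizer)
        if rank == 0:
            print('epoch {}: train acc {:.3f}%'.format(epoch, acc))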