def _update_representation(self, train_loader, test_loader, optimizer, scheduler):
    prog_bar = tqdm(range(epochs))
    for _, epoch in enumerate(prog_bar):
        self._network.train()
        losses = 0.
        correct, total = 0, 0
        for i, (_, inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(self._device), targets.to(self._device)
            logits = self._network(inputs)['logits']
            onehots = target2onehot(targets, self._total_classes)

            if self._old_network is None:
                loss = F.binary_cross_entropy_with_logits(logits, onehots)
            else:
                # Distillation target: replace the old-class slots of the one-hot vector
                # with the old network's sigmoid outputs.
                old_onehots = torch.sigmoid(self._old_network(inputs)['logits'].detach())
                new_onehots = onehots.clone()
                new_onehots[:, :self._known_classes] = old_onehots
                loss = F.binary_cross_entropy_with_logits(logits, new_onehots)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses += loss.item()

            # acc
            _, preds = torch.max(logits, dim=1)
            correct += preds.eq(targets.expand_as(preds)).cpu().sum()
            total += len(targets)

        scheduler.step()
        # train_acc = self._compute_accuracy(self._network, train_loader)
        train_acc = np.around(tensor2numpy(correct) * 100 / total, decimals=2)
        test_acc = self._compute_accuracy(self._network, test_loader)
        info = 'Task {}, Epoch {}/{} => Loss {:.3f}, Train_accy {:.2f}, Test_accy {:.2f}'.format(
            self._cur_task, epoch + 1, epochs, losses / len(train_loader), train_acc, test_acc)
        prog_bar.set_description(info)
        logging.info(info)
def _update_representation(self, train_loader, test_loader, optimizer, scheduler):  # update the feature representation
    prog_bar = tqdm(range(epochs))
    for _, epoch in enumerate(prog_bar):
        self._network.train()
        losses = 0.
        for i, (_, inputs, targets) in enumerate(train_loader):  # new-class data
            inputs, targets = inputs.to(self._device), targets.to(self._device)
            logits = self._network(inputs)  # forward pass on the new-class inputs
            onehots = target2onehot(targets, self._total_classes)  # new-class labels -> one-hot

            if self._old_network is None:  # no old network: plain classification
                loss = F.binary_cross_entropy_with_logits(logits, onehots)
            else:
                old_onehots = torch.sigmoid(
                    self._old_network(inputs).detach())  # old network's predictions on the new samples (used for the distillation loss)
                new_onehots = onehots.clone()  # copy of the one-hot targets
                new_onehots[:, :self._known_classes] = old_onehots  # the ground-truth slot of the one-hot vector always lies beyond :_known_classes
                loss = F.binary_cross_entropy_with_logits(logits, new_onehots)
                # So the new network's output is trained both against the ground truth at the
                # new-class positions and against the old network's predictions at the
                # old-class positions, so that the old network's outputs are not forgotten.

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses += loss.item()

        scheduler.step()
        train_acc = self._compute_accuracy(self._network, train_loader)
        test_acc = self._compute_accuracy(self._network, test_loader)
        info = 'Task {}, Epoch {}/{} => Loss {:.3f}, Train_accy {:.3f}, Test_accy {:.3f}'.format(
            self._cur_task, epoch + 1, epochs, losses / len(train_loader), train_acc, test_acc)
        prog_bar.set_description(info)
        logging.info(info)
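# The helpers target2onehot and tensor2numpy used above are not shown in this snippet.
# A minimal sketch consistent with how they are called (an assumption, not necessarily
# the repository's exact implementation):
import torch

def target2onehot(targets, n_classes):
    # (bs,) integer labels -> (bs, n_classes) one-hot float tensor on the same device
    onehot = torch.zeros(targets.shape[0], n_classes, device=targets.device)
    onehot.scatter_(dim=1, index=targets.long().view(-1, 1), value=1.0)
    return onehot

def tensor2numpy(x):
    # detach a (possibly GPU) tensor and return it as a numpy array
    return x.detach().cpu().numpy()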
def _run(self, train_loader, test_loader, optimizer, scheduler):
    for epoch in range(1, epochs + 1):
        self._network.train()
        ce_losses = 0.
        lf_losses = 0.
        is_losses = 0.
        correct, total = 0, 0
        for i, (_, inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(self._device), targets.to(self._device)
            outputs = self._network(inputs)
            logits = outputs['logits']  # Final outputs after scaling (bs, nb_classes)
            features = outputs['features']  # Features before fc layer (bs, 64)

            ce_loss = F.cross_entropy(logits, targets)  # Cross entropy loss
            lf_loss = 0.  # Less-forget loss
            is_loss = 0.  # Inter-class separation loss
            if self._old_network is not None:
                old_outputs = self._old_network(inputs)
                old_features = old_outputs['features']  # Features before fc layer
                lf_loss = F.cosine_embedding_loss(
                    features, old_features.detach(),
                    torch.ones(inputs.shape[0]).to(self._device)) * self.lamda

                scores = outputs['new_scores']  # Scores before scaling (bs, nb_new)
                old_scores = outputs['old_scores']  # Scores before scaling (bs, nb_old)
                old_classes_mask = np.where(tensor2numpy(targets) < self._known_classes)[0]
                if len(old_classes_mask) != 0:
                    scores = scores[old_classes_mask]  # (n, nb_new)
                    old_scores = old_scores[old_classes_mask]  # (n, nb_old)

                    # Ground truth targets
                    gt_targets = targets[old_classes_mask]  # (n)
                    old_bool_onehot = target2onehot(gt_targets, self._known_classes).type(torch.bool)
                    anchor_positive = torch.masked_select(old_scores, old_bool_onehot)  # (n)
                    anchor_positive = anchor_positive.view(-1, 1).repeat(1, K)  # (n, K)

                    # Top-K hard negatives
                    anchor_hard_negative = scores.topk(K, dim=1)[0]  # (n, K)

                    is_loss = F.margin_ranking_loss(anchor_positive,
                                                    anchor_hard_negative,
                                                    torch.ones(K).to(self._device),
                                                    margin=margin)

            loss = ce_loss + lf_loss + is_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ce_losses += ce_loss.item()
            lf_losses += lf_loss.item() if self._cur_task != 0 else lf_loss
            is_losses += is_loss.item() if self._cur_task != 0 and len(old_classes_mask) != 0 else is_loss

            # acc
            _, preds = torch.max(logits, dim=1)
            correct += preds.eq(targets.expand_as(preds)).cpu().sum()
            total += len(targets)

        scheduler.step()
        # train_acc = self._compute_accuracy(self._network, train_loader)
        train_acc = np.around(tensor2numpy(correct) * 100 / total, decimals=2)
        test_acc = self._compute_accuracy(self._network, test_loader)
        info1 = 'Task {}, Epoch {}/{} => '.format(self._cur_task, epoch, epochs)
        info2 = 'CE_loss {:.3f}, LF_loss {:.3f}, IS_loss {:.3f}, Train_accy {:.2f}, Test_accy {:.2f}'.format(
            ce_losses / (i + 1), lf_losses / (i + 1), is_losses / (i + 1), train_acc, test_acc)
        logging.info(info1 + info2)
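# Toy illustration of the less-forget term used above: with a target vector of ones,
# F.cosine_embedding_loss penalizes 1 - cos(features, old_features), pulling the new
# feature directions towards the frozen old ones. Shapes and values below are made up
# for illustration only; `lamda` is set elsewhere in the trainer.
import torch
import torch.nn.functional as F

bs, dim = 4, 64
features = torch.randn(bs, dim, requires_grad=True)  # current network's features
old_features = torch.randn(bs, dim)                  # old network's features (frozen)
lamda = 5.0                                          # adaptive weight (assumed value)

lf_loss = F.cosine_embedding_loss(features, old_features.detach(),
                                  torch.ones(bs)) * lamda
lf_loss.backward()  # gradients flow only into the current network's features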
def _run(self, train_loader, test_loader, optimizer, scheduler):
    for epoch in range(1, epochs + 1):
        self._network.train()
        clf_losses = 0.  # cross entropy
        distill_losses = 0.  # distillation
        attention_losses = 0.  # attention distillation
        correct, total = 0, 0
        for i, (_, inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(self._device), targets.to(self._device)
            outputs = self._network(inputs)
            logits = outputs['logits']
            optimizer.zero_grad()  # Same effect as nn.Module.zero_grad()

            if self._old_network is None:
                clf_loss = F.cross_entropy(logits, targets)
                clf_losses += clf_loss.item()
                loss = clf_loss
            else:
                self._old_network.zero_grad()
                old_outputs = self._old_network(inputs)
                old_logits = old_outputs['logits']

                # Classification loss
                # If no old samples are saved, only compute the loss on the new logits
                clf_loss = F.cross_entropy(logits[:, self._known_classes:],
                                           targets - self._known_classes)
                clf_losses += clf_loss.item()

                # Distillation loss
                # If no old samples are saved, only distill the old logits
                '''
                distill_loss = F.binary_cross_entropy_with_logits(
                    logits[:, :self._known_classes],
                    torch.sigmoid(old_logits.detach())
                ) * distill_ratio
                '''
                distill_loss = _KD_loss(logits[:, :self._known_classes],
                                        old_logits.detach(), T=2) * distill_ratio
                distill_losses += distill_loss.item()

                # Attention distillation loss
                top_base_indices = logits[:, :self._known_classes].argmax(dim=1)
                onehot_top_base = target2onehot(top_base_indices,
                                                self._known_classes).to(self._device)
                logits[:, :self._known_classes].backward(gradient=onehot_top_base,
                                                         retain_graph=True)
                old_logits.backward(gradient=onehot_top_base)

                attention_loss = gradcam_distillation(
                    outputs['gradcam_gradients'][0],
                    old_outputs['gradcam_gradients'][0].detach(),
                    outputs['gradcam_activations'][0],
                    old_outputs['gradcam_activations'][0].detach()) * attention_ratio
                attention_losses += attention_loss.item()

                # Integration
                loss = clf_loss + distill_loss + attention_loss

                # Clear the gradients accumulated by the Grad-CAM backward passes
                self._old_network.zero_grad()
                self._network.zero_grad()

            optimizer.zero_grad()  # Same effect as nn.Module.zero_grad()
            loss.backward()
            optimizer.step()

            # acc
            _, preds = torch.max(logits, dim=1)
            correct += preds.eq(targets.expand_as(preds)).cpu().sum()
            total += len(targets)

        scheduler.step()
        # train_acc = self._compute_accuracy(self._network, train_loader)
        train_acc = np.around(tensor2numpy(correct) * 100 / total, decimals=2)
        test_acc = self._compute_accuracy(self._network, test_loader)
        info1 = 'Task {}, Epoch {}/{} => clf_loss {:.2f}, '.format(
            self._cur_task, epoch, epochs, clf_losses / (i + 1))
        info2 = 'distill_loss {:.2f}, attention_loss {:.2f}, Train_accy {:.2f}, Test_accy {:.2f}'.format(
            distill_losses / (i + 1), attention_losses / (i + 1), train_acc, test_acc)
        logging.info(info1 + info2)
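# _KD_loss above is a temperature-scaled knowledge-distillation loss. A common
# implementation consistent with the call _KD_loss(new_logits, old_logits, T=2)
# (a sketch; the repository's version may differ in reduction details):
import torch

def _KD_loss(pred, soft, T):
    # pred: current network's logits restricted to the old classes (bs, nb_old)
    # soft: detached logits of the old network (bs, nb_old); T: temperature
    pred = torch.log_softmax(pred / T, dim=1)
    soft = torch.softmax(soft / T, dim=1)
    # cross-entropy between the softened old and new distributions, averaged over the batch
    return -(soft * pred).sum(dim=1).mean()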
def _run(self, train_loader, test_loader, optimizer, scheduler):
    for epoch in range(1, epochs + 1):
        self._network.train()  # set train mode
        ce_losses = 0.
        lf_losses = 0.
        is_losses = 0.
        correct, total = 0, 0
        for i, (_, inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(self._device), targets.to(self._device)
            outputs = self._network(inputs)
            logits = outputs['logits']  # Final outputs after scaling (bs, nb_classes), |* i.e., before probs = softmax(logits)
            features = outputs['features']  # Features before fc layer (bs, 64) |* i.e., the feature vector from the feature extractor (backbone)

            ce_loss = F.cross_entropy(logits, targets)  # Cross entropy loss |* cross_entropy implicitly applies softmax, so its input is logits.
            lf_loss = 0.  # Less-forget loss
            is_loss = 0.  # Inter-class separation loss, i.e. margin ranking loss. Eq 8.
            if self._old_network is not None:
                old_outputs = self._old_network(inputs)
                old_features = old_outputs['features']  # Features before fc layer
                lf_loss = F.cosine_embedding_loss(
                    features, old_features.detach(),
                    torch.ones(inputs.shape[0]).to(self._device)) * self.lamda  # Eq 6.

                scores = outputs['new_scores']  # Scores before scaling (bs, nb_new)
                old_scores = outputs['old_scores']  # Scores before scaling (bs, nb_old)
                '''@Author:defeng 24 May 2021 (Monday)
                See Line 45 here: UCIR uses CosineincNet, and CosineincNet uses the (Split)CosineLinear layer.
                In the forward function of SplitCosineLinear (Line 93), "out" is multiplied by the scaling
                factor eta while out1/out2 are not. (The plain CosineLinear layer does not expose new/old_scores.)
                '''

                old_classes_mask = np.where(tensor2numpy(targets) < self._known_classes)[0]
                if len(old_classes_mask) != 0:
                    scores = scores[old_classes_mask]  # (n, nb_new)
                    old_scores = old_scores[old_classes_mask]  # (n, nb_old)

                    # Ground truth targets
                    gt_targets = targets[old_classes_mask]  # (n)
                    old_bool_onehot = target2onehot(gt_targets, self._known_classes).type(torch.bool)
                    anchor_positive = torch.masked_select(old_scores, old_bool_onehot)  # (n) |* i.e., select the scores of the ground-truth classes.
                    anchor_positive = anchor_positive.view(-1, 1).repeat(1, K)  # (n, K) |* i.e., <\bar{\theta}, \bar{f}(x)>
                    '''@Author:defeng
                    torch.Tensor.repeat is different from numpy.repeat. See:
                    https://pytorch.org/docs/stable/tensors.html?highlight=repeat#torch.Tensor.repeat
                    '''

                    # Top-K hard negatives
                    anchor_hard_negative = scores.topk(K, dim=1)[0]  # (n, K) |* i.e., <\bar{\theta_k}, \bar{f}(x)>

                    is_loss = F.margin_ranking_loss(anchor_positive,
                                                    anchor_hard_negative,
                                                    torch.ones(K).to(self._device),
                                                    margin=margin)
                    '''@Author:defeng
                    Here the argument "torch.ones(K).to(self._device)" of margin_ranking_loss follows the
                    parameter requirements in the PyTorch documentation (specifically, ones(K) is the variable y). See:
                    https://pytorch.org/docs/stable/generated/torch.nn.MarginRankingLoss.html#torch.nn.MarginRankingLoss
                    '''

            loss = ce_loss + lf_loss + is_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ce_losses += ce_loss.item()
            lf_losses += lf_loss.item() if self._cur_task != 0 else lf_loss
            is_losses += is_loss.item() if self._cur_task != 0 and len(old_classes_mask) != 0 else is_loss

            # acc (classification)
            _, preds = torch.max(logits, dim=1)  # preds are the indices of the max values along dim 1.
            correct += preds.eq(targets.expand_as(preds)).cpu().sum()
            total += len(targets)

        scheduler.step()
        # train_acc = self._compute_accuracy(self._network, train_loader)
        train_acc = np.around(tensor2numpy(correct) * 100 / total, decimals=2)
        test_acc = self._compute_accuracy(self._network, test_loader)
        info1 = 'Task {}, Epoch {}/{} => '.format(self._cur_task, epoch, epochs)
        info2 = 'CE_loss {:.3f}, LF_loss {:.3f}, IS_loss {:.3f}, Train_accy {:.2f}, Test_accy {:.2f}'.format(
            ce_losses / (i + 1), lf_losses / (i + 1), is_losses / (i + 1), train_acc, test_acc)
        logging.info(info1 + info2)
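# Toy illustration of the inter-class separation term used above: for samples of old
# classes, the cosine score of the ground-truth old class (anchor_positive) should exceed
# each of the K highest-scoring new classes (anchor_hard_negative) by at least `margin`.
# All shapes and numbers below are made up for illustration only.
import torch
import torch.nn.functional as F

n, nb_old, nb_new, K, margin = 3, 10, 5, 2, 0.5
old_scores = torch.randn(n, nb_old)   # cosine scores on old classes (before scaling)
new_scores = torch.randn(n, nb_new)   # cosine scores on new classes (before scaling)
gt = torch.randint(0, nb_old, (n,))   # ground-truth old-class labels

anchor_positive = old_scores.gather(1, gt.view(-1, 1)).repeat(1, K)  # (n, K)
anchor_hard_negative = new_scores.topk(K, dim=1)[0]                  # (n, K)

# y = 1 means "the first input should be ranked higher than the second"
is_loss = F.margin_ranking_loss(anchor_positive, anchor_hard_negative,
                                torch.ones(K), margin=margin)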