def __init__(self, opt, lbstokens, emb_matrix=None, cls=None):
    self.opt = opt
    self.emb_matrix = emb_matrix
    if cls is None:
        self.model = GCNClassifier(opt, lbstokens, emb_matrix=emb_matrix)
    else:
        self.model = GCNClassifier(opt, lbstokens, emb_matrix=emb_matrix, cls=cls)
    self.cls = cls
    self.rel_types = len(constant.LABEL_TO_ID)
    self.loss_matrix = torch.zeros((self.rel_types, self.rel_types), requires_grad=False)
    self.miss_matrix = torch.zeros((self.rel_types, self.rel_types), requires_grad=False)
    # self.alpha = torch.full((1,), 0.1, requires_grad=True)
    # self.beta = torch.full((1,), 0.1, requires_grad=True)
    # self.model = nn.DataParallel(GCNClassifier(opt, emb_matrix=emb_matrix), device_ids=[0, 1, 2, 3])
    # self.model.half()
    print(self.get_parameter_number(self.model))
    self.soft_criterion = SoftCrossEntropyLoss()
    self.criterion = nn.CrossEntropyLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
        self.loss_matrix = self.loss_matrix.cuda()
        self.miss_matrix = self.miss_matrix.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], [{'params': self.parameters}], opt['lr'])
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.model = Our_Model(opt, emb_matrix)
    # # pass weights per class, each class corresponds to its index
    # weights = [opt['weight_no_rel']]
    # rel_classes_weights = [opt["weight_rest"]] * 41
    # weights.extend(rel_classes_weights)
    # print("Using weights", weights)
    # assert len(weights) == 42
    # class_weights = torch.FloatTensor(weights).to("cuda")
    self.criterion = nn.CrossEntropyLoss()  # weight=class_weights
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    # print(self.parameters)
    # print(len(self.parameters))
    if opt['cuda']:
        self.model.to("cuda")
        self.criterion.to("cuda")
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, emb_matrix=None, data_dir='dataset/tacred/', init_temp=160, alpha=1, train_batch=None):
    self.opt = opt
    self.model = PositionAwareRNN(opt, emb_matrix)
    self.criterion = nn.CrossEntropyLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    scores = []
    self.data_dir = data_dir
    self.temp = init_temp
    self.alpha = alpha
    with open(os.path.join(data_dir, 'kl_prob.json'), 'r') as f:
        kl = json.load(f)
    for i in range(len(kl)):
        scores.append(kl[str(i)][0])
    self.kl = torch.Tensor(scores)
    self.kl = torch.exp(-self.kl / init_temp) * alpha
    self.kl[[i for i in range(len(kl))], [i for i in range(len(kl))]] = 0
    self.true_rel = None
    if train_batch is not None:
        self.true_rel = {}
        for batch in train_batch:
            for i in range(len(batch[0])):
                sub = batch[-1][i][0]
                obj = batch[-1][i][1]
                if (sub, obj) not in self.true_rel:
                    self.true_rel[(sub, obj)] = set()
                self.true_rel[(sub, obj)].add((batch[7][i]).item())
    self.kl = F.normalize(self.kl, p=1, dim=-1)  # normalize each row to a probability distribution?
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
        self.kl = self.kl.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'],
                                               weight_decay=opt['weight_decay'])
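# A minimal, standalone sketch (not part of the trainer above; the 3x3 matrix is made-up
# example data) of how the kl_prob.json scores are turned into a soft-label prior: a
# smaller KL divergence between two relation classes yields a larger off-diagonal weight,
# the diagonal is zeroed, and each row is L1-normalized into a distribution.
import torch
import torch.nn.functional as F

kl = torch.tensor([[0.0, 2.0, 8.0],
                   [2.0, 0.0, 4.0],
                   [8.0, 4.0, 0.0]])
init_temp, alpha = 4.0, 1.0

prior = torch.exp(-kl / init_temp) * alpha      # similar classes get weights close to alpha
diag = torch.arange(kl.size(0))
prior[diag, diag] = 0                           # a class should not smooth toward itself
prior = F.normalize(prior, p=1, dim=-1)         # each row now sums to 1
print(prior)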
def __init__(self, args, emb_matrix=None):
    self.args = args
    self.emb_matrix = emb_matrix
    self.model = RGATABSA(args, emb_matrix=emb_matrix)
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    self.model.cuda()
    self.optimizer = torch_utils.get_optimizer(args.optim, self.parameters, args.lr)
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.model = PositionAwareRNN(opt, emb_matrix)
    self.criterion = nn.CrossEntropyLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt):
    self.opt = opt
    self.model = BasicClassifier(opt)
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt):
    self.opt = opt
    self.model = BiGI(opt)
    self.criterion = nn.BCELoss()
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.model.parameters(), opt['lr'])
    self.epoch_rec_loss = []
    self.epoch_dgi_loss = []
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.emb_matrix = emb_matrix
    self.model = GCNClassifier(opt, emb_matrix=emb_matrix)
    self.sigmoid = nn.Sigmoid()
    self.criterion = nn.BCEWithLogitsLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.sigmoid.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, bert_model):
    self.opt = opt
    self.bert_model = bert_model
    self.model = BiLSTMCNN(opt, bert_model)
    self.subj_criterion = nn.BCELoss(reduction='none')
    self.obj_criterion = nn.CrossEntropyLoss(reduction='none')
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.subj_criterion.cuda()
        self.obj_criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'], opt['weight_decay'])
def __init__(self, opt, emb_matrix=None): self.opt = opt self.model = PositionAwareRNN(opt, emb_matrix) self.criterion = nn.CrossEntropyLoss() self.parameters = [p for p in self.model.parameters() if p.requires_grad] if opt["cuda"]: print("starting cuda.") self.model.cuda() self.criterion.cuda() self.optimizer = torch_utils.get_optimizer( opt["optim"], self.parameters, opt["lr"] )
def __init__(self, opt, emb_matrix=None, asp_emb_matrix=None):
    self.opt = opt
    self.emb_matrix = emb_matrix
    self.model = AspModel(opt, emb_matrix, asp_emb_matrix)
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, model_name=None):
    self.opt = opt
    self.model = BertPositionAwareRNN(opt, model_name=model_name)
    self.criterion = nn.CrossEntropyLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        print("starting cuda.")
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.classifier = SynGCN(opt, emb_matrix)
    self.decoder = Decoder(opt)
    self.criterion = nn.CrossEntropyLoss()
    self.criterion_d = nn.NLLLoss(ignore_index=constant.PAD_ID)
    self.parameters = [p for p in self.classifier.parameters() if p.requires_grad] + \
                      [p for p in self.decoder.parameters() if p.requires_grad]
    if opt['cuda']:
        self.classifier.cuda()
        self.decoder.cuda()
        self.criterion.cuda()
        self.criterion_d.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.emb_matrix = emb_matrix
    self.model = GCNClassifier(opt, emb_matrix=emb_matrix)
    self.criterion = nn.CrossEntropyLoss(weight=torch.from_numpy(np.array([1.0, 5.0])).float())
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, predictor):
    # options
    self.opt = opt
    # encoding model
    self.model = predictor
    # loss function
    self.criterion = nn.CrossEntropyLoss()
    # all parameters of the model
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    # whether to move all data to gpu
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    # initialize the optimizer
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, emb_matrix=None): """ GCN Trainer :param opt: :param emb_matrix: word embedding matrix, torch tensor """ self.opt = opt self.emb_matrix = emb_matrix self.model = GCNClassifier(opt, emb_matrix=emb_matrix) self.criterion = nn.CrossEntropyLoss() self.parameters = [ p for p in self.model.parameters() if p.requires_grad ] # only update some parameter, because we may not update some parameter self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.model = PositionAwareRNN(opt, emb_matrix)
    self.criterion = nn.CrossEntropyLoss()
    # self.attn_loss = nn.KLDivLoss(reduction='sum')
    self.attn_loss = nn.CosineSimilarity()
    self.loss_scaler = opt["loss_scaler"]
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt["cuda"]:
        print("starting cuda.")
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt["optim"], self.parameters, opt["lr"])
def __init__(self, opt, selector):
    # options
    self.opt = opt
    # encoding model
    self.model = selector
    # all parameters of the model
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    # whether to move all data to gpu
    if opt['cuda']:
        self.model.cuda()
    # initialize the optimizer
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, emb_matrix=None, ucca_embedding_matrix=None):
    self.opt = opt
    self.emb_matrix = emb_matrix
    self.ucca_embedding_matrix = ucca_embedding_matrix
    self.model = GCNClassifier(opt, emb_matrix=emb_matrix, ucca_embedding_matrix=ucca_embedding_matrix)
    self.criterion = nn.CrossEntropyLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.emb_matrix = emb_matrix
    self.model = GCNClassifier(opt, emb_matrix=emb_matrix)
    self.criterion = nn.CrossEntropyLoss(reduction="none")
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    self.crf = CRF(self.opt['num_class'], batch_first=True)
    self.bc = nn.BCELoss()
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
        self.crf.cuda()
        self.bc.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, knowledge_emb=None, word_emb=None):
    self.opt = opt
    self.knowledge_emb = knowledge_emb
    self.word_emb = word_emb
    self.model = GCNClassifier(opt, knowledge_emb=knowledge_emb, word_emb=word_emb)
    # print(self.model)
    self.criterion = nn.BCEWithLogitsLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt, emb_matrix=None, joint=False):
    self.opt = opt
    self.model = model.BLSTM_CRF(opt, emb_matrix)
    if opt['crf']:
        print("Using CRF loss...")
        self.crit = crf.CRFLoss(opt['num_class'], True)
    else:
        self.crit = loss.SequenceLoss(opt['num_class'])
    self.parameters = [p for m in (self.model, self.crit) for p in m.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.crit.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'],
                                               opt.get('momentum', 0))
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.model = PositionAwareRNN(opt, emb_matrix)
    self.criterion = nn.CrossEntropyLoss()
    # self.criterion2 = torch.nn.BCELoss(size_average=True)
    self.criterion2 = nn.CrossEntropyLoss(weight=torch.Tensor([1.0, 1.0]).cuda())
    self.criterion3 = nn.NLLLoss()
    self.mse = nn.MSELoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
    self.att_w = torch.eye(len(constant.LABEL_TO_ID)).cuda()
    # self.att_w[0][0] = 1
    self.epoch = 0
def __init__(self, opt, model, model_type='predictor'):
    self.opt = opt
    self.model_type = model_type
    self.model = model
    if model_type == 'predictor':
        self.criterion = nn.CrossEntropyLoss(reduction='none')
    elif model_type == 'pointwise':
        self.criterion = nn.BCEWithLogitsLoss()
    elif model_type == 'pairwise':
        # Only a placeholder; the pairwise trainer will NOT use this criterion.
        self.criterion = nn.BCEWithLogitsLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
def __init__(self, opt=None, vocab=None, emb_matrix=None, model_file=None):
    if model_file is not None:
        # load model, config and vocab directly from file
        self.load(model_file)
    else:
        # otherwise build model from scratch
        self.opt = opt
        # use pointer-generator
        self.model = Seq2SeqWithCopyModel(opt, emb_matrix=emb_matrix)
        self.vocab = vocab
        # by default use 0 weight for coverage loss
        self.criterion = SequenceLoss(self.vocab.size, self.opt.get('cov_alpha', 0))
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if self.opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(self.parameters, self.opt['lr'])
def __init__(self, opt, emb_matrix=None):
    self.opt = opt
    self.emb_matrix = emb_matrix
    # self.model = GCNClassifier(opt, emb_matrix=emb_matrix)
    self.model = DGAModel(opt, emb_matrix=emb_matrix)
    # self.criterion = nn.CrossEntropyLoss()
    self.alpha = []
    with open("alpha.txt", 'r') as f:
        for line in f.readlines():
            self.alpha.append(float(line.strip().split('\t')[1]))
    assert len(self.alpha) == len(constant.LABEL_TO_ID)
    self.alpha = np.array(self.alpha)
    # self.criterion = FocalLoss(len(constant.LABEL_TO_ID), size_average=True)
    self.criterion = nn.CrossEntropyLoss()
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    if opt['cuda']:
        self.model.cuda()
        self.criterion.cuda()
    self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt["lr"], opt["conv_l2"])
def build_model(self, model_kwargs):
    """Build the necessary model.

    Args:
        model_kwargs (dict): model args
    """
    # model parameters
    if model_kwargs['TYPE'] == 'vs_gcnn':
        params = [
            self.num_classes, model_kwargs['IN_CHANNELS'],
            self.num_viewgroups, model_kwargs['DROPOUT'],
            model_kwargs['LAYER_CHANNELS']
        ]
    else:
        raise ValueError("Invalid Model. Model Type should be one of %s"
                         % ', '.join(MODEL_TYPE.keys()))
    # model
    self.model = MODEL_TYPE[model_kwargs['TYPE']](*params)
    self.loss = get_loss_fn(model_kwargs['LOSS'])
    self.step_epochs = np.array([
        math.ceil(float(self.args['EPOCHS'] * x)) for x in self.args['STEP']
    ])
    # optimizer
    optimizer_args = model_kwargs['OPTIMIZER']
    self.lr = optimizer_args['LR']
    self.model.apply(weights_init)
    self.model.to(self.cuda)
    self.optimizer = get_optimizer(optimizer_args['TYPE'])(
        self.model.parameters(),
        lr=self.lr,
        weight_decay=optimizer_args['WEIGHT_DECAY'])
    if model_kwargs['PRETRAIN_NAME'] != '':
        self.load_model()
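# A hedged sketch of the model_kwargs dict that build_model above expects. The keys are
# taken from the code; the concrete values are illustrative guesses only and the accepted
# strings for 'LOSS' and OPTIMIZER 'TYPE' depend on get_loss_fn / get_optimizer.
model_kwargs = {
    'TYPE': 'vs_gcnn',                 # only model type currently handled by build_model
    'IN_CHANNELS': 3,
    'DROPOUT': 0.5,
    'LAYER_CHANNELS': [64, 128, 256],
    'LOSS': 'cross_entropy',           # resolved by get_loss_fn
    'OPTIMIZER': {
        'TYPE': 'SGD',                 # resolved by get_optimizer
        'LR': 0.1,
        'WEIGHT_DECAY': 1e-4,
    },
    'PRETRAIN_NAME': '',               # a non-empty string triggers load_model()
}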
def __init__(self, args, emb_matrix=None):
    self.args = args
    self.emb_matrix = emb_matrix
    self.model = RGATABSA(args)
    self.parameters = [p for p in self.model.parameters() if p.requires_grad]
    self.model.cuda()
    self.optimizer = torch_utils.get_optimizer(args.optim, self.parameters, args.lr, l2=1e-5)
    # Group parameters so the BERT encoder gets its own learning rate (bert_lr);
    # this optimizer overrides the one created above.
    bert_model = self.model.enc.encoder.Sent_encoder
    bert_params_dict = list(map(id, bert_model.parameters()))
    base_params = filter(lambda p: id(p) not in bert_params_dict, self.model.parameters())
    # no_decay = ["bias", "LayerNorm.weight"]
    # optimizer_grouped_parameters = [
    #     {"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.l2},
    #     {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": args.l2},
    #     {"params": base_params},
    #     {"params": bert_model.parameters(), "lr": args.bert_lr},
    # ]
    optimizer_grouped_parameters = [
        {"params": base_params},
        {"params": bert_model.parameters(), "lr": args.bert_lr},
    ]
    self.optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=args.lr, weight_decay=args.l2)
def train(self, model_pattern, model_name):
    ori_model = model_pattern(config=self)
    if self.pretrain_model is not None:
        ori_model.load_state_dict(torch.load(self.pretrain_model))
    ori_model.cuda()
    parameters = [p for p in ori_model.parameters() if p.requires_grad]
    optimizer = torch_utils.get_optimizer(self.optim, parameters, self.lr)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, self.lr_decay)
    model = nn.DataParallel(ori_model)  # parallelize the module across multiple GPUs
    BCE = nn.BCEWithLogitsLoss(reduction='none')

    if not os.path.exists(self.checkpoint_dir):
        os.mkdir(self.checkpoint_dir)

    best_auc = 0.0
    best_f1 = 0.0
    best_epoch = 0

    model.train()
    global_step = 0
    total_loss = 0
    start_time = time.time()

    def logging(s, print_=True, log_=True):
        if print_:
            print(s)
        if log_:
            with open(os.path.join("log", model_name), 'a+') as f_log:
                f_log.write(s + '\n')

    dev_score_list = []
    f1 = 0
    dev_score_list.append(f1)

    for epoch in range(self.max_epoch):
        gc.collect()
        self.acc_NA.clear()
        self.acc_not_NA.clear()
        self.acc_total.clear()
        print("epoch:{}, Learning rate:{}".format(epoch, optimizer.param_groups[0]['lr']))
        epoch_start_time = time.time()

        for no, data in enumerate(self.get_train_batch()):
            context_idxs = data['context_idxs']
            context_pos = data['context_pos']
            h_mapping = data['h_mapping']
            t_mapping = data['t_mapping']
            relation_label = data['relation_label']
            input_lengths = data['input_lengths']
            relation_multi_label = data['relation_multi_label']
            relation_mask = data['relation_mask']
            context_ner = data['context_ner']
            context_char_idxs = data['context_char_idxs']
            ht_pair_pos = data['ht_pair_pos']
            context_seg = data['context_seg']
            dis_h_2_t = ht_pair_pos + 10
            dis_t_2_h = -ht_pair_pos + 10

            torch.cuda.empty_cache()

            context_idxs = context_idxs.cuda()
            context_pos = context_pos.cuda()
            context_ner = context_ner.cuda()
            # context_char_idxs = context_char_idxs.cuda()
            # input_lengths = input_lengths.cuda()
            h_mapping = h_mapping.cuda()
            t_mapping = t_mapping.cuda()
            relation_mask = relation_mask.cuda()
            dis_h_2_t = dis_h_2_t.cuda()
            dis_t_2_h = dis_t_2_h.cuda()
            node_position = data['node_position'].cuda()
            entity_position = data['entity_position'].cuda()
            node_sent_num = data['node_sent_num'].cuda()
            all_node_num = data['all_node_num'].cuda()
            entity_num = torch.Tensor(data['entity_num']).cuda()
            # sent_num = torch.Tensor(data['sent_num']).cuda()
            sdp_pos = data['sdp_position'].cuda()
            sdp_num = torch.Tensor(data['sdp_num']).cuda()

            predict_re = model(context_idxs, context_pos, context_ner, h_mapping,
                               t_mapping, relation_mask, dis_h_2_t, dis_t_2_h,
                               context_seg, node_position, entity_position,
                               node_sent_num, all_node_num, entity_num, sdp_pos, sdp_num)

            relation_multi_label = relation_multi_label.cuda()
            loss = torch.sum(BCE(predict_re, relation_multi_label) *
                             relation_mask.unsqueeze(2)) / torch.sum(relation_mask)

            output = torch.argmax(predict_re, dim=-1)
            output = output.data.cpu().numpy()

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)  # gradient clipping
            optimizer.step()

            relation_label = relation_label.data.cpu().numpy()
            for i in range(output.shape[0]):
                for j in range(output.shape[1]):
                    label = relation_label[i][j]
                    if label < 0:
                        break
                    if label == 0:
                        self.acc_NA.add(output[i][j] == label)
                    else:
                        self.acc_not_NA.add(output[i][j] == label)
                    self.acc_total.add(output[i][j] == label)

            global_step += 1
            total_loss += loss.item()
            if global_step % self.period == 0:
                cur_loss = total_loss / self.period
                elapsed = time.time() - start_time
                logging('| epoch {:2d} | step {:4d} | ms/b {:5.2f} | train loss {:5.3f} | '
                        'NA acc: {:4.2f} | not NA acc: {:4.2f} | tot acc: {:4.2f}'.format(
                            epoch, global_step, elapsed * 1000 / self.period, cur_loss,
                            self.acc_NA.get(), self.acc_not_NA.get(), self.acc_total.get()))
                total_loss = 0
                start_time = time.time()

        if epoch > self.evaluate_epoch:
            logging('-' * 89)
            eval_start_time = time.time()
            model.eval()
            f1, f1_ig, auc, pr_x, pr_y = self.test(model, model_name)
            model.train()
            logging('| epoch {:3d} | time: {:5.2f}s'.format(epoch, time.time() - eval_start_time))
            logging('-' * 89)

            if f1 > best_f1:
                best_f1 = f1
                best_auc = auc
                best_epoch = epoch
                path = os.path.join(self.checkpoint_dir, model_name)
                torch.save(ori_model.state_dict(), path)
                logging("best f1 is: {}, epoch is: {}, save path is: {}".format(best_f1, best_epoch, path))

        if epoch > self.decay_epoch:  # and epoch < self.evaluate_epoch:
            if self.optim == 'sgd' and f1 < dev_score_list[-1]:
                self.lr *= self.lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = self.lr
            if self.optim == 'adam' and optimizer.param_groups[0]['lr'] > 1e-4:  # epoch < 30: # and f1 < dev_score_list[-1]:
                scheduler.step()

        dev_score_list.append(f1)
        print("train time for epoch {}: {}".format(epoch, time.time() - epoch_start_time))

    print("Finish training")
    print("Best epoch = {} | F1 {}, auc = {}".format(best_epoch, best_f1, best_auc))
    print("Storing best result...")
    print("Finish storing")
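# A small standalone sketch (toy shapes only) of the masked multi-label loss used in the
# train() method above: BCEWithLogitsLoss is computed per (entity pair, relation) element,
# multiplied by a mask over valid entity pairs, and normalized by the number of valid pairs.
import torch
import torch.nn as nn

BCE = nn.BCEWithLogitsLoss(reduction='none')

batch, pairs, rels = 2, 3, 4
predict_re = torch.randn(batch, pairs, rels)                        # raw logits per pair and relation
relation_multi_label = torch.randint(0, 2, (batch, pairs, rels)).float()
relation_mask = torch.tensor([[1., 1., 0.],                         # last pair of the first doc is padding
                              [1., 0., 0.]])

loss = torch.sum(BCE(predict_re, relation_multi_label) *
                 relation_mask.unsqueeze(2)) / torch.sum(relation_mask)
print(loss)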
def train(task_id, data, mnet, hnet, device, config, shared, writer, logger):
    """Train the hyper network using the task-specific loss plus a regularizer
    that should overcome catastrophic forgetting.

    :code:`loss = task_loss + beta * regularizer`.

    Args:
        task_id: The index of the task on which we train.
        data: The dataset handler.
        mnet: The model of the main network.
        hnet: The model of the hyper network. May be ``None``.
        device: Torch device (cpu or gpu).
        config: The command line arguments.
        shared (argparse.Namespace): Set of variables shared between functions.
        writer: The tensorboard summary writer.
        logger: The logger that should be used rather than the print method.
    """
    start_time = time()

    print('data: ', data)
    print('data.num_classes: ', data.num_classes)
    print('data.num_train_samples: ', data.num_train_samples)

    logger.info('Training network ...')

    mnet.train()
    if hnet is not None:
        hnet.train()

    #################
    ### Optimizer ###
    #################
    # Define the optimizers used to train main network and hypernet.
    if hnet is not None:
        theta_params = list(hnet.theta)
        if config.continue_emb_training:
            for i in range(task_id):  # for all previous task embeddings
                theta_params.append(hnet.get_task_emb(i))

        # Only for the current task embedding.
        # Important that this embedding is in a different optimizer in case
        # we use the lookahead.
        emb_optimizer = get_optimizer([hnet.get_task_emb(task_id)], config.lr,
                                      momentum=config.momentum,
                                      weight_decay=config.weight_decay,
                                      use_adam=config.use_adam,
                                      adam_beta1=config.adam_beta1,
                                      use_rmsprop=config.use_rmsprop)
    else:
        theta_params = mnet.weights
        emb_optimizer = None

    theta_optimizer = get_optimizer(theta_params, config.lr,
                                    momentum=config.momentum,
                                    weight_decay=config.weight_decay,
                                    use_adam=config.use_adam,
                                    adam_beta1=config.adam_beta1,
                                    use_rmsprop=config.use_rmsprop)

    ################################
    ### Learning rate schedulers ###
    ################################
    if config.plateau_lr_scheduler:
        assert config.epochs != -1
        # The scheduler config has been taken from here:
        # https://keras.io/examples/cifar10_resnet/
        # Note, we use 'max' instead of 'min' as we look at accuracy rather
        # than validation loss!
        plateau_scheduler_theta = optim.lr_scheduler.ReduceLROnPlateau(
            theta_optimizer, 'max', factor=np.sqrt(0.1), patience=5,
            min_lr=0.5e-6, cooldown=0)
        plateau_scheduler_emb = None
        if emb_optimizer is not None:
            plateau_scheduler_emb = optim.lr_scheduler.ReduceLROnPlateau(
                emb_optimizer, 'max', factor=np.sqrt(0.1), patience=5,
                min_lr=0.5e-6, cooldown=0)

    if config.lambda_lr_scheduler:
        assert config.epochs != -1

        def lambda_lr(epoch):
            """Multiplicative factor for the learning rate schedule.

            Computes a multiplicative factor for the initial learning rate
            based on the current epoch. This method can be used as argument
            ``lr_lambda`` of class :class:`torch.optim.lr_scheduler.LambdaLR`.

            The schedule is inspired by the ResNet CIFAR-10 schedule suggested
            here https://keras.io/examples/cifar10_resnet/.

            Args:
                epoch (int): The number of epochs.

            Returns:
                lr_scale (float32): learning rate scale
            """
            lr_scale = 1.
            if epoch > 180:
                lr_scale = 0.5e-3
            elif epoch > 160:
                lr_scale = 1e-3
            elif epoch > 120:
                lr_scale = 1e-2
            elif epoch > 80:
                lr_scale = 1e-1
            return lr_scale

        lambda_scheduler_theta = optim.lr_scheduler.LambdaLR(theta_optimizer, lambda_lr)
        lambda_scheduler_emb = None
        if emb_optimizer is not None:
            lambda_scheduler_emb = optim.lr_scheduler.LambdaLR(emb_optimizer, lambda_lr)

    ##############################
    ### Prepare CL Regularizer ###
    ##############################
    # Whether we will calculate the regularizer.
    calc_reg = task_id > 0 and not config.mnet_only and config.beta > 0 and \
        not config.train_from_scratch

    # Compute targets when the reg is activated and we are not training
    # the first task.
    if calc_reg:
        if config.online_target_computation:
            # Compute targets for the regularizer whenever they are needed.
            # -> Computationally expensive.
            targets_hypernet = None
            prev_theta = [p.detach().clone() for p in hnet.theta]
            prev_task_embs = [p.detach().clone() for p in hnet.get_task_embs()]
        else:
            # Compute targets for the regularizer once and keep them all in
            # memory -> Memory expensive.
            targets_hypernet = hreg.get_current_targets(task_id, hnet)
            prev_theta = None
            prev_task_embs = None

        # If we do not want to regularize all outputs (in a multi-head setup).
        # Note, we don't care whether output heads other than the current one
        # change.
        regged_outputs = None
        if config.cl_scenario != 2:
            # FIXME We assume here that all tasks have the same output size.
            n_y = data.num_classes
            regged_outputs = [list(range(i * n_y, (i + 1) * n_y))
                              for i in range(task_id)]

    # We need to tell the main network which batch statistics to use, in case
    # batchnorm is used and we checkpoint the batchnorm stats.
    mnet_kwargs = {}
    if mnet.batchnorm_layers is not None:
        if config.bn_distill_stats:
            raise NotImplementedError()
        elif not config.bn_no_running_stats and \
                not config.bn_no_stats_checkpointing:
            # Specify current task as condition to select correct running stats.
            mnet_kwargs['condition'] = task_id

    ######################
    ### Start training ###
    ######################
    iter_per_epoch = -1
    if config.epochs == -1:
        training_iterations = config.n_iter
    else:
        assert config.epochs > 0
        iter_per_epoch = int(np.ceil(data.num_train_samples / config.batch_size))
        training_iterations = config.epochs * iter_per_epoch

    summed_iter_runtime = 0

    for i in range(training_iterations):
        ### Evaluate network.
        # We test the network before we run the training iteration.
        # That way, we can see the initial performance of the untrained network.
        if i % config.val_iter == 0:
            test(task_id, data, mnet, hnet, device, shared, config, writer,
                 logger, train_iter=i)
            mnet.train()
            if hnet is not None:
                hnet.train()

        if i % 200 == 0:
            logger.info('Training step: %d ...' % i)

        iter_start_time = time()

        theta_optimizer.zero_grad()
        if emb_optimizer is not None:
            emb_optimizer.zero_grad()

        #######################################
        ### Data for current task and batch ###
        #######################################
        batch = data.next_train_batch(config.batch_size)
        X = data.input_to_torch_tensor(batch[0], device, mode='train')
        T = data.output_to_torch_tensor(batch[1], device, mode='train')

        # Get the output neurons depending on the continual learning scenario.
        n_y = data.num_classes
        if config.cl_scenario == 1:
            # Choose current head.
            task_out = [task_id * n_y, (task_id + 1) * n_y]
        elif config.cl_scenario == 2:
            # Always all output neurons, only one head is used.
            task_out = [0, n_y]
        else:
            # Choose current head, which will be inferred during inference.
            task_out = [task_id * n_y, (task_id + 1) * n_y]

        ########################
        ### Loss computation ###
        ########################
        if config.mnet_only:
            weights = None
        else:
            weights = hnet.forward(task_id=task_id)
        Y_hat_logits = mnet.forward(X, weights, **mnet_kwargs)

        # Restrict output neurons.
        Y_hat_logits = Y_hat_logits[:, task_out[0]:task_out[1]]
        assert T.shape[1] == Y_hat_logits.shape[1]

        # Compute loss on task and compute gradients.
        if config.soft_targets:
            soft_label = 0.95
            num_classes = data.num_classes
            soft_targets = torch.where(
                T == 1,
                torch.Tensor([soft_label]),
                torch.Tensor([(1 - soft_label) / (num_classes - 1)]))
            soft_targets = soft_targets.to(device)
            loss_task = Classifier.softmax_and_cross_entropy(Y_hat_logits, soft_targets)
        else:
            loss_task = Classifier.logit_cross_entropy_loss(Y_hat_logits, T)

        # Compute gradients based on task loss (those might be used in the CL
        # regularizer).
        loss_task.backward(retain_graph=calc_reg,
                           create_graph=calc_reg and config.backprop_dt)

        # The current task embedding only depends on the task loss, so we can
        # update it already.
        if emb_optimizer is not None:
            emb_optimizer.step()

        #############################
        ### CL (HNET) Regularizer ###
        #############################
        loss_reg = 0
        dTheta = None

        if calc_reg:
            if config.no_lookahead:
                dTembs = None
                dTheta = None
            else:
                dTheta = opstep.calc_delta_theta(
                    theta_optimizer, False, lr=config.lr,
                    detach_dt=not config.backprop_dt)
                if config.continue_emb_training:
                    dTembs = dTheta[-task_id:]
                    dTheta = dTheta[:-task_id]
                else:
                    dTembs = None

            loss_reg = hreg.calc_fix_target_reg(
                hnet, task_id,
                targets=targets_hypernet,
                dTheta=dTheta,
                dTembs=dTembs,
                mnet=mnet,
                inds_of_out_heads=regged_outputs,
                prev_theta=prev_theta,
                prev_task_embs=prev_task_embs,
                batch_size=config.cl_reg_batch_size)

            loss_reg *= config.beta
            loss_reg.backward()

        # Now that we computed the regularizer, we can use the accumulated
        # gradients and update the hnet (or mnet) parameters.
        theta_optimizer.step()

        Y_hat = F.softmax(Y_hat_logits, dim=1)
        classifier_accuracy = Classifier.accuracy(Y_hat, T) * 100.0
        # print('train T: ', Y_hat.argmax(dim=1, keepdim=False))
        # print('train T: ', T.argmax(dim=1, keepdim=False))
        # print('train Y_hat: ', Y_hat.size())
        # print('train T: ', T.size())

        #########################
        # Learning rate scheduler
        #########################
        if config.plateau_lr_scheduler:
            assert iter_per_epoch != -1
            if i % iter_per_epoch == 0 and i > 0:
                curr_epoch = i // iter_per_epoch
                logger.info('Computing test accuracy for plateau LR '
                            'scheduler (epoch %d).' % curr_epoch)
                # We need a validation quantity for the plateau LR scheduler.
                # FIXME we should use an actual validation set rather than the
                # test set.
                # Note, https://keras.io/examples/cifar10_resnet/ uses the test
                # set to compute the validation loss. We use the "validation"
                # accuracy instead.
                # FIXME We increase `train_iter` as the print messages in the
                # test method suggest that the testing has been executed before.
                test_acc, _ = test(task_id, data, mnet, hnet, device, shared,
                                   config, writer, logger, train_iter=i + 1)
                mnet.train()
                if hnet is not None:
                    hnet.train()

                plateau_scheduler_theta.step(test_acc)
                if plateau_scheduler_emb is not None:
                    plateau_scheduler_emb.step(test_acc)

        if config.lambda_lr_scheduler:
            assert iter_per_epoch != -1
            if i % iter_per_epoch == 0 and i > 0:
                curr_epoch = i // iter_per_epoch
                logger.info('Applying Lambda LR scheduler (epoch %d).' % curr_epoch)

                lambda_scheduler_theta.step()
                if lambda_scheduler_emb is not None:
                    lambda_scheduler_emb.step()

        ###########################
        ### Tensorboard summary ###
        ###########################
        # We don't wanna slow down training by having too much output.
        if i % 50 == 0:
            writer.add_scalar('train/task_%d/class_accuracy' % task_id,
                              classifier_accuracy, i)
            writer.add_scalar('train/task_%d/loss_task' % task_id, loss_task, i)
            writer.add_scalar('train/task_%d/loss_reg' % task_id, loss_reg, i)

        ### Show the current training progress to the user.
        if i % config.val_iter == 0:
            msg = 'Training step {}: Classifier Accuracy: {:.3f} ' + \
                  '(on current training batch).'
            logger.debug(msg.format(i, classifier_accuracy))

        iter_end_time = time()
        summed_iter_runtime += (iter_end_time - iter_start_time)

        if i % 200 == 0:
            logger.info('Training step: %d ... Done -- (runtime: %f sec)' %
                        (i, iter_end_time - iter_start_time))

    if mnet.batchnorm_layers is not None:
        if not config.bn_distill_stats and \
                not config.bn_no_running_stats and \
                not config.bn_no_stats_checkpointing:
            # Checkpoint the current running statistics (that have been
            # estimated while training the current task).
            for bn_layer in mnet.batchnorm_layers:
                assert bn_layer.num_stats == task_id + 1
                bn_layer.checkpoint_stats()

    avg_iter_time = summed_iter_runtime / config.n_iter
    logger.info('Average runtime per training iteration: %f sec.' % avg_iter_time)

    logger.info('Elapsed time for training task %d: %f sec.' %
                (task_id + 1, time() - start_time))
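# A minimal, standalone sketch (toy nn.Linear model and epoch count are assumptions) of how
# the lambda_lr schedule defined in train() above scales a base learning rate through
# torch.optim.lr_scheduler.LambdaLR: the factor drops to 1e-1 after epoch 80, 1e-2 after
# epoch 120, 1e-3 after epoch 160, and 0.5e-3 after epoch 180.
import torch
from torch import nn, optim

def lambda_lr(epoch):
    lr_scale = 1.
    if epoch > 180:
        lr_scale = 0.5e-3
    elif epoch > 160:
        lr_scale = 1e-3
    elif epoch > 120:
        lr_scale = 1e-2
    elif epoch > 80:
        lr_scale = 1e-1
    return lr_scale

model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_lr)

for epoch in range(200):
    optimizer.step()            # stand-in for one real training epoch
    scheduler.step()            # applies lr = 0.1 * lambda_lr(epoch + 1)
    if epoch in (80, 81, 121, 161, 181):
        print(epoch, optimizer.param_groups[0]['lr'])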