def __init__(self, multimodal_feat_size=256):
    super(ImageEncoder, self).__init__()
    self.inception = custom_inception_v3()
    freeze_model(self.inception)
    self.map_global = nn.Linear(2048, multimodal_feat_size)
    self.map_local = nn.Linear(768, multimodal_feat_size)
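# The snippets in this file all rely on a `freeze_model` / `utils.freeze_model` helper whose
# definition is not shown. A minimal sketch of what such a helper typically does for a PyTorch
# nn.Module is given below; the exact name and signature are assumptions (some snippets use a
# variant that returns the model or takes extra arguments such as `freeze_until` or `args`).
def freeze_model(model):
    # Stop gradients from flowing into the frozen module's parameters.
    for param in model.parameters():
        param.requires_grad = False
    # Callers in this file additionally switch frozen copies to inference mode with
    # model.eval() where BatchNorm/Dropout statistics must not change.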
def train(self, t, xtrain, ytrain, xvalid, yvalid, data, input_size, taskcla):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    self.W = {}
    self.p_old = {}
    for n, p in self.model.named_parameters():
        if p.requires_grad:
            n = n.replace('.', '__')
            self.W[n] = p.data.clone().zero_()
            self.p_old[n] = p.data.clone()

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        num_batch = xtrain.size(0)
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / num_batch,
            1000 * self.sbatch * (clock2 - clock1) / num_batch, train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')
        print()

        # save log for current task & old tasks at every epoch
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best
    utils.set_model_(self.model, best_model)

    self.update_omega(self.W, self.epsilon)
    self.model_old = deepcopy(self.model)
    utils.freeze_model(self.model_old)  # Freeze the weights
    return
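# The train() above accumulates a per-parameter path integral in self.W and consolidates it via
# self.update_omega(self.W, self.epsilon), which is not shown. A minimal sketch following the
# Synaptic Intelligence formulation is given below, assuming `torch` is imported as in the
# surrounding snippets and that `self.omega` holds the consolidated importances used by the
# regularised criterion (the attribute name is an assumption).
def update_omega(self, W, epsilon):
    # Consolidate importance after finishing a task:
    # omega += path_integral / (squared parameter change over the task + damping).
    for n, p in self.model.named_parameters():
        if p.requires_grad:
            n = n.replace('.', '__')
            p_change = p.detach().clone() - self.p_old[n]
            omega_add = W[n] / (p_change ** 2 + epsilon)
            self.omega[n] = self.omega.get(n, torch.zeros_like(p)) + omega_add
            # Remember the post-task parameters for the next consolidation step.
            self.p_old[n] = p.detach().clone()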
def main(args):
    print('Dataset: {}, Normal Label: {}, LR: {}'.format(args.dataset, args.label, args.lr))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    model_type = args.model
    if model_type == 'resnet':
        model = utils.get_resnet_model(resnet_type=args.resnet_type)
        if args.dataset in ['rsna3D']:
            model = ResNet3D(model)
    elif model_type == 'timesformer':
        model = utils.get_timesformer_model(mode=args.timesformer_mode)
    model = model.to(device)

    ewc_loss = None
    # Freezing Pre-trained model for EWC
    if args.ewc:
        frozen_model = deepcopy(model).to(device)
        frozen_model.eval()
        utils.freeze_model(frozen_model)
        fisher = torch.load(args.diag_path)
        ewc_loss = EWCLoss(frozen_model, fisher)

    utils.freeze_parameters(model)
    sorted_train_loader, shuffled_train_loader, test_loader = utils.get_loaders(
        dataset=args.dataset, label_class=args.label, batch_size=args.batch_size,
        lookup_tables_paths=(args.train_lookup_table, args.test_lookup_table))
    train_model(model, sorted_train_loader, shuffled_train_loader, test_loader, device, args, ewc_loss)
def post_train(self, t, xtrain, ytrain, xvalid, yvalid):
    # store the old model (and freeze it for gradients)
    self.model_old = deepcopy(self.model)
    self.model_old.eval()
    utils.freeze_model(self.model_old)  # Freeze the weights
    # NOTE: other option is to save models to disk and reload them after each training session (slower but more accurate?)

    # deep copy the values from the old fisher matrix (previous models)
    if t > 0:
        fisher_old = {}
        for n, _ in self.model.named_parameters():
            fisher_old[n] = self.fisher[n].clone()

    # compute the fisher matrix for the current model
    # NOTE: shouldn't it be recomputed for all outputs?
    self.fisher = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self._fw_pass)

    # combine the fisher matrices
    if t > 0:
        # Watch out! We do not want to keep t models (or fisher diagonals) in memory,
        # therefore we have to merge fisher diagonals
        # NOTE: is that equivalent?
        for n, _ in self.model.named_parameters():
            # count the old fisher matrix t times for the number of previous tasks
            self.fisher[n] = (self.fisher[n] + fisher_old[n] * t) / (t + 1)  # Checked: it is better than the other option
            # self.fisher[n] = 0.5 * (self.fisher[n] + fisher_old[n])
    return
def train(self, t, xtrain, ytrain, xvalid, yvalid):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / xtrain.size(0),
            1000 * self.sbatch * (clock2 - clock1) / xtrain.size(0), train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')
        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best
    utils.set_model_(self.model, best_model)

    # Update old
    self.model_old = deepcopy(self.model)
    self.model_old.eval()
    utils.freeze_model(self.model_old)  # Freeze the weights

    # Fisher ops
    if t > 0:
        fisher_old = {}
        for n, _ in self.model.named_parameters():
            fisher_old[n] = self.fisher[n].clone()
    self.fisher = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self.criterion)
    if t > 0:
        # Watch out! We do not want to keep t models (or fisher diagonals) in memory,
        # therefore we have to merge fisher diagonals
        for n, _ in self.model.named_parameters():
            self.fisher[n] = (self.fisher[n] + fisher_old[n] * t) / (t + 1)  # Checked: it is better than the other option
            # self.fisher[n] = 0.5 * (self.fisher[n] + fisher_old[n])

    torch.save(self.model.state_dict(), 'pretrain_ewc.pth')
    return
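# Several train()/post_train() variants in this file store a frozen self.model_old and a merged
# Fisher diagonal, but the regularised loss they feed into is defined elsewhere. A minimal sketch
# of the usual EWC penalty built from those ingredients is shown below; `self.ce` (the plain
# cross-entropy) and `self.lamb` (the regularisation strength) are assumed attribute names.
def ewc_criterion(self, t, output, targets):
    # Quadratic penalty pulling parameters towards the previous tasks' solution,
    # weighted elementwise by the merged Fisher diagonal.
    loss_reg = 0
    if t > 0:
        for (n, p), (_, p_old) in zip(self.model.named_parameters(),
                                      self.model_old.named_parameters()):
            loss_reg += torch.sum(self.fisher[n] * (p - p_old).pow(2)) / 2
    return self.ce(output, targets) + self.lamb * loss_reg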
def Xnet(backbone_name='densenet121',
         input_shape=(None, None, 3),
         input_tensor=None,
         encoder_weights='imagenet',
         freeze_encoder=False,
         skip_connections='default',
         decoder_block_type='upsampling',
         decoder_filters=(256, 128, 64, 32, 16),
         decoder_use_batchnorm=True,
         n_upsample_blocks=5,
         upsample_rates=(2, 2, 2, 2, 2),
         classes=1,
         activation='sigmoid'):
    """
    Args:
        backbone_name: (str) look at list of available backbones.
        input_shape: (tuple) dimensions of input data (H, W, C)
        input_tensor: keras tensor
        encoder_weights: one of `None` (random initialization), 'imagenet' (pre-training on ImageNet),
            'dof' (pre-training on DoF)
        freeze_encoder: (bool) Set encoder layers weights as non-trainable. Useful for fine-tuning
        skip_connections: if 'default' is used take default skip connections,
            else provide a list of layer numbers or names starting from top of model
        decoder_block_type: (str) one of 'upsampling' and 'transpose' (look at blocks.py)
        decoder_filters: (int) number of convolution layer filters in decoder blocks
        decoder_use_batchnorm: (bool) if True add batch normalisation layer between `Conv2D` and `Activation` layers
        n_upsample_blocks: (int) a number of upsampling blocks
        upsample_rates: (tuple of int) upsampling rates for decoder blocks
        classes: (int) a number of classes for output
        activation: (str) one of keras activations for last model layer

    Returns:
        keras.models.Model instance
    """
    backbone = tf.keras.applications.DenseNet121(input_shape=input_shape,
                                                 input_tensor=input_tensor,
                                                 weights=encoder_weights,
                                                 include_top=False)

    if skip_connections == 'default':
        skip_connections = (311, 139, 51, 4)
        # n_upsample_blocks = len(skip_connections)

    model = build_xnet(backbone,
                       classes,
                       skip_connections,
                       decoder_filters=decoder_filters,
                       block_type=decoder_block_type,
                       activation=activation,
                       n_upsample_blocks=n_upsample_blocks,
                       upsample_rates=upsample_rates,
                       use_batchnorm=decoder_use_batchnorm)

    # lock encoder weights for fine-tuning
    if freeze_encoder:
        freeze_model(backbone)

    return model
def train(self, t, xtrain, ytrain, xvalid, yvalid):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print(
            "| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |".format(
                e + 1,
                1000 * self.sbatch * (clock1 - clock0) / xtrain.size(0),
                1000 * self.sbatch * (clock2 - clock1) / xtrain.size(0),
                train_loss,
                100 * train_acc,
            ),
            end="",
        )
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(
            " Valid: loss={:.3f}, acc={:5.1f}% |".format(valid_loss, 100 * valid_acc),
            end="",
        )
        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(" *", end="")
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(" lr={:.1e}".format(lr), end="")
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best and save model as old
    utils.set_model_(self.model, best_model)
    self.model_old = deepcopy(self.model)
    self.model_old.eval()
    utils.freeze_model(self.model_old)
    return
def train(self, t, xtrain, ytrain, xvalid, yvalid):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / xtrain.size(0),
            1000 * self.sbatch * (clock2 - clock1) / xtrain.size(0), train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')
        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best
    utils.set_model_(self.model, best_model)

    # Model update: Fisher-weighted merge of current and old parameters
    if t == 0:
        self.fisher = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self.criterion)
    else:
        fisher_new = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self.criterion)
        for (n, p), (_, p_old) in zip(self.model.named_parameters(), self.model_old.named_parameters()):
            # Update in place via .data; a plain `p = ...` would only rebind the local name
            # and leave the model parameters untouched.
            p.data = fisher_new[n] * p.data + self.fisher[n] * p_old.data
            self.fisher[n] += fisher_new[n]
            p.data /= (self.fisher[n] == 0).float() + self.fisher[n]

    # Old model save
    self.model_old = deepcopy(self.model)
    self.model_old.eval()
    utils.freeze_model(self.model_old)
    return
def train(self, t, xtrain, ytrain, xvalid, yvalid, data):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / xtrain.size(0),
            1000 * self.sbatch * (clock2 - clock1) / xtrain.size(0), train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')

        # save log for current task & old tasks at every epoch
        self.logger.add(epoch=(t * self.nepochs) + e, task_num=t + 1, valid_loss=valid_loss, valid_acc=valid_acc)
        for task in range(t):
            xvalid_t = data[task]['valid']['x'].cuda()
            yvalid_t = data[task]['valid']['y'].cuda()
            valid_loss_t, valid_acc_t = self.eval(task, xvalid_t, yvalid_t)
            self.logger.add(epoch=(t * self.nepochs) + e, task_num=task + 1,
                            valid_loss=valid_loss_t, valid_acc=valid_acc_t)

        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best and save model as old
    utils.set_model_(self.model, best_model)
    self.model_old = Net([1, 28, 28],
                         [(0, 10), (1, 10), (2, 10), (3, 10), (4, 10),
                          (5, 10), (6, 10), (7, 10), (8, 10), (9, 10)]).cuda()
    self.model_old.load_state_dict(self.model.state_dict())
    self.model_old.eval()
    utils.freeze_model(self.model_old)

    self.logger.save()
    return
def train(self, t, xtrain, ytrain, xvalid, yvalid):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / xtrain.size(0),
            1000 * self.sbatch * (clock2 - clock1) / xtrain.size(0), train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')
        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best, save model as old
    utils.set_model_(self.model, best_model)
    if t > 0:
        model_state = utils.get_model(self.model)
        model_old_state = utils.get_model(self.model_old)
        for name, param in self.model.named_parameters():
            # model_state[name] = (1 - self.alpha) * model_old_state[name] + self.alpha * model_state[name]
            model_state[name] = (model_state[name] + model_old_state[name] * t) / (t + 1)
        utils.set_model_(self.model, model_state)

    self.model_old = deepcopy(self.model)
    utils.freeze_model(self.model_old)
    self.model_old.eval()
    return
def post_train(self, t, xtrain, ytrain, xvalid, yvalid):
    # Restore best, save model as old
    if t > 0:
        model_state = utils.get_model(self.model)
        model_old_state = utils.get_model(self.model_old)
        for name, param in self.model.named_parameters():
            # model_state[name] = (1 - self.alpha) * model_old_state[name] + self.alpha * model_state[name]
            model_state[name] = (model_state[name] + model_old_state[name] * t) / (t + 1)
        utils.set_model_(self.model, model_state)

    self.model_old = deepcopy(self.model)
    utils.freeze_model(self.model_old)
    self.model_old.eval()
    return
def load_model_ckpt(self, path):
    # Load
    weights = torch.load(path)
    self.image_encoder.load_state_dict(weights['img_enc'])
    self.text_encoder.load_state_dict(weights['txt_enc'])
    # Freeze parameters
    freeze_model(self.image_encoder)
    freeze_model(self.text_encoder)
    self.image_encoder.eval()
    self.text_encoder.eval()

    for i, d in enumerate(self.discriminators):
        d.load_state_dict(weights['discriminator'][i])
    self.generator.load_state_dict(weights['generator'])
    return weights['epoch']
def main():
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print(f"Training on {device}")

    cnn_model = models.resnet18(pretrained=True)
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    transformer_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

    if FREEZE:
        freeze_model(cnn_model)
        freeze_model(transformer_model)

    image_model = ImageModel(cnn_model, num_out=H)
    text_model = TextModel(transformer_model, num_out=H)

    if PRETRAINED_MODEL:
        model = torch.load(PRETRAINED_MODEL)
    else:
        model = MegaModel(image_model, text_model, num_hidden=H)
        # model = MegaModelAggregator(image_model, text_model, num_hidden=2*H)
    model.to(device)

    train_dataloader, test_dataloader = load_dataloaders(
        path=PATH, image_folder=IMAGE_FOLDER, descriptor=DESCRIPTOR, batch_size=BATCH_SIZE)

    loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')

    train_optim(
        model=model,
        tokenizer=tokenizer,
        train_dataloader=train_dataloader,
        test_dataloader=train_dataloader,
        loss_fn=loss_fn,
        epochs=EPOCHS,
        log_frequency=LOG_FREQUENCY,
        device=device,
        save_file=SAVE_INFO,
        save_dir=SAVE_MODEL,
        learning_rate=LEARNING_RATE
    )
def __init__(self, config):
    self.config = config
    self.augmentation = None

    self.CLIP, clip_preprocess = clip.load("ViT-B/32", device=self.config.device)
    self.CLIP = self.CLIP.eval()
    freeze_model(self.CLIP)

    self.model = self.config.model(config).to(self.config.device).eval()
    freeze_model(self.model)

    if config.task == "txt2img":
        self.tokens = clip.tokenize([self.config.target]).to(self.config.device)
        self.text_features = self.CLIP.encode_text(self.tokens).detach()
    if config.task == "img2txt":
        image = clip_preprocess(Image.open(self.config.target)).unsqueeze(0).to(self.config.device)
        self.image_features = self.CLIP.encode_image(image)
def solve(self, t, Tasks):
    task = Tasks[t]
    train_loader = task['train_loader']
    val_loader = task['test_loader']
    class_num = task['class_num']

    self.optimizer = torch.optim.SGD(self.model.parameters(), self.lr,
                                     momentum=self.momentum,
                                     weight_decay=self.weight_decay)
    criterion = self.criterion
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    best_model = utils.get_model(self.model)
    best_accu = 0

    train_sampler = task['train_sampler']
    test_sampler = task['test_sampler']

    for epoch in range(self.epochs):
        train_sampler.set_epoch(epoch)
        self.adjust_learning_rate(self.optimizer, epoch)
        # train for one epoch
        self.train(t, train_loader, self.model, self.model_old, self.optimizer, epoch, Tasks)
        # evaluate on validation set
        accu = self.validate(t, self.model, epoch, Tasks)
        # remember best prec@1 and save checkpoint
        if accu > best_accu:
            best_accu = accu
            # best_model = utils.get_model(self.model) ???
    # if rank == 0:
    #     print('Best accuracy: ', best_accu)

    # Restore best and save model as old
    # utils.set_model_(self.model, best_model)
    self.model_old = deepcopy(self.model)
    utils.freeze_model(self.model_old)
    return best_accu
def post_train(self, t, xtrain, ytrain, xvalid, yvalid):
    # Model update
    if t == 0:
        self.fisher = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self._fw_pass)
    else:
        fisher_new = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self._fw_pass)
        for (n, p), (_, p_old) in zip(self.model.named_parameters(), self.model_old.named_parameters()):
            # Update in place via .data; a plain `p = ...` would only rebind the local name
            # and leave the model parameters unchanged.
            p.data = fisher_new[n] * p.data + self.fisher[n] * p_old.data
            self.fisher[n] += fisher_new[n]
            p.data /= (self.fisher[n] == 0).float() + self.fisher[n]

    # Old model save
    self.model_old = deepcopy(self.model)
    self.model_old.eval()
    utils.freeze_model(self.model_old)
    return
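# The two-model merge above is a Fisher-weighted average of parameters (mode-IMM style). A minimal
# standalone sketch of the same idea for an arbitrary list of state dicts and their Fisher
# diagonals is given below; the function name, inputs, and `eps` damping are assumptions added for
# illustration only.
def imm_mode_merge(state_dicts, fishers, eps=1e-8):
    """Return theta* with theta*[n] = sum_i F_i[n] * theta_i[n] / sum_i F_i[n]."""
    merged = {}
    for name in state_dicts[0]:
        num = sum(f[name] * sd[name] for sd, f in zip(state_dicts, fishers))
        den = sum(f[name] for f in fishers) + eps
        merged[name] = num / den
    return merged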
def train(self, t, train_data, valid_data, device='cuda'):
    # train network for task t
    # 1 search the best model for task 1:n
    if t > 0:
        # 1.2.1 expand
        self.model.expand(t, device)
        # 1.2.2 freeze the model
        utils.freeze_model(self.model)
        # 1.2.3 search the best expand action
        self.search_network(t, train_data, valid_data, self.o_batch, self.o_epochs, device=device)
        # 1.2.4 select the best action
        best_archi = self.model.select(t)
        self.archis.append(best_archi)
        # 1.2.5 unfreeze the part of the model that needs to be trained
        utils.freeze_model(self.model)
        self.model.modify_param(self.model.model_to_train, True)
        # 1.2.6 look up the super model
        print(best_archi)
        utils.print_model_report(self.model)
    # 2 training
    self.train_network(t, train_data, valid_data, self.batch, self.epochs, device)
def main(args):
    print('Dataset: {}, Normal Label: {}, LR: {}'.format(args.dataset, args.label, args.lr))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    model = utils.get_resnet_model(resnet_type=args.resnet_type)
    model = model.to(device)

    ewc_loss = None
    # Freezing Pre-trained model for EWC
    if args.ewc:
        frozen_model = deepcopy(model).to(device)
        frozen_model.eval()
        utils.freeze_model(frozen_model)
        fisher = torch.load(args.diag_path)
        ewc_loss = EWCLoss(frozen_model, fisher)

    utils.freeze_parameters(model)
    train_loader, test_loader = utils.get_loaders(dataset=args.dataset,
                                                  label_class=args.label,
                                                  batch_size=args.batch_size)
    train_model(model, train_loader, test_loader, device, args, ewc_loss)
def __init__(self, args):
    super(ARDM, self).__init__()
    self.args = args

    # define the two language models
    self.model_A = GPT2SimpleLM(UnifiedGPT2SmallConfig)
    self.model_B = GPT2SimpleLM(UnifiedGPT2SmallConfig)
    # language model KL
    self.language_model = GPT2SimpleLM(UnifiedGPT2SmallConfig)

    # load weights
    self.model_A.load_state_dict(get_pretrained("unified-gpt2-small"))
    self.model_B.load_state_dict(get_pretrained("unified-gpt2-small"))
    self.language_model.load_state_dict(get_pretrained("unified-gpt2-small"))

    # freeze weights
    utils.freeze_model(self.language_model)

    self.criterion = sequence_ce_lm_loss
    self.lm_coef = 0.1
    self.lm_coef_decay = 0.9999
    self.discount_factor = 0.95
    self.lm_stream = torch.cuda.Stream()
def load_model(self, ckpt, text_enc, gen):
    if ckpt is None and text_enc is None and gen is None:
        raise FileNotFoundError("Set path to load the model")
    if ckpt is not None and (text_enc is not None or gen is not None):
        raise ValueError("Specify just one way for loading: "
                         "checkpoint path or two separate files "
                         "for text encoder and generator")
    if ckpt is not None:
        print('Loading from checkpoint')
        weights = torch.load(ckpt)
        self.text_encoder.load_state_dict(weights['txt_enc'])
        self.generator.load_state_dict(weights['generator'])
    elif gen is not None and text_enc is not None:
        print('Loading from separate files')
        self.text_encoder.load_state_dict(torch.load(text_enc))
        self.generator.load_state_dict(torch.load(gen))
    elif gen is None or text_enc is None:
        raise FileNotFoundError("Specify both generator and text encoder files")

    self.eval()
    freeze_model(self)
def fit_n_epochs(self, num_epochs, lr, freeze_until=None, sched_type='onecycle'):
    """Train the model for a given number of epochs

    Args:
        num_epochs (int): number of epochs to train
        lr (float): learning rate to be used by the scheduler
        freeze_until (str, optional): last layer to freeze
        sched_type (str, optional): type of scheduler to use
    """
    if self.configwb:
        wandb.watch(self.criterion, log="all", log_freq=10)

    self.epoch = 0
    self.train_loss_recorder = []
    self.val_loss_recorder = []

    self.model = freeze_model(self.model, freeze_until)
    # Update param groups & LR
    self._reset_opt(lr)
    # Scheduler
    self._reset_scheduler(lr, num_epochs, sched_type)

    mb = master_bar(range(num_epochs))
    for _ in mb:
        self._fit_epoch(freeze_until, mb)
        # Check whether ops invalidated the buffer
        self._params.assert_buffer_is_valid()
        eval_metrics = self.evaluate()

        # master bar
        mb.main_bar.comment = f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs}"
        mb.write(f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs} - "
                 f"{self._eval_metrics_str(eval_metrics)}")

    self.save(self.output_file)
def train(self, t, xtrain, ytrain, xvalid, yvalid, data, input_size, taskcla):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    self.optimizer = self._get_optimizer(lr)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        num_batch = xtrain.size(0)
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / num_batch,
            1000 * self.sbatch * (clock2 - clock1) / num_batch, train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')
        print(' lr : {:.6f}'.format(self.optimizer.param_groups[0]['lr']))
        # save log for current task & old tasks at every epoch

        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best
    utils.set_model_(self.model, best_model)

    # Update old
    self.model_old = deepcopy(self.model)
    utils.freeze_model(self.model_old)  # Freeze the weights

    self.omega_update(t, xtrain)
    return
def train(self, t, train_data_loader, test_data_loader, val_data_loader):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    task = torch.autograd.Variable(
        torch.LongTensor([t]).cuda(), volatile=False
    ) if torch.cuda.is_available() else torch.autograd.Variable(
        torch.LongTensor([t]), volatile=False)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epochewc(t, train_data_loader)
        clock1 = time.time()
        train_loss, train_acc, train_recall, train_f1 = self.eval_withregx(t, test_data_loader)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1,
            1000 * self.sbatch * (clock1 - clock0) / train_data_loader.__len__(),
            1000 * self.sbatch * (clock2 - clock1) / train_data_loader.__len__(),
            train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc, valid_recall, valid_f1 = self.eval_withregx(t, val_data_loader)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')
        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best
    utils.set_model_(self.model, best_model)

    # Update old
    self.model_old = deepcopy(self.model)
    self.model_old.eval()
    utils.freeze_model(self.model_old)  # Freeze the weights

    # Fisher ops
    if t > 0:
        fisher_old = {}
        startDateTimeOldLast = datetime.now()
        for n, _ in self.model.named_parameters():
            fisher_old[n] = self.fisher[n].clone()
        print('DataTime OldLast', datetime.now() - startDateTimeOldLast)
        print("Analysis compute memory waste in Old Task")

    # self.fisher = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self.criterion)
    self.fisher = utils.fisher_matrix_diag_nlp(t, train_data_loader, self.model, self.criterion, opt=self.opt)
    if t > 0:
        # Watch out! We do not want to keep t models (or fisher diagonals) in memory,
        # therefore we have to merge fisher diagonals
        startDateTime = datetime.now()
        for n, _ in self.model.named_parameters():
            self.fisher[n] = (self.fisher[n] + fisher_old[n] * t) / (t + 1)  # Checked: it is better than the other option
            # self.fisher[n] = 0.5 * (self.fisher[n] + fisher_old[n])
        print("Analysis compute memory waste")
        print('DataTime OldLast', datetime.now() - startDateTime)
    return
def main():
    mask_dir = os.path.join(args.dataset_dir, args.train_mask_dir_name)
    val_mask_dir = os.path.join(args.dataset_dir, args.val_mask_dir_name)
    train_data_dir = os.path.join(args.dataset_dir, args.train_data_dir_name)
    val_data_dir = os.path.join(args.dataset_dir, args.val_data_dir_name)
    # mask_dir = 'data/train/masks_fail'
    # val_mask_dir = 'data/val/masks'
    # train_data_dir = 'data/train/images_fail'
    # val_data_dir = 'data/val/images'

    if args.net_alias is not None:
        formatted_net_alias = '-{}-'.format(args.net_alias)
    best_model_file = \
        '{}/{}{}loss-{}-fold_{}-{}{:.6f}'.format(args.models_dir, args.network, formatted_net_alias,
                                                 args.loss_function, args.fold, args.input_width,
                                                 args.learning_rate) + \
        '-{epoch:d}-{val_loss:0.7f}-{val_dice_coef:0.7f}-{val_mean_io:0.7f}-{val_dice_coef_clipped:0.7f}.h5'

    if args.edges:
        ch = 5
    else:
        ch = 3
    model = make_model((None, None, args.stacked_channels + ch))
    freeze_model(model, args.freeze_till_layer)

    if args.weights is None:
        print('No weights passed, training from scratch')
    else:
        print('Loading weights from {}'.format(args.weights))
        model.load_weights(args.weights, by_name=True)

    optimizer = Adam(lr=args.learning_rate)
    if args.show_summary:
        model.summary()
    model.compile(loss=make_loss(args.loss_function),
                  optimizer=optimizer,
                  metrics=[dice_coef_border, dice_coef, binary_crossentropy, dice_coef_clipped, mean_iou])

    crop_size = None
    if args.use_crop:
        crop_size = (args.input_height, args.input_width)
        print('Using crops of shape ({}, {})'.format(args.input_height, args.input_width))
    else:
        print('Using full size images, --use_crop=True to do crops')

    # folds_df = pd.read_csv(os.path.join(args.dataset_dir, args.folds_source))
    # train_ids = generate_filenames(folds_df[folds_df.fold != args.fold]['id'])
    # val_ids = generate_filenames(folds_df[folds_df.fold == args.fold]['id'])
    train_df = pd.read_csv('../data/train_df.csv')
    val_df = pd.read_csv('../data/val_df.csv')
    train_ids = [img + '.png' for img in train_df['id'].values]
    val_ids = [img + '.png' for img in val_df['id'].values]
    # train_ids = os.listdir(train_data_dir)
    # val_ids = os.listdir(val_data_dir)
    print('Training fold #{}, {} in train_ids, {} in val_ids'.format(args.fold, len(train_ids), len(val_ids)))

    train_generator = build_batch_generator(train_ids,
                                            img_dir=train_data_dir,
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            out_size=(args.out_height, args.out_width),
                                            crop_size=crop_size,
                                            mask_dir=mask_dir,
                                            aug=True)
    val_generator = build_batch_generator(val_ids,
                                          img_dir=val_data_dir,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          out_size=(args.out_height, args.out_width),
                                          crop_size=crop_size,
                                          mask_dir=val_mask_dir,
                                          aug=False)

    best_model = ModelCheckpoint(best_model_file,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=False,
                                 save_weights_only=True)
    callbacks = [best_model,
                 EarlyStopping(patience=45, verbose=10),
                 TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True)]

    if args.clr is not None:
        clr_params = args.clr.split(',')
        base_lr = float(clr_params[0])
        max_lr = float(clr_params[1])
        step = int(clr_params[2])
        mode = clr_params[3]
        clr = CyclicLR(base_lr=base_lr, max_lr=max_lr, step_size=step, mode=mode)
        callbacks.append(clr)

    model.fit_generator(ThreadsafeIter(train_generator),
                        steps_per_epoch=len(train_ids) / args.batch_size + 1,
                        epochs=args.epochs,
                        validation_data=ThreadsafeIter(val_generator),
                        validation_steps=len(val_ids) / args.batch_size + 1,
                        callbacks=callbacks,
                        max_queue_size=50,
                        workers=4)
def train(self, t, xtrain, ytrain, xvalid, yvalid, data, input_size, taskcla):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / xtrain.size(0),
            1000 * self.sbatch * (clock2 - clock1) / xtrain.size(0), train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')

        # save log for current task & old tasks at every epoch
        self.logger.add(epoch=(t * self.nepochs) + e, task_num=t + 1, valid_loss=valid_loss, valid_acc=valid_acc)
        for task in range(t):
            xvalid_t = data[task]['valid']['x'].cuda()
            yvalid_t = data[task]['valid']['y'].cuda()
            valid_loss_t, valid_acc_t = self.eval(task, xvalid_t, yvalid_t)
            self.logger.add(epoch=(t * self.nepochs) + e, task_num=task + 1,
                            valid_loss=valid_loss_t, valid_acc=valid_acc_t)

        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best
    utils.set_model_(self.model, best_model)
    self.logger.save()

    # Update old
    self.model_old = Net(input_size, taskcla).cuda()
    self.model_old.load_state_dict(self.model.state_dict())
    self.model_old.eval()
    utils.freeze_model(self.model_old)  # Freeze the weights

    # Fisher ops
    if t > 0:
        fisher_old = {}
        for n, _ in self.model.named_parameters():
            fisher_old[n] = self.fisher[n].clone()
    self.fisher = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self.criterion)
    if t > 0:
        # Watch out! We do not want to keep t models (or fisher diagonals) in memory,
        # therefore we have to merge fisher diagonals
        for n, _ in self.model.named_parameters():
            self.fisher[n] = (self.fisher[n] + fisher_old[n] * t) / (t + 1)  # Checked: it is better than the other option
            # self.fisher[n] = 0.5 * (self.fisher[n] + fisher_old[n])
    return
def train(self, t, train_data_loader, test_data_loader, val_data_loader):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    task = torch.autograd.Variable(
        torch.LongTensor([t]).cuda(), volatile=False
    ) if torch.cuda.is_available() else torch.autograd.Variable(
        torch.LongTensor([t]), volatile=False)

    # Loop epochs
    print("Number of epochs ===> " + str(self.nepochs))
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epochlwf(t, train_data_loader)
        clock1 = time.time()
        train_loss, train_acc, train_recall, train_f1 = self.evallwf(t, test_data_loader)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1,
            1000 * self.sbatch * (clock1 - clock0) / train_data_loader.__len__(),
            1000 * self.sbatch * (clock2 - clock1) / train_data_loader.__len__(),
            train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc, valid_recall, valid_f1 = self.evallwf(t, val_data_loader)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')
        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best
    utils.set_model_(self.model, best_model)

    # Update old
    self.model_old = deepcopy(self.model)
    self.model_old.eval()
    utils.freeze_model(self.model_old)  # Freeze the weights
    return
def train(self, t, xtrain, ytrain, xvalid, yvalid, data, input_size, taskcla):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    lr_rho = self.lr_rho
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr, lr_rho)

    # Loop epochs
    for e in range(self.nepochs):
        self.epoch = self.epoch + 1
        # Train
        clock0 = time.time()
        num_batch = xtrain.size(0)
        self.train_epoch(t, xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / num_batch,
            1000 * self.sbatch * (clock2 - clock1) / num_batch, train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')

        # save log for current task & old tasks at every epoch
        self.logger.add(epoch=(t * self.nepochs) + e, task_num=t + 1, valid_loss=valid_loss, valid_acc=valid_acc)
        for task in range(t):
            xvalid_t = data[task]['valid']['x'].cuda()
            yvalid_t = data[task]['valid']['y'].cuda()
            valid_loss_t, valid_acc_t = self.eval(task, xvalid_t, yvalid_t)
            self.logger.add(epoch=(t * self.nepochs) + e, task_num=task + 1,
                            valid_loss=valid_loss_t, valid_acc=valid_acc_t)

        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                lr_rho /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr, lr_rho)
        print()

    utils.freeze_model(self.model_old)  # Freeze the weights

    # Restore best
    utils.set_model_(self.model, best_model)
    self.model_old = deepcopy(self.model)
    self.saved = 1
    self.logger.save()
    return
def main():
    print('start')
    args = get_args()
    logging.basicConfig(filename=args.model + '.log', level=logging.INFO, format="%(asctime)s: %(message)s")
    logger = logging.getLogger()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(device)

    models = ['FCN', 'Unet', 'Deeplab']
    backbones = ['resnet18', 'resnet34', 'resnet50', 'mobilenet']
    assert args.model in models, "Choose valid model"

    # Use '==' for string comparison; 'is' only checks object identity.
    if args.model == 'FCN':
        assert args.backbone[:6] == 'resnet', 'Only resnet backbones supported for FCN'
        model = FCN.FCN16(args.backbone, num_classes=11).to(device)
    if args.model == 'Unet':
        model = Unet().to(device)
    if args.model == 'Deeplab':
        assert args.backbone[:9] == 'mobilenet', 'Only mobilenet backbones supported for Deeplab'
        model = Deeplab().to(device)  # TODO
    logger.info("Model: {}, Backbone Used: {}".format(args.model, args.backbone))

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                                                           patience=2, threshold=1e-3)
    logger.info("Optimizer: Adam")
    logger.info("Criterion/Loss Function: Cross Entropy Loss")
    logger.info("Scheduler: ReduceLROnPlateau")
    logger.info("Learning Rate: {}".format(args.lr))

    if args.load:
        checkpoint = torch.load(args.checkpoint)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        criterion = checkpoint['criterion']
        logger.info("Model Loaded")

    if args.fine_tune:
        assert args.load, "Please specify model to be loaded to fine-tune"
        model = utils.freeze_model(args, model)

    ## Dataloaders
    train_dataset, val_dataset = get_datasets(args.dataset, train=True)
    logger.info("Dataset Used: {}".format(args.dataset))
    if train_dataset:
        train_dataloader = DataLoader(train_dataset, batch_size=args.batchsize)
        logger.info("Training Dataset Length: {}".format(len(train_dataloader)))
    if val_dataset:
        val_dataloader = DataLoader(val_dataset, batch_size=args.batchsize, shuffle=True)
        logger.info("Validation Dataset Length: {}".format(len(val_dataloader)))

    # if not os.path.exists(args.saveDir):
    writer = SummaryWriter(os.path.join(args.saveDir, args.model + '_log'))

    train_model(model=model,
                optimizer=optimizer,
                criterion=criterion,
                scheduler=scheduler,
                training_dataloader=train_dataloader,
                validation_dataloader=val_dataloader,
                device=device,
                epochs=args.epochs,
                batch_size=args.batchsize,
                lr=args.lr,
                CHECKPOINT_PATH=args.saveDir,
                model_name=args.model,
                logger=logger,
                writer=writer,
                save_epoch=5,
                show_figure=True,
                save_model=True)
def train(self, t, xtrain, ytrain, xvalid, yvalid):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    # Update old
    self.model_old = deepcopy(self.model)
    self.model_old.eval()
    utils.freeze_model(self.model_old)  # Freeze the weights

    # reset importance omega
    for n, p in self.model.named_parameters():
        if p.requires_grad:
            self.omega[n] = p.data.clone().zero_()
            self.DELTA[n] = p.data.clone().zero_()
            self.p_old[n] = p.data.clone()

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        self.train_epoch(t, xtrain, ytrain, e)
        clock1 = time.time()
        train_loss, train_acc = self.eval(t, xtrain, ytrain)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / xtrain.size(0),
            1000 * self.sbatch * (clock2 - clock1) / xtrain.size(0), train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(t, xvalid, yvalid)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')

        self.logger.log_scalar(str(t) + "_train acc", train_acc, e)
        self.logger.log_scalar(str(t) + "_valid acc", valid_acc, e)
        self.logger.log_scalar(str(t) + "_train loss", train_loss, e)
        self.logger.log_scalar(str(t) + "_valid loss", valid_loss, e)

        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

    # Restore best
    utils.set_model_(self.model, best_model)

    # Update task regularization OMEGA
    for (n, param), (_, param_old) in zip(self.model.named_parameters(),
                                          self.model_old.named_parameters()):
        if param.requires_grad:  # fixed: was `p.requires_grad`, a leftover name from the earlier loop
            # change = param.detach().clone() - param_old
            # o = torch.nn.functional.relu(self.omega[n]) / (change.pow(2) + self.xi)
            o = torch.nn.functional.relu(self.omega[n]) / (self.DELTA[n].pow(2) + self.xi)
            self.OMEGA[n] = self.OMEGA[n] * self.decay + o * (1 - self.decay)  # self.OMEGA[n] + o
    return
def train(self, t, xtrain, ytrain, xvalid, yvalid, data, input_size, taskcla):
    best_loss = np.inf
    best_model = utils.get_model(self.model)
    lr = self.lr
    patience = self.lr_patience
    self.optimizer = self._get_optimizer(lr)

    # Loop epochs
    for e in range(self.nepochs):
        # Train
        clock0 = time.time()
        # self.model.variance_init()  # initialise the trainer net's variance to a large value
        # 1. train the trainer net, using the saver net's information for regularization
        self.train_epoch(xtrain, ytrain)
        clock1 = time.time()
        train_loss, train_acc = self.eval(xtrain, ytrain, self.sample)
        clock2 = time.time()
        print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format(
            e + 1, 1000 * self.sbatch * (clock1 - clock0) / xtrain.size(0),
            1000 * self.sbatch * (clock2 - clock1) / xtrain.size(0), train_loss, 100 * train_acc), end='')
        # Valid
        valid_loss, valid_acc = self.eval(xvalid, yvalid, self.sample)
        print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss, 100 * valid_acc), end='')

        # save log for current task & old tasks at every epoch
        self.logger.add(epoch=(t * self.nepochs) + e, task_num=t + 1, valid_loss=valid_loss, valid_acc=valid_acc)
        for task in range(t):
            xvalid_t = data[task]['valid']['x'].cuda()
            yvalid_t = data[task]['valid']['y'].cuda()
            valid_loss_t, valid_acc_t = self.eval(xvalid_t, yvalid_t, self.sample)
            self.logger.add(epoch=(t * self.nepochs) + e, task_num=task + 1,
                            valid_loss=valid_loss_t, valid_acc=valid_acc_t)

        # Adapt lr
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_model = utils.get_model(self.model)
            patience = self.lr_patience
            print(' *', end='')
        else:
            patience -= 1
            if patience <= 0:
                lr /= self.lr_factor
                print(' lr={:.1e}'.format(lr), end='')
                if lr < self.lr_min:
                    print()
                    break
                patience = self.lr_patience
                self.optimizer = self._get_optimizer(lr)
        print()

        # self.model_old = deepcopy(self.model)
        utils.freeze_model(self.model_old)  # Freeze the weights
        # self.print_log(e)
        # for n, m in self.model.named_children():
        #     print(n, m.weight.sigma.min())

    # Restore best
    utils.set_model_(self.model, best_model)
    self.logger.save()
    return