def lr_find(self, freeze_until=None, start_lr=1e-7, end_lr=1, num_it=100):
    """Gridsearch the optimal learning rate for the training

    Args:
        freeze_until (str, optional): last layer to freeze
        start_lr (float, optional): initial learning rate
        end_lr (float, optional): final learning rate
        num_it (int, optional): number of iterations to perform
    """
    self.model = freeze_model(self.model.train(), freeze_until)
    # Update param groups & LR
    self._reset_opt(start_lr)
    gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
    scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

    self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
    self.loss_recorder = []

    for batch_idx, (x, target) in enumerate(self.train_loader):
        x, target = self.to_cuda(x, target)

        # Forward
        batch_loss = self._get_loss(x, target)
        self._backprop_step(batch_loss)
        # Update LR
        scheduler.step()
        # Record
        self.loss_recorder.append(batch_loss.item())

        # Stop after the number of iterations
        if batch_idx + 1 == num_it:
            break
def lr_find(
    self,
    freeze_until: Optional[str] = None,
    start_lr: float = 1e-7,
    end_lr: float = 1,
    norm_weight_decay: Optional[float] = None,
    num_it: int = 100,
) -> None:
    """Gridsearch the optimal learning rate for the training

    Args:
        freeze_until (str, optional): last layer to freeze
        start_lr (float, optional): initial learning rate
        end_lr (float, optional): final learning rate
        norm_weight_decay (float, optional): weight decay to apply to normalization parameters
        num_it (int, optional): number of iterations to perform
    """
    if num_it > len(self.train_loader):
        raise ValueError("the value of `num_it` needs to be lower than the number of available batches")

    self.model = freeze_model(self.model.train(), freeze_until)
    # Update param groups & LR
    self._reset_opt(start_lr, norm_weight_decay)
    gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
    scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

    self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
    self.loss_recorder = []

    if self.amp:
        self.scaler = torch.cuda.amp.GradScaler()

    for batch_idx, (x, target) in enumerate(self.train_loader):
        x, target = self.to_cuda(x, target)

        # Forward
        batch_loss = self._get_loss(x, target)
        self._backprop_step(batch_loss)
        # Update LR
        scheduler.step()
        # Record
        if torch.isnan(batch_loss) or torch.isinf(batch_loss):
            if batch_idx == 0:
                raise ValueError("loss value is NaN or inf.")
            else:
                break
        self.loss_recorder.append(batch_loss.item())

        # Stop after the number of iterations
        if batch_idx + 1 == num_it:
            break

    self.lr_recorder = self.lr_recorder[:len(self.loss_recorder)]
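Once `lr_find` has populated `lr_recorder` and `loss_recorder`, the usual next step is to plot loss against learning rate on a logarithmic axis and read off a suitable rate. A minimal sketch, assuming a trainer instance named `trainer` that exposes the method above and that matplotlib is installed (both are assumptions, not part of the snippet):

import matplotlib.pyplot as plt

# Hypothetical trainer object exposing lr_find / lr_recorder / loss_recorder
trainer.lr_find(start_lr=1e-7, end_lr=1, num_it=100)

plt.plot(trainer.lr_recorder, trainer.loss_recorder)
plt.xscale("log")  # the learning rate grows geometrically, so a log axis is natural
plt.xlabel("Learning rate")
plt.ylabel("Training loss")
plt.show()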
def lr_range_test(model, train, test, train_loader, test_loader):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.0001)
    lmbda = lambda epoch: 1.4
    # scheduler = OneCycleLR(optimizer, max_lr=0.5, total_steps=25)
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
    learning_lr_trace = []
    for epoch in range(1, 25):
        # get_last_lr() reports the rate currently in use (get_lr() is reserved
        # for internal use and warns when called outside of step())
        print(f'Epoch: {epoch} Learning_Rate {scheduler.get_last_lr()}')
        learning_lr_trace.append(scheduler.get_last_lr())
        train_loss, train_acc = train(model, device, train_loader, optimizer, epoch)
        test_loss, test_acc_l1 = test(model, device, test_loader)
        scheduler.step()
    return learning_lr_trace, train_acc, test_acc_l1
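Since the lambda above always returns 1.4, the learning rate applied at epoch k is 1e-4 * 1.4**(k-1), so the range test sweeps the rate across more than three orders of magnitude over its 24 epochs. A quick check of the endpoints, using only the constants already present in the snippet:

start_lr, factor = 1e-4, 1.4
lrs = [start_lr * factor ** (k - 1) for k in range(1, 25)]
print(f"first epoch: {lrs[0]:g}, last epoch: {lrs[-1]:.3g}")  # 0.0001 -> ~0.23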
num_train = int(P_TRAIN * num_cars)
num_test = num_cars - num_train
train_data, test_data = random_split(dataset, [num_train, num_test])

# set up the train and test data loaders
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# load ResNet-50 with every layer frozen except for layer3-bottleneck5 and beyond,
# and a new fully-connected network which outputs a 196-dim vector
device = get_device()
model = load_resnet50_layer3_bottleneck5(num_car_models)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = MultiplicativeLR(optimizer, lr_lambda=lambda epoch: LR_DECAY)

# set up the output logger
output_dir = '/home/mchobanyan/data/research/transfer/vis/finetune-car-resnet50'
model_dir = os.path.join(output_dir, 'models')
create_folder(model_dir)
logger = TrainingLogger(filepath=os.path.join(output_dir, 'training-log.csv'))

for epoch in tqdm(range(NUM_EPOCHS)):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = test_epoch(model, test_loader, criterion, device)
    scheduler.step()
    logger.add_entry(epoch, train_loss, test_loss, train_acc, test_acc)
    checkpoint(model, os.path.join(model_dir, f'model_epoch{epoch}.pt'))
def record_lr(
    model: torch.nn.Module,
    train_loader: DataLoader,
    batch_transforms,
    optimizer,
    start_lr: float = 1e-7,
    end_lr: float = 1,
    num_it: int = 100,
    amp: bool = False,
):
    """Gridsearch the optimal learning rate for the training.
    Adapted from https://github.com/frgfm/Holocron/blob/master/holocron/trainer/core.py
    """
    if num_it > len(train_loader):
        raise ValueError("the value of `num_it` needs to be lower than the number of available batches")

    model = model.train()
    # Update param groups & LR
    optimizer.defaults["lr"] = start_lr
    for pgroup in optimizer.param_groups:
        pgroup["lr"] = start_lr

    gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
    scheduler = MultiplicativeLR(optimizer, lambda step: gamma)

    lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
    loss_recorder = []

    if amp:
        scaler = torch.cuda.amp.GradScaler()

    for batch_idx, (images, targets) in enumerate(train_loader):
        if torch.cuda.is_available():
            images = images.cuda()
        images = batch_transforms(images)

        # Forward, Backward & update
        optimizer.zero_grad()
        if amp:
            with torch.cuda.amp.autocast():
                train_loss = model(images, targets)["loss"]
            scaler.scale(train_loss).backward()
            # Gradient clipping
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            # Update the params
            scaler.step(optimizer)
            scaler.update()
        else:
            train_loss = model(images, targets)["loss"]
            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
        # Update LR
        scheduler.step()

        # Record
        if not torch.isfinite(train_loss):
            if batch_idx == 0:
                raise ValueError("loss value is NaN or inf.")
            else:
                break
        loss_recorder.append(train_loss.item())

        # Stop after the number of iterations
        if batch_idx + 1 == num_it:
            break

    return lr_recorder[:len(loss_recorder)], loss_recorder
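A possible invocation of `record_lr` is sketched below. This is a hedged example: `model`, `train_loader`, and `batch_transforms` are placeholders for whatever the surrounding training script defines, and picking a rate somewhat below the loss minimum is only one common rule of thumb, not prescribed by the function itself.

import torch

# Placeholder objects: the real model, loader, and transforms come from the caller.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer,
                        start_lr=1e-7, end_lr=1, num_it=100)

# One common heuristic: take a learning rate roughly a decade below the loss minimum.
suggested_lr = lrs[losses.index(min(losses))] / 10
print(f"suggested LR: {suggested_lr:.2e}")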
def fit(
    self,
    X_train,
    y_train,
    X_validation=None,
    y_validation=None,
    loss_key="opt",
    batch_size=128,
    num_workers=0,
    learning_rate=1e-3,
    learning_rate_lambda=0.995,
    max_epoch=10000,
    early_stopping=100,
    device="cpu",
    verbose=False,
):
    """
    Train the model using gradient descent with backpropagation

    Parameters
    ----------
    X_train : {array-like, sparse matrix} of shape (n_samples, n_features)
        Features matrix used to train the model
    y_train : vector-like of shape (n_samples, 1)
        The target vector used to train the model
    X_validation : {array-like, sparse matrix} of shape (n_samples, n_features)
        Features matrix used for early stopping of the training
    y_validation : vector-like of shape (n_samples, 1)
        The target vector used for early stopping of the training
    loss_key : string (default = 'opt')
        Which field of the loss dictionary to optimize
    batch_size : int (default = 128)
        Batch size
    num_workers : int (default = 0)
        Number of CPU workers to use for data loading
    learning_rate : float (default = 1e-3)
        Gradient descent learning rate
    learning_rate_lambda : float (default = 0.995)
        Multiplicative factor applied to the learning rate after each epoch
    max_epoch : int (default = 10000)
        The maximum number of optimization epochs
    early_stopping : int (default = 100)
        The number of epochs without improving the best validation loss allowed before stopping
    device : 'cpu' or 'cuda' (default = 'cpu')
        Device used by pytorch for training the model and using the trained model for encoding/decoding
    verbose : True or False (default = False)
        Verbosity
    """
    assert X_train.shape[1] == self.input_dim
    self.to(device)

    train_loader = torch.utils.data.DataLoader(
        TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train)),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    if X_validation is not None:
        validation_loader = torch.utils.data.DataLoader(
            TensorDataset(torch.Tensor(X_validation), torch.Tensor(y_validation)),
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
        )
    else:
        validation_loader = None

    optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
    scheduler = MultiplicativeLR(
        optimizer, lr_lambda=(lambda epoch: learning_rate_lambda))

    best_validation_loss = None
    iter_no_improve = 0
    for epoch in range(max_epoch):
        self.train()
        training_loss = 0
        for data in train_loader:
            Xb = data[0].to(device)
            optimizer.zero_grad()
            output = self(Xb)
            loss = self.loss(output, Xb)[loss_key]
            loss.backward()
            optimizer.step()
            training_loss += loss.detach().cpu().numpy()

        self.eval()
        validation_loss = 0
        if validation_loader:
            with torch.no_grad():
                for data in validation_loader:
                    Xb = data[0].to(device)
                    output = self(Xb)
                    loss = self.loss(output, Xb)[loss_key]
                    validation_loss += loss.detach().cpu().numpy()

            if best_validation_loss is None or validation_loss < best_validation_loss:
                best_validation_loss = validation_loss
                iter_no_improve = 0
            else:
                iter_no_improve += 1
                if iter_no_improve > early_stopping:
                    if verbose:
                        print(f"Early stopping after {epoch} epochs")
                    break

        scheduler.step()
        if verbose:
            print(
                f"[{epoch}] training loss={training_loss}, validation loss={validation_loss}"
            )
    return self
    fitness_shaping)
train_writer.add_scalar('fitness', raw_fitness.mean(), i)
train_writer.add_scalar('fitness/std', raw_fitness.std(), i)
for p_idx, p in enumerate(population.parameters()):
    train_writer.add_histogram('grads/%d' % p_idx, p.grad, i)
for k, p in population.mixing_logits.items():
    train_writer.add_histogram(
        "entropy/%s" % k,
        t.distributions.Categorical(logits=p).entropy(), i)
means = population.component_means  # (480, 5)
dist = ((means.unsqueeze(0) - means.unsqueeze(1)) ** 2).sum(
    dim=2).sqrt()  # (1, 480, 5) - (480, 1, 5) = (480, 480, 5)
train_writer.add_histogram("dist", dist, i)

optim.step()
sched.step()
population.std *= 0.999

mean_fit = raw_fitness.mean().item()
pbar.set_description("avg fit: %.3f, std: %.3f" %
                     (mean_fit, raw_fitness.std().item()))

all_params = population.parameters()
t.save(all_params, 'last.t')
if mean_fit > best_so_far:
    best_so_far = mean_fit
    t.save(all_params, 'best.t')
    util.upload_results('best.t')
class DeepSeqNet(Module):

    def __init__(self):
        super(DeepSeqNet, self).__init__()

    def _compile(self, optimizer, learning_rate):
        self._set_optim(optimizer, learning_rate)
        self._set_scheduler()
        self._set_criterion()

    def _set_optim(self, optimizer, learning_rate):
        optimizer = optimizer.lower()
        if optimizer == "adam":
            self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        elif optimizer == "rmsprop":
            self.optimizer = optim.RMSprop(self.parameters(), lr=learning_rate)
        else:
            self.optimizer = optim.SGD(self.parameters(), lr=learning_rate)

    def _set_scheduler(self):
        self.scheduler = MultiplicativeLR(self.optimizer, lr_lambda=(lambda x: 0.95))

    def _set_criterion(self):
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x_txt, x_num):
        txt_features = self.txt_net_forward(x_txt)
        num_features = self.num_net_forward(x_num)
        features = torch.cat((txt_features, num_features), 1)
        out_features = self.dropout(features)
        logits = self.fc(out_features)
        return logits

    def txt_net_forward(self, x_txt):
        raise NotImplementedError()

    def num_net_forward(self, x_num):
        for linear in self.linear_layers:
            x_num = self.activation_layer(linear(x_num))
        return x_num

    def fit(self, x_txt, x_num, y):
        self.train()
        self.optimizer.zero_grad()
        y_ = self.forward(x_txt, x_num)
        loss = self.criterion(y_, y)
        loss.backward()
        self.optimizer.step()
        return loss

    def evaluate(self, data_iterator):
        self.eval()
        labels, preds = [], []
        for _, (x_txt, x_num, y) in enumerate(data_iterator):
            x_txt, x_num = x_txt.t(), x_num.t()
            if torch.cuda.is_available():
                x_txt, x_num = x_txt.cuda(), x_num.cuda()
            y_ = self.forward(x_txt, x_num)
            pred = torch.argmax(y_, 1)
            preds.extend(pred.cpu().numpy())
            labels.extend(y.numpy())
        score = accuracy_score(labels, np.array(preds).flatten())
        return score

    def run_epoch(self, train_iterator, val_iterator):
        train_losses = []
        val_accuracies = []
        losses = []
        for i, (x_txt, x_num, y) in enumerate(train_iterator):
            x_txt, x_num = x_txt.t(), x_num.t()
            if torch.cuda.is_available():
                x_txt, x_num = x_txt.cuda(), x_num.cuda()
                y = y.cuda()
            loss = self.fit(x_txt, x_num, y)
            losses.append(loss.item())

            if i % 100 == 0 and i != 0:
                avg_train_loss = float(np.mean(losses))
                train_losses.append(avg_train_loss)
                losses = []
                val_accuracy = self.evaluate(val_iterator)
                print("Iteration: %4d | train loss: %3.2f | val acc.: %.2f" %
                      ((i + 1), avg_train_loss * 100, val_accuracy * 100))

        # Run the scheduler to reduce the learning rate
        self.scheduler.step(epoch=None)
        return train_losses, val_accuracies
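For reference, `MultiplicativeLR` multiplies each parameter group's learning rate by the value returned by `lr_lambda` on every `scheduler.step()` call, so the constant factor of 0.95 used in `_set_scheduler` yields a geometric decay of lr0 * 0.95**n after n steps. A small self-contained sketch with a toy parameter (arbitrary numbers, unrelated to the class above):

import torch
from torch.optim.lr_scheduler import MultiplicativeLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
scheduler = MultiplicativeLR(optimizer, lr_lambda=lambda epoch: 0.95)

for _ in range(3):
    optimizer.step()      # would normally follow a backward pass
    scheduler.step()
    print(optimizer.param_groups[0]["lr"])  # 0.095, 0.09025, 0.0857375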
class CNNModel():

    def __init__(self, args={}):
        self.args = args
        self.parse_args(args)
        self.classifier = ConvNet()
        self.optimizer = optim.Adam(self.classifier.parameters(),
                                    lr=self.lr,
                                    betas=(0.9, 0.98),
                                    eps=1e-9)
        self.loss_function = nn.CrossEntropyLoss()
        lmbda = lambda epoch: self.lr_factor
        self.lr_scheduler = MultiplicativeLR(self.optimizer, lr_lambda=lmbda)

    def parse_args(self, args):
        self.lr = args['learning_rate'] if 'learning_rate' in args else 0.001
        self.max_epoch = args['max_epoch'] if 'max_epoch' in args else 100
        self.early_stop = args['early_stop'] if 'early_stop' in args else False
        self.batch_size = args['batch_size'] if 'batch_size' in args else 64
        self.shuffle = args['shuffle'] if 'shuffle' in args else False
        self.adjust_lr = args['adaptive_learning_rate'] if 'adaptive_learning_rate' in args else False
        self.early_stop_idx_limit = 10
        self.lr_factor = 0.95
        self.min_lr = 5e-6

    # declared as a staticmethod since it never touches instance state
    @staticmethod
    def adjust_learning_rate(optimizer, factor=.5, min_lr=0.00001):
        for i, param_group in enumerate(optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * factor, min_lr)
            param_group['lr'] = new_lr
            logger.info('adjusting learning rate from %.6f to %.6f' % (old_lr, new_lr))

    def train_model(self, train_X, train_Y):
        if self.early_stop:
            best_acc = 0
            best_model = None
            early_stop_idx = 0
            train_X, dev_X = np.split(train_X, [int(len(train_X) * .8)])
            train_Y, dev_Y = np.split(train_Y, [int(len(train_Y) * .8)])
            tensor_dev_X = torch.Tensor(dev_X)
            tensor_dev_Y = torch.Tensor(dev_Y).type(torch.LongTensor)
            dev = TensorDataset(tensor_dev_X, tensor_dev_Y)
            dev_loader = DataLoader(dev, batch_size=self.batch_size, shuffle=False)

        tensor_train_X = torch.Tensor(train_X)
        tensor_train_Y = torch.Tensor(train_Y).type(torch.LongTensor)
        train = TensorDataset(tensor_train_X, tensor_train_Y)
        train_loader = DataLoader(train, batch_size=self.batch_size, shuffle=self.shuffle)

        prev_loss = np.inf
        for epoch in range(self.max_epoch):
            running_loss = 0.0
            for i, data in enumerate(train_loader):
                features, labels = data
                self.optimizer.zero_grad()
                outputs = self.classifier(features.view(features.size(0), 1, 28, 28))
                loss = self.loss_function(outputs, labels)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
            print("epoch: ", epoch, "training loss: ", running_loss)

            if self.adjust_lr and running_loss > prev_loss:
                old_lr = self.optimizer.param_groups[0]['lr']
                self.lr_scheduler.step()
                new_lr = self.optimizer.param_groups[0]['lr']
                print("Adjusting learning rate from %.5f to %.5f" % (old_lr, new_lr))
            prev_loss = running_loss

            if self.early_stop:
                with torch.no_grad():
                    dev_correct = 0.
                    dev_total = 0.
                    dev_loss = 0.
                    for data in dev_loader:
                        features, labels = data
                        outputs = self.classifier(features.view(features.size(0), 1, 28, 28))
                        loss = self.loss_function(outputs, labels)
                        _, predicted = torch.max(outputs.data, 1)
                        dev_total += labels.size(0)
                        dev_correct += (predicted == labels).sum().item()
                        dev_loss += loss.item()

                current_acc = dev_correct / dev_total
                if current_acc > best_acc:
                    print("Best dev accuracy obtained: %.3f" % current_acc)
                    best_model = copy.deepcopy(self.classifier)
                    best_acc = current_acc
                    early_stop_idx = 0
                else:
                    early_stop_idx += 1
                    if early_stop_idx >= self.early_stop_idx_limit:
                        print("early stop triggered")
                        self.classifier = best_model
                        break

        return self

    def score(self, test_X, test_Y):
        tensor_test_X = torch.Tensor(test_X)
        tensor_test_Y = torch.Tensor(test_Y).type(torch.LongTensor)
        test = TensorDataset(tensor_test_X, tensor_test_Y)
        test_loader = DataLoader(test, batch_size=self.batch_size, shuffle=False)

        correct = 0.0
        total = 0.0
        with torch.no_grad():
            for data in test_loader:
                features, labels = data
                outputs = self.classifier(features.view(features.size(0), 1, 28, 28))
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        return correct / total

    @staticmethod
    def Name():
        return "CNN"
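A hedged usage sketch for `CNNModel` follows. It depends on the project-specific `ConvNet` (not shown here), and the random arrays below only illustrate the expected 28x28 input shape; they are placeholders, not real data.

import numpy as np

args = {"learning_rate": 1e-3, "max_epoch": 5, "batch_size": 64,
        "shuffle": True, "early_stop": True, "adaptive_learning_rate": True}
model = CNNModel(args)

# Random stand-in for MNIST-like data: 784 features, reshaped to 1x28x28 inside the model
X = np.random.rand(1000, 784).astype("float32")
y = np.random.randint(0, 10, size=1000)

model.train_model(X, y)
print("accuracy:", model.score(X[:200], y[:200]))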