def __init__(self, params, lr=1e-3, eps=1e-8, alpha=1e-7, beta=1e-5, gamma=0.9,
             momentum=1, sgd_steps=5, K=10):
    if not 0.0 <= lr:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if not 0.0 <= eps:
        raise ValueError("Invalid epsilon value: {}".format(eps))
    if not 1 >= momentum:
        raise ValueError("Invalid momentum value: {}".format(momentum))

    params = list(params)  # materialize so params can be iterated more than once below
    self.iter = 0
    self.sgd = SGD(params, lr=lr, momentum=0.9)
    param_count = np.sum([np.prod(p.size()) for p in params])  # got from MNIST-GAN
    defaults = dict(lr=lr, eps=eps, alpha=alpha, beta=beta * param_count,
                    gamma=gamma, sgd_steps=sgd_steps, momentum=momentum, K=K)
    super(Neumann, self).__init__(params, defaults)
def split_optimizer(model: nn.Module, cfg: dict):
    param_weight_decay, param_bias, param_other = split_params(model)
    if len(param_other) != 0:
        if cfg['optimizer'] == 'Adam':
            optimizer = Adam(param_other, lr=cfg['lr'])
        elif cfg['optimizer'] == 'SGD':
            optimizer = SGD(param_other, lr=cfg['lr'], momentum=cfg['momentum'])
        else:
            raise NotImplementedError("optimizer {:s} is not supported!".format(
                cfg['optimizer']))
        optimizer.add_param_group({
            'params': param_weight_decay,
            'weight_decay': cfg['weight_decay']
        })  # add pg1 with weight_decay
        optimizer.add_param_group({'params': param_bias})
    else:
        if cfg['optimizer'] == 'Adam':
            optimizer = Adam(param_weight_decay,
                             lr=cfg['lr'],
                             weight_decay=cfg['weight_decay'])
        elif cfg['optimizer'] == 'SGD':
            optimizer = SGD(param_weight_decay,
                            lr=cfg['lr'],
                            momentum=cfg['momentum'],
                            weight_decay=cfg['weight_decay'])
        else:
            raise NotImplementedError("optimizer {:s} is not supported!".format(
                cfg['optimizer']))
        optimizer.add_param_group({'params': param_bias})
    return optimizer
def optimizer_choose(model, args, writer, block):
    params = []
    for key, value in model.named_parameters():
        if value.requires_grad:
            params += [{
                'params': [value],
                'lr': args.lr,
                'key': key,
                'weight_decay': args.wd
            }]

    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params)
        block.log('Using Adam optimizer')
    elif args.optimizer == 'sgd':
        momentum = 0.9
        optimizer = SGD(params, momentum=momentum)
        block.log('Using SGD with momentum ' + str(momentum))
    elif args.optimizer == 'sgd_nev':
        momentum = 0.9
        optimizer = SGD(params, momentum=momentum, nesterov=True)
        block.log('Using SGD with momentum ' + str(momentum) + ' and nesterov')
    else:
        momentum = 0.9
        optimizer = SGD(params, momentum=momentum)
        block.log('Using SGD with momentum ' + str(momentum))

    # shutil.copy2(inspect.getfile(optimizer), args.model_saved_name)
    shutil.copy2(__file__, args.model_saved_name)
    return optimizer
def step(self, optimizer: SGD, *args, **kwargs) -> Optional[float]: # type: ignore """ :meth:`step` carries out the following two operations: 1. Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer`` earlier in the iteration). As part of the :meth:`unscale_`, gradients are checked for infs/NaNs. 2. If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled gradients. Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params. ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``. Returns the return value of ``optimizer.step(*args, **kwargs)``. Args: optimizer (torch.optim.Optimizer): Optimizer that applies the gradients. args: Any arguments. kwargs: Any keyword arguments. .. warning:: Closure use is not currently supported. Note: This is an exact copy of the step function in grad_scaler.py. If this copy is deleted then the unittest test_cpu_offload_and_cpu_grads fails. This is because the parent class step function calls the parent class unscale_ function which does not handle torch.distributed.all_reduce on cpu. """ if not self._enabled: return optimizer.step(*args, **kwargs) if "closure" in kwargs: raise RuntimeError("Closure use is not currently supported if GradScaler is enabled.") self._check_scale_growth_tracker("step") # type: ignore optimizer_state = self._per_optimizer_states[id(optimizer)] if optimizer_state["stage"] is OptState.STEPPED: raise RuntimeError("step() has already been called since the last update().") retval = None if hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling: # This optimizer has customized scale-handling logic, so we can call optimizer.step() directly. # The contract with custom optimizers is that their step() should accept an additional, # optional grad_scaler kwarg. We append self to the kwargs so the custom optimizer has full information: # it can query its own state, invoke unscale_ on itself, etc retval = optimizer.step(*args, **dict(kwargs, grad_scaler=self)) optimizer_state["stage"] = OptState.STEPPED return retval if optimizer_state["stage"] is OptState.READY: self.unscale_(optimizer) assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer." retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs) # type: ignore optimizer_state["stage"] = OptState.STEPPED return retval
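# The step() above follows the standard torch.cuda.amp.GradScaler contract
# (scale -> backward -> step -> update); its docstring notes it is a copy of the
# stock grad_scaler.py implementation. A minimal sketch of the training loop that
# contract expects; the toy model and data are illustrative assumptions, not the
# original code, and a CUDA device is assumed to be available.
def _example_grad_scaler_loop():
    import torch
    from torch.cuda.amp import GradScaler, autocast
    from torch.optim import SGD

    model = torch.nn.Linear(8, 1).cuda()
    optimizer = SGD(model.parameters(), lr=0.01)
    scaler = GradScaler()
    for _ in range(3):
        x = torch.randn(4, 8, device="cuda")
        optimizer.zero_grad()
        with autocast():
            loss = model(x).pow(2).mean()
        scaler.scale(loss).backward()   # backward on the scaled loss
        scaler.step(optimizer)          # unscales, checks for inf/NaN, then steps
        scaler.update()                 # adjusts the scale factor for the next iteration
    return scaler.get_scale()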
def _initOptimizer(self):
    modelParallel = self.getModelParallel()
    args = self.getArgs()

    optimizer = SGD(modelParallel.parameters(), args.learning_rate,
                    momentum=args.momentum, weight_decay=args.weight_decay)

    # load optimizer pre-trained state dict if exists
    if self.optimizerStateDict:
        optimizer.load_state_dict(self.optimizerStateDict)

    return optimizer
def create_model(args, model=None): # Create MVCCN model based on the given architecture. if model is None: model = SVCNN(nclasses=args.num_classes, pretraining=args.pretrained, cnn_name=args.arch, feature_extraction=args.feature_extraction) else: model = MVCNN(model, num_views=args.nview) # Multi GPUs if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # Send model to GPU or keep it to the CPU model = model.to(device=args.device) if args.optimizer == "ADAM": optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), args.learning_rate, weight_decay=args.weight_decay) elif args.optimizer == "ADAGRAD": optimizer = Adagrad(filter(lambda p: p.requires_grad, model.parameters()), args.learning_rate, weight_decay=args.weight_decay) else: # If we use feature extraction (features weights are frozen), we need to keep only differentiable params optimizer = SGD(filter(lambda p: p.requires_grad, model.parameters()), args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) return model, optimizer
def configure_optimizers(self): # Add args to define the values params_list = [{ 'params': self.model.parameters(), 'lr': 0.01 }, { 'params': self.classifier0.parameters() }, { 'params': self.classifier1.parameters() }, { 'params': self.classifier2.parameters() }, { 'params': self.classifier3.parameters() }, { 'params': self.classifier4.parameters() }, { 'params': self.classifier5.parameters() }] optim = SGD(params_list, lr=self.learning_rate, weight_decay=5e-4, momentum=0.9, nesterov=True) scheduler = lr_scheduler.StepLR(optim, step_size=40, gamma=0.130) return [optim], [scheduler]
def test_trainer_train_full(fake_loader, simple_neural_net): def transform_fn(batch): inputs, y_true = batch return inputs, y_true.float() metrics = [BinaryAccuracy()] transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) train_loader = DataLoader(FakeData(size=100, image_size=(3, 32, 32), num_classes=2, transform=transform), batch_size=4, shuffle=True, num_workers=1) val_loader = DataLoader(FakeData(size=50, image_size=(3, 32, 32), num_classes=2, transform=transform), batch_size=4, shuffle=True, num_workers=1) model = Net() loss = nn.BCELoss() optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9) plotter = VisdomLinePlotter(env_name=f'Model {11}') callbacks = [ ProgressBar(log_every=10), VisdomEpoch(plotter, on_iteration_every=10), VisdomEpoch(plotter, on_iteration_every=10, monitor='binary_acc'), EarlyStoppingEpoch(min_delta=0.1, monitor='val_running_loss', patience=10), ReduceLROnPlateauCallback(factor=0.1, threshold=0.1, patience=2, verbose=True) ] trainer = TorchTrainer(model) trainer.prepare(optimizer, loss, train_loader, val_loader, transform_fn=transform_fn, callbacks=callbacks, metrics=metrics) epochs = 10 batch_size = 10 trainer.train(epochs, batch_size)
def create_optimizer(name, parameters, lr):
    if name == 'Adadelta':
        return Adadelta(parameters, lr=lr)
    elif name == 'Adam':
        return Adam(parameters, lr=lr)
    elif name == 'SGD':
        return SGD(parameters, lr=lr)
    else:
        raise KeyError(
            'Unknown optimizer type {!r}. Choose from [Adadelta | Adam | SGD]'.format(name))
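# Minimal usage sketch for create_optimizer; the Linear model below is an
# illustrative assumption, not part of the original source.
def _example_create_optimizer():
    import torch.nn as nn

    model = nn.Linear(10, 2)
    optimizer = create_optimizer('SGD', model.parameters(), lr=0.01)
    return optimizer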
def train(self): args = self.args model = self.model logger = self.logger epochRange = self._getEpochRange(self.nEpochs) # init optimizer optimizer = SGD(model.alphas(), args.search_learning_rate, momentum=args.search_momentum, weight_decay=args.search_weight_decay) # init scheduler scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.95, patience=args.search_patience, min_lr=args.search_learning_rate_min) for epoch in epochRange: print('========== Epoch:[{}/{}] =============='.format( epoch, self.nEpochs)) # init epoch train logger trainLogger = HtmlLogger(self.trainFolderPath, epoch) # set loggers dictionary loggersDict = {self.trainLoggerKey: trainLogger} # create epoch jobs epochDataRows = self._createEpochJobs(epoch) # add epoch data rows for jobDataRow in epochDataRows: logger.addDataRow(jobDataRow, trType='<tr bgcolor="#2CBDD6">') # train alphas # epochLossDict, alphasDataRow = self.trainAlphas(self._getNextSearchQueueDataLoader(), optimizer, epoch, loggersDict) epochLossDict, alphasDataRow = self.trainAlphas( self.valid_queue, optimizer, epoch, loggersDict) # update scheduler scheduler.step(epochLossDict.get(self.flopsLoss.totalKey())) # calc model choosePathAlphasAsPartition flops ratio model.choosePathAlphasAsPartition() # add values to alphas data row additionalData = { self.epochNumKey: epoch, self.lrKey: optimizer.param_groups[0]['lr'], self.validFlopsRatioKey: model.flopsRatio() } self._applyFormats(additionalData) # add alphas data row alphasDataRow.update(additionalData) logger.addDataRow(alphasDataRow) # save checkpoint save_checkpoint(self.trainFolderPath, model, optimizer, epochLossDict)
def main(): args = ArgumentsTrainVal().parse_args() print('***************************Arguments****************************') print(args) model, distribution = construct_model(args) print('--------------------------Model Info----------------------------') print(model) if args.resume_model is None: init_network(model) criterion = functional.cross_entropy optimizer = SGD(model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) train_iterator, validate_iterator = construct_train_dataloaders(args) engine_args = [ args.gpu_ids, model, criterion, distribution, train_iterator, validate_iterator, optimizer ] if args.num_classes == 1000 or args.num_classes == 1001: topk = [1, 5] else: topk = [1] # learning rate points lr_points = [] if args.num_classes == 100: lr_points = [150, 225] elif args.num_classes == 1000 or args.num_classes == 1001: lr_points = [30, 60] print('==> Set lr_points for resnet54: {}'.format(lr_points)) engine = construct_engine(*engine_args, checkpoint_iter_freq=args.checkpoint_iter_freq, checkpoint_epoch_freq=args.checkpoint_epoch_freq, checkpoint_save_path=args.checkpoint_save_path, iter_log_freq=args.iter_log_freq, topk=topk, num_classes=args.num_classes, lambda_error=args.lambda_error, environment=args.environment, lr_points=lr_points) if args.ada_train: engine.ada_train(args.maxepoch) else: engine.resume(args.maxepoch, args.resume_epoch, args.resume_iteration)
def test_linreg(n_epochs): print('Test: linear regression') x_values = [] y_values = [] for i in range(5): x_values.append(i) y_values.append(5*i + 2 + torch.randn(1).data.item()) x_data = np.array(x_values, dtype=np.float32).reshape(-1, 1) y_data = np.array(y_values, dtype=np.float32).reshape(-1, 1) answer = RegressionAnswer(LinearRegression(), lambda model : LBFGS(model.parameters()), torch.nn.MSELoss(), x_data, y_data).run(60) test = [ RegressionTest('A2Grad-uni', LinearRegression(), lambda model : A2Grad(model.parameters(), 'uni', 1e-1), torch.nn.MSELoss(), x_data, y_data), RegressionTest('A2Grad-inc', LinearRegression(), lambda model : A2Grad(model.parameters(), 'inc', 1e-1), torch.nn.MSELoss(), x_data, y_data), RegressionTest('A2Grad-exp', LinearRegression(), lambda model : A2Grad(model.parameters(), 'exp', 1e-1), torch.nn.MSELoss(), x_data, y_data), RegressionTest('Adam', LinearRegression(), lambda model : Adam(model.parameters()), torch.nn.MSELoss(), x_data, y_data), RegressionTest('SGD', LinearRegression(), lambda model : SGD(model.parameters(), lr=1e-2), torch.nn.MSELoss(), x_data, y_data), RegressionTest('LBFGS', LinearRegression(), lambda model : LBFGS(model.parameters()), torch.nn.MSELoss(), x_data, y_data) ] plt.figure(figsize=(14, 8)) for i in range(len(test)): test[i].run(n_epochs) plt.plot(np.arange(1, n_epochs + 1), np.array(test[i].errors) - answer, label=test[i].name) plt.legend(fontsize=12, loc=1) plt.title('Linear regression') plt.xlabel('Epoch') plt.ylabel('MSE') plt.savefig('linear.png') plt.figure(figsize=(14, 8)) for i in range(len(test)): plt.plot(np.arange(1, n_epochs + 1), np.array(test[i].errors) - answer, label=test[i].name) plt.legend(fontsize=12, loc=1) plt.ylim(0, 1e-5) plt.title('Linear regression') plt.xlabel('Epoch') plt.ylabel('MSE') plt.savefig('linear2.png') points = np.arange(10, n_epochs, 10) header = "method " for i in points: header += "{} ".format(i) print(header) for i in range(len(test)): test[i].output(points) print('')
def test_trainer_train_without_plugins(fake_loader, simple_neural_net):
    train_loader = fake_loader
    val_loader = fake_loader

    loss = nn.BCELoss()
    optimizer = SGD(simple_neural_net.parameters(), lr=0.001, momentum=0.9)

    trainer = TorchTrainer(simple_neural_net)
    trainer.prepare(optimizer, loss, train_loader, val_loader,
                    transform_fn=transform_fn)
    trainer.train(1, 4)
def train_model(training_file: str):
    oracles = read_oracles_from_file(DiscriminativeOracle, training_file)
    dataset = OracleDataset(oracles)
    dataset.load()
    dataset_loader = DataLoader(dataset, collate_fn=lambda x: x[0])
    parser = DiscriminativeRnnGrammar(action_store=dataset.action_store,
                                      word2id=dataset.word_store,
                                      pos2id=dataset.pos_store,
                                      non_terminal2id=dataset.nt_store)
    optimiser = SGD(parser.parameters(), 0.1)
    train_early_stopping(dataset_loader, dataset_loader, parser, optimiser)
def __init__(self, hparams, ckpt_name, homedir, separate_history, patience): self.hparams = hparams # batch size self.batch_size = 256 # loader self.loader_train, self.loader_valid, self.loader_test = mnist_data_loader( self.batch_size, homedir) # model self.model = Network(hparams) # loss function self.loss_fn = nn.CrossEntropyLoss() # initial learning rate self.lr = hparams['lr'] # momentum coef self.momentum = hparams['momentum'] # optimizer self.optimizer = SGD(self.model.parameters(), lr=self.lr, momentum=self.momentum, nesterov=True) # epoch self.epoch = 0 # check point self.ckpt_dir = homedir + "ckpt" self.ckpt_name = ckpt_name # history self.separate_history = separate_history # patience self.patience = patience try: ckpt = self._load_checkpoint(self.ckpt_name) self.model.load_state_dict(ckpt['state_dict']) self.epoch = ckpt['current_epoch'] except FileNotFoundError: pass
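# Counterpart to the checkpoint loading above: a minimal sketch of how a matching
# checkpoint could be written. The torch.save call and path handling are illustrative
# assumptions; only the 'state_dict' / 'current_epoch' keys come from the loader above.
def _example_save_checkpoint(model, epoch, ckpt_dir, ckpt_name):
    import os
    import torch

    ckpt = {
        'state_dict': model.state_dict(),  # restored via model.load_state_dict()
        'current_epoch': epoch,            # restored into self.epoch
    }
    torch.save(ckpt, os.path.join(ckpt_dir, ckpt_name))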
def preTraining():
    '''
    Creates the model, data and optimizer for training

    -----
    Returns
    -----
    data - tuple with caption_ids, good_image and bad_image
    optim - SGD optimizer (lr=0.001, momentum=0.9)
    model - training model
    '''
    model = Model()
    data = ct.create_triples(300000, 25).reshape(-1, 3)
    optim = SGD(model.parameters(), lr=0.001, momentum=0.9)
    return data, optim, model
def get_optimizer(project_variable, model):
    # project_variable = ProjectVariable()
    if project_variable.optimizer == 'adam':
        optimizer = Adam(model.parameters(), lr=project_variable.learning_rate)
    elif project_variable.optimizer == 'sgd':
        optimizer = SGD(model.parameters(), lr=project_variable.learning_rate,
                        momentum=project_variable.momentum)
    else:
        print('Error: optimizer %s not supported' % project_variable.optimizer)
        optimizer = None

    return optimizer
def test_train_MEG_swap():
    dataset_path = [r"Z:\Desktop\sub8\ball1_sss.fif"]
    dataset = MEG_Dataset(dataset_path, duration=1.0, overlap=0.0)

    train_len, valid_len, test_len = len_split(len(dataset))
    train_dataset, valid_dataset, test_dataset = random_split(
        dataset, [train_len, valid_len, test_len]
    )

    device = "cpu"

    trainloader = DataLoader(
        train_dataset, batch_size=10, shuffle=False, num_workers=1
    )
    validloader = DataLoader(
        valid_dataset, batch_size=2, shuffle=False, num_workers=1
    )

    epochs = 1

    with torch.no_grad():
        x, _, _ = next(iter(trainloader))
        n_times = x.shape[-1]

    net = models.MNet(n_times)

    optimizer = SGD(net.parameters(), lr=0.0001, weight_decay=5e-4)
    loss_function = torch.nn.MSELoss()

    model, _, _ = train(
        net,
        trainloader,
        validloader,
        optimizer,
        loss_function,
        device,
        epochs,
        10,
        0,
        "",
    )

    print("Test succeeded!")
def test_trainer_train_steplr(fake_loader, simple_neural_net): def transform_fn(batch): inputs, y_true = batch return inputs, y_true.float() metrics = [BinaryAccuracy()] transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) train_loader = DataLoader(FakeData(size=100, image_size=(3, 32, 32), num_classes=2, transform=transform), batch_size=4, shuffle=True, num_workers=1) val_loader = DataLoader(FakeData(size=50, image_size=(3, 32, 32), num_classes=2, transform=transform), batch_size=4, shuffle=True, num_workers=1) model = Net() loss = nn.BCELoss() optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9) callbacks = [StepLREpochCallback()] trainer = TorchTrainer(model) trainer.prepare(optimizer, loss, train_loader, val_loader, transform_fn=transform_fn, callbacks=callbacks, metrics=metrics) epochs = 10 batch_size = 10 trainer.train(epochs, batch_size)
def get_optimizer(model: nn.Module, optim: str, lr: float) -> Optimizer:
    """
    Return the optimizer that corresponds to string optim.
    Add the parameters from model and set learning rate to lr

    :param model: model to get the parameters from
    :param optim: name of the optimizer
    :param lr: learning rate to use in the optimizer
    :return:
    """
    if optim == "adagrad":
        return Adagrad(model.parameters(), lr=lr)
    elif optim == "sgd":
        return SGD(model.parameters(), lr=lr)
    elif optim == "rmsprop":
        return RMSprop(model.parameters(), lr=lr)
    elif optim == "adam":
        return Adam(model.parameters(), lr=lr)
    else:
        raise ValueError("Invalid optimizer")
def learn_second(network, lr, model, examples_files, total_example, alpha=1.0, batch_size=20):
    """
    Helper function used to optimize O2
    :param network: network model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param examples_files: list of files containing the examples
    :param total_example: total example for training
    :param alpha: trade-off param
    :param batch_size: size of the batch
    :return: loss value
    """
    num_batch = 0
    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))
    loss_val = 0

    if alpha <= 0:
        return loss_val

    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model,
                graph_utils.combine_example_files_iter(examples_files),
                network.transfer_fn(model.vocab)),
            batch_size,
            long_tensor=LongTensor):
        input, output = batch
        loss = (alpha * network.forward(
            input, output, negative_sampling_fn=model.negative_sample))
        loss_val += loss.item()  # loss.data[0] is no longer valid on 0-dim tensors
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if (num_batch) % 10000 == 0:
            log.info("community embedding batches completed: {}".format(
                num_batch / (total_example / batch_size)))

    log.debug("O2 loss: {}".format(loss_val))
    return loss_val
def split_optimizer(model: nn.Module, cfg: dict):
    param_weight_decay, param_bias, param_other = split_params(model)
    if cfg['optimizer'] == 'Adam':
        optimizer = Adam(param_other, lr=cfg['lr'],
                         betas=(cfg['momentum'], 0.999))  # adjust beta1 to momentum
    elif cfg['optimizer'] == 'SGD':
        optimizer = SGD(param_other, lr=cfg['lr'],
                        momentum=cfg['momentum'], nesterov=True)
    else:
        raise NotImplementedError("optimizer {:s} is not supported!".format(
            cfg['optimizer']))
    optimizer.add_param_group({
        'params': param_weight_decay,
        'weight_decay': cfg['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': param_bias})  # add pg2 (biases)
    return optimizer
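# Minimal usage sketch for split_optimizer; the cfg keys match those read above,
# but the toy model, the values, and the behaviour of split_params are illustrative
# assumptions, not part of the original source.
def _example_split_optimizer():
    import torch.nn as nn

    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
    cfg = {'optimizer': 'SGD', 'lr': 0.01, 'momentum': 0.9, 'weight_decay': 5e-4}
    # The weight-decay group receives cfg['weight_decay'];
    # the bias group is added without decay.
    return split_optimizer(model, cfg)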
def test_trainer_train_with_metric(fake_loader, simple_neural_net):
    train_loader = fake_loader
    val_loader = fake_loader

    metrics = [BinaryAccuracy()]

    loss = nn.BCELoss()
    optimizer = SGD(simple_neural_net.parameters(), lr=0.001, momentum=0.9)

    trainer = TorchTrainer(simple_neural_net)
    trainer.prepare(optimizer, loss, train_loader, val_loader,
                    transform_fn=transform_fn, metrics=metrics, validate_every=1)
    final_result = trainer.train(1, 4)

    assert 'binary_acc' in final_result
    assert 'val_binary_acc' in final_result
def learn_community(network, lr, model, nodes, beta=1.0, batch_size=20):
    """
    Helper function used to optimize O3
    :param network: model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param nodes: nodes on which execute the learning
    :param beta: trade-off value
    :param batch_size: size of the batch
    :return: loss value
    """
    num_batch = 0
    log.info("compute o3")
    optimizer = SGD(network.parameters(), lr)
    loss_val = 0

    if beta <= 0.:
        return loss_val

    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(model, nodes, network.transfer_fn()),
            batch_size,
            long_tensor=LongTensor):
        input, output = batch
        loss = network.forward(input, model)
        loss.data *= (beta / model.k)
        loss_val += loss.item()  # loss.data[0] is no longer valid on 0-dim tensors
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if (num_batch) % 10000 == 0:
            # total_example is not defined in this scope, so report the raw batch count
            log.info("community embedding batches completed: {}".format(num_batch))

    log.debug("O3 loss: {}".format(loss_val))
    return loss_val
def learn_first(network, lr, model, edges, num_iter=1, batch_size=20):
    """
    Helper function used to optimize O1
    :param network: neural network to train
    :param lr: learning rate
    :param model: model containing the shared data
    :param edges: numpy list of edges used for training
    :param num_iter: iteration number over the edges
    :param batch_size: size of the batch
    :return: loss value
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)
    num_batch = 0
    total_batch = (edges.shape[0] * num_iter) / batch_size
    loss_val = 0

    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(model, edges, network.transfer_fn(model.vocab)),
            batch_size,
            long_tensor=LongTensor):
        input, output = batch
        loss = network.forward(input, output,
                               negative_sampling_fn=model.negative_sample)
        loss_val += loss.item()  # loss.data[0] is no longer valid on 0-dim tensors
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if (num_batch) % 10000 == 0:
            log.info("community embedding batches completed: {}".format(
                num_batch / total_batch))

    log.debug("O1 loss: {}".format(loss_val))
    return loss_val
def learn_second(network, lr, model, examples_files, alpha=1.0):
    """
    Helper function used to optimize O2
    :param network: network to optimize
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param examples_files: list of files containing the examples
    :param alpha: trade-off param
    :return:
    """
    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))
    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model,
                graph_utils.combine_example_files_iter(examples_files),
                network.transfer_fn(model.vocab)),
            20):
        input, output = batch
        loss = (alpha * network.forward(
            input, output, negative_sampling_fn=model.negative_sample))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def learn_first(network, lr, model, edges, num_iter=1):
    """
    Helper function used to optimize O1
    :param network: neural network to train
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param edges: numpy list of edges used for training
    :param num_iter: iteration number over the edges
    :return:
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)
    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model,
                emb_utils.RepeatCorpusNTimes(edges, n=num_iter),
                network.transfer_fn(model.vocab)),
            20):
        input, output = batch
        loss = network.forward(input, output,
                               negative_sampling_fn=model.negative_sample)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
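# The learn_first / learn_second / learn_community helpers above all share the same
# per-batch SGD update. A minimal standalone sketch of that pattern follows; the toy
# embedding model and random indices are illustrative assumptions, not the original code.
def _example_sgd_update_loop():
    import torch
    import torch.nn as nn
    from torch.optim import SGD

    model = nn.Embedding(100, 16)          # stand-in for `network`
    optimizer = SGD(model.parameters(), lr=0.1)
    loss_val = 0.0
    for _ in range(10):                    # stand-in for the batch generator
        idx = torch.randint(0, 100, (20,))
        loss = model(idx).pow(2).mean()    # stand-in for network.forward(...)
        loss_val += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss_val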
def main(args): data_dir = args.data_dir figure_path = args.figure_dir model_path = args.model_dir # Set skip_training to False if the model has to be trained, to True if the model has to be loaded. skip_training = False # Set the torch device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Device = {}".format(device)) # Initialize parameters parameters = Params_cross( subject_n=args.sub, hand=args.hand, batch_size=args.batch_size, valid_batch_size=args.batch_size_valid, test_batch_size=args.batch_size_test, epochs=args.epochs, lr=args.learning_rate, wd=args.weight_decay, patience=args.patience, device=device, desc=args.desc, ) # Import data and generate train-, valid- and test-set # Set if generate with RPS values or not (check network architecture used later) print("Testing: {} ".format(parameters.desc)) mlp = False train_dataset = MEG_Cross_Dataset(data_dir, parameters.subject_n, parameters.hand, mode="train") valid_dataset = MEG_Cross_Dataset(data_dir, parameters.subject_n, parameters.hand, mode="val") test_dataset = MEG_Cross_Dataset(data_dir, parameters.subject_n, parameters.hand, mode="test") transfer_dataset = MEG_Cross_Dataset(data_dir, parameters.subject_n, parameters.hand, mode="transf") print("Train dataset len {}, valid dataset len {}, test dataset len {}, " "transfer dataset len {}".format( len(train_dataset), len(valid_dataset), len(test_dataset), len(transfer_dataset), )) # Initialize the dataloaders trainloader = DataLoader(train_dataset, batch_size=parameters.batch_size, shuffle=True, num_workers=4) validloader = DataLoader(valid_dataset, batch_size=parameters.valid_batch_size, shuffle=True, num_workers=4) testloader = DataLoader( test_dataset, batch_size=parameters.test_batch_size, shuffle=False, num_workers=4, ) transferloader = DataLoader(transfer_dataset, batch_size=parameters.valid_batch_size, shuffle=True, num_workers=4) # Initialize network if mlp: net = RPS_MLP() else: # Get the n_times dimension with torch.no_grad(): sample, y, _ = iter(trainloader).next() n_times = sample.shape[-1] net = RPS_MNet_ivan(n_times) print(net) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] 
on 3 GPUs net = nn.DataParallel(net) # Training loop if not skip_training: print("Begin training....") # Check the optimizer before running (different from model to model) optimizer = Adam(net.parameters(), lr=parameters.lr, weight_decay=parameters.wd) # optimizer = SGD(net.parameters(), lr=parameters.lr, momentum=0.9, weight_decay=parameters.wd) scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=15) print("scheduler : ", scheduler) loss_function = torch.nn.MSELoss() # loss_function = torch.nn.L1Loss() start_time = timer.time() if mlp: net, train_loss, valid_loss = train_bp_MLP( net, trainloader, validloader, optimizer, scheduler, loss_function, parameters.device, parameters.epochs, parameters.patience, parameters.hand, model_path, ) else: net, train_loss, valid_loss = train_bp( net, trainloader, validloader, optimizer, scheduler, loss_function, parameters.device, parameters.epochs, parameters.patience, parameters.hand, model_path, ) train_time = timer.time() - start_time print("Training done in {:.4f}".format(train_time)) # visualize the loss as the network trained fig = plt.figure(figsize=(10, 4)) plt.plot(range(1, len(train_loss) + 1), train_loss, label="Training Loss") plt.plot(range(1, len(valid_loss) + 1), valid_loss, label="Validation Loss") # find position of lowest validation loss minposs = valid_loss.index(min(valid_loss)) + 1 plt.axvline( minposs, linestyle="--", color="r", label="Early Stopping Checkpoint", ) plt.xlabel("epochs") plt.ylabel("loss") # plt.ylim(0, 0.5) # consistent scale # plt.xlim(0, len(train_loss)+1) # consistent scale plt.grid(True) plt.legend() plt.tight_layout() plt.show() image1 = fig plt.savefig(os.path.join(figure_path, "loss_plot.pdf")) if not skip_training: # Save the trained model save_pytorch_model(net, model_path, "model.pth") else: # Load the model (properly select the model architecture) net = RPS_MNet() net = load_pytorch_model(net, os.path.join(model_path, "model.pth"), parameters.device) # Evaluation print("Evaluation...") net.eval() y_pred = [] y = [] y_pred_valid = [] y_valid = [] # if RPS integration with torch.no_grad(): if mlp: for _, labels, bp in testloader: labels, bp = ( labels.to(parameters.device), bp.to(parameters.device), ) y.extend(list(labels[:, parameters.hand])) y_pred.extend((list(net(bp)))) for _, labels, bp in validloader: labels, bp = ( labels.to(parameters.device), bp.to(parameters.device), ) y_valid.extend(list(labels[:, parameters.hand])) y_pred_valid.extend((list(net(bp)))) else: for data, labels, bp in testloader: data, labels, bp = ( data.to(parameters.device), labels.to(parameters.device), bp.to(parameters.device), ) y.extend(list(labels[:, parameters.hand])) y_pred.extend((list(net(data, bp)))) for data, labels, bp in validloader: data, labels, bp = ( data.to(parameters.device), labels.to(parameters.device), bp.to(parameters.device), ) y_valid.extend(list(labels[:, parameters.hand])) y_pred_valid.extend((list(net(data, bp)))) # Calculate Evaluation measures print("Evaluation measures") mse = mean_squared_error(y, y_pred) rmse = mean_squared_error(y, y_pred, squared=False) mae = mean_absolute_error(y, y_pred) r2 = r2_score(y, y_pred) rmse_valid = mean_squared_error(y_valid, y_pred_valid, squared=False) r2_valid = r2_score(y_valid, y_pred_valid) valid_loss_last = min(valid_loss) print("Test set ") print("mean squared error {}".format(mse)) print("root mean squared error {}".format(rmse)) print("mean absolute error {}".format(mae)) print("r2 score {}".format(r2)) print("Validation set") 
print("root mean squared error valid {}".format(rmse_valid)) print("r2 score valid {}".format(r2_valid)) print("last value of the validation loss: {}".format(valid_loss_last)) # plot y_new against the true value focus on 100 timepoints fig, ax = plt.subplots(1, 1, figsize=[10, 4]) times = np.arange(200) ax.plot(times, y_pred[0:200], color="b", label="Predicted") ax.plot(times, y[0:200], color="r", label="True") ax.set_xlabel("Times") ax.set_ylabel("Target") ax.set_title("Sub {}, hand {}, Target prediction".format( str(parameters.subject_n), "sx" if parameters.hand == 0 else "dx")) plt.legend() plt.savefig(os.path.join(figure_path, "Times_prediction_focus.pdf")) plt.show() # plot y_new against the true value fig, ax = plt.subplots(1, 1, figsize=[10, 4]) times = np.arange(len(y_pred)) ax.plot(times, y_pred, color="b", label="Predicted") ax.plot(times, y, color="r", label="True") ax.set_xlabel("Times") ax.set_ylabel("Target") ax.set_title("Sub {}, hand {}, target prediction".format( str(parameters.subject_n), "sx" if parameters.hand == 0 else "dx")) plt.legend() plt.savefig(os.path.join(figure_path, "Times_prediction.pdf")) plt.show() # scatterplot y predicted against the true value fig, ax = plt.subplots(1, 1, figsize=[10, 4]) ax.scatter(np.array(y), np.array(y_pred), color="b", label="Predicted") ax.set_xlabel("True") ax.set_ylabel("Predicted") # plt.legend() plt.savefig(os.path.join(figure_path, "Scatter.pdf")) plt.show() # scatterplot y predicted against the true value fig, ax = plt.subplots(1, 1, figsize=[10, 4]) ax.scatter(np.array(y_valid), np.array(y_pred_valid), color="b", label="Predicted") ax.set_xlabel("True") ax.set_ylabel("Predicted") # plt.legend() plt.savefig(os.path.join(figure_path, "Scatter_valid.pdf")) plt.show() # Transfer learning, feature extraction. 
optimizer_trans = SGD(net.parameters(), lr=3e-4) loss_function_trans = torch.nn.MSELoss() # loss_function_trans = torch.nn.L1Loss() if mlp: net, train_loss = train_mlp_transfer( net, transferloader, optimizer_trans, loss_function_trans, parameters.device, 50, parameters.patience, parameters.hand, model_path, ) else: # net, train_loss = train_bp_transfer( # net, # transferloader, # optimizer_trans, # loss_function_trans, # parameters.device, # 50, # parameters.patience, # parameters.hand, # model_path, # ) net, train_loss = train_bp_fine_tuning(net, transferloader, optimizer_trans, loss_function_trans, parameters.device, 50, 10, parameters.hand, model_path) # Evaluation print("Evaluation after transfer...") net.eval() y_pred = [] y = [] # if RPS integration with torch.no_grad(): if mlp: for _, labels, bp in testloader: labels, bp = ( labels.to(parameters.device), bp.to(parameters.device), ) y.extend(list(labels[:, parameters.hand])) y_pred.extend((list(net(bp)))) else: for data, labels, bp in testloader: data, labels, bp = ( data.to(parameters.device), labels.to(parameters.device), bp.to(parameters.device), ) y.extend(list(labels[:, parameters.hand])) y_pred.extend((list(net(data, bp)))) print("Evaluation measures") rmse_trans = mean_squared_error(y, y_pred, squared=False) r2_trans = r2_score(y, y_pred) print("root mean squared error after transfer learning {}".format( rmse_trans)) print("r2 score after transfer learning {}".format(r2_trans)) # scatterplot y predicted against the true value fig, ax = plt.subplots(1, 1, figsize=[10, 4]) ax.scatter(np.array(y), np.array(y_pred), color="b", label="Predicted") ax.set_xlabel("True") ax.set_ylabel("Predicted") # plt.legend() plt.savefig(os.path.join(figure_path, "Scatter_after_trans.pdf")) plt.show() # log the model and parameters using mlflow tracker with mlflow.start_run(experiment_id=args.experiment) as run: for key, value in vars(parameters).items(): mlflow.log_param(key, value) mlflow.log_param("Time", train_time) mlflow.log_metric("MSE", mse) mlflow.log_metric("RMSE", rmse) mlflow.log_metric("MAE", mae) mlflow.log_metric("R2", r2) mlflow.log_metric("RMSE_Valid", rmse_valid) mlflow.log_metric("R2_Valid", r2_valid) mlflow.log_metric("Valid_loss", valid_loss_last) mlflow.log_metric("RMSE_T", rmse_trans) mlflow.log_metric("R2_T", r2_trans) mlflow.log_artifact(os.path.join(figure_path, "Times_prediction.pdf")) mlflow.log_artifact( os.path.join(figure_path, "Times_prediction_focus.pdf")) mlflow.log_artifact(os.path.join(figure_path, "loss_plot.pdf")) mlflow.log_artifact(os.path.join(figure_path, "Scatter.pdf")) mlflow.log_artifact(os.path.join(figure_path, "Scatter_valid.pdf")) mlflow.log_artifact( os.path.join(figure_path, "Scatter_after_trans.pdf")) mlflow.pytorch.log_model(net, "models")
assert False, "Invalid model name" if args.weights_path is None: model.load_vgg_weights() else: model.load_weights(args.weights_path) if args.device == 'cuda': model = nn.DataParallel(model) logging.info(model) ### train info ### if args.optimizer == 'SGD': optimizer = SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) logging.info( 'Optimizer Info:' '\nOptimizer: {}' '\nlearning rate: {}, Momentum: {}, Weight decay: {}\n'.format( args.optimizer, args.learning_rate, args.momentum, args.weight_decay)) elif args.optimizer == 'Adam': optimizer = Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) logging.info('Optimizer Info:' '\nOptimizer: {}' '\nlearning rate: {}, Weight decay: {}\n'.format( args.optimizer, args.learning_rate, args.weight_decay))
def main(): args = parser.parse_args() # REPRODUCIBILITY torch.manual_seed(0) np.random.seed(0) if args.debug: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) # Retrieve views candidates and right number of views if args.case == '1': args.vcand = np.load('view_candidates/vcand_case1.npy') args.nview = 12 elif args.case == '2': args.vcand = np.load('view_candidates/vcand_case2.npy') args.nview = 20 elif args.case == '3': args.vcand = np.load('view_candidates/vcand_case3.npy') args.nview = 160 # Names for the saved checkpoints args.fname_best = 'rotationnet{}_model_best{}.pth.tar'.format(args.nview, datetime.now().strftime("%d_%b_%Y_%H_%M_%S")) args.fname = 'rotationnet{}_model{}.pth.tar'.format(args.nview, datetime.now().strftime("%d_%b_%Y_%H_%M_%S")) logger.debug("Number of view candidates: {}".format(np.shape(args.vcand)[0])) logger.debug("Number of views: {}".format(args.nview)) if torch.cuda.is_available(): args.device = torch.device('cuda') else: args.device = torch.device('cpu') logger.debug("PyTorch is using {}".format(args.device)) # Mini batch size is used to do an update of the gradient so it need to be divisible by the number of views # otherwise one or more classification are not complete if args.batch_size % args.nview != 0: logger.error('Batch size should be multiplication of the number of views, {}'.format(args.nview)) exit(1) # Get number of classes logger.debug("Number of classes: {}".format(args.num_classes)) # Create RotationNet model based on the given architecture. # The output size is (num_classes + wrong_view class) * the number of views model = RotationNet(args.arch, args.pretrained, (args.num_classes + 1) * args.nview, args.feature_extraction, args.depth) # Multi GPUs if torch.cuda.device_count() > 1: logger.debug("Using multiple GPUs") model = torch.nn.DataParallel(model) # Send model to GPU or keep it to the CPU model = model.to(device=args.device) # Define loss function (criterion) and optimizer # Sending loss to cuda is unnecessary because loss function is not stateful # TODO test if it works without sending loss to GPU criterion = nn.CrossEntropyLoss().to(device=args.device) if args.optimizer == "ADAM": optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), args.learning_rate, weight_decay=args.weight_decay) elif args.optimizer == "ADAGRAD": optimizer = Adagrad(filter(lambda p: p.requires_grad, model.parameters()), args.learning_rate, weight_decay=args.weight_decay) else: # If we use feature extraction (features weights are frozen), we need to keep only differentiable params optimizer = SGD(filter(lambda p: p.requires_grad, model.parameters()), args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) # https://stackoverflow.com/questions/58961768/set-torch-backends-cudnn-benchmark-true-or-not # some boost when the network do not change # useless because cluster do not have cudnn # cudnn.benchmark = True logger.info("Model args: {}".format(args)) if args.train_type == 'k-fold': logger.debug("K-fold training") train_k_fold(model, criterion, optimizer, args) elif args.train_type == 'hold-out': logger.debug("Hold-out training") train_hold_out(model, criterion, optimizer, args) elif args.train_type == 'full': logger.debug("Full training") train_all(model, criterion, optimizer, args) elif args.train_type == 'evaluate': logger.debug("Start evaluation on test set") test_model(model, criterion, args) elif args.train_type == 'aligned': logger.debug("Holt-out training on aligned set") train_hold_out_aligned(model, 
criterion,optimizer, args) elif args.train_type == "test": logger.debug("Start real time test") threshold_evaluation(model, args)