def __init__(self, config: ModelConfig):
    """
    Defines the model using the appropriate config class containing all the necessary
    parameters (activation, normalization, loss, optimizer, scheduler, training/eval
    batch size, ...).

    :param config: an object of the ModelConfig class.
    :raises ValueError: if ``_build_model()`` defined no parameters, since the
        optimizer cannot be created without any parameter to optimize.
    """
    super().__init__()  # Python 3 form; equivalent to super(BaseModel, self).__init__()
    self._check_config(config)
    self._name = config.name
    self._set_activation(config.activation)
    self._set_normalization(config.normalization)
    self._set_loss(config.loss)
    self._build_model(config)
    self.initialize_params(config.initializer)  # initialize the parameters using the config

    # peek at the first parameter instead of materializing the full list just to count it
    if next(iter(self.parameters()), None) is None:
        raise ValueError(
            "Model has no parameters defined and optimizer cannot be defined: len(self.parameters) = "
            "0. Parameters have to be defined in the _build_model() method."
        )
    else:  # only set the optimizer if some parameters have been already defined
        self._set_optimizer(config.optimizer)
        self._set_scheduler(config.scheduler)

    # define hparams for later logging
    self.hparams = config.dict()
    self.save_hyperparameters(config.dict())

    # attribute holding the full history of train/val/test metrics for later plotting / analysing
    self.results = {'training': [], 'validation': [], 'test': []}
def setUp(self) -> None:
    """Build a warmup_switch model config and an MNIST training loader for the tests."""
    cfg = read_yaml(CONFIG_PATH)

    # experiment constants kept as attributes for the individual tests
    self.input_size = cfg['architecture']['input_size']
    self.base_lr = cfg['optimizer']['params']['lr']
    self.n_warmup_steps = 1
    self.batch_size = 128
    self.width = cfg['architecture']['width']
    self.L = 3

    # patch the raw config before turning it into a ModelConfig
    cfg['architecture']['n_layers'] = self.L + 1
    cfg['optimizer']['params']['lr'] = self.base_lr
    scheduler_params = {'n_warmup_steps': self.n_warmup_steps, 'calibrate_base_lr': True}
    cfg['scheduler'] = {'name': 'warmup_switch', 'params': scheduler_params}

    self.base_model_config = ModelConfig(cfg)

    # materialize the shuffled training batches once so tests can replay them
    self.training_dataset, _ = load_data(download=False, flatten=True)
    self.train_data_loader = DataLoader(self.training_dataset, shuffle=True,
                                        batch_size=self.batch_size)
    self.batches = list(self.train_data_loader)
def test_resnet_config_from_file(self):
    """Check that a ResNet ModelConfig read from file exposes the expected sections."""
    config = ModelConfig(config_file=self.reset_config_path)
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    # which only reports "False is not true".
    self.assertEqual(config.name, "ResNet")

    # architecture
    self.assertDictEqual(
        {
            "input_size": 28,
            "n_blocks": 2,
            "n_layers": 2,
            "kernel_size": 3,
            "stride": 1,
            "n_channels": 32,
            "bias_conv": True,
            "fc_dim": 256,
            "bias_fc": True,
            "output_size": 10
        }, config.architecture)

    # activation
    self.assertEqual(config.activation.name, "relu")
    self.assertFalse(hasattr(config.activation, "params"))

    # loss
    self.assertEqual(config.loss.name, "cross_entropy")
    self.assertDictEqual(config.loss.params, {"reduction": "mean"})

    # opt
    self.assertEqual(config.optimizer.name, "adam")
    self.assertDictEqual(config.optimizer.params, {
        "lr": 1.0e-4,
        "beta1": 0.9,
        "beta2": 0.999
    })

    # norm
    self.assertEqual(config.normalization.name, "batch_norm_2d")
    self.assertFalse(hasattr(config.normalization, "params"))
def setUp(self) -> None:
    """Instantiate one NTK, one IP and one muP fully-connected net from the base config."""
    self.base_model_config = ModelConfig(read_yaml(CONFIG_PATH))
    self.width = 0
    self.ntk = ntk.FCNTK(self.base_model_config, self.width)
    self.ip = ip.FCIP(self.base_model_config, c=0, width=self.width)
    self.muP = muP.FCmuP(self.base_model_config, self.width)
def _run_trial(self, idx, seed, k, r, batch_size, n, n_train, d, m):
    """
    Run one full train/validation/test trial in its own directory.

    The trial is skipped entirely if its directory already exists.

    :param idx: 0-based trial index; directories and log lines use idx + 1.
    :param seed: random seed for this trial.
    :param k: experiment parameter, logged only.
    :param r: experiment parameter, logged only.
    :param batch_size: experiment parameter, logged only.
    :param n: experiment parameter, logged only.
    :param n_train: experiment parameter, logged only.
    :param d: experiment parameter, logged only.
    :param m: experiment parameter, logged only.
    """
    trial_name = 'trial_{}'.format(idx + 1)
    self.trial_dir = os.path.join(self.exp_dir, trial_name)
    if not os.path.exists(self.trial_dir):  # run trial only if it doesn't already exist
        create_dir(self.trial_dir)  # directory to save the trial
        self.trial_version = '{}_{}'.format(self.version, trial_name)  # version for TensorBoard
        self._set_tb_logger_and_callbacks(trial_name)  # tb logger, checkpoints and early stopping
        log_dir = os.path.join(self.trial_dir, self.LOG_NAME)  # define path to save the logs of the trial
        logger = set_up_logger(log_dir)
        # FIX: log the 1-based trial number (idx + 1) to stay consistent with trial_name
        # and with the sibling _run_trial implementation
        logger.info('----- Trial {:,} with version {} -----\n'.format(idx + 1, self.trial_version))
        self._log_experiment_info(k, r, batch_size, n, n_train, d, m)
        set_random_seeds(seed)  # set random seed for the trial
        logger.info('Random seed used for the script : {:,}'.format(self.SEED))
        logger.info('Random seed used for the trial : {:,}\n'.format(seed))

        config = ModelConfig(config_dict=self.config_dict)  # define the config as a class to pass to the model
        two_layer_net = TwoLayerNet(config, train_hidden=True)  # define the model
        logger.info('Number of model parameters : {:,}'.format(two_layer_net.count_parameters()))
        logger.info('Model architecture :\n{}\n'.format(two_layer_net))

        # training and validation pipeline
        trainer = pl.Trainer(max_epochs=MAX_EPOCHS,
                             max_steps=MAX_STEPS,
                             logger=self.tb_logger,
                             checkpoint_callback=self.checkpoint_callback,
                             num_sanity_val_steps=0,
                             early_stop_callback=self.early_stopping_callback)
        trainer.fit(model=two_layer_net,
                    train_dataloader=self.train_data_loader,
                    val_dataloaders=self.val_data_loader)

        # test pipeline
        test_results = trainer.test(model=two_layer_net, test_dataloaders=self.test_data_loader)
        logger.info('Test results :\n{}\n'.format(test_results))

        # save all training val and test results to pickle file
        with open(os.path.join(self.trial_dir, self.RESULTS_FILE), 'wb') as file:
            pickle.dump(two_layer_net.results, file)
def test_named_config_from_dict(self):
    """A config dict containing only a name yields default (None-valued) sub-configs."""
    config = ModelConfig(config_dict={"name": "Config"})
    # assertEqual / assertIsNone give informative failure messages,
    # unlike assertTrue(expr) which only reports "False is not true".
    self.assertEqual(config.name, "Config")

    # activation
    self.assertIsNone(config.activation.name)
    self.assertFalse(hasattr(config.activation, "params"))

    # loss
    self.assertIsNone(config.loss.name)
    self.assertFalse(hasattr(config.loss, "params"))

    # opt
    self.assertIsNone(config.optimizer.name)
    self.assertFalse(hasattr(config.optimizer, "params"))

    # norm
    self.assertIsNone(config.normalization)
def test_empty_config_from_file(self):
    """An empty config file yields the default name and None-valued sub-configs."""
    config = ModelConfig(config_file=self.empty_config_path)
    # assertEqual / assertIsNone give informative failure messages,
    # unlike assertTrue(expr) which only reports "False is not true".
    self.assertEqual(config.name, "model")

    # activation
    self.assertIsNone(config.activation.name)
    self.assertFalse(hasattr(config.activation, "params"))

    # loss
    self.assertIsNone(config.loss.name)
    self.assertFalse(hasattr(config.loss, "params"))

    # opt
    self.assertIsNone(config.optimizer.name)
    self.assertFalse(hasattr(config.optimizer, "params"))

    # norm
    self.assertIsNone(config.normalization)
def setUp(self) -> None:
    """Prepare a warmup_switch config (no base-lr calibration) and an FcIPLLR model."""
    cfg = read_yaml(CONFIG_PATH)

    self.input_size = cfg['architecture']['input_size']
    self.base_lr = cfg['optimizer']['params']['lr']
    self.n_warmup_steps = 2
    self.width = cfg['architecture']['width']
    self.L = cfg['architecture']['n_layers'] - 1

    cfg['optimizer']['params']['lr'] = self.base_lr
    cfg['scheduler'] = {
        'name': 'warmup_switch',
        'params': {
            'n_warmup_steps': self.n_warmup_steps,
            'calibrate_base_lr': False,
        },
    }

    self.base_model_config = ModelConfig(cfg)
    self.ipllr = FcIPLLR(self.base_model_config, n_warmup_steps=4)
def setUp(self) -> None:
    """Build config, model, data loaders, TensorBoard logger and callbacks for the experiment."""
    config_file = os.path.join('../../pytorch/configs', 'wide_two_layer_net.yaml')
    with open(config_file, 'r') as fh:
        try:
            config_dict = yaml.safe_load(fh)
        except yaml.YAMLError as e:
            raise Exception("Exception while reading yaml file {} : {}".format(config_file, e))

    # parameters of the experiment
    r, k, self.n, d = 0.5, 3, 700, 20
    self.n_train, self.n_val = 256, 256  # n_test = n - (n_train + n_val)
    hidden_dim = config_dict['architecture']['hidden_layer_dim']
    self.version = 'test_new_mac_n={}_d={}_m={}'.format(self.n, d, hidden_dim)

    # config and net
    config_dict['architecture']['input_size'] = d
    self.config = ModelConfig(config_dict=config_dict)
    self.two_layer_net = TwoLayerNet(self.config, train_hidden=True)

    # generate data and build the train/val/test data loaders from it
    generated_ds = self._generate_data(k, r, d, self.n)
    self._set_data_loaders(generated_ds, self.n, self.n_train, self.n_val)

    # TensorBoard logger plus checkpointing / early-stopping callbacks
    self.tb_logger = TensorBoardLogger(save_dir=SAVE_DIR, version=self.version, name=NAME)
    ckpt_template = '{epoch}_{val_accuracy:.3f}_{val_loss:.3f}_{val_auc:.3f}'
    ckpt_path = os.path.join(SAVE_DIR, NAME, self.version, 'checkpoints', ckpt_template)
    self.checkpoint_callback = ModelCheckpoint(filepath=ckpt_path, save_top_k=3, verbose=True,
                                               monitor='val_accuracy', mode='max', prefix='')
    self.early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=1.0e-6,
                                                 patience=5, mode='min')
def setUp(self) -> None:
    """Load the wide two-layer-net config, build the model and a random-data loader."""
    config_file = os.path.join('../../pytorch/configs', 'wide_two_layer_net.yaml')
    with open(config_file, 'r') as fh:
        try:
            config_dict = yaml.safe_load(fh)
        except yaml.YAMLError as e:
            raise Exception("Exception while reading yaml file {} : {}".format(config_file, e))

    self.config = ModelConfig(config_dict=config_dict)
    self.two_layer_net = TwoLayerNet(self.config, train_hidden=True)

    # synthetic random dataset matching the configured input dimension
    r, k, n = 1., 4, 5000
    d = config_dict['architecture']['input_size']
    random_data = random.RandomData(k=k, r=r, d=d, n=n)
    random_data.generate_samples()
    self.ds = dataset.RandomDataset(random_data)
    self.data_loader = DataLoader(self.ds, shuffle=True, batch_size=BATCH_SIZE)
def setUp(self) -> None:
    """Build a non-centered IP model (init mean 1.0) with depth L=4 and width 1024."""
    cfg = read_yaml(CONFIG_PATH)
    depth, hidden_width, init_mean = 4, 1024, 1.0

    self.input_size = cfg['architecture']['input_size']
    self.base_lr = cfg['optimizer']['params']['lr']
    self.n_warmup_steps = 1
    self.width = hidden_width
    self.L = depth
    self.mean = init_mean

    cfg['architecture']['width'] = hidden_width
    cfg['architecture']['n_layers'] = depth + 1
    cfg['optimizer']['params']['lr'] = self.base_lr

    self.base_model_config = ModelConfig(cfg)
    # shift the initializer mean so the parameters are not centered at zero
    self.base_model_config.initializer.params["mean"] = init_mean
    self.ip_non_centered = StandardFCIP(self.base_model_config)
def setUp(self) -> None:
    """Build the ResNet model, train/val/test loaders, TensorBoard logger and checkpointing."""
    # define model
    resnet_config = ModelConfig(config_file=os.path.join(RESOURCES_DIR, 'resnet_config.yaml'))
    self.resnet = ResNetMNIST(resnet_config)

    # shuffle the sample indexes and split them into train / val
    n_train = int(RATIO_TRAIN * N_SAMPLES)
    n_val = N_SAMPLES - n_train  # kept for symmetry; split below only needs n_train
    shuffled = list(range(N_SAMPLES))
    np.random.shuffle(shuffled)
    train_indexes = shuffled[:n_train]
    val_indexes = shuffled[n_train:]

    # set up train and val data loaders
    self.train_data_loader = self.resnet.train_dataloader(data_dir=DATA_DIR, download=False,
                                                          batch_size=BATCH_SIZE,
                                                          indexes=train_indexes)
    self.val_data_loader = self.resnet.train_dataloader(data_dir=DATA_DIR, download=False,
                                                        batch_size=BATCH_SIZE,
                                                        indexes=val_indexes)
    self.test_dataloader = self.resnet.test_dataloader(data_dir=DATA_DIR, download=False,
                                                       batch_size=BATCH_SIZE)

    # TensorBoard logger and checkpoint callback
    self.tb_logger = TensorBoardLogger(save_dir=SAVE_DIR, version=version, name=NAME)
    checkpoints_path = os.path.join(SAVE_DIR, NAME, version, 'checkpoints',
                                    '{epoch}_{val_accuracy:.3f}_{val_loss:.3f}_{val_auc:.3f}')
    self.checkpoint_callback = ModelCheckpoint(filepath=checkpoints_path, save_top_k=3,
                                               verbose=True, monitor='val_accuracy',
                                               mode='max', prefix='')
def setUp(self) -> None:
    """Create a StandardFCIP model from the base config with width 0 (config default)."""
    self.base_model_config = ModelConfig(read_yaml(CONFIG_PATH))
    self.width = 0
    self.standard_ip = StandardFCIP(self.base_model_config, self.width)
def setUp(self) -> None:
    """Load the ResNet test config from the resources directory."""
    resnet_config_path = os.path.join(RESOURCES_DIR, 'resnet_config.yaml')
    self.config = ModelConfig(config_file=resnet_config_path)
def main(activation="relu", base_lr=0.01, batch_size=512, dataset="mnist"):
    """
    Compute and plot rank statistics of muP weight matrices, of their first two
    updates, and of the corresponding (pre-)activations, averaged over N_TRIALS
    random trials. Results (csv tables and figures) are saved under
    FIGURES_DIR/<dataset>.

    :param activation: name of the activation function written into the config.
    :param base_lr: base learning rate written into the optimizer config.
    :param batch_size: training batch size.
    :param dataset: one of ['mnist', 'cifar10', 'cifar100'].
    """
    config_path = os.path.join(CONFIG_PATH, 'fc_ipllr_{}.yaml'.format(dataset))
    figures_dir = os.path.join(FIGURES_DIR, dataset)
    create_dir(figures_dir)
    log_path = os.path.join(figures_dir, 'log_muP_{}.txt'.format(activation))
    logger = set_up_logger(log_path)

    logger.info('Parameters of the run:')
    logger.info('activation = {}'.format(activation))
    logger.info('base_lr = {}'.format(base_lr))
    logger.info('batch_size = {:,}'.format(batch_size))
    logger.info('Random SEED : {:,}'.format(SEED))
    logger.info('Number of random trials for each model : {:,}'.format(N_TRIALS))

    try:
        set_random_seeds(SEED)  # set random seed for reproducibility
        config_dict = read_yaml(config_path)

        version = 'L={}_m={}_act={}_lr={}_bs={}'.format(L, width, activation, base_lr, batch_size)
        template_name = 'muP_{}_ranks_{}_' + version

        config_dict['architecture']['width'] = width
        config_dict['architecture']['n_layers'] = L + 1
        config_dict['optimizer']['params']['lr'] = base_lr
        config_dict['activation']['name'] = activation

        base_model_config = ModelConfig(config_dict)

        # Load data & define models
        logger.info('Loading data ...')
        if dataset == 'mnist':
            from utils.dataset.mnist import load_data
        elif dataset == 'cifar10':
            from utils.dataset.cifar10 import load_data
        elif dataset == 'cifar100':
            # TODO : add cifar100 to utils.dataset
            pass
        else:
            error = ValueError(
                "dataset must be one of ['mnist', 'cifar10', 'cifar100'] but was {}".format(dataset))
            logger.error(error)
            raise error

        training_dataset, test_dataset = load_data(download=False, flatten=True)
        train_data_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
        batches = list(train_data_loader)
        full_x = torch.cat([a for a, _ in batches], dim=0)
        full_y = torch.cat([b for _, b in batches], dim=0)

        logger.info('Defining models')
        base_model_config.scheduler = None
        muPs = [FCmuP(base_model_config) for _ in range(N_TRIALS)]
        # scale the lr of the input layer (param group 0) by the input dimension + 1
        for muP in muPs:
            for i, param_group in enumerate(muP.optimizer.param_groups):
                if i == 0:
                    param_group['lr'] = param_group['lr'] * (muP.d + 1)

        # save initial models
        muPs_0 = [deepcopy(muP) for muP in muPs]

        # train model one step
        logger.info('Training model a first step (t=1)')
        x, y = batches[0]
        muPs_1 = []
        for muP in muPs:
            train_model_one_step(muP, x, y, normalize_first=True)
            muPs_1.append(deepcopy(muP))

        # train models for a second step
        logger.info('Training model a second step (t=2)')
        x, y = batches[1]
        muPs_2 = []
        for muP in muPs:
            train_model_one_step(muP, x, y, normalize_first=True)
            muPs_2.append(deepcopy(muP))

        # set eval mode for all models
        for i in range(N_TRIALS):
            muPs[i].eval()
            muPs_0[i].eval()
            muPs_1[i].eval()
            muPs_2[i].eval()

        logger.info('Storing initial and update matrices')
        # define W0 and b0
        W0s = []
        b0s = []
        for muP_0 in muPs_0:
            W0, b0 = get_W0_dict(muP_0, normalize_first=True)
            W0s.append(W0)
            b0s.append(b0)

        # define Delta_W_1 and Delta_b_1
        Delta_W_1s = []
        Delta_b_1s = []
        for i in range(N_TRIALS):
            Delta_W_1, Delta_b_1 = get_Delta_W1_dict(muPs_0[i], muPs_1[i], normalize_first=True)
            Delta_W_1s.append(Delta_W_1)
            Delta_b_1s.append(Delta_b_1)

        # define Delta_W_2 and Delta_b_2
        Delta_W_2s = []
        Delta_b_2s = []
        for i in range(N_TRIALS):
            Delta_W_2, Delta_b_2 = get_Delta_W2_dict(muPs_1[i], muPs_2[i], normalize_first=True)
            Delta_W_2s.append(Delta_W_2)
            Delta_b_2s.append(Delta_b_2)

        x, y = full_x, full_y  # compute pre-activations on full batch

        # contributions after first step
        h0s = []
        delta_h_1s = []
        h1s = []
        x1s = []
        for i in range(N_TRIALS):
            h0, delta_h_1, h1, x1 = get_contributions_1(x, muPs_1[i], W0s[i], b0s[i],
                                                        Delta_W_1s[i], Delta_b_1s[i],
                                                        normalize_first=True)
            h0s.append(h0)
            delta_h_1s.append(delta_h_1)
            # BUG FIX: was h1s.append(h0), which duplicated h0 and made every
            # downstream h1 rank/diversity statistic actually describe h0
            h1s.append(h1)
            x1s.append(x1)

        # ranks of initial weight matrices and first two updates
        logger.info('Computing ranks of weight matrices ...')
        weight_ranks_dfs_dict = dict()
        tol = None  # default SVD tolerance
        weight_ranks_dfs_dict['svd_default'] = [
            get_svd_ranks_weights(W0s[i], Delta_W_1s[i], Delta_W_2s[i], L, tol=tol)
            for i in range(N_TRIALS)
        ]
        tol = 1e-7  # explicit tolerance for the second rank estimate
        weight_ranks_dfs_dict['svd_tol'] = [
            get_svd_ranks_weights(W0s[i], Delta_W_1s[i], Delta_W_2s[i], L, tol=tol)
            for i in range(N_TRIALS)
        ]
        weight_ranks_dfs_dict['squared_tr'] = [
            get_square_trace_ranks_weights(W0s[i], Delta_W_1s[i], Delta_W_2s[i], L)
            for i in range(N_TRIALS)
        ]

        weight_ranks_df_dict = {key: get_concatenated_ranks_df(weight_ranks_dfs_dict[key])
                                for key in weight_ranks_dfs_dict.keys()}
        avg_ranks_df_dict = {key: get_avg_ranks_dfs(weight_ranks_df_dict[key])
                             for key in weight_ranks_df_dict.keys()}

        logger.info('Saving weights ranks data frames to csv ...')
        for key in weight_ranks_df_dict.keys():
            logger.info(key)
            logger.info('\n' + str(avg_ranks_df_dict[key]) + '\n\n')
            avg_ranks_df_dict[key].to_csv(
                os.path.join(figures_dir, template_name.format(key, 'weights') + '.csv'),
                header=True, index=True)

        ranks_dfs = [weight_ranks_df_dict['svd_default'],
                     weight_ranks_df_dict['svd_tol'],
                     weight_ranks_df_dict['squared_tr']]

        # plot weights ranks
        logger.info('Plotting weights ranks')
        plt.figure(figsize=(12, 6))
        plot_weights_ranks_vs_layer('W0', ranks_dfs, tol, L, width, base_lr, batch_size,
                                    y_scale='log')
        plt.savefig(os.path.join(figures_dir, template_name.format('W0', 'weights') + '.png'))

        plt.figure(figsize=(12, 6))
        plot_weights_ranks_vs_layer('Delta_W_1', ranks_dfs, tol, L, width, base_lr, batch_size,
                                    y_scale='log')
        plt.savefig(os.path.join(figures_dir, template_name.format('Delta_W_1', 'weights') + '.png'))

        plt.figure(figsize=(12, 6))
        plot_weights_ranks_vs_layer('Delta_W_2', ranks_dfs, tol, L, width, base_lr, batch_size,
                                    y_scale='log')
        plt.savefig(os.path.join(figures_dir, template_name.format('Delta_W_2', 'weights') + '.png'))

        # ranks of the pre-activations
        logger.info('Computing ranks of (pre-)activations ...')
        act_ranks_dfs_dict = dict()
        tol = None
        act_ranks_dfs_dict['svd_default'] = [
            get_svd_ranks_acts(h0s[i], delta_h_1s[i], h1s[i], x1s[i], L, tol=tol)
            for i in range(N_TRIALS)
        ]
        tol = 1e-7
        act_ranks_dfs_dict['svd_tol'] = [
            get_svd_ranks_acts(h0s[i], delta_h_1s[i], h1s[i], x1s[i], L, tol=tol)
            for i in range(N_TRIALS)
        ]
        act_ranks_dfs_dict['squared_tr'] = [
            get_square_trace_ranks_acts(h0s[i], delta_h_1s[i], h1s[i], x1s[i], L)
            for i in range(N_TRIALS)
        ]

        act_ranks_df_dict = {key: get_concatenated_ranks_df(act_ranks_dfs_dict[key])
                             for key in act_ranks_dfs_dict.keys()}
        avg_ranks_df_dict = {key: get_avg_ranks_dfs(act_ranks_df_dict[key])
                             for key in act_ranks_df_dict.keys()}

        logger.info('Saving (pre-)activation ranks data frames to csv ...')
        for key in avg_ranks_df_dict.keys():
            logger.info(key)
            logger.info('\n' + str(avg_ranks_df_dict[key]) + '\n\n')
            avg_ranks_df_dict[key].to_csv(
                os.path.join(figures_dir, template_name.format(key, 'acts') + '.csv'),
                header=True, index=True)

        ranks_dfs = [act_ranks_df_dict['svd_default'],
                     act_ranks_df_dict['svd_tol'],
                     act_ranks_df_dict['squared_tr']]

        logger.info('Plotting (pre-)activation ranks')
        plt.figure(figsize=(12, 6))
        plot_acts_ranks_vs_layer('h0', ranks_dfs, tol, L, width, base_lr, batch_size, y_scale='log')
        plt.savefig(os.path.join(figures_dir, template_name.format('h0', 'acts') + '.png'))

        plt.figure(figsize=(12, 6))
        plot_acts_ranks_vs_layer('h1', ranks_dfs, tol, L, width, base_lr, batch_size, y_scale='log')
        plt.savefig(os.path.join(figures_dir, template_name.format('h1', 'acts') + '.png'))

        plt.figure(figsize=(12, 6))
        plot_acts_ranks_vs_layer('x1', ranks_dfs, tol, L, width, base_lr, batch_size, y_scale='log')
        plt.savefig(os.path.join(figures_dir, template_name.format('x1', 'acts') + '.png'))

        plt.figure(figsize=(12, 6))
        plot_acts_ranks_vs_layer('delta_h_1', ranks_dfs, tol, L, width, base_lr, batch_size,
                                 y_scale='log')
        plt.savefig(os.path.join(figures_dir, template_name.format('delta_h_1', 'acts') + '.png'))

        # diversity in terms of the index of the maximum entry
        logger.info('Computing diversity of the maximum entry of pre-activations...')
        max_acts_diversity_dfs = [get_max_acts_diversity(h0s[i], delta_h_1s[i], h1s[i], L)
                                  for i in range(N_TRIALS)]
        max_acts_diversity_df = get_concatenated_ranks_df(max_acts_diversity_dfs)
        avg_max_acts_diversity_df = get_avg_ranks_dfs(max_acts_diversity_df)
        logger.info('Diversity of the maximum activation index df:')
        logger.info(str(avg_max_acts_diversity_df))
        avg_max_acts_diversity_df.to_csv(
            os.path.join(figures_dir, 'muP_max_acts_' + version + '.csv'),
            header=True, index=True)

    except Exception as e:
        logger.exception("Exception when running the script : {}".format(e))
def main(activation="relu", n_steps=300, base_lr=0.01, batch_size=512, dataset="mnist"):
    """
    Train three variants of the muP model (plain, renormalized-first-layer, and
    renormalized with scaled first-layer lr) for n_steps and plot their training
    loss and chi curves. Figures are saved under FIGURES_DIR/<dataset>.

    :param activation: activation name written into the config.
    :param n_steps: number of training steps per model.
    :param base_lr: base learning rate written into the optimizer config.
    :param batch_size: training batch size.
    :param dataset: one of ['mnist', 'cifar10', 'cifar100'].
    """
    config_path = os.path.join(CONFIG_PATH, 'fc_ipllr_{}.yaml'.format(dataset))
    figures_dir = os.path.join(FIGURES_DIR, dataset)
    create_dir(figures_dir)
    log_path = os.path.join(figures_dir, 'log_muP_{}.txt'.format(activation))
    logger = set_up_logger(log_path)

    # log the run parameters for later inspection
    logger.info('Parameters of the run:')
    logger.info('activation = {}'.format(activation))
    logger.info('n_steps = {:,}'.format(n_steps))
    logger.info('base_lr = {}'.format(base_lr))
    logger.info('batch_size = {:,}'.format(batch_size))
    logger.info('Random SEED : {:,}'.format(SEED))
    logger.info('Number of random trials for each model : {:,}'.format(N_TRIALS))

    try:
        set_random_seeds(SEED)  # set random seed for reproducibility
        config_dict = read_yaml(config_path)

        fig_name_template = 'muP_{}_{}_L={}_m={}_act={}_lr={}_bs={}.png'

        # patch the raw config with the run parameters (width and L are module-level globals)
        config_dict['architecture']['width'] = width
        config_dict['architecture']['n_layers'] = L + 1
        config_dict['optimizer']['params']['lr'] = base_lr
        config_dict['activation']['name'] = activation

        base_model_config = ModelConfig(config_dict)

        # Load data & define models
        logger.info('Loading data ...')
        if dataset == 'mnist':
            from utils.dataset.mnist import load_data
        elif dataset == 'cifar10':
            from utils.dataset.cifar10 import load_data
        elif dataset == 'cifar100':
            # TODO : add cifar100 to utils.dataset
            # NOTE(review): this branch only sets output_size and falls through;
            # load_data is never imported here, so the call below would raise
            # NameError for cifar100 — confirm this placeholder is intentional.
            config_dict['architecture']['output_size'] = 100
            pass
        else:
            error = ValueError(
                "dataset must be one of ['mnist', 'cifar10', 'cifar100'] but was {}"
                .format(dataset))
            logger.error(error)
            raise error

        training_dataset, test_dataset = load_data(download=False, flatten=True)
        train_data_loader = DataLoader(training_dataset,
                                       shuffle=True,
                                       batch_size=batch_size)
        batches = list(train_data_loader)

        logger.info('Defining models')
        base_model_config.scheduler = None
        # three families of models, N_TRIALS independent instances each
        muPs = [FCmuP(base_model_config) for _ in range(N_TRIALS)]
        muPs_renorm = [FCmuP(base_model_config) for _ in range(N_TRIALS)]
        muPs_renorm_scale_lr = [
            FCmuP(base_model_config) for _ in range(N_TRIALS)
        ]

        # scale the lr of the input layer (param group 0) by the input dimension + 1
        for muP in muPs_renorm_scale_lr:
            for i, param_group in enumerate(muP.optimizer.param_groups):
                if i == 0:
                    param_group['lr'] = param_group['lr'] * (muP.d + 1)

        logger.info('Copying parameters of base muP')
        for i in range(N_TRIALS):
            # share the same initial parameters across the three variants of trial i
            muPs_renorm[i].copy_initial_params_from_model(muPs[i])
            muPs_renorm_scale_lr[i].copy_initial_params_from_model(muPs[i])
            # NOTE(review): initialize_params() is called right after copying —
            # presumably it re-applies scaling to the copied values rather than
            # re-randomizing them; confirm against FCmuP.initialize_params.
            muPs_renorm[i].initialize_params()
            muPs_renorm_scale_lr[i].initialize_params()

        results = dict()
        logger.info('Generating training results ...')
        # collect (losses, chis) per trial; renorm variants normalize the first layer
        results['muP'] = [
            collect_training_losses(muPs[i],
                                    batches,
                                    n_steps,
                                    normalize_first=False) for i in range(N_TRIALS)
        ]

        results['muP_renorm'] = [
            collect_training_losses(muPs_renorm[i],
                                    batches,
                                    n_steps,
                                    normalize_first=True) for i in range(N_TRIALS)
        ]

        results['muP_renorm_scale_lr'] = [
            collect_training_losses(muPs_renorm_scale_lr[i],
                                    batches,
                                    n_steps,
                                    normalize_first=True) for i in range(N_TRIALS)
        ]

        mode = 'training'

        # split the (loss, chi) tuples into two dicts keyed by model variant
        losses = dict()
        for key, res in results.items():
            losses[key] = [r[0] for r in res]

        chis = dict()
        for key, res in results.items():
            chis[key] = [r[1] for r in res]

        # Plot losses and derivatives
        logger.info('Saving figures at {}'.format(figures_dir))

        key = 'loss'
        plt.figure(figsize=(12, 8))
        # renorm_first is a module-level global flag used only for the plot label
        plot_losses_models(losses,
                           key=key,
                           L=L,
                           width=width,
                           activation=activation,
                           lr=base_lr,
                           batch_size=batch_size,
                           mode=mode,
                           normalize_first=renorm_first,
                           marker=None,
                           name='muP')
        plt.ylim(0, 2.5)
        plt.savefig(
            os.path.join(
                figures_dir,
                fig_name_template.format(mode, key, L, width, activation,
                                         base_lr, batch_size)))

        key = 'chi'
        plt.figure(figsize=(12, 8))
        plot_losses_models(chis,
                           key=key,
                           L=L,
                           width=width,
                           activation=activation,
                           lr=base_lr,
                           batch_size=batch_size,
                           mode=mode,
                           marker=None,
                           name='muP')
        plt.savefig(
            os.path.join(
                figures_dir,
                fig_name_template.format(mode, key, L, width, activation,
                                         base_lr, batch_size)))

    except Exception as e:
        # top-level boundary: log the full traceback instead of crashing the script
        logger.exception("Exception when running the script : {}".format(e))
def _run_trial(self, idx):
    """
    Run one full train/validation/test trial in its own directory.

    Skips the trial (with a warning) if its directory already exists. On any
    failure during the pipeline, the model's partial results are pickled before
    the exception is re-raised.

    :param idx: 0-based trial index; directories and log lines use idx + 1.
    """
    trial_name = 'trial_{}'.format(idx + 1)
    self.trial_dir = os.path.join(
        self.base_experiment_path, trial_name)  # folder to hold trial results
    if not os.path.exists(
            self.trial_dir):  # run trial only if it doesn't already exist
        create_dir(self.trial_dir)  # directory to save the trial
        set_random_seeds(
            self.trial_seeds[idx])  # set random seed for the trial
        self._set_tb_logger_and_callbacks(
            trial_name)  # tb logger, checkpoints and early stopping
        log_dir = os.path.join(
            self.trial_dir,
            self.LOG_NAME)  # define path to save the logs of the trial
        logger = set_up_logger(log_dir)

        config = ModelConfig(
            config_dict=self.config_dict
        )  # define the config as a class to pass to the model
        model = self.model(config)  # define the model

        logger.info('----- Trial {:,} ----- with model config {}\n'.format(
            idx + 1, self.model_config))
        self._log_experiment_info(len(self.train_dataset),
                                  len(self.val_dataset),
                                  len(self.test_dataset), model.std)
        logger.info('Random seed used for the script : {:,}'.format(
            self.SEED))
        logger.info('Number of model parameters : {:,}'.format(
            model.count_parameters()))
        logger.info('Model architecture :\n{}\n'.format(model))
        try:
            # training and validation pipeline
            trainer = pl.Trainer(
                max_epochs=self.max_epochs,
                max_steps=self.max_steps,
                logger=self.tb_logger,
                checkpoint_callback=self.checkpoint_callback,
                num_sanity_val_steps=0,
                early_stop_callback=self.early_stopping_callback)
            trainer.fit(model=model,
                        train_dataloader=self.train_data_loader,
                        val_dataloaders=self.val_data_loader)

            # test pipeline
            test_results = trainer.test(
                model=model, test_dataloaders=self.test_data_loader)
            logger.info('Test results :\n{}\n'.format(test_results))

            # save all training, val and test results to pickle file
            with open(os.path.join(self.trial_dir, self.RESULTS_FILE),
                      'wb') as file:
                pickle.dump(model.results, file)
        except Exception as e:
            # dump and save results before exiting
            with open(os.path.join(self.trial_dir, self.RESULTS_FILE),
                      'wb') as file:
                pickle.dump(model.results, file)
            logger.warning('model results dumped before interruption')
            logger.exception(
                "Exception while running the train-val-test pipeline : {}".
                format(e))
            # NOTE(review): re-wrapping as Exception(e) discards the original
            # exception type; `raise` alone would preserve type and traceback.
            raise Exception(e)
    else:
        # trial directory already exists: skip the run and warn
        logging.warning(
            "Directory for trial {:,} of experiment {} already exists".
            format(idx, self.model_config))
def main(activation="relu", n_steps=300, base_lr=0.01, batch_size=512, dataset="mnist"):
    """
    Train IPLLR model variants (calibrated, calibrated + renormalized first layer,
    calibrated + renormalized + scaled first-layer lr) for n_steps and plot their
    training loss and chi curves. Figures are saved under FIGURES_DIR/<dataset>.

    :param activation: activation name written into the config.
    :param n_steps: number of training steps per model.
    :param base_lr: base learning rate written into the optimizer config.
    :param batch_size: training batch size.
    :param dataset: one of ['mnist', 'cifar10', 'cifar100'].
    """
    config_path = os.path.join(CONFIG_PATH, 'fc_ipllr_{}.yaml'.format(dataset))
    figures_dir = os.path.join(FIGURES_DIR, dataset)
    create_dir(figures_dir)
    log_path = os.path.join(figures_dir, 'log_ipllr_{}.txt'.format(activation))
    logger = set_up_logger(log_path)

    # log the run parameters for later inspection
    logger.info('Parameters of the run:')
    logger.info('activation = {}'.format(activation))
    logger.info('n_steps = {:,}'.format(n_steps))
    logger.info('base_lr = {}'.format(base_lr))
    logger.info('batch_size = {:,}'.format(batch_size))
    logger.info('dataset = {}'.format(dataset))
    logger.info('Random SEED : {:,}'.format(SEED))
    logger.info(
        'Number of random trials for each model : {:,}'.format(N_TRIALS))

    try:
        set_random_seeds(SEED)  # set random seed for reproducibility
        config_dict = read_yaml(config_path)

        fig_name_template = 'IPLLRs_1_last_small_{}_{}_L={}_m={}_act={}_lr={}_bs={}.png'

        # patch the raw config with the run parameters (width, L and
        # n_warmup_steps are module-level globals)
        config_dict['architecture']['width'] = width
        config_dict['architecture']['n_layers'] = L + 1
        config_dict['optimizer']['params']['lr'] = base_lr
        config_dict['activation']['name'] = activation
        config_dict['scheduler'] = {
            'name': 'warmup_switch',
            'params': {
                'n_warmup_steps': n_warmup_steps,
                'calibrate_base_lr': True,
                'default_calibration': False
            }
        }

        # Load data & define models
        logger.info('Loading data ...')
        if dataset == 'mnist':
            from utils.dataset.mnist import load_data
        elif dataset == 'cifar10':
            from utils.dataset.cifar10 import load_data
        elif dataset == 'cifar100':
            # TODO : add cifar100 to utils.dataset
            # NOTE(review): this branch falls through without importing
            # load_data, so the call below would raise NameError for cifar100 —
            # confirm this placeholder is intentional.
            pass
        else:
            error = ValueError(
                "dataset must be one of ['mnist', 'cifar10', 'cifar100'] but was {}"
                .format(dataset))
            logger.error(error)
            raise error

        training_dataset, test_dataset = load_data(download=False,
                                                   flatten=True)
        train_data_loader = DataLoader(training_dataset,
                                       shuffle=True,
                                       batch_size=batch_size)
        batches = list(train_data_loader)
        logger.info('Number of batches (steps) per epoch : {:,}'.format(
            len(batches)))
        logger.info('Number of epochs : {:,}'.format(n_steps // len(batches)))

        # baseline IPLLR models without base-lr calibration
        config_dict['scheduler']['params']['calibrate_base_lr'] = False
        config = ModelConfig(config_dict)
        logger.info('Defining models')
        ipllrs = [FcIPLLR(config) for _ in range(N_TRIALS)]

        # calibrated variants share the same (calibrating) config
        config_dict['scheduler']['params']['calibrate_base_lr'] = True
        config = ModelConfig(config_dict)
        ipllrs_calib = [
            FcIPLLR(config, lr_calibration_batches=batches)
            for _ in range(N_TRIALS)
        ]
        ipllrs_calib_renorm = [
            FcIPLLR(config, lr_calibration_batches=batches)
            for _ in range(N_TRIALS)
        ]
        ipllrs_calib_renorm_scale_lr = [
            FcIPLLR(config, lr_calibration_batches=batches)
            for _ in range(N_TRIALS)
        ]

        logger.info('Copying parameters of base ipllr')
        for i in range(N_TRIALS):
            # share the same initial parameters across the variants of trial i
            ipllrs_calib[i].copy_initial_params_from_model(ipllrs[i])
            ipllrs_calib_renorm[i].copy_initial_params_from_model(ipllrs[i])
            ipllrs_calib_renorm_scale_lr[i].copy_initial_params_from_model(
                ipllrs[i])
            # NOTE(review): initialize_params() is called right after copying —
            # presumably it re-applies scaling to the copied values rather than
            # re-randomizing them; confirm against FcIPLLR.initialize_params.
            ipllrs_calib[i].initialize_params()
            ipllrs_calib_renorm[i].initialize_params()
            ipllrs_calib_renorm_scale_lr[i].initialize_params()

        # Make sure calibration takes into account normalization
        logger.info('Recalibrating lrs with new initialisation')
        for ipllr in ipllrs_calib:
            initial_base_lrs = ipllr.scheduler.calibrate_base_lr(
                ipllr, batches=batches, normalize_first=False)
            ipllr.scheduler._set_param_group_lrs(initial_base_lrs)

        for ipllr in ipllrs_calib_renorm:
            initial_base_lrs = ipllr.scheduler.calibrate_base_lr(
                ipllr, batches=batches, normalize_first=True)
            ipllr.scheduler._set_param_group_lrs(initial_base_lrs)

        for ipllr in ipllrs_calib_renorm_scale_lr:
            initial_base_lrs = ipllr.scheduler.calibrate_base_lr(
                ipllr, batches=batches, normalize_first=True)
            ipllr.scheduler._set_param_group_lrs(initial_base_lrs)

        # scale lr of first layer if needed
        for ipllr in ipllrs_calib_renorm_scale_lr:
            ipllr.scheduler.warm_lrs[0] = ipllr.scheduler.warm_lrs[0] * (
                ipllr.d + 1)

        # with calibration
        results = dict()
        logger.info('Generating training results ...')
        # collect (losses, chis) per trial; renorm variants normalize the first layer
        results['ipllr_calib'] = [
            collect_training_losses(ipllrs_calib[i],
                                    batches,
                                    n_steps,
                                    normalize_first=False)
            for i in range(N_TRIALS)
        ]

        results['ipllr_calib_renorm'] = [
            collect_training_losses(ipllrs_calib_renorm[i],
                                    batches,
                                    n_steps,
                                    normalize_first=True)
            for i in range(N_TRIALS)
        ]

        results['ipllr_calib_renorm_scale_lr'] = [
            collect_training_losses(ipllrs_calib_renorm_scale_lr[i],
                                    batches,
                                    n_steps,
                                    normalize_first=True)
            for i in range(N_TRIALS)
        ]

        mode = 'training'

        # split the (loss, chi) tuples into two dicts keyed by model variant
        losses = dict()
        for key, res in results.items():
            losses[key] = [r[0] for r in res]

        chis = dict()
        for key, res in results.items():
            chis[key] = [r[1] for r in res]

        # Plot losses and derivatives
        logger.info('Saving figures at {}'.format(figures_dir))

        key = 'loss'
        plt.figure(figsize=(12, 8))
        # renorm_first is a module-level global flag used only for the plot label
        plot_losses_models(losses,
                           key=key,
                           L=L,
                           width=width,
                           activation=activation,
                           lr=base_lr,
                           batch_size=batch_size,
                           mode=mode,
                           normalize_first=renorm_first,
                           marker=None,
                           name='IPLLR')
        plt.savefig(
            os.path.join(
                figures_dir,
                fig_name_template.format(mode, key, L, width, activation,
                                         base_lr, batch_size)))

        key = 'chi'
        plt.figure(figsize=(12, 8))
        plot_losses_models(chis,
                           key=key,
                           L=L,
                           width=width,
                           activation=activation,
                           lr=base_lr,
                           batch_size=batch_size,
                           mode=mode,
                           marker=None,
                           name='IPLLR')
        plt.savefig(
            os.path.join(
                figures_dir,
                fig_name_template.format(mode, key, L, width, activation,
                                         base_lr, batch_size)))

    except Exception as e:
        # top-level boundary: log the full traceback instead of crashing the script
        logger.exception("Exception when running the script : {}".format(e))
def setUp(self) -> None:
    """Build a ResNetMNIST model from the ResNet config in the resources directory."""
    resnet_config_path = os.path.join(RESOURCES_DIR, 'resnet_config.yaml')
    self.resnet = ResNetMNIST(ModelConfig(config_file=resnet_config_path))