def _set_network(self):
    """Set up the network and the explain matrix."""
    self.network = tab_network.TabNet(
        self.input_dim,
        self.output_dim,
        n_d=self.n_d,
        n_a=self.n_a,
        n_steps=self.n_steps,
        gamma=self.gamma,
        cat_idxs=self.cat_idxs,
        cat_dims=self.cat_dims,
        cat_emb_dim=self.cat_emb_dim,
        n_independent=self.n_independent,
        n_shared=self.n_shared,
        epsilon=self.epsilon,
        virtual_batch_size=self.virtual_batch_size,
        momentum=self.momentum,
        mask_type=self.mask_type,
    ).to(self.device)

    self.reducing_matrix = create_explain_matrix(
        self.network.input_dim,
        self.network.cat_emb_dim,
        self.network.cat_idxs,
        self.network.post_embed_dim,
    )
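# Hedged usage sketch (an illustration, not part of the original code): the
# matrix returned by create_explain_matrix is sparse with shape
# (post_embed_dim, input_dim), so multiplying post-embedding mask importances
# by it folds each categorical-embedding column back onto its original input
# feature. `masks` is a hypothetical (batch_size, post_embed_dim) numpy array
# of mask values collected from the network's explain output.
def _aggregate_importance_example(masks, reducing_matrix):
    from scipy.sparse import csc_matrix
    # Dense (batch, post_embed_dim) times sparse (post_embed_dim, input_dim)
    # yields (batch, input_dim) importances in the original feature space.
    return csc_matrix.dot(masks, reducing_matrix)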
def init_network(
    self,
    input_dim,
    output_dim,
    n_d,
    n_a,
    n_steps,
    gamma,
    cat_idxs,
    cat_dims,
    cat_emb_dim,
    n_independent,
    n_shared,
    epsilon,
    virtual_batch_size,
    momentum,
    device_name,
    mask_type,
):
    """Build the TabNet network and the explain matrix from explicit parameters."""
    self.network = tab_network.TabNet(
        input_dim,
        output_dim,
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=cat_emb_dim,
        n_independent=n_independent,
        n_shared=n_shared,
        epsilon=epsilon,
        virtual_batch_size=virtual_batch_size,
        momentum=momentum,
        device_name=device_name,
        mask_type=mask_type).to(self.device)

    self.reducing_matrix = create_explain_matrix(
        self.network.input_dim,
        self.network.cat_emb_dim,
        self.network.cat_idxs,
        self.network.post_embed_dim)
def fit(self, X_train, y_train, X_valid=None, y_valid=None, loss_fn=None,
        weights=0, max_epochs=100, patience=10, batch_size=1024,
        virtual_batch_size=128, num_workers=0, drop_last=False):
    """Train the neural network stored in self.network,
    using train_dataloader for training data and
    valid_dataloader for validation.

    Parameters
    ----------
    X_train : np.ndarray
        Train set
    y_train : np.array
        Train targets
    X_valid : np.ndarray
        Validation set used for early stopping
    y_valid : np.array
        Validation targets
    loss_fn : callable or None
        PyTorch loss function; None keeps the default
    weights : bool or dictionary
        0 for no balancing
        1 for automated balancing
        dict for custom weights per class
    max_epochs : int
        Maximum number of epochs during training
    patience : int
        Number of consecutive non-improving epochs before early stopping
    batch_size : int
        Training batch size
    virtual_batch_size : int
        Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size)
    num_workers : int
        Number of workers used in torch.utils.data.DataLoader
    drop_last : bool
        Whether to drop the last incomplete batch during training
    """
    # Update fit parameters from the provided arguments
    self.update_fit_params(X_train, y_train, X_valid, y_valid, loss_fn,
                           weights, max_epochs, patience, batch_size,
                           virtual_batch_size, num_workers, drop_last)

    train_dataloader, valid_dataloader = self.construct_loaders(
        X_train, y_train, X_valid, y_valid,
        self.updated_weights, self.batch_size,
        self.num_workers, self.drop_last)

    self.network = tab_network.TabNet(
        self.input_dim,
        self.output_dim,
        n_d=self.n_d,
        n_a=self.n_a,
        n_steps=self.n_steps,
        gamma=self.gamma,
        cat_idxs=self.cat_idxs,
        cat_dims=self.cat_dims,
        cat_emb_dim=self.cat_emb_dim,
        n_independent=self.n_independent,
        n_shared=self.n_shared,
        epsilon=self.epsilon,
        virtual_batch_size=self.virtual_batch_size,
        momentum=self.momentum,
        device_name=self.device_name).to(self.device)

    self.reducing_matrix = create_explain_matrix(
        self.network.input_dim,
        self.network.cat_emb_dim,
        self.network.cat_idxs,
        self.network.post_embed_dim)

    self.optimizer = self.optimizer_fn(self.network.parameters(),
                                       lr=self.lr)

    if self.scheduler_fn:
        self.scheduler = self.scheduler_fn(self.optimizer,
                                           **self.scheduler_params)
    else:
        self.scheduler = None

    losses_train = []
    losses_valid = []
    metrics_train = []
    metrics_valid = []

    if self.verbose > 0:
        print("Will train until validation stopping metric",
              f"hasn't improved in {self.patience} rounds.")
        msg_epoch = '| EPOCH | train | valid | total time (s)'
        print('---------------------------------------')
        print(msg_epoch)

    total_time = 0
    while (self.epoch < self.max_epochs
           and self.patience_counter < self.patience):
        starting_time = time.time()
        fit_metrics = self.fit_epoch(train_dataloader, valid_dataloader)

        # leaving it here, may be used for callbacks later
        losses_train.append(fit_metrics['train']['loss_avg'])
        losses_valid.append(fit_metrics['valid']['total_loss'])
        metrics_train.append(fit_metrics['train']['stopping_loss'])
        metrics_valid.append(fit_metrics['valid']['stopping_loss'])

        stopping_loss = fit_metrics['valid']['stopping_loss']
        if stopping_loss < self.best_cost:
            self.best_cost = stopping_loss
            self.patience_counter = 0
            # Saving model
            self.best_network = copy.deepcopy(self.network)
            # Updating feature_importances_
            self.feature_importances_ = fit_metrics['train'][
                'feature_importances_']
        else:
            self.patience_counter += 1

        self.epoch += 1
        total_time += time.time() - starting_time
        if self.verbose > 0:
            if self.epoch % self.verbose == 0:
                separator = "|"
                msg_epoch = f"| {self.epoch:<5} | "
                msg_epoch += f"{-fit_metrics['train']['stopping_loss']:.5f}"
                msg_epoch += f' {separator:<2} '
                msg_epoch += f"{-fit_metrics['valid']['stopping_loss']:.5f}"
                msg_epoch += f' {separator:<2} '
                msg_epoch += f" {np.round(total_time, 1):<10}"
                print(msg_epoch)

    if self.verbose > 0:
        if self.patience_counter == self.patience:
            print(f"Early stopping occurred at epoch {self.epoch}")
        print(f"Training done in {total_time:.3f} seconds.")
        print('---------------------------------------')

    self.history = {"train": {"loss": losses_train,
                              "metric": metrics_train},
                    "valid": {"loss": losses_valid,
                              "metric": metrics_valid}}

    # load best model post training
    self.load_best_model()
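# Hedged usage sketch for fit() above; `model` stands for any concrete
# subclass wiring up update_fit_params / construct_loaders, and the random
# data below is a stand-in (both are assumptions for illustration).
def _fit_usage_example(model):
    import numpy as np
    rng = np.random.default_rng(0)
    X_train = rng.random((1024, 22), dtype=np.float32)
    y_train = rng.integers(0, 2, size=1024)
    X_valid = rng.random((256, 22), dtype=np.float32)
    y_valid = rng.integers(0, 2, size=256)
    model.fit(X_train, y_train,
              X_valid=X_valid, y_valid=y_valid,
              weights=1,                    # automated class balancing
              max_epochs=100, patience=10,  # early stopping on the valid metric
              batch_size=256, virtual_batch_size=128,
              num_workers=0, drop_last=False)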
def fit(self, X_train, y_train, X_valid=None, y_valid=None, loss_fn=None,
        weights=0, max_epochs=100, patience=10, batch_size=1024,
        virtual_batch_size=128):
    """Train the neural network stored in self.network,
    using train_dataloader for training data and
    valid_dataloader for validation.

    Parameters
    ----------
    X_train : np.ndarray
        Train set
    y_train : np.array
        Train targets
    X_valid : np.ndarray
        Validation set used for early stopping
    y_valid : np.array
        Validation targets
    loss_fn : callable or None
        PyTorch loss function; None keeps the default
    weights : bool or dictionary
        0 for no balancing
        1 for automated balancing
        dict for custom weights per class
    max_epochs : int
        Maximum number of epochs during training
    patience : int
        Number of consecutive non-improving epochs before early stopping
    batch_size : int
        Training batch size
    virtual_batch_size : int
        Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size)
    """
    self.update_fit_params(X_train, y_train, X_valid, y_valid, loss_fn,
                           weights, max_epochs, patience, batch_size,
                           virtual_batch_size)

    train_dataloader, valid_dataloader = self.construct_loaders(
        X_train, y_train, X_valid, y_valid,
        self.updated_weights, self.batch_size)

    self.network = tab_network.TabNet(
        self.input_dim,
        self.output_dim,
        n_d=self.n_d,
        n_a=self.n_a,
        n_steps=self.n_steps,
        gamma=self.gamma,
        cat_idxs=self.cat_idxs,
        cat_dims=self.cat_dims,
        cat_emb_dim=self.cat_emb_dim,
        n_independent=self.n_independent,
        n_shared=self.n_shared,
        epsilon=self.epsilon,
        virtual_batch_size=self.virtual_batch_size,
        momentum=self.momentum,
        device_name=self.device_name).to(self.device)

    self.optimizer = self.optimizer_fn(self.network.parameters(),
                                       **self.opt_params)

    if self.scheduler_fn:
        self.scheduler = self.scheduler_fn(self.optimizer,
                                           **self.scheduler_params)
    else:
        self.scheduler = None

    losses_train = []
    losses_valid = []
    metrics_train = []
    metrics_valid = []

    while (self.epoch < self.max_epochs
           and self.patience_counter < self.patience):
        print(f"EPOCH : {self.epoch}")
        fit_metrics = self.fit_epoch(train_dataloader, valid_dataloader)

        losses_train.append(fit_metrics['train']['loss_avg'])
        losses_valid.append(fit_metrics['valid']['total_loss'])
        metrics_train.append(fit_metrics['train']['stopping_loss'])
        metrics_valid.append(fit_metrics['valid']['stopping_loss'])

        stopping_loss = fit_metrics['valid']['stopping_loss']
        if stopping_loss < self.best_cost:
            self.best_cost = stopping_loss
            self.patience_counter = 0
            # Saving model
            torch.save(self.network,
                       self.saving_path + f"{self.model_name}.pt")
            # Updating feature_importances_
            self.feature_importances_ = fit_metrics['train'][
                'feature_importances_']
        else:
            self.patience_counter += 1

        print("Best metric valid: ", self.best_cost)
        self.epoch += 1

        if self.epoch % self.verbose == 0:
            plot_losses(losses_train, losses_valid,
                        metrics_train, metrics_valid)

    # load best model post training
    self.load_best_model()
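# Hedged configuration sketch (assumptions for illustration): the fit()
# variant above expects optimizer_fn / opt_params and, optionally,
# scheduler_fn / scheduler_params to be set on the instance beforehand,
# along these lines.
import torch

example_optimizer_fn = torch.optim.Adam
example_opt_params = {"lr": 2e-2}
example_scheduler_fn = torch.optim.lr_scheduler.StepLR
example_scheduler_params = {"step_size": 10, "gamma": 0.9}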
def __init__(self, context: PyTorchTrialContext):
    super().__init__(context)
    self.context = context

    # Optional gradient clipping; disabled here since clip_value is
    # hard-coded to None.
    clip_value = None
    if clip_value:
        self.clip_grads = lambda params: nn.utils.clip_grad_norm_(
            params, clip_value)
    else:
        self.clip_grads = None

    path_train = f"https://{S3_BUCKET}.s3-us-west-2.amazonaws.com/{S3_KEY}/{TRAIN_CSV}"
    path_valid = f"https://{S3_BUCKET}.s3-us-west-2.amazonaws.com/{S3_KEY}/{VAL_CSV}"
    path_store = f"https://{S3_BUCKET}.s3-us-west-2.amazonaws.com/{S3_KEY}/{STORE_CSV}"

    print("Downloading data")
    urllib.request.urlretrieve(path_train, TRAIN_CSV)
    urllib.request.urlretrieve(path_valid, VAL_CSV)
    urllib.request.urlretrieve(path_store, STORE_CSV)
    print("Done downloading data")

    if self.context.get_hparam("cudf"):
        # Read and join on GPU with cudf, then copy back to host arrays.
        print("Reading CSVs with cudf")
        df_train = cudf.read_csv(TRAIN_CSV)
        df_valid = cudf.read_csv(VAL_CSV)
        df_store = cudf.read_csv(STORE_CSV)

        print("Joining dataframes")
        df_train_joined = df_train.join(df_store, how='left',
                                        on='store_id',
                                        rsuffix='store').fillna(0)
        df_val_joined = df_valid.join(df_store, how='left',
                                      on='store_id',
                                      rsuffix='store').fillna(0)
        print("Done joining")

        # cols[12] is the target; cols[13] is dropped from the features.
        cols = df_train_joined.columns.tolist()
        X_train = df_train_joined[cols[:12] + cols[14:]].values.astype(np.float32)
        y_train = df_train_joined[cols[12]].values.astype(np.float32)
        X_valid = df_val_joined[cols[:12] + cols[14:]].values.astype(np.float32)
        y_valid = df_val_joined[cols[12]].values.astype(np.float32)
        print("Done loading data")

        self.train_dataset = TorchDataset(cupy.asnumpy(X_train),
                                          cupy.asnumpy(y_train))
        self.valid_dataset = TorchDataset(cupy.asnumpy(X_valid),
                                          cupy.asnumpy(y_valid))
    else:
        print("Reading CSVs with pandas")
        df_train = pd.read_csv(TRAIN_CSV)
        df_valid = pd.read_csv(VAL_CSV)
        df_store = pd.read_csv(STORE_CSV)

        print("Joining dataframes")
        df_train_joined = df_train.join(df_store, how='left',
                                        on='store_id',
                                        rsuffix='store').fillna(0)
        df_val_joined = df_valid.join(df_store, how='left',
                                      on='store_id',
                                      rsuffix='store').fillna(0)
        print("Done joining")

        cols = df_train_joined.columns.tolist()
        X_train = df_train_joined[cols[:12] + cols[14:]].values.astype(np.float32)
        y_train = df_train_joined[cols[12]].values.astype(np.float32)
        X_valid = df_val_joined[cols[:12] + cols[14:]].values.astype(np.float32)
        y_valid = df_val_joined[cols[12]].values.astype(np.float32)
        print("Done loading data")

        self.train_dataset = TorchDataset(X_train, y_train)
        self.valid_dataset = TorchDataset(X_valid, y_valid)

    # Sparsity loss coefficient, parameterized as a power of ten.
    self.lambda_sparse = 10 ** (-self.context.get_hparam("lambda_sparse"))
    self.loss_fn = nn.functional.mse_loss
    self.optimizer_params = {
        "lr": self.context.get_hparam("learning_rate"),
    }

    self.model = tab_network.TabNet(
        input_dim=22,
        output_dim=1,
        n_d=self.context.get_hparam("n_d"),
        n_a=self.context.get_hparam("n_a"),
        n_steps=self.context.get_hparam("n_steps"),
        gamma=self.context.get_hparam("gamma"),
        cat_idxs=[],
        cat_dims=[],
        cat_emb_dim=1,
        n_independent=2,
        n_shared=2,
        epsilon=1e-15,
        virtual_batch_size=256 * self.context.get_hparam("virtual_batch_size"),
        momentum=self.context.get_hparam("momentum"),
        mask_type="sparsemax")
    self.model = self.context.wrap_model(self.model)

    self.optimizer = self.context.wrap_optimizer(
        opt.Adam(self.model.parameters(), **self.optimizer_params))

    # Multiplicative LR decay, stepped manually once per epoch.
    lmbda = lambda epoch: self.context.get_hparam("lr_decay")
    self.lr_scheduler = self.context.wrap_lr_scheduler(
        opt.lr_scheduler.MultiplicativeLR(self.optimizer, lr_lambda=lmbda),
        step_mode=LRScheduler.StepMode.MANUAL_STEP)
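# Hedged sketch (an assumption, not from the original code) of the
# hyperparameters the trial above reads via context.get_hparam. In a real
# Determined experiment these would be declared in the experiment config;
# the values below are illustrative, not tuned settings.
EXAMPLE_HYPERPARAMETERS = {
    "cudf": False,            # read/join CSVs on GPU with cudf instead of pandas
    "lambda_sparse": 3,       # sparsity coefficient exponent: 10 ** -3
    "learning_rate": 2e-2,    # Adam learning rate
    "n_d": 16,                # width of the decision prediction layer
    "n_a": 16,                # width of the attention embedding
    "n_steps": 4,             # number of sequential attention steps
    "gamma": 1.5,             # feature re-usage relaxation
    "virtual_batch_size": 2,  # multiplied by 256 inside the trial
    "momentum": 0.7,          # batch-norm momentum
    "lr_decay": 0.95,         # multiplicative LR decay per manual step
}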