def test_small_moddata_feature_selection_classif(small_moddata):
    """Check feature selection on a synthetic three-class MODData.

    Four features are built from the class label and a random binary
    variable. Feature selection is run with a hand-written cross-NMI matrix
    and is expected to return exactly three descriptors, in the order
    ``['f1', 'f4', 'f3']``.
    """
    feature_labels = ['f1', 'f2', 'f3', 'f4']

    # f1 is the class label, f2 is random binary noise, f3 and f4 combine both.
    class_column = np.array([0] * 500 + [1] * 500 + [2] * 500, dtype='float')
    binary_column = np.random.choice(2, 1500)
    product_column = class_column * binary_column
    shifted_column = class_column + (binary_column * 0.5)

    class_targets = np.array(class_column, dtype='int').reshape(-1, 1)
    feature_matrix = np.array(
        [class_column, binary_column, product_column, shifted_column]
    ).T

    # Hand-written pairwise feature NMI handed to the selection routine.
    cross_nmi_table = pd.DataFrame(
        [
            [1, 0, 0.5, 0.5],
            [0, 1, 0.5, 0.5],
            [0.5, 0.5, 1, 0.5],
            [0.5, 0.5, 0.5, 1],
        ],
        columns=list(feature_labels),
        index=list(feature_labels),
    )

    classif_md = MODData(
        ['dummy'] * 1500,
        class_targets,
        target_names=['my_classes'],
        num_classes={"my_classes": 3},
    )
    classif_md.df_featurized = pd.DataFrame(
        feature_matrix, columns=list(feature_labels)
    )
    classif_md.feature_selection(n=3, cross_nmi=cross_nmi_table)

    assert len(classif_md.get_optimal_descriptors()) == 3
    assert classif_md.get_optimal_descriptors() == ['f1', 'f4', 'f3']
def fit(
    self,
    data: MODData,
    val_fraction=0.0,
    val_key=None,
    lr=0.001,
    epochs=200,
    batch_size=128,
    xscale='minmax',
    yscale=None,
):
    """Scale the selected features of `data` and train `self.model` on them.

    Args:
        data: Featurized MODData; the first `self.n_feat` optimal descriptors
            are used as inputs and `self.targets_flatten` as outputs.
        val_fraction: Fraction of the data passed to Keras as
            `validation_split`.
        val_key: Target name shown in the per-epoch validation printout.
        lr: Learning rate of the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Training batch size.
        xscale: `'minmax'` or `'standard'`; any other value leaves the
            features unscaled.
        yscale: Accepted but not used by this method.
    """
    print('new')
    self.xscale = xscale
    self.target_names = data.names
    self.optimal_descriptors = data.get_optimal_descriptors()

    featurized = data.get_featurized_df()
    x = featurized[self.optimal_descriptors[:self.n_feat]].values
    print(x.shape)
    y = data.get_target_df()[self.targets_flatten].values.transpose()
    print(y.shape)

    # Scale the input features.
    if self.xscale == 'minmax':
        self.xmin = x.min(axis=0)
        self.xmax = x.max(axis=0)
        x = (x - self.xmin) / (self.xmax - self.xmin) - 0.5
    elif self.xscale == 'standard':
        self.scaler = StandardScaler()
        x = self.scaler.fit_transform(x)
    x = np.nan_to_num(x)

    # Pick the per-epoch reporter that matches the available log entries.
    if val_fraction > 0:
        if self.PP:
            metric_key = 'val_{}_mae'.format(val_key)
        else:
            metric_key = 'val_mae'

        def report_epoch(epoch, logs):
            print("epoch {}: loss: {:.3f}, val_loss:{:.3f} val_{}:{:.3f}".format(epoch, logs['loss'], logs['val_loss'], val_key, logs[metric_key]))
    else:
        def report_epoch(epoch, logs):
            print("epoch {}: loss: {:.3f}".format(epoch, logs['loss']))

    print_callback = LambdaCallback(on_epoch_end=report_epoch)
    # One target array per model output.
    target_arrays = list(y)

    print('compile', flush=True)
    self.model.compile(
        loss='mse',
        optimizer=keras.optimizers.Adam(lr=lr),
        metrics=['mae'],
        loss_weights=self.weights,
    )
    print('fit', flush=True)
    self.model.fit(
        x=x,
        y=target_arrays,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        validation_split=val_fraction,
        callbacks=[print_callback],
    )
def fit_preset(
    self,
    data: MODData,
    presets: Optional[List[Dict[str, Any]]] = None,
    val_fraction: float = 0.15,
    verbose: int = 0,
    classification: bool = False,
    refit: bool = True,
    fast: bool = False,
    nested: int = 5,
    callbacks: Optional[List[Any]] = None,
    n_jobs=None,
) -> Tuple[List[List[Any]], np.ndarray, Optional[List[float]], List[List[float]], Dict[str, Any]]:
    """Chooses an optimal hyper-parametered MODNet model from different presets.

    This function implements the "inner loop" of a cross-validation workflow.
    Depending on `nested`, every preset is trained either on each of several
    inner CV folds (n_fold * n_preset models) or on a single random hold-out
    split (15% of the furnished data by default).

    The preset with the lowest validation loss averaged over its folds is
    selected. If `refit` is true, a fresh model with that preset is trained
    on all of `data` and stored in `self.model`; otherwise the single best
    model of the selected preset is adopted as-is, together with its
    `n_feat` and `_scaler`.

    Args:
        data: MODData object contain training and validation samples.
        presets: A list of dictionaries containing custom presets. If
            `None`, presets are generated with `gen_presets`. The passed
            dictionaries are never modified.
        val_fraction: The fraction of the data to use for validation. Only
            used when `nested` is falsy.
        verbose: The verbosity level to pass to tf.keras
        classification: Whether or not we are performing classification.
        refit: Whether or not to refit the final model on all data with the
            best-performing settings.
        fast: Used for debugging. If `True` (and at least two presets are
            available), only fit the first 2 presets and set their number
            of epochs to 100.
        nested: integer specifying whether or not to perform a full nested
            CV. If 0, a simple validation split is performed based on the
            `val_fraction` argument. Otherwise, use this number of inner CV
            folds, ignoring the `val_fraction` argument. Note: values less
            than or equal to 1 are overwritten to a default of 5 folds.
        callbacks: Callbacks handed to every training run. If `None`, a
            single `EarlyStopping` callback monitoring the training loss is
            used.
        n_jobs: number of jobs for multiprocessing

    Returns:
        - A nested list of the trained models, indexed as
          `[preset][split]`.
        - An array of validation losses of shape `(n_presets, n_splits)`.
        - The learning curve of the best model found during validation
          (the curve of the refit, if any, is not returned).
        - A nested list of learning curves for each trained model, indexed
          as `[preset][split]`.
        - The settings of the best-performing preset.

    Raises:
        ValueError: If the list of presets is empty.

    """
    from modnet.matbench.benchmark import matbench_kfold_splits

    if callbacks is None:
        es = tf.keras.callbacks.EarlyStopping(
            monitor="loss",
            min_delta=0.001,
            patience=100,
            verbose=verbose,
            mode="auto",
            baseline=None,
            restore_best_weights=False,
        )
        callbacks = [es]

    if presets is None:
        from modnet.model_presets import gen_presets

        presets = gen_presets(
            len(data.optimal_features),
            len(data.df_targets),
            classification=classification,
        )

    if len(presets) == 0:
        raise ValueError("At least one preset is required to fit a model.")

    if fast and len(presets) >= 2:
        # Work on copies so the caller's preset dictionaries keep their
        # original number of epochs.
        presets = [{**preset, "epochs": 100} for preset in presets[:2]]

    # Create the train/validation splits.
    if nested:
        # Fold counts of 1 or less make no sense for k-fold; fall back to 5.
        n_splits = nested if nested > 1 else 5
        splits = matbench_kfold_splits(
            data, n_splits=n_splits, classification=classification
        )
    else:
        n_splits = 1
        splits = [
            train_test_split(range(len(data.df_featurized)), test_size=val_fraction)
        ]

    train_val_datas = [data.split((train, val)) for train, val in splits]

    # Create the tasks, ordered preset-major: the task of preset `i` and
    # split `j` sits at index `n_splits * i + j` (relied upon when logging
    # the best task below).
    n_available_feat = len(data.get_optimal_descriptors())
    tasks = []
    for i, params in enumerate(presets):
        n_feat = min(n_available_feat, params["n_feat"])
        for ind in range(n_splits):
            train_data, val_data = train_val_datas[ind]
            tasks.append({
                "train_data": train_data,
                "targets": self.targets,
                "weights": self.weights,
                "num_classes": self.num_classes,
                "n_feat": n_feat,
                "num_neurons": params["num_neurons"],
                "lr": params["lr"],
                "batch_size": params["batch_size"],
                "epochs": params["epochs"],
                "loss": params["loss"],
                "act": params["act"],
                "out_act": self.out_act,
                "callbacks": callbacks,
                "preset_id": i,
                "fold_id": ind,
                "verbose": verbose,
                "val_data": val_data,
            })

    val_losses = 1e20 * np.ones((len(presets), n_splits))
    learning_curves = [[None for _ in range(n_splits)] for _ in range(len(presets))]
    models = [[None for _ in range(n_splits)] for _ in range(len(presets))]

    ctx = multiprocessing.get_context("spawn")
    pool = ctx.Pool(processes=n_jobs)
    LOG.info(
        f"Multiprocessing on {n_jobs} cores. Total of {multiprocessing.cpu_count()} cores available."
    )

    try:
        for res in tqdm.tqdm(
            pool.imap_unordered(map_validate_model, tasks, chunksize=1),
            total=len(tasks),
        ):
            val_loss, learning_curve, model, preset_id, fold_id = res
            LOG.info(f"Preset #{preset_id} fitting finished, loss: {val_loss}")
            # reload the model object after serialization
            model._restore_model()

            val_losses[preset_id, fold_id] = val_loss
            learning_curves[preset_id][fold_id] = learning_curve
            models[preset_id][fold_id] = model
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when a task (or an
        # interrupt) aborts the loop.
        pool.terminate()
        raise
    finally:
        pool.join()

    val_loss_per_preset = np.mean(val_losses, axis=1)
    best_preset_idx = int(np.argmin(val_loss_per_preset))
    best_model_idx = int(np.argmin(val_losses[best_preset_idx, :]))
    best_preset = presets[best_preset_idx]
    best_learning_curve = learning_curves[best_preset_idx][best_model_idx]
    best_model = models[best_preset_idx][best_model_idx]

    LOG.info(
        "Preset #{} resulted in lowest validation loss with params {}".format(
            best_preset_idx + 1,
            tasks[n_splits * best_preset_idx + best_model_idx],
        )
    )

    if refit:
        LOG.info("Refitting with all data and parameters: {}".format(best_preset))
        # Building final model
        n_feat = min(len(data.get_optimal_descriptors()), best_preset["n_feat"])
        self.model = MODNetModel(
            self.targets,
            self.weights,
            num_neurons=best_preset["num_neurons"],
            n_feat=n_feat,
            act=best_preset["act"],
            out_act=self.out_act,
            num_classes=self.num_classes,
        ).model
        self.n_feat = n_feat
        self.fit(
            data,
            val_fraction=0,
            lr=best_preset["lr"],
            epochs=best_preset["epochs"],
            batch_size=best_preset["batch_size"],
            loss=best_preset["loss"],
            callbacks=callbacks,
            verbose=verbose,
        )
    else:
        # Adopt the best validated model together with the state it needs
        # at prediction time.
        self.n_feat = best_model.n_feat
        self.model = best_model.model
        self._scaler = best_model._scaler

    return models, val_losses, best_learning_curve, learning_curves, best_preset
def fit(
    self,
    training_data: MODData,
    val_fraction: float = 0.0,
    val_key: Optional[str] = None,
    val_data: Optional[MODData] = None,
    lr: float = 0.001,
    epochs: int = 200,
    batch_size: int = 128,
    xscale: Optional[str] = "minmax",
    metrics: List[str] = ["mae"],
    callbacks: List[Callable] = None,
    verbose: int = 0,
    loss: str = "mse",
    **fit_params,
) -> None:
    """Train the model on the passed training `MODData` object.

    Parameters:
        training_data: A `MODData` that has been featurized and
            feature selected. The first `self.n_feat` entries in
            `training_data.get_optimal_descriptors()` will be used for training.
        val_fraction: The fraction of the training data to use as a
            validation set for tracking model performance during
            training.
        val_key: The target name to track on the validation set
            during training, if performing multi-target learning.
        val_data: An optional `MODData` used as explicit validation set.
            Its features are transformed with the scaler fitted on the
            training data.
        lr: The learning rate.
        epochs: The maximum number of epochs to train for.
        batch_size: The batch size to use for training.
        xscale: The feature scaler to use, either `'minmax'` or
            `'standard'`. For any other value (including `None`) no new
            scaler is created and whatever `self._scaler` already holds is
            re-fitted; if it was never set this raises `AttributeError`.
        metrics: A list of tf.keras metrics to pass to `compile(...)`.
        callbacks: Optional list of tf.keras callbacks. The list itself is
            not modified; when `verbose` is set, a print callback is added
            to a copy.
        verbose: The verbosity level to pass to tf.keras. A truthy value
            also enables a per-epoch loss printout.
        loss: The built-in tf.keras loss to pass to `compile(...)`. It is
            replaced by `"categorical_crossentropy"` as soon as one target
            has two or more classes.
        fit_params: Any additional parameters to pass to `fit(...)`,
            these will be overwritten by the explicit keyword arguments above.

    Raises:
        RuntimeError: If the model needs more features than
            `training_data` provides.

    """
    n_available_feat = len(training_data.get_optimal_descriptors())
    if self.n_feat > n_available_feat:
        raise RuntimeError(
            "The model requires more features than computed in data. "
            f"Please reduce n_feat below or equal to {n_available_feat}"
        )

    self.xscale = xscale
    self.target_names = list(self.weights.keys())
    self.optimal_descriptors = training_data.get_optimal_descriptors()

    x = training_data.get_featurized_df()[
        self.optimal_descriptors[:self.n_feat]
    ].values

    # For compatibility with MODNet 0.1.7; if there is only one target in the training data,
    # use that for the name of the target too.
    if (
        len(self.targets_flatten) == 1
        and len(training_data.df_targets.columns) == 1
    ):
        self.targets_flatten = list(training_data.df_targets.columns)

    # One array per target: one-hot encoded for classification targets,
    # plain floats otherwise.
    y = []
    for targ in self.targets_flatten:
        if self.num_classes[targ] >= 2:  # Classification
            y_inner = tf.keras.utils.to_categorical(
                training_data.df_targets[targ].values,
                num_classes=self.num_classes[targ],
            )
            loss = "categorical_crossentropy"
        else:
            # `np.float` was only an alias of the builtin `float` and has
            # been removed from NumPy; use the builtin directly.
            y_inner = training_data.df_targets[targ].values.astype(
                float, copy=False
            )
        y.append(y_inner)

    # Scale the input features:
    if self.xscale == "minmax":
        self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5))
    elif self.xscale == "standard":
        self._scaler = StandardScaler()

    x = self._scaler.fit_transform(x)
    x = np.nan_to_num(x, nan=-1)

    if val_data is not None:
        val_x = val_data.get_featurized_df()[
            self.optimal_descriptors[:self.n_feat]
        ].values
        val_x = self._scaler.transform(val_x)
        val_x = np.nan_to_num(val_x, nan=-1)
        # NOTE(review): unlike the training targets above, validation
        # targets are not one-hot encoded for classification targets --
        # confirm whether explicit validation data is meant to be supported
        # for classification.
        try:
            val_y = list(
                val_data.get_target_df()[self.targets_flatten]
                .values.astype(float, copy=False)
                .transpose()
            )
        except Exception:
            # Fall back to all target columns when the expected target
            # names cannot be selected.
            val_y = list(
                val_data.get_target_df()
                .values.astype(float, copy=False)
                .transpose()
            )
        validation_data = (val_x, val_y)
    else:
        validation_data = None

    # Optionally set up print callback
    if verbose:
        if val_fraction > 0 or validation_data is not None:
            if self._multi_target and val_key is not None:
                val_metric_key = f"val_{val_key}_mae"
            else:
                val_metric_key = "val_mae"
            print_callback = tf.keras.callbacks.LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    f"epoch {epoch}: loss: {logs['loss']:.3f}, "
                    f"val_loss:{logs['val_loss']:.3f} {val_metric_key}:{logs[val_metric_key]:.3f}"
                )
            )
        else:
            print_callback = tf.keras.callbacks.LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    f"epoch {epoch}: loss: {logs['loss']:.3f}"
                )
            )

        # Build a new list instead of appending in place: callers may reuse
        # the same callbacks list for several fits, and it must not
        # accumulate one print callback per call.
        if callbacks is None:
            callbacks = [print_callback]
        else:
            callbacks = [*callbacks, print_callback]

    # Extra keyword arguments are forwarded to Keras; the explicit
    # arguments of this method take precedence, as documented.
    fit_params = {
        **fit_params,
        "x": x,
        "y": y,
        "epochs": epochs,
        "batch_size": batch_size,
        "verbose": verbose,
        "validation_split": val_fraction,
        "validation_data": validation_data,
        "callbacks": callbacks,
    }

    self.model.compile(
        loss=loss,
        # `lr` is a deprecated alias of `learning_rate`.
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        metrics=metrics,
        loss_weights=self.weights,
    )
    history = self.model.fit(**fit_params)
    self.history = history.history
def fit_preset(
    self,
    data: MODData,
    presets: List[Dict[str, Any]] = None,
    val_fraction: float = 0.1,
    verbose: int = 0,
) -> None:
    """Chooses an optimal hyper-parametered MODNet model from different presets.

    Each preset is trained in turn on `data` with a validation split (10% of
    the furnished data by default). The validation loss of a preset is the
    mean of its last 20 recorded `val_loss` values. The model with the
    lowest such loss is stored in `self.model`, together with the matching
    `self.n_feat` and, when the training run produced one, the matching
    feature scaler in `self._scaler`.

    Args:
        data: MODData object contain training and validation samples.
        presets: A list of dictionaries containing custom presets. If
            `None`, `MODNET_PRESETS` is used.
        val_fraction: The fraction of the data to use for validation.
        verbose: The verbosity level to pass to Keras

    Raises:
        ValueError: If the list of presets is empty.

    """
    rlr = keras.callbacks.ReduceLROnPlateau(
        monitor="loss",
        factor=0.5,
        patience=20,
        verbose=verbose,
        mode="auto",
        min_delta=0,
    )
    es = keras.callbacks.EarlyStopping(
        monitor="loss",
        min_delta=0.001,
        patience=300,
        verbose=verbose,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )
    callbacks = [rlr, es]

    if presets is None:
        from modnet.model_presets import MODNET_PRESETS

        presets = MODNET_PRESETS

    if len(presets) == 0:
        raise ValueError("At least one preset is required to fit a model.")

    val_losses = 1e20 * np.ones((len(presets),))
    n_available_feat = len(data.get_optimal_descriptors())

    best_model = None
    best_n_feat = None
    best_scaler = None

    for i, params in enumerate(presets):
        logging.info("Training preset #{}/{}".format(i + 1, len(presets)))
        n_feat = min(n_available_feat, params["n_feat"])
        self.model = MODNetModel(
            self.targets,
            self.weights,
            num_neurons=params["num_neurons"],
            n_feat=n_feat,
            act=params["act"],
        ).model
        self.n_feat = n_feat
        self.fit(
            data,
            val_fraction=val_fraction,
            lr=params["lr"],
            epochs=params["epochs"],
            batch_size=params["batch_size"],
            loss=params["loss"],
            # Hand over a copy: `fit` may extend the list it receives, and
            # the shared list must not grow from one preset to the next.
            callbacks=list(callbacks),
            verbose=verbose,
        )
        val_loss = np.array(self.model.history.history["val_loss"])[-20:].mean()
        if val_loss < min(val_losses):
            best_model = self.model
            best_n_feat = n_feat
            # The scaler is fitted on the first `n_feat` features only, so
            # it has to be kept in sync with the selected model.
            best_scaler = getattr(self, "_scaler", None)
        val_losses[i] = val_loss
        logging.info("Validation loss: {:.3f}".format(val_loss))

    best_preset = val_losses.argmin()
    # NOTE: despite the wording of this message, no refit on the full data
    # is performed here; the best validated model is kept as-is.
    logging.info(
        "Preset #{} resulted in lowest validation loss.\nFitting all data...".format(
            best_preset + 1
        )
    )
    self.n_feat = best_n_feat
    self.model = best_model
    if best_scaler is not None:
        self._scaler = best_scaler