def load_ext_dataset(dataset_name: str, expected_type: Union[Usage, str]):
    """Load one of the preset datasets from the `DATASETS` constant.

    Will not overwrite any existing local data with remote datasets.
    If the dataset declares an MD5 checksum, the local file is checked
    against it and a mismatch is treated as fatal, so that unrecognised
    files are never depickled by the caller.

    Parameters:
        dataset_name: The name (key) of the dataset in `DATASETS`.
        expected_type: The expected usage of the dataset, either as a
            `Usage` member or as its name, e.g. `'_MODData'` or
            `'cross_nmi'`. The string `'MODData'` is accepted as an alias
            for `'_MODData'`.

    Returns:
        The path to the downloaded or previously installed dataset file.

    Raises:
        ValueError: If the dataset name is unknown, if its usage does not
            match `expected_type`, or if the download fails.
        RuntimeError: If the file's MD5 hash does not match the expected one.

    """
    import urllib.error
    import urllib.request

    if dataset_name not in DATASETS:
        raise ValueError(
            f"No dataset {dataset_name} found, must be one of {list(DATASETS.keys())}"
        )

    dataset = DATASETS[dataset_name]

    if isinstance(expected_type, str):
        if expected_type == "MODData":
            expected_type = "_MODData"
        expected_type = Usage[expected_type]

    if dataset.usage != expected_type:
        raise ValueError(
            f"Cannot load {dataset_name} as it has the wrong type {dataset.usage}."
        )

    data_dir = Path(__file__).parent.joinpath("data")
    model_path = data_dir.joinpath(dataset.filename)

    if not model_path.is_file():
        LOG.info(
            f"Downloading featurized dataset {dataset_name} from {dataset.url} into {model_path}"
        )
        data_dir.mkdir(parents=True, exist_ok=True)

        try:
            urllib.request.urlretrieve(dataset.url, model_path)
        except urllib.error.URLError as exc:
            # `HTTPError` and `ContentTooShortError` are both subclasses of
            # `URLError`. An interrupted transfer can leave a truncated file
            # behind; remove it so that the next call retries the download
            # instead of treating the partial file as already installed.
            if model_path.is_file():
                model_path.unlink()
            raise ValueError(
                f"There was a problem downloading {dataset.url}: {exc.reason}"
            ) from exc

    if dataset.md5 is not None:
        from modnet.utils import get_hash_of_file

        file_md5 = get_hash_of_file(model_path, algo="md5")
        if file_md5 != dataset.md5:
            raise RuntimeError(
                f"Precomputed {str(dataset.usage.name.strip('_'))} did not match expected MD5 from {dataset.url}, will not depickle."
                f"\nExpected: {str(dataset.md5)}"
                f"\nReceived: {str(file_md5)}"
            )

    return model_path
def save(self, filename: str):
    """Serialise this `MODData` object to disk as a pickle.

    The resulting file can be read back in with `MODData.load()`. Writing is
    delegated to `pandas.to_pickle(...)`, which infers any compression from
    the extension of `filename` (for example ".bz2" or ".zip").

    Parameters:
        filename: The path of the file to write.

    """
    destination = filename
    pd.to_pickle(self, destination)

    message = f"Data successfully saved as {destination}!"
    LOG.info(message)
def save(self, filename: str):
    """Save the `MODNetModel` to filename.

    The object is passed through `self._make_picklable()` before being
    written with `pandas.to_pickle(...)`, and `self._restore_model()` is
    always called afterwards, including when writing fails, so that a failed
    save does not leave this object in its intermediate picklable state.

    Parameters:
        filename: The base filename to save to. Compression is inferred by
            `pandas.to_pickle(...)` from the filename extension (for example
            ".bz2" or ".zip").

    Raises:
        Any exception raised by `pandas.to_pickle(...)` is propagated after
        the model has been restored.

    """
    self._make_picklable()
    try:
        pd.to_pickle(self, filename)
    finally:
        # Keep the make-picklable/restore pair balanced even on I/O errors.
        self._restore_model()

    LOG.info(f'Model successfully saved as {filename}!')
def save(self, filename: str) -> None:
    """Save the `MODNetModel` to filename.

    Compression is inferred from the filename extension (for example ".bz2"
    or ".zip") by :func:`pandas.to_pickle`.

    `self._make_picklable()` is called before writing and
    `self._restore_model()` is guaranteed to be called afterwards, even if
    writing the pickle raises, so a failed save cannot leave this object in
    its intermediate picklable state.

    Parameters:
        filename: The base filename to save to.

    Raises:
        Any exception raised by :func:`pandas.to_pickle` is propagated after
        the model has been restored.

    """
    self._make_picklable()
    try:
        pd.to_pickle(self, filename)
    finally:
        # Restore unconditionally so that an I/O error does not break `self`.
        self._restore_model()

    LOG.info(f"Model successfully saved as {filename}!")
def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate input `pandas.DataFrame` of structures with composition
    features from matminer, specified by the MODFeaturizer preset.

    Applies `self.composition_featurizers` to a `"composition"` column derived
    from the structures and then, if any are set,
    `self.oxid_composition_featurizers` to the `"composition_oxid"` column
    produced by `CompositionToOxidComposition`. After each stage the
    multi-index columns are flattened to `"<featurizer>|<feature>"` names.

    Arguments:
        df: the input dataframe with a `"structure"` column
            containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame. If neither composition
            nor oxidation state featurizers are set for this class, an
            unmodified copy of the input is returned.

    """
    df = df.copy()

    if self.composition_featurizers:
        LOG.info("Applying composition featurizers...")
        df["composition"] = df["structure"].apply(lambda s: s.composition)
        df = self._fit_apply_featurizers(
            df, self.composition_featurizers, "composition"
        )
        df = df.rename(columns={"Input Data": ""})
        df.columns = df.columns.map("|".join).str.strip("|")

    if self.oxid_composition_featurizers:
        LOG.info("Applying oxidation state featurizers...")
        # The oxidation state conversion reads the "composition" column, which
        # is otherwise only created when composition featurizers are present.
        if "composition" not in df.columns:
            df["composition"] = df["structure"].apply(lambda s: s.composition)

        if getattr(self, "fast_oxid", False):
            to_oxid = CompositionToOxidComposition(
                all_oxi_states=False, max_sites=-1
            )
        else:
            to_oxid = CompositionToOxidComposition()
        df = to_oxid.featurize_dataframe(df, "composition")

        df = self._fit_apply_featurizers(
            df, self.oxid_composition_featurizers, "composition_oxid"
        )
        df = df.rename(columns={"Input Data": ""})
        df.columns = df.columns.map("|".join).str.strip("|")

    return df
def featurize_site(
    self,
    df: pd.DataFrame,
    aliases: Optional[Dict[str, str]] = None,
) -> pd.DataFrame:
    """Add site-statistics features to a copy of the input dataframe.

    Each featurizer in `self.site_featurizers` is wrapped in a
    `SiteStatsFingerprint` (using `self.site_stats`) and applied to the
    structure column. Newly created columns are prefixed with the name of
    the featurizer class, or with its alias if one is given.

    Arguments:
        df: the input dataframe with a `"structure"` column
            containing `pymatgen.Structure` objects.
        aliases: optional dictionary to map matminer output column
            names to new aliases, mostly used for
            backwards-compatibility.

    Returns:
        pandas.DataFrame: the decorated DataFrame, in which the input
            columns carry an `"Input data|"` prefix.

    """
    LOG.info("Applying site featurizers...")

    featurized = df.copy()
    # Prefix the input columns so that they already contain a "|" and are
    # therefore skipped when the new feature columns get renamed below.
    featurized.columns = ["Input data|" + name for name in featurized.columns]

    for site_featurizer in self.site_featurizers:
        featurized = SiteStatsFingerprint(
            site_featurizer, stats=self.site_stats
        ).featurize_dataframe(
            featurized,
            "Input data|structure",
            multiindex=False,
            ignore_errors=True,
        )

        prefix = site_featurizer.__class__.__name__
        if aliases:
            prefix = aliases.get(prefix, prefix)
        if "|" not in prefix:
            prefix = prefix + "|"

        # Only columns without a "|" were added by this featurizer.
        featurized.columns = [
            name if "|" in name else f"{prefix}{name}"
            for name in featurized.columns
        ]

    return featurized
def load(filename: str) -> "MODNetModel":
    """Load `MODNetModel` object pickled by the :meth:`MODNetModel.save` method.

    If the filename ends in "tgz", "bz2" or "zip", the pickle will be
    decompressed accordingly by :func:`pandas.read_pickle`. Zip archives are
    inspected first: if, after ignoring macOS metadata entries (`__MACOSX/`
    and `.DS_Store`), exactly one member remains, that member is unpickled
    directly.

    Parameters:
        filename: Path of the pickle to load; `pathlib.Path` objects are
            also accepted.

    Returns:
        The loaded `MODNetModel` object.

    Raises:
        ValueError: If the file does not contain a `MODNetModel`.

    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    pickled_data = None

    if isinstance(filename, Path):
        filename = str(filename)

    # handle .zip files explicitly for OS X/macOS compatibility
    if filename.endswith(".zip"):
        from zipfile import ZipFile

        with ZipFile(filename, "r") as zf:
            # Exclude an entry if it is EITHER kind of macOS metadata. The
            # `.DS_Store` check looks at the entry's basename and ignores
            # case, since the file is spelled `.DS_Store` on disk.
            members = [
                name
                for name in zf.namelist()
                if not (
                    name.startswith("__MACOSX/")
                    or name.rsplit("/", 1)[-1].upper().startswith(".DS_STORE")
                )
            ]
            if len(members) == 1:
                with zf.open(members[0]) as f:
                    pickled_data = pd.read_pickle(f)

    if pickled_data is None:
        pickled_data = pd.read_pickle(filename)

    if isinstance(pickled_data, MODNetModel):
        if not hasattr(pickled_data, "__modnet_version__"):
            pickled_data.__modnet_version__ = "unknown"
        pickled_data._restore_model()
        LOG.info(
            f"Loaded {pickled_data} object, created with modnet version {pickled_data.__modnet_version__}"
        )
        return pickled_data

    raise ValueError(
        f"File {filename} did not contain compatible data to create a MODNetModel object, "
        f"instead found {pickled_data.__class__.__name__}."
    )
def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame:
    """Add structural features to a copy of the input dataframe.

    Applies `self.structure_featurizers` to the `"structure"` column and
    flattens the resulting multi-index columns by joining their levels with
    `"|"` (stripping any leading or trailing `"|"`).

    Arguments:
        df: the input dataframe with a `"structure"` column
            containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """
    LOG.info("Applying structure featurizers...")

    featurized = self._fit_apply_featurizers(
        df.copy(), self.structure_featurizers, "structure"
    )
    flat_names = featurized.columns.map("|".join).str.strip("|")
    featurized.columns = flat_names
    return featurized
def get_features_dyn(n_feat, cross_nmi, target_nmi):
    """Greedily rank features by relevance to the target and redundancy
    with the features already chosen.

    The first feature is the one with the largest target NMI. At every
    following step, each remaining feature is scored against each selected
    feature as `target_nmi / (cross_nmi ** p + c)`, with `p` and `c` given by
    the default relevance-redundancy parameter functions; the feature whose
    lowest score is the highest is selected next.

    Arguments:
        n_feat: number of features to select, or -1 to rank all features
            common to both inputs. Capped at the number of common features.
        cross_nmi: square dataframe of NMI values between features.
        target_nmi: NMI between each feature and the target, indexed by
            feature name. Infinite and NaN entries are treated as 0.

    Returns:
        list: the selected feature names, in order of selection.

    """
    # Restrict both inputs to the features that they have in common.
    not_in_target = [feat for feat in cross_nmi.index if feat not in target_nmi.index]
    cross_nmi = cross_nmi.drop(index=not_in_target, columns=not_in_target)
    not_in_cross = [feat for feat in target_nmi.index if feat not in cross_nmi.index]
    target_nmi = target_nmi.drop(not_in_cross, axis=0)
    target_nmi = target_nmi.replace([np.inf, -np.inf, np.nan], 0)

    p_of = get_rr_p_parameter_default
    c_of = get_rr_c_parameter_default

    selected = [target_nmi.nlargest(1).index[0]]

    n_available = len(cross_nmi.index)
    total = n_available if n_feat == -1 else min(n_available, n_feat)

    for step in range(total - 1):
        if (step + 1) % 50 == 0:
            LOG.info(f"Selected {step + 1}/{total} features...")

        p = p_of(step)
        c = c_of(step)

        # Rows: features still available; columns: features already chosen.
        score = cross_nmi.drop(index=selected)[selected]
        for candidate in score.index:
            redundancy = score.loc[candidate, :]
            score.loc[candidate, :] = target_nmi[candidate] / (redundancy**p + c)

        selected.append(score.min(axis=1).idxmax(axis=0))

    return selected
def _fit_apply_featurizers(
    self,
    df: pd.DataFrame,
    featurizers: Iterable[BaseFeaturizer],
    column: str,
    fit_to_df: bool = True,
) -> pd.DataFrame:
    """Bundle the given featurizers into a single `MultipleFeaturizer` and
    apply it to one column of the dataframe.

    Arguments:
        df: The DataFrame to featurize.
        featurizers: The matminer featurizers to apply to the DataFrame.
        column: The name of the column to apply the featurizers to.
        fit_to_df: If true, each featurizer is first fitted to
            `df[column]`. Otherwise the featurizers are used as passed, and
            any that require fitting are assumed to be fitted already.

    Returns:
        pandas.DataFrame: the decorated DataFrame, with multi-index columns.

    """
    LOG.info(f"Applying featurizers {featurizers} to column {column!r}.")

    members = (
        [feat.fit(df[column]) for feat in featurizers] if fit_to_df else featurizers
    )
    combined = MultipleFeaturizer(members)

    # Only override the featurizer's own parallelism setting when one was requested.
    if self._n_jobs is not None:
        combined.set_n_jobs(self._n_jobs)

    return combined.featurize_dataframe(
        df,
        column,
        multiindex=True,
        ignore_errors=True,
    )
def fit(
    self,
    training_data: MODData,
    n_jobs=1,
    **kwargs,
) -> None:
    """Train every model of the ensemble on the passed training `MODData` object.

    If `self.bootstrap` is set, each model is trained on its own resample
    (with replacement) of the training data; otherwise every model sees the
    full training data.

    Parameters:
        training_data: The data to train on.
        n_jobs: Number of worker processes. With `None` or a value of at
            most 1 the models are fitted sequentially in this process,
            otherwise in a pool of spawned processes.
        **kwargs: Passed on to the `fit` call of each individual model.

    """
    if self.bootstrap:
        LOG.info("Generating bootstrap data...")
        sample_indices = np.arange(len(training_data.df_targets))
        # Offset the seed per model: reusing one fixed `random_state` would
        # draw the exact same resample for every member of the ensemble.
        train_datas = [
            training_data.split(
                (resample(sample_indices, replace=True, random_state=2943 + i), [])
            )[0]
            for i in range(self.n_models)
        ]
    else:
        train_datas = [training_data for _ in range(self.n_models)]

    if n_jobs is None or n_jobs <= 1:
        for i in range(self.n_models):
            LOG.info(f"Bootstrap fitting model #{i + 1}/{self.n_models}")
            self.model[i].fit(train_datas[i], **kwargs)
            history = self.model[i].history
            model_summary = "".join(
                "{}: {:.4f}\t".format(k, history[k][-1]) for k in history.keys()
            )
            LOG.info(model_summary)
    else:
        tasks = []
        for i, m in enumerate(self.model):
            m._make_picklable()
            tasks.append(
                {
                    'model': m,
                    'training_data': train_datas[i],
                    'model_id': i,
                    **kwargs,
                }
            )

        ctx = multiprocessing.get_context('spawn')
        # The context manager terminates the workers if an error escapes the
        # loop, so that no processes are left behind.
        with ctx.Pool(processes=n_jobs) as pool:
            for model, model_id in tqdm.tqdm(
                pool.imap_unordered(_map_fit_MODNet, tasks, chunksize=1),
                total=self.n_models,
            ):
                model._restore_model()
                self.model[model_id] = model
                model_summary = f"Model #{model_id}\t"
                for k in model.history.keys():
                    model_summary += "{}: {:.4f}\t".format(k, model.history[k][-1])
                LOG.info(model_summary)
            pool.close()
            pool.join()
def feature_selection(
    self,
    n: int = 1500,
    cross_nmi: Optional[pd.DataFrame] = None,
    use_precomputed_cross_nmi: bool = False,
    n_jobs: Optional[int] = None,
):
    """Compute the mutual information between features and targets,
    then apply relevance-redundancy rankings to choose the top `n`
    features.

    Sets the `self.optimal_features` attribute to a list of feature
    names, and `self.optimal_features_by_target` to a dictionary of such
    lists keyed by target name. `self.cross_nmi` is set to the cross NMI
    that was used, and `self.target_nmi` to the target NMI of the last
    target that was processed.

    Args:
        n: number of desired features.
        cross_nmi: specify the cross NMI between features as a
            dataframe.
        use_precomputed_cross_nmi: Whether or not to use the cross NMI
            that was computed on Materials Project features, instead of
            precomputing. Takes precedence over `cross_nmi`.
        n_jobs: max. number of processes to use when calculating cross NMI.

    Raises:
        RuntimeError: If the data has not been featurized, has no targets,
            or if the cross NMI contains NaN values.

    """
    if getattr(self, "df_featurized", None) is None:
        raise RuntimeError(
            "Mutual information feature selection requires featurized data, please call `.featurize()`"
        )
    if getattr(self, "df_targets", None) is None:
        raise RuntimeError(
            "Mutual information feature selection requires target properties"
        )

    ranked_lists = []
    optimal_features_by_target = {}

    if cross_nmi is not None:
        self.cross_nmi = cross_nmi
    elif getattr(self, "cross_nmi", None) is None:
        # Make sure the attribute exists, e.g. for objects created before it
        # was introduced.
        self.cross_nmi = None

    # Loading mutual information between features
    if use_precomputed_cross_nmi:
        LOG.info("Loading cross NMI from 'Features_cross' file.")
        from modnet.ext_data import load_ext_dataset

        cnmi_path = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
        self.cross_nmi = pd.read_pickle(cnmi_path)
        precomputed_cols = set(self.cross_nmi.columns)
        featurized_cols = set(self.df_featurized.columns)
        if featurized_cols - precomputed_cols:
            LOG.warning(
                "Feature mismatch between precomputed `Features_cross` and `df_featurized`. "
                f"Missing columns: {featurized_cols - precomputed_cols}"
            )

    if self.cross_nmi is None:
        df = self.df_featurized.copy()
        self.cross_nmi, self.feature_entropy = get_cross_nmi(
            df, return_entropy=True, n_jobs=n_jobs
        )

    if self.cross_nmi.isna().sum().sum() > 0:
        raise RuntimeError(
            "Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero."
        )

    for i, name in enumerate(self.names):
        LOG.info(
            f"Starting target {i + 1}/{len(self.names)}: {self.names[i]} ..."
        )

        # Computing mutual information with target
        LOG.info("Computing mutual information between features and target...")
        if getattr(self, "num_classes", None) and self.num_classes[name] >= 2:
            task_type = "classification"
        else:
            task_type = "regression"
        self.target_nmi = nmi_target(
            self.df_featurized, self.df_targets[[name]], task_type
        )[name]

        LOG.info("Computing optimal features...")
        optimal_features_by_target[name] = get_features_dyn(
            n, self.cross_nmi, self.target_nmi
        )
        ranked_lists.append(optimal_features_by_target[name])

        LOG.info("Done with target {}/{}: {}.".format(i + 1, len(self.names), name))

    LOG.info("Merging all features...")
    self.optimal_features = merge_ranked(ranked_lists)
    self.optimal_features_by_target = optimal_features_by_target
    LOG.info("Done.")
def featurize(self, fast: bool = False, db_file: str = "feature_database.pkl", n_jobs=None):
    """For the input structures, construct many matminer features
    and save a featurized dataframe.

    With `fast=True`, previously computed features are looked up by
    structure ID in the module-level `DATABASE` (which is read from
    `db_file` if it is still empty) and only the structures that are not
    found there are featurized. Infinite and NaN feature values are replaced
    with 0.

    Sets the `self.df_featurized` attribute.

    Args:
        fast (bool): whether or not to try to load from a backup.
        db_file (str): filename of a pickled dataframe containing
            with the same ID index as this `MODData` object.
        n_jobs: if not `None`, the number of jobs to set on the featurizer.

    Raises:
        RuntimeError: If this object already has a featurized dataframe.

    """
    global DATABASE

    LOG.info("Computing features, this can take time...")

    df_done = None
    df_todo = None

    if n_jobs is not None:
        self.featurizer.set_n_jobs(n_jobs)

    if self.df_featurized is not None:
        raise RuntimeError("Not overwriting existing featurized dataframe.")

    if fast:
        LOG.info("Fast featurization on, retrieving from database...")

        if DATABASE.empty:
            DATABASE = pd.read_pickle(db_file)

        ids_done = [x for x in self.structure_ids if x in DATABASE.index]

        LOG.info(
            f"Retrieved features for {len(ids_done)} out of {len(self.structure_ids)} materials"
        )
        df_done = DATABASE.loc[ids_done]
        df_todo = self.df_structure.drop(ids_done, axis=0)

    # if any structures were already loaded
    if fast and not df_done.empty:
        # if any are left to compute, do them
        if len(df_todo) > 0:
            df_finished = self.featurizer.featurize(df_todo)
            # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
            # equivalent that works across pandas versions.
            df_final = pd.concat([df_done, df_finished])
            df_final = df_final.reindex(self.structure_ids)
        # otherwise, all structures were successfully loaded
        else:
            df_final = df_done
    # otherwise, no structures were loaded, so we need to compute all
    else:
        df_final = self.featurizer.featurize(self.df_structure)

    df_final = df_final.replace([np.inf, -np.inf, np.nan], 0)

    self.df_featurized = df_final
    LOG.info("Data has successfully been featurized!")
def __init__(
    self,
    materials: Optional[List[Union[Structure, Composition]]] = None,
    targets: Optional[Union[List[float], np.ndarray]] = None,
    target_names: Optional[Iterable] = None,
    structure_ids: Optional[Iterable] = None,
    num_classes: Optional[Dict[str, int]] = None,
    df_featurized: Optional[pd.DataFrame] = None,
    featurizer: Optional[Union[MODFeaturizer, str]] = None,
    structures: Optional[List[Union[Structure, Composition]]] = None,
):
    """Initialise the MODData object either from a list of structures
    or from an already featurized dataframe. Prediction targets per
    structure can be specified as lists or an array alongside their
    target names. A list of unique IDs can be provided to label the
    structures.

    Args:
        materials: list of structures or compositions to featurize and predict.
        targets: optional List of targets corresponding to each structure. When learning on
            multiple targets this is a ndarray where each column corresponds to a target,
            i.e. of shape (n_materials,n_targets).
        target_names: optional Iterable (e.g. list) of names of target properties to use in
            the dataframe. If omitted, the names `prop0`, `prop1`, ... are generated, one
            per target column.
        structure_ids: optional Iterable of unique IDs to use instead of generated integers.
        num_classes: Dictionary defining the target types (classification or regression).
            Should be constructed as follows: key: string giving the target name; value:
            integer n, with n=0 for regression and n>=2 for classification with n the number
            of classes.
        df_featurized: optional featurized dataframe to use instead of
            featurizing a new one. Should be passed without structures.
        featurizer: optional MODFeaturizer object to use for featurization, or string
            preset to look up in presets dictionary.
        structures: deprecated (alias to materials for backward compatibility) do not use this.

    Raises:
        RuntimeError: If neither materials nor `df_featurized` are passed, if their lengths
            differ, or if the requested featurizer preset does not exist.
        ValueError: If the targets, target names or structure IDs are inconsistent with the
            materials.

    """
    from modnet.featurizers.presets import FEATURIZER_PRESETS

    self.__modnet_version__ = __version__
    self.df_featurized = df_featurized
    self.featurizer = featurizer
    self.cross_nmi = None

    if structures is not None:
        # overwrite materials for backward compatibility
        materials = structures

    if materials is not None and self.df_featurized is not None:
        if len(materials) != len(self.df_featurized):
            raise RuntimeError(
                "Mismatched shape of structures and passed df_featurized"
            )

    if materials is None and self.df_featurized is None:
        raise RuntimeError(
            "At least one of `structures` or `df_featurized` should be passed to `MODData`."
        )

    if targets is not None:
        # Always work with a 2D array of shape (n_materials, n_targets).
        targets = np.array(targets).reshape((len(targets), -1))

    if materials is not None and targets is not None:
        if np.shape(targets)[0] != len(materials):
            raise ValueError(
                f"Targets must have same length as structures: {np.shape(targets)} vs {len(materials)}"
            )

    # The length guard avoids an IndexError on an empty list of materials.
    if (
        materials is not None
        and len(materials) > 0
        and isinstance(materials[0], Composition)
    ):
        materials = [CompositionContainer(s) for s in materials]
        self._composition_only = True

    if isinstance(featurizer, str):
        # Look the preset up *before* instantiating it, so that an unknown
        # name gives the intended error rather than a `TypeError` from
        # calling `None`.
        preset = FEATURIZER_PRESETS.get(featurizer)
        if preset is None:
            raise RuntimeError(
                f"Requested preset {featurizer} not found in available presets: {list(FEATURIZER_PRESETS.keys())}"
            )
        self.featurizer = preset()
    elif isinstance(featurizer, MODFeaturizer):
        self.featurizer = featurizer
    elif featurizer is None and self.df_featurized is None:
        if getattr(self, "_composition_only", False):
            self.featurizer = FEATURIZER_PRESETS["CompositionOnly"]()
        else:
            self.featurizer = FEATURIZER_PRESETS["DeBreuck2020"]()

    if self.featurizer is not None:
        LOG.info(f"Loaded {self.featurizer.__class__.__name__} featurizer.")

    if target_names is not None:
        if np.shape(targets)[-1] != len(target_names):
            raise ValueError("Target names must be supplied for every target.")
    elif targets is not None:
        # One generated name per target *column*, not per material.
        target_names = ["prop" + str(i) for i in range(np.shape(targets)[-1])]

    num_entries = len(materials) if materials is not None else len(df_featurized)

    if structure_ids is not None:
        # for backwards compat, always store the *passed* list of
        # IDs, so they can be used when loading from a database file
        structure_ids = list(structure_ids)
        # check ids are unique
        if len(set(structure_ids)) != len(structure_ids):
            raise ValueError(
                "List of IDs (`structure_ids`) provided must be unique."
            )

        # Compare against the number of entries rather than `len(materials)`,
        # as `materials` is `None` when only `df_featurized` was passed.
        if len(structure_ids) != num_entries:
            raise ValueError(
                "List of IDs (`structure_ids`) must have same length as list of structure."
            )
    else:
        structure_ids = [f"id{i}" for i in range(num_entries)]

    if targets is not None:
        # set up dataframe for targets with columns (id, property_1, ..., property_n)
        self.df_targets = pd.DataFrame(
            targets, index=structure_ids, columns=target_names
        )
        # set up number of classes
        self.num_classes = {name: 0 for name in self.target_names}
        if num_classes is not None:
            self.num_classes.update(num_classes)

    # set up dataframe for structures with columns (id, structure)
    self.df_structure = pd.DataFrame({"id": structure_ids, "structure": materials})
    self.df_structure.set_index("id", inplace=True)
def get_features_relevance_redundancy(
    target_nmi: pd.DataFrame,
    cross_nmi: pd.DataFrame,
    n_feat: Optional[int] = None,
    rr_parameters: Optional[Dict[str, Union[float, Callable[[int], float]]]] = None,
    return_pc: bool = False,
) -> List:
    """Select features from the Relevance Redundancy (RR) score between the input
    features and the target output.

    The RR is defined following Equation 2 of De Breuck et al, arXiv:2004.14766,
    with default values,

    ..math:: p = \\max{0.1, 4.5 - n^{0.4}},

    and

    ..math:: c = 10^{-6} n^3,

    where :math:`n` is the number of features in the "chosen" subset for that iteration.
    These values can be overriden with the `rr_parameters` dictionary argument.

    Args:
        target_nmi (pandas.DataFrame): dataframe containing the Normalized
            Mutual Information (NMI) between a list of input features and a
            target variable, as computed from :py:func:`nmi_target`. Only its
            first column is used.
        cross_nmi (pandas.DataFrame): dataframe containing the NMI between the
            input features, as computed from :py:func:`get_cross_nmi`.
        n_feat (int): Number of features for which the RR score needs to be computed
            (default: all features). Values larger than the number of features in
            `target_nmi` are capped to that number.
        rr_parameters (dict): Allows tuning of p and c parameters. Currently
            allows fixing of p and c to constant values instead of using the
            dynamical evaluation. Expects to find keys `"p"` and `"c"`, containing
            either a callable that takes `n` as an argument and returns the
            desired `p` or `c`, or another dictionary of the form
            `{"function": "constant", "value": <constant>}`.
        return_pc: Whether to return p and c values in the output dictionaries.

    Returns:
        list: List of dictionaries containing the results of the relevance-redundancy
            selection algorithm, with keys `"feature"`, `"RR_score"` and
            `"NMI_target"` (plus `"RR_p"` and `"RR_c"` if `return_pc` is set).
            The first entry has no RR score, as it is chosen on target NMI alone.

    Raises:
        ValueError: If the inputs are inconsistent with each other, or if
            `rr_parameters` is malformed.

    """

    def _parameter_getter(name):
        """Return the function of `n` that provides the RR parameter `name`."""
        setting = rr_parameters[name]
        if callable(setting):
            return setting
        if setting.get("function") == "constant":
            return lambda _: setting["value"]
        raise ValueError(
            f'If not passing a callable, "{name}" dict must contain keys "function" and "value".'
        )

    # Initial checks
    if set(cross_nmi.index) != set(cross_nmi.columns):
        raise ValueError(
            "The cross_nmi DataFrame should have its indices and columns identical."
        )
    if not set(target_nmi.index).issubset(set(cross_nmi.index)):
        raise ValueError(
            "The indices of the target DataFrame should be included in the cross_nmi DataFrame indices."
        )

    # Define the functions for the parameters
    if rr_parameters is None:
        get_p = get_rr_p_parameter_default
        get_c = get_rr_c_parameter_default
    else:
        if "p" not in rr_parameters or "c" not in rr_parameters:
            raise ValueError(
                "When tuning p and c with rr_parameters in get_features_relevance_redundancy, "
                "both parameters should be tuned"
            )
        get_p = _parameter_getter("p")
        get_c = _parameter_getter("c")

    # Set up the output list
    out = []

    # The first feature is the one with the largest target NMI
    target_column = target_nmi.columns[0]
    first_feature = target_nmi.nlargest(1, columns=target_column).index[0]
    feature_set = [first_feature]
    feat_out = {
        "feature": first_feature,
        "RR_score": None,
        "NMI_target": target_nmi[target_column][first_feature],
    }
    if return_pc:
        feat_out["RR_p"] = None
        feat_out["RR_c"] = None
    out.append(feat_out)

    # Default is to get the RR score for all features. Requests for more
    # features than are available are capped, as no candidates would be left
    # to choose from once every feature has been selected.
    n_available = len(target_nmi.index)
    n_feat = n_available if n_feat is None else min(n_feat, n_available)

    # Only features that have a target NMI can be selected
    missing = [x for x in cross_nmi.index if x not in target_nmi.index]
    cross_nmi = cross_nmi.drop(missing, axis=0).drop(missing, axis=1)

    # Loop on the number of features; `n` features have been selected so far
    for n in range(1, n_feat):
        LOG.debug("In selection of feature {}/{} features...".format(n + 1, n_feat))
        if n % 50 == 0:
            LOG.info("Selected {}/{} features...".format(n, n_feat))

        p = get_p(n)
        c = get_c(n)

        # Compute the RR score
        score = cross_nmi.copy()
        # Remove features already selected for the index
        score = score.drop(feature_set, axis=0)
        # Use features already selected to compute the maximum NMI between
        # the remaining features and those already selected
        score = score[feature_set]

        # Get the scores of the remaining features
        for i in score.index:
            row = score.loc[i, :]
            score.loc[i, :] = target_nmi.loc[i, target_column] / (row**p + c)

        # Get the next feature (the one with the highest score)
        scores_remaining_features = score.min(axis=1)
        next_feature = scores_remaining_features.idxmax(axis=0)
        feature_set.append(next_feature)

        # Add the results for the next feature to the list
        feat_out = {
            "feature": next_feature,
            "RR_score": scores_remaining_features[next_feature],
            "NMI_target": target_nmi[target_column][next_feature],
        }
        if return_pc:
            feat_out["RR_p"] = p
            feat_out["RR_c"] = c

        out.append(feat_out)

    return out
def get_cross_nmi(
    df_feat: pd.DataFrame,
    drop_thr: float = 0.2,
    return_entropy=False,
    n_jobs: int = None,
    **kwargs,
) -> pd.DataFrame:
    """Computes the Normalized Mutual Information (NMI) between input features.

    The mutual information of every pair of features is divided by the mean
    of the information entropies of the two features. Features with an
    information entropy below `drop_thr`, or whose values span a range
    smaller than `EPS`, are left out of the result.

    Args:
        df_feat (pandas.DataFrame): Dataframe containing the input features for
            which the NMI with the target variable is to be computed.
        drop_thr: Features having an information entropy (or self mutual information) threshold
            below this value will be dropped.
        return_entropy: If set to True, the information entropy of each feature is also returned
        n_jobs: number of worker processes to use (default: 1).
        **kwargs: Only `random_state` (default: a fresh `numpy.random.RandomState`)
            and `n_neighbors` (default: 3) are used; they are handed to each
            mutual information task. This can be useful e.g. for testing purposes.

    Returns:
        mutual_info: pandas.DataFrame containing the Normalized Mutual Information between features.
        if return_entropy=True : (mutual_info, diag): With diag a dictionary with all features as keys and information entropy as values.

    """
    # Test against `None` rather than truthiness, so that an explicit
    # `random_state=0` is honoured instead of being silently replaced.
    seed = kwargs.pop("random_state", None)
    if seed is None:
        seed = np.random.RandomState()
    n_neighbors = kwargs.pop("n_neighbors", None) or 3

    # Prepare the output DataFrame and compute the mutual information
    mutual_info = pd.DataFrame([], columns=df_feat.columns, index=df_feat.columns)

    if n_jobs is None:
        n_jobs = 1

    # create pool of workers; the context manager makes sure that they are
    # terminated if anything goes wrong part-way through
    with Pool(processes=n_jobs) as pool:
        LOG.info(f"Multiprocessing on {n_jobs} workers.")

        # Compute the "self" mutual information (i.e. information entropy) of the features
        LOG.info('Computing "self" MI (i.e. information entropy) of features')
        diag = {}
        tasks = [
            {
                "x": df_feat[x_feat].values,
                "y": df_feat[x_feat].values,
                "x_name": x_feat,
                "y_name": x_feat,
                "random_state": seed,
                "n_neighbors": n_neighbors,
            }
            for x_feat in df_feat.columns
        ]

        for res in tqdm.tqdm(
            pool.imap_unordered(map_mi, tasks, chunksize=100), total=len(tasks)
        ):
            feat_name = res[1]
            diag[feat_name] = res[0]
            if (
                diag[feat_name] < drop_thr
                or abs(df_feat[feat_name].max() - df_feat[feat_name].min()) < EPS
            ):
                mutual_info.drop(feat_name, axis=0, inplace=True)
                mutual_info.drop(feat_name, axis=1, inplace=True)
            else:
                mutual_info.loc[feat_name, feat_name] = 1.0

        LOG.info("Computing cross NMI between all features...")
        # The NMI is symmetric, so only compute each pair once
        tasks = []
        for idx, x_feat in enumerate(mutual_info.columns):
            for y_feat in mutual_info.columns[idx + 1:]:
                tasks.append(
                    {
                        "x": df_feat[x_feat].values,
                        "y": df_feat[y_feat].values,
                        "x_name": x_feat,
                        "y_name": y_feat,
                        "random_state": seed,
                        "n_neighbors": n_neighbors,
                    }
                )

        for res in tqdm.tqdm(
            pool.imap_unordered(map_mi, tasks, chunksize=100), total=len(tasks)
        ):
            normalized = res[0] / (0.5 * (diag[res[1]] + diag[res[2]]))
            mutual_info.loc[res[1], res[2]] = normalized
            mutual_info.loc[res[2], res[1]] = normalized

        pool.close()
        pool.join()

    mutual_info.fillna(0, inplace=True)  # if na => no relation => set to zero

    if return_entropy:
        # diag can be useful for future elimination based on entropy without
        # the need of recomputing the cross NMI
        return (
            mutual_info,
            diag,
        )
    else:
        return mutual_info
def train_fold(
    fold: Tuple[int, Tuple[MODData, MODData]],
    target: List[str],
    target_weights: Dict[str, float],
    fit_settings: Dict[str, Any],
    model_type: Type[MODNetModel] = MODNetModel,
    presets=None,
    hp_optimization=True,
    classification=False,
    save_folds=False,
    fast=False,
    save_models=False,
    nested=False,
    n_jobs=None,
    **model_kwargs,
) -> dict:
    """Train one fold of a CV.

    Unless stated, all arguments have the same meaning as in `matbench_benchmark(...)`.

    Arguments:
        fold: A tuple containing the fold index, and another tuple of the
            training MODData and test MODData.

    Returns:
        A dictionary summarising the fold results, with keys `"predictions"`,
        `"targets"`, `"errors"`, `"scores"`, `"best_presets"` and `"model"`,
        plus `"stds"` if the model returned uncertainties. With
        `hp_optimization` the keys `"nested_losses"`,
        `"nested_learning_curves"` and `"best_learning_curves"` are also set.
        If the evaluation of the model fails, the traceback is printed and
        the predictions, errors and scores are `None`.

    """
    fold_ind, (train_data, test_data) = fold

    results = {}
    # Work on a copy so that the caller's settings are not modified.
    fit_settings = dict(fit_settings)
    if classification:
        fit_settings["num_classes"] = {t: 2 for t in target_weights}

    multi_target = bool(len(target) - 1)

    # If not performing hp_optimization, load model init settings from fit_settings
    model_settings = {}
    if not hp_optimization:
        model_settings = {
            "num_neurons": fit_settings["num_neurons"],
            "num_classes": fit_settings.get("num_classes"),
            "act": fit_settings.get("act"),
            "out_act": fit_settings.get("out_act", "linear"),
            "n_feat": fit_settings["n_feat"],
        }

    model_settings.update(model_kwargs)

    model = model_type(target, target_weights, **model_settings)

    # Only known when presets are optimised; reported as `None` otherwise.
    best_presets = None

    if hp_optimization:
        (
            models,
            val_losses,
            best_learning_curve,
            learning_curves,
            best_presets,
        ) = model.fit_preset(
            train_data,
            presets=presets,
            fast=fast,
            classification=classification,
            nested=nested,
            n_jobs=n_jobs,
        )
        if save_models:
            for ind, nested_model in enumerate(models):
                score = val_losses[ind]
                nested_model.save(f"results/nested_model_{fold_ind}_{ind}_{score:3.3f}")

            # NOTE(review): `score` is the validation loss of the *last*
            # nested model at this point -- confirm that this is the value
            # intended for the name of the best model.
            model.save(f"results/best_model_{fold_ind}_{score:3.3f}")

        results["nested_losses"] = val_losses
        results["nested_learning_curves"] = learning_curves
        results["best_learning_curves"] = best_learning_curve
    else:
        if fit_settings["increase_bs"]:
            model.fit(
                train_data,
                lr=fit_settings["lr"],
                epochs=fit_settings["epochs"],
                batch_size=fit_settings["batch_size"],
                loss="mse",
            )
            model.fit(
                train_data,
                lr=fit_settings["lr"] / 7,
                epochs=fit_settings["epochs"] // 2,
                batch_size=fit_settings["batch_size"] * 2,
                loss=fit_settings["loss"],
            )
        else:
            model.fit(train_data, **fit_settings)

    # Defined up front so that the results can still be assembled when the
    # evaluation below fails at any point.
    targets = test_data.df_targets
    predictions = None
    stds = None
    errors = None
    score = None

    try:
        predict_kwargs = {}
        if classification:
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results

        if classification:
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            pred_bool = model.predict(test_data, return_prob=False)
            LOG.info(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values))
    except Exception:
        # Deliberately best-effort: report the failure, but still return the
        # trained model and whatever else is available for this fold.
        print_exc()
        print("Something went wrong benchmarking this model.")
        predictions = None
        errors = None
        score = None

    if save_folds:
        opt_feat = train_data.optimal_features[: fit_settings["n_feat"]]
        df_train = train_data.df_featurized
        df_train = df_train[opt_feat]
        # Files are numbered by the (1-based) index of this fold.
        df_train.to_csv("folds/train_f{}.csv".format(fold_ind + 1))
        df_test = test_data.df_featurized
        df_test = df_test[opt_feat]
        if errors is not None:
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
        df_test.to_csv("folds/test_f{}.csv".format(fold_ind + 1))

    results["predictions"] = predictions
    if stds is not None:
        results["stds"] = stds
    results["targets"] = targets
    results["errors"] = errors
    results["scores"] = score
    results["best_presets"] = best_presets
    results["model"] = model

    return results
def fit_preset(
    self,
    data: MODData,
    presets: List[Dict[str, Any]] = None,
    val_fraction: float = 0.15,
    verbose: int = 0,
    classification: bool = False,
    refit: bool = True,
    fast: bool = False,
    nested: int = 5,
    callbacks: List[Any] = None,
    n_jobs=None,
) -> Tuple[
    List[List[Any]],
    np.ndarray,
    Optional[List[float]],
    List[List[float]],
    Dict[str, Any],
]:
    """Chooses an optimal hyper-parametered MODNet model from different presets.

    This function implements the "inner loop" of a cross-validation workflow. By
    modifying the `nested` argument, it can be run in full nested mode (i.e.
    train n_fold * n_preset models) or just with a simple random hold-out set.

    Every preset is fitted on each training split and scored on the matching
    validation split (a random hold-out of `val_fraction`, 15% by default,
    when `nested` is 0). The preset with the lowest mean validation loss
    across all splits is selected. If `refit` is `True`, a new model is built
    from that preset and fitted on all of `data`; otherwise the model,
    `n_feat` and scaler of the best validated model are adopted.

    Args:
        data: MODData object contain training and validation samples.
        presets: A list of dictionaries containing custom presets. The
            dictionaries passed in are not modified.
        verbose: The verbosity level to pass to tf.keras
        val_fraction: The fraction of the data to use for validation.
        classification: Whether or not we are performing classification.
        refit: Whether or not to refit the final model on all of the data
            with the best-performing settings.
        fast: Used for debugging. If `True`, only fit the first 2 presets and
            reduce the number of epochs to 100.
        nested: integer specifying whether or not to perform a full nested CV.
            If 0, a simple validation split is performed based on
            val_fraction argument. If an integer, use this number of inner CV
            folds, ignoring the `val_fraction` argument. Note: If set to 1,
            the value will be overwritten to a default of 5 folds.
        callbacks: Callbacks handed to every fit. Defaults to a single early
            stopping callback monitoring the training loss.
        n_jobs: number of jobs for multiprocessing

    Returns:
        - A nested list of the trained models, indexed as
          `[preset][validation split]`.
        - An array of validation losses of shape
          (number of presets, number of validation splits).
        - The learning curve of the best validated model of the best preset.
        - A nested list of learning curves for each trained model, indexed
          as `[preset][validation split]`.
        - The settings of the best-performing preset.

    """

    from modnet.matbench.benchmark import matbench_kfold_splits

    if callbacks is None:
        es = tf.keras.callbacks.EarlyStopping(
            monitor="loss",
            min_delta=0.001,
            patience=100,
            verbose=verbose,
            mode="auto",
            baseline=None,
            restore_best_weights=False,
        )
        callbacks = [es]

    if presets is None:
        from modnet.model_presets import gen_presets

        presets = gen_presets(
            len(data.optimal_features),
            len(data.df_targets),
            classification=classification,
        )

    if fast and len(presets) >= 2:
        # Work on copies so that the caller's preset dictionaries keep their
        # original number of epochs.
        presets = [dict(preset, epochs=100) for preset in presets[:2]]

    num_nested_folds = 5
    if nested:
        num_nested_folds = nested
    if num_nested_folds <= 1:
        num_nested_folds = 5

    # create tasks
    if not nested:
        splits = [
            train_test_split(
                range(len(data.df_featurized)), test_size=val_fraction
            )
        ]
        n_splits = 1
    else:
        splits = matbench_kfold_splits(
            data, n_splits=num_nested_folds, classification=classification
        )
        n_splits = num_nested_folds

    train_val_datas = []
    for train, val in splits:
        train_val_datas.append(data.split((train, val)))

    tasks = []
    for i, params in enumerate(presets):
        # Never ask for more features than are available.
        n_feat = min(len(data.get_optimal_descriptors()), params["n_feat"])

        for ind in range(n_splits):
            val_params = {}
            train_data, val_data = train_val_datas[ind]
            val_params["val_data"] = val_data

            tasks += [
                {
                    "train_data": train_data,
                    "targets": self.targets,
                    "weights": self.weights,
                    "num_classes": self.num_classes,
                    "n_feat": n_feat,
                    "num_neurons": params["num_neurons"],
                    "lr": params["lr"],
                    "batch_size": params["batch_size"],
                    "epochs": params["epochs"],
                    "loss": params["loss"],
                    "act": params["act"],
                    "out_act": self.out_act,
                    "callbacks": callbacks,
                    "preset_id": i,
                    "fold_id": ind,
                    "verbose": verbose,
                    **val_params,
                }
            ]

    # Large sentinel loss so that a slot that was never filled cannot win.
    val_losses = 1e20 * np.ones((len(presets), n_splits))
    learning_curves = [[None for _ in range(n_splits)] for _ in range(len(presets))]
    models = [[None for _ in range(n_splits)] for _ in range(len(presets))]

    ctx = multiprocessing.get_context("spawn")
    pool = ctx.Pool(processes=n_jobs)
    try:
        LOG.info(
            f"Multiprocessing on {n_jobs} cores. Total of {multiprocessing.cpu_count()} cores available."
        )

        for res in tqdm.tqdm(
            pool.imap_unordered(map_validate_model, tasks, chunksize=1),
            total=len(tasks),
        ):
            val_loss, learning_curve, model, preset_id, fold_id = res
            LOG.info(f"Preset #{preset_id} fitting finished, loss: {val_loss}")
            # reload the model object after serialization
            model._restore_model()

            val_losses[preset_id, fold_id] = val_loss
            learning_curves[preset_id][fold_id] = learning_curve
            models[preset_id][fold_id] = model

        pool.close()
    except BaseException:
        # Do not leave worker processes running if a task (or the handling
        # of its result) fails.
        pool.terminate()
        raise
    finally:
        pool.join()

    val_loss_per_preset = np.mean(val_losses, axis=1)
    best_preset_idx = int(np.argmin(val_loss_per_preset))
    best_model_idx = int(np.argmin(val_losses[best_preset_idx, :]))
    best_preset = presets[best_preset_idx]
    best_learning_curve = learning_curves[best_preset_idx][best_model_idx]
    best_model = models[best_preset_idx][best_model_idx]

    # Tasks were appended preset by preset, n_splits entries each.
    LOG.info(
        "Preset #{} resulted in lowest validation loss with params {}".format(
            best_preset_idx + 1, tasks[n_splits * best_preset_idx + best_model_idx]
        )
    )

    if refit:
        LOG.info("Refitting with all data and parameters: {}".format(best_preset))
        # Building final model

        n_feat = min(len(data.get_optimal_descriptors()), best_preset["n_feat"])
        self.model = MODNetModel(
            self.targets,
            self.weights,
            num_neurons=best_preset["num_neurons"],
            n_feat=n_feat,
            act=best_preset["act"],
            out_act=self.out_act,
            num_classes=self.num_classes,
        ).model
        self.n_feat = n_feat
        self.fit(
            data,
            val_fraction=0,
            lr=best_preset["lr"],
            epochs=best_preset["epochs"],
            batch_size=best_preset["batch_size"],
            loss=best_preset["loss"],
            callbacks=callbacks,
            verbose=verbose,
        )
    else:
        self.n_feat = best_model.n_feat
        self.model = best_model.model
        self._scaler = best_model._scaler

    return models, val_losses, best_learning_curve, learning_curves, best_preset
def gen_presets(
    n_feat: int, n_samples: int, classification: bool = False
) -> List[Dict[str, Any]]:
    """Generates sensible preset architectures and learning parameters based
    on number of samples and features.

    Arguments:
        n_feat: The number of training features available to the model.
        n_samples: The number of training samples available to the model.
        classification: If `True`, use a categorical cross-entropy loss
            instead of the mean absolute error.

    Returns:
        List of dictionaries to individually pass as kwargs to `model.fit(...)`.

    Raises:
        ValueError: If `n_feat` is smaller than 1.

    """
    if n_feat < 1:
        raise ValueError(f"n_feat must be a positive integer, got {n_feat}.")

    batch_sizes = [32, 64] if n_samples < 1000 else [64]
    learning_rates = [0.001, 0.005, 0.01]
    epochs = [1000]
    losses = ["categorical_crossentropy"] if classification else ["mae"]
    activations = ["elu"]
    xscale = ["minmax", "standard"]

    # Keep the standard sizes that do not exceed the available features but
    # still use more than 5% of them.
    n_feat_list = [n for n in (64, 128, 256, 512) if n_feat / 20 < n <= n_feat]
    if not n_feat_list:
        # No standard size fits (fewer than 64 or more than 10240 features):
        # start from half of the available features instead.
        n_feat_list = [max(1, n_feat // 2)]
    if len(n_feat_list) == 1:
        n_feat_list.append(n_feat)
    if len(n_feat_list) < 3:
        n_feat_list.append((n_feat_list[0] + n_feat_list[1]) // 2)
    # Drop duplicates (e.g. n_feat == 64), which would only repeat presets.
    n_feat_list = sorted(set(n_feat_list))

    archs = []
    for nf in n_feat_list:
        # Layer widths are clamped so that small feature counts never yield
        # an empty layer; for nf >= 8 the clamp has no effect.
        half = max(1, nf // 2)
        quarter = max(1, nf // 4)
        eighth = max(1, nf // 8)
        archs += [
            (nf, [[nf * 2], [half], [eighth], [eighth]]),
            (nf, [[nf], [half], [eighth], [eighth]]),
            (nf, [[half], [quarter], [eighth], [eighth]]),
        ]

    LOG.info(
        "Proceeding with grid search: archs: {}, batch sizes: {}, learning_rates: {}".format(
            archs, batch_sizes, learning_rates
        )
    )

    hyperparam_presets = []
    for arch, bs, lr, n_epochs, loss, act, scaler in itertools.product(
        archs, batch_sizes, learning_rates, epochs, losses, activations, xscale
    ):
        hyperparam_presets.append(
            {
                "batch_size": bs,
                "lr": lr,
                "n_feat": arch[0],
                "num_neurons": arch[1],
                "epochs": n_epochs,
                "loss": loss,
                "act": act,
                "xscale": scaler,
            }
        )

    return hyperparam_presets