def test_small_moddata_featurization():
    """This test creates a new MODData from the MP 2018.6 structures."""
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_small.zip")

    # Loading pickles can be dangerous, so let's at least check that the SHA512 matches
    # what it was when created
    assert get_sha512_of_file(data_file) == (
        "37bd4f8ce6f29c904a13e5670dd53af9a8779094727052ec85ccd6362b1b3765"
        "ac613426331811b3f626242896d87c3f6bc1884cc5545875b5ae66a712f9e218"
    )

    old = MODData.load(data_file)
    structures = old.structures
    targets = old.targets
    names = old.names

    new = MODData(structures, targets, target_names=names)
    new.featurize(fast=False)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    # Compare the column names pairwise; the print eases debugging if a mismatch occurs
    for i in range(len(old_cols)):
        print(new_cols[i], old_cols[i])
        assert new_cols[i] == old_cols[i]

    np.testing.assert_array_equal(old_cols, new_cols)

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )

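# The SHA512 helper used above is defined elsewhere in the test suite; as a rough
# sketch (an assumption about its behaviour, not the project's actual
# implementation), such a helper could simply stream the file through hashlib:
def _example_sha512_of_file(path, chunk_size=65536):
    """Illustrative only: return the hex SHA512 digest of a file, read in chunks."""
    import hashlib

    sha512 = hashlib.sha512()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            sha512.update(chunk)
    return sha512.hexdigest()
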
def load_or_featurize(task, n_jobs=1):
    """Loads the first precomputed MODData found in "./precomputed/", or
    featurizes the task from scratch if none is available."""
    data_files = glob.glob("./precomputed/*.gz")
    if len(data_files) == 0:
        data = featurize(task, n_jobs=n_jobs)
    else:
        precomputed_moddata = data_files[0]
        if len(data_files) > 1:
            print(
                f"Found multiple data files {data_files}, loading the first {data_files[0]}"
            )

        data = MODData.load(precomputed_moddata)

    return data

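# A hedged usage sketch: the task name below is an illustrative assumption, not a
# value taken from this file.
def _example_load_or_featurize_usage():
    # Loads any "./precomputed/*.gz" MODData if present, otherwise featurizes the task.
    data = load_or_featurize("example_task", n_jobs=4)
    return data
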
def _load_moddata(filename):
    """Loads the pickled MODData from the test directory and checks its hash."""
    from modnet.preprocessing import MODData

    data_file = Path(__file__).parent.joinpath(f"data/{filename}")
    if filename not in _TEST_DATA_HASHES:
        raise RuntimeError(
            f"Cannot verify hash of {filename} as it was not provided, will not load pickle."
        )

    # Loading pickles can be dangerous, so let's at least check that the hash matches
    # what it was when created
    assert get_hash_of_file(data_file) == _TEST_DATA_HASHES[filename]

    return MODData.load(data_file)

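# For illustration only: the module-level `_TEST_DATA_HASHES` lookup consumed above
# maps pickled test file names to the digests recorded when they were created, e.g.
# (placeholder value, not a real digest):
#
#     _TEST_DATA_HASHES = {
#         "MP_2018.6_subset.zip": "<sha512 hex digest recorded at creation time>",
#     }
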
def test_load_moddata_zip():
    """This test checks that older MODData objects can still be loaded."""
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_subset.zip")

    # Loading pickles can be dangerous, so let's at least check that the SHA512 matches
    # what it was when created
    assert get_sha512_of_file(data_file) == (
        "d7d75e646dbde539645c8c0b065fd82cbe93f81d3500809655bd13d0acf2027c"
        "1786091a73f53985b08868c5be431a3c700f7f1776002df28ebf3a12a79ab1a1"
    )

    data = MODData.load(data_file)
    assert len(data.structures) == 100
    assert len(data.mpids) == 100
    assert len(data.df_structure) == 100
    assert len(data.df_featurized) == 100
    assert len(data.df_targets) == 100

def matbench_benchmark(
    data: MODData,
    target: List[str],
    target_weights: Dict[str, float],
    fit_settings: Optional[Dict[str, Any]] = None,
    classification: bool = False,
    model_type: Type[MODNetModel] = MODNetModel,
    save_folds: bool = False,
    save_models: bool = False,
    hp_optimization: bool = True,
    inner_feat_selection: bool = True,
    use_precomputed_cross_nmi: bool = True,
    presets: Optional[List[dict]] = None,
    fast: bool = False,
    n_jobs: Optional[int] = None,
    nested: bool = False,
    **model_init_kwargs,
) -> dict:
    """Train and cross-validate a model against Matbench data splits, optionally
    performing hyperparameter optimisation.

    Arguments:
        data: The entire dataset as a `MODData`.
        target: The list of target names to train on.
        target_weights: The target weights to use for the `MODNetModel`.
        fit_settings: Any settings to pass to `model.fit(...)` directly
            (typically when not performing hyperparameter optimisation).
        classification: Whether all tasks are classification rather than regression.
        model_type: The type of the model to create and benchmark.
        save_folds: Whether to save dataframes with pre-processed fold
            data (e.g. feature selection).
        save_models: Whether to pickle all trained models according to
            their fold index and performance.
        hp_optimization: Whether to perform hyperparameter optimisation.
        inner_feat_selection: Whether to perform split-level feature
            selection or try to use pre-computed values.
        use_precomputed_cross_nmi: Whether to use the precomputed cross NMI
            from the Materials Project dataset, or recompute per fold.
        presets: Override the built-in hyperparameter grid with these presets.
        fast: Whether to perform debug training, i.e. reduced presets and epochs.
        n_jobs: Try to parallelise the inner `fit_preset` over this number of
            processes, capped at number_of_presets * nested_folds.
        nested: Whether to perform nested CV for hyperparameter optimisation.
        **model_init_kwargs: Additional arguments to pass to the model on creation.

    Returns:
        A dictionary containing all the results from the training, broken
        down by model and by fold.

    """

    if fit_settings is None:
        fit_settings = {}

    if not fit_settings.get("n_feat"):
        nf = len(data.df_featurized.columns)
        fit_settings["n_feat"] = nf

    if not fit_settings.get("num_neurons"):
        # Pass dummy network
        fit_settings["num_neurons"] = [[4], [4], [4], [4]]

    fold_data = []
    results = defaultdict(list)

    for ind, (train, test) in enumerate(matbench_kfold_splits(data)):
        train_data, test_data = data.split((train, test))
        if inner_feat_selection:
            path = "folds/train_moddata_f{}".format(ind + 1)
            if os.path.isfile(path):
                train_data = MODData.load(path)
            else:
                train_data.feature_selection(
                    n=-1, use_precomputed_cross_nmi=use_precomputed_cross_nmi
                )
            os.makedirs("folds", exist_ok=True)
            train_data.save(path)

        fold_data.append((train_data, test_data))

    args = (target, target_weights, fit_settings)

    model_kwargs = {
        "model_type": model_type,
        "hp_optimization": hp_optimization,
        "fast": fast,
        "classification": classification,
        "save_folds": save_folds,
        "presets": presets,
        "save_models": save_models,
        "nested": nested,
        "n_jobs": n_jobs,
    }

    model_kwargs.update(model_init_kwargs)

    fold_results = []
    for fold in enumerate(fold_data):
        fold_results.append(train_fold(fold, *args, **model_kwargs))

    for fold in fold_results:
        for key in fold:
            results[key].append(fold[key])

    return results

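# A hedged usage sketch of the benchmark above; the MODData path, target name and
# weight are illustrative assumptions, not values taken from this file.
def _example_matbench_benchmark_usage():
    data = MODData.load("precomputed/example_task.gz")  # hypothetical precomputed MODData
    results = matbench_benchmark(
        data,
        target=["example_target"],
        target_weights={"example_target": 1.0},
        hp_optimization=True,
        n_jobs=4,
    )
    # `results` maps each key returned by `train_fold` to a list with one entry per fold.
    return results
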
def run_predict(data, final_model, settings, save_folds=False, dknn_only=False):
    """
    Runs the benchmark based on final_model without training everything again.
    It also computes the kNN distance and stores it in the results pickle.
    Ultimately, this should be integrated inside the modnet benchmark.

    :param data: the entire dataset as a MODData
    :param final_model: the fitted EnsembleMODNetModel containing all inner models
    :param settings: dict of benchmark settings; must contain "task" and may contain "classification"
    :param save_folds: whether to save the per-fold train/test dataframes to CSV
    :param dknn_only: only recompute the kNN distances and append them to the existing results pickle
    :return: dict of per-fold predictions, targets, errors, scores and models
    """

    task = settings["task"]
    # rebuild the EnsembleMODNetModels from the final model
    n_best_archs = 5  # change this (from 1 to 5 max) to adapt the number of inner best archs chosen
    bootstrap_size = 5
    outer_fold_size = bootstrap_size * 5 * 5
    inner_fold_size = bootstrap_size * 5
    models = []

    multi_target = bool(len(data.df_targets.columns) - 1)

    for i in range(5):  # outer fold
        modnet_models = []
        for j in range(5):  # inner fold
            modnet_models += final_model.model[
                (i * outer_fold_size) + (j * inner_fold_size):
                (i * outer_fold_size) + (j * inner_fold_size) + (n_best_archs * bootstrap_size)
            ]
        model = EnsembleMODNetModel(modnet_models=modnet_models)
        models.append(model)

    if dknn_only:
        with open(f"results/{task}_results.pkl", "rb") as f:
            results = pickle.load(f)
        results["dknns"] = []
    else:
        results = defaultdict(list)

    for ind, (train, test) in enumerate(
        matbench_kfold_splits(data, classification=settings.get("classification", False))
    ):
        train_data, test_data = data.split((train, test))
        path = "folds/train_moddata_f{}".format(ind + 1)
        train_data = MODData.load(path)
        assert (
            len(set(train_data.df_targets.index).intersection(set(test_data.df_targets.index))) == 0
        )
        model = models[ind]

        # compute dkNN
        # TODO: test this quickly before submitting
        max_feat_model = np.argmax([m.n_feat for m in model.model])
        n_feat = model.model[max_feat_model].n_feat
        feature_names = model.model[max_feat_model].optimal_descriptors
        dknn = get_dknn(train_data, test_data, feature_names)
        results["dknns"].append(dknn)
        if dknn_only:
            continue

        predict_kwargs = {}
        if settings.get("classification"):
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results
            stds = None

        targets = test_data.df_targets

        if settings.get("classification"):
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            pred_bool = model.predict(test_data, return_prob=False)
            print(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values))

        if save_folds:
            opt_feat = train_data.optimal_features[:n_feat]
            df_train = train_data.df_featurized
            df_train = df_train[opt_feat]
            df_train.to_csv("folds/train_f{}.csv".format(ind + 1))
            df_test = test_data.df_featurized
            df_test = df_test[opt_feat]
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
            df_test.to_csv("folds/test_f{}.csv".format(ind + 1))

        results["predictions"].append(predictions)
        if stds is not None:
            results["stds"].append(stds)
        results["targets"].append(targets)
        results["errors"].append(errors)
        results["scores"].append(score)
        results["model"].append(model)

    return results
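

# A hedged usage sketch of `run_predict`; the MODData path and settings dict below
# are illustrative assumptions, not values taken from this file.
def _example_run_predict_usage(final_model):
    # `final_model` is assumed to be the fitted EnsembleMODNetModel produced by the
    # nested benchmark (5 outer folds x 5 inner folds x 5 bootstraps x 5 archs).
    data = MODData.load("precomputed/example_task.gz")  # hypothetical precomputed MODData
    settings = {"task": "example_task", "classification": False}
    # Returns per-fold predictions, targets, errors, scores and kNN distances.
    return run_predict(data, final_model, settings, save_folds=False)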