def test_train_small_bootstrap_single_target_classif(small_moddata, tf_session):
    """Tests the single target training."""
    from modnet.models import EnsembleMODNetModel

    data = small_moddata

    # restrict the 'optimal' feature set to the ElementProperty columns
    element_cols = []
    for col in data.df_featurized.columns:
        if col.startswith("ElementProperty"):
            element_cols.append(col)
    data.optimal_features = element_cols

    # derive a binary metallicity label from the band gap (metal iff egap == 0)
    data.df_targets["is_metal"] = data.df_targets["egap"].apply(
        lambda egap: 1 if egap == 0 else 0
    )

    model = EnsembleMODNetModel(
        [[["is_metal"]]],
        weights={"is_metal": 1},
        num_neurons=[[16], [8], [8], [4]],
        num_classes={"is_metal": 2},
        n_feat=10,
        n_models=3,
        bootstrap=True,
    )
    model.fit(data, epochs=5)

    # exercise both plain prediction and prediction with uncertainty
    model.predict(data)
    model.predict(data, return_unc=True)
def test_train_small_bootstrap_multi_target(small_moddata, tf_session):
    """Tests the multi-target training."""
    from modnet.models import EnsembleMODNetModel

    data = small_moddata

    # manually select the ElementProperty columns as the 'optimal' features
    data.optimal_features = [
        name
        for name in data.df_featurized.columns
        if name.startswith("ElementProperty")
    ]

    model = EnsembleMODNetModel(
        [[["eform", "egap"]]],
        weights={"eform": 1, "egap": 1},
        num_neurons=[[16], [8], [8], [4]],
        n_feat=10,
        n_models=3,
        bootstrap=True,
    )
    model.fit(data, epochs=5)

    # prediction with uncertainty must work for the bootstrapped ensemble
    model.predict(data, return_unc=True)
def run_predict(data, final_model, settings, save_folds=False, dknn_only=False):
    """
    Runs benchmark based on final_model without training everything again.
    It also computes the Knn distance and puts it in the results pickle.
    In fine, this should be integrated inside modnet benchmark.

    :param data: full benchmark MODData (featurized, with targets).
    :param final_model: ensemble whose `.model` list holds every trained
        sub-model for all outer/inner folds, in fold-major order —
        assumed layout; TODO confirm against the training script.
    :param settings: dict with at least "task"; "classification" optional.
    :param save_folds: when True, dump per-fold train/test feature CSVs.
    :param dknn_only: when True, only recompute the dkNN distances and
        append them into the previously pickled results.
    :return: dict (or defaultdict) of per-fold lists: predictions, stds,
        targets, errors, scores, model, dknns.
    """
    task = settings["task"]
    # rebuild the EnsembleMODNetModels from the final model
    n_best_archs = 5  # change this (from 1 to 5 max) to adapt number of inner best archs chosen
    bootstrap_size = 5
    # 5 outer folds x 5 inner folds x bootstrap_size models per architecture
    outer_fold_size = bootstrap_size * 5 * 5
    inner_fold_size = bootstrap_size * 5
    models = []

    # more than one target column (minus the index-like first?) => multi-target
    # NOTE(review): this assumes exactly one "extra" column in the single-target
    # case — verify against how df_targets is built upstream.
    multi_target = bool(len(data.df_targets.columns) - 1)

    for i in range(5):  # outer fold
        modnet_models = []
        for j in range(5):  # inner fold
            # take the first n_best_archs * bootstrap_size sub-models of this
            # inner fold's contiguous slice of final_model.model
            modnet_models += (
                final_model.model[
                    (i * outer_fold_size) + (j * inner_fold_size):
                    (i * outer_fold_size) + (j * inner_fold_size)
                    + (n_best_archs * bootstrap_size)
                ]
            )
        model = EnsembleMODNetModel(modnet_models=modnet_models)
        models.append(model)

    if dknn_only:
        # reuse previously computed results, only refreshing the dkNN list
        with open(f"results/{task}_results.pkl", "rb") as f:
            results = pickle.load(f)
        results["dknns"] = []
    else:
        results = defaultdict(list)

    for ind, (train, test) in enumerate(
        matbench_kfold_splits(
            data, classification=settings.get("classification", False)
        )
    ):
        train_data, test_data = data.split((train, test))
        # the split train_data is immediately replaced by the saved fold's
        # MODData so the exact training featurization/selection is reused
        path = "folds/train_moddata_f{}".format(ind + 1)
        train_data = MODData.load(path)
        # sanity check: no leakage between train and test indices
        assert (
            len(
                set(train_data.df_targets.index).intersection(
                    set(test_data.df_targets.index)
                )
            )
            == 0
        )
        model = models[ind]

        # compute dkNN on the feature set of the widest sub-model
        # TODO: test this quickly before submitting
        max_feat_model = np.argmax([m.n_feat for m in model.model])
        n_feat = model.model[max_feat_model].n_feat
        feature_names = model.model[max_feat_model].optimal_descriptors
        dknn = get_dknn(train_data, test_data, feature_names)
        results["dknns"].append(dknn)
        if dknn_only:
            continue

        predict_kwargs = {}
        if settings.get("classification"):
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        # predict returns (predictions, stds) when return_unc is honoured,
        # otherwise just the predictions DataFrame
        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results
            stds = None

        targets = test_data.df_targets

        if settings.get("classification"):
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            # one-hot encode targets so roc_auc_score gets per-class columns
            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            # hard labels for the error table (second predict call)
            pred_bool = model.predict(test_data, return_prob=False)
            print(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            # per-target MAE vector
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            # scalar MAE
            score = np.mean(np.abs(errors.values))

        if save_folds:
            # persist the selected features plus per-sample errors per fold
            opt_feat = train_data.optimal_features[:n_feat]
            df_train = train_data.df_featurized
            df_train = df_train[opt_feat]
            df_train.to_csv("folds/train_f{}.csv".format(ind + 1))
            df_test = test_data.df_featurized
            df_test = df_test[opt_feat]
            # NOTE(review): mutates `errors` columns in place before joining
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
            df_test.to_csv("folds/test_f{}.csv".format(ind + 1))

        results["predictions"].append(predictions)
        if stds is not None:
            results["stds"].append(stds)
        results["targets"].append(targets)
        results["errors"].append(errors)
        results["scores"].append(score)
        results['model'].append(model)
    return results
# NOTE(review): fragment of a larger (not visible) benchmark-fold function —
# `materials`, `fast_oxid_featurizer`, `model`, `classification`, `task`,
# and `fold` are bound by the enclosing scope outside this chunk.

# build and featurize the MODData for the held-out materials
test_data = MODData(
    materials=materials.tolist(),
    featurizer=fast_oxid_featurizer,
)
test_data.featurize(n_jobs=32)
# n=-1 keeps all features; reuse the precomputed cross-NMI matrix
test_data.feature_selection(n=-1, use_precomputed_cross_nmi=True)

# predict on test data
predict_kwargs = {}
if classification:
    predict_kwargs["return_prob"] = True
if model.can_return_uncertainty:
    predict_kwargs["return_unc"] = True

# predict returns (df, stds) when uncertainty is available, else just df
pred_results = model.predict(test_data, **predict_kwargs)
if isinstance(pred_results, tuple):
    pred_df, stds = pred_results
else:
    pred_df = pred_results
    stds = None

if classification:
    # first column thresholded to a boolean label
    # NOTE(review): astype(bool) is truthiness of the raw value, not a 0.5
    # probability cutoff — confirm this matches the intended decision rule
    predictions = pred_df.values[:, 0].astype(bool).flatten()
else:
    predictions = pred_df.values.flatten()

# record predictions with the matbench task for this fold
task.record(fold, predictions)