예제 #1
0
def test_train_small_bootstrap_single_target_classif(small_moddata,
                                                     tf_session):
    """Smoke-test a bootstrapped ensemble on one binary classification target.

    A derived ``is_metal`` label (1 when ``egap`` equals 0, else 0) is added
    to the targets, a 3-model bootstrapped ensemble is fitted for 5 epochs,
    and ``predict`` is called with and without uncertainties.  Nothing is
    asserted on the outputs; the test passes as long as no call raises.
    """
    from modnet.models import EnsembleMODNetModel

    moddata = small_moddata

    # Hand-pick the 'optimal' features rather than running feature selection.
    element_property_cols = []
    for name in moddata.df_featurized.columns:
        if name.startswith("ElementProperty"):
            element_property_cols.append(name)
    moddata.optimal_features = element_property_cols

    # Binary label derived from the band gap column.
    moddata.df_targets["is_metal"] = moddata.df_targets["egap"].apply(
        lambda gap: 1 if gap == 0 else 0
    )

    target_hierarchy = [[["is_metal"]]]
    ensemble = EnsembleMODNetModel(
        target_hierarchy,
        weights={"is_metal": 1},
        num_neurons=[[16], [8], [8], [4]],
        num_classes={"is_metal": 2},
        n_feat=10,
        n_models=3,
        bootstrap=True,
    )

    ensemble.fit(moddata, epochs=5)
    ensemble.predict(moddata)
    ensemble.predict(moddata, return_unc=True)
예제 #2
0
def test_train_small_bootstrap_multi_target(small_moddata, tf_session):
    """Smoke-test a bootstrapped ensemble trained on two targets at once.

    A 3-model bootstrapped ensemble is fitted for 5 epochs on ``eform`` and
    ``egap`` with equal weights, then ``predict`` is called with
    uncertainties.  Nothing is asserted on the outputs; the test passes as
    long as no call raises.
    """
    from modnet.models import EnsembleMODNetModel

    moddata = small_moddata

    # Hand-pick the 'optimal' features rather than running feature selection.
    moddata.optimal_features = list(
        filter(
            lambda name: name.startswith("ElementProperty"),
            moddata.df_featurized.columns,
        )
    )

    target_hierarchy = [[["eform", "egap"]]]
    target_weights = {"eform": 1, "egap": 1}
    ensemble = EnsembleMODNetModel(
        target_hierarchy,
        weights=target_weights,
        num_neurons=[[16], [8], [8], [4]],
        n_feat=10,
        n_models=3,
        bootstrap=True,
    )

    ensemble.fit(moddata, epochs=5)
    ensemble.predict(moddata, return_unc=True)
예제 #3
0
def run_predict(data, final_model, settings, save_folds=False, dknn_only=False):
    """Re-run the benchmark from ``final_model`` without retraining anything.

    The fitted sub-models of ``final_model`` are regrouped into one
    ``EnsembleMODNetModel`` per outer fold, and each ensemble is evaluated on
    the test split of its fold.  The kNN distance returned by ``get_dknn``
    for the train/test data of every fold is stored under
    ``results["dknns"]``.  Ultimately this should be integrated into the
    modnet benchmark itself.

    :param data: Full dataset.  Must expose ``df_targets`` and
        ``split((train, test))`` and be accepted by ``matbench_kfold_splits``
        (presumably a ``MODData`` -- confirm against callers).
    :param final_model: Object whose ``model`` attribute is a sliceable
        sequence of fitted models.  NOTE(review): the slicing below assumes
        the sequence is laid out as outer fold -> inner fold -> architecture
        -> bootstrap, with 5 entries at each level -- confirm against the
        script that produced ``final_model``.
    :param settings: Mapping with a ``"task"`` key (used to locate
        ``results/{task}_results.pkl``) and an optional truthy
        ``"classification"`` key.
    :param save_folds: If true, write the selected training features to
        ``folds/train_f{i}.csv`` and the test features joined with the
        prediction errors to ``folds/test_f{i}.csv``.
    :param dknn_only: If true, load the existing results pickle, reset its
        ``"dknns"`` entry and only recompute the kNN distances; predictions
        and scores are left untouched.
    :return: Mapping from result name to a list with one entry per fold.
        ``"dknns"`` is always filled; unless ``dknn_only`` is set,
        ``"predictions"``, ``"targets"``, ``"errors"``, ``"scores"`` and
        ``"model"`` are filled too, plus ``"stds"`` when the ensemble
        returns uncertainties.
    """
    task = settings["task"]
    classification = settings.get("classification", False)

    # Rebuild the EnsembleMODNetModels from the final model.
    # Change n_best_archs (from 1 to 5 max) to adapt the number of best inner
    # architectures kept per inner fold.
    n_best_archs = 5

    bootstrap_size = 5
    outer_fold_size = bootstrap_size * 5 * 5
    inner_fold_size = bootstrap_size * 5
    models = []

    # True unless there is exactly one target column.
    multi_target = len(data.df_targets.columns) != 1

    for i in range(5):  # outer fold
        modnet_models = []
        for j in range(5):  # inner fold
            start = (i * outer_fold_size) + (j * inner_fold_size)
            stop = start + (n_best_archs * bootstrap_size)
            modnet_models += final_model.model[start:stop]
        models.append(EnsembleMODNetModel(modnet_models=modnet_models))

    if dknn_only:
        # NOTE: pickle must only be used on trusted files; this one is
        # expected to have been written by a previous benchmark run.
        with open(f"results/{task}_results.pkl", "rb") as f:
            results = pickle.load(f)
        results["dknns"] = []
    else:
        results = defaultdict(list)

    splits = matbench_kfold_splits(data, classification=classification)
    for ind, (train, test) in enumerate(splits):
        # Only the test split is taken from ``data``; the training data is
        # reloaded from the fold file saved on disk.
        _, test_data = data.split((train, test))
        train_data = MODData.load("folds/train_moddata_f{}".format(ind + 1))
        overlap = set(train_data.df_targets.index).intersection(
            set(test_data.df_targets.index)
        )
        assert len(overlap) == 0, (
            f"fold {ind + 1}: train and test data share {len(overlap)} entries"
        )
        model = models[ind]

        # Compute dkNN using the descriptors of the sub-model with the
        # largest number of features (first one in case of ties).
        # TODO: test this quickly before submitting
        widest_model = max(model.model, key=lambda m: m.n_feat)
        n_feat = widest_model.n_feat
        feature_names = widest_model.optimal_descriptors
        dknn = get_dknn(train_data, test_data, feature_names)
        results["dknns"].append(dknn)
        if dknn_only:
            continue

        predict_kwargs = {}
        if classification:
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results
            stds = None

        targets = test_data.df_targets

        if classification:
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            # Errors are computed from the predictions obtained with
            # return_prob=False, not from the probabilities used for the score.
            pred_bool = model.predict(test_data, return_prob=False)
            print(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            # One mean absolute error per target column.
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values))

        if save_folds:
            opt_feat = train_data.optimal_features[:n_feat]
            train_data.df_featurized[opt_feat].to_csv(
                "folds/train_f{}.csv".format(ind + 1)
            )
            # Join a renamed *copy* of the errors: renaming ``errors`` in
            # place would change the column names stored in
            # ``results["errors"]`` depending on ``save_folds``.
            df_test = test_data.df_featurized[opt_feat].join(
                errors.add_suffix("_error")
            )
            df_test.to_csv("folds/test_f{}.csv".format(ind + 1))

        results["predictions"].append(predictions)
        if stds is not None:
            results["stds"].append(stds)
        results["targets"].append(targets)
        results["errors"].append(errors)
        results["scores"].append(score)
        results["model"].append(model)

    return results
예제 #4
0
        test_data = MODData(
            materials=materials.tolist(),
            featurizer=fast_oxid_featurizer,
        )
        test_data.featurize(n_jobs=32)
        test_data.feature_selection(n=-1, use_precomputed_cross_nmi=True)

        # predict on test data
        predict_kwargs = {}
        if classification:
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        pred_results = model.predict(test_data, **predict_kwargs)

        if isinstance(pred_results, tuple):
            pred_df, stds = pred_results
        else:
            pred_df = pred_results
            stds = None

        if classification:
            predictions = pred_df.values[:, 0].astype(bool).flatten()
        else:
            predictions = pred_df.values.flatten()

        # record predictions
        task.record(fold, predictions)