예제 #1
0
def test_train_small_bootstrap_single_target_classif(small_moddata,
                                                     tf_session):
    """Tests the single target training."""
    from modnet.models import EnsembleMODNetModel

    data = small_moddata
    # set 'optimal' features manually: keep only the ElementProperty columns
    data.optimal_features = [
        feature for feature in data.df_featurized.columns
        if feature.startswith("ElementProperty")
    ]

    # Derive a binary target from the band gap: label 1 when egap == 0
    # (metal), 0 otherwise.
    data.df_targets["is_metal"] = data.df_targets["egap"].apply(
        lambda egap: 1 if egap == 0 else 0
    )

    model = EnsembleMODNetModel(
        [[["is_metal"]]],
        weights={"is_metal": 1},
        num_neurons=[[16], [8], [8], [4]],
        num_classes={"is_metal": 2},
        n_feat=10,
        n_models=3,
        bootstrap=True,
    )

    model.fit(data, epochs=5)
    # Both prediction paths should run: plain and with uncertainties.
    model.predict(data)
    model.predict(data, return_unc=True)
예제 #2
0
def test_train_small_bootstrap_presets(small_moddata, tf_session):
    """Tests the `fit_preset()` method."""
    from modnet.model_presets import gen_presets
    from modnet.models import EnsembleMODNetModel

    # Keep only the first two generated presets and shrink them for speed.
    modified_presets = gen_presets(100, 100)[:2]
    for preset in modified_presets:
        preset["epochs"] = 2

    data = small_moddata
    # set 'optimal' features manually: keep only the ElementProperty columns
    data.optimal_features = [
        feature for feature in data.df_featurized.columns
        if feature.startswith("ElementProperty")
    ]

    model = EnsembleMODNetModel(
        [[["eform", "egap"]]],
        weights={"eform": 1, "egap": 1},
        num_neurons=[[4], [2], [2], [2]],
        n_feat=3,
        n_models=2,
        bootstrap=True,
    )

    # nested=2 runs a 2-fold inner loop (2 models per preset), while
    # nested=0 skips the inner loop (a single model per preset).
    for expected_inner, nested_option in [(2, 2), (1, 0)]:
        results = model.fit_preset(
            data,
            presets=modified_presets,
            nested=nested_option,
            val_fraction=0.2,
            n_jobs=2,
        )
        models = results[0]
        assert len(models) == len(modified_presets)
        assert len(models[0]) == expected_inner
예제 #3
0
def test_train_small_bootstrap_multi_target(small_moddata, tf_session):
    """Tests the multi-target training."""
    from modnet.models import EnsembleMODNetModel

    data = small_moddata
    # set 'optimal' features manually: keep only the ElementProperty columns
    data.optimal_features = [
        feature for feature in data.df_featurized.columns
        if feature.startswith("ElementProperty")
    ]

    # Two regression targets trained jointly with equal weights.
    model = EnsembleMODNetModel(
        [[["eform", "egap"]]],
        weights={"eform": 1, "egap": 1},
        num_neurons=[[16], [8], [8], [4]],
        n_feat=10,
        n_models=3,
        bootstrap=True,
    )

    model.fit(data, epochs=5)
    model.predict(data, return_unc=True)
예제 #4
0
def run_predict(data, final_model, settings, save_folds=False, dknn_only=False):
    """
    Runs benchmark based on final_model without training everything again.
    It also computes the Knn distance and puts it in the results pickle.
    In fine, this should be integrated inside modnet benchmark.

    :param data: featurized dataset holding ``df_targets`` / ``df_featurized``
        (presumably a MODData — confirm against caller).
    :param final_model: ensemble whose flat ``.model`` list is re-sliced into
        one per-fold EnsembleMODNetModel below.
    :param settings: dict with ``settings["task"]`` naming the task; the
        optional ``"classification"`` flag switches prediction/scoring mode.
    :param save_folds: when True, dump per-fold train/test feature CSVs
        (test CSV includes per-target error columns).
    :param dknn_only: when True, only recompute the dkNN distances and merge
        them into the existing ``results/{task}_results.pkl``.
    :return: dict of per-fold lists: predictions, stds (if available),
        targets, errors, scores, model, dknns.
    """

    task = settings["task"]
    # rebuild the EnsembleMODNetModels from the final model

    n_best_archs = 5 # change this (from 1 to 5 max) to adapt number of inner best archs chosen

    # NOTE(review): the flat final_model.model list is assumed to be laid out
    # as 5 outer folds x 5 inner folds x (archs x bootstrap_size) models —
    # TODO confirm this matches the training script that produced final_model.
    bootstrap_size = 5
    outer_fold_size = bootstrap_size * 5 * 5
    inner_fold_size = bootstrap_size * 5
    models = []

    # True when df_targets has more than one target column.
    multi_target = bool(len(data.df_targets.columns) - 1)


    for i in range(5): # outer fold
        modnet_models = []
        for j in range(5): # inner fold
                # Take the first n_best_archs * bootstrap_size models of this
                # inner fold's slice.
                modnet_models+=(
                    final_model.model[(i * outer_fold_size) + (j * inner_fold_size):
                                      (i * outer_fold_size) + (j * inner_fold_size) + (n_best_archs * bootstrap_size)])
        model = EnsembleMODNetModel(modnet_models=modnet_models)
        models.append(model)

    if dknn_only:
        # Reuse previously saved results; only the dknns entry is rebuilt.
        with open(f"results/{task}_results.pkl", "rb") as f:
            results = pickle.load(f)
            results["dknns"] = []
    else:
        results = defaultdict(list)

    for ind, (train, test) in enumerate(matbench_kfold_splits(data, classification=settings.get("classification", False))):
        train_data, test_data = data.split((train, test))
        # NOTE(review): the split's train half is immediately replaced by the
        # MODData saved on disk for this fold; only test_data from the split
        # is actually used.
        path = "folds/train_moddata_f{}".format(ind + 1)
        train_data = MODData.load(path)
        # Guard against train/test leakage between the loaded fold and split.
        assert len(set(train_data.df_targets.index).intersection(set(test_data.df_targets.index))) == 0
        model = models[ind]

        # compute dkNN

        # TODO: test this quickly before submitting
        # Use the feature set of the ensemble member with the most features.
        max_feat_model = np.argmax([m.n_feat for m in model.model])
        n_feat = model.model[max_feat_model].n_feat
        feature_names = model.model[max_feat_model].optimal_descriptors
        dknn = get_dknn(train_data, test_data, feature_names)
        results["dknns"].append(dknn)
        if dknn_only:
            # Skip predictions/scoring entirely in dkNN-only mode.
            continue

        predict_kwargs = {}
        if settings.get("classification"):
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        # predict() returns (predictions, stds) when uncertainties are
        # requested, otherwise just the predictions frame.
        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results
            stds = None

        targets = test_data.df_targets

        if settings.get("classification"):
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            # Score probabilities with ROC-AUC; errors are computed on the
            # hard class predictions from a second predict() call.
            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            pred_bool = model.predict(test_data, return_prob=False)
            print(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            # One MAE per target column.
            errors = targets - predictions
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            # Single scalar MAE.
            errors = targets - predictions
            score = np.mean(np.abs(errors.values))

        if save_folds:
            # Persist the selected features (and per-target errors for the
            # test split) as CSVs for offline inspection.
            opt_feat = train_data.optimal_features[:n_feat]
            df_train = train_data.df_featurized
            df_train = df_train[opt_feat]
            df_train.to_csv("folds/train_f{}.csv".format(ind + 1))
            df_test = test_data.df_featurized
            df_test = df_test[opt_feat]
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
            df_test.to_csv("folds/test_f{}.csv".format(ind + 1))

        results["predictions"].append(predictions)
        if stds is not None:
            results["stds"].append(stds)
        results["targets"].append(targets)
        results["errors"].append(errors)
        results["scores"].append(score)
        results['model'].append(model)

    return results
예제 #5
0
    if not os.path.isdir(task):
        raise RuntimeError(f"No folder found for {task!r}.")

    os.chdir(task)
    print(f"Running on {n_jobs} jobs")
    settings = load_settings(task)
    settings["task"] = task

    if args.get("predict"):
        if not os.path.isfile(f"final_model/{task}_model"):
            raise RuntimeError("No model found for prediction, please run the benchmark first.")
        else:
            print("Loading data and model...")
            data = load_or_featurize(task)
            final_model = EnsembleMODNetModel.load(f"final_model/{task}_model")
            print("Running predictions...")
            results = run_predict(data, final_model, settings)
            print("Saving results...")
            try:
                save_results(results, task)
            except Exception:
                print_exc()

    if args.get("plot"):
        #make graphs only
        if not os.path.isfile(f"results/{task}_results.pkl"):
            raise RuntimeError("No results file, please run the benchmark before plotting.")
        else:
            print("Loading previous results.")
            with open(f"results/{task}_results.pkl", "rb") as f:
예제 #6
0
            )

        fast_oxid_featurizer = DeBreuck2020Featurizer(fast_oxid=True)
        train_data = MODData(
            materials=materials.tolist(),
            targets=train_df[targets].values,
            target_names=targets,
            featurizer=fast_oxid_featurizer,
        )
        train_data.featurize(n_jobs=32)
        train_data.feature_selection(n=-1, use_precomputed_cross_nmi=True)

        # create model
        targets_hierarchy = [[[field for field in targets]]]
        weights = {field: 1 for field in targets}
        model = EnsembleMODNetModel(targets_hierarchy, weights)

        # fit model

        if USE_GA:
            # you can either use a GA for hyper-parameter optimization or...
            from modnet.hyper_opt import FitGenetic
            ga = FitGenetic(train_data)
            model = ga.run(
                size_pop=20,
                num_generations=10,
                n_jobs=16,
                early_stopping=True,
                refit=True,
            )
        else: