Example #1
def test_small_moddata_featurization():
    """ This test creates a new MODData from the MP 2018.6 structures. """
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_small.zip")

    # Loading pickles can be dangerous, so let's at least check that the SHA-512
    # matches what it was when the file was created
    assert (get_sha512_of_file(data_file) ==
            "37bd4f8ce6f29c904a13e5670dd53af9a8779094727052ec85ccd6362b1b3765"
            "ac613426331811b3f626242896d87c3f6bc1884cc5545875b5ae66a712f9e218")

    old = MODData.load(data_file)
    structures = old.structures
    targets = old.targets

    names = old.names
    new = MODData(structures, targets, target_names=names)
    new.featurize(fast=False)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    for i in range(len(old_cols)):
        print(new_cols[i], old_cols[i])
        assert new_cols[i] == old_cols[i]

    np.testing.assert_array_equal(old_cols, new_cols)

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
Example #2
def load_or_featurize(task, n_jobs=1):
    data_files = glob.glob("./precomputed/*.gz")
    if len(data_files) == 0:
        data = featurize(task, n_jobs=n_jobs)
    else:
        precomputed_moddata = data_files[0]
        if len(data_files) > 1:
            print(
                f"Found multiple data files {data_files}, loading the first {data_files[0]}"
            )

        data = MODData.load(precomputed_moddata)

    return data
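For reference, a minimal usage sketch of the helper above. The `task` object and the `featurize` function it forwards to are assumed to be defined elsewhere in the same script, so the call below is illustrative only.

# Hypothetical call: reuse a precomputed MODData from ./precomputed/ if one
# exists, otherwise featurize the task from scratch on four processes.
data = load_or_featurize(task, n_jobs=4)
print(data.df_featurized.shape)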
Example #3
def _load_moddata(filename):
    """Loads the pickled MODData from the test directory and checks it's hash."""
    from modnet.preprocessing import MODData

    data_file = Path(__file__).parent.joinpath(f"data/{filename}")
    if filename not in _TEST_DATA_HASHES:
        raise RuntimeError(
            f"Cannot verify hash of {filename} as it was not provided, will not load pickle."
        )
    # Loading pickles can be dangerous, so let's at least check that the hash
    # matches what it was when the file was created
    assert get_hash_of_file(data_file) == _TEST_DATA_HASHES[filename]

    return MODData.load(data_file)
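Neither `get_hash_of_file` nor `_TEST_DATA_HASHES` is shown above; below is a minimal sketch of how such a check could look with `hashlib`. The names, the SHA-512 default and the chunked read are assumptions for illustration, not the project's actual implementation.

import hashlib

# Hypothetical registry of known-good digests for the pickled test files.
_TEST_DATA_HASHES = {
    "MP_2018.6_small.zip": "<full sha512 hex digest recorded at creation time>",
}

def get_hash_of_file(path, algo="sha512"):
    """Return the hex digest of a file, reading it in chunks to bound memory use."""
    digest = hashlib.new(algo)
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()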
Example #4
def test_load_moddata_zip():
    """ This test checks that older MODData objects can still be loaded. """

    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_subset.zip")

    # Loading pickles can be dangerous, so let's at least check that the SHA-512
    # matches what it was when the file was created
    assert (get_sha512_of_file(data_file) ==
            "d7d75e646dbde539645c8c0b065fd82cbe93f81d3500809655bd13d0acf2027c"
            "1786091a73f53985b08868c5be431a3c700f7f1776002df28ebf3a12a79ab1a1")

    data = MODData.load(data_file)
    assert len(data.structures) == 100
    assert len(data.mpids) == 100
    assert len(data.df_structure) == 100
    assert len(data.df_featurized) == 100
    assert len(data.df_targets) == 100
Example #5
def matbench_benchmark(
    data: MODData,
    target: List[str],
    target_weights: Dict[str, float],
    fit_settings: Optional[Dict[str, Any]] = None,
    classification: bool = False,
    model_type: Type[MODNetModel] = MODNetModel,
    save_folds: bool = False,
    save_models: bool = False,
    hp_optimization: bool = True,
    inner_feat_selection: bool = True,
    use_precomputed_cross_nmi: bool = True,
    presets: Optional[List[dict]] = None,
    fast: bool = False,
    n_jobs: Optional[int] = None,
    nested: bool = False,
    **model_init_kwargs,
) -> dict:
    """Train and cross-validate a model against Matbench data splits, optionally
    performing hyperparameter optimisation.

    Arguments:
        data: The entire dataset as a `MODData`.
        target: The list of target names to train on.
        target_weights: The target weights to use for the `MODNetModel`.
        fit_settings: Any settings to pass to `model.fit(...)` directly
            (typically when not performing hyperparameter optimisation).
        classification: Whether all tasks are classification rather than regression.
        model_type: The type of the model to create and benchmark.
        save_folds: Whether to save dataframes with pre-processed fold
            data (e.g. feature selection).
        save_models: Whether to pickle all trained models according to
            their fold index and performance.
        hp_optimization: Whether to perform hyperparameter optimisation.
        inner_feat_selection: Whether to perform split-level feature
            selection or try to use pre-computed values.
        use_precomputed_cross_nmi: Whether to use the precomputed cross NMI
            from the Materials Project dataset, or to recompute it per fold.
        presets: Override the built-in hyperparameter grid with these presets.
        fast: Whether to perform debug training, i.e. reduced presets and epochs.
        n_jobs: Try to parallelize the inner `fit_preset` over this number of
            processes. Maxes out at `number_of_presets * nested_folds`.
        nested: Whether to perform nested CV for hyperparameter optimisation.
        **model_init_kwargs: Additional arguments to pass to the model on creation.

    Returns:
        A dictionary containing all the results from the training, broken
            down by model and by fold.

    """

    if fit_settings is None:
        fit_settings = {}

    if not fit_settings.get("n_feat"):
        nf = len(data.df_featurized.columns)
        fit_settings["n_feat"] = nf
    if not fit_settings.get("num_neurons"):
        # Pass dummy network
        fit_settings["num_neurons"] = [[4], [4], [4], [4]]

    fold_data = []
    results = defaultdict(list)

    for ind, (train, test) in enumerate(matbench_kfold_splits(data)):
        train_data, test_data = data.split((train, test))
        if inner_feat_selection:
            path = "folds/train_moddata_f{}".format(ind + 1)
            if os.path.isfile(path):
                train_data = MODData.load(path)
            else:
                train_data.feature_selection(
                    n=-1, use_precomputed_cross_nmi=use_precomputed_cross_nmi)
            os.makedirs("folds", exist_ok=True)
            train_data.save(path)

        fold_data.append((train_data, test_data))

    args = (target, target_weights, fit_settings)

    model_kwargs = {
        "model_type": model_type,
        "hp_optimization": hp_optimization,
        "fast": fast,
        "classification": classification,
        "save_folds": save_folds,
        "presets": presets,
        "save_models": save_models,
        "nested": nested,
        "n_jobs": n_jobs,
    }

    model_kwargs.update(model_init_kwargs)

    fold_results = []
    for fold in enumerate(fold_data):
        # each fold is passed along as (fold_index, (train_data, test_data))
        fold_results.append(train_fold(fold, *args, **model_kwargs))

    for fold in fold_results:
        for key in fold:
            results[key].append(fold[key])

    return results
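A hedged sketch of how `matbench_benchmark` might be driven; the file name and the target column are placeholders rather than values from the original script.

# Hypothetical driver: load a featurized MODData and benchmark a single
# regression target across the Matbench folds.
from modnet.preprocessing import MODData

data = MODData.load("precomputed/some_task_moddata.gz")  # placeholder path
results = matbench_benchmark(
    data,
    target=["target_name"],                  # placeholder target column
    target_weights={"target_name": 1.0},
    hp_optimization=True,
    n_jobs=4,
)
print(results.keys())  # one list entry per fold for each recorded quantity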
Example #6
def run_predict(data, final_model, settings, save_folds=False, dknn_only=False):
    """
    Runs benchmark based on final_model without training everything again.
    It also computes the Knn distance and puts it in the results pickle.
    In fine, this should be integrated inside modnet benchmark.
    :param data:
    :param final_model:
    :param settings:
    :return:
    """

    task = settings["task"]
    # rebuild the EnsembleMODNetModels from the final model

    n_best_archs = 5  # change this (1 to 5 max) to adjust the number of inner best architectures used

    bootstrap_size = 5
    outer_fold_size = bootstrap_size * 5 * 5
    inner_fold_size = bootstrap_size * 5
    models = []

    multi_target = bool(len(data.df_targets.columns) - 1)

    for i in range(5):  # outer fold
        modnet_models = []
        for j in range(5):  # inner fold
            start = (i * outer_fold_size) + (j * inner_fold_size)
            modnet_models += final_model.model[start:start + (n_best_archs * bootstrap_size)]
        model = EnsembleMODNetModel(modnet_models=modnet_models)
        models.append(model)

    if dknn_only:
        with open(f"results/{task}_results.pkl", "rb") as f:
            results = pickle.load(f)
            results["dknns"] = []
    else:
        results = defaultdict(list)

    for ind, (train, test) in enumerate(matbench_kfold_splits(data, classification=settings.get("classification", False))):
        train_data, test_data = data.split((train, test))
        path = "folds/train_moddata_f{}".format(ind + 1)
        train_data = MODData.load(path)
        assert len(set(train_data.df_targets.index).intersection(set(test_data.df_targets.index))) == 0
        model = models[ind]

        # compute dkNN

        # TODO: test this quickly before submitting
        max_feat_model = np.argmax([m.n_feat for m in model.model])
        n_feat = model.model[max_feat_model].n_feat
        feature_names = model.model[max_feat_model].optimal_descriptors
        dknn = get_dknn(train_data, test_data, feature_names)
        results["dknns"].append(dknn)
        if dknn_only:
            continue

        predict_kwargs = {}
        if settings.get("classification"):
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results
            stds = None

        targets = test_data.df_targets

        if settings.get("classification"):
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            pred_bool = model.predict(test_data, return_prob=False)
            print(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values))

        if save_folds:
            opt_feat = train_data.optimal_features[:n_feat]
            df_train = train_data.df_featurized
            df_train = df_train[opt_feat]
            df_train.to_csv("folds/train_f{}.csv".format(ind + 1))
            df_test = test_data.df_featurized
            df_test = df_test[opt_feat]
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
            df_test.to_csv("folds/test_f{}.csv".format(ind + 1))

        results["predictions"].append(predictions)
        if stds is not None:
            results["stds"].append(stds)
        results["targets"].append(targets)
        results["errors"].append(errors)
        results["scores"].append(score)
        results["model"].append(model)

    return results
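Finally, a usage sketch following the same conventions as the function above; the paths, the task name and the way the ensemble is reloaded are assumptions, so adapt them to however the final model was actually serialised.

# Hypothetical driver: reload the data and the full pre-trained ensemble,
# then re-run the per-fold predictions (and dkNN distances) only.
from modnet.preprocessing import MODData

data = MODData.load("precomputed/some_task_moddata.gz")       # placeholder path
final_model = load_final_ensemble("results/some_task_model")  # assumed helper, not shown here

settings = {"task": "some_task", "classification": False}
results = run_predict(data, final_model, settings, save_folds=True)
print(results["scores"])  # one MAE (or ROC-AUC) per outer fold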