Пример #1
0
    def test_checkerboard(self):
        """Run two ``fit_grid`` searches and pick the best experiment of each.

        Both searches wrap ``fit_active_learning`` with seed 777 and sweep
        ``base_model_kwargs:alpha`` over ``np.logspace(-5, 5, 10)``. The best
        experiment of each search is selected by ``"mean_mcc_valid"``.

        NOTE(review): the run named ``..._random`` is configured with the
        ``"uncertainty_sampling"`` strategy and receives ``n_folds=4`` where
        the first run receives ``n_jobs=4`` -- confirm whether that is
        intended before relying on the comparison.
        """

        def alpha_grid():
            # Built per call so the two searches never share a mutable object.
            return {"base_model_kwargs:alpha": list(np.logspace(-5, 5, 10))}

        def active_learning_kwargs():
            # Settings forwarded to the wrapped "fit_active_learning" run.
            return {
                "strategy": "uncertainty_sampling",
                "loader_function": "get_splitted_uniform_data",
                "preprocess_fncs": [],
                "protein": "5ht7",
                "fingerprint": "ExtFP",
                "batch_size": 50,
                "base_model": "SGDClassifier",
                "loader_args": {"n_folds": 2, "valid_size": 0.05},
            }

        grid_results_uncert = run_experiment(
            "fit_grid",
            recalculate_experiments=True,
            n_jobs=4,
            experiment_detailed_name="test_fit_grid_checkerboard_uncertanity",
            base_experiment="fit_active_learning",
            seed=777,
            grid_params=alpha_grid(),
            base_experiment_kwargs=active_learning_kwargs())

        grid_results_random = run_experiment(
            "fit_grid",
            n_folds=4,
            experiment_detailed_name="test_fit_grid_checkerboard_random",
            base_experiment="fit_active_learning",
            seed=777,
            grid_params=alpha_grid(),
            base_experiment_kwargs=active_learning_kwargs())

        # Best experiment of each grid according to the validation MCC key.
        random_exp = get_best(grid_results_random.experiments,
                              "mean_mcc_valid")
        uncert_exp = get_best(grid_results_uncert.experiments,
                              "mean_mcc_valid")
Пример #2
0
def experiment_configurator(config, interaction_configuration=False):
    """Run every experiment configuration for every model variant.

    For each model returned by ``get_variants`` a sub-directory of
    ``config.results_dir`` is created. For each variant of that model the
    config attributes are updated, ``seed(config.seed)`` is called and the
    configurations are produced by ``get_interaction_configuration`` (when
    ``interaction_configuration`` is true) or ``get_configurations``.
    Before each configuration ``seed`` is called with its 1-based position,
    and the experiment is skipped when ``config.exp_results_path`` already
    exists. The elapsed wall time is printed at the end.

    Args:
        config: experiment settings object; must expose ``predictors.is_gbm``,
            ``one_hot``, ``results_dir``, ``seed``, ``exp_results_path`` and
            ``_set_attributes``.
        interaction_configuration: selects which configuration builder is
            used for each variant.
    """
    variants_by_model = get_variants(config.predictors.is_gbm, config.one_hot)
    started_at = time()
    make_dirs([config.results_dir])

    for model_name, model_variants in variants_by_model.items():
        print(f'Working on experiment : {model_name}')
        exp_dir = config.results_dir / model_name
        make_dirs([exp_dir])

        for variant in model_variants:
            config._set_attributes(model_name=model_name, variant=variant)
            seed(config.seed)

            build_configurations = (get_interaction_configuration
                                    if interaction_configuration
                                    else get_configurations)
            configurations = build_configurations(config, model_name,
                                                  variant, exp_dir)

            # The 1-based position doubles as the seed of each run, so it
            # advances even for runs that are skipped below.
            for run_number, configuration in enumerate(configurations,
                                                       start=1):
                seed(run_number)
                if not config.exp_results_path.exists():
                    run_experiment(configuration)

    elapsed = time() - started_at
    print(f"run took {elapsed} seconds")
Пример #3
0
    def test_checkerboard(self):
        """Run two ``fit_grid`` searches and pick the best experiment of each.

        Both searches wrap ``fit_active_learning`` with seed 777 and sweep
        ``base_model_kwargs:alpha`` over ``np.logspace(-5, 5, 10)``. The best
        experiment of each search is selected by ``"mean_mcc_valid"``.

        NOTE(review): the run named ``..._random`` is configured with the
        ``"uncertainty_sampling"`` strategy and receives ``n_folds=4`` where
        the first run receives ``n_jobs=4`` -- confirm whether that is
        intended before relying on the comparison.
        """

        def alpha_grid():
            # Built per call so the two searches never share a mutable object.
            return {"base_model_kwargs:alpha": list(np.logspace(-5, 5, 10))}

        def active_learning_kwargs():
            # Settings forwarded to the wrapped "fit_active_learning" run.
            return {
                "strategy": "uncertainty_sampling",
                "loader_function": "get_splitted_uniform_data",
                "preprocess_fncs": [],
                "protein": "5ht7",
                "fingerprint": "ExtFP",
                "batch_size": 50,
                "base_model": "SGDClassifier",
                "loader_args": {"n_folds": 2, "valid_size": 0.05},
            }

        grid_results_uncert = run_experiment(
            "fit_grid",
            recalculate_experiments=True,
            n_jobs=4,
            experiment_detailed_name="test_fit_grid_checkerboard_uncertanity",
            base_experiment="fit_active_learning",
            seed=777,
            grid_params=alpha_grid(),
            base_experiment_kwargs=active_learning_kwargs())

        grid_results_random = run_experiment(
            "fit_grid",
            n_folds=4,
            experiment_detailed_name="test_fit_grid_checkerboard_random",
            base_experiment="fit_active_learning",
            seed=777,
            grid_params=alpha_grid(),
            base_experiment_kwargs=active_learning_kwargs())

        # Best experiment of each grid according to the validation MCC key.
        random_exp = get_best(grid_results_random.experiments,
                              "mean_mcc_valid")
        uncert_exp = get_best(grid_results_uncert.experiments,
                              "mean_mcc_valid")
Пример #4
0
    def test_clusterwise(self):
        """Check the cluster-wise split and an active-learning grid run on it.

        Part one loads the first fold produced by the
        ``get_splitted_data_clusterwise`` loader for the 5ht6/ExtFP pair and
        asserts that the training matrix has 2012 columns. It then compares
        the mean of element ``[1]`` of ``calculate_jaccard_kernel`` across the
        two clusters with the average of the two within-cluster means (the
        ratio must be at least 1.1) and shows a scatter plot of the first two
        PCA components of both clusters.

        Part two runs a ``fit_grid`` search over ``fit_active_learning`` with
        the same loader settings and asserts that the
        ``wac_score_cluster_B_valid`` monitor exists and that its last value,
        averaged over the monitors of the first experiment, exceeds 0.7.
        """
        compound = "5ht6"
        fingerprint = "ExtFP"
        seed = 777

        def make_preprocess_fncs():
            # Fresh object per consumer, as in the original duplicated setup.
            return [["to_binary", {"all_below": True}]]

        def make_loader():
            # [loader function name, loader kwargs]; fresh object per consumer
            # so no callee can see changes made by another one.
            return ["get_splitted_data_clusterwise",
                    {"seed": seed, "valid_size": 0.15, "n_folds": 4}]

        # dict.values() is a non-indexable view on Python 3; list() makes the
        # lookup work on both Python 2 and Python 3.
        all_data = get_data([[compound, fingerprint]], make_loader(),
                            make_preprocess_fncs())
        folds, _, _ = list(all_data.values())[0]

        X = folds[0]["X_train"]
        Y = folds[0]["Y_train"]["data"]

        # Note: this test might fail if the to_binary preprocessing in
        # get_data changes; adjust the expected column count accordingly.
        assert X["data"].shape[1] == 2012

        # Check the "interestingness" index: cross-cluster mean relative to
        # the average within-cluster mean.
        in_a = X["data"][X["cluster_A"]]
        in_b = X["data"][X["cluster_B"]]
        d1 = calculate_jaccard_kernel(in_a, in_a)[1].mean()
        d2 = calculate_jaccard_kernel(in_b, in_b)[1].mean()
        d3 = calculate_jaccard_kernel(in_a, in_b)[1].mean()
        assert d3 / (0.5 * (d1 + d2)) >= 1.1

        # Colour vector: 2 for the leading cluster_A entries, 1 for cluster_B.
        ids = X["cluster_A"] + X["cluster_B"]
        c = Y.copy()[ids]
        c[:] = 1
        c[0:len(X["cluster_A"])] = 2

        X_proj = RandomizedPCA(n_components=3,
                               iterated_power=10).fit_transform(
                                   X["data"].toarray())

        plt.figure(figsize=(30, 30))
        plt.scatter(X_proj[ids, 0], X_proj[ids, 1], c=c, s=250)
        plt.show()

        loader = make_loader()
        twelm_uncertain_1 = run_experiment(
            "fit_grid",
            n_jobs=4,
            experiment_detailed_name="test_fit_TWELM_uncertain_%s_%s" % (compound, fingerprint),
            base_experiment="fit_active_learning",
            seed=seed,
            base_experiment_kwargs={
                "strategy": "uncertainty_sampling",
                "preprocess_fncs": make_preprocess_fncs(),
                "batch_size": 20,
                "protein": compound,
                "fingerprint": fingerprint,
                "warm_start_percentage": 0.03,
                "base_model": "TWELM",
                "loader_function": loader[0],
                "loader_args": loader[1],
                "param_grid": {'h': [100],
                               'C': list(np.logspace(-3, 4, 7))}})

        first_experiment = twelm_uncertain_1.experiments[0]
        assert "wac_score_cluster_B_valid" in first_experiment.monitors[0].keys()

        # The 0.7 threshold is empirical ("rather magic") but looked
        # reasonable to the original author.
        final_scores = np.array([
            m["wac_score_cluster_B_valid"][-1]
            for m in first_experiment.monitors
        ])
        assert final_scores.mean() > 0.7
Пример #5
0
    def test_reproducibility(self):
        """Check that repeated runs with the same settings give equal results.

        Runs the same ``fit_grid`` search twice, refits the best experiment of
        the first search (selected by ``"auc_wac_score_concept"``) on folds
        ``[0, 1]`` and asserts that

        * the best experiment has exactly two monitors,
        * the refit results equal the best experiment's results for every key
          containing neither ``"time"`` nor ``"labeled"``,
        * the first experiment of both grid searches has equal results for
          every key not containing ``"time"``.

        List-valued results are reduced with ``sum`` before comparison.

        NOTE(review): every call passes ``recalculate_experiments=False`` and
        the same experiment name -- presumably later runs may be served from a
        cache; confirm that this still exercises reproducibility.
        """
        compound = "5ht6"
        fingerprint = "ExtFP"
        seed = 777

        def run_grid():
            # All argument objects are rebuilt on every call, exactly like the
            # two literal calls this helper replaces.
            return run_experiment(
                "fit_grid",
                recalculate_experiments=False,
                n_jobs=8,
                experiment_detailed_name="test_fit_TWELM_uncertain_%s_%s" % (compound, fingerprint),
                base_experiment="fit_active_learning",
                seed=seed,
                grid_params={"batch_size": [50, 100]},
                base_experiment_kwargs={
                    "strategy": "uncertainty_sampling",
                    "loader_function": "get_splitted_data",
                    "protein": compound,
                    "fingerprint": fingerprint,
                    "preprocess_fncs": [["to_binary", {"all_below": True}]],
                    "base_model": "TWELM",
                    "loader_args": {"n_folds": 2, "valid_size": 0.05, "percent": 0.15},
                    "param_grid": {'h': [100],
                                   'C': list(np.logspace(-3, 4, 7))}})

        def collapse(value):
            # Lists are compared through their sum, anything else as-is.
            return sum(value) if isinstance(value, list) else value

        twelm_uncertain_1 = run_grid()
        twelm_uncertain_2 = run_grid()

        best_experiment = get_best(twelm_uncertain_1.experiments,
                                   "auc_wac_score_concept")

        # The stored config is updated in place before being replayed.
        best_experiment.config['id_folds'] = [0, 1]
        best_experiment_refit = run_experiment("fit_active_learning",
                                               recalculate_experiments=False,
                                               n_jobs=8,
                                               **best_experiment.config)

        main_logger.info(len(best_experiment.monitors))
        main_logger.info(len(best_experiment_refit.monitors))

        assert len(best_experiment.monitors) == 2

        # Refit versus the best experiment of the first grid search.
        refit_values = []
        best_values = []
        for key in sorted(best_experiment_refit.results):
            if "time" in key or "labeled" in key:
                continue
            refit_raw = best_experiment_refit.results[key]
            best_raw = best_experiment.results[key]
            # Log the raw values before any list is reduced.
            main_logger.info(str(refit_raw) + " " + str(best_raw) + " " + key)
            refit_values.append(collapse(refit_raw))
            best_values.append(collapse(best_raw))

        assert np.array_equal(np.array(refit_values), np.array(best_values))

        # First experiment of grid search one versus grid search two.
        first_results = twelm_uncertain_1.experiments[0].results
        first_values = []
        second_values = []
        for key in first_results:
            if "time" in key:
                continue
            first_values.append(collapse(first_results[key]))
            second_values.append(
                collapse(twelm_uncertain_2.experiments[0].results[key]))

        assert np.array_equal(np.array(first_values), np.array(second_values))
Пример #6
0
from experiments import experiment_runner, fit_active_learning, fit_grid
from experiments.utils import get_best
from models.strategy import cosine_distance_normalized

from kaggle_ninja import *
import kaggle_ninja

# Switch kaggle_ninja into its "force reload all" mode before running.
kaggle_ninja.turn_on_force_reload_all()

# Grid search over a linear SVC's C, wrapping the "fit_active_learning"
# experiment with the "random_query" strategy on a single job.
grid_result_passive = run_experiment(
    "fit_grid",
    recalculate_experiments=True,
    n_jobs=1,
    experiment_detailed_name="fit_svm_passive_tiles",
    base_experiment="fit_active_learning",
    seed=666,
    grid_params={
        "base_model_kwargs:C": list(np.logspace(-5, 5, 10)),
        "base_model_kwargs:kernel": ['linear'],
    },
    base_experiment_kwargs={
        "strategy": "random_query",
        "loader_function": "get_splitted_uniform_data",
        "batch_size": 20,
        "base_model": "SVC",
        "loader_args": {"n_folds": 2},
    })

# grid_result_uncertainty = run_experiment("fit_grid",
#                                     recalculate_experiments=True,
#                                     n_jobs = 8,
#                                     experiment_detailed_name="fit_svm_uncertainty_tiles",
#                                     base_experiment="fit_active_learning",
#                                     seed=666,
#                                     grid_params = {"base_model_kwargs:C": list(np.logspace(-5,5,10)),
#                                                    "base_model_kwargs:kernel": ['linear']},
Пример #7
0
    def test_clusterwise(self):
        """Check the cluster-wise split and an active-learning grid run on it.

        Part one loads the first fold produced by the
        ``get_splitted_data_clusterwise`` loader for the 5ht6/ExtFP pair and
        asserts that the training matrix has 2012 columns. It then compares
        the mean of element ``[1]`` of ``calculate_jaccard_kernel`` across the
        two clusters with the average of the two within-cluster means (the
        ratio must be at least 1.1) and shows a scatter plot of the first two
        PCA components of both clusters.

        Part two runs a ``fit_grid`` search over ``fit_active_learning`` with
        the same loader settings and asserts that the
        ``wac_score_cluster_B_valid`` monitor exists and that its last value,
        averaged over the monitors of the first experiment, exceeds 0.7.
        """
        compound = "5ht6"
        fingerprint = "ExtFP"
        seed = 777

        def make_preprocess_fncs():
            # Fresh object per consumer, as in the original duplicated setup.
            return [["to_binary", {"all_below": True}]]

        def make_loader():
            # [loader function name, loader kwargs]; fresh object per consumer
            # so no callee can see changes made by another one.
            return ["get_splitted_data_clusterwise",
                    {"seed": seed, "valid_size": 0.15, "n_folds": 4}]

        # dict.values() is a non-indexable view on Python 3; list() makes the
        # lookup work on both Python 2 and Python 3.
        all_data = get_data([[compound, fingerprint]], make_loader(),
                            make_preprocess_fncs())
        folds, _, _ = list(all_data.values())[0]

        X = folds[0]["X_train"]
        Y = folds[0]["Y_train"]["data"]

        # Note: this test might fail if the to_binary preprocessing in
        # get_data changes; adjust the expected column count accordingly.
        assert X["data"].shape[1] == 2012

        # Check the "interestingness" index: cross-cluster mean relative to
        # the average within-cluster mean.
        in_a = X["data"][X["cluster_A"]]
        in_b = X["data"][X["cluster_B"]]
        d1 = calculate_jaccard_kernel(in_a, in_a)[1].mean()
        d2 = calculate_jaccard_kernel(in_b, in_b)[1].mean()
        d3 = calculate_jaccard_kernel(in_a, in_b)[1].mean()
        assert d3 / (0.5 * (d1 + d2)) >= 1.1

        # Colour vector: 2 for the leading cluster_A entries, 1 for cluster_B.
        ids = X["cluster_A"] + X["cluster_B"]
        c = Y.copy()[ids]
        c[:] = 1
        c[0:len(X["cluster_A"])] = 2

        X_proj = RandomizedPCA(n_components=3,
                               iterated_power=10).fit_transform(
                                   X["data"].toarray())

        plt.figure(figsize=(30, 30))
        plt.scatter(X_proj[ids, 0], X_proj[ids, 1], c=c, s=250)
        plt.show()

        loader = make_loader()
        twelm_uncertain_1 = run_experiment(
            "fit_grid",
            n_jobs=4,
            experiment_detailed_name="test_fit_TWELM_uncertain_%s_%s" % (compound, fingerprint),
            base_experiment="fit_active_learning",
            seed=seed,
            base_experiment_kwargs={
                "strategy": "uncertainty_sampling",
                "preprocess_fncs": make_preprocess_fncs(),
                "batch_size": 20,
                "protein": compound,
                "fingerprint": fingerprint,
                "warm_start_percentage": 0.03,
                "base_model": "TWELM",
                "loader_function": loader[0],
                "loader_args": loader[1],
                "param_grid": {'h': [100],
                               'C': list(np.logspace(-3, 4, 7))}})

        first_experiment = twelm_uncertain_1.experiments[0]
        assert "wac_score_cluster_B_valid" in first_experiment.monitors[0].keys()

        # The 0.7 threshold is empirical ("rather magic") but looked
        # reasonable to the original author.
        final_scores = np.array([
            m["wac_score_cluster_B_valid"][-1]
            for m in first_experiment.monitors
        ])
        assert final_scores.mean() > 0.7
Пример #8
0
    def test_reproducibility(self):
        """Check that repeated runs with the same settings give equal results.

        Runs the same ``fit_grid`` search twice, refits the best experiment of
        the first search (selected by ``"auc_wac_score_concept"``) on folds
        ``[0, 1]`` and asserts that

        * the best experiment has exactly two monitors,
        * the refit results equal the best experiment's results for every key
          containing neither ``"time"`` nor ``"labeled"``,
        * the first experiment of both grid searches has equal results for
          every key not containing ``"time"``.

        List-valued results are reduced with ``sum`` before comparison.

        NOTE(review): every call passes ``recalculate_experiments=False`` and
        the same experiment name -- presumably later runs may be served from a
        cache; confirm that this still exercises reproducibility.
        """
        compound = "5ht6"
        fingerprint = "ExtFP"
        seed = 777

        def run_grid():
            # All argument objects are rebuilt on every call, exactly like the
            # two literal calls this helper replaces.
            return run_experiment(
                "fit_grid",
                recalculate_experiments=False,
                n_jobs=8,
                experiment_detailed_name="test_fit_TWELM_uncertain_%s_%s" % (compound, fingerprint),
                base_experiment="fit_active_learning",
                seed=seed,
                grid_params={"batch_size": [50, 100]},
                base_experiment_kwargs={
                    "strategy": "uncertainty_sampling",
                    "loader_function": "get_splitted_data",
                    "protein": compound,
                    "fingerprint": fingerprint,
                    "preprocess_fncs": [["to_binary", {"all_below": True}]],
                    "base_model": "TWELM",
                    "loader_args": {"n_folds": 2, "valid_size": 0.05, "percent": 0.15},
                    "param_grid": {'h': [100],
                                   'C': list(np.logspace(-3, 4, 7))}})

        def collapse(value):
            # Lists are compared through their sum, anything else as-is.
            return sum(value) if isinstance(value, list) else value

        twelm_uncertain_1 = run_grid()
        twelm_uncertain_2 = run_grid()

        best_experiment = get_best(twelm_uncertain_1.experiments,
                                   "auc_wac_score_concept")

        # The stored config is updated in place before being replayed.
        best_experiment.config['id_folds'] = [0, 1]
        best_experiment_refit = run_experiment("fit_active_learning",
                                               recalculate_experiments=False,
                                               n_jobs=8,
                                               **best_experiment.config)

        main_logger.info(len(best_experiment.monitors))
        main_logger.info(len(best_experiment_refit.monitors))

        assert len(best_experiment.monitors) == 2

        # Refit versus the best experiment of the first grid search.
        refit_values = []
        best_values = []
        for key in sorted(best_experiment_refit.results):
            if "time" in key or "labeled" in key:
                continue
            refit_raw = best_experiment_refit.results[key]
            best_raw = best_experiment.results[key]
            # Log the raw values before any list is reduced.
            main_logger.info(str(refit_raw) + " " + str(best_raw) + " " + key)
            refit_values.append(collapse(refit_raw))
            best_values.append(collapse(best_raw))

        assert np.array_equal(np.array(refit_values), np.array(best_values))

        # First experiment of grid search one versus grid search two.
        first_results = twelm_uncertain_1.experiments[0].results
        first_values = []
        second_values = []
        for key in first_results:
            if "time" in key:
                continue
            first_values.append(collapse(first_results[key]))
            second_values.append(
                collapse(twelm_uncertain_2.experiments[0].results[key]))

        assert np.array_equal(np.array(first_values), np.array(second_values))
Пример #9
0
from experiments.experiment_runner import run_experiment, run_experiment_grid
from experiments import experiment_runner, fit_active_learning, fit_grid
from sklearn.svm import SVC

from experiments.utils import plot_grid_experiment_results, get_best, plot_monitors

from kaggle_ninja import *
# Switch kaggle_ninja into its "force reload all" mode before running.
turn_on_force_reload_all()

# Grid search over a linear SVC's C, wrapping the "fit_active_learning"
# experiment with the "random_query" strategy on four jobs.
grid_result_passive = run_experiment(
    "fit_grid",
    recalculate_experiments=True,
    n_jobs=4,
    experiment_detailed_name="fit_svm_passive_tiles",
    base_experiment="fit_active_learning",
    seed=666,
    grid_params={
        "base_model_kwargs:C": list(np.logspace(-5, 5, 10)),
        "base_model_kwargs:kernel": ['linear'],
    },
    base_experiment_kwargs={
        "strategy": "random_query",
        "loader_function": "get_splitted_uniform_data",
        "batch_size": 20,
        "base_model": "SVC",
        "loader_args": {"n_folds": 2},
    })

# Best experiment of the grid according to the "mean_mcc_valid" result.
passive_exp = get_best(grid_result_passive.experiments, "mean_mcc_valid")
Пример #10
0
def run(experiment_sub_name, base_batch_size, seed, _log, _config):
    """Run ``random_query_exp`` at two batch sizes and add up the accuracies.

    The experiment is executed once with ``base_batch_size`` and once with
    twice that size, both with the same ``seed``. The returned
    ``ExperimentResults`` carries the sum of the two ``"acc"`` results under
    the ``"acc"`` key, ``ex.name`` as its name and ``_config`` as its config;
    ``misc``, ``monitors`` and ``dumps`` are empty dicts.

    ``experiment_sub_name`` and ``_log`` are accepted but not used here.
    """
    single_batch = run_experiment("random_query_exp",
                                  batch_size=base_batch_size,
                                  seed=seed)
    double_batch = run_experiment("random_query_exp",
                                  batch_size=2*base_batch_size,
                                  seed=seed)
    combined_acc = single_batch.results["acc"] + double_batch.results["acc"]
    return ExperimentResults(name=ex.name,
                             misc={},
                             monitors={},
                             results={"acc": combined_acc},
                             dumps={},
                             config=_config)