def simple_cobras_tests():
    """Template showing how to define, run, and compare COBRAS experiments locally.

    Copy this function and replace the placeholder names/settings to build
    your own testing scenario.
    """
    collection = TestCollection()
    collection.add_10_times_10_fold_test(
        "<TEST NAME>",
        "COBRAS",
        cobras_algorithm_settings_to_string(0.10, 3, 7, 0.91, 0.91, False, False),
        Dataset.get_dataset_names(),
        "probability_noise_querier",  # name of the querier
        # noise probability 0 --> no noise
        probabilistic_noisy_querier_settings_to_string(0, 200))

    # run the tests locally, spread over the given number of cores
    run_tests_local(collection, nb_of_cores=4)

    # after running several of the above you can compare the results as follows
    comparison_name = "NAME OF THE COMPARISON"
    # these must match the first argument of add_10_times_10_fold_test exactly
    test_names = ["<TEST NAME>", "<OTHER TEST NAME>"]
    # shorter labels shown in the plot legend instead of the test names
    # (test names must be unique and can therefore become very long)
    line_names = ["<simple name for <TEST NAME>>", "<OTHER SIMPLE NAME>"]
    # calculates all the ARIs and compares the tests; this is not possible
    # over SSH
    calculate_aris_and_compare_for_tests(comparison_name,
                                         test_names,
                                         line_names,
                                         query_budget=200,
                                         nb_of_cores=4)
def generate_folds_for_dataset():
    """Generate and store stratified 10x10-fold cross-validation splits.

    For every known dataset this writes one JSON file per (run, fold)
    combination under ``FOLD_PATH/<dataset_name>/run{r}_fold{f}.txt``
    containing the train and test indices. Existing fold files are never
    overwritten, so previously generated folds stay stable.
    """
    dataset_names = Dataset.get_dataset_names() + Dataset.interesting_2d_datasets()
    for dataset_name in dataset_names:
        dataset = Dataset(dataset_name)
        print("making folds for dataset ", dataset_name)
        dataset_dir = os.path.join(FOLD_PATH, dataset_name)
        os.makedirs(dataset_dir, exist_ok=True)
        # labels are invariant over runs and folds; fetch them once per dataset
        labels = dataset.target
        for run_nb in range(10):
            # a fresh shuffled splitter per run gives a different 10-fold split
            skf = StratifiedKFold(n_splits=10, shuffle=True)
            for fold_nb, (train_indices, test_indices) in enumerate(
                    skf.split(np.zeros(len(labels)), labels)):
                fold_path = os.path.join(
                    dataset_dir, "run{}_fold{}.txt".format(run_nb, fold_nb))
                # never clobber existing folds: earlier results may depend on them
                if os.path.isfile(fold_path):
                    print("fold file already exists! not overwriting!")
                    continue
                to_write = {
                    "train_indices": train_indices.tolist(),
                    "test_indices": test_indices.tolist(),
                }
                with open(fold_path, mode='w') as fold_file:
                    json.dump(to_write, fold_file)
def ncobras_noise_comparison_fixed_noise_changing_p_noise():
    """Run NCOBRAS with a fixed 10% querier noise while the algorithm's
    assumed noise probability (p_noise) varies, then compare the results."""
    print("making tests")
    tests = TestCollection()
    # (label, assumed noise probability, keep/reuse threshold) per test;
    # NOTE(review): the 0.05 configuration uses 0.96 thresholds while the
    # others use 0.91 — preserved as-is from the original
    configurations = [("0.05", 0.05, 0.96),
                      ("0.10", 0.10, 0.91),
                      ("0.15", 0.15, 0.91),
                      ("0.20", 0.20, 0.91)]
    for label, p_noise, threshold in configurations:
        tests.add_10_times_10_fold_test(
            "ncobras_0.10_noise_{}_p_noise".format(label),
            "COBRAS",
            cobras_algorithm_settings_to_string(p_noise, 3, 7, threshold,
                                                threshold, True, False),
            Dataset.get_dataset_names(),
            "probability_noise_querier",
            probabilistic_noisy_querier_settings_to_string(0.10, 250))
    run_tests_over_SSH_on_machines(
        tests, generate_computer_info(start_index=21, nb_of_machines=5))

    comparison_name = "ncobras_parameter_sensitivity"
    test_names = ["ncobras_0.10_noise_{}_p_noise".format(label)
                  for label, _, _ in configurations]
    line_names = None
    calculate_aris_and_compare_for_tests(comparison_name, test_names, line_names)
def ncobras_plus_varying_amounts_of_noise():
    """Run NCOBRAS+ against noise-free, 5% and 10% noisy queriers over SSH."""
    print("making tests")
    collection = TestCollection()
    budget = 200
    threshold = 0.95
    for noise_level in [-1, 0.05, 0.10]:
        # -1 encodes a noiseless querier
        if noise_level == -1:
            noise_label = "no"
        else:
            noise_label = str(noise_level)
        # the algorithm still assumes 10% noise when the querier is noiseless
        assumed_noise = noise_level if noise_level > 0 else 0.10
        test_name = "NCOBRASplus_{}_noise_budget{}_pnoise{}_threshold{}".format(
            noise_label, budget, assumed_noise, threshold)
        collection.add_10_times_10_fold_test(
            test_name,
            "COBRAS",
            cobras_algorithm_settings_to_string(assumed_noise, 3, 10, threshold,
                                                threshold, True, False),
            Dataset.get_dataset_names(),
            "probability_noise_querier",
            probabilistic_noisy_querier_settings_to_string(noise_level, budget),
            nb_of_runs=10)
    run_tests_over_SSH_on_machines(collection, MACHINES_TO_USE)
def ncobras_plus_runtime_test():
    """Run a single NCOBRAS+ configuration (5% noise, budget 100) to
    measure runtimes."""
    collection = TestCollection()
    budget = 100
    noise = 0.05
    threshold = 0.95
    test_name = "NCOBRASplus_{}_noise_budget{}_pnoise{}_threshold{}_runtimes".format(
        noise, budget, noise, threshold)
    algorithm_settings = cobras_algorithm_settings_to_string(
        noise,
        min_approx_order=3,
        max_approx_order=10,
        keep_threshold=threshold,
        reuse_threshold=threshold,
        correct_noise=True,
        use_all_cycles=False)
    collection.add_10_times_10_fold_test(
        test_name,
        "COBRAS",
        algorithm_settings,
        Dataset.get_dataset_names(),
        "probability_noise_querier",
        probabilistic_noisy_querier_settings_to_string(noise, budget),
        nb_of_runs=10)
    run_tests_over_SSH_on_machines(collection, MACHINES_TO_USE)
def cobras_VaryingAmountsOfNoise():
    """Run plain COBRAS (no noise correction) against queriers with varying
    noise levels, then compare the resulting ARIs."""
    print("making tests")
    collection = TestCollection()
    budget = 200
    for noise_level in [-1, 0.05, 0.10, 0.20]:
        # -1 encodes a noiseless querier
        label = "no" if noise_level == -1 else str(noise_level)
        collection.add_10_times_10_fold_test(
            "cobras_{}_noise_budget{}".format(label, budget),
            "COBRAS",
            # the algorithm settings stay fixed; only the querier noise varies
            cobras_algorithm_settings_to_string(0.10, 3, 7, 0.91, 0.91, False, False),
            Dataset.get_dataset_names(),
            "probability_noise_querier",
            probabilistic_noisy_querier_settings_to_string(noise_level, budget))
    run_tests_over_SSH_on_machines(collection, MACHINES_TO_USE)

    comparison_name = "cobras_varying_amounts_of_noise"
    test_names = ["cobras_{}_noise_budget200".format(i)
                  for i in ["no", 0.05, 0.10, 0.20]]
    line_names = None
    calculate_aris_and_compare_for_tests(comparison_name, test_names, line_names)
def calculate_all_W_matrices(nb_of_cores=8):
    """Compute the W matrix for every known dataset, parallelized over
    the given number of cores."""
    tasks = [CalculateWMatrix(name) for name in Dataset.get_dataset_names()]
    run_tests_from_generator(tasks, nb_of_cores=nb_of_cores)