import itertools
import json
from multiprocessing import Pool, RLock
from pathlib import Path

from tqdm import tqdm

# Project-level helpers (Dataset, CONSTRAINTS_PATH, run_mpck_means_with_constraints,
# execute_list_of_futures_with_dataset_requirements, collect_semi_supervised_results,
# run_clustering_task, generate_random_constraints,
# generate_correct_min_max_active_constraint_set,
# generate_noisy_constraint_from_constraints) are assumed to be imported from
# elsewhere in this repository.


def check_influence_of_noise_on_MPCK_means():
    """Run MPCK-means with clean and noisy constraint sets (random and active)
    and print the collected results for comparison."""
    random_name = "random_constraints_200"
    active_name = "active_constraints_200"
    active_noisy = "active_noisy_constraints_200"
    random_noisy = "random_noisy_constraints_200"
    datasets = list(Dataset.datasets())

    # Each call builds the clustering tasks for one experiment configuration;
    # the last argument is the constraint weight w.
    tests = run_mpck_means_with_constraints("MPCKJava_random_w=100",
                                            random_name, datasets, 10, 100)
    tests.extend(
        run_mpck_means_with_constraints("MPCKJava_active_w=10000",
                                        active_name, datasets, 10, 10000))
    tests.extend(
        run_mpck_means_with_constraints("MPCKJava_noisy_random_w=100",
                                        random_noisy, datasets, 10, 100))
    tests.extend(
        run_mpck_means_with_constraints("MPCKJava_noisy_active_w=100",
                                        active_noisy, datasets, 10, 100))
    execute_list_of_futures_with_dataset_requirements(tests, 100)

    # Renamed from `random`/`noisy_random` to avoid shadowing the stdlib
    # random module; kept for the commented-out comparisons below.
    random_results = collect_semi_supervised_results("MPCKJava_random")
    noisy_random_results = collect_semi_supervised_results("MPCKJava_noisy_random")
    # print("RANDOM")
    # print(random_results.sort_index())
    # print("RANDOM W=100")
    # print(collect_semi_supervised_results("MPCKJava_random_w=100").sort_index())
    # print("NOISY RANDOM")
    # print(noisy_random_results.sort_index())
    print("ACTIVE")
    print(collect_semi_supervised_results("MPCKJava_active").sort_index())
    print("ACTIVE w=10000")
    print(
        collect_semi_supervised_results(
            "MPCKJava_active_w=10000").sort_index())
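
# A minimal sketch of a clean-vs-noisy comparison, assuming
# collect_semi_supervised_results returns a pandas DataFrame indexed by
# dataset name (consistent with the .sort_index() calls above). The
# experiment names match those defined in check_influence_of_noise_on_MPCK_means;
# the function itself is illustrative and not part of this repository.
import pandas as pd


def compare_noise_influence_on_random_constraints():
    clean = collect_semi_supervised_results("MPCKJava_random_w=100")
    noisy = collect_semi_supervised_results("MPCKJava_noisy_random_w=100")
    # Place both result sets side by side under a two-level column index.
    comparison = pd.concat({"clean": clean, "noisy": noisy}, axis=1)
    print(comparison.sort_index())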
def scatter_datasets(client):
    """Scatter the preprocessed datasets over all the workers."""
    dataset_dict = {
        dataset.name: dataset
        for dataset in Dataset.datasets(preprocessed=True)
    }
    # broadcast=True pushes a copy of every dataset to every worker up front.
    dataset_dict_futures = client.scatter(dataset_dict, broadcast=True)
    return dataset_dict_futures
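
# A minimal usage sketch for scatter_datasets, assuming a Dask distributed
# scheduler is reachable at the given address (the address and this helper
# are placeholders, not taken from this repository).
from dask.distributed import Client


def example_scatter_usage(scheduler_address="tcp://127.0.0.1:8786"):
    client = Client(scheduler_address)
    dataset_futures = scatter_datasets(client)
    # Each worker now holds a local copy; submitting tasks that reference
    # dataset_futures[name] avoids re-serialising the dataset per task.
    return client, dataset_futures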
def run_clustering_tasks_locally(clustering_tasks, nb_of_cores=3):
    """Run the given clustering tasks on this machine, in parallel when
    nb_of_cores > 1."""
    dataset_dict = {
        dataset.name: dataset
        for dataset in Dataset.datasets(preprocessed=True)
    }
    tuple_list = [(clustering_task, dataset_dict[clustering_task.dataset_name])
                  for clustering_task in clustering_tasks]
    if nb_of_cores > 1:
        # tqdm.set_lock with a shared RLock keeps the progress bars of the
        # worker processes from garbling each other's output.
        with Pool(nb_of_cores, initializer=tqdm.set_lock,
                  initargs=(RLock(),)) as pool:
            results = pool.imap(run_clustering_task, tuple_list)
            # Draining the iterator drives the pool and updates the progress bar.
            for _ in tqdm(results, total=len(tuple_list)):
                pass
    else:
        for clustering_task, dataset in tqdm(tuple_list):
            clustering_task.run(dataset)
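
# pool.imap requires a picklable, module-level callable. This is a minimal
# sketch of the run_clustering_task helper it expects, inferred from the
# sequential branch above; the actual implementation in this repository may
# differ (hence the _sketch suffix).
def run_clustering_task_sketch(task_and_dataset):
    clustering_task, dataset = task_and_dataset
    clustering_task.run(dataset)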
def generate_active_constraints(datasets, nb_of_constraints, number_of_runs):
    generate_constraints_from_generator(
        datasets, generate_correct_min_max_active_constraint_set,
        f"active_constraints_{nb_of_constraints}", nb_of_constraints,
        number_of_runs)


def generate_constraints_from_generator(datasets, generator,
                                        constraint_set_name,
                                        number_of_constraints, number_of_runs):
    """Generate number_of_runs constraint files per dataset with the given
    generator, skipping files that already exist."""
    path = Path(CONSTRAINTS_PATH) / constraint_set_name
    for dataset, run_index in itertools.product(datasets,
                                                range(number_of_runs)):
        file_path = path / dataset.name / f"constraints_{run_index}.json"
        if file_path.exists():
            continue
        ml, cl = generator(dataset, number_of_constraints)
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with file_path.open(mode='w') as output_file:
            # Stored as (must-link, cannot-link, [], []); the two empty lists
            # appear to be placeholders kept for a uniform file format.
            json.dump((ml, cl, [], []), output_file)


if __name__ == '__main__':
    datasets = list(Dataset.datasets(preprocessed=True))
    nb_of_constraints = 200
    nb_of_runs = 10
    nb_of_noise = 20
    generate_random_constraints(datasets, nb_of_constraints, nb_of_runs)
    print("generating active")
    generate_active_constraints(datasets, nb_of_constraints, nb_of_runs)
    generate_noisy_constraint_from_constraints(
        f"random_constraints_{nb_of_constraints}",
        f"random_noisy_constraints_{nb_of_constraints}", nb_of_noise)
    generate_noisy_constraint_from_constraints(
        f"active_constraints_{nb_of_constraints}",
        f"active_noisy_constraints_{nb_of_constraints}", nb_of_noise)
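
# A minimal sketch of a generator satisfying the (ml, cl) interface expected
# by generate_constraints_from_generator above: it returns a must-link list
# and a cannot-link list of instance-index pairs. The dataset attribute used
# here (dataset.target as the ground-truth labels) is an assumption, not
# taken from this repository.
import random


def generate_random_constraint_set_sketch(dataset, number_of_constraints):
    labels = dataset.target  # assumed attribute holding ground-truth labels
    ml, cl = [], []
    for _ in range(number_of_constraints):
        i, j = random.sample(range(len(labels)), 2)
        # A pair with equal labels becomes a must-link, otherwise a cannot-link.
        (ml if labels[i] == labels[j] else cl).append((i, j))
    return ml, cl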