Example #1
def check_influence_of_noise_on_MPCK_means():
    """Compare MPCK-means on random and active constraint sets, with and
    without noisy constraints, and print the collected results."""
    random_name = "random_constraints_200"
    active_name = "active_constraints_200"
    active_noisy = "active_noisy_constraints_200"
    random_noisy = "random_noisy_constraints_200"
    datasets = list(Dataset.datasets())
    # Schedule 10 runs per dataset for each configuration; the last argument
    # is the constraint weight w passed to MPCK-means.
    tests = run_mpck_means_with_constraints("MPCKJava_random_w=100",
                                            random_name, datasets, 10, 100)
    tests.extend(
        run_mpck_means_with_constraints("MPCKJava_active_w=10000", active_name,
                                        datasets, 10, 10000))
    tests.extend(
        run_mpck_means_with_constraints("MPCKJava_noisy_random_w=100",
                                        random_noisy, datasets, 10, 100))
    tests.extend(
        run_mpck_means_with_constraints("MPCKJava_noisy_active_w=100",
                                        active_noisy, datasets, 10, 100))
    execute_list_of_futures_with_dataset_requirements(tests, 100)
    random_results = collect_semi_supervised_results("MPCKJava_random")
    noisy_random_results = collect_semi_supervised_results("MPCKJava_noisy_random")
    # print("RANDOM")
    # print(random_results.sort_index())
    # print("RANDOM W=100")
    # print(collect_semi_supervised_results("MPCKJava_random_w=100").sort_index())
    # print("NOISY RANDOM")
    # print(noisy_random_results.sort_index())
    print("ACTIVE")
    print(collect_semi_supervised_results("MPCKJava_active").sort_index())
    print("ACTIVE w=10000")
    print(
        collect_semi_supervised_results(
            "MPCKJava_active_w=10000").sort_index())
def scatter_datasets(client):
    """Scatter the preprocessed datasets over all the workers.

    Returns a dict mapping dataset names to distributed futures, with each
    dataset broadcast to every worker.
    """
    dataset_dict = {}
    for dataset in Dataset.datasets(preprocessed=True):
        dataset_dict[dataset.name] = dataset
    dataset_dict_futures = client.scatter(dataset_dict, broadcast=True)
    return dataset_dict_futures
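
# Usage sketch for scatter_datasets (an assumption, not part of the original
# module): Client, scatter, and submit are the real dask.distributed API,
# while scatter_and_submit itself and the tuple argument shape are
# hypothetical; run_clustering_task matches the helper used in Example #3
# below. Broadcasting transfers each dataset to every worker once, instead
# of once per task that needs it.
#
#     from dask.distributed import Client
#     client = Client("tcp://scheduler:8786")  # hypothetical address
#
def scatter_and_submit(client, clustering_tasks):
    dataset_futures = scatter_datasets(client)
    # Futures nested inside the argument tuple are resolved on the worker,
    # so each task reads its dataset from worker-local memory.
    return [
        client.submit(run_clustering_task,
                      (task, dataset_futures[task.dataset_name]))
        for task in clustering_tasks
    ]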
Example #3
import itertools
import json
from multiprocessing import Pool, RLock
from pathlib import Path

from tqdm import tqdm

# Project-level names used below (Dataset, CONSTRAINTS_PATH,
# run_clustering_task, the constraint generators) are imported elsewhere
# in the repository.


def run_clustering_tasks_locally(clustering_tasks, nb_of_cores=3):
    """Run the given clustering tasks on this machine, optionally in parallel."""
    dataset_dict = {}
    for dataset in Dataset.datasets(preprocessed=True):
        dataset_dict[dataset.name] = dataset
    # Pair every task with its preprocessed dataset so a worker process
    # receives everything it needs in a single argument.
    tuple_list = []
    for clustering_task in clustering_tasks:
        tuple_list.append(
            (clustering_task, dataset_dict[clustering_task.dataset_name]))
    if nb_of_cores > 1:
        # tqdm needs a shared lock so concurrent processes do not garble
        # each other's progress bars.
        with Pool(nb_of_cores, initializer=tqdm.set_lock,
                  initargs=(RLock(), )) as pool:
            results = pool.imap(run_clustering_task, tuple_list)
            for _ in tqdm(results, total=len(tuple_list)):
                pass
    else:
        for clustering_task, dataset in tqdm(tuple_list):
            clustering_task.run(dataset)
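
# Design note: pool.imap yields results lazily, so wrapping the iterator in
# tqdm advances the progress bar as each task finishes; pool.map would only
# return once the whole batch completed. A call might look like this (task
# construction is project-specific and omitted):
#
#     run_clustering_tasks_locally(tasks, nb_of_cores=4)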
def generate_active_constraints(datasets, nb_of_constraints, number_of_runs):
    generate_constraints_from_generator(
        datasets, generate_correct_min_max_active_constraint_set,
        f"active_constraints_{nb_of_constraints}", nb_of_constraints,
        number_of_runs)


def generate_constraints_from_generator(datasets, generator, constraint_set_name,
                                        number_of_constraints, number_of_runs):
    """Generate `number_of_runs` constraint sets per dataset and save them as JSON.

    Existing constraint files are kept, so interrupted generation can resume.
    """
    path = Path(CONSTRAINTS_PATH) / constraint_set_name
    for dataset, run_index in itertools.product(datasets, range(number_of_runs)):
        file_path = path / dataset.name / f"constraints_{run_index}.json"
        if file_path.exists():
            # Skip runs that were already generated.
            continue
        ml, cl = generator(dataset, number_of_constraints)
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with file_path.open(mode='w') as output_file:
            # Store must-link and cannot-link pairs; the two trailing lists
            # are left empty by this generator.
            json.dump((ml, cl, [], []), output_file)
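
# A reader for the files written above (a sketch; load_constraint_set is a
# hypothetical helper, not part of the original module). The four-element
# layout mirrors the json.dump call: must-link pairs, cannot-link pairs,
# and the two trailing lists that this particular generator leaves empty.
def load_constraint_set(constraint_set_name, dataset_name, run_index):
    file_path = (Path(CONSTRAINTS_PATH) / constraint_set_name / dataset_name
                 / f"constraints_{run_index}.json")
    with file_path.open() as input_file:
        ml, cl, extra_ml, extra_cl = json.load(input_file)
    return ml, cl, extra_ml, extra_cl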

if __name__ == '__main__':
    datasets = list(Dataset.datasets(preprocessed=True))
    nb_of_constraints = 200
    nb_of_runs = 10
    nb_of_noise = 20
    generate_random_constraints(datasets, nb_of_constraints, nb_of_runs)
    print("generating active")
    generate_active_constraints(datasets, nb_of_constraints, nb_of_runs)
    generate_noisy_constraint_from_constraints(
        f"random_constraints_{nb_of_constraints}",
        f"random_noisy_constraints_{nb_of_constraints}", nb_of_noise)
    generate_noisy_constraint_from_constraints(
        f"active_constraints_{nb_of_constraints}",
        f"active_noisy_constraints_{nb_of_constraints}", nb_of_noise)