Пример #1
0
def save_memmap_data(output_dirpath, dataname, data_size, nb_features, Xy_gen):
    """
    Stream batches of observations (and optional labels) into memory-mapped files.

    Observations are written to ``<dataname>.dat`` (float32, shape
    ``(data_size, nb_features)``) and labels to ``<dataname>.lab`` (shape
    ``(data_size,)``), both located in ``project_dir / output_dirpath``.
    The label file is removed when the last batch yielded by `Xy_gen` has no
    labels, or when `Xy_gen` yields nothing at all.

    :param output_dirpath: Output directory, relative to ``project_dir``.
    :param dataname: Base name (without extension) of the files to create.
    :param data_size: Number of rows allocated in both files.
    :param nb_features: Number of columns of the observation file.
    :param Xy_gen: Iterable of ``(batch_X, batch_y)`` pairs; ``batch_y`` may be None.
    """
    output_path_obs = project_dir / output_dirpath / (dataname + ".dat")
    output_path_labels = project_dir / output_dirpath / (dataname + ".lab")
    fp_obs = np.memmap(output_path_obs,
                       dtype='float32',
                       mode='w+',
                       shape=(data_size, nb_features))
    # NOTE(review): no dtype is given here, so np.memmap uses its default (uint8);
    # labels above 255 presumably cannot be stored faithfully -- confirm against
    # the code reading the .lab files before changing it.
    fp_labels = np.memmap(output_path_labels, mode='w+', shape=(data_size, ))

    logger.info(
        "{} Data will be created in file: {}; labels stored in file: {}".
        format(dataname, output_path_obs, output_path_labels))
    logger.info("About to create {}: Total {} examples.".format(
        dataname, data_size))

    # Defined up front so that an empty generator is treated as "no labels"
    # instead of failing on an unbound name after the loop.
    batch_y = None
    curr_idx = 0
    for batch_X, batch_y in Xy_gen:
        curr_batch_size = batch_X.shape[0]
        fp_obs[curr_idx:curr_idx + curr_batch_size] = batch_X
        if batch_y is not None:
            fp_labels[curr_idx:curr_idx + curr_batch_size] = batch_y
        curr_idx += curr_batch_size

    # Push the written data to disk before returning.
    fp_obs.flush()
    if batch_y is None:
        # Drop the mapping before deleting the file that backs it.
        del fp_labels
        os.remove(str(output_path_labels))
    else:
        fp_labels.flush()
Пример #2
0
def _download_single_dataset(output_dirpath, dataname):
    """
    Create or download the dataset called `dataname` and store it under `output_dirpath`.

    Three cases are handled, in this order:
      * ``blobs_<N>_million``: N million synthetic blob examples are generated
        by batch and written to memory-mapped files;
      * names registered in ``MAP_NAME_DATASET_DD``: the registered function is
        called with the output directory and is in charge of the storage;
      * any other name: the dataset is loaded through ``MAP_NAME_DATASET_RAM``
        and saved with ``np.savez`` (train/test arrays when
        ``MAP_NAME_CLASSES_PRESENCE_RAM`` flags the dataset, ``x_train`` only otherwise).

    :param output_dirpath: Output directory, relative to ``project_dir``.
    :param dataname: Name of the dataset.
    """
    million_match = re.match(r"blobs_(\d+)_million", dataname)
    if million_match:
        size_batch = 10000
        nb_features = 2000
        nb_centers = 1000
        data_size = int(1e6) * int(million_match.group(1))

        blobs_gen = generator_blobs_data(data_size, size_batch, nb_features,
                                         nb_centers)
        save_memmap_data(output_dirpath, dataname, data_size, nb_features,
                         blobs_gen)
        return

    if dataname in MAP_NAME_DATASET_DD:
        MAP_NAME_DATASET_DD[dataname](output_dirpath)
        return

    if MAP_NAME_CLASSES_PRESENCE_RAM[dataname]:
        train_part, test_part = MAP_NAME_DATASET_RAM[dataname]()
        x_train, y_train = train_part
        x_test, y_test = test_part
        map_savez = {
            "x_train": x_train,
            "y_train": y_train,
            "x_test": x_test,
            "y_test": y_test
        }
    else:
        map_savez = {"x_train": MAP_NAME_DATASET_RAM[dataname]()}

    output_path = project_dir / output_dirpath / dataname
    logger.info(f"Save {dataname} to {output_path}")
    np.savez(output_path, **map_savez)
Пример #3
0
def generator_blobs_data(data_size, size_batch, nb_features, nb_centers):
    """
    Yield `data_size` synthetic blob examples by chunks of at most `size_batch`.

    The centers are drawn once, uniformly in [-10, 10], and shared by all the
    chunks. When `data_size` is not a multiple of `size_batch`, a last smaller
    chunk holds the remaining examples so that exactly `data_size` examples
    are produced.

    :param data_size: Total number of examples to generate.
    :param size_batch: Number of examples in each full chunk.
    :param nb_features: Dimension of the examples.
    :param nb_centers: Number of blob centers.
    :return: Generator of ``(X, y)`` pairs as returned by ``make_blobs``.
    """
    nb_full_chunks = int(data_size // size_batch)
    remaining = int(data_size % size_batch)

    chunk_sizes = [size_batch] * nb_full_chunks
    if remaining > 0:
        # Do not silently drop the tail of the dataset.
        chunk_sizes.append(remaining)
    total_nb_chunks = len(chunk_sizes)

    init_centers = np.random.uniform(-10.0, 10.0, (nb_centers, nb_features))
    for i, chunk_size in enumerate(chunk_sizes):
        logger.info("Chunk {}/{}".format(i + 1, total_nb_chunks))
        X, y = make_blobs(chunk_size,
                          n_features=nb_features,
                          centers=init_centers,
                          cluster_std=12.)
        yield X, y
Пример #4
0
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, gamma, landmarks):
    """
    Evaluate the Nystrom approximation built on `landmarks`.

    The kernel reconstruction error is measured on a random sample of
    `x_train` whose size is read from ``paraman["--nystrom"]`` (clipped to the
    size of `x_train`, in which case the parameter is overwritten). When
    `x_test` is given, a linear SVM is also trained on the Nystrom embedding of
    `x_train` and scored on `x_test`.

    :param x_train: Training observations.
    :param y_train: Training labels (only used when `x_test` is not None).
    :param x_test: Test observations, or None to skip the classification.
    :param y_test: Test labels.
    :param gamma: Kernel parameter forwarded to the kernel helpers.
    :param landmarks: Landmark points of the Nystrom approximation.
    :return: ``(relative reconstruction error, classification score or None)``.
        The score is an F1 value when ``paraman["--kddcup04"]`` is set, the
        accuracy otherwise.
    """
    nb_train = x_train.shape[0]

    # the evaluation sample cannot be bigger than the training set
    n_sample = paraman["--nystrom"]
    if n_sample > nb_train:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, nb_train))
        n_sample = nb_train
        paraman["--nystrom"] = n_sample

    sample = x_train[np.random.permutation(nb_train)[:n_sample]]
    samples_norm = None

    # Nystrom embedding of the sample and the kernel matrix it induces
    landmarks_norm = get_squared_froebenius_norm_line_wise(landmarks)[:, np.newaxis]
    metric = prepare_nystrom(landmarks, landmarks_norm, gamma=gamma)
    sample_embedding = nystrom_transformation(sample, landmarks, metric, landmarks_norm, samples_norm, gamma=gamma)
    approx_kernel = sample_embedding @ sample_embedding.T

    # reference kernel matrix computed on the same sample
    real_kernel = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm)
    real_kernel_norm = np.linalg.norm(real_kernel)

    # relative reconstruction error
    reconstruction_error_nystrom = np.linalg.norm(approx_kernel - real_kernel) / real_kernel_norm

    if x_test is None:
        return reconstruction_error_nystrom, None

    # svm + nystrom classification
    logger.info("Start classification")

    train_embedding = nystrom_transformation(x_train, landmarks, metric, landmarks_norm, None, gamma=gamma)
    test_embedding = nystrom_transformation(x_test, landmarks, metric, landmarks_norm, None, gamma=gamma)

    classifier = LinearSVC(class_weight="balanced")
    classifier.fit(train_embedding, y_train)
    predictions = classifier.predict(test_embedding)

    if paraman["--kddcup04"]:
        # NOTE(review): summing predictions to count positives assumes 0/1 labels -- confirm
        real_positive_mask = y_test == 1
        sum_true_positive = np.sum(predictions[real_positive_mask])
        # recall: nb_true_positive / real_nb_positive
        recall = sum_true_positive / np.sum(y_test[real_positive_mask])
        # precision: nb_true_positive / nb_positive
        precision = sum_true_positive / np.sum(predictions[predictions == 1])
        accuracy_nystrom_svm = 2 * precision * recall / (precision + recall)
    else:
        accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

    return reconstruction_error_nystrom, accuracy_nystrom_svm
Пример #5
0
def get_objective_value(X_data, op_centroids, indicator_vector):
    """
    Compute the final objective value, record it in the result printer and return it.

    The computation is done by batch when ``paraman["--minibatch"]`` is set
    (its value is then used as the batch size), in one go otherwise.

    :param X_data: The data matrix.
    :param op_centroids: The centroids.
    :param indicator_vector: Cluster assignment of each example of `X_data`.
    :return: The objective value.
    """
    logger.info("Compute objective")

    batch_size = paraman["--minibatch"]
    if not batch_size:
        objective = compute_objective(X_data, op_centroids, indicator_vector)
    else:
        objective = compute_objective_by_batch(X_data, op_centroids, indicator_vector, batch_size)

    resprinter.add({"final_objective_value": objective})
    return objective
Пример #6
0
def compute_objective_by_batch(X_data, op_centroids, indicator_vector,
                               batch_size):
    """
    Accumulate the objective value over mini-batches of `X_data`.

    :param X_data: The data matrix.
    :param op_centroids: The centroids.
    :param indicator_vector: Cluster assignment of each example of `X_data`.
    :param batch_size: Number of examples per mini-batch.
    :return: Sum of ``compute_objective`` over all the mini-batches.
    """
    nb_minibatches = X_data.shape[0] // batch_size
    batch_indexes_gen = DataGenerator(X_data,
                                      batch_size=batch_size,
                                      return_indexes=True)

    accumulated_objective = 0
    for batch_number, batch_indexes in enumerate(batch_indexes_gen):
        logger.info("Minibatch number {}/{};".format(batch_number,
                                                     nb_minibatches))
        accumulated_objective += compute_objective(
            X_data[batch_indexes], op_centroids,
            indicator_vector[batch_indexes])

    return accumulated_objective
Пример #7
0
def generator_data(data_load_func, size_batch=10000):
    """
    Load a dataset and yield it by consecutive chunks of at most `size_batch` rows.

    The last chunk holds the remaining rows when the dataset size is not a
    multiple of `size_batch`; a dataset smaller than `size_batch` is yielded
    as a single chunk.

    :param data_load_func: Callable without argument returning ``(X, y)``;
        ``y`` may be None when the dataset has no labels.
    :param size_batch: Maximum number of rows per chunk.
    :return: Generator of ``(X_chunk, y_chunk)`` pairs (``y_chunk`` is None
        when there are no labels).
    """
    X, y = data_load_func()
    data_size = X.shape[0]
    total_nb_chunks = int(data_size // size_batch)

    # Iterating on the start offsets covers the full chunks and the remainder
    # alike, and does not rely on a loop variable that is undefined when the
    # dataset holds no full chunk.
    for i_chunk, start in enumerate(range(0, data_size, size_batch)):
        stop = start + size_batch
        if i_chunk < total_nb_chunks:
            # only the full chunks are announced, the remainder is silent
            logger.info("Chunk {}/{}".format(i_chunk + 1, total_nb_chunks))
        if y is None:
            yield X[start:stop], None
        else:
            yield X[start:stop], y[start:stop]
def make_1nn_evaluation(x_train, y_train, x_test, y_test, U_centroids,
                        indicator_vector):
    """
    Compare 1-nearest-neighbor inference restricted to k-means clusters with scikit-learn searches.

    The k-means based search is always evaluated. When ``paraman["kmeans"]`` is
    set, the "brute", "ball_tree" and "kd_tree" scikit-learn searches are
    evaluated too, each one under a SIGALRM timeout of ten times the k-means
    inference time (at least one second). Inference times and accuracies are
    added to ``resprinter``.

    :param x_train: Training observations.
    :param y_train: Training labels.
    :param x_test: Test observations.
    :param y_test: Test labels.
    :param U_centroids: The centroids, one per row.
    :param indicator_vector: Cluster assignment of each example of `x_train`.
    """
    def scikit_evaluation(str_type):
        """Evaluate the scikit-learn 1-nn using the `str_type` search algorithm."""
        clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
        clf.fit(x_train, y_train)

        # the predictions are made one observation at a time on purpose:
        # this is the operation being timed
        start_inference_time = time.time()
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()

        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_{}_inference_time".format(str_type): inference_time,
            "1nn_{}_accuracy".format(str_type): accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    def kmean_tree_evaluation():
        """Evaluate the 1-nn where the search is limited to the cluster of the closest centroid."""
        # one brute force classifier per cluster, built before the timer starts
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]

        start_inference_time = time.time()
        distances = get_distances(x_test, U_centroids)
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1,
                                                                        -1))[0]

        stop_inference_time = time.time()
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    logger.info("1 nearest neighbor with k-means search")
    kmean_tree_time = kmean_tree_evaluation()
    if paraman["kmeans"]:
        # signal.alarm(0) cancels the alarm instead of arming it: a k-means
        # inference shorter than 0.1s would otherwise disable the timeout.
        timeout_seconds = max(1, int(kmean_tree_time * 10))
        # NOTE(review): timeout_signal_handler is presumably raising the
        # TimeoutError caught below -- confirm against its definition.
        signal.signal(signal.SIGALRM, timeout_signal_handler)
        for knn_type in ["brute", "ball_tree", "kd_tree"]:
            signal.alarm(timeout_seconds)
            try:
                logger.info(
                    "1 nearest neighbor with {} search".format(knn_type))
                scikit_evaluation(knn_type)
            except TimeoutError as te:
                logger.warning(
                    "Timeout during execution of 1-nn with {} version: {}".
                    format(knn_type, te))
            finally:
                # always disarm, even when another exception goes through,
                # so that no alarm fires later in unrelated code
                signal.alarm(0)
Пример #9
0
def hierarchical_palm4msa(arr_X_target: np.array,
                          lst_S_init: list,
                          lst_dct_projection_function: list,
                          nb_iter: int,
                          f_lambda_init: float = 1,
                          residual_on_right: bool = True,
                          update_right_to_left=True,
                          track_objective_palm=False,
                          return_objective_function=False,
                          delta_objective_error_threshold_palm=1e-6):
    """
    Hierarchical factorization of `arr_X_target` built on successive calls to palm4msa.

    For each of the `nb_factors - 1` steps, the current residual is first split in two factors (step "split"),
    then all the factors found so far and the new residual are jointly refined against `arr_X_target`
    (step "finetune"). The residual of the last step becomes the last factor (first factor if residual_on_right is False).

    :param arr_X_target: The matrix to factorize.
    :param lst_S_init: The initial factors. The factors are given right to left. In all case.
    :param lst_dct_projection_function: One dict for each of the `nb_factors - 1` steps, with the keys "split" and
    "finetune" giving the projection functions forwarded to palm4msa for the corresponding step.
    :param nb_iter: Number of iterations forwarded to each call of palm4msa.
    :param f_lambda_init: Initial scaling coefficient, used by the split step of the first factor.
    :param residual_on_right: During the split step, the residual can be computed as a right or left factor. If residual_on_right is True,
    the residuals are computed as right factors. We can also see this option as the update way for the hierarchical strategy:
    when the residual is computed on the right, it correspond to compute the last factor first (left to right according to the paper: the factor with the
    bigger number first)
    :param update_right_to_left: Way in which the factors are updated in the inner palm4msa algorithm. If update_right_to_left is True,
    the factors are updated right to left (e.g; the last factor in the list first). Otherwise the contrary.
    Only True is supported: NotImplementedError is raised otherwise.
    :param track_objective_palm: Forwarded to palm4msa as `track_objective`.
    :param return_objective_function: If True, an array of shape (nb_factors, 3) is filled with the objective value
    before the split step, after the split step and after the finetune step of each factor; its last row holds
    the final objective value repeated three times.
    :param delta_objective_error_threshold_palm: Forwarded to palm4msa as `delta_objective_error_threshold`.
    :return: The tuple (f_lambda, op_S_factors, arr_X_curr, lst_objectives, objective_function) where arr_X_curr is
    f_lambda times the product of the factors, lst_objectives holds, for each step, the pair of objectives returned
    by palm4msa for the split and the finetune steps, and objective_function is None if return_objective_function is False.
    """
    if not update_right_to_left:
        raise NotImplementedError  # TODO find out why this crashes... dimension mismatch

    arr_residual = arr_X_target

    # work on a copy of the initial factors: lst_S_init itself is still read below
    op_S_factors = SparseFactors(deepcopy(lst_S_init))
    nb_factors = op_S_factors.n_factors

    # check if lst_dct_param_projection_operator contains a list of dict with param for step split and finetune
    # NOTE(review): the two values formatted in the message below look swapped with respect to its wording
    assert len(
        lst_dct_projection_function
    ) == nb_factors - 1, "Number of factor {} and number of constraints {} are different".format(
        len(lst_dct_projection_function), nb_factors - 1)
    assert all(
        len({"split", "finetune"}.difference(dct.keys())) == 0
        for dct in lst_dct_projection_function)

    f_lambda = f_lambda_init

    if return_objective_function:
        objective_function = np.empty((nb_factors, 3))
    else:
        objective_function = None

    lst_objectives = []

    # main loop
    for k in range(nb_factors - 1):
        lst_objective_split_fine_fac_k = []

        nb_factors_so_far = k + 1

        logger.info("Working on factor: {}".format(k))
        logger.info("Step split")

        ########################## Step split ##########################################################

        if return_objective_function:
            # compute objective before split step
            objective_function[k, 0] = compute_objective_function(
                arr_X_target, f_lambda, op_S_factors)

        # compute the decomposition in 2 factors of the previous residual
        if k == 0:
            f_lambda_init_split = f_lambda_init
        else:
            f_lambda_init_split = 1.

        func_split_step_palm4msa = lambda lst_S_init: palm4msa(
            arr_X_target=arr_residual,
            lst_S_init=lst_S_init,  # eye for factor and zeros for residual
            nb_factors=2,
            lst_projection_functions=lst_dct_projection_function[k]["split"],
            # define constraints: ||0 = d for T1; relaxed constraint on ||0 for T2
            f_lambda_init=f_lambda_init_split,
            nb_iter=nb_iter,
            update_right_to_left=update_right_to_left,
            track_objective=track_objective_palm,
            delta_objective_error_threshold=
            delta_objective_error_threshold_palm)

        if residual_on_right:
            # the initial residual is the product of the initial factors not processed yet
            op_S_factors_init = SparseFactors(lst_S_init[nb_factors_so_far:])
            residual_init = op_S_factors_init.compute_product(
            )  # todo I think this product can be prepared before and save computation
            lst_S_init_split_step = [lst_S_init[k], residual_init]
            f_lambda_prime, S_out, unscaled_residual_reconstruction, objective_palm_split, _ = \
                func_split_step_palm4msa(lst_S_init=lst_S_init_split_step)
            new_factor = S_out.get_factor(0)
            new_residual = S_out.get_factor(1)
            op_S_factors.set_factor(k, new_factor)

        else:
            # mirror case: the residual is the left factor of the split
            op_S_factors_init = SparseFactors(lst_S_init[:-nb_factors_so_far])
            residual_init = op_S_factors_init.compute_product(
            )  # todo I think this product can be prepared before and save computation
            lst_S_init_split_step = [
                residual_init, lst_S_init[-nb_factors_so_far]
            ]
            f_lambda_prime, S_out, unscaled_residual_reconstruction, objective_palm_split, _ = \
                func_split_step_palm4msa(lst_S_init=lst_S_init_split_step)
            new_residual = S_out.get_factor(0)
            new_factor = S_out.get_factor(1)
            op_S_factors.set_factor(nb_factors - nb_factors_so_far, new_factor)

        # the scaling found by the split step replaces (first factor) or multiplies the current one
        if k == 0:
            f_lambda = f_lambda_prime
        else:
            f_lambda *= f_lambda_prime

        lst_objective_split_fine_fac_k.append(objective_palm_split)

        # get the k first elements [:k+1] and the next one (k+1)th as arr_residual (depend on the residual_on_right option)
        logger.info("Step finetuning")

        ########################## Step finetuning ##########################################################

        if return_objective_function:
            # objective after the split step / before the finetune step
            objective_function[k, 1] = compute_objective_function(
                arr_X_target, f_lambda, op_S_factors)

        func_fine_tune_step_palm4msa = lambda lst_S_init: palm4msa(
            arr_X_target=arr_X_target,
            lst_S_init=lst_S_init,
            nb_factors=nb_factors_so_far + 1,
            lst_projection_functions=lst_dct_projection_function[k]["finetune"
                                                                    ],
            f_lambda_init=f_lambda,
            nb_iter=nb_iter,
            update_right_to_left=update_right_to_left,
            track_objective=track_objective_palm,
            delta_objective_error_threshold=
            delta_objective_error_threshold_palm)

        if residual_on_right:
            lst_S_in = op_S_factors.get_list_of_factors()[:nb_factors_so_far]
            f_lambda, lst_S_out, _, objective_palm_fine, _ = \
                func_fine_tune_step_palm4msa(
                    lst_S_init=lst_S_in + [new_residual])
            for i in range(nb_factors_so_far):
                op_S_factors.set_factor(i, lst_S_out.get_factor(i))
            # TODO remove .toarray()?
            arr_residual = lst_S_out.get_factor(nb_factors_so_far).toarray()
        else:
            lst_S_in = op_S_factors.get_list_of_factors()[-nb_factors_so_far:]
            f_lambda, lst_S_out, _, objective_palm_fine, _ = \
                func_fine_tune_step_palm4msa(
                    lst_S_init=[new_residual] + lst_S_in)
            for i in range(nb_factors_so_far):
                op_S_factors.set_factor(-nb_factors_so_far + i,
                                        lst_S_out.get_factor(i + 1))
            # TODO remove .toarray()?
            arr_residual = lst_S_out.get_factor(0).toarray()

        lst_objective_split_fine_fac_k.append(objective_palm_fine)
        lst_objectives.append(tuple(lst_objective_split_fine_fac_k))

        if return_objective_function:
            # objective after the finetune step
            objective_function[k, 2] = compute_objective_function(
                arr_X_target, f_lambda, op_S_factors)

    # last factor is residual of last palm4LED
    if residual_on_right:
        op_S_factors.set_factor(-1, arr_residual)
    else:
        op_S_factors.set_factor(0, arr_residual)

    if return_objective_function:
        # the last row holds the final objective value, repeated in the 3 columns
        objective_function[nb_factors - 1, :] = np.array(
            [compute_objective_function(arr_X_target, f_lambda, op_S_factors)
             ] * 3)

    arr_X_curr = f_lambda * op_S_factors.compute_product()

    return f_lambda, op_S_factors, arr_X_curr, lst_objectives, objective_function
    nystrom_inference_time = nystrom_inference_time_stop - nystrom_inference_time_start

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value -
                                             real_kernel)

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm
    }

    resprinter.add(nystrom_results)


if __name__ == "__main__":
    logger.info("Command line: " + " ".join(sys.argv))
    arguments = docopt.docopt(__doc__)
    paraman = ParameterManager(arguments)
    initialized_results = dict((v, None) for v in lst_results_header)
    resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"])
    resprinter.add(initialized_results)
    resprinter.add(paraman)
    objprinter = ObjectiveFunctionPrinter(
        output_file=paraman["--output-file_objprinter"])
    has_failed = False
    if paraman["--verbose"]:
        daiquiri.setup(level=logging.DEBUG)
    else:
        daiquiri.setup(level=logging.INFO)

    try:
Пример #11
0
# Initialization of the outer factors: random values for the last one, identity for the first one.
lst_factors[-1] = np.random.rand(min(X.shape), X.shape[1])
lst_factors[0] = np.eye(X.shape[0], min(X.shape))
_lambda = 1.

factor = 10
nb_keep_values = factor*d

# One dict of projection functions ("split" and "finetune" steps) for each hierarchical step.
# The constraint of the residual is put last when the residual is on the right, first otherwise.
lst_proj_op_by_fac_step = []
for k in range(nb_factors - 1):
    nb_values_residual = max(nb_keep_values, int(d / 2 ** (k + 1)) * d)
    if residual_on_right:
        lst_proj_split = [get_lambda_proxsplincol(nb_keep_values),
                          get_lambda_proxsplincol(nb_values_residual)]
        lst_proj_finetune = [get_lambda_proxsplincol(nb_keep_values)] * (k+1)
        lst_proj_finetune = lst_proj_finetune + [get_lambda_proxsplincol(nb_values_residual)]
    else:
        lst_proj_split = [get_lambda_proxsplincol(nb_values_residual),
                          get_lambda_proxsplincol(nb_keep_values)]
        lst_proj_finetune = [get_lambda_proxsplincol(nb_values_residual)]
        lst_proj_finetune = lst_proj_finetune + [get_lambda_proxsplincol(nb_keep_values)] * (k+1)

    dct_step_lst_nb_keep_values = {
        "split": lst_proj_split,
        "finetune": lst_proj_finetune
    }
    lst_proj_op_by_fac_step.append(dct_step_lst_nb_keep_values)

logger.info("Sparsity parameter by factor: {}".format(pformat(lst_proj_op_by_fac_step)))

final_lambda, final_factors, final_X, nb_iter_by_factor, _ = hierarchical_palm4msa(
    arr_X_target=X,
    lst_S_init=lst_factors,
    lst_dct_projection_function=lst_proj_op_by_fac_step,
    f_lambda_init=_lambda,
    nb_iter=nb_iter,
    update_right_to_left=True,
    residual_on_right=residual_on_right,
    graphical_display=False)

logger.info("Number of iteration for each factor: {}; Total: {}".format(nb_iter_by_factor, sum(nb_iter_by_factor)))

visual_evaluation_palm4msa(X, lst_factors, final_factors, final_X)
Пример #12
0
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate Nystrom construction time and approximation precision.

    The Nystrom approximation using the centroids as landmarks is compared to the one using as many
    uniformly sampled training examples as landmarks. The approximation error is measured on a random
    subsample of `x_train` whose size is read from ``paraman["--nystrom"]`` (clipped to the size of
    `x_train`, in which case the parameter is overwritten). When `x_test` is given, a linear SVM is
    also trained on the centroid-based embedding. All the measures are added to ``resprinter``;
    nothing is returned.

    :param x_train: Input dataset as ndarray.
    :param y_train: Training labels (only used when `x_test` is not None).
    :param x_test: Test dataset, or None to skip the SVM evaluation.
    :param y_test: Test labels.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object

    :return:
    """
    def prepare_nystrom(landmarks, landmarks_norm):
        """Return the normalization matrix computed from the SVD of the kernel matrix between the landmarks."""
        basis_kernel_W = special_rbf_kernel(landmarks, landmarks, gamma,
                                            landmarks_norm, landmarks_norm)
        U, S, V = np.linalg.svd(basis_kernel_W)
        # lower bound on the singular values before dividing by their square root
        S = np.maximum(S, 1e-12)
        normalization_ = np.dot(U / np.sqrt(S), V)

        return normalization_

    def nystrom_transformation(x_input, landmarks, p_metric, landmarks_norm,
                               x_input_norm):
        """Return the embedding of `x_input`: its kernel values with the landmarks multiplied by `p_metric`."""
        nystrom_embedding = special_rbf_kernel(landmarks, x_input, gamma,
                                               landmarks_norm,
                                               x_input_norm).T @ p_metric
        return nystrom_embedding

    # the evaluation sample cannot be bigger than the training set
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning(
            "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute the heuristic gamma (see compute_euristic_gamma); the two helpers above read it
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage(
        "Memory after euristic gamma computation in make_nystrom_evaluation")
    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)
    # centroids_norm = None

    # random subsample on which the kernel matrices are compared
    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage(
        "Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric,
                                               centroids_norm, samples_norm)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage(
        "Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    # the measured time is divided by the number of samples
    nystrom_inference_time = (nystrom_inference_time_stop -
                              nystrom_inference_time_start) / n_sample

    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    # as many uniformly sampled training examples as there are centroids
    indexes_uniform_samples = np.random.permutation(
        x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = None
    log_memory_usage(
        "Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm)
    log_memory_usage(
        "Memory after SVD computation in uniform part of make_nystrom_evaluation"
    )

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample,
                                                       metric_uniform,
                                                       uniform_sample_norm,
                                                       samples_norm)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm,
                                     samples_norm)
    real_kernel_norm = np.linalg.norm(real_kernel)
    log_memory_usage(
        "Memory after real kernel computation in make_nystrom_evaluation")

    ################################################################

    ####################
    # Error evaluation #
    ####################

    # norm of the difference with the real kernel matrix, relative to the norm of the real kernel matrix
    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value -
                                             real_kernel) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(
        nystrom_approx_kernel_value_uniform - real_kernel) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")

        # the timer covers the two transformations, the training and the scoring
        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(
            x_train, U_centroids, metric, centroids_norm, None)
        x_test_nystrom_embedding = nystrom_transformation(
            x_test, U_centroids, metric, centroids_norm, None)

        linear_svc_clf = LinearSVC()
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        accuracy_nystrom_svm = linear_svc_clf.score(x_test_nystrom_embedding,
                                                    y_test)
        time_classification_stop = time.process_time()

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform":
        sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }

    resprinter.add(nystrom_results)
Пример #13
0
            K_nb_cluster=U_init.shape[0],
            nb_iter=paraman["--nb-iteration"],
            initialization=U_init,
            batch_size=paraman["--minibatch"])
    else:
        objective_values_k, final_centroids, indicator_vector_final = kmeans(
            X_data=X,
            K_nb_cluster=U_init.shape[0],
            nb_iter=paraman["--nb-iteration"],
            initialization=U_init)

    return final_centroids, indicator_vector_final


if __name__ == "__main__":
    logger.info("Command line: " + " ".join(sys.argv))
    log_memory_usage("Memory at startup")

    # command line parameters
    arguments = docopt.docopt(__doc__)
    paraman = ParameterManagerEfficientNystrom(arguments)

    # every expected result starts out empty; the parameters are recorded along with the results
    initialized_results = {header: None for header in lst_results_header}
    resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"])
    resprinter.add(initialized_results)
    resprinter.add(paraman)
    has_failed = False

    # the log level follows the value of the -v option
    verbosity = paraman["-v"]
    if verbosity >= 2:
        log_level = logging.DEBUG
    elif verbosity >= 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    daiquiri.setup(level=log_level)
Пример #14
0
def kmeans(X_data,
           K_nb_cluster,
           nb_iter,
           initialization,
           delta_objective_error_threshold=1e-6,
           proj_l1=False,
           _lambda=None,
           epsilon=None):
    """
    K-means clustering of `X_data` starting from the centroids of `initialization`.

    At least one iteration is performed. The iterations then go on while fewer than `nb_iter` of them
    have been done and the relative change of the objective is greater than `delta_objective_error_threshold`.

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param delta_objective_error_threshold: The iterations stop as soon as the normalized difference between the
    objective values of 2 successive steps is not greater than that value.
    :param proj_l1: If True, each centroid is projected with `proj_onto_l1_ball` at each iteration.
    :param _lambda: Forwarded to `proj_onto_l1_ball`; must be set if proj_l1 is True.
    :param epsilon: Forwarded to `proj_onto_l1_ball` as `epsilon_tol`; must be set if proj_l1 is True.
    :return: The objective value of each performed iteration, the centroids and the indicator vector.
    """
    squared_norms = get_squared_froebenius_norm_line_wise(X_data)

    # the centroids are updated on a copy, the initialization given by the caller is left as is
    centroids = copy.deepcopy(initialization)

    objective_values = np.empty((nb_iter, ))
    relative_delta = np.inf
    i_iter = 0

    while i_iter == 0 or (i_iter < nb_iter
                          and relative_delta > delta_objective_error_threshold):
        logger.info("Iteration Kmeans {}".format(i_iter))

        # assignment step
        indicator_vector, distances = assign_points_to_clusters(
            X_data, centroids, X_norms=squared_norms)

        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # update step
        # NOTE(review): the centroids are presumably modified in place by this call -- confirm
        counts, cluster_names_sorted = update_clusters_with_integrity_check(
            X_data, squared_norms, centroids, K_nb_cluster, counts,
            indicator_vector, distances, cluster_names, cluster_names_sorted)

        if proj_l1:
            if _lambda is None or epsilon is None:
                raise ValueError(
                    "epsilon and lambda must be set if proj_l1 is True")
            for i_centroid, centroid in enumerate(centroids):
                centroids[i_centroid, :] = proj_onto_l1_ball(
                    _lambda=_lambda, epsilon_tol=epsilon, vec=centroid)

        objective_values[i_iter] = compute_objective(X_data, centroids,
                                                     indicator_vector)

        if i_iter >= 1:
            # change of the objective relative to its previous value
            previous_objective = objective_values[i_iter - 1]
            relative_delta = np.abs(objective_values[i_iter] -
                                    previous_objective) / previous_objective

        i_iter += 1

    return objective_values[:i_iter], centroids, indicator_vector
def kmeans_minibatch(X_data, K_nb_cluster, nb_iter, initialization,
                     batch_size):
    """
    Mini-batch K-means: the centroids are refined one batch of examples at a time.

    Batches are requested from ``DataGenerator`` epoch after epoch. The
    algorithm stops as soon as ``nb_iter`` batch updates have been done, or
    when the relative change of the objective between two successive updates
    is not greater than ``1e-6`` anymore.

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration. One iteration is one batch
        update; iterations are counted across epochs.
    :param initialization: The (K, d) matrix of centroids at initialization.
        It is deep-copied, the caller's object is not modified.
    :param batch_size: The size of each batch.
    :return: The tuple (objective_function, U_centroids, indicator_vector):
        one objective value per iteration actually performed, the final matrix
        of centroids, and the cluster assignment of the examples of the last
        processed batch (None if no batch has been processed).
    """

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    # Start from a private copy of the provided centroids
    U_centroids_hat = copy.deepcopy(initialization)
    U_centroids = U_centroids_hat
    # Last known cluster of every example (stays 0 until the example is seen in a batch)
    full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)
    # Number of examples assigned to each cluster since the first batch
    full_count_vector = np.zeros(K_nb_cluster, dtype=int)
    objective_function = np.empty((nb_iter, ))

    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf

    # NOTE(review): only the assignment of the last batch is returned (as before);
    # full_indicator_vector may be what callers actually need - confirm.
    indicator_vector = None

    # i_iter counts batch updates over ALL epochs: it indexes objective_function
    # and is bounded by nb_iter, so it must not be reset when a new epoch starts.
    i_iter = 0
    has_converged = False
    while i_iter < nb_iter and not has_converged:
        epoch_is_empty = True
        for example_batch_indexes in DataGenerator(X_data,
                                                   batch_size=batch_size,
                                                   return_indexes=True):
            epoch_is_empty = False
            example_batch = X_data[example_batch_indexes]

            logger.info("Iteration Kmeans {}".format(i_iter))

            indicator_vector, _ = assign_points_to_clusters(
                example_batch,
                U_centroids,
                X_norms=X_data_norms[example_batch_indexes])
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster, dtype=int)
            count_vector[cluster_names] = counts
            full_count_vector += count_vector

            # Update centroid location using the newly assigned data point classes.
            # This way of updating the centroids (centroid index wise) is better than
            # the one proposed in the paper "Web-Scale K-Means Clustering" as the
            # number of updates will always be <= batch_size.
            # Only the clusters that received points in this batch are touched
            # (their cumulated count is then necessarily non zero).
            # This is exactly equivalent to an update of the running mean:
            # new = (previous_count / full_count) * old + (1 / full_count) * sum(batch points of c)
            for c in cluster_names:
                U_centroids_hat[c] += (1 / full_count_vector[c]) * np.sum(
                    example_batch[indicator_vector == c] -
                    U_centroids_hat[c],
                    axis=0)

            U_centroids = U_centroids_hat

            objective_function[i_iter] = compute_objective(
                X_data, U_centroids, full_indicator_vector)

            if i_iter >= 1:
                # todo: check that the error stays below the threshold several times in a row
                delta_objective_error = np.abs(
                    objective_function[i_iter] - objective_function[i_iter - 1]
                ) / objective_function[i_iter - 1]

            i_iter += 1

            # "not (a > b)" rather than "a <= b" so that a NaN delta also stops the loop.
            # When this is true at least two iterations have been done, so the two
            # last objective values exist.
            if not (delta_objective_error > delta_objective_error_threshold):
                logger.info(
                    "not (delta_objective_error {}-{}={} > delta_objective_error_threshold {})"
                    .format(objective_function[i_iter - 1],
                            objective_function[i_iter - 2],
                            delta_objective_error,
                            delta_objective_error_threshold))
                has_converged = True
                break

            if i_iter >= nb_iter:
                break

        if epoch_is_empty:
            # The generator produced no batch at all: looping again would never end.
            break

    return objective_function[:i_iter], U_centroids, indicator_vector
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate the Nystrom approximation built on the centroids: construction
    time, inference time, kernel reconstruction error and, optionally,
    classification score.

    The kernel reconstruction error is measured on a random subsample of
    ``x_train`` whose size is read from ``paraman["--nystrom"]`` (it is capped
    to the number of training examples, and the capped value is written back
    to ``paraman``). The same error is also computed for a Nystrom
    approximation built on training examples drawn uniformly at random, as
    many as there are centroids.

    Nothing is returned: the results are handed to ``resprinter.add``.

    :param x_train: Input dataset as ndarray.
    :param y_train: Labels of x_train, only used when x_test is not None.
    :param x_test: Test examples; if None the classification part is skipped.
    :param y_test: Labels of x_test, only used when x_test is not None.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object

    :return: None
    """
    nb_train_examples = x_train.shape[0]

    # The subsample can not be larger than the training set
    n_sample = paraman["--nystrom"]
    if n_sample > nb_train_examples:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, nb_train_examples))
        n_sample = nb_train_examples
        paraman["--nystrom"] = n_sample

    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after euristic gamma computation in make_nystrom_evaluation")
    # the norms of the centroids are computed once and reused by every call below
    landmark_norms = get_squared_froebenius_norm_line_wise(U_centroids)[:, np.newaxis]

    sample = x_train[np.random.permutation(nb_train_examples)[:n_sample]]
    sample_norms = None  # no precomputed norms are provided for the subsample
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    # ---- Nystrom with the centroids as landmarks ----
    logger.info("Build Nystrom on centroids")

    # timed section: preparation of the Nystrom approximation
    tic_build = time.process_time()
    metric = prepare_nystrom(U_centroids, landmark_norms, gamma=gamma)
    toc_build = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")

    # timed section: embedding of the subsample and approximated kernel matrix
    tic_inference = time.process_time()
    embedding = nystrom_transformation(sample, U_centroids, metric, landmark_norms, sample_norms, gamma=gamma)
    kernel_from_centroids = embedding @ embedding.T
    toc_inference = time.process_time()
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")

    # ---- Nystrom with uniformly drawn training examples as landmarks ----
    logger.info("Build Nystrom on uniform sampling")

    uniform_sample = x_train[np.random.permutation(nb_train_examples)[:U_centroids.shape[0]]]
    uniform_sample_norm = get_squared_froebenius_norm_line_wise(uniform_sample)[:, np.newaxis]
    log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm, gamma=gamma)
    log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation")

    embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform, uniform_sample_norm,
                                               sample_norms, gamma=gamma)
    kernel_from_uniform = embedding_uniform @ embedding_uniform.T

    # ---- Reference kernel matrix on the subsample ----
    logger.info("Compute real kernel matrix")

    exact_kernel = special_rbf_kernel(sample, sample, gamma, norm_X=sample_norms, norm_Y=sample_norms)
    exact_kernel_norm = np.linalg.norm(exact_kernel)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    def relative_error(approx_kernel):
        # norm of the difference with the reference kernel, relative to the norm of the reference
        return np.linalg.norm(approx_kernel - exact_kernel) / exact_kernel_norm

    error_centroids = relative_error(kernel_from_centroids)
    error_uniform = relative_error(kernel_from_uniform)

    # ---- Optional linear SVM on top of the centroid based embedding ----
    svm_score = None
    svm_elapsed = None
    if x_test is not None:
        logger.info("Start classification")

        tic_classif = time.process_time()
        train_embedding = nystrom_transformation(x_train, U_centroids, metric, landmark_norms, None, gamma=gamma)
        test_embedding = nystrom_transformation(x_test, U_centroids, metric, landmark_norms, None, gamma=gamma)

        classifier = LinearSVC(class_weight="balanced")
        classifier.fit(train_embedding, y_train)
        predictions = classifier.predict(test_embedding)
        toc_classif = time.process_time()

        if paraman["--kddcup04"]:
            # The F1 score is reported in place of the accuracy.
            # NOTE(review): the sums below add label values, which counts examples
            # only if the labels are 0/1 - confirm against the dataset.
            predicted_on_positives = np.sum(predictions[y_test == 1])
            # recall: nb_true_positive / real_nb_positive
            recall = predicted_on_positives / np.sum(y_test[y_test == 1])
            # precision: nb_true_positive / nb_positive
            precision = predicted_on_positives / np.sum(predictions[predictions == 1])
            svm_score = 2 * precision * recall / (precision + recall)
        else:
            svm_score = np.sum(predictions == y_test) / y_test.shape[0]

        svm_elapsed = toc_classif - tic_classif

    resprinter.add({
        "nystrom_build_time": toc_build - tic_build,
        # inference time is reported per example of the subsample
        "nystrom_inference_time": (toc_inference - tic_inference) / n_sample,
        "nystrom_sampled_error_reconstruction": error_centroids,
        "nystrom_sampled_error_reconstruction_uniform": error_uniform,
        "nystrom_svm_accuracy": svm_score,
        "nystrom_svm_time": svm_elapsed
    })
Пример #17
0
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    K-means like clustering in which the matrix of centroids is kept as a
    scaled product of factors, re-estimated at each iteration with palm4msa
    (or its hierarchical version).

    The factors are first initialized by a call to hierarchical_palm4msa on the
    provided initialization. Then each iteration assigns every example to its
    closest centroid, computes the mean of each cluster and fits the factors on
    these means weighted by the square root of the cluster sizes.

    Iterations 0 and 1 are always run; the loop then goes on while
    i_iter < nb_iter and the relative change of the first objective column is
    greater than 1e-6.

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for. Must equal initialization.shape[0].
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition (the factors at index 0, 1 and -1 are set explicitly).
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm. The keys read here are
        "init_lambda", "nb_iter", "lst_constraint_sets" and "residual_on_right".
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used inside the loop.
    :param graphical_display: Tell the algorithm to display the results.
    :return: The tuple (objective_function[:i_iter], U_centroids, indicator_vector). objective_function has two
        columns: column 0 is the objective with the centroids of the iteration and the assignment of the previous
        iteration (never written for iteration 0), column 1 is the objective after re-assignment. U_centroids and
        indicator_vector are recomputed from the final factors.
    :raise ValueError: If, at some iteration, at least one cluster has no point.
    """

    assert K_nb_cluster == initialization.shape[0]

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]

    # private copy: the rows of X_centroids_hat are overwritten by the cluster means in the loop
    X_centroids_hat = copy.deepcopy(initialization)
    min_K_d = min(X_centroids_hat.shape)

    lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)]

    # first factor: identity scaled by 1 / sqrt(K); the scale is given back to lambda below
    eye_norm = np.sqrt(K_nb_cluster)
    lst_factors[0] = np.eye(K_nb_cluster) / eye_norm
    lst_factors[1] = np.eye(K_nb_cluster, min_K_d)
    lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1]))

    if graphical_display:
        # lst_factors_init only exists when graphical_display is set (it is only read in that case)
        lst_factors_init = copy.deepcopy(lst_factors)

    # initial factorization of the provided centroids (always with the hierarchical version)
    _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
        arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
        lst_S_init=lst_factors,
        lst_dct_projection_function=lst_proj_op_by_fac_step,
        f_lambda_init=init_lambda * eye_norm,
        nb_iter=nb_iter_palm,
        update_right_to_left=True,
        residual_on_right=residual_on_right,
        graphical_display=False)

    # undo the scaling that was applied to the initial lambda
    _lambda = _lambda_tmp / eye_norm

    if graphical_display:
        if hierarchical_inside:
            # the three columns of objective_palm are plotted interleaved on a log scale
            plt.figure()
            plt.yscale("log")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                        objective_palm[:, 0],
                        marker="x",
                        label="before split")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                        objective_palm[:, 1],
                        marker="x",
                        label="between")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                        objective_palm[:, 2],
                        marker="x",
                        label="after finetune")
            plt.plot(np.arange(len(objective_palm) * 3),
                     objective_palm.flatten(),
                     color="k")
            plt.legend()
            plt.show()

        visual_evaluation_palm4msa(
            np.eye(K_nb_cluster) @ X_centroids_hat, lst_factors_init,
            lst_factors, _lambda * multi_dot(lst_factors))

    # np.empty: entries that are never written (e.g. [0, 0]) hold arbitrary values
    objective_function = np.empty((nb_iter, 2))

    # Loop for the maximum number of iterations
    # NOTE(review): iterations 0 and 1 always run, whatever nb_iter is; with nb_iter < 2
    # the writes into objective_function would go out of bounds - confirm callers pass nb_iter >= 2.
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while (i_iter <= 1) or (
        (i_iter < nb_iter) and
        (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        # the centroids are the product of all the factors but the first one, scaled by lambda
        U_centroids = _lambda * multi_dot(lst_factors[1:])

        if i_iter > 0:
            # objective of the new centroids with the assignment of the previous iteration
            objective_function[i_iter,
                               0] = compute_objective(X_data, U_centroids,
                                                      indicator_vector)

        # Assign all points to the nearest centroid
        # first get distance from all points to all centroids
        distances = get_distances(X_data,
                                  U_centroids,
                                  precomputed_data_points_norm=X_data_norms)
        # then, Determine class membership of each point
        # by picking the closest centroid
        indicator_vector = np.argmin(distances, axis=1)

        objective_function[i_iter,
                           1] = compute_objective(X_data, U_centroids,
                                                  indicator_vector)

        # Update centroid location using the newly
        # assigned data point classes
        # (this runs before the empty cluster check below, so an empty cluster goes through np.mean first)
        for c in range(K_nb_cluster):
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

        # get the number of observation in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        if len(counts) < K_nb_cluster:
            raise ValueError(
                "Some clusters have no point. Aborting iteration {}".format(
                    i_iter))

        diag_counts_sqrt = np.diag(np.sqrt(
            counts[cluster_names_sorted]))  # todo use sparse matrix object
        diag_counts_sqrt_norm = np.linalg.norm(
            diag_counts_sqrt
        )  # todo analytic sqrt(n) instead of computing it with norm
        diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm
        # set it as first factor
        lst_factors[0] = diag_counts_sqrt_normalized

        if graphical_display:
            lst_factors_init = copy.deepcopy(lst_factors)

        # fit the factors on the cluster means weighted by the square root of the cluster sizes;
        # lambda is multiplied by the norm that was divided out of the first factor
        if hierarchical_inside:
            _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                # f_lambda_init=_lambda,
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                graphical_display=False)

            loss_palm_before = objective_palm[0, 0]
            loss_palm_after = objective_palm[-1, -1]

        else:
            # NOTE(review): nb_iter_palm is rebound to the fifth value returned by palm4msa and is then
            # passed as nb_iter to the next call - confirm this is intended.
            _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1]
                ["finetune"],
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                graphical_display=False)

            loss_palm_before = objective_palm[0, -1]
            loss_palm_after = objective_palm[-1, -1]

        logger.debug("Loss palm before: {}".format(loss_palm_before))
        logger.debug("Loss palm after: {}".format(loss_palm_after))

        if graphical_display:
            if hierarchical_inside:
                plt.figure()
                plt.yscale("log")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                            objective_palm[:, 0],
                            marker="x",
                            label="before split")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                            objective_palm[:, 1],
                            marker="x",
                            label="between")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                            objective_palm[:, 2],
                            marker="x",
                            label="after finetune")
                plt.plot(np.arange(len(objective_palm) * 3),
                         objective_palm.flatten(),
                         color="k")
                plt.legend()
                plt.show()

            visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat,
                                       lst_factors_init, lst_factors,
                                       _lambda_tmp * multi_dot(lst_factors))

        # back to the scale used for the centroids
        _lambda = _lambda_tmp / diag_counts_sqrt_norm

        logger.debug("Returned loss (with diag) palm: {}".format(
            objective_palm[-1, 0]))

        # the relative change is only computed from iteration 2 on because column 0 is not set for iteration 0
        if i_iter >= 2:
            delta_objective_error = np.abs(
                objective_function[i_iter, 0] -
                objective_function[i_iter - 1, 0]
            ) / objective_function[
                i_iter - 1,
                0]  # todo check that the error stays below the threshold several times in a row

        i_iter += 1

    # final centroids and assignment, computed from the factors of the last palm4msa call
    U_centroids = _lambda * multi_dot(lst_factors[1:])
    distances = get_distances(X_data,
                              U_centroids,
                              precomputed_data_points_norm=X_data_norms)
    indicator_vector = np.argmin(distances, axis=1)

    return objective_function[:i_iter], U_centroids, indicator_vector
Пример #18
0
    nb_factors = 5
    sparsity_factor = 2
    nb_iter_palm = 300

    residual_on_right = False

    # lst_constraints, lst_constraints_vals = build_constraint_sets(U_centroids_hat.shape[0], U_centroids_hat.shape[1], nb_factors, sparsity_factor=sparsity_factor)
    K = U_centroids_hat.shape[0]
    d = U_centroids_hat.shape[1]
    lst_constraints, lst_constraints_vals = build_constraint_set_smart(
        K,
        d,
        nb_factors,
        sparsity_factor=sparsity_factor,
        residual_on_right=residual_on_right)
    logger.info("constraints: {}".format(pformat(lst_constraints_vals)))

    hierarchical_palm_init = {
        "init_lambda": 1.,
        "nb_iter": nb_iter_palm,
        "lst_constraint_sets": lst_constraints,
        "residual_on_right": residual_on_right
    }

    # try:
    objective_values_q_hier, centroids_finaux_q_hier, indicator_hier = qmeans(
        X,
        nb_clusters,
        nb_iter_kmeans,
        nb_factors,
        hierarchical_palm_init,
Пример #19
0
    logger.info("Compute objective")
    if paraman["--minibatch"]:
        final_objective_value = compute_objective_by_batch(
            X_data, op_centroids, indicator_vector, paraman["--minibatch"])
    else:
        final_objective_value = compute_objective(X_data, op_centroids,
                                                  indicator_vector)

    resprinter.add({
        "final_objective_value": final_objective_value,
    })
    return final_objective_value


if __name__ == "__main__":
    logger.info("Command line: " + " ".join(sys.argv))
    log_memory_usage("Memory at startup")
    arguments = docopt.docopt(__doc__)
    paraman = ParameterManager(arguments)
    initialized_results = dict((v, None) for v in lst_results_header)
    resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"])
    resprinter.add(initialized_results)
    resprinter.add(paraman)
    objprinter = ObjectiveFunctionPrinter(
        output_file=paraman["--output-file_objprinter"])
    has_failed = False
    if paraman["-v"] >= 2:
        daiquiri.setup(level=logging.DEBUG)
    elif paraman["-v"] >= 1:
        daiquiri.setup(level=logging.INFO)
    else:
            i_iter += 1
        else:
            continue
        break

    return objective_function[:i_iter], U_centroids, indicator_vector


if __name__ == "__main__":
    # Small demonstration: run the mini-batch K-means on synthetic blobs
    # and plot the objective value obtained at each iteration.

    # synthetic data set
    n_samples, n_features, n_centers = 1000, 2, 500

    # settings of the mini-batch K-means
    batch_size, nb_clust, nb_iter = 100, 10, 20

    X, _ = datasets.make_blobs(n_samples=n_samples,
                               n_features=n_features,
                               centers=n_centers)

    # the initial centroids are nb_clust examples picked at random
    centroids_init = X[np.random.permutation(X.shape[0])[:nb_clust]]

    # the iteration budget is expressed in number of batches:
    # (number of full batches in the data) times nb_iter
    actual_nb_iter = nb_iter * (n_samples // batch_size)
    logger.info("Nb iteration: {}".format(actual_nb_iter))

    obj, _, _ = kmeans_minibatch(X,
                                 nb_clust,
                                 actual_nb_iter,
                                 centroids_init,
                                 batch_size)

    plt.plot(obj)
    plt.show()
Пример #21
0
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           delta_objective_error_threshold=1e-6,
           hierarchical_init=False):
    """
    K-means like clustering in which the centroids are kept as a product of
    factors (a SparseFactors object), re-estimated at each iteration with
    palm4msa or its hierarchical version.

    Each iteration: build the centroid operator from the current factors,
    assign the examples to the clusters, update the cluster means (with
    update_clusters_with_integrity_check), then fit the factors on the means
    weighted by the square root of the cluster sizes.

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for. Must equal initialization.shape[0].
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm. The keys read here are
        "init_lambda", "nb_iter", "lst_constraint_sets", "residual_on_right", "delta_objective_error_threshold"
        and "track_objective".
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used.
    :param delta_objective_error_threshold: The loop goes on only while the relative change of the objective between
        two successive iterations is greater than this value.
    :param hierarchical_init: Tells if the algorithm should make the initialization of sparse factors with the hierarchical version of palm or not.
    :return: The tuple (objective_function[:i_iter], op_centroids, indicator_vector, lst_all_objective_functions_palm):
        the objective value of each iteration done, the centroids as a SparseFactors object built from the final
        factors, the cluster assignment computed in the last iteration, and the list of the objectives returned by the
        initial palm call and by the palm call of every iteration.
    """
    assert K_nb_cluster == initialization.shape[0], "The number of cluster {} is not equal to the number of centroids in the initialization {}.".format(K_nb_cluster, initialization.shape[0])

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    nb_examples = X_data.shape[0]

    logger.info("Initializing Qmeans")

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa["delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    # private copy of the initial centroids (passed below to a function that changes it in place)
    X_centroids_hat = copy.deepcopy(initialization)

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1], nb_factors)

    # the initial lambda is multiplied by sqrt(K) for the first factorization and divided back afterwards
    eye_norm = np.sqrt(K_nb_cluster)

    # initial factorization of the provided centroids
    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, U_centroids, objective_palm, array_objective_hierarchical= \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        # NOTE(review): nb_iter_palm is rebound to the fifth value returned by palm4msa and is then
        # passed as nb_iter to the calls made in the loop - confirm this is intended.
        _lambda_tmp, op_factors, U_centroids, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1][
                    "finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

    lst_factors = None  # safe assignment for debug

    _lambda = _lambda_tmp / eye_norm

    # -1 marks the iterations that have not been run
    objective_function = np.ones(nb_iter) * -1
    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    # NOTE(review): if nb_iter is 0 the loop body never runs and indicator_vector is not
    # defined when the return statement is reached.
    i_iter = 0
    delta_objective_error = np.inf
    while ((i_iter < nb_iter) and (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        # the centroid operator is made of all the factors but the first one;
        # lambda is folded into the first factor that is kept
        lst_factors_ = op_factors.get_list_of_factors()
        op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

        ###########################
        # Cluster assignment step #
        ###########################

        indicator_vector, distances = assign_points_to_clusters(X_data, op_centroids, X_norms=X_data_norms)



        #######################
        # Cluster update step #
        #######################

        # get the number of observation in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update centroid location using the newly assigned data point classes
        # (it happens in the update_clusters_with_integrity_check function)
        # and check if all clusters still have points
        # and change the object X_centroids_hat in place if some cluster have lost points (biggest cluster)
        counts, cluster_names_sorted = update_clusters_with_integrity_check(X_data,
                                                                            X_data_norms,
                                                                            X_centroids_hat, # in place changes
                                                                            K_nb_cluster,
                                                                            counts,
                                                                            indicator_vector,
                                                                            distances,
                                                                            cluster_names,
                                                                            cluster_names_sorted)

        #################
        # PALM4MSA step #
        #################

        # create the diagonal of the sqrt of those counts
        # (sparse K x K diagonal matrix of sqrt(count / nb_examples))
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(counts[cluster_names_sorted] / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        # not normalized version, kept as a vector and used to scale the rows of the target
        diag_counts_sqrt = np.sqrt(counts[cluster_names_sorted])

        # set it as first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)


        # the target is the matrix of cluster means, each row scaled by the sqrt of the cluster size;
        # lambda is multiplied by sqrt(nb_examples) for the call and divided back afterwards
        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None,] *  X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)

        else:
            _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(arr_X_target=diag_counts_sqrt[:, None,] *  X_centroids_hat,
                         lst_S_init=op_factors.get_list_of_factors(),
                         nb_factors=op_factors.n_factors,
                         lst_projection_functions=lst_proj_op_by_fac_step[-1][
                             "finetune"],
                         f_lambda_init=_lambda * np.sqrt(nb_examples),
                         nb_iter=nb_iter_palm,
                         update_right_to_left=True,
                         track_objective=track_objective_palm,
                         delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        lst_all_objective_functions_palm.append(objective_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        # the objective is computed with op_centroids as built at the beginning of this iteration
        # (before the palm update above) and with the assignment of this iteration
        objective_function[i_iter] = compute_objective(X_data, op_centroids, indicator_vector)
        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter] - objective_function[i_iter-1]) / objective_function[i_iter-1]

        # todo check that the error stays below the threshold several times in a row

        i_iter += 1

    # rebuild the centroid operator from the factors of the last palm call;
    # the returned indicator_vector is the one computed inside the last iteration
    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, indicator_vector, lst_all_objective_functions_palm
Пример #22
0
def hierarchical_palm4msa(arr_X_target: np.array,
                          lst_S_init: list,
                          lst_dct_projection_function: list,
                          nb_iter: int,
                          f_lambda_init: float = 1,
                          residual_on_right: bool = True,
                          update_right_to_left=True,
                          graphical_display=False):
    """
    Approximate `arr_X_target` by a scaled product of factors, found one factor at a time.

    The function performs `len(lst_S_init) - 1` steps. At each step:

    1. "split": the current residual is decomposed in two (one new factor and a new residual)
       with a 2-factor call to `palm4msa`;
    2. "finetune": the factors found so far plus the new residual are refined together against
       `arr_X_target` with another call to `palm4msa`.

    After the last step, the remaining residual becomes the last missing factor.

    lst_S_init contains factors in decreasing indexes (e.g: the order along which they are multiplied in the product).
    example: S5 S4 S3 S2 S1

    lst S [-j] = Sj

    :param arr_X_target: The target to approximate.
    :param lst_S_init: The initial list of sparse factors. The factors are given right to left. In all case.
        The list is deep-copied; the input is not modified.
    :param lst_dct_projection_function: One dict per step (`len(lst_S_init) - 1` dicts in total). Each dict must
        provide the keys "split" and "finetune"; their values are passed as `lst_projection_functions` to the
        corresponding `palm4msa` call.
    :param nb_iter: The `nb_iter` value forwarded to every inner `palm4msa` call.
    :param f_lambda_init: The initial scaling factor. It is only used for the very first split step;
        later split steps start from 1.
    :param residual_on_right: During the split step, the residual can be computed as a right or left factor. If residual_on_right is True,
        the residuals are computed as right factors. We can also see this option as the update way for the hierarchical strategy:
        when the residual is computed on the right, it correspond to compute the last factor first (left to right according to the paper: the factor with the
        bigger number first)
    :param update_right_to_left: Way in which the factors are updated in the inner palm4msa algorithm. If update_right_to_left is True,
        the factors are updated right to left (e.g; the last factor in the list first). Only True is supported.
    :param graphical_display: Make a graphical representation of results (matplotlib figures at each step).
    :raises NotImplementedError: if `update_right_to_left` is False.
    :return: The tuple `(f_lambda, lst_S, arr_X_curr, lst_nb_iter_by_factor, objective_function)` where
        `f_lambda` is the final scaling factor, `lst_S` the final list of factors,
        `arr_X_curr` is `f_lambda` times the product of `lst_S`,
        `lst_nb_iter_by_factor` holds, for each step, the sum of the fifth values returned by the split and
        finetune `palm4msa` calls, and `objective_function` is an array of shape `(len(lst_S_init), 3)` with, for
        each step, the objective before the split, after the split and after the finetuning; its last row
        repeats the final objective three times.
    """
    if not update_right_to_left:
        # todo: find out why this mode crashes (dimension mismatch)
        raise NotImplementedError

    arr_residual = arr_X_target

    lst_S = deepcopy(lst_S_init)
    nb_factors = len(lst_S)

    # check that lst_dct_projection_function is a list of dicts with entries for both the split and finetune steps
    assert len(lst_dct_projection_function) == nb_factors - 1
    assert all(
        len({"split", "finetune"}.difference(dct.keys())) == 0
        for dct in lst_dct_projection_function)

    lst_nb_iter_by_factor = []

    f_lambda = f_lambda_init  # todo: remove?

    objective_function = np.empty((nb_factors, 3))

    # main loop
    for k in range(nb_factors - 1):
        nb_factors_so_far = k + 1
        # position in lst_S of the factor produced by this step
        idx_new_factor = k if residual_on_right else nb_factors - nb_factors_so_far

        logger.info("Working on factor: {}".format(k))
        logger.info("Step split")

        objective_function[k, 0] = compute_objective_function(
            arr_X_target, f_lambda, lst_S)

        # compute the decomposition in 2 of the previous residual;
        # the caller-provided scaling is only injected in the very first split
        f_lambda_init_split = f_lambda_init if k == 0 else 1.

        # initialization of the split: the initial factor of this step and the product of all remaining ones
        if residual_on_right:
            residual_init = get_side_prod(lst_S_init[nb_factors_so_far:])
            lst_S_init_split_step = [lst_S_init[k], residual_init]
        else:
            residual_init = get_side_prod(lst_S_init[:-nb_factors_so_far])
            lst_S_init_split_step = [
                residual_init, lst_S_init[-nb_factors_so_far]
            ]

        f_lambda_prime, lst_split, _, _, nb_iter_this_factor = palm4msa(
            arr_X_target=arr_residual,
            lst_S_init=lst_S_init_split_step,
            nb_factors=2,
            lst_projection_functions=lst_dct_projection_function[k]["split"],
            f_lambda_init=f_lambda_init_split,
            nb_iter=nb_iter,
            update_right_to_left=update_right_to_left,
            graphical_display=graphical_display)

        if residual_on_right:
            new_factor, new_residual = lst_split
        else:
            new_residual, new_factor = lst_split

        # the global scaling is the product of the scalings found by the successive splits
        if k == 0:
            f_lambda = f_lambda_prime
        else:
            f_lambda *= f_lambda_prime

        lst_S[idx_new_factor] = new_factor

        if graphical_display:
            plt.figure()
            plt.subplot(221)
            plt.title('Input residual Iteration {}, etape split'.format(k))
            plt.imshow(arr_residual)
            plt.colorbar()

            plt.subplot(222)
            if residual_on_right:
                plt.imshow(f_lambda_prime * (new_factor @ new_residual))
                plt.title('lambda * new_factor @ new_residual')
            else:
                plt.imshow(f_lambda_prime * (new_residual @ new_factor))
                plt.title('lambda * new_residual @ new_factor')
            plt.colorbar()

            plt.subplot(223)
            plt.imshow(f_lambda_prime * new_factor)
            plt.colorbar()
            plt.title('lambda*new_factor')

            plt.subplot(224)
            plt.imshow(new_residual)
            plt.colorbar()
            plt.title('new_residual')

            plt.show()

        # finetune the factors found so far together with the new residual
        # (which side the residual sits on depends on the residual_on_right option)
        logger.info("Step finetuning")

        objective_function[k, 1] = compute_objective_function(
            arr_X_target, f_lambda, lst_S)

        if residual_on_right:
            lst_S_init_finetune = lst_S[:nb_factors_so_far] + [new_residual]
        else:
            lst_S_init_finetune = [new_residual] + lst_S[-nb_factors_so_far:]

        f_lambda, lst_finetuned, _, _, nb_iter_this_factor_bis = palm4msa(
            arr_X_target=arr_X_target,
            lst_S_init=lst_S_init_finetune,
            nb_factors=nb_factors_so_far + 1,
            lst_projection_functions=lst_dct_projection_function[k]["finetune"],
            f_lambda_init=f_lambda,
            nb_iter=nb_iter,
            update_right_to_left=update_right_to_left,
            graphical_display=graphical_display)

        # write the finetuned factors back in place; the remaining element is the new residual
        if residual_on_right:
            *lst_S[:nb_factors_so_far], arr_residual = lst_finetuned
        else:
            arr_residual, *lst_S[-nb_factors_so_far:] = lst_finetuned

        lst_nb_iter_by_factor.append(nb_iter_this_factor +
                                     nb_iter_this_factor_bis)

        objective_function[k, 2] = compute_objective_function(
            arr_X_target, f_lambda, lst_S)

        if graphical_display:
            # rebuild the product with the residual on the side it actually lives on
            if residual_on_right:
                lst_reconstruction = lst_S[:nb_factors_so_far] + [arr_residual]
                residual_title = 'residual (right factor)'
            else:
                lst_reconstruction = [arr_residual] + lst_S[-nb_factors_so_far:]
                residual_title = 'residual (left factor)'

            plt.figure()
            plt.subplot(221)
            plt.title('Residual Iteration {}, step fine tune '.format(k))
            plt.imshow(arr_residual)
            plt.colorbar()

            plt.subplot(222)
            plt.imshow(f_lambda * get_side_prod(lst_reconstruction))
            plt.colorbar()
            plt.title('reconstructed')

            plt.subplot(223)
            plt.imshow(lst_S[idx_new_factor])
            plt.colorbar()
            plt.title('current factor')

            plt.subplot(224)
            plt.imshow(arr_residual)
            plt.colorbar()
            plt.title(residual_title)

            plt.show()

    # the last factor is the residual of the last palm4msa call
    if residual_on_right:
        lst_S[-1] = arr_residual
    else:
        lst_S[0] = arr_residual

    objective_function[nb_factors - 1, :] = np.array(
        [compute_objective_function(arr_X_target, f_lambda, lst_S)] * 3)

    # multi_dot needs at least two operands
    if len(lst_S) == 1:
        arr_X_curr = f_lambda * lst_S[0]
    else:
        arr_X_curr = f_lambda * multi_dot(lst_S)

    return f_lambda, lst_S, arr_X_curr, lst_nb_iter_by_factor, objective_function
Пример #23
0
                     graphical_display=False,
                     track_objective=False)

    log_memory_usage(
        "Memory after palm on top of kmeans in process_palm_on_top_of_kmeans")

    _lambda = _lambda_tmp / eye_norm
    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] +
                                 lst_factors_[2:])

    return op_centroids


if __name__ == "__main__":
    # Trace how the script was invoked and the memory footprint before any work is done.
    logger.info("Command line: " + " ".join(sys.argv))
    log_memory_usage("Memory at startup")

    # Command line parsing: the module docstring is the docopt usage description.
    arguments = docopt.docopt(__doc__)
    paraman = ParameterManager(arguments)

    # Every expected result column starts at None, then the run parameters are recorded.
    initialized_results = dict.fromkeys(lst_results_header)
    resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"])
    resprinter.add(initialized_results)
    resprinter.add(paraman)

    objprinter = ObjectiveFunctionPrinter(
        output_file=paraman["--output-file_objprinter"])
    has_failed = False

    # Logging verbosity is driven by the --verbose flag.
    daiquiri.setup(
        level=logging.DEBUG if paraman["--verbose"] else logging.INFO)
Пример #24
0
                               n_features=n_features,
                               centers=n_centers)

    U_centroids_hat = X[np.random.permutation(X.shape[0])[:nb_clusters]]
    # kmeans++ initialization is not feasible because complexity is O(ndk)...
    residual_on_right = True

    sparsity_factor = 2
    nb_iter_palm = 30
    delta_objective_error_threshold_in_palm = 1e-6
    track_objective_in_palm = True

    lst_constraints, lst_constraints_vals = build_constraint_set_smart(
        U_centroids_hat.shape[0], U_centroids_hat.shape[1], nb_factors,
        sparsity_factor=sparsity_factor, residual_on_right=residual_on_right)
    logger.info("constraints: {}".format(pformat(lst_constraints_vals)))


    hierarchical_palm_init = {
        "init_lambda": 1.,
        "nb_iter": nb_iter_palm,
        "lst_constraint_sets": lst_constraints,
        "residual_on_right": residual_on_right,
        "delta_objective_error_threshold": delta_objective_error_threshold_in_palm,
        "track_objective": track_objective_in_palm
    }

    logger.info('Running QuicK-means with H-Palm')
    objective_function_with_hier_palm, op_centroids_hier, indicator_hier, lst_objective_function_hier_palm = \
        qmeans(X,
               nb_clusters,
Пример #25
0
def kmeans_minibatch(X_data,
                     K_nb_cluster,
                     nb_iter,
                     initialization,
                     batch_size,
                     delta_objective_error_threshold=1e-6,
                     proj_l1=False,
                     _lambda=None,
                     epsilon=None):
    """
    Run K-means where each iteration processes the data one minibatch at a time.

    At each iteration, every minibatch is assigned to the centroids of the previous iteration and
    the new centroids are built up batch after batch by `update_clusters`.

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param initialization: The (K, d) matrix of centroids at initialization. It is deep-copied, not modified.
    :param batch_size: The size of each batch.
    :param delta_objective_error_threshold: The iterations stop as soon as the relative difference between the
        objective at 2 successive iterations is not strictly greater than that value.
    :param proj_l1: If True, each centroid is replaced by the result of `proj_onto_l1_ball` at the end of
        every iteration.
    :param _lambda: The `_lambda` argument of `proj_onto_l1_ball`. Mandatory if `proj_l1` is True.
    :param epsilon: The `epsilon_tol` argument of `proj_onto_l1_ball`. Mandatory if `proj_l1` is True.
    :raises ValueError: if `proj_l1` is True and `_lambda` or `epsilon` is None.
    :return: The tuple `(objective_function, U_centroids, full_indicator_vector)`: the objective value of each
        iteration actually run, the final centroids and the cluster index assigned to each example during
        the last iteration.
    """
    # Fail fast: do not go through a whole pass over the data before rejecting inconsistent arguments.
    if proj_l1 and (_lambda is None or epsilon is None):
        raise ValueError(
            "epsilon and lambda must be set if proj_l1 is True")

    logger.debug("Compute squared froebenius norm of data")
    X_data_norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X_data, batch_size)

    # The centroids start from a private copy of the provided initialization
    U_centroids = copy.deepcopy(initialization)
    objective_function = np.empty((nb_iter, ))

    # only used for the progress log messages
    total_nb_of_minibatch = X_data.shape[0] // batch_size

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error = np.inf
    while i_iter < nb_iter and (delta_objective_error >
                                delta_objective_error_threshold):
        logger.info("Iteration number {}/{}".format(i_iter, nb_iter))
        # Prepare next epoch
        full_count_vector = np.zeros(K_nb_cluster, dtype=int)
        full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)
        U_centroids_before = np.copy(U_centroids)

        # Assignments use the centroids of the previous iteration while the new ones start from zero.
        # NOTE(review): update_clusters is presumably filling U_centroids in place - confirm.
        U_centroids = np.zeros_like(U_centroids_before)
        for i_minibatch, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            logger.info(
                "Minibatch number {}/{}; Iteration number {}/{}".format(
                    i_minibatch, total_nb_of_minibatch, i_iter, nb_iter))
            example_batch = X_data[example_batch_indexes]
            example_batch_norms = X_data_norms[example_batch_indexes]

            indicator_vector, _ = assign_points_to_clusters(
                example_batch, U_centroids_before, X_norms=example_batch_norms)
            full_indicator_vector[example_batch_indexes] = indicator_vector

            # number of examples of this batch falling in each cluster (0 for the absent ones)
            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster)
            count_vector[cluster_names] = counts

            # Update centroid location using the newly assigned data point classes
            full_count_vector = update_clusters(example_batch, U_centroids,
                                                K_nb_cluster,
                                                full_count_vector,
                                                count_vector, indicator_vector)

        if proj_l1:
            for i_centroid, centroid in enumerate(U_centroids):
                U_centroids[i_centroid, :] = proj_onto_l1_ball(
                    _lambda=_lambda, epsilon_tol=epsilon, vec=centroid)

        objective_function[i_iter] = compute_objective_by_batch(
            X_data, U_centroids, full_indicator_vector, batch_size)

        if i_iter >= 1:
            # relative variation of the objective between two successive iterations
            # todo: check that the error stays below the threshold several times in a row
            previous_objective = objective_function[i_iter - 1]
            delta_objective_error = np.abs(
                objective_function[i_iter] - previous_objective
            ) / previous_objective

        i_iter += 1

    return objective_function[:i_iter], U_centroids, full_indicator_vector
# Initial factors for the hierarchical factorization of H: `nb_factors` square (d, d) matrices,
# all identities except the last one which is set to zeros.
# NOTE(review): the zero matrix presumably plays the role of the initial residual - confirm.
lst_factors = [np.eye(d) for _ in range(nb_factors)]
lst_factors[-1] = np.zeros((d, d))
_lambda = 1.  # init the scaling factor at 1

# Create the projection operators for each factor
# (the second returned value is only used below, for logging)
lst_proj_op_by_fac_step, lst_proj_op_by_fac_step_desc = build_constraint_set_smart(
    left_dim=d,
    right_dim=d,
    nb_factors=nb_factors,
    sparsity_factor=sparsity_factor,
    residual_on_right=True,
    fast_unstable_proj=False,
    constant_first=False)

logger.info(
    "Description of projection operators for each iteration of hierarchical_palm: \n{}"
    .format(pprint.pformat(lst_proj_op_by_fac_step_desc)))
# Print the numpy version in use
print(np.__version__)

# Call the algorithm; the last two returned values are not needed here
final_lambda, final_factors, final_X, _, _ = hierarchical_palm4msa(
    arr_X_target=H,
    lst_S_init=lst_factors,
    lst_dct_projection_function=lst_proj_op_by_fac_step,
    f_lambda_init=_lambda,
    nb_iter=nb_iter,
    update_right_to_left=True,
    residual_on_right=True)

# Visualization utility: receives the target, the initial factors, the final factors and the final approximation
visual_evaluation_palm4msa(H, lst_factors, final_factors, final_X)
Пример #27
0
# Experiment settings
nb_clusters = 10
nb_iter_kmeans = 10
nb_factors = 5
# Initial centroids: `nb_clusters` rows of X picked uniformly at random, without replacement
U_centroids_hat = X[np.random.permutation(X.shape[0])[:nb_clusters]]
# kmeans++ initialization is not feasible because complexity is O(ndk)...

# Initialize palm4msa
sparsity_factor = 2
nb_iter_palm = 30
delta_objective_error_threshold_in_palm = 1e-6
# Create constraints for palm4msa
# (the two first arguments are the number of rows and columns of the centroid matrix;
# the second returned value is only used below, for logging)
lst_constraints, lst_constraints_vals = build_constraint_set_smart(
    U_centroids_hat.shape[0], U_centroids_hat.shape[1], nb_factors,
    sparsity_factor=sparsity_factor, residual_on_right=True)

logger.info("Description of constraints: \n{}".format(pformat(lst_constraints_vals)))

# Parameters of the inner palm4msa runs, gathered in a single dict.
# NOTE(review): the keys look like the ones read from `params_palm4msa` by the qmeans-style functions - confirm.
hierarchical_palm_init = {
    "init_lambda": 1.,
    "nb_iter": nb_iter_palm,
    "lst_constraint_sets": lst_constraints,
    "residual_on_right": True,
    "delta_objective_error_threshold": delta_objective_error_threshold_in_palm,
    "track_objective": False
}

logger.info('Running QuicK-means with H-Palm')

# QKmeans with hierarchical palm4msa
objective_function_with_hier_palm, op_centroids_hier, indicator_hier, lst_objective_function_hier_palm = \
    qmeans(X,
Пример #28
0
def make_1nn_evaluation(x_train, y_train, x_test, y_test, U_centroids,
                        indicator_vector):
    """
    Do the 1-nearest neighbor classification using `x_train`, `y_train` as support and `x_test`, `y_test` as
    evaluation set.

    The K-means based search is always evaluated. The scikit-learn classifiers (brute, ball tree and kd tree)
    are called only in the case where it is the kmeans version of the program that is called
    (`paraman["kmeans"]`), for simplicity purposes: not do it many times. Each of them is interrupted by an
    alarm signal if it takes more than about 10 times the duration of the K-means based search.

    Time is recorded.
    Classification accuracy is recorded.
    Both are written to the module-level `resprinter`; nothing is returned.

    :param x_train: Train data set as ndarray.
    :param y_train: Train labels as categories in ndarray.
    :param x_test: Test data as ndarray.
    :param y_test: Test labels as categories.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object
    :param indicator_vector: The indicator vector for this matrix of centroids and this train data.

    :return: None
    """
    def scikit_evaluation(str_type):
        """
        Do the scikit learn version of nearest neighbor (used for comparison)

        :param str_type: The `algorithm` argument given to `KNeighborsClassifier`.
        :return: The time spent predicting the labels of the test set, in seconds.
        """
        clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
        clf.fit(x_train, y_train)
        log_memory_usage(
            "Memory after definition of neighbors classifiers in scikit_evaluation of make_1nn_evaluation"
        )

        # test examples are predicted one at a time, like in kmean_tree_evaluation
        start_inference_time = time.time()
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()
        log_memory_usage(
            "Memory after label assignation in scikit_evaluation of make_1nn_evaluation"
        )

        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_{}_inference_time".format(str_type): inference_time,
            "1nn_{}_accuracy".format(str_type): accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    def kmean_tree_evaluation():
        """
        Do the K-means partitioning version of nearest neighbor.

        :return: The total inference time (distances to the centroids + label prediction), in seconds.
        """
        # for each cluster, there is a sub nearest neighbor classifier for points in that cluster.
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]
        log_memory_usage(
            "Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation"
        )
        # the norms of the centroids are not precomputed
        precomputed_centroid_norms = None
        start_inference_time = time.time()
        distances = get_distances(
            x_test,
            U_centroids,
            precomputed_centroids_norm=precomputed_centroid_norms)
        stop_get_distances_time = time.time()
        get_distance_time = stop_get_distances_time - start_inference_time
        log_memory_usage(
            "Memory after distances computation with clusters in kmean_tree_evaluation of make_1nn_evaluation"
        )
        # index of the closest centroid for each test example
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which belongs this data point and call the associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1,
                                                                        -1))[0]
        stop_inference_time = time.time()
        log_memory_usage(
            "Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation"
        )
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_get_distance_time": get_distance_time / x_test.shape[0],
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    logger.info("1 nearest neighbor with k-means search")
    kmean_tree_time = kmean_tree_evaluation()

    if paraman["kmeans"]:
        # the classification must not take more than 10 times the time taken for the K means 1 nn classification or
        # it will stop.
        # signal.alarm(0) schedules nothing (it only cancels a pending alarm), so the delay is kept at one
        # second at least: otherwise a very fast K-means search would silently disable the timeout.
        timeout_seconds = max(1, int(kmean_tree_time * 10))
        signal.signal(signal.SIGALRM, timeout_signal_handler)

        lst_knn_types = ["brute", "ball_tree", "kd_tree"]
        for knn_type in lst_knn_types:
            signal.alarm(timeout_seconds)  # start alarm
            try:
                logger.info(
                    "1 nearest neighbor with {} search".format(knn_type))
                scikit_evaluation(knn_type)
            except TimeoutError as te:
                logger.warning(
                    "Timeout during execution of 1-nn with {} version: {}".
                    format(knn_type, te))
            finally:
                # stop alarm for next evaluation, and never leave it armed if an unexpected error propagates
                signal.alarm(0)
Пример #29
0
def qkmeans_minibatch(X_data: np.ndarray,
                      K_nb_cluster: int,
                      nb_iter: int,
                      nb_factors: int,
                      params_palm4msa: dict,
                      initialization: np.ndarray,
                      batch_size: int,
                      hierarchical_inside=False,
                      delta_objective_error_threshold=1e-6,
                      hierarchical_init=False):
    """
    Run the minibatch version of K-means where the centroid matrix is kept as a product of sparse factors.

    The centroid matrix is first factorized with palm4msa (or its hierarchical version). Then, at each
    iteration, the examples are assigned batch by batch to the factorized centroids, the dense centroids are
    recomputed, and they are factorized again with palm4msa starting from the current factors.

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for. Must be equal to `initialization.shape[0]`.
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm. The keys "init_lambda",
        "nb_iter", "lst_constraint_sets", "residual_on_right", "delta_objective_error_threshold" and
        "track_objective" are read.
    :param initialization: The initial matrix of centroids not yet factorized. It is deep-copied, not modified.
    :param batch_size:  The size of each batch.
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used
        (for the initialization and inside the iterations).
    :param delta_objective_error_threshold: The iterations stop as soon as the relative difference between the
        objective at 2 successive iterations is not strictly greater than that value.
    :param hierarchical_init: Tells if the algorithm should make the initialization of sparse factors with the hierarchical version of palm or not.

    :return: The tuple `(objective_function, op_centroids, full_indicator_vector,
        lst_all_objective_functions_palm)`: the objective value of each iteration actually run, the final
        centroids as a `SparseFactors` object, the cluster index assigned to each example during the last
        iteration, and the list of the objective values returned by each palm4msa run (initialization included).
    """

    assert K_nb_cluster == initialization.shape[0]

    logger.debug("Compute squared froebenius norm of data")
    X_data_norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X_data, batch_size)

    nb_examples = X_data.shape[0]
    # only used for the progress log messages
    total_nb_of_minibatch = X_data.shape[0] // batch_size

    X_centroids_hat = copy.deepcopy(initialization)

    # ################################ INIT PALM4MSA ###############################
    logger.info("Initializing QKmeans with PALM algorithm")

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1],
                                   nb_factors)
    # Frobenius norm of the (K, K) identity, used to rescale lambda around the initial palm run
    eye_norm = np.sqrt(K_nb_cluster)

    ##########################
    # GET PARAMS OF PALM4MSA #
    ##########################
    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa[
        "delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    ####################
    # INIT RUN OF PALM #
    ####################

    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        # The fifth returned value is deliberately dropped: binding it to nb_iter_palm would replace the
        # configured iteration budget used by all the following palm4msa calls.
        _lambda_tmp, op_factors, _, objective_palm, _ = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1][
                    "finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

    # ################################################################

    lst_factors = None  # safe assignment for debug

    _lambda = _lambda_tmp / eye_norm

    # -1 marks the iterations that have not been run
    objective_function = np.ones(nb_iter) * -1
    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    i_iter = 0
    delta_objective_error = np.inf
    while ((i_iter < nb_iter)
           and (delta_objective_error > delta_objective_error_threshold)):
        logger.info("Iteration number {}/{}".format(i_iter, nb_iter))

        # Re-init palm factors for iteration: the first factor is left out and the scaling is
        # folded into the second one
        lst_factors_ = op_factors.get_list_of_factors()
        op_centroids = SparseFactors([lst_factors_[1] * _lambda] +
                                     lst_factors_[2:])

        # Prepare next epoch
        full_count_vector = np.zeros(K_nb_cluster, dtype=int)
        full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)

        X_centroids_hat = np.zeros_like(X_centroids_hat)

        for i_minibatch, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            logger.info(
                "Minibatch number {}/{}; Iteration number {}/{}".format(
                    i_minibatch, total_nb_of_minibatch, i_iter, nb_iter))
            example_batch = X_data[example_batch_indexes]
            example_batch_norms = X_data_norms[example_batch_indexes]

            ##########################
            # Update centroid oracle #
            ##########################

            indicator_vector, _ = assign_points_to_clusters(
                example_batch, op_centroids, X_norms=example_batch_norms)
            full_indicator_vector[example_batch_indexes] = indicator_vector

            # number of examples of this batch falling in each cluster (0 for the absent ones)
            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster)
            count_vector[cluster_names] = counts

            full_count_vector = update_clusters(example_batch, X_centroids_hat,
                                                K_nb_cluster,
                                                full_count_vector,
                                                count_vector, indicator_vector)

        objective_function[i_iter] = compute_objective_by_batch(
            X_data, op_centroids, full_indicator_vector, batch_size)

        # inplace modification of X_centroids_hat and full_count_vector and full_indicator_vector
        check_cluster_integrity(X_data, X_centroids_hat, K_nb_cluster,
                                full_count_vector, full_indicator_vector)

        #########################
        # Do palm for iteration #
        #########################

        # create the diagonal of the sqrt of those counts
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(full_count_vector / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        diag_counts_sqrt = np.sqrt(full_count_vector)

        # set it as first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)

        # the target is the dense centroid matrix with each row scaled by the sqrt of its cluster count
        arr_X_target_palm = diag_counts_sqrt[:, None] * X_centroids_hat

        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=arr_X_target_palm,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)

        else:
            # same as for the initial run: keep nb_iter_palm untouched
            _lambda_tmp, op_factors, _, objective_palm, _ = \
                palm4msa(arr_X_target=arr_X_target_palm,
                         lst_S_init=op_factors.get_list_of_factors(),
                         nb_factors=op_factors.n_factors,
                         lst_projection_functions=lst_proj_op_by_fac_step[-1][
                             "finetune"],
                         f_lambda_init=_lambda * np.sqrt(nb_examples),
                         nb_iter=nb_iter_palm,
                         update_right_to_left=True,
                         track_objective=track_objective_palm,
                         delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        ############################

        lst_all_objective_functions_palm.append(objective_palm)

        if i_iter >= 1:
            # relative variation of the objective between two successive iterations
            delta_objective_error = np.abs(objective_function[i_iter] -
                                           objective_function[i_iter - 1]
                                           ) / objective_function[i_iter - 1]

        # todo: check that the error stays below the threshold several times in a row

        i_iter += 1

    # Build the final centroids from the factors of the LAST palm run: the list fetched at the top of the
    # loop predates that run and would not match the final _lambda.
    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] +
                                 lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, full_indicator_vector, lst_all_objective_functions_palm