Example #1
def update_clusters(X_data, X_centroids_hat, K_nb_cluster, counts_before,
                    new_counts, indicator_vector):
    """
    Update centroids and return new counts of each centroid.
    All changes are made in place.

    :param X_data:
    :param X_data_norms:
    :param X_centroids_hat:
    :param K_nb_cluster:
    :param new_counts:
    :param indicator_vector:
    :param distances:
    :param cluster_names:
    :param cluster_names_sorted:
    :return:
    """
    total_count_vector = counts_before + new_counts
    for c in range(K_nb_cluster):
        if total_count_vector[c] != 0:
            X_centroids_hat[c] = (
                (counts_before[c] / total_count_vector[c]) *
                X_centroids_hat[c]) + (
                    (1. / total_count_vector[c]) *
                    np.sum(X_data[indicator_vector == c, :], 0))
        else:
            logger.debug("Cluster {} has zero point, continue".format(c))

    return total_count_vector
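
A minimal usage sketch of update_clusters on made-up data (it assumes numpy is imported as np in the module defining the function): with zero prior counts, the in-place update reduces to the per-cluster mean of the batch.

X_batch = np.array([[0., 0.], [2., 2.], [4., 4.], [6., 6.]])
centroids = np.array([[1., 1.], [3., 3.]])   # K = 2 centroids, updated in place
counts_before = np.array([0, 0])             # no point seen before this batch
new_counts = np.array([2, 2])                # points per cluster in this batch
indicator = np.array([0, 0, 1, 1])           # cluster index assigned to each point

total = update_clusters(X_batch, centroids, 2, counts_before, new_counts, indicator)
# centroids is now [[1., 1.], [5., 5.]] (the per-cluster means) and total is [2, 2]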
Example #2
def load_kddcup04bio_no_classif():
    data_url = "http://cs.joensuu.fi/sipu/datasets/KDDCUP04Bio.txt"

    with tempfile.TemporaryDirectory() as d_tmp:
        logger.debug(
            f"Downloading file from url {data_url} to temporary directory {d_tmp}"
        )
        matfile_path = download_data(data_url, d_tmp)
        data = pandas.read_csv(matfile_path, delim_whitespace=True)

    return data.values
Example #3
def get_squared_froebenius_norm_line_wise_batch_by_batch(
        data_arr_memmap, batch_size):
    """Compute the squared Euclidean norm of each row of `data_arr_memmap`, batch by batch."""
    data_norms = np.zeros(data_arr_memmap.shape[0])
    logger.debug(
        "Start computing norm of data array of shape {}, batch by batch".
        format(data_arr_memmap.shape))
    for i_batch, batch in enumerate(
            DataGenerator(data_arr_memmap,
                          batch_size=batch_size,
                          return_indexes=False)):
        logger.debug("Compute norm of batch {}/{}".format(
            i_batch, data_arr_memmap.shape[0] // batch_size))
        data_norms[i_batch * batch_size:(i_batch + 1) *
                   batch_size] = np.linalg.norm(batch, axis=1)**2
    return data_norms
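
A quick sanity-check sketch for the batch-by-batch computation above (illustrative only; it assumes DataGenerator also accepts a plain in-memory array and that numpy is imported as np): the result should match the squared row norms computed in one shot.

X_small = np.random.rand(1000, 64).astype(np.float32)
norms_batched = get_squared_froebenius_norm_line_wise_batch_by_batch(X_small, batch_size=128)
norms_direct = np.linalg.norm(X_small, axis=1) ** 2
assert np.allclose(norms_batched, norms_direct, atol=1e-4)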
Example #4
def load_census1990():
    """
    Meek, Thiesson, and Heckerman (2001), "The Learning Curve Method Applied to Clustering", to appear in The Journal of Machine Learning Research.

    Number of clusters: 25, 50, 100
    :return: The data matrix without the `caseId` attribute, and None (no labels).
    """
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/census1990-mld/USCensus1990.data.txt"

    with tempfile.TemporaryDirectory() as d_tmp:
        logger.debug(
            f"Downloading file from url {data_url} to temporary directory {d_tmp}"
        )
        matfile_path = download_data(data_url, d_tmp)
        data = pandas.read_csv(matfile_path)

    return data.values[:, 1:], None  # remove the `caseId` attribute
Example #5
def load_plants():
    """
    USDA, NRCS. 2008. The PLANTS Database ([Web Link], 31 December 2008). National Plant Data Center, Baton Rouge, LA 70874-4490 USA.

    :return: The matrix of plants encoded as multi-hot attribute vectors.
    """
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/plants/plants.data"

    with tempfile.TemporaryDirectory() as d_tmp:
        logger.debug(
            f"Downloading file from url {data_url} to temporary directory {d_tmp}"
        )
        file_path = download_data(data_url, d_tmp)

        with open(file_path, 'r', encoding="ISO-8859-15") as f:
            plants = f.readlines()

    # get all the features in a set
    set_plants_attributes = set()
    lst_plants = []
    for plant_line in plants:
        plant_line_no_name = [v.strip() for v in plant_line.split(',')[1:]]
        lst_plants.append(plant_line_no_name)
        set_plants_attributes.update(plant_line_no_name)

    # give a code to each feature in a 1-hot fashion
    arr_plants_attributes = np.array([v for v in set_plants_attributes])
    onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
    onehot_encoder.fit(arr_plants_attributes.reshape(-1, 1))

    # encode each plant as the sum of its attributes' one-hot vectors
    for i, plant_line_no_name in enumerate(lst_plants):
        plant_line_oh = np.sum(onehot_encoder.transform(
            np.array(plant_line_no_name).reshape(-1, 1)),
                               axis=0)
        lst_plants[i] = plant_line_oh

    arr_lst_plants = np.array(lst_plants)

    return arr_lst_plants
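
To illustrate the multi-hot encoding performed above, here is a hedged, self-contained mini-example with made-up attribute names (the real attribute vocabulary comes from the downloaded file):

from sklearn import preprocessing
import numpy as np

attributes = np.array(["ca", "fl", "tx"])          # toy attribute vocabulary
onehot = preprocessing.OneHotEncoder(sparse=False)
onehot.fit(attributes.reshape(-1, 1))

# a plant present in "ca" and "fl" becomes the sum of two one-hot vectors,
# i.e. a binary vector with ones at the positions of its attributes
plant_vector = np.sum(onehot.transform(np.array(["ca", "fl"]).reshape(-1, 1)), axis=0)
# plant_vector == [1., 1., 0.] with categories sorted as ['ca', 'fl', 'tx']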
Example #6
def palm4msa(arr_X_target: np.array,
             lst_S_init: list,
             nb_factors: int,
             lst_projection_functions: list,
             f_lambda_init: float,
             nb_iter: int,
             update_right_to_left=True,
             graphical_display=False):
    """
    lst S init contains factors in decreasing indexes (e.g: the order along which they are multiplied in the product).
        example: S5 S4 S3 S2 S1

    lst S [-j] = Sj

    :param arr_X_target: The target to approximate.
    :param lst_S_init: The initial list of sparse factors.
    :param nb_factors: The number of factors.
    :param lst_projection_functions: The projection function for each of the sparse factor.
    :param f_lambda_init: The initial scaling factor.
    :param nb_iter: The number of iteration before stopping.
    :param update_right_to_left: Tells the algorithm to update factors from right to left (S1 first)
    :param graphical_display: Make a graphical representation of results.
    :return:
    """
    def update_S(S_old, _left_side, _right_side, _c, _lambda,
                 projection_function):
        """
        Return the new factor value.

        - Compute gradient
        - Do gradient step
        - Project data on _nb_keep_values highest entries
        - Normalize data
        """
        # compute gradient of the distance metric (with 1/_c gradient step size)
        grad_step = 1. / _c * _lambda \
                    * _left_side.T \
                    @ ((_lambda * _left_side @ S_old @ _right_side)
                       - arr_X_target) \
                    @ _right_side.T

        # take one gradient step
        S_tmp = S_old - grad_step

        # project onto the constraint set, then normalize because all factors must have norm 1
        S_proj = projection_function(S_tmp)
        S_proj = S_proj / norm(S_proj, ord="fro")
        return S_proj

    def update_scaling_factor(X, X_est):
        return np.sum(X * X_est) / np.sum(X_est**2)

    logger.debug('Norm of arr_X_target: {}'.format(
        np.linalg.norm(arr_X_target, ord='fro')))

    assert len(lst_S_init) > 0
    assert get_side_prod(lst_S_init).shape == arr_X_target.shape
    assert len(lst_S_init) == nb_factors
    # initialization
    f_lambda = f_lambda_init
    lst_S = deepcopy(
        lst_S_init)  # todo may not be necessary; check this ugliness

    objective_function = np.empty((nb_iter, nb_factors + 1))

    if update_right_to_left:
        # range arguments: start, stop, step
        factor_number_generator = range(-1, -(nb_factors + 1), -1)
    else:
        factor_number_generator = range(0, nb_factors, 1)
    # main loop
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf

    while i_iter == 0 or (
        (i_iter < nb_iter) and
        (delta_objective_error > delta_objective_error_threshold)):

        for j in factor_number_generator:
            if lst_projection_functions[j].__name__ == "constant_proj":
                continue

            left_side = get_side_prod(
                lst_S[:j], (arr_X_target.shape[0], arr_X_target.shape[0]))  # L
            index_value_for_right_factors_selection = (nb_factors + j + 1) % (
                nb_factors + 1)  # trust me, I am a scientist.
            right_side = get_side_prod(
                lst_S[index_value_for_right_factors_selection:],
                (arr_X_target.shape[1], arr_X_target.shape[1]))  # R

            # compute minimum c value (according to paper)
            min_c_value = (f_lambda * norm(right_side, ord=2) *
                           norm(left_side, ord=2))**2
            # add epsilon because it is exclusive minimum
            c = min_c_value * 1.001
            logger.debug("Lipsitchz constant value: {}; c value: {}".format(
                min_c_value, c))
            # compute new factor value
            lst_S[j] = update_S(lst_S[j], left_side, right_side, c, f_lambda,
                                lst_projection_functions[j])

            objective_function[i_iter, j - 1] = \
                compute_objective_function(arr_X_target,
                                           _f_lambda=f_lambda,
                                           _lst_S=lst_S)

        # re-compute the full factorisation
        if len(lst_S) == 1:
            arr_X_curr = lst_S[0]
        else:
            arr_X_curr = multi_dot(lst_S)
        # update lambda
        f_lambda = update_scaling_factor(arr_X_target, arr_X_curr)
        logger.debug("Lambda value: {}".format(f_lambda))

        objective_function[i_iter, -1] = \
            compute_objective_function(arr_X_target, _f_lambda=f_lambda,
                                       _lst_S=lst_S)

        logger.debug("Iteration {}; Objective value: {}".format(
            i_iter, objective_function[i_iter, -1]))

        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter, -1] -
                                           objective_function[i_iter - 1, -1]
                                           ) / objective_function[i_iter - 1,
                                                                  -1]
            # TODO check that the absolute error stays below the
            # threshold for several consecutive iterations

        i_iter += 1

    objective_function = objective_function[:i_iter, :]

    if graphical_display:
        plt.figure()
        plt.title("n factors {}".format(nb_factors))
        for j in range(nb_factors + 1):
            plt.semilogy(objective_function[:, j], label=str(j))
        plt.legend()
        plt.show()

        plt.figure()
        plt.semilogy(objective_function.flat)
        plt.legend()
        plt.show()

    # todo maybe change arrX_curr by lambda * arrX_curr
    return f_lambda, lst_S, arr_X_curr, objective_function, i_iter
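
The inner update_scaling_factor uses the closed-form least-squares solution lambda = <X, X_est> / ||X_est||_F^2 of min_lambda ||X - lambda * X_est||_F^2. A quick standalone check of that formula on illustrative data (assuming only numpy, imported as np):

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 20))
X_est = rng.standard_normal((20, 20))
lam = np.sum(X * X_est) / np.sum(X_est ** 2)

# sample lambda values around the closed-form solution: the reconstruction error is minimal there
lambdas = np.linspace(lam - 1., lam + 1., 201)
errors = [np.linalg.norm(X - l * X_est, ord="fro") for l in lambdas]
assert np.argmin(errors) == 100  # index 100 is exactly lam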
Example #7
def palm4msa_fast4(arr_X_target: np.array,
                   lst_S_init: list,
                   nb_factors: int,
                   lst_projection_functions: list,
                   f_lambda_init: float,
                   nb_iter: int,
                   update_right_to_left=True,
                   track_objective=False,
                   delta_objective_error_threshold=1e-6):
    """
    lst S init contains factors in decreasing indexes (e.g: the order along which they are multiplied in the product).
        example: S5 S4 S3 S2 S1

    lst S [-j] = Sj

    :param arr_X_target: The target to approximate.
    :param lst_S_init: The initial list of sparse factors.
    :param nb_factors: The number of factors.
    :param lst_projection_functions: The projection function for each of the sparse factor.
    :param f_lambda_init: The initial scaling factor.
    :param nb_iter: The number of iteration before stopping.
    :param update_right_to_left: Tells the algorithm to update factors from right to left (S1 first)
    :param graphical_display: Make a graphical representation of results.
    :param track_objective: If true, the objective function is computed for each factor and not only at the end of each iteration.
    :param delta_objective_error_threshold: The normalized difference threshold between error at two successive iterations threshold below which the computation is stopped.

    :return: the sparse factorization but careful: the final X isn't multiplyed by lambda
    """
    logger.debug('Norm of arr_X_target: {}'.format(
        np.linalg.norm(arr_X_target, ord='fro')))
    # initialization
    f_lambda = f_lambda_init
    S_factors_op = SparseFactors(lst_S_init)

    assert np.all(S_factors_op.shape == arr_X_target.shape)
    assert S_factors_op.n_factors > 0
    assert S_factors_op.n_factors == nb_factors

    if track_objective:
        objective_function = np.ones(
            (nb_iter,
             nb_factors + 1)) * -1  # (nb_factors + 1) because of the lambda
    else:
        objective_function = np.ones((nb_iter, 1)) * -1

    if update_right_to_left:
        # range arguments: start, stop, step
        factor_number_generator = range(-1, -(nb_factors + 1), -1)
    else:
        factor_number_generator = range(0, nb_factors, 1)
    # main loop
    i_iter = 0
    delta_objective_error = np.inf

    init_vectors_norm_comp_L = [None] * nb_factors
    init_vectors_norm_comp_R = [None] * nb_factors

    while ((i_iter < nb_iter)
           and (delta_objective_error > delta_objective_error_threshold)):

        for machine_idx_fac, j in enumerate(factor_number_generator):
            if lst_projection_functions[j].__name__ == "constant_proj":
                if track_objective:
                    objective_function[
                        i_iter, machine_idx_fac] = compute_objective_function(
                            arr_X_target,
                            _f_lambda=f_lambda,
                            _lst_S=S_factors_op)
                    logger.debug(
                        "Iteration {}; Factor idx {}; Objective value {}".
                        format(i_iter, j, objective_function[i_iter,
                                                             machine_idx_fac]))
                continue

            L = S_factors_op.get_L(j)
            R = S_factors_op.get_R(-j - 1)
            # R = S_factors_op.get_R(nb_factors - j - 1)
            # print(nb_factors, L.n_factors+R.n_factors+1, L.n_factors,
            #       R.n_factors, j, -j-1)

            # compute minimum c value (according to paper)
            L_norm, init_vectors_norm_comp_L[j] = L.compute_spectral_norm(init_vector_eigs_v0=init_vectors_norm_comp_L[j]) \
                if L.n_factors > 0 else (1, init_vectors_norm_comp_L[j])
            R_norm, init_vectors_norm_comp_R[j] = R.compute_spectral_norm(init_vector_eigs_v0=init_vectors_norm_comp_R[j]) \
                if R.n_factors > 0 else (1, init_vectors_norm_comp_R[j])
            min_c_value = (f_lambda * L_norm * R_norm)**2  # Lipschitz constant
            # add epsilon because it is exclusive minimum
            c = min_c_value * 1.001
            logger.debug("Lipsitchz constant value: {}; c value: {}".format(
                min_c_value, c))
            # compute new factor value
            # todo check if it is not redundant to recompute the S_factors_op
            res = f_lambda * S_factors_op.compute_product() - arr_X_target
            # res_RH = R.dot(res.T).T if R.n_factors > 0 else res
            res_RH = S_factors_op.apply_RH(n_factors=-j - 1, X=res)
            # res_RH = S_factors_op.apply_RH(n_factors=nb_factors-j-1, X=res)
            LH_res_RH = S_factors_op.apply_LH(n_factors=j, X=res_RH)
            grad_step = 1. / c * f_lambda * LH_res_RH

            Sj = S_factors_op.get_factor(j)

            # normalize because all factors must have norm 1
            S_proj = lst_projection_functions[j](Sj - grad_step)
            S_proj = csr_matrix(S_proj)
            S_proj /= np.sqrt(S_proj.power(2).sum())

            S_factors_op.set_factor(j, S_proj)

            if track_objective:
                objective_function[
                    i_iter, machine_idx_fac] = compute_objective_function(
                        arr_X_target, _f_lambda=f_lambda, _lst_S=S_factors_op)
                logger.debug(
                    "Iteration {}; Factor idx {}; Objective value {}".format(
                        i_iter, j, objective_function[i_iter,
                                                      machine_idx_fac]))

        # re-compute the full factorisation
        # todo check if it is not redundant to recompute the S_factors_op
        arr_X_curr = S_factors_op.compute_product()

        # update lambda
        f_lambda = update_scaling_factor(X=arr_X_target, X_est=arr_X_curr)

        logger.debug("Lambda value: {}".format(f_lambda))

        objective_function[i_iter, -1] = \
            compute_objective_function(arr_X_target, _f_lambda=f_lambda,
                                       _lst_S=S_factors_op)

        logger.debug("Iteration {}; Objective value: {}".format(
            i_iter, objective_function[i_iter, -1]))

        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter, -1] -
                                           objective_function[i_iter - 1, -1]
                                           ) / objective_function[i_iter - 1,
                                                                  -1]
            logger.debug("Delta objective: {}".format(delta_objective_error))

        # TODO check that the absolute error stays below the threshold for several consecutive iterations

        i_iter += 1

    return f_lambda, S_factors_op, arr_X_curr, objective_function, i_iter
Example #8
def load_caltech(final_size):
    data_url = "http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar"

    lst_images = []
    lst_classes_idx = []

    with tempfile.TemporaryDirectory() as d_tmp:
        logger.debug(
            f"Downloading file from url {data_url} to temporary directory {d_tmp}"
        )
        tarfile_path = Path(download_data(data_url, d_tmp))

        dir_path = Path(d_tmp)

        tf = tarfile.open(tarfile_path)
        tf.extractall(dir_path / "caltech256")
        tf.close()
        for root, dirs, files in os.walk(dir_path / "caltech256"):
            print(root)
            label_class = root.split("/")[-1]
            splitted_label_class = label_class.split(".")
            if splitted_label_class[-1] == "clutter":
                continue
            if len(splitted_label_class) > 1:
                label_idx = int(splitted_label_class[0])
            else:
                continue

            for file in files:
                path_img_file = Path(root) / file
                try:
                    img = plt.imread(path_img_file)
                except Exception:
                    # skip files that cannot be read as images
                    continue
                aspect_ratio = max(final_size / img.shape[0],
                                   final_size / img.shape[1])
                new_img = cv2.resize(img,
                                     dsize=(0, 0),
                                     fx=aspect_ratio,
                                     fy=aspect_ratio)
                new_img = crop_center(new_img, (final_size, final_size, 3))

                if new_img.shape == (final_size, final_size):
                    new_img = cv2.cvtColor(new_img, cv2.COLOR_GRAY2RGB)

                lst_images.append(new_img.flatten())
                lst_classes_idx.append(label_idx)

        X = np.vstack(lst_images)
        y = np.array(lst_classes_idx)

        print(X.shape)
        print(y.shape)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=42,
                                                            stratify=y)

    return (X_train, y_train), (X_test, y_test)
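
crop_center is defined elsewhere in the repository. The resizing above makes the shorter image side equal to final_size (aspect_ratio = max(final_size / height, final_size / width)), so crop_center is assumed to keep the central final_size x final_size window; a plausible sketch with hypothetical names:

def crop_center_sketch(img, target_shape):
    # keep the central target_shape[0] x target_shape[1] window of the image
    h, w = target_shape[0], target_shape[1]
    start_h = max((img.shape[0] - h) // 2, 0)
    start_w = max((img.shape[1] - w) // 2, 0)
    return img[start_h:start_h + h, start_w:start_w + w]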
Example #9
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used.
    :param graphical_display: Tell the algorithm to display the results.
    :return:
    """

    assert K_nb_cluster == initialization.shape[0]

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]

    X_centroids_hat = copy.deepcopy(initialization)
    min_K_d = min(X_centroids_hat.shape)

    lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)]

    eye_norm = np.sqrt(K_nb_cluster)
    lst_factors[0] = np.eye(K_nb_cluster) / eye_norm
    lst_factors[1] = np.eye(K_nb_cluster, min_K_d)
    lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1]))

    if graphical_display:
        lst_factors_init = copy.deepcopy(lst_factors)

    _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
        arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
        lst_S_init=lst_factors,
        lst_dct_projection_function=lst_proj_op_by_fac_step,
        f_lambda_init=init_lambda * eye_norm,
        nb_iter=nb_iter_palm,
        update_right_to_left=True,
        residual_on_right=residual_on_right,
        graphical_display=False)

    _lambda = _lambda_tmp / eye_norm

    if graphical_display:
        if hierarchical_inside:
            plt.figure()
            plt.yscale("log")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                        objective_palm[:, 0],
                        marker="x",
                        label="before split")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                        objective_palm[:, 1],
                        marker="x",
                        label="between")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                        objective_palm[:, 2],
                        marker="x",
                        label="after finetune")
            plt.plot(np.arange(len(objective_palm) * 3),
                     objective_palm.flatten(),
                     color="k")
            plt.legend()
            plt.show()

        visual_evaluation_palm4msa(
            np.eye(K_nb_cluster) @ X_centroids_hat, lst_factors_init,
            lst_factors, _lambda * multi_dot(lst_factors))

    objective_function = np.empty((nb_iter, 2))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while (i_iter <= 1) or (
        (i_iter < nb_iter) and
        (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        U_centroids = _lambda * multi_dot(lst_factors[1:])

        if i_iter > 0:
            objective_function[i_iter,
                               0] = compute_objective(X_data, U_centroids,
                                                      indicator_vector)

        # Assign all points to the nearest centroid
        # first get distance from all points to all centroids
        distances = get_distances(X_data,
                                  U_centroids,
                                  precomputed_data_points_norm=X_data_norms)
        # then, determine class membership of each point
        # by picking the closest centroid
        indicator_vector = np.argmin(distances, axis=1)

        objective_function[i_iter,
                           1] = compute_objective(X_data, U_centroids,
                                                  indicator_vector)

        # Update centroid location using the newly
        # assigned data point classes
        for c in range(K_nb_cluster):
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

        # get the number of observation in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        if len(counts) < K_nb_cluster:
            raise ValueError(
                "Some clusters have no point. Aborting iteration {}".format(
                    i_iter))

        diag_counts_sqrt = np.diag(np.sqrt(
            counts[cluster_names_sorted]))  # todo use sparse matrix object
        diag_counts_sqrt_norm = np.linalg.norm(
            diag_counts_sqrt
        )  # todo analytic sqrt(n) instead of computing it with norm
        diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm
        # set it as first factor
        lst_factors[0] = diag_counts_sqrt_normalized

        if graphical_display:
            lst_factors_init = copy.deepcopy(lst_factors)

        if hierarchical_inside:
            _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                # f_lambda_init=_lambda,
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                graphical_display=False)

            loss_palm_before = objective_palm[0, 0]
            loss_palm_after = objective_palm[-1, -1]

        else:
            _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1]
                ["finetune"],
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                graphical_display=False)

            loss_palm_before = objective_palm[0, -1]
            loss_palm_after = objective_palm[-1, -1]

        logger.debug("Loss palm before: {}".format(loss_palm_before))
        logger.debug("Loss palm after: {}".format(loss_palm_after))

        if graphical_display:
            if hierarchical_inside:
                plt.figure()
                plt.yscale("log")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                            objective_palm[:, 0],
                            marker="x",
                            label="before split")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                            objective_palm[:, 1],
                            marker="x",
                            label="between")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                            objective_palm[:, 2],
                            marker="x",
                            label="after finetune")
                plt.plot(np.arange(len(objective_palm) * 3),
                         objective_palm.flatten(),
                         color="k")
                plt.legend()
                plt.show()

            visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat,
                                       lst_factors_init, lst_factors,
                                       _lambda_tmp * multi_dot(lst_factors))

        _lambda = _lambda_tmp / diag_counts_sqrt_norm

        logger.debug("Returned loss (with diag) palm: {}".format(
            objective_palm[-1, 0]))

        if i_iter >= 2:
            delta_objective_error = np.abs(
                objective_function[i_iter, 0] -
                objective_function[i_iter - 1, 0]
            ) / objective_function[i_iter - 1, 0]
            # todo check that the absolute error stays below the threshold for several consecutive iterations

        i_iter += 1

    U_centroids = _lambda * multi_dot(lst_factors[1:])
    distances = get_distances(X_data,
                              U_centroids,
                              precomputed_data_points_norm=X_data_norms)
    indicator_vector = np.argmin(distances, axis=1)

    return objective_function[:i_iter], U_centroids, indicator_vector
            "split": [
                get_lambda_proxsplincol(nb_keep_values),
                get_lambda_proxsplincol(nb_values_residual)
            ],
            "finetune":
            [constant_proj] + [get_lambda_proxsplincol(nb_keep_values)] * (k) +
            [get_lambda_proxsplincol(nb_values_residual)]
        }
    lst_proj_op_by_fac_step.append(dct_step_lst_nb_keep_values)

#final_lambda, final_factors, final_X = PALM4LED(H, lst_factors, [nb_keep_values for _ in range(nb_factors)], _lambda, nb_iter)
final_lambda, final_factors, final_X, nb_iter_by_factor, _ = hierarchical_palm4msa(
    arr_X_target=H,
    lst_S_init=lst_factors,
    lst_dct_projection_function=lst_proj_op_by_fac_step,
    f_lambda_init=_lambda,
    nb_iter=nb_iter,
    update_right_to_left=True,
    residual_on_right=True,
    graphical_display=True)

logger.debug("Number of iteration for each factor: {}; Total: {}".format(
    nb_iter_by_factor, sum(nb_iter_by_factor)))

visual_evaluation_palm4msa(H, lst_factors, final_factors, final_X)

vec = np.random.rand(d)
h_vec = H @ vec
r_vec = final_X @ vec
logger.debug("Distance matrice to random vector (true vs fake):{}".format(
    norm(h_vec - r_vec)))
Example #11
def kmeans_minibatch(X_data,
                     K_nb_cluster,
                     nb_iter,
                     initialization,
                     batch_size,
                     delta_objective_error_threshold=1e-6,
                     proj_l1=False,
                     _lambda=None,
                     epsilon=None):
    """

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param batch_size: The size of each batch.
    :param delta_objective_error_threshold: The normalized difference between the error criterion at 2 successive step must be greater or equal to that value.
    :return:
    """

    logger.debug("Compute squared froebenius norm of data")
    X_data_norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X_data, batch_size)

    # Initialize our centroids by picking random data points

    U_centroids = copy.deepcopy(initialization)
    objective_function = np.empty((nb_iter, ))

    total_nb_of_minibatch = X_data.shape[0] // batch_size

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error = np.inf
    while i_iter < nb_iter and (delta_objective_error >
                                delta_objective_error_threshold):
        logger.info("Iteration number {}/{}".format(i_iter, nb_iter))
        # Prepare next epoch
        full_count_vector = np.zeros(K_nb_cluster, dtype=int)
        full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)
        U_centroids_before = np.copy(U_centroids)

        U_centroids = np.zeros_like(U_centroids_before)
        for i_minibatch, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            logger.info(
                "Minibatch number {}/{}; Iteration number {}/{}".format(
                    i_minibatch, total_nb_of_minibatch, i_iter, nb_iter))
            example_batch = X_data[example_batch_indexes]
            example_batch_norms = X_data_norms[example_batch_indexes]

            indicator_vector, distances = assign_points_to_clusters(
                example_batch, U_centroids_before, X_norms=example_batch_norms)
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster)
            count_vector[cluster_names] = counts

            full_count_vector = update_clusters(example_batch, U_centroids,
                                                K_nb_cluster,
                                                full_count_vector,
                                                count_vector, indicator_vector)

            # Update centroid location using the newly
            # assigned data point classes

        if proj_l1:
            if _lambda is None or epsilon is None:
                raise ValueError(
                    "epsilon and lambda must be set if proj_l1 is True")
            for i_centroid, centroid in enumerate(U_centroids):
                U_centroids[i_centroid, :] = proj_onto_l1_ball(
                    _lambda=_lambda, epsilon_tol=epsilon, vec=centroid)

        objective_function[i_iter, ] = compute_objective_by_batch(
            X_data, U_centroids, full_indicator_vector, batch_size)

        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter] - objective_function[i_iter - 1]
            ) / objective_function[i_iter - 1]
            # todo check that the absolute error stays below the threshold for several consecutive iterations

        i_iter += 1

    return objective_function[:i_iter], U_centroids, full_indicator_vector
Example #12


if __name__ == "__main__":
    batch_size = 10000
    nb_clust = 1000
    nb_iter = 30

    X = np.memmap(
        "/home/luc/PycharmProjects/qalm_qmeans/data/external/blobs_1_billion.dat",
        mode="r",
        dtype="float32",
        shape=(int(1e6), 2000))

    logger.debug("Initializing clusters")
    centroids_init = X[np.random.permutation(X.shape[0])[:nb_clust]]

    start = time.time()
    logger.debug("Nb iteration: {}".format(nb_iter))
    obj, _, _ = kmeans_minibatch(X, nb_clust, nb_iter, centroids_init,
                                 batch_size)
    stop = time.time()
    plt.plot(obj)
    plt.show()
    print("It took {} s".format(stop - start))
Example #13
def qkmeans_minibatch(X_data: np.ndarray,
                      K_nb_cluster: int,
                      nb_iter: int,
                      nb_factors: int,
                      params_palm4msa: dict,
                      initialization: np.ndarray,
                      batch_size: int,
                      hierarchical_inside=False,
                      delta_objective_error_threshold=1e-6,
                      hierarchical_init=False):
    """
    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used.
    :param delta_objective_error_threshold:
    :param hierarchical_init: Tells if the algorithm should make the initialization of sparse factors with the hierarchical version of palm or not.
    :param batch_size:  The size of each batch.
    
    :return:
    """

    assert K_nb_cluster == initialization.shape[0]

    logger.debug("Compute squared froebenius norm of data")
    X_data_norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X_data, batch_size)

    nb_examples = X_data.shape[0]
    total_nb_of_minibatch = X_data.shape[0] // batch_size

    X_centroids_hat = copy.deepcopy(initialization)

    # ################################ INIT PALM4MSA ###############################
    logger.info("Initializing QKmeans with PALM algorithm")

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1],
                                   nb_factors)
    eye_norm = np.sqrt(K_nb_cluster)

    ##########################
    # GET PARAMS OF PALM4MSA #
    ##########################
    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa[
        "delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    ####################
    # INIT RUN OF PALM #
    ####################

    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical= \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1][
                    "finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

    # ################################################################

    lst_factors = None  # safe assignment for debug

    _lambda = _lambda_tmp / eye_norm

    objective_function = np.ones(nb_iter) * -1
    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    i_iter = 0
    delta_objective_error = np.inf
    while ((i_iter < nb_iter)
           and (delta_objective_error > delta_objective_error_threshold)):
        logger.info("Iteration number {}/{}".format(i_iter, nb_iter))

        # Re-init palm factors for iteration
        lst_factors_ = op_factors.get_list_of_factors()
        op_centroids = SparseFactors([lst_factors_[1] * _lambda] +
                                     lst_factors_[2:])

        # Prepare next epoch
        full_count_vector = np.zeros(K_nb_cluster, dtype=int)
        full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)

        X_centroids_hat = np.zeros_like(X_centroids_hat)

        for i_minibatch, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            logger.info(
                "Minibatch number {}/{}; Iteration number {}/{}".format(
                    i_minibatch, total_nb_of_minibatch, i_iter, nb_iter))
            example_batch = X_data[example_batch_indexes]
            example_batch_norms = X_data_norms[example_batch_indexes]

            ##########################
            # Update centroid oracle #
            ##########################

            indicator_vector, distances = assign_points_to_clusters(
                example_batch, op_centroids, X_norms=example_batch_norms)
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster)
            count_vector[cluster_names] = counts

            full_count_vector = update_clusters(example_batch, X_centroids_hat,
                                                K_nb_cluster,
                                                full_count_vector,
                                                count_vector, indicator_vector)

        objective_function[i_iter] = compute_objective_by_batch(
            X_data, op_centroids, full_indicator_vector, batch_size)

        # in-place modification of X_centroids_hat, full_count_vector and full_indicator_vector
        check_cluster_integrity(X_data, X_centroids_hat, K_nb_cluster,
                                full_count_vector, full_indicator_vector)

        #########################
        # Do palm for iteration #
        #########################

        # create the diagonal of the sqrt of those counts
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(full_count_vector / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        diag_counts_sqrt = np.sqrt(full_count_vector)

        # set it as first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)

        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None,] *  X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)

        else:
            _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(arr_X_target=diag_counts_sqrt[:, None,] *  X_centroids_hat,
                         lst_S_init=op_factors.get_list_of_factors(),
                         nb_factors=op_factors.n_factors,
                         lst_projection_functions=lst_proj_op_by_fac_step[-1][
                             "finetune"],
                         f_lambda_init=_lambda * np.sqrt(nb_examples),
                         nb_iter=nb_iter_palm,
                         update_right_to_left=True,
                         track_objective=track_objective_palm,
                         delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        ############################

        lst_all_objective_functions_palm.append(objective_palm)

        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter] -
                                           objective_function[i_iter - 1]
                                           ) / objective_function[i_iter - 1]

        # todo check that the absolute error stays below the threshold for several consecutive iterations

        i_iter += 1

    op_centroids = SparseFactors([lst_factors_[1] * _lambda] +
                                 lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, full_indicator_vector, \
        lst_all_objective_functions_palm
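
For reference, a hedged sketch of the params_palm4msa dictionary read by this function, using only the keys accessed above; all values are illustrative placeholders, and the constraint sets follow the {"split": [...], "finetune": [...]} structure visible in Example #10:

params_palm4msa = {
    "init_lambda": 1.,
    "nb_iter": 300,
    "lst_constraint_sets": lst_proj_op_by_fac_step,  # list of {"split": ..., "finetune": ...} dicts
    "residual_on_right": True,
    "track_objective": False,
    "delta_objective_error_threshold": 1e-6,
}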