def test_dkmeans_zero_arrays():
    """compute_mean should return k centroids even when every point falls in cluster 0."""
    k = 8
    X = np.random.random((4, 4))

    cluster_labels = [0] * len(X)
    local_means = local.compute_mean(X, cluster_labels, k)

    assert np.array(local_means).shape == (k, 4)
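

# The test above assumes numpy and the package's local-computations module
# are already imported; a minimal sketch (the exact module path is an
# assumption, not the package's confirmed layout):
#
#   import numpy as np
#   import dkmeans.local_computations as local

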
def intermediate_kmeans():
    """Calculate k-Means locally."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=indep_vars)

    # Return variables info, but remove actual data points
    results = {'indep_vars': []}
    for var in indep_vars:
        if var['type']['name'] in ('integer', 'real'):
            # use key/val here to avoid shadowing the cluster count k
            new_var = {key: val for key, val in var.items() if key != 'series'}
            mean, std = _get_moments(var)
            new_var['mean'] = mean
            new_var['std'] = std
        else:
            new_var = var

        results['indep_vars'].append(new_var)

    # Drop NaN values
    X = utils.remove_nulls(X, errors='ignore')
    if len(X) == 0:
        logging.warning("All data are NULL, returning empty centroids.")
        results['centroids'] = []
        io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
        return

    # Generate results
    logging.info("Generating results...")

    # featurization
    featurizer = _create_featurizer(indep_vars)
    X = featurizer.transform(X)

    m, n = X.shape
    num_iter = 0
    not_converged = True

    # Run k-Means locally
    # Have each site compute k initial clusters locally
    local_centroids = local.initialize_own_centroids(X, k)

    # Local Optimization Loop
    while not_converged:
        # Each local site computes its cluster
        cluster_labels = local.compute_clustering(X, local_centroids)
        if OPTIMIZATION == 'lloyd':
            # Computes its local mean if doing lloyd, and updates centroids
            local_means = local.compute_mean(X, cluster_labels, k)
            local_centroids, previous_centroids = local.mean_step(
                local_means, local_centroids)
        elif OPTIMIZATION == 'gradient':
            # Computes the local gradient if doing GD, and takes a GD step
            local_grad = local.compute_gradient(X, cluster_labels,
                                                local_centroids, LR)
            local_centroids, previous_centroids = local.gradient_step(
                local_grad, local_centroids)

        # Check local stopping conditions
        not_converged, local_delta = local.check_stopping(
            local_centroids, previous_centroids, EPSILON)

        num_iter += 1
        logging.info("Single-Shot {} ; iter : {} delta : {}".format(
            OPTIMIZATION, num_iter, local_delta))

    results['centroids'] = [lc.tolist() for lc in local_centroids]

    logging.info("Results:\n{}".format(results))
    io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
    logging.info("DONE")


# Example #3 (multi-shot decentralized k-means)

def main(X,
         k,
         optimization='lloyd',
         s=2,
         epsilon=0.00001,
         shuffle=True,
         lr=0.001,
         verbose=True):
    m, n = get_data_dims(X)
    nodes, inds = split_over_nodes(X, s, shuffle=shuffle)
    X = [X[i] for i in inds]  # Reorder X to match the random split over nodes
    tracked_delta = []
    num_iter = 0
    not_converged = True

    # Have each site compute k initial clusters locally
    local_centroids = [
        cent for node in nodes
        for cent in local.initialize_own_centroids(node, k)
    ]
    # and select k random clusters from the s*k pool
    np.random.shuffle(local_centroids)
    remote_centroids = local_centroids[:k]

    # Remote Optimization Loop
    while not_converged:
        cluster_labels = [None for j in range(s)]  # the clusterings
        local_optimizer = [None for j in range(s)]  # the optimization entity

        # Local computation loop
        for i, node in enumerate(nodes):
            # Each site compute local clusters
            cluster_labels[i] = local.compute_clustering(
                node, remote_centroids)
            if optimization == 'lloyd':
                # Lloyd has sites compute means locally
                local_optimizer[i] = local.compute_mean(
                    node, cluster_labels[i], k)
            elif optimization == 'gradient':
                # Gradient descent has sites compute gradients locally
                local_optimizer[i] = local.compute_gradient(
                    node, cluster_labels[i], remote_centroids, lr)
        # End of Local Computations

        # Both objects can be aggregated by taking a sum
        remote_optimizer = remote.aggregate_sum(local_optimizer)
        if optimization == 'lloyd':
            # and for the mean, we need to further divide by the number of sites
            remote_optimizer = [r / s for r in remote_optimizer]

            # Then, update centroids to the aggregated local means
            remote_centroids, previous = local.mean_step(
                remote_optimizer, remote_centroids)
        elif optimization == 'gradient':
            # Then, update centroids according to one step of gradient descent
            remote_centroids, previous = local.gradient_step(
                remote_optimizer, remote_centroids)

        # Check the stopping condition "locally" at the aggregator
        # - returns false if converged
        remote_check, delta = local.check_stopping(remote_centroids, previous,
                                                   epsilon)
        if verbose:
            print("Multi-Shot %s ; iter : %d delta : %f" %
                  (optimization, num_iter, delta))
        not_converged = remote_check
        tracked_delta.append(delta)
        num_iter += 1

    # Compute the final clustering "locally" at the aggregator
    cluster_labels = [
        clusters for node in nodes
        for clusters in local.compute_clustering(node, remote_centroids)
    ]
    return {
        'centroids': remote_centroids,
        'cluster_labels': cluster_labels,
        'X': X,
        'delta': tracked_delta,
        'num_iter': num_iter,
        'name': 'multishot_%s' % optimization
    }


# Example #4 (single-shot decentralized k-means)

def main(X, k, optimization='lloyd', s=2, epsilon=0.00001, shuffle=True,
         lr=0.01, verbose=True):
    """
        Local Variables - X: a list of N many m x n matrices storing data
                          k: number of clusters (int)
                          local_centroids : a s x k 2-d list of
                          m x n matrices storing cluster centroids
    """
    m, n = get_data_dims(X)
    nodes, inds = split_over_nodes(X, s, shuffle=shuffle)
    X = [X[i] for i in inds]  # Reorder X to match the random split over nodes
    tracked_delta = []
    num_iter = 0
    not_converged = True

    # Have each site compute k initial clusters locally
    local_centroids = [local.initialize_own_centroids(node, k)
                       for node in nodes]

    # Local Optimization Loop
    while not_converged:
        cluster_labels = [None for j in range(s)]  # the clusterings
        local_delta = [None for j in range(s)]  # Track all local delta
        local_stop = [False for j in range(s)]  # per-site not-converged flags
        for i, node in enumerate(nodes):
            # Each local site computes its cluster
            cluster_labels[i] = local.compute_clustering(
                node, local_centroids[i])
            if optimization == 'lloyd':
                # Compute the local mean for Lloyd, and update the centroids
                local_means = local.compute_mean(node, cluster_labels[i], k)
                local_centroids[i], previous_centroids = local.mean_step(
                    local_means, local_centroids[i])
            elif optimization == 'gradient':
                # Compute the local gradient for GD, and take a GD step
                local_grad = local.compute_gradient(
                    node, cluster_labels[i], local_centroids[i], lr)
                local_centroids[i], previous_centroids = local.gradient_step(
                    local_grad, local_centroids[i])
            # Check local stopping conditions
            local_stop[i], local_delta[i] = local.check_stopping(
                local_centroids[i], previous_centroids, epsilon)
        num_iter += 1
        tracked_delta.append(local_delta)
        if verbose:
            print("Single-Shot %s ; iter : %d delta : %f"
                  % (optimization, num_iter, max(local_delta)))

        # if any of the sites are still iterating, keep the global loop running
        # TODO: we can save computations by locally waiting if local
        #       conditions are met
        not_converged = any(local_stop)

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    # And compute the final global clustering
    cluster_labels = [clusters for node in nodes for clusters in
                      local.compute_clustering(node, remote_centroids)]
    return {'centroids': remote_centroids, 'cluster_labels': cluster_labels,
            'X': X, 'delta': tracked_delta, 'iter': num_iter,
            'name': 'singleshot_%s' % optimization}
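
# Usage sketch for the single-shot variant: each site iterates to local
# convergence and centroids are aggregated only once at the end. Assumes the
# same numpy/dkmeans imports as the multi-shot sketch above.
import numpy as np

np.random.seed(1)
X_demo = [np.random.random((4, 4)) for _ in range(50)]
result = main(X_demo, k=3, optimization='gradient', s=2, verbose=False)
print(result['name'], result['iter'])  # e.g. "singleshot_gradient", <iters>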