Exemplo n.º 1
0
def dkm_local_compute_clustering(args,
                                 config_file=CONFIG_FILE,
                                 **kwargs):
    """
        # Description:
            Load the local data file named in the config and label its
            instances using the centroids received from the remote.

        # PREVIOUS PHASE:
            remote_init_centroids (on first run only)
            remote_check_convergence

        # INPUT:

            |   name                |   resolved from   |
            |   ---                 |   ---             |
            |   config_file         |   cache           |
            |   remote_centroids    |   inputs          |
            |   computation_phase   |   inputs          |

            The `config_file` keyword argument is replaced by the value
            resolved from the cache before it is used.

        # OUTPUT:
            - computation_phase: "dkm_local_compute_clustering_2" when the
              incoming phase equals "dkm_remote_optimization_step",
              otherwise "dkm_local_compute_clustering"
            - remote_centroids: the received centroids, passed through
            - cluster_labels: the result of local.compute_clustering

            cluster_labels and remote_centroids are also stored in the cache.

        # RAISES:
            ValueError when remote_centroids or computation_phase is None.
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    remote_centroids = ut.resolve_input('remote_centroids', inputs)
    computation_phase = ut.resolve_input('computation_phase', inputs)
    ut.log('LOCAL: computing clustering', state)

    # Both inputs are mandatory; checked in this order so the first missing
    # one is the one reported.
    mandatory = (
        (remote_centroids,
         "LOCAL: at local_compute_clustering - remote_centroids not passed correctly"),
        (computation_phase,
         "LOCAL: at local_compute_clustering - computation_phase not passed correctly"),
    )
    for value, complaint in mandatory:
        if value is None:
            raise ValueError(complaint)

    parser = configparser.ConfigParser()
    parser.read(config_file)
    config_summary = str(dict(parser))
    ut.log('Config file is %s, with keys %s' % (config_file, config_summary), state)

    samples = np.load(parser['LOCAL']['data_file'])
    labels = local.compute_clustering(samples, remote_centroids)

    # A different phase name is emitted when this run follows an
    # optimization step rather than the initial centroid selection.
    next_phase = (
        "dkm_local_compute_clustering_2"
        if computation_phase == "dkm_remote_optimization_step"
        else "dkm_local_compute_clustering"
    )

    # The default output is built before the cache is touched, as in the
    # original statement order.
    result = ut.default_computation_output(args)
    cache['cluster_labels'] = labels
    cache['remote_centroids'] = remote_centroids
    result['output'] = {
        'computation_phase': next_phase,
        'remote_centroids': remote_centroids,
        'cluster_labels': labels,
    }
    result['cache'] = cache
    return result
Exemplo n.º 2
0
def dkm_local_compute_optimizer(args,
                                config_file=CONFIG_FILE,
                                **kwargs):
    """
        # Description:
            Compute the local optimizer (cluster means for 'lloyd', gradients
            for 'gradient') from the local data and the current labels.

        # PREVIOUS PHASE:
            local_compute_clustering

        # INPUT:

            |   name                |   resolved from       |
            |   ---                 |   ---                 |
            |   config_file         |   cache               |
            |   remote_centroids    |   inputs, cache       |
            |   cluster_labels      |   inputs, cache       |

            The `config_file` keyword argument is replaced by the value
            resolved from the cache. The [LOCAL] section of the config must
            provide data_file, k, learning_rate and optimization.

        # OUTPUT:
            - local_optimizer: result of local.compute_mean or
              local.compute_gradient
            - computation_phase: "dkm_local_compute_optimizer"

            local_optimizer.npy and local_cluster_labels.npy are also written
            to state['outputDirectory'].

        # RAISES:
            ValueError when remote_centroids or cluster_labels is None, or
            when the configured optimization is neither 'lloyd' nor
            'gradient'.
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    remote_centroids = ut.resolve_input('remote_centroids', inputs, cache)
    cluster_labels = ut.resolve_input('cluster_labels', inputs, cache)
    if remote_centroids is None:
        raise ValueError(
            "LOCAL: at local_compute_optimizer - remote_centroids not passed correctly"
        )
    if cluster_labels is None:
        raise ValueError(
            "LOCAL: at local_compute_optimizer - cluster_labels not passed correctly"
        )
    ut.log('LOCAL: computing optimizers', state)
    config = configparser.ConfigParser()
    config.read(config_file)
    data = np.load(config['LOCAL']['data_file'])
    k = int(config['LOCAL']['k'])
    learning_rate = config['LOCAL']['learning_rate']
    optimization = config['LOCAL']['optimization']
    if optimization == 'lloyd':
        local_optimizer = local.compute_mean(data, cluster_labels, k)
    elif optimization == 'gradient':
        # Gradient descent has sites compute gradients locally.
        # configparser always returns strings, so the rate is converted here;
        # the labels are passed whole (the former `[i]` index referenced an
        # undefined name and raised NameError).
        local_optimizer = \
            local.compute_gradient(data, cluster_labels,
                                   remote_centroids, float(learning_rate))
    else:
        # Fail with a clear message instead of hitting an unbound
        # `local_optimizer` at the np.save call below.
        raise ValueError(
            "LOCAL: at local_compute_optimizer - unsupported optimization %r"
            % optimization
        )
    outdir = state['outputDirectory']
    np.save(os.path.join(outdir, 'local_optimizer.npy'), local_optimizer)
    np.save(os.path.join(outdir, 'local_cluster_labels.npy'), cluster_labels)
    computation_output = dict(output=dict(
        local_optimizer=local_optimizer,
        computation_phase="dkm_local_compute_optimizer"),
        state=state
    )
    return computation_output
Exemplo n.º 3
0
def dkm_remote_optimization_step(args, config_file=CONFIG_FILE):
    """
        # Description:
            Use the aggregated optimizer to move the centroids one step.

        # PREVIOUS PHASE:
            remote_aggregate_optimizer

        # INPUT:

            |   name               |   resolved from       |
            |   ---                |   ---                 |
            |   config_file        |   cache               |
            |   remote_centroids   |   inputs, cache       |
            |   remote_optimizer   |   inputs, cache       |

            The `config_file` keyword argument is replaced by the value
            resolved from the cache. The optimization mode is read from the
            [REMOTE] section of the config.

        # OUTPUT:
            - computation_phase: "dkm_remote_optimization_step"
            - remote_centroids: the updated centroids

            previous_centroids and remote_centroids are stored in the cache.

        # RAISES:
            ValueError when remote_centroids or remote_optimizer is None, or
            when the configured optimization is neither 'lloyd' nor
            'gradient'.

        # NEXT PHASE:
            remote_check_convergence
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    remote_centroids = ut.resolve_input('remote_centroids', inputs, cache)
    remote_optimizer = ut.resolve_input('remote_optimizer', inputs, cache)
    # Report missing inputs explicitly, in the same style as the local
    # phases, instead of failing later with an opaque TypeError.
    if remote_centroids is None:
        raise ValueError(
            "REMOTE: at remote_optimization_step - remote_centroids not passed correctly"
        )
    if remote_optimizer is None:
        raise ValueError(
            "REMOTE: at remote_optimization_step - remote_optimizer not passed correctly"
        )
    # Only the first element is inspected; if it is not an ndarray every
    # centroid is converted.
    if type(remote_centroids[0]) is not np.ndarray:
        remote_centroids = [np.array(c) for c in remote_centroids]
    ut.log('REMOTE: Optimization step', args['state'])
    config = configparser.ConfigParser()
    config.read(config_file)
    optimization = config['REMOTE']['optimization']
    if optimization == 'lloyd':
        # Then, update centroids as corresponding to the local mean
        previous_centroids = remote_centroids[:]
        remote_centroids = remote_optimizer[:]
        ut.log("Previous centroids look like %s" % type(previous_centroids[0]),
               state)
        ut.log("Remote centroids look like %s" % type(remote_centroids[0]),
               state)
    elif optimization == 'gradient':
        # Then, update centroids according to one step of gradient descent
        [remote_centroids,
         previous_centroids] = local.gradient_step(remote_optimizer,
                                                   remote_centroids)
    else:
        # Without this branch `previous_centroids` would be unbound below.
        raise ValueError(
            "REMOTE: at remote_optimization_step - unsupported optimization %r"
            % optimization
        )
    cache['previous_centroids'] = previous_centroids
    cache['remote_centroids'] = remote_centroids
    computation_output = dict(output=dict(
        computation_phase="dkm_remote_optimization_step",
        remote_centroids=remote_centroids),
                              state=state,
                              cache=cache)
    return computation_output
Exemplo n.º 4
0
def dkm_remote_aggregate_output(args):
    """
        # Description:
            Package the current remote centroids as the computation output.

        # INPUT:

            |   name               |   resolved from       |
            |   ---                |   ---                 |
            |   remote_centroids   |   inputs, cache       |

        # OUTPUT:
            - computation_phase: "dkm_remote_aggregate_output"
            - remote_centroids: passed through unchanged

            state and cache are returned alongside the output unchanged.
    """
    state, inputs, cache = ut.resolve_args(args)
    final_centroids = ut.resolve_input('remote_centroids', inputs, cache)
    ut.log('REMOTE: Aggregating input', state)

    payload = {
        'computation_phase': "dkm_remote_aggregate_output",
        'remote_centroids': final_centroids,
    }
    return {'output': payload, 'state': state, 'cache': cache}
Exemplo n.º 5
0
def dkm_remote_check_convergence(args, config_file=CONFIG_FILE):
    """
        # Description:
            Compare the current centroids with the previous ones and report
            whether the stopping criterion is met.

        # PREVIOUS PHASE:
            remote_aggregate_optimizer

        # INPUT:

            |   name               |   resolved from       |
            |   ---                |   ---                 |
            |   config_file        |   cache               |
            |   remote_centroids   |   inputs, cache       |
            |   previous_centroids |   inputs, cache       |

            The `config_file` keyword argument is replaced by the value
            resolved from the cache. epsilon is read as a float from the
            [REMOTE] section of the config.

        # OUTPUT:
            - computation_phase: "dkm_remote_converged_true" or
              "dkm_remote_converged_false", depending on the flag returned
              by local.check_stopping
            - delta: second value returned by local.check_stopping
            - remote_centroids
    """
    def _as_arrays(centroids):
        # Anything that is not exactly an ndarray is rebuilt as a list of
        # arrays, one per element.
        if type(centroids) is np.ndarray:
            return centroids
        return [np.array(item) for item in centroids]

    state, inputs, cache = ut.resolve_args(args)
    ut.log('REMOTE: Check convergence', state)
    config_file = ut.resolve_input('config_file', cache)
    current = ut.resolve_input('remote_centroids', inputs, cache)
    previous = ut.resolve_input('previous_centroids', inputs, cache)
    current = _as_arrays(current)
    previous = _as_arrays(previous)

    parser = configparser.ConfigParser()
    parser.read(config_file)
    tolerance = float(parser['REMOTE']['epsilon'])

    converged, delta = local.check_stopping(current, previous, tolerance)
    message = 'REMOTE: Convergence Delta is %f, Converged is %s' % (delta, converged)
    ut.log(message, state)

    # The convergence verdict is encoded in the phase name.
    if converged:
        phase_name = "dkm_remote_converged_true"
    else:
        phase_name = "dkm_remote_converged_false"

    return {
        'output': {
            'computation_phase': phase_name,
            'delta': delta,
            'remote_centroids': current,
        },
        'state': state,
        'cache': cache,
    }
Exemplo n.º 6
0
def dkm_remote_init_centroids(args, config_file=CONFIG_FILE, **kwargs):
    """
        # Description:
            Choose the K starting centroids for the remote.

            Centroids already present under 'remote_centroids' in the inputs
            (checked first) or in the cache are reused. Otherwise the
            'local_centroids' of every site are pooled, shuffled in place
            with np.random.shuffle, and the first k are kept.

        # PREVIOUS PHASE:
            local_init_centroids

        # INPUT:

            |   name             |   resolved from   |
            |   ---              |   ---             |
            |   config_file      |   cache           |

            The `config_file` keyword argument is replaced by the value
            resolved from the cache. k is read as an int from the [REMOTE]
            section of the config.

        # OUTPUT:
            - work_dir: '.'
            - config_file
            - computation_phase: "dkm_remote_init_centroids"
            - remote_centroids

            config_file and remote_centroids are also stored in the cache.
    """
    state, inputs, cache = ut.resolve_args(args)
    ut.log('REMOTE: Initializing centroids', state)
    config_file = ut.resolve_input('config_file', cache)
    parser = configparser.ConfigParser()
    parser.read(config_file)
    config_dump = str(dict(parser))
    ut.log('Config file %s, looks like %s' % (config_file, config_dump), state)
    k = int(parser['REMOTE']['k'])

    # Prefer centroids that already exist: inputs first, then the cache.
    for source in (inputs, cache):
        if 'remote_centroids' in source.keys():
            remote_centroids = source['remote_centroids']
            break
    else:
        # Nothing stored yet: gather every site's candidates into one pool
        # and keep k of them after a random shuffle.
        pool = []
        for site in inputs:
            site_keys = str(inputs[site].keys())
            ut.log('Local site %s sent inputs with keys %s' % (site, site_keys),
                   state)
            pool.extend(inputs[site]['local_centroids'])
        np.random.shuffle(pool)
        remote_centroids = pool[:k]

    cache['config_file'] = config_file
    cache['remote_centroids'] = remote_centroids
    payload = {
        'work_dir': '.',
        'config_file': config_file,
        'computation_phase': "dkm_remote_init_centroids",
        'remote_centroids': remote_centroids,
    }
    return {'output': payload, 'state': state, 'cache': cache}
Exemplo n.º 7
0
def dkm_local_init_env(args,
                       config_file=CONFIG_FILE,
                       k=DEFAULT_k,
                       optimization=DEFAULT_optimization,
                       shuffle=DEFAULT_shuffle,
                       learning_rate=DEFAULT_learning_rate,
                       **kwargs):
    """
        # Description:
            Initialize the local environment by writing the config file
            into the output directory.

        # PREVIOUS PHASE:
            remote_init_env

        # INPUT:

            |   name            |   source                      |
            |   ---             |   ---                         |
            |   config_file     |   keyword argument            |
            |   k               |   keyword argument            |
            |   optimization    |   keyword argument            |
            |   shuffle         |   keyword argument            |
            |   learning_rate   |   keyword argument            |
            |   data_file       |   cache key 'all_windows'     |

        # OUTPUT:
            - config_file: path of the written file, i.e. `config_file`
              joined onto state['outputDirectory']
            - computation_phase: "dkm_local_init_env"

            The config path is also stored in the cache under 'config_file'.
            The written file has one [LOCAL] section holding k,
            optimization, shuffle, data_file and learning_rate.

        # NEXT PHASE:
            local_init_centroids
    """
    state, inputs, cache = ut.resolve_args(args)
    data_file = ut.resolve_input('all_windows', cache)
    ut.log('LOCAL: Initializing remote environment', state)

    config_path = os.path.join(state['outputDirectory'], config_file)
    cache['config_file'] = config_path

    parser = configparser.ConfigParser()
    # Key order is kept as-is because it determines the order in the file.
    parser['LOCAL'] = {
        'k': k,
        'optimization': optimization,
        'shuffle': shuffle,
        'data_file': data_file,
        'learning_rate': learning_rate,
    }
    with open(config_path, 'w') as handle:
        parser.write(handle)

    payload = {
        'config_file': config_path,
        'computation_phase': "dkm_local_init_env",
    }
    return {'output': payload, 'state': state, 'cache': cache}
Exemplo n.º 8
0
def dkm_remote_aggregate_optimizer(args, config_file=CONFIG_FILE):
    """
        # Description:
            Combine the optimizers sent by the local sites into a single
            remote optimizer.

            The per-site 'local_optimizer' values are summed with
            remote.aggregate_sum. Entries that are not exactly ndarrays
            trigger a conversion of all entries with np.array. For 'lloyd'
            optimization each entry is then divided by the number of sites.

        # PREVIOUS PHASE:
            local_compute_optimizer

        # INPUT:

            |   name             |   resolved from   |
            |   ---              |   ---             |
            |   config_file      |   cache           |

            The `config_file` keyword argument is replaced by the value
            resolved from the cache. The optimization mode is read from the
            [REMOTE] section of the config.

        # OUTPUT:
            - remote_optimizer
            - computation_phase: "dkm_remote_aggregate_optimizer"

            remote_optimizer is also stored in the cache.

        # RAISES:
            Exception when the array conversion or the division fails; the
            message lists the length of every optimizer entry.

        # NEXT PHASE:
            remote_optimization_step
    """
    def _length_report(entries):
        # Build the diagnostic raised when conversion or averaging fails.
        lengths = [len(entry) for entry in entries]
        return Exception("Hit valueerror. Remote optimizer types are %s" % lengths)

    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    parser = configparser.ConfigParser()
    parser.read(config_file)
    optimization = parser['REMOTE']['optimization']
    ut.log('REMOTE: Aggregate optimizer', state)

    site_optimizers = []
    for site in inputs:
        site_optimizers.append(inputs[site]['local_optimizer'])
    site_count = len(site_optimizers)
    remote_optimizer = remote.aggregate_sum(site_optimizers)

    # Convert everything as soon as a single entry is not a plain ndarray.
    if any([type(entry) is not np.ndarray for entry in remote_optimizer]):
        try:
            remote_optimizer = [np.array(entry) for entry in remote_optimizer]
        except Exception:
            raise _length_report(remote_optimizer)

    if optimization == 'lloyd':
        # for the mean, we need to further divide the number of sites
        try:
            remote_optimizer = [entry / site_count for entry in remote_optimizer]
        except Exception:
            raise _length_report(remote_optimizer)

    cache['remote_optimizer'] = remote_optimizer
    payload = {
        'remote_optimizer': remote_optimizer,
        'computation_phase': "dkm_remote_aggregate_optimizer",
    }
    return {'output': payload, 'state': state, 'cache': cache}
Exemplo n.º 9
0
def dkm_local_init_centroids(args,
                             config_file=CONFIG_FILE,
                             **kwargs):
    """
        # Description:
            Initialize K centroids from own data.

        # PREVIOUS PHASE:
            local_init_env

        # INPUT:

            |   name             |   resolved from   |
            |   ---              |   ---             |
            |   config_file      |   cache           |

            The `config_file` keyword argument is replaced by the value
            resolved from the cache. data_file and k are read from the
            [LOCAL] section of the config.

        # OUTPUT:
            - config_file
            - local_centroids: result of local.initialize_own_centroids
            - computation_phase: "dkm_local_init_env"

            The centroids are also stored in the cache under
            'local_centroids' and saved as 'initial_centroids' (np.save
            appends '.npy') in state['outputDirectory'].

        # NEXT PHASE:
            remote_init_centroids
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    ut.log('LOCAL: Initializing centroids', state)
    config = configparser.ConfigParser()
    config.read(config_file)
    data = np.load(config['LOCAL']['data_file'])
    centroids = local.initialize_own_centroids(data, int(config['LOCAL']['k']))
    # Save the computed centroids themselves; the previous call passed the
    # string literal 'centroids' and so wrote a useless one-word array.
    np.save(os.path.join(state['outputDirectory'], 'initial_centroids'), centroids)
    ut.log('Local centroids looks like %s' % (str(type(centroids))), state)
    # output
    cache['local_centroids'] = centroids
    # NOTE(review): the phase name below is the one dkm_local_init_env also
    # emits; it looks copy-pasted, but it is kept because the phase router
    # that consumes it is not visible here - confirm before renaming.
    computation_output = dict(output=dict(
        config_file=config_file,
        local_centroids=centroids,
        computation_phase="dkm_local_init_env"),
        state=state,
        cache=cache
    )
    return computation_output