Example #1
def main():
    nClasses = 2
    nFeatures = 20

    # read training data from file with 20 features per observation and 1 class label
    # and use only a chunk per process
    trainfile = "./data/batch/binary_cls_train.csv"
    train_data = np.split(read_csv(trainfile, range(nFeatures)), d4p.num_procs())[d4p.my_procid()]
    train_labels = np.split(read_csv(trainfile, range(nFeatures, nFeatures + 1)), d4p.num_procs())[d4p.my_procid()]

    # set parameters and train
    train_alg = d4p.logistic_regression_training(nClasses=nClasses, interceptFlag=True, distributed=True)
    train_result = train_alg.compute(train_data, train_labels)

    # Now let's do some prediction
    # It operates on the same data on each process
    # read testing data from file with 20 features per observation
    testfile = "./data/batch/binary_cls_test.csv"
    predict_data = read_csv(testfile, range(nFeatures))
    predict_labels = read_csv(testfile, range(nFeatures, nFeatures + 1))
    
    # set parameters and compute predictions
    predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses)
    predict_result = predict_alg.compute(predict_data, train_result.model)
    
    # the prediction result provides the predicted class labels
    assert predict_result.prediction.shape == (predict_data.shape[0], train_labels.shape[1])
    
    return (train_result, predict_result, predict_labels)
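
These snippets are excerpts from daal4py SPMD (distributed) examples. They assume daal4py is imported as d4p, numpy as np, and a few small local helpers such as read_csv, np_read_csv and get_chunk_params. Below is a minimal sketch of what those imports and helpers could look like, assuming pandas is available; the actual helpers in the original example files may differ.

import numpy as np
import pandas as pd
import daal4py as d4p


def read_csv(f, c=None, sr=0, nr=None, t=np.float64):
    # read a headerless CSV as a float64 array, optionally restricted to
    # columns c and to nr rows after skipping the first sr rows
    return pd.read_csv(f, usecols=c, delimiter=',', header=None,
                       skiprows=sr, nrows=nr, dtype=t).values


def np_read_csv(f, t=np.float64):
    # plain numpy-based CSV reader
    return np.loadtxt(f, delimiter=',', dtype=t, ndmin=2)


def get_chunk_params(lines_count, chunks_count, chunk_number):
    # split lines_count rows into chunks_count contiguous chunks and return
    # (skiprows, nrows) for this chunk; the last chunk absorbs any remainder
    chunk_size = lines_count // chunks_count
    skiprows = chunk_size * chunk_number
    nrows = (lines_count - skiprows
             if chunk_number == chunks_count - 1 else chunk_size)
    return skiprows, nrows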
Example #2
def main():
    infile = "./data/batch/covcormoments_dense.csv"

    # We know the number of lines in the file and use this to separate data between processes
    skiprows, nrows = get_chunk_params(lines_count=200,
                                       chunks_count=d4p.num_procs(),
                                       chunk_number=d4p.my_procid())

    # Each process reads its chunk of the file
    data = read_csv(infile, sr=skiprows, nr=nrows)

    # Create algorithm with distributed mode
    alg = d4p.low_order_moments(method='defaultDense', distributed=True)

    # Perform computation
    res = alg.compute(data)

    # result provides minimum, maximum, sum, sumSquares, sumSquaresCentered,
    # mean, secondOrderRawMoment, variance, standardDeviation, variation
    assert (all(
        getattr(res, name).shape == (1, data.shape[1]) for name in [
            'minimum', 'maximum', 'sum', 'sumSquares', 'sumSquaresCentered',
            'mean', 'secondOrderRawMoment', 'variance', 'standardDeviation',
            'variation'
        ]))

    return res
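
For context, these main() functions are driven the same way in the daal4py distributed examples: initialize SPMD mode, compute, then finalize, and launch the script under MPI. A minimal driver sketch (the script name in the launch command is illustrative):

if __name__ == "__main__":
    # SPMD mode must be initialized before any distributed compute call
    d4p.daalinit()
    res = main()
    # ... use or print the result here ...
    d4p.daalfinalize()

# Run under MPI so d4p.num_procs() / d4p.my_procid() reflect the launch, e.g.:
#   mpirun -n 4 python spmd_example.py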
Example #3
def main(method='plusPlusDense'):
    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    init_algo = d4p.kmeans_init(nClusters, method=method, distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data, it would have been better to read only what we need, of course...
    rpp = int(data.shape[0]/d4p.num_procs())
    data = data[rpp*d4p.my_procid():rpp*d4p.my_procid()+rpp,:]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The result provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)
    
    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(data, d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)

    # Kmeans result objects provide centroids, goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.nIterations <= maxIter
    # we need an extra call to kmeans to get the assignments (not directly supported through parameter assignFlag yet in SPMD mode)
    algo = d4p.kmeans(nClusters, 0, assignFlag=True) # maxIt=0; not distributed, we compute on local data only!
    assignments = algo.compute(data, result.centroids).assignments

    return (assignments, result)
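
In SPMD mode each process computes assignments only for its own slice of the data. If the per-process pieces need to be combined, something along these lines could be used, assuming mpi4py is installed and the script runs under the same MPI launch; this is an add-on sketch, not part of the daal4py example:

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
d4p.daalinit()
assignments, result = main()
# gather the per-process assignment arrays on rank 0 and stack them
gathered = comm.gather(assignments, root=0)
if comm.Get_rank() == 0:
    all_assignments = np.concatenate(gathered)
    print("total assigned rows:", all_assignments.shape[0])
d4p.daalfinalize()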
Example #4
        def test_dbscan_spmd(self):
            epsilon = 0.04
            minObservations = 45
            data = np_read_csv(
                os.path.join(".", 'data', 'batch', 'dbscan_dense.csv'))

            batch_algo = d4p.dbscan(minObservations=minObservations,
                                    epsilon=epsilon,
                                    resultsToCompute='computeCoreIndices')
            batch_result = batch_algo.compute(data)

            rpp = int(data.shape[0] / d4p.num_procs())
            node_stride = rpp * d4p.my_procid()
            node_range = range(node_stride, node_stride + rpp)
            node_data = data[node_range, :]

            spmd_algo = d4p.dbscan(minObservations=minObservations,
                                   epsilon=epsilon,
                                   distributed=True)
            spmd_result = spmd_algo.compute(node_data)

            # clusters can get different indices in the batch and SPMD
            # algorithms, so map between the two label sets before comparing
            cluster_index_dict = {}
            for i in node_range:
                # border point assignments can differ with a different
                # number of nodes, but core points stay the same
                if i in batch_result.coreIndices:
                    right = spmd_result.assignments[i - node_stride][0]
                    batch_label = batch_result.assignments[i][0]
                    if batch_label not in cluster_index_dict:
                        cluster_index_dict[batch_label] = right
                    left = cluster_index_dict[batch_label]
                    self.assertTrue(left == right)
Example #5
        def test_kmeans_spmd(self):
            nClusters = 10
            maxIter = 25

            data = np.loadtxt("./data/distributed/kmeans_dense.csv",
                              delimiter=',')

            rpp = int(data.shape[0] / d4p.num_procs())
            spmd_data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() +
                             rpp, :]

            for init_method in [
                    'plusPlusDense', 'parallelPlusDense', 'deterministicDense'
            ]:
                batch_init_res = d4p.kmeans_init(
                    nClusters=nClusters, method=init_method).compute(data)
                spmd_init_res = d4p.kmeans_init(
                    nClusters=nClusters, method=init_method,
                    distributed=True).compute(spmd_data)

                if init_method in ['parallelPlusDense']:
                    print("Warning: It is well known "
                          "that results of parallelPlusDense init "
                          "does not match with batch algorithm")
                else:
                    reason = "Initial centroids with " + init_method
                    reason += " does not match with batch algorithm"
                    self.assertTrue(
                        np.allclose(batch_init_res.centroids,
                                    spmd_init_res.centroids), reason)

                batch_res = d4p.kmeans(nClusters=nClusters,
                                       maxIterations=maxIter).compute(
                                           data, batch_init_res.centroids)
                spmd_res = d4p.kmeans(nClusters=nClusters,
                                      maxIterations=maxIter,
                                      distributed=True).compute(
                                          spmd_data, spmd_init_res.centroids)

                if init_method in ['parallelPlusDense']:
                    print("Warning: It is well known "
                          "that results of parallelPlusDense init "
                          "does not match with batch algorithm")
                else:
                    reason = "Final centroids with " + init_method
                    reason += " does not match with batch algorithm"
                    self.assertTrue(
                        np.allclose(batch_res.centroids, spmd_res.centroids),
                        reason)
Example #6
    def test_dbscan_spmd(self):
        import dbscan_spmd as ex
        result = self.call(ex)
        test_data = np_read_csv(
            os.path.join(unittest_data_path, "dbscan_batch.csv"))
        rpp = int(test_data.shape[0] / d4p.num_procs())
        test_data = test_data[rpp * d4p.my_procid():
                              rpp * d4p.my_procid() + rpp, :]
        # clusters can get different indices in the batch and SPMD algorithms,
        # so map between the two label sets before comparing
        cluster_index_dict = {}
        for i in range(test_data.shape[0]):
            batch_label = test_data[i][0]
            if batch_label not in cluster_index_dict:
                cluster_index_dict[batch_label] = result.assignments[i][0]
            self.assertTrue(cluster_index_dict[batch_label] ==
                            result.assignments[i][0])
Example #7
def main(method='defaultDense'):
    infile = "./data/batch/dbscan_dense.csv"
    epsilon = 0.04
    minObservations = 45

    # Load the data
    data = np.loadtxt(infile, delimiter=',')
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid(): rpp * d4p.my_procid() + rpp, :]

    # configure dbscan main object
    algo = d4p.dbscan(minObservations=minObservations, epsilon=epsilon, distributed=True)
    # and compute
    result = algo.compute(data)

    return result
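
As in the other SPMD snippets, each process ends up with DBSCAN assignments for its own chunk of rows only. A minimal driver for the excerpt above could look like this (the printing is purely illustrative):

if __name__ == "__main__":
    d4p.daalinit()
    result = main()
    # every process reports the cluster labels found in its local chunk
    print("process", d4p.my_procid(), "labels:", np.unique(result.assignments))
    d4p.daalfinalize()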
Example #8
def main():
    infile = "./data/batch/covcormoments_dense.csv"

    # We know the number of lines in the file and use this to separate data between processes
    skiprows, nrows = get_chunk_params(lines_count=200,
                                       chunks_count=d4p.num_procs(),
                                       chunk_number=d4p.my_procid())

    # Each process reads its chunk of the file
    data = read_csv(infile, sr=skiprows, nr=nrows)

    # Create algorithm with distributed mode
    alg = d4p.covariance(method="defaultDense", distributed=True)

    # Perform computation
    res = alg.compute(data)

    # covariance result objects provide correlation, covariance and mean
    assert res.covariance.shape == (data.shape[1], data.shape[1])
    assert res.mean.shape == (1, data.shape[1])
    assert res.correlation.shape == (data.shape[1], data.shape[1])

    return res
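
The covariance result is small, so such examples commonly print it from a single process to avoid emitting one copy of the output per rank. A sketch of that pattern:

if __name__ == "__main__":
    d4p.daalinit()
    res = main()
    # print from one process only so the output is not repeated per rank
    if d4p.my_procid() == 0:
        print("Mean:\n", res.mean)
        print("Covariance matrix:\n", res.covariance)
    d4p.daalfinalize()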
Example #9
    # Initialize SPMD mode
    d4p.daalinit()

    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    init_algo = d4p.kmeans_init(nClusters,
                                method="plusPlusDense",
                                distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data, it would have been better to read only what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The result provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(data, d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)
Example #10
max_iter = 300
acc_tres = 1e-4

img = Image.open(
    './Yushan.jpg'
)  #https://commons.wikimedia.org/wiki/File:%E7%8E%89%E5%B1%B1%E4%B8%BB%E5%B3%B0_02.jpg
img.load()

china = np.array(img, dtype=np.float64) / 255

# Transform the image into a 2D numpy array (pixels x color channels).
w, h, d = original_shape = tuple(china.shape)
assert d == 3
image_array = np.reshape(china, (w * h, d))
o_colors = 344038  #Yushan
n_slices = int(image_array.shape[0] / d4p.num_procs())

print("Number of MPI tasks: ", d4p.num_procs())

image_array = image_array[n_slices *
                          d4p.my_procid():n_slices * d4p.my_procid() +
                          n_slices, :]

print("Fitting model on the data")
t0 = time()

# compute initial centroids
init_result = init_algo.compute(image_array)
assert init_result.centroids.shape[0] == n_colors
# configure kmeans main object
algo = d4p.kmeans(n_colors, max_iter, distributed=True)
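
The color-quantization excerpt above relies on n_colors, init_algo, Image and time being defined or imported earlier in its source file; they are not part of the excerpt. The missing pieces presumably look roughly like the following, with the value of n_colors being an assumption:

from time import time
from PIL import Image

n_colors = 64  # target palette size; assumed value, the original may differ
init_algo = d4p.kmeans_init(n_colors,
                            method="plusPlusDense",
                            distributed=True)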
Example #11
    # Initialize SPMD mode
    d4p.daalinit()

    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    initrain_algo = d4p.kmeans_init(nClusters,
                                    method="plusPlusDense",
                                    distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # We need partitioned input data, so let's slice the data
    rpp = int(data.shape[0] / d4p.num_procs())
    data = [data[rpp * x:rpp * x + rpp, :] for x in range(d4p.num_procs())]
    # Note, providing a list of files instead also distributes the file read!

    # compute initial centroids
    initrain_result = initrain_algo.compute(data)
    # The result provides the initial centroids
    assert initrain_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, initrain_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(data, d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)