    sc = SparkContext.getOrCreate(conf=conf)

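    # Ship the helper modules to every executor so the imports
    # below also resolve on the workers.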
    sc.addFile("/nfs/paper-big-data-engines/utils.py")
    sc.addFile("/nfs/paper-big-data-engines/histogram/Histogram.py")
    from utils import benchmark, crawl_dir, read_img
    from Histogram import (
        calculate_histogram,
        combine_histogram,
        flatten,
        save_histogram,
    )

    print("Connected")

    # Read images
    paths = crawl_dir(os.path.abspath(args.bb_dir))
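    # One partition per image file.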
    paths = sc.parallelize(paths, len(paths))
    img_rdd = paths.map(lambda p: read_img(p, start=start, args=args))

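    # Flatten each image's voxel data; elements stay (filename, data) pairs.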
    img_rdd = img_rdd.map(
        lambda x: flatten(x[1], start=start, args=args, filename=x[0]))

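    # Compute a per-image partial histogram in parallel.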
    partial_histogram = img_rdd.map(lambda x: calculate_histogram(
        x[1], args=args, start=start, filename=x[0]))

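    # Reduce the partial histograms into a single global histogram,
    # starting from an all-zero accumulator.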
    histogram = partial_histogram.fold(
        np.zeros(2**16 - 1, dtype=int),
        lambda x, y: combine_histogram(x, y, args=args, start=start),
    )

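    # Persist the aggregated histogram.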
    save_histogram(histogram, args=args, start=start)
Example #2
                        action="store_true",
                        help="benchmark results")

    args = parser.parse_args()

    # Connect to the Dask scheduler specified on the command line.
    cluster = args.scheduler
    client = Client(cluster)

    print(client)
    client.upload_file(
        "/nfs/SOEN-499-Project/utils.py")  # Allow workers to use module
    client.upload_file("/nfs/SOEN-499-Project/kmeans/Kmeans.py")

    # Read images
    paths = crawl_dir(os.path.abspath("test/data"))

    img = [read_img(path, start=start, args=args) for path in paths]

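    # Concatenate all image data into a single flat Dask array of voxels.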
    voxels = da.concatenate([x[1] for x in img]).reshape(-1)

    centroids = [0.0, 125.8, 251.6, 377.4]  # Initial centroids
    voxel_pair = None

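    # Build a lazy histogram of voxel intensities, dropping empty bins.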
    bincount = da.bincount(voxels)
    bincount = bincount[bincount != 0]
    unique = da.unique(voxels)

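    # Evaluate both lazy results in a single pass over the data.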
    unique, counts = dask.compute(unique, bincount)

    for i in range(args.iterations):  # Fixed iteration count; convergence is not checked.