Example #1
    def _check_array(self, X):
        t0 = tic()

        if isinstance(X, pd.DataFrame):
            X = X.values

        elif isinstance(X, dd.DataFrame):
            raise TypeError("Cannot fit on dask.dataframe due to unknown "
                            "partition lengths.")

        if X.dtype == 'int32':
            X = X.astype('float32')
        elif X.dtype == 'int64':
            X = X.astype('float64')

        X = check_array(X,
                        accept_dask_dataframe=False,
                        accept_unknown_chunks=False,
                        accept_sparse=False)

        if isinstance(X, np.ndarray):
            X = da.from_array(X,
                              chunks=(max(1,
                                          len(X) // cpu_count()), X.shape[-1]))

        bad = (da.isnull(X).any(), da.isinf(X).any())
        if any(*compute(bad)):
            msg = ("Input contains NaN, infinity or a value too large for "
                   "dtype('float64').")
            raise ValueError(msg)
        t1 = tic()
        logger.info("Finished check_array in %0.2f s", t1 - t0)
        return X
Example #2
def main(args=None):
    args = parse_args(args)
    steps = range(args.start, args.stop, args.step)
    if args.scheduler_address:
        client = Client(args.scheduler_address)
        info = client.scheduler_info()
        logger.info("Distributed mode: %s", client.scheduler)
        logger.info("Dashboard: %s:%s", info["address"],
                    info["services"]["bokeh"])
    else:
        logger.warning("Local mode")

    logger.info("Fitting for %s", list(steps))

    logger.info("Reading data")
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)

    timings = []

    for n_clusters in range(args.start, args.stop, args.step):
        logger.info("Starting %02d", n_clusters)
        t0 = tic()
        with _timer(n_clusters, _logger=logger):
            km = do(X, n_clusters, factor=args.factor)
        t1 = tic()
        logger.info("Cluster Centers [%s]:\n%s", n_clusters,
                    km.cluster_centers_)
        inertia = km.inertia_.compute()
        logger.info("Inertia [%s]: %s", n_clusters, inertia)
        timings.append((n_clusters, args.factor, t1 - t0, inertia))

    pd.DataFrame(timings, columns=["n_clusters", "factor", "time",
                                   "inertia"]).to_csv("timings.csv")
Example #3
def main(args=None):
    args = parse_args(args)

    ctx = directory = tempfile.TemporaryDirectory()

    with ctx:
        original = os.path.join(str(directory), args.original)
        split = os.path.join(str(directory), args.split)
        final = os.path.join(str(directory), args.final)

        shape = (args.n_slices, ) + args.shape
        chunks = (1, ) + args.shape
        a = da.random.random(shape, chunks=chunks)
        a.to_zarr(original, overwrite=True)

        with Client():
            print("rechunking")
            t0 = tic()

            with performance_report():
                rechunk.rechunk(original, split, final, args.split_chunks)
                t1 = tic()

        took = t1 - t0
        gbs = a.nbytes / 1e9 / took
        print(
            f"Rechunked {dask.utils.format_bytes(a.nbytes)} in {took:.2f}s ({gbs:0.2f} GB/s)"
        )
Example #4
def main(args=None):
    args = parse_args(args)
    steps = range(args.start, args.stop, args.step)
    if args.scheduler_address:
        client = Client(args.scheduler_address)
        info = client.scheduler_info()
        logger.info("Distributed mode: %s", client.scheduler)
        logger.info("Dashboard: %s:%s", info['address'],
                    info['services']['bokeh'])
    else:
        logger.warning("Local mode")

    logger.info("Fitting for %s", list(steps))

    logger.info("Reading data")
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)

    timings = []

    for n_clusters in range(args.start, args.stop, args.step):
        logger.info("Starting %02d", n_clusters)
        t0 = tic()
        km = do(X, n_clusters, factor=args.factor)
        t1 = tic()
        logger.info("Finished %02d, [%.2f]", n_clusters, t1 - t0)
        logger.info("Cluster Centers [%s]:\n%s", n_clusters,
                    km.cluster_centers_)
        inertia = km.inertia_.compute()
        logger.info("Inertia [%s]: %s", n_clusters, inertia)
        timings.append((n_clusters, args.factor, t1 - t0, inertia))

    pd.DataFrame(timings, columns=['n_clusters', 'factor', 'time',
                                   'inertia']).to_csv('timings.csv')
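The timings.csv written above can be inspected after the run; a minimal sketch of reading it back and plotting inertia against n_clusters (an elbow-style plot; the matplotlib usage is an illustration, not part of the original script):

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('timings.csv', index_col=0)
ax = df.plot(x='n_clusters', y='inertia', marker='o', legend=False)
ax.set_xlabel('n_clusters')
ax.set_ylabel('inertia')
plt.savefig('elbow.png')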
Example #5
def wrapper(*args, **kwargs):
    # TODO: grab config.
    # TODO: structlog or something similar
    t0 = tic()
    result = func(*args, **kwargs)
    t1 = tic()
    timings[func.__name__].append(t1 - t0)
    return result
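This wrapper is the inner function of a timing decorator: func and timings come from an enclosing scope that is not shown. A minimal sketch of what that enclosing decorator could look like, assuming a module-level timings mapping (the names timed and timings are illustrative):

import functools
from collections import defaultdict
from timeit import default_timer as tic

timings = defaultdict(list)  # function name -> list of elapsed seconds


def timed(func):
    """Record the wall-clock duration of every call to ``func``."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t0 = tic()
        result = func(*args, **kwargs)
        t1 = tic()
        timings[func.__name__].append(t1 - t0)
        return result

    return wrapper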
Example #6
def _partial_fit(model, x, y, kwargs=None):
    kwargs = kwargs or dict()
    start = tic()
    logger.info("Starting partial-fit %s", dask.base.tokenize(model, x, y))
    model.partial_fit(x, y, **kwargs)
    stop = tic()
    logger.info("Finished partial-fit %s [%0.2f]",
                dask.base.tokenize(model, x, y), stop - start)
    return model
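A usage sketch of _partial_fit above: fold a model over the delayed blocks of a dask array so the partial_fit calls run one after another. The SGDClassifier, the chunk layout, and the classes kwarg are illustrative assumptions, not part of the original snippet:

import dask.array as da
from dask import delayed
from sklearn.linear_model import SGDClassifier

X = da.random.random((1000, 4), chunks=(100, 4))
y = (da.random.random(1000, chunks=100) > 0.5).astype(int)

# Chain one _partial_fit task per block; each task depends on the previous
# model, so the updates are applied sequentially.
result = delayed(SGDClassifier())
for xb, yb in zip(X.to_delayed().ravel(), y.to_delayed().ravel()):
    result = delayed(_partial_fit)(result, xb, yb, kwargs={"classes": [0, 1]})

model = result.compute()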
Example #7
def init_scalable(X,
                  n_clusters,
                  random_state=None,
                  max_iter=None,
                  oversampling_factor=2):
    """K-Means initialization using k-means||

    This is algorithm 2 in Scalable K-Means++ (2012).
    """
    if isinstance(random_state, Integral) or random_state is None:
        random_state = da.random.RandomState(random_state)

    logger.info("Initializing with k-means||")
    init_start = tic()
    # Step 1: Initialize Centers
    idx = 0
    centers = da.compute(X[idx, np.newaxis])[0]
    c_idx = {idx}

    # Step 2: Initialize cost
    cost = evaluate_cost(X, centers)
    # TODO: natural log10? log2?
    n_iter = int(np.round(np.log(cost)))
    if max_iter is not None:
        n_iter = min(max_iter, n_iter)

    # Steps 3 - 6: update candidate Centers
    for i in range(n_iter):
        t0 = tic()
        new_idxs = _sample_points(X, centers, oversampling_factor,
                                  random_state)
        new_idxs = set(*compute(new_idxs))
        c_idx |= new_idxs
        t1 = tic()
        logger.info("init iteration %2d/%2d %.2f s, %2d centers", i + 1,
                    n_iter, t1 - t0, len(c_idx))
        # Sort before slicing, for better performance / memory
        # usage with the scheduler.
        # See https://github.com/dask/dask-ml/issues/39
        centers = X[sorted(c_idx)].compute()

    # XXX: scikit-learn doesn't have weighted k-means.
    # The paper weights each center by the number of points closest to it.
    # https://stackoverflow.com/a/37198799/1889400 claims you can scale the
    # features before clustering, but that doesn't seem right.
    # I think that replicating the *points*, proportional to the number of
    # original points closest to the candidate centers, would be a better way
    # to do that.

    # Step 7, 8 without weights
    km = sk_k_means.KMeans(n_clusters)
    km.fit(centers)
    logger.info("Finished initialization. %.2f s, %2d centers",
                tic() - init_start, n_clusters)
    return km.cluster_centers_
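A rough sketch of the point-replication weighting suggested in the comment above: weight each candidate center by the number of points closest to it by replicating the candidates before running plain k-means. The helper name and the in-memory X_sample are illustrative assumptions, not dask-ml behaviour:

import numpy as np
from sklearn import cluster as sk_k_means
from sklearn.metrics import pairwise_distances_argmin


def _replicated_kmeans(X_sample, centers, n_clusters):
    # X_sample: an in-memory (NumPy) sample of the data.
    # Replicate each candidate center proportionally to how many points
    # are closest to it, then run ordinary k-means on the replicated set.
    closest = pairwise_distances_argmin(X_sample, centers)
    counts = np.bincount(closest, minlength=len(centers))
    replicated = np.repeat(centers, np.maximum(counts, 1), axis=0)
    km = sk_k_means.KMeans(n_clusters)
    km.fit(replicated)
    return km.cluster_centers_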
Example #8
def init_random(X, n_clusters, random_state):
    """K-means initialization using randomly chosen points"""
    logger.info("Initializing randomly")
    t0 = tic()

    idx = sorted(random_state.randint(0, len(X), size=n_clusters))
    centers = X[idx].compute()

    logger.info("Finished initialization. %.2f s, %2d centers",
                tic() - t0, n_clusters)
    return centers
Example #9
def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means||',
                         verbose=False, x_squared_norms=None,
                         random_state=None, tol=1e-4,
                         precompute_distances=True,
                         oversampling_factor=2,
                         init_max_iter=None):
    centers = k_init(X, n_clusters, init=init,
                     oversampling_factor=oversampling_factor,
                     random_state=random_state, max_iter=init_max_iter)
    dt = X.dtype
    X = X.astype(np.float32)
    P = X.shape[1]
    for i in range(max_iter):
        t0 = tic()
        centers = centers.astype('f4')
        labels, distances = pairwise_distances_argmin_min(
            X, centers, metric='euclidean', metric_kwargs={"squared": True}
        )

        labels = labels.astype(np.int32)
        distances = distances.astype(np.float32)

        r = da.atop(_centers_dense, 'ij',
                    X, 'ij',
                    labels, 'i',
                    n_clusters, None,
                    distances, 'i',
                    adjust_chunks={"i": n_clusters, "j": P},
                    dtype='f8')
        new_centers = da.from_delayed(
            sum(r.to_delayed().flatten()),
            (n_clusters, P),
            X.dtype
        )
        counts = da.bincount(labels, minlength=n_clusters)
        new_centers = new_centers / counts[:, None]
        new_centers, = compute(new_centers)

        # Convergence check
        shift = squared_norm(centers - new_centers)
        t1 = tic()
        logger.info("Lloyd loop %2d. Shift: %0.4f [%.2f s]", i, shift, t1 - t0)
        if shift < tol:
            break
        centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)
    inertia = distances.astype(dt).sum()
    centers = centers.astype(dt)
    labels = labels.astype(np.int64)

    return labels, inertia, centers, i + 1
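Both this example and Example #16 below rely on a _centers_dense block function applied by da.atop to each chunk. A minimal sketch of what such a per-block function might compute (partial per-cluster sums, padded to the full (n_clusters, n_features) shape so the blocks can simply be added); this is an assumption for illustration, not the dask-ml implementation, which also handles empty clusters and uses the distances argument:

import numpy as np


def _centers_dense(X_block, labels_block, n_clusters, distances_block):
    # Row k holds the sum of the points in this block assigned to cluster k.
    # Summing these per-block results and dividing by the global counts
    # (as done above) yields the new cluster centers.
    sums = np.zeros((n_clusters, X_block.shape[1]), dtype=X_block.dtype)
    np.add.at(sums, labels_block, X_block)
    return sums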
Example #10
def init_pp(X, n_clusters, random_state):
    """K-means initialization using k-means++

    This uses scikit-learn's implementation.
    """
    x_squared_norms = row_norms(X, squared=True).compute()
    logger.info("Initializing with k-means++")
    t0 = tic()
    centers = sk_k_means._k_init(X, n_clusters, random_state=random_state,
                                 x_squared_norms=x_squared_norms)
    logger.info("Finished initialization. %.2f s, %2d centers",
                tic() - t0, n_clusters)

    return centers
Example #11
def fit(data, use_scikit_learn=False):
    logger.info("Starting to cluster")
    # Cluster
    n_clusters = 8
    oversampling_factor = 2
    if use_scikit_learn:
        km = sk.KMeans(n_clusters=n_clusters, random_state=0)
    else:
        km = KMeans(n_clusters=n_clusters,
                    oversampling_factor=oversampling_factor,
                    random_state=0)
    t0 = tic()
    logger.info("Starting n_clusters=%2d, oversampling_factor=%2d",
                n_clusters, oversampling_factor)
    km.fit(data)
    t1 = tic()
    logger.info("Finished in %.2f s", t1 - t0)
Example #12
@contextlib.contextmanager
def _timer(name, _logger=None, level="info"):
    """
    Output execution time of a function to the given logger level

    Parameters
    ----------
    name : str
        How to name the timer (will be in the logs)
    _logger : logging.Logger, optional
        The logger to write to; defaults to the module-level ``logger``.
    level : str
        The logging level at which to record the elapsed time.
    """
    start = tic()
    _logger = _logger or logger
    _logger.info("Starting %s", name)
    yield
    stop = tic()
    delta = datetime.timedelta(seconds=stop - start)
    _logger_level = getattr(_logger, level)
    _logger_level("Finished %s in %s", name, delta)  # nicer formatting for time.
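Example #2 above already uses _timer as a context manager; a minimal standalone sketch (the step name and the timed work are illustrative):

with _timer('read-and-persist', _logger=logger, level='info'):
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)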
Example #13
    def fit(self, X, y=None, **kwargs):
        """Fit the underlying estimator.

        Parameters
        ----------
        X, y : array-like
        **kwargs
            Additional fit-kwargs for the underlying estimator.

        Returns
        -------
        self : object
        """
        start = tic()
        logger.info("Starting fit")
        result = self.estimator.fit(X, y, **kwargs)
        stop = tic()
        logger.info("Finished fit, %0.2f", stop - start)

        # Copy over learned attributes
        copy_learned_attributes(result, self)
        copy_learned_attributes(result, self.estimator)
        return self
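The fit above depends on a copy_learned_attributes helper that is not shown. A minimal sketch of what such a helper typically does (copy the fitted, trailing-underscore attributes from one estimator to another); this is an assumption, not necessarily the dask-ml implementation:

def copy_learned_attributes(from_estimator, to_estimator):
    # Fitted scikit-learn attributes conventionally end with an underscore
    # (coef_, classes_, ...); copy those onto the target object.
    fitted = {k: v for k, v in vars(from_estimator).items() if k.endswith('_')}
    for k, v in fitted.items():
        setattr(to_estimator, k, v)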
Example #14
This example shows how dask-ml's ``SpectralClustering`` scales with the
number of samples, compared to scikit-learn's implementation. The dask
version uses an approximation to the affinity matrix, which avoids an
expensive computation at the cost of some approximation error.
"""
from sklearn.datasets import make_circles
from sklearn.utils import shuffle
import pandas as pd

from timeit import default_timer as tic
import sklearn.cluster as scluster
import dask_ml.cluster as dcluster
import seaborn as sns

Ns = [2500, 5000, 7500]
X, y = make_circles(n_samples=10_000, noise=0.05, random_state=0, factor=0.5)
X, y = shuffle(X, y)

timings = []
for n in Ns:
    X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5)
    t1 = tic()
    dcluster.SpectralClustering(n_clusters=2, n_components=100).fit(X)
    timings.append(('nystrom', n, tic() - t1))
    t1 = tic()
    scluster.SpectralClustering(n_clusters=2).fit(X)
    timings.append(('exact', n, tic() - t1))

df = pd.DataFrame(timings, columns=['method', 'n_samples', 'time'])
sns.factorplot(x='n_samples', y='time', hue='method', data=df, aspect=1.5)
Example #15
    def evolve(self, generations=1, model_epochs=10, elites=1, verbose=2):
        log = "Generation,Fitness,TrainingTime\n"
        for generation in range(generations):
            epoch_start = tic()
            for i in range(self.pop_size):
                if (self.population[i].fitness == -1):
                    if (verbose >= 1):
                        print("Training model {}...".format(i + 1))
                        print(self.population[i])
                    training_start = tic()
                    self._get_fitness(i, epochs=model_epochs, verbose=verbose)
                    log += '{},{:.4f},{:.4f}\n'.format(
                        generation, self.population[i].fitness,
                        tic() - training_start)
                else:
                    if (verbose >= 1):
                        print("Model {} already trained".format(i + 1))
                    # Already trained: no extra training time to record.
                    log += '{},{:.4f},{:.4f}\n'.format(
                        generation, self.population[i].fitness, 0.0)
            self.population = sorted(self.population,
                                     key=lambda x: x.fitness,
                                     reverse=True)
            if (verbose >= 1):
                print("Best fitness for Generation {}: {:.4f}".format(
                    generation + 1, self.population[0].fitness))
            probs = np.array([gene.fitness for gene in self.population])
            total = probs.sum()
            probs = probs / total
            new_pop = self.population[:elites]  # Keep the most fit individuals
            for i in range(elites, self.pop_size):
                a, b = np.random.choice(self.pop_size,
                                        size=2,
                                        replace=True,
                                        p=probs)
                child = self.population[a].cross(self.population[b])
                child.mutate()
                new_pop.append(child)
            self.population = new_pop
            if (verbose >= 1):
                print("Generation {} duration: {:.4f}".format(
                    generation + 1,
                    tic() - epoch_start))
        if (verbose >= 1):
            print("Training final generation...")
        for i in range(self.pop_size):
            if (self.population[i].fitness == -1):
                if (verbose >= 1):
                    print("Training model {}...".format(i + 1))
                    print(self.population[i])
                training_start = tic()
                self._get_fitness(i, epochs=model_epochs, verbose=verbose)
                log += '{},{:.4f},{:.4f}\n'.format(generations,
                                                   self.population[i].fitness,
                                                   tic() - training_start)
            else:
                if (verbose >= 1):
                    print("Model {} already trained".format(i + 1))
                # Already trained: no extra training time to record.
                log += '{},{:.4f},{:.4f}\n'.format(
                    generations, self.population[i].fitness, 0.0)

        self.population = sorted(self.population,
                                 key=lambda x: x.fitness,
                                 reverse=True)
        with open("output_log.csv", 'w') as output_log:
            output_log.write(log)
        if (verbose >= 1):
            print("Final best fitness: {:.4f}".format(
                self.population[0].fitness))
Example #16
def _kmeans_single_lloyd(
    X,
    n_clusters,
    max_iter=300,
    init="k-means||",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
    oversampling_factor=2,
    init_max_iter=None,
):
    centers = k_init(
        X,
        n_clusters,
        init=init,
        oversampling_factor=oversampling_factor,
        random_state=random_state,
        max_iter=init_max_iter,
    )
    dt = X.dtype
    P = X.shape[1]
    for i in range(max_iter):
        t0 = tic()
        labels, distances = pairwise_distances_argmin_min(
            X, centers, metric="euclidean", metric_kwargs={"squared": True})

        labels = labels.astype(np.int32)
        # distances is always float64; cast a copy to X.dtype for
        # _centers_dense, while keeping the float64 values for the inertia.
        r = da.atop(
            _centers_dense,
            "ij",
            X,
            "ij",
            labels,
            "i",
            n_clusters,
            None,
            distances.astype(X.dtype),
            "i",
            adjust_chunks={
                "i": n_clusters,
                "j": P
            },
            dtype=X.dtype,
        )
        new_centers = da.from_delayed(sum(r.to_delayed().flatten()),
                                      (n_clusters, P), X.dtype)
        counts = da.bincount(labels, minlength=n_clusters)
        # Require at least one per bucket, to avoid division by 0.
        counts = da.maximum(counts, 1)
        new_centers = new_centers / counts[:, None]
        new_centers, = compute(new_centers)

        # Convergence check
        shift = squared_norm(centers - new_centers)
        t1 = tic()
        logger.info("Lloyd loop %2d. Shift: %0.4f [%.2f s]", i, shift, t1 - t0)
        if shift < tol:
            break
        centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)
        labels = labels.astype(np.int32)

    inertia = distances.sum()
    centers = centers.astype(dt)

    return labels, inertia, centers, i + 1
Example #17
    b_ix = pd.Index([1, 2, 3, 4, 5], name='b')

    concat = xr.concat([uav.Band1, uav.Band2, uav.Band3, uav.Band4, uav.Band5],
                       b_ix)
    # Mask nodata areas
    #concat = concat.where(concat.sum(dim='b') > 0)

    predicted = xarray_classify.classify_dataset(concat, clf_RF)

    # Just look at a subset area in this case (slice)
    #predicted = xarray_classify.classify_dataset(concat.isel(x=slice(3000,3500),y=slice(3000,3500)), clf_RF)

    # Calculate albedo
    #uav = uav.isel(x=slice(3000,3500),y=slice(3000,3500))
    #albedo = 0.726*(uav['Band2']-0.18) - 0.322*(uav['Band2']-0.18)**2 - 0.015*(uav['Band4']-0.2) + 0.581*(uav['Band4']-0.2)
    t1 = tic()
    albedo = (0.726 * uav['Band2'] - 0.322 * uav['Band2']**2
              - 0.015 * uav['Band4'] + 0.581 * uav['Band4'])
    print('xarray albedo (seconds): ', tic() - t1)

    # Save outputs

    if not setup:
        # Define projection
        srs = osr.SpatialReference()
        srs.ImportFromProj4('+init=epsg:32622')
        crs = xr.DataArray(0, encoding={'dtype': np.dtype('int8')})
        crs.attrs['projected_crs_name'] = srs.GetAttrValue('projcs')
        crs.attrs['grid_mapping_name'] = 'universal_transverse_mercator'
        crs.attrs['scale_factor_at_central_origin'] = srs.GetProjParm(
            'scale_factor')
Example #18
version uses an approximation to the affinity matrix, which avoids an
expensive computation at the cost of some approximation error.
"""
from sklearn.datasets import make_circles
from sklearn.utils import shuffle
import pandas as pd

from timeit import default_timer as tic
import sklearn.cluster
import dask_ml.cluster
import seaborn as sns

Ns = [2500, 5000, 7500, 10000]
X, y = make_circles(n_samples=10_000, noise=0.05, random_state=0, factor=0.5)
X, y = shuffle(X, y)

timings = []
for n in Ns:
    X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5)
    t1 = tic()
    sklearn.cluster.SpectralClustering(n_clusters=2).fit(X)
    timings.append(('Scikit-Learn (exact)', n, tic() - t1))
    t1 = tic()
    dask_ml.cluster.SpectralClustering(n_clusters=2, n_components=100).fit(X)
    timings.append(('dask-ml (approximate)', n, tic() - t1))


df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])
sns.factorplot(x='Number of Samples', y='Fit Time', hue='method',
               data=df, aspect=1.5)
Example #19
Comparison of scaling.
"""
from dask_ml.datasets import make_classification
import pandas as pd

from timeit import default_timer as tic
import sklearn.linear_model
import dask_ml.linear_model
import seaborn as sns

Ns = [2500, 5000, 7500, 10000]

timings = []

for n in Ns:
    X, y = make_classification(n_samples=n, random_state=n, chunks=n // 20)
    t1 = tic()
    sklearn.linear_model.LogisticRegression().fit(X, y)
    timings.append(('Scikit-Learn', n, tic() - t1))
    t1 = tic()
    dask_ml.linear_model.LogisticRegression().fit(X, y)
    timings.append(('dask-ml', n, tic() - t1))

df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])
sns.factorplot(x='Number of Samples',
               y='Fit Time',
               hue='method',
               data=df,
               aspect=1.5)
Example #20
def init_scalable(X,
                  n_clusters,
                  random_state=None,
                  max_iter=None,
                  oversampling_factor=2):
    """K-Means initialization using k-means||

    This is algorithm 2 in Scalable K-Means++ (2012).
    """

    logger.info("Initializing with k-means||")
    init_start = tic()
    # Step 1: Initialize Centers
    idx = 0
    centers = da.compute(X[idx, np.newaxis])[0]
    c_idx = {idx}

    # Step 2: Initialize cost
    cost, = compute(evaluate_cost(X, centers))

    if cost == 0:
        n_iter = 0
    else:
        n_iter = int(np.round(np.log(cost)))

    if max_iter is not None:
        n_iter = min(max_iter, n_iter)

    # Steps 3 - 6: update candidate Centers
    for i in range(n_iter):
        t0 = tic()
        new_idxs = _sample_points(X, centers, oversampling_factor,
                                  random_state)
        new_idxs = set(*compute(new_idxs))
        c_idx |= new_idxs
        t1 = tic()
        logger.info("init iteration %2d/%2d %.2f s, %2d centers", i + 1,
                    n_iter, t1 - t0, len(c_idx))
        # Sort before slicing, for better performance / memory
        # usage with the scheduler.
        # See https://github.com/dask/dask-ml/issues/39
        centers = X[sorted(c_idx)].compute()

    # XXX: scikit-learn doesn't have weighted k-means.
    # The paper weights each center by the number of points closest to it.
    # https://stackoverflow.com/a/37198799/1889400 claims you can scale the
    # features before clustering, but that doesn't seem right.
    # I think that replicating the *points*, proportional to the number of
    # original points closest to the candidate centers, would be a better way
    # to do that.

    if len(centers) < n_clusters:
        logger.warning("Found fewer than %d clusters in init.", n_clusters)
        # supplement with random
        need = n_clusters - len(centers)
        locs = sorted(
            random_state.choice(np.arange(0, len(X)),
                                size=need,
                                replace=False,
                                chunks=len(X)))
        extra = X[locs].compute()
        return np.vstack([centers, extra])
    else:
        # Step 7, 8 without weights
        # dask RandomState objects aren't valid for scikit-learn
        rng2 = random_state.randint(0, 2**32 - 1, chunks=()).compute().item()
        km = sk_k_means.KMeans(n_clusters, random_state=rng2)
        km.fit(centers)
        logger.info("Finished initialization. %.2f s, %2d centers",
                    tic() - init_start, n_clusters)
    return km.cluster_centers_
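init_scalable above assumes evaluate_cost and _sample_points helpers. A rough NumPy sketch of what they compute, following Algorithm 2 of Scalable K-Means++ (these are illustrative re-implementations, not the dask-ml versions, which operate on dask arrays):

import numpy as np
from sklearn.metrics import pairwise_distances


def evaluate_cost(X, centers):
    # phi_X(C): the sum of squared distances from each point to its
    # nearest current center.
    return (pairwise_distances(X, centers) ** 2).min(axis=1).sum()


def _sample_points(X, centers, oversampling_factor, random_state):
    # Independently keep each point with probability
    # l * d^2(x, C) / phi_X(C)   (step 4 of k-means||).
    d2 = (pairwise_distances(X, centers) ** 2).min(axis=1)
    prob = np.clip(oversampling_factor * d2 / d2.sum(), 0, 1)
    keep = random_state.uniform(size=len(X)) < prob
    return np.flatnonzero(keep)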