Example #1
def compute_stepsize_dask(beta,
                          step,
                          Xbeta,
                          Xstep,
                          y,
                          curr_val,
                          family=Logistic,
                          stepSize=1.0,
                          armijoMult=0.1,
                          backtrackMult=0.1):

    loglike = family.loglike
    beta, step, Xbeta, Xstep, y, curr_val = persist(beta, step, Xbeta, Xstep,
                                                    y, curr_val)
    obeta, oXbeta = beta, Xbeta
    (step, ) = compute(step)
    steplen = (step**2).sum()
    lf = curr_val
    func = 0
    for ii in range(100):
        beta = obeta - stepSize * step
        if ii and (beta == obeta).all():
            stepSize = 0
            break

        Xbeta = oXbeta - stepSize * Xstep
        func = loglike(Xbeta, y)
        Xbeta, func = persist(Xbeta, func)

        df = lf - compute(func)[0]
        if df >= armijoMult * stepSize * steplen:
            break
        stepSize *= backtrackMult

    return stepSize, beta, Xbeta, func
Example #2
    def scatter_data_to_workers(self):
        self.scatteredDataFutures = None
        if self.client is not None:
            if not self.cpuFlag:
                print('scattering data to GPU workers...', end='')
                self.scatteredDataFutures = self.client.scatter(
                    [
                        self.dataset.trainData, self.dataset.trainLabels,
                        self.dataset.testData, self.dataset.testLabels
                    ],
                    broadcast=True)
            else:
                print('scattering data to CPU workers...', end='')
                self.scatteredDataFutures = self.client.scatter(
                    [
                        self.dataset.cpuDataset['trainData'],
                        self.dataset.cpuDataset['trainLabels'],
                        self.dataset.cpuDataset['testData'],
                        self.dataset.cpuDataset['testLabels']
                    ],
                    broadcast=False
                )  # there is no need to broadcast between CPU workers
            print('done scatter')
            print('  + persisting scattered data...', end='')
            persist(self.scatteredDataFutures)
            print('done persist')
        else:
            # `assert '<non-empty string>'` always passes; raise a real error instead
            raise RuntimeError('error: missing a dask client/cluster!')
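For context on the broadcast flag used above (an aside, not part of the original snippet): with dask.distributed, Client.scatter(data, broadcast=True) replicates each scattered piece to every worker, while the default spreads the pieces across workers. A minimal local sketch, using an in-process client purely for illustration:

from dask.distributed import Client

client = Client(processes=False)  # small in-process cluster, for demonstration only
replicated = client.scatter([1, 2, 3], broadcast=True)  # every worker holds each item
spread = client.scatter([4, 5, 6])                      # items are spread across workers
print(client.who_has(replicated))
client.close()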
Example #3
def compute_stepsize_dask(beta,
                          step,
                          Xbeta,
                          Xstep,
                          y,
                          curr_val,
                          family=Logistic,
                          stepSize=1.0,
                          armijoMult=0.1,
                          backtrackMult=0.1):
    """Compute the optimal stepsize

    Parameters
    ----------
    beta : array-like
    step : array-like
    Xbeta : array-like
    Xstep : array-like
    y : array-like
    curr_val : float
    family : Family, optional
    stepSize : float, optional
    armijoMult : float, optional
    backtrackMult : float, optional

    Returns
    -------
    stepSize : float
    beta : array-like
    Xbeta : array-like
    func : float
    """

    loglike = family.loglike
    beta, step, Xbeta, Xstep, y, curr_val = persist(beta, step, Xbeta, Xstep,
                                                    y, curr_val)
    obeta, oXbeta = beta, Xbeta
    (step, ) = compute(step)
    steplen = (step**2).sum()
    lf = curr_val
    func = 0
    for ii in range(100):
        beta = obeta - stepSize * step
        if ii and (beta == obeta).all():
            stepSize = 0
            break

        Xbeta = oXbeta - stepSize * Xstep
        func = loglike(Xbeta, y)
        Xbeta, func = persist(Xbeta, func)

        df = lf - compute(func)[0]
        if df >= armijoMult * stepSize * steplen:
            break
        stepSize *= backtrackMult

    return stepSize, beta, Xbeta, func
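The backtracking loop above implements the Armijo sufficient-decrease test f(x) - f(x - t*g) >= c * t * ||g||^2, shrinking t until it holds. A minimal NumPy-only sketch of the same rule, with a toy quadratic objective chosen purely for illustration:

import numpy as np


def backtrack(f, grad, x, t=1.0, c=0.1, shrink=0.1, max_iter=100):
    # Shrink t until f(x) - f(x - t * g) >= c * t * ||g||^2 holds,
    # mirroring the loop in compute_stepsize_dask above.
    g = grad(x)
    fx = f(x)
    steplen = (g ** 2).sum()
    for _ in range(max_iter):
        if fx - f(x - t * g) >= c * t * steplen:
            break
        t *= shrink
    return t


# toy objective f(z) = 0.5 * ||z||^2 with gradient z
t = backtrack(lambda z: 0.5 * (z ** 2).sum(), lambda z: z, np.ones(3))
print(t)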
Example #4
async def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    (x2, ) = persist(x)

    await wait(x2)
    assert x2.key in a.data or x2.key in b.data

    y = delayed(inc)(10)
    y2, one = persist(y, 1)

    await wait(y2)
    assert y2.key in a.data or y2.key in b.data
Example #5
def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    x2, = persist(x)

    yield wait(x2)
    assert x2.key in a.data or x2.key in b.data

    y = delayed(inc)(10)
    y2, one = persist(y, 1)

    yield wait(y2)
    assert y2.key in a.data or y2.key in b.data
Example #6
def test_persist(c, s, a, b):
    x = delayed(inc)(1)
    x2, = persist(x)

    yield _wait(x2)
    assert x2.key in a.data or x2.key in b.data

    y = delayed(inc)(10)
    y2, one = persist(y, 1)

    yield _wait(y2)
    assert y2.key in a.data or y2.key in b.data
Example #7
    def __init__(self,
                 xyz,
                 topology,
                 time=None,
                 delayed_objects=None,
                 **kwargs):
        dask.persist(**kwargs)
        self._unitcell_vectors = None
        super(Trajectory, self).__init__(xyz=xyz,
                                         topology=topology,
                                         time=time,
                                         **kwargs)
Example #8
def test_repeated_persists_same_priority(c, s, w):
    xs = [delayed(slowinc)(i, delay=0.05, dask_key_name='x-%d' % i) for i in range(10)]
    ys = [delayed(slowinc)(x, delay=0.05, dask_key_name='y-%d' % i) for i, x in enumerate(xs)]
    zs = [delayed(slowdec)(x, delay=0.05, dask_key_name='z-%d' % i) for i, x in enumerate(xs)]

    ys = dask.persist(*ys)
    zs = dask.persist(*zs)

    while sum(t.state == 'memory' for t in s.tasks.values()) < 5:  # TODO: reduce this number
        yield gen.sleep(0.01)

    assert any(s.tasks[y.key].state == 'memory' for y in ys)
    assert any(s.tasks[z.key].state == 'memory' for z in zs)
Example #9
    def apply_func(self, func, varname, *args, **kwargs):
        """
        Apply the function to each block of data (doesn't use xarray)

        See here:
                http://dask.pydata.org/en/latest/delayed-best-practices.html
        """
        @delayed
        def load_single_nc(ncfile, varname):
            with Dataset(ncfile) as nc:
                # Load the data
                X = nc.variables[varname][:]
                X[np.isnan(X)] = 0.
            return X

        @delayed
        def lazy_func(func, X, *args, **kwargs):
            return func(X, *args, **kwargs)

        def f(func, ncfiles, varname, *args, **kwargs):
            output = []
            for ncfile in ncfiles:
                X = load_single_nc(ncfile, varname)
                output.append(lazy_func(func, X, *args, **kwargs))

            return output

        stack = dask.persist(f(func, self.filenames, varname, *args, **kwargs))
        return np.concatenate([ii.compute() for ii in stack[0]], axis=-1)[...,self.ghost]
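The method above follows the dask.delayed best-practices pattern it links to: wrap each per-file load in delayed, build the per-file computations lazily, then persist and compute. A self-contained sketch of the same pattern, with synthetic blocks standing in for the netCDF reads:

import numpy as np
import dask
from dask import delayed


@delayed
def load_block(seed):
    # stand-in for reading one netCDF variable from disk
    rng = np.random.default_rng(seed)
    return rng.normal(size=(4, 4))


@delayed
def lazy_func(func, X):
    return func(X)


blocks = [lazy_func(np.mean, load_block(i)) for i in range(3)]
blocks = dask.persist(*blocks)   # start the lazily-built work
print(dask.compute(*blocks))     # gather concrete results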
Example #10
def main(args=None):
    args = parse_args(args)
    steps = range(args.start, args.stop, args.step)
    if args.scheduler_address:
        client = Client(args.scheduler_address)
        info = client.scheduler_info()
        logger.info("Distributed mode: %s", client.scheduler)
        logger.info("Dashboard: %s:%s", info['address'],
                    info['services']['bokeh'])
    else:
        logger.warning("Local mode")

    logger.info("Fitting for %s", list(steps))

    logger.info("Reading data")
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)

    timings = []

    for n_clusters in range(args.start, args.stop, args.step):
        logger.info("Starting %02d", n_clusters)
        t0 = tic()
        km = do(X, n_clusters, factor=args.factor)
        t1 = tic()
        logger.info("Finished %02d, [%.2f]", n_clusters, t1 - t0)
        logger.info("Cluster Centers [%s]:\n%s", n_clusters,
                    km.cluster_centers_)
        inertia = km.inertia_.compute()
        logger.info("Inertia [%s]: %s", km.cluster_centers_, inertia)
        timings.append((n_clusters, args.factor, t1 - t0, inertia))

    pd.DataFrame(timings, columns=['n_clusters', 'factor', 'time',
                                   'inertia']).to_csv('timings.csv')
Example #11
    def cleaning(self):
        cols = [
            'Year', 'Month', 'DayOfWeek', 'Distance', 'DepDelay', 'CRSDepTime',
            'UniqueCarrier', 'Origin', 'Dest'
        ]

        # Create the dataframe
        df = dd.read_csv(sorted(
            glob(os.path.join('data', 'nycflights', '*.csv'))),
                         usecols=cols,
                         storage_options={'anon': True})

        df = df.sample(frac=0.2)  # we blow out ram otherwise

        label = (df.DepDelay.fillna(16) > 15)

        df['CRSDepTime'] = df['CRSDepTime'].clip(upper=2399)
        del df['DepDelay']

        df, label = persist(df, label)
        df2 = dd.get_dummies(df.categorize()).persist()
        X_train, X_test = df2.random_split([0.9, 0.1], random_state=1234)
        y_train, y_test = label.random_split([0.9, 0.1], random_state=1234)

        return X_train, X_test, y_train, y_test
Example #12
    def fit(self, X, y=None):
        self._reset()
        to_persist = OrderedDict()
        feature_range = self.feature_range

        if feature_range[0] >= feature_range[1]:
            raise ValueError("Minimum of desired feature "
                             "range must be smaller than maximum.")

        _X = slice_columns(X, self.columns)
        data_min = _X.min(0)
        data_max = _X.max(0)
        data_range = data_max - data_min
        scale = ((feature_range[1] - feature_range[0]) /
                 handle_zeros_in_scale(data_range))

        to_persist["data_min_"] = data_min
        to_persist["data_max_"] = data_max
        to_persist["data_range_"] = data_range
        to_persist["scale_"] = scale
        to_persist["min_"] = feature_range[0] - data_min * scale
        to_persist["n_samples_seen_"] = np.nan

        values = persist(*to_persist.values())
        for k, v in zip(to_persist, values):
            setattr(self, k, v)
        return self
Example #13
    def _to_ds24(self, X, y=None, *, batch_size, shuffle, drop_remainder):
        def to_spec(name, dtype, idx):
            if dtype is not None:
                spec = tf.TensorSpec(shape=(None, len(idx)), dtype=dtype)
            else:  # var len
                v = X[name].head(1).tolist()[0]
                spec = tf.TensorSpec(shape=(None, len(v)), dtype='int32')
            return spec

        meta = self._get_meta(X)
        sig = {k: to_spec(k, dtype, idx) for k, (dtype, idx) in meta.items()}

        if y is not None:
            if isinstance(y, dd.Series):
                y = y.to_dask_array(lengths=True)
            if self.task == consts.TASK_MULTICLASS:
                y = self._to_categorical(y, num_classes=self.num_classes)
                sig = sig, tf.TensorSpec(shape=(None, self.num_classes),
                                         dtype=y.dtype)
            else:
                sig = sig, tf.TensorSpec(shape=(None, ), dtype=y.dtype)

        X = X.to_dask_array(lengths=True)
        X, y = dask.persist(X, y)
        gen = partial(self._generate,
                      meta,
                      X,
                      y,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      drop_remainder=drop_remainder)
        ds = tf.data.Dataset.from_generator(gen, output_signature=sig)

        return ds
Example #14
def main(args=None):
    args = parse_args(args)

    client = Client(args.scheduler_address)  # noqa
    if args.scheduler_address.startswith("ucx://"):
        setup()
        client.run_on_scheduler(setup)
        client.run(setup)

    n_keys = args.n_keys
    n_rows_l = args.left_rows
    n_rows_r = args.left_rows

    gleft, gright = make_data(n_keys, n_rows_l, n_rows_r)

    t0 = clock()
    gleft, gright = dask.persist(gleft, gright)
    wait([gleft, gright])

    print('left  :', gleft)
    print('right :', gright)
    t1 = clock()

    print("Persist :", t1 - t0)
    out = gleft.merge(gright, on=['id'])
    t2 = clock()
    result = out.compute()
    t3 = clock()

    print("Schedule:", t2 - t1)
    print("Compute :", t3 - t2)
    print("Total   :", t3 - t0)
    print(type(result))
    print(result.head())
Example #15
def cg_project(A, x, y, tol=1e-8, **options):
    r""" Project (x, y) onto graph G = {(y, x) | y = Ax} via CG

    In particular, form outputs as:

        :math:`x_{out} = (1 + A^TA)^{-1}(A^Ty + x)`
        :math:`y_{out} = Ax_{out}`
    """
    fmt = 'array {} compatible'
    assert A.shape[0] == y.shape[0] and A.shape[1] == x.shape[0], fmt.format(
        'dims')
    assert A.chunks[0] == y.chunks[0] and A.chunks[1] == x.chunks[
        0], fmt.format('chunks')

    token = options.pop(
        'name', 'cg-project-' + dask.base.tokenize(A, x, y, tol, **options))
    nm_b, nm_x, nm_y = map(lambda nm: nm + '-' + token, ('b', 'x', 'y'))

    # b = A'y + x
    b = atoms2.gemv(1, A, y, 1, x, transpose=True, name=nm_b)
    A_hat = linop.DLORegularizedGram(A, transpose=False)
    x_out, res, iters = cg.cg_graph(A_hat, b, tol=tol, name=nm_x, **options)
    y_out = atoms2.dot(A, x_out, name=nm_y)
    x_out, y_out = dask.persist(x_out, y_out)
    return x_out, y_out, res, iters
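The docstring's formulas can be checked directly with a small dense solve, reading the "1" as the identity matrix. A NumPy-only sketch (no CG, sizes chosen arbitrarily for illustration):

import numpy as np

# x_out = (I + A^T A)^{-1} (A^T y + x),   y_out = A x_out
rng = np.random.default_rng(0)
A = rng.normal(size=(5, 3))
x = rng.normal(size=3)
y = rng.normal(size=5)

x_out = np.linalg.solve(np.eye(3) + A.T @ A, A.T @ y + x)
y_out = A @ x_out
# (x_out, y_out) lies on the graph {(x, y) : y = A x} by construction
print(x_out, y_out)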
Example #16
    def _fit_parallel(
        self,
        convert_to_inference: bool = False,
        sampler_args: dict = None,
    ) -> Union[List[CmdStanMCMC], List[az.InferenceData]]:
        """Fit model by parallelizing across features.

        :param convert_to_inference: Whether to create individual
            InferenceData objects for individual feature fits, defaults to
            False
        :type convert_to_inference: bool

        :param sampler_args: Additional parameters to pass to CmdStanPy
            sampler (optional)
        :type sampler_args: dict
        """
        if sampler_args is None:
            sampler_args = dict()

        _fits = []
        for v, i, d in self.table.iter(axis="observation"):
            _fit = dask.delayed(self._fit_single)(
                v,
                sampler_args,
                convert_to_inference,
            )
            _fits.append(_fit)

        fit_futures = dask.persist(*_fits)
        all_fits = dask.compute(fit_futures)[0]
        # Set data back to full table
        self.dat["y"] = self.table.matrix_data.todense().T.astype(int)
        self.fit = all_fits
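The method above uses the common delayed -> persist -> compute pattern for embarrassingly parallel fits: build one delayed task per feature, persist them all so they run in the background, then gather the results. A toy sketch of that pattern, with a stand-in fit function instead of a CmdStanPy sampler:

import dask
from dask import delayed


def fit_single(values):
    # stand-in for fitting one feature
    return sum(values) / len(values)


features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
fits = [delayed(fit_single)(v) for v in features]

fit_futures = dask.persist(*fits)        # kick off all per-feature fits
all_fits = dask.compute(fit_futures)[0]  # gather concrete results
print(all_fits)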
Example #17
def missing_impact(df: dd.DataFrame, bins: int) -> Intermediate:
    """
    Calculate the data for visualizing the plot_missing(df).
    This contains the missing spectrum, missing bar chart and missing heatmap.
    """
    cols = df.columns.values
    (nulldf, ) = dask.persist(df.isnull())
    nullity = nulldf.to_dask_array(lengths=True)

    null_perc = nullity.sum(axis=0) / nullity.shape[0]

    tasks = (
        missing_spectrum(nullity, cols, bins=bins),
        null_perc,
        missing_bars(null_perc, cols),
        missing_heatmap(nulldf, null_perc, cols),
        missing_dendrogram(nullity, cols),
    )

    spectrum, null_perc, bars, heatmap, dendrogram = dd.compute(*tasks)

    return Intermediate(
        data_total_missing={
            col: null_perc[idx]
            for idx, col in enumerate(cols)
        },
        data_spectrum=spectrum,
        data_bars=bars,
        data_heatmap=heatmap,
        data_dendrogram=dendrogram,
        visual_type="missing_impact",
    )
Example #18
def prepare_data():
    # Choose columns to use
    cols = [
        'Year', 'Month', 'DayOfWeek', 'Distance', 'DepDelay', 'CRSDepTime',
        'UniqueCarrier', 'Origin', 'Dest'
    ]

    df = dd.read_csv(os.path.join('data', 'nycflights', '*.csv'),
                     usecols=cols,
                     storage_options={'anon': True})
    is_delayed = (df.DepDelay.fillna(16) > 15)

    # Remove delay information from training dataframe
    del df['DepDelay']

    # Trim all the values in data
    df['CRSDepTime'] = df['CRSDepTime'].clip(upper=2399)

    # df: data from which we will learn if flights are delayed
    # is_delayed: whether or not those flights were delayed
    df, is_delayed = dask.persist(df, is_delayed)

    # Convert categorical data into numerical
    df_numerical = dd.get_dummies(df.categorize()).persist()

    print("- Done")

    return df_numerical, is_delayed
Example #19
    def run(self):
        self._validate_setup()
        write_locks = {}
        for times in self._times:
            filename = self._get_output_filename(times)
            self.setup_netcdf_output(filename, times)
            write_locks[filename] = combine_locks(
                [NETCDFC_LOCK, get_write_lock(filename)])
        self.logger.info('Starting {} chunks...'.format(len(self.slices)))

        delayed_objs = [
            wrap_run_slice(self.params, write_locks, dslice)
            for dslice in self.slices
        ]
        persisted = dask.persist(delayed_objs,
                                 num_workers=self.params['num_workers'])
        self.progress_bar(persisted)
        dask.compute(persisted)
        self.logger.info('Cleaning up...')
        try:
            self._client.cluster.close()
            self._client.close()
            if self.params['verbose'] == logging.DEBUG:
                print()
                print('closed dask cluster/client')
        except Exception:
            pass
Example #20
def main(args=None):
    args = parse_args(args)
    steps = range(args.start, args.stop, args.step)
    if args.scheduler_address:
        client = Client(args.scheduler_address)
        info = client.scheduler_info()
        logger.info("Distributed mode: %s", client.scheduler)
        logger.info("Dashboard: %s:%s", info["address"],
                    info["services"]["bokeh"])
    else:
        logger.warning("Local mode")

    logger.info("Fitting for %s", list(steps))

    logger.info("Reading data")
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)

    timings = []

    for n_clusters in range(args.start, args.stop, args.step):
        logger.info("Starting %02d", n_clusters)
        t0 = tic()
        with _timer(n_clusters, _logger=logger):
            km = do(X, n_clusters, factor=args.factor)
        t1 = tic()
        logger.info("Cluster Centers [%s]:\n%s", n_clusters,
                    km.cluster_centers_)
        inertia = km.inertia_.compute()
        logger.info("Inertia [%s]: %s", km.cluster_centers_, inertia)
        timings.append((n_clusters, args.factor, t1 - t0, inertia))

    pd.DataFrame(timings, columns=["n_clusters", "factor", "time",
                                   "inertia"]).to_csv("timings.csv")
Example #21
def _evaluate_datasets(pipelines, datasets, hyperparameters, metrics,
                       distributed, test_split, detrend):
    delayed = []
    for dataset, signals in datasets.items():
        LOGGER.info("Starting dataset {} with {} signals..".format(
            dataset, len(signals)))

        # dataset configuration
        hyperparameters_ = _get_parameter(hyperparameters, dataset)
        parameters = _get_parameter(BENCHMARK_PARAMS, dataset)
        if parameters is not None:
            detrend, test_split = parameters.values()

        result = _evaluate_pipelines(pipelines, dataset, signals,
                                     hyperparameters_, metrics, distributed,
                                     test_split, detrend)

        delayed.extend(result)

    if distributed:
        persisted = dask.persist(*delayed)
        results = dask.compute(*persisted)

    else:
        results = delayed

    df = pd.DataFrame.from_records(results)
    return df
Example #22
def from_dask(df: "dask.DataFrame") -> Dataset[ArrowRow]:
    """Create a dataset from a Dask DataFrame.

    Args:
        df: A Dask DataFrame.

    Returns:
        Dataset holding Arrow records read from the DataFrame.
    """
    import dask
    from ray.util.dask import ray_dask_get

    partitions = df.to_delayed()
    persisted_partitions = dask.persist(*partitions, scheduler=ray_dask_get)

    import pandas

    def to_ref(df):
        if isinstance(df, pandas.DataFrame):
            return ray.put(df)
        elif isinstance(df, ray.ObjectRef):
            return df
        else:
            raise ValueError(
                "Expected a Ray object ref or a Pandas DataFrame, "
                f"got {type(df)}")

    return from_pandas_refs([
        to_ref(next(iter(part.dask.values()))) for part in persisted_partitions
    ])
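A minimal usage sketch of this conversion, assuming Ray with its Datasets/Data API and a local Ray runtime are available (ray.data.from_dask wraps the logic shown above):

import pandas as pd
import dask.dataframe as dd
import ray

ray.init()  # assumes a local Ray runtime
ddf = dd.from_pandas(pd.DataFrame({"a": range(8), "b": range(8)}), npartitions=2)
ds = ray.data.from_dask(ddf)
print(ds.count())  # 8 rows
ray.shutdown()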
Example #23
    def _transform(self, ds, do_fit=False, method_name=None):
        for i, block in enumerate(self.graph):
            if block.dataset_map is not None:
                try:
                    ds = block.dataset_map(ds)
                except Exception as e:
                    raise RuntimeError(
                        f"Could not map ds {ds}\n with {block.dataset_map}"
                    ) from e
                continue

            if do_fit:
                args = _get_dask_args_from_ds(ds, block.fit_input)
                args = [d for d, dims in args]
                estimator = block.estimator
                if is_estimator_stateless(estimator):
                    block.estimator_ = estimator
                elif block.model_path is not None and os.path.isfile(block.model_path):
                    _load_estimator.__name__ = f"load_{block.estimator_name}"
                    block.estimator_ = dask.delayed(_load_estimator)(block)
                elif block.input_dask_array:
                    ds = ds.persist()
                    args = _get_dask_args_from_ds(ds, block.fit_input)
                    args = [d for d, dims in args]
                    block.estimator_ = _fit(*args, block=block)
                else:
                    _fit.__name__ = f"{block.estimator_name}.fit"
                    block.estimator_ = dask.delayed(_fit)(
                        *args,
                        block=block,
                    )

            mn = "transform"
            if i == len(self.graph) - 1:
                if do_fit:
                    break
                mn = method_name

            if block.features_dir is None:
                args = _get_dask_args_from_ds(ds, block.transform_input)
                dims, data = _blockwise_with_block(
                    args, block, mn, input_has_keys=False
                )
            else:
                dims, data = _transform_or_load(block, ds, block.transform_input, mn)

            # replace data inside dataset
            ds = ds.copy(deep=False)
            del ds["data"]
            persisted = False
            if not np.all(np.isfinite(data.shape)):
                block.estimator_, data = dask.persist(block.estimator_, data)
                data = data.compute_chunk_sizes()
                persisted = True
            ds["data"] = (dims, data)
            if persisted:
                ds = ds.persist()

        return ds
Example #24
def gradient_descent(X, y, max_steps=100, tol=1e-14, family=Logistic):
    '''Michael Grant's implementation of Gradient Descent.'''

    loglike, gradient = family.loglike, family.gradient
    n, p = X.shape
    firstBacktrackMult = 0.1
    nextBacktrackMult = 0.5
    armijoMult = 0.1
    stepGrowth = 1.25
    stepSize = 1.0
    recalcRate = 10
    backtrackMult = firstBacktrackMult
    beta = np.zeros(p)

    for k in range(max_steps):
        # how necessary is this recalculation?
        if k % recalcRate == 0:
            Xbeta = X.dot(beta)
            func = loglike(Xbeta, y)

        grad = gradient(Xbeta, X, y)
        Xgradient = X.dot(grad)

        # backtracking line search
        lf = func
        stepSize, _, _, func = compute_stepsize_dask(
            beta,
            grad,
            Xbeta,
            Xgradient,
            y,
            func,
            family=family,
            backtrackMult=backtrackMult,
            armijoMult=armijoMult,
            stepSize=stepSize)

        beta, stepSize, Xbeta, lf, func, grad, Xgradient = persist(
            beta, stepSize, Xbeta, lf, func, grad, Xgradient)

        stepSize, lf, func, grad = compute(stepSize, lf, func, grad)

        beta = beta - stepSize * grad  # tiny bit of repeat work here to avoid communication
        Xbeta = Xbeta - stepSize * Xgradient

        if stepSize == 0:
            print('No more progress')
            break

        df = lf - func
        df /= max(func, lf)

        if df < tol:
            print('Converged')
            break
        stepSize *= stepGrowth
        backtrackMult = nextBacktrackMult

    return beta
Example #25
def test_persist_nested(c):
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    result = persist({"a": a, "b": [1, 2, b]}, (c, 2), 4, [5])
    assert isinstance(result[0]["a"], Delayed)
    assert isinstance(result[0]["b"][2], Delayed)
    assert isinstance(result[1][0], Delayed)

    sol = ({"a": 6, "b": [1, 2, 7]}, (8, 2), 4, [5])
    assert compute(*result) == sol

    res = persist([a, b], c, 4, [5], traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1].compute() == 8
    assert res[2:] == (4, [5])
Example #26
def test_expand_persist(c, s, a, b):
    low = delayed(inc)(1, dask_key_name='low')
    many = [delayed(slowinc)(i, delay=0.1) for i in range(4)]
    high = delayed(inc)(2, dask_key_name='high')

    low, high, x, y, z, w = persist(low, high, *many, priority={low: -1, high: 1})
    yield wait(high)
    assert s.tasks[low.key].state == 'processing'
Example #27
def benchmark(tuners, challenges, iterations, detailed_output=False):
    """Score ``tuners`` against a list of ``challenges`` for the given amount of iterations.

    This function scores a collection of ``tuners`` against a collection of ``challenges``
    performing tuning iterations in order to obtain a better score. At the end, the best score
    for each tuner / challenge is returned as a ``pandas.DataFrame``.

    Args:
        tuners (dict):
            Python dictionary with the ``name`` of the function as ``key`` and the callable
            function that returns the best score for a given ``scorer``.
            This function must have three arguments:

                * scorer (function):
                    A function that performs scoring over params.
                * tunable (btb.tuning.Tunable):
                    A ``Tunable`` instance used to instantiate a tuner.
                * iterations (int):
                    Number of tuning iterations to perform.

        challenges (list):
            A list of challenges. These challenges must inherit from
            ``btb.challenges.challenge.Challenge``.
        iterations (int):
            Amount of tuning iterations to perform for each tuner and each challenge.
        detailed_output (bool):
            If ``True`` a dataframe with the elapsed time, score and iterations will be returned.

    Returns:
        pandas.DataFrame:
            A ``pandas.DataFrame`` with the obtained scores for the given challenges.
    """
    delayed = []

    for challenge in challenges:
        result = _evaluate_tuners_on_challenge(tuners, challenge, iterations)
        delayed.extend(result)

    persisted = dask.persist(*delayed)

    try:
        progress(persisted)
    except ValueError:
        # Using local client. No progress bar needed.
        pass

    results = dask.compute(*persisted)

    df = pd.DataFrame.from_records(results)
    if detailed_output:
        return df

    df = df.pivot(index='challenge', columns='tuner', values='score')
    del df.columns.name
    del df.index.name

    return df
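The docstring specifies that each entry of ``tuners`` is a callable taking (scorer, tunable, iterations) and returning its best score. A hedged sketch of a callable with that shape; the ``tunable.sample(1)`` call is an assumption about the Tunable interface and should be replaced with whatever sampling call your tuner library actually exposes:

def random_tuner(scorer, tunable, iterations):
    # Try candidates proposed by `tunable` and return the best score seen.
    best = float('-inf')
    for _ in range(iterations):
        params = tunable.sample(1)[0] if hasattr(tunable, 'sample') else {}
        best = max(best, scorer(params))
    return best


tuners = {'random_tuner': random_tuner}
# results = benchmark(tuners, challenges, iterations=100)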
Example #28
def test_dataset_dask(ms):
    datasets = read_datasets(ms, [], [], [])
    assert len(datasets) == 1
    ds = datasets[0]

    # All dask arrays
    for k, v in ds.data_vars.items():
        assert isinstance(v.data, da.Array)

        # Test variable compute
        v2 = dask.compute(v)[0]
        assert isinstance(v2, xr.DataArray if have_xarray else Variable)
        assert isinstance(v2.data, np.ndarray)

        # Test variable persists
        v3 = dask.persist(v)[0]
        assert isinstance(v3, xr.DataArray if have_xarray else Variable)

        # Now have numpy array in the graph
        assert len(v3.data.__dask_keys__()) == 1
        data = next(iter(v3.__dask_graph__().values()))
        assert isinstance(data, np.ndarray)
        assert_array_equal(v2.data, v3.data)

    # Test compute
    nds = dask.compute(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, np.ndarray)
        cdata = getattr(ds, k).data
        assert_array_equal(cdata, v.data)

    # Test persist
    nds = dask.persist(ds)[0]

    for k, v in nds.data_vars.items():
        assert isinstance(v.data, da.Array)

        # Now have numpy array in the graph
        assert len(v.data.__dask_keys__()) == 1
        data = next(iter(v.data.__dask_graph__().values()))
        assert isinstance(data, np.ndarray)

        cdata = getattr(ds, k).data
        assert_array_equal(cdata, v.data)
Example #29
def cg_initialize(A, b, x_init=None):
    if x_init is None:
        x = 0 * b
    else:
        x = 1 * x_init
    r = A.dot(x) - b
    p = 1 * r
    x, r, p = dask.persist(x, r, p)
    return x, r, p
Example #30
    def test_future(self):
        """compute_with_trace() can handle Futures."""
        client = Client(processes=False)
        self.addCleanup(client.shutdown)
        [bag] = dask.persist(from_sequence([1, 2, 3]))
        bag = bag.map(lambda x: x * 5)
        result = dask.compute(bag)
        self.assertEqual(result, ([5, 10, 15],))
        self.assertEqual(result, compute_with_trace(bag))
Example #31
def test_admm_with_large_lamduh(N, p, nchunks):
    X = da.random.random((N, p), chunks=(N // nchunks, p))
    beta = np.random.random(p)
    y = make_y(X, beta=np.array(beta), chunks=(N // nchunks, ))

    X, y = persist(X, y)
    z = admm(X, y, reg=L1, lamduh=1e4, rho=20, max_iter=500)

    assert np.allclose(z, np.zeros(p), atol=1e-4)
Example #32
def get_array_moments(
        array: da.core.Array,
        mean: bool = True,
        std: bool = True,
        std_method: str = 'binom',
        axis: int = 0
) -> Tuple[Optional[da.core.Array], Optional[da.core.Array]]:
    """ Computes specified array_moments

    Parameters
    ----------
    array : array_like, shape (N, P)
        Array that moments will be computed from
    mean : bool
        Flag whether to compute mean of "array" along "axis"
    std : bool
        Flag whether to compute std of "array" along "axis"
    std_method : str
        Method used to compute standard deviation.

        Possible methods are:
            'norm'  ==> normal distribution standard deviation (see np.std)
            'binom' ==> binomial standard deviation:
                        sqrt(2*p*(1-p)), where p = "mean"/2
    axis : int
        Axis to compute mean and std along.

    Returns
    -------
    array_mean : da.core.array, optional
        If "mean" is false, returns None
        Otherwise returns the array mean
    array_std: da.core.array, optional
        If "std" is false, returns None
        Otherwise returns the array std
    """
    array_mean = None
    array_std = None

    if mean:
        array_mean = da.nanmean(array, axis=axis)

    if std:
        if std_method == 'binom':
            u = array_mean if mean else da.nanmean(array, axis=axis)
            u /= 2
            array_std = da.sqrt(2 * u * (1 - u))
        elif std_method == 'norm':
            array_std = da.nanstd(array, axis=axis)
        else:
            raise NotImplementedError(
                f'std_method, {std_method}, is not implemented ')

    array_mean, array_std = persist(array_mean, array_std)

    return array_mean, array_std
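A small usage sketch, assuming the definition above plus `from dask import persist`; the toy matrix is chosen only to show the shapes involved:

import numpy as np
import dask.array as da

arr = da.from_array(np.array([[0., 1., 2.],
                              [2., 1., 0.],
                              [1., 1., 1.]]), chunks=(2, 3))
col_mean, col_std = get_array_moments(arr, std_method='binom')
print(col_mean.compute())  # per-column means
print(col_std.compute())   # sqrt(2 * p * (1 - p)) with p = mean / 2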
Example #33
def test_persist_nested(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            a = delayed(1) + 5
            b = a + 1
            c = a + 2
            result = persist({'a': a, 'b': [1, 2, b]}, (c, 2), 4, [5])
            assert isinstance(result[0]['a'], Delayed)
            assert isinstance(result[0]['b'][2], Delayed)
            assert isinstance(result[1][0], Delayed)

            sol = ({'a': 6, 'b': [1, 2, 7]}, (8, 2), 4, [5])
            assert compute(*result) == sol

            res = persist([a, b], c, 4, [5], traverse=False)
            assert res[0][0] is a
            assert res[0][1] is b
            assert res[1].compute() == 8
            assert res[2:] == (4, [5])
Example #34
async def test_annotate_persist(c, s, a, b):
    with dask.annotate(priority=-1):
        low = delayed(inc)(1, dask_key_name="low")
    with dask.annotate(priority=1):
        high = delayed(inc)(2, dask_key_name="high")
    many = [delayed(slowinc)(i, delay=0.1) for i in range(4)]

    low, high, x, y, z, w = persist(low, high, *many, optimize_graph=False)
    await wait(high)
    assert s.tasks[low.key].state == "processing"
Example #35
def test_dont_recompute_if_persisted_4(c, s, a, b):
    x = delayed(inc)(1, dask_key_name='x')
    y = delayed(inc)(x, dask_key_name='y')
    z = delayed(inc)(x, dask_key_name='z')

    yy = y.persist()
    yield wait(yy)

    old = s.story('x')

    while s.tasks['x'].state == 'memory':
        yield gen.sleep(0.01)

    yyy, zzz = dask.persist(y, z)
    yield wait([yyy, zzz])

    new = s.story('x')
    assert len(new) > len(old)
Example #36
def test_custom_collection():
    dsk = {'a': 1, 'b': 2}
    dsk2 = {'c': (add, 'a', 'b'),
            'd': (add, 'c', 1)}
    dsk2.update(dsk)
    dsk3 = {'e': (add, 'a', 4),
            'f': (inc, 'e')}
    dsk3.update(dsk)

    x = Tuple(dsk, ['a', 'b'])
    y = Tuple(dsk2, ['c', 'd'])
    z = Tuple(dsk3, ['e', 'f'])

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(x)

    # tokenize
    assert tokenize(x) == tokenize(x)
    assert tokenize(x) != tokenize(y)

    # compute
    assert x.compute() == (1, 2)
    assert dask.compute(x, [y, z]) == ((1, 2), [(3, 4), (5, 6)])
    t = x + y + z
    assert t.compute() == (1, 2, 3, 4, 5, 6)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._dask == dict(zip('abcdef', range(1, 7)))
    assert t2.compute() == (1, 2, 3, 4, 5, 6)
    x2, y2, z2 = dask.persist(x, y, z)
    t3 = x2 + y2 + z2
    assert t2._dask == t3._dask