Example #1
 def __invert__(self) -> FutureWrapper:
     client = get_client()
     new_fut = client.submit(op.not_, self)
     return FutureWrapper.from_future(new_fut)
Example #2
 def __xor__(self, other) -> FutureWrapper:
     client = get_client()
     new_fut = client.submit(op.xor, self, other)
     return FutureWrapper.from_future(new_fut)
Example #3
def cross_validation(model,
                     horizon,
                     period=None,
                     initial=None,
                     parallel=None,
                     cutoffs=None,
                     disable_tqdm=False):
    """Cross-Validation for time series.

    Computes forecasts from historical cutoff points, which the user can input.
    If not provided, cutoffs begin from (end - horizon) and work backwards,
    spaced by period, until initial is reached.

    When period is equal to the time interval of the data, this is the
    technique described in https://robjhyndman.com/hyndsight/tscv/ .

    Parameters
    ----------
    model: Prophet class object. Fitted Prophet model.
    horizon: string with pd.Timedelta compatible style, e.g., '5 days',
        '3 hours', '10 seconds'.
    period: string with pd.Timedelta compatible style. Simulated forecasts are
        made at intervals of this period. If not provided, 0.5 * horizon is used.
    initial: string with pd.Timedelta compatible style. The first training
        period will include at least this much data. If not provided,
        3 * horizon is used.
    cutoffs: list of pd.Timestamp specifying cutoffs to be used during
        cross validation. If not provided, they are generated as described
        above.
    parallel : {None, 'processes', 'threads', 'dask', object}
        How to parallelize the forecast computation. By default no parallelism
        is used.

        * None : No parallelism.
        * 'processes' : Parallelize with concurrent.futures.ProcessPoolExecutor.
        * 'threads' : Parallelize with concurrent.futures.ThreadPoolExecutor.
          Note that some operations currently hold Python's Global Interpreter
          Lock, so parallelizing with threads may be slower than training
          sequentially.
        * 'dask': Parallelize with Dask.
          This requires that a dask.distributed Client be created.
        * object : Any instance with a `.map` method. This method will
          be called with :func:`single_cutoff_forecast` and a sequence of
          iterables where each element is the tuple of arguments to pass to
          :func:`single_cutoff_forecast`

          .. code-block::

             class MyBackend:
                 def map(self, func, *iterables):
                     results = [
                         func(*args)
                         for args in zip(*iterables)
                     ]
                     return results

    disable_tqdm: if True, disables the progress bar that would otherwise be
        shown when parallel=None.

    Returns
    -------
    A pd.DataFrame with the forecast, actual value and cutoff.
    """

    df = model.history.copy().reset_index(drop=True)
    horizon = pd.Timedelta(horizon)

    predict_columns = ['ds', 'yhat']
    if model.uncertainty_samples:
        predict_columns.extend(['yhat_lower', 'yhat_upper'])

    # Identify largest seasonality period
    period_max = 0.
    for s in model.seasonalities.values():
        period_max = max(period_max, s['period'])
    seasonality_dt = pd.Timedelta(str(period_max) + ' days')

    if cutoffs is None:
        # Set period
        period = 0.5 * horizon if period is None else pd.Timedelta(period)

        # Set initial
        initial = (max(3 * horizon, seasonality_dt)
                   if initial is None else pd.Timedelta(initial))

        # Compute Cutoffs
        cutoffs = generate_cutoffs(df, horizon, initial, period)
    else:
        # Validate the supplied cutoffs: the earliest cutoff must be strictly
        # greater than the earliest date in the history.
        if min(cutoffs) <= df['ds'].min():
            raise ValueError(
                "Minimum cutoff value is not strictly greater than min date in history"
            )
        # max value of cutoffs is <= (end date minus horizon)
        end_date_minus_horizon = df['ds'].max() - horizon
        if max(cutoffs) > end_date_minus_horizon:
            raise ValueError(
                "Maximum cutoff value is greater than end date minus horizon, no value for cross-validation remaining"
            )
        initial = cutoffs[0] - df['ds'].min()

    # Check if the initial window
    # (that is, the amount of time between the start of the history and the first cutoff)
    # is less than the maximum seasonality period
    if initial < seasonality_dt:
        msg = 'Seasonality has period of {} days '.format(period_max)
        msg += 'which is larger than initial window. '
        msg += 'Consider increasing initial.'
        logger.warning(msg)

    if parallel:
        valid = {"threads", "processes", "dask"}

        if parallel == "threads":
            pool = concurrent.futures.ThreadPoolExecutor()
        elif parallel == "processes":
            pool = concurrent.futures.ProcessPoolExecutor()
        elif parallel == "dask":
            try:
                from dask.distributed import get_client
            except ImportError as e:
                raise ImportError("parallel='dask' requies the optional "
                                  "dependency dask.") from e
            pool = get_client()
            # delay df and model to avoid large objects in task graph.
            df, model = pool.scatter([df, model])
        elif hasattr(parallel, "map"):
            pool = parallel
        else:
            msg = ("'parallel' should be one of {} for an instance with a "
                   "'map' method".format(', '.join(valid)))
            raise ValueError(msg)

        iterables = ((df, model, cutoff, horizon, predict_columns)
                     for cutoff in cutoffs)
        iterables = zip(*iterables)

        logger.info("Applying in parallel with %s", pool)
        predicts = pool.map(single_cutoff_forecast, *iterables)
        if parallel == "dask":
            # convert Futures to DataFrames
            predicts = pool.gather(predicts)

    else:
        predicts = [
            single_cutoff_forecast(df, model, cutoff, horizon, predict_columns)
            for cutoff in (tqdm(cutoffs) if not disable_tqdm else cutoffs)
        ]

    # Combine all predicted pd.DataFrame into one pd.DataFrame
    return pd.concat(predicts, axis=0).reset_index(drop=True)
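
A brief usage sketch, not taken from the source: `m` stands for an already fitted Prophet model, the dask path assumes a dask.distributed Client has been started, and `MyBackend` is the custom backend class shown in the docstring above.

# Usage sketch, assuming `m` is a fitted Prophet model and MyBackend is defined.
df_cv = cross_validation(m, horizon='30 days')                         # serial, with tqdm bar

from dask.distributed import Client
client = Client()                                                      # hypothetical local cluster
df_cv = cross_validation(m, horizon='30 days', parallel='dask')        # needs an existing Client

df_cv = cross_validation(m, horizon='30 days', parallel=MyBackend())   # any object with .map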
Example #4
 def __getitem__(self, item) -> FutureWrapper:
     client = get_client()
     new_fut = client.submit(op.getitem, self, item)
     return FutureWrapper.from_future(new_fut)
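
For context, a hedged sketch of how these operator overloads might be exercised. It assumes `FutureWrapper` proxies a dask.distributed Future (so dask resolves it when passed to client.submit) and forwards `.result()`, and that `op` is the standard `operator` module; none of that is shown in the source.

# Hypothetical usage sketch: operators submit new cluster tasks instead of
# gathering results locally. Assumes FutureWrapper behaves like a dask Future.
from dask.distributed import Client

client = Client()
a = FutureWrapper.from_future(client.submit(lambda: True))
b = FutureWrapper.from_future(client.submit(lambda: False))

c = a ^ b              # runs op.xor(a, b) as a new task on the cluster
print(c.result())      # True, assuming FutureWrapper exposes the Future's .result()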
Example #5
 def session_run_at_end():
     client = get_client(address)
     print("Closed Dask client={}\n".format(client))
     client.shutdown()
     client.close()
     del client
Example #6
def get_chunking(filelist,
                 chunksize,
                 treename="Events",
                 workers=12,
                 skip_bad_files=False,
                 xrootd=False,
                 client=None,
                 use_dask=False):
    """
    Return 2-tuple of
    - chunks: triplets of (filename,entrystart,entrystop) calculated with input `chunksize` and `filelist`
    - total_nevents: total event count over `filelist`
    """
    import uproot3
    from tqdm.auto import tqdm
    import concurrent.futures

    if xrootd:
        temp = []
        for fname in filelist:
            if fname.startswith("/hadoop/cms"):
                temp.append(
                    fname.replace("/hadoop/cms",
                                  "root://redirector.t2.ucsd.edu/"))
            else:
                temp.append(
                    fname.replace("/store/",
                                  "root://xrootd.t2.ucsd.edu:2040//store/"))
        filelist = temp

    chunksize = int(chunksize)
    chunks = []
    nevents = 0

    if use_dask:
        # as_completed streams dask futures back as they finish (used below).
        from dask.distributed import as_completed, get_client
        if not client:
            client = get_client()

        def numentries(fname):
            import uproot3
            try:
                return (fname, uproot3.numentries(fname, treename))
            except Exception:
                # Flag unreadable files with -1 so they can be skipped or reported below.
                return (fname, -1)

        futures = client.map(numentries, filelist)
        info = []
        for future, result in tqdm(as_completed(futures, with_results=True),
                                   total=len(futures)):
            info.append(result)
        for fn, nentries in info:
            if nentries < 0:
                if skip_bad_files:
                    print("Skipping bad file: {}".format(fn))
                    continue
                else:
                    raise RuntimeError("Bad file: {}".format(fn))
            nevents += nentries
            for index in range(nentries // chunksize + 1):
                chunks.append((fn, chunksize * index,
                               min(chunksize * (index + 1), nentries)))
    else:
        if skip_bad_files:
            # slightly slower (serial loop), but can skip bad files
            for fname in tqdm(filelist):
                try:
                    items = uproot3.numentries(fname, treename,
                                               total=False).items()
                except (IndexError, ValueError) as e:
                    print("Skipping bad file", fname)
                    continue
                for fn, nentries in items:
                    nevents += nentries
                    for index in range(nentries // chunksize + 1):
                        chunks.append((fn, chunksize * index,
                                       min(chunksize * (index + 1), nentries)))
        else:
            executor = None if len(
                filelist) < 5 else concurrent.futures.ThreadPoolExecutor(
                    min(workers, len(filelist)))
            for fn, nentries in uproot3.numentries(filelist,
                                                   treename,
                                                   total=False,
                                                   executor=executor).items():
                nevents += nentries
                for index in range(nentries // chunksize + 1):
                    chunks.append((fn, chunksize * index,
                                   min(chunksize * (index + 1), nentries)))

    return chunks, nevents
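
A minimal usage sketch; the ROOT file paths below are placeholders, and the dask path reuses an already running dask.distributed Client.

# Usage sketch; the file paths are placeholders.
filelist = ["store/file1.root", "store/file2.root"]

# Thread-pool path (no dask): count entries with uproot3 directly.
chunks, nevents = get_chunking(filelist, chunksize=500_000)

# Dask path: fan the per-file entry counts out over an existing cluster.
from dask.distributed import Client
client = Client()
chunks, nevents = get_chunking(filelist, chunksize=500_000,
                               use_dask=True, client=client)

for fname, start, stop in chunks:
    ...  # each triplet is (filename, entrystart, entrystop)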
Example #7
 def __setstate__(self, state):
     super(Dask, self).__setstate__(state)
     self.client = get_client()
Example #8
def set_window(shear_zbins={},
               f_sky=0.3,
               nside=256,
               mask_start_pix=0,
               window_cl_fact=None,
               unit_win=False,
               scheduler_info=None,
               mask=None,
               delta_W=True):
    from skylens.skylens_main import Skylens
    w_lmax = 3 * nside
    l0 = np.arange(w_lmax, dtype='int')
    corr = ('galaxy', 'galaxy')

    kappa0 = Skylens(galaxy_zbins=shear_zbins,
                     do_cov=False,
                     bin_cl=False,
                     l_bins=None,
                     l=l0,
                     use_window=False,
                     corrs=[corr],
                     f_sky=f_sky,
                     scheduler_info=scheduler_info)
    cl0G = kappa0.cl_tomo()

    npix0 = hp.nside2npix(nside)

    npix = int(npix0 * f_sky)
    if mask is None:
        mask = np.zeros(npix0, dtype='bool')
        #     mask[int(npix):]=0
        mask[mask_start_pix:mask_start_pix + int(npix)] = 1

    cl_map0 = hp.ma(np.ones(npix0))
    cl_map0[~mask] = hp.UNSEEN

    if scheduler_info is None:
        client = get_client()
    else:
        client = get_client(address=scheduler_info['address'])

    for i in np.arange(shear_zbins['n_bins']):
        cl_i = client.compute(cl0G['cl'][corr][(i, i)]).result()
        if np.any(np.isnan(cl_i)):
            print('survey utils, set_window:', cl_i)
            raise ValueError('set_window: computed cl contains NaN')
        if unit_win:
            cl_map = hp.ma(np.ones(12 * nside * nside))
            # cl_i = 1
        else:
            cl_i += shear_zbins['SN']['galaxy'][:, i, i]
            if window_cl_fact is not None:
                cl_i *= window_cl_fact
            cl_map = hp.ma(1 + hp.synfast(cl_i, nside=nside))
        cl_map[cl_map <= 0] = 1.e-4
        cl_map[~mask] = hp.UNSEEN
        cl_t = hp.anafast(cl_map)
        #         cl_map/=cl_map[mask].mean()
        #         if not unit_win:
        #             cl_map/=np.sqrt(cl_t[0]) #this is important for shear map normalization in correlation functions.
        cl_map[~mask] = hp.UNSEEN
        cl_map_noise = np.sqrt(cl_map)
        cl_map_noise[~mask] = hp.UNSEEN
        # cl_map.mask=mask
        shear_zbins[i]['window_cl0'] = cl_i
        shear_zbins[i]['window'] = cl_map
        if delta_W:
            shear_zbins[i]['window_N'] = np.sqrt(shear_zbins[i]['window'])
            shear_zbins[i]['window_N'][~mask] = hp.UNSEEN
        else:
            print('not using delta_W window')
            shear_zbins[i]['window_N'] = np.sqrt(1. / shear_zbins[i]['window'])
            shear_zbins[i]['window_N'][~mask] = hp.UNSEEN
            shear_zbins[i]['window'][:] = 1
            shear_zbins[i]['window'][~mask] = hp.UNSEEN

    del cl0G, kappa0
    return shear_zbins
Example #9
def check_sq_variance(dataset_path,
                      dataset_id,
                      variable,
                      pngpath,
                      pbar,
                      debug=False):
    issues = []
    client = get_client()
    num_segments = 10
    pbar.total = num_segments + 2
    if pbar.n != 0:
        pbar.n = 0
        pbar.last_print_n = 0
        pbar.update()

    with xr.open_mfdataset(dataset_path + '/*.nc') as ds:

        segments = list(
            range(0, ds['time'].size, ds['time'].size // num_segments))
        vmax = np.zeros(ds['time'].size)
        futures = []

        if 'time' not in ds.coords:
            return [], dataset_id

        dims = list()
        possible_dims = ['depth', 'lat', 'lon', 'plev', 'tau', 'lev', 'sector']
        for i in possible_dims:
            if i in ds.dims:
                if i == 'sector':
                    dims.append('basin')
                else:
                    dims.append(i)

        dims = tuple(dims)
        if 'lat' not in dims and 'lon' not in dims:
            ds['means'] = ds[variable]
            # maxrollingstd = client.compute( ds['means'].std ).result()
            # maxrollingvar = client.compute( ds['means'].mean ).result()
        else:
            ds['means'] = client.compute(ds[variable].mean(dim=dims)).result()
            # maxrollingstd = ds['means'].std().compute()
            # maxrollingvar = ds['means'].mean().compute()
            # maxrollingstd = client.compute( ds['means'].std ).result()
            # maxrollingvar = client.compute( ds['means'].mean ).result()
        # import ipdb; ipdb.set_trace()
        ds['means'] = ds['means'][~np.isnan(ds['means'])]
        pbar.update(1)

        maxrollingvar = client.compute(
            ds['means'].rolling({'time': 120}, min_periods=1).mean())
        maxrollingstd = client.compute(
            ds['means'].rolling({'time': 120}, min_periods=1).std())
        for idx, seg in enumerate(segments):
            if idx == num_segments - 1:
                seg_end = ds['time'].size
            else:
                seg_end = segments[idx + 1]

            temp_ds = ds['time'][seg:seg_end]
            chunk = ds.sel(time=temp_ds)
            futures.append(
                client.submit(run_chunk, chunk, dataset_id, maxrollingvar,
                              maxrollingstd, (seg, seg_end), idx))

        for f in as_completed(futures):
            pbar.update(1)
            vx, issues, seg, threshold = f.result()
            vmax[seg[0]:seg[1]] = vx
        plot_minmaxmean(pngpath, ds, vmax, dataset_id, debug=debug)

    return issues
Example #10
def _cluster_mode():
    try:
        get_client()
        return True
    except ValueError:
        return False
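
A sketch of the kind of dispatch this helper enables: run on the attached dask cluster when one exists, otherwise fall back to a serial loop. The `run_all`, `process_one`, and `items` names are hypothetical.

# Hypothetical dispatch built on _cluster_mode().
from dask.distributed import get_client

def run_all(items, process_one):
    if _cluster_mode():
        client = get_client()
        futures = client.map(process_one, items)
        return client.gather(futures)
    return [process_one(item) for item in items]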
Example #11
def retry_with_timeout(func, retry_freq=10, n_tries=1, use_dask=True):
    """Execute ``func`` ``n_tries`` times, each time only allowing ``retry_freq``
    seconds for the function to complete. There are two main cases where this could be
    useful:

    1. You have a function that you know should execute quickly, but you may get
       occasional errors when running it simultaneously on a large number of workers. An
       example of this is massively parallelized I/O on netCDF files stored on GCS.
    2. You have a function that may or may not take a long time, but you want to skip it
       if it takes too long.

    There are two possible ways that this timeout function is implemented, each with
    pros and cons:

    1. Using python's native ``threading`` module. If you are executing ``func`` outside
       of a ``dask`` worker, you likely will want this approach. It may be slightly
       faster and has the benefit of starting the timeout clock when the function starts
       executing (rather than when the function is *submitted* to a dask scheduler).
       **Note**: This approach will also work if calling ``func`` *from* a dask worker,
       but only if the cluster was set up such that ``threads_per_worker=1``. Otherwise,
       this may cause issues if used from a dask worker.
    2. Using ``dask``. If you would like a dask worker to execute this function, you
       likely will want this approach. It can be executed from a dask worker regardless
       of the number of threads per worker (see above), but has the downside that the
       timeout clock begins once ``func`` is submitted, rather than when it begins
       executing.

    Parameters
    ----------
    func : callable
        The function you would like to execute with a timeout backoff.
    retry_freq : float
        The number of seconds to wait between successive retries of ``func``.
    n_tries : int
        The number of retries to attempt before raising an error if none were successful.
    use_dask : bool
        If true, will try to use the ``dask``-based implementation (see description
        above). If no ``Client`` instance is present, will fall back to
        ``use_dask=False``.

    Returns
    -------
    The return value of ``func``

    Raises
    ------
    dask.distributed.TimeoutError :
        If the function does not execute successfully in the specified ``retry_freq``,
        after trying ``n_tries`` times.
    ValueError :
        If ``use_dask=True``, and a ``Client`` instance is present, but this function is
        executed from the client (rather than as a task submitted to a worker), you will
        get ``ValueError("No workers found")``.

    Examples
    --------
    .. code-block:: python

        >>> import time
        >>> @retry_with_timeout(retry_freq=.5, n_tries=1)
        ... def wait_func(timeout):
        ...     time.sleep(timeout)
        >>> wait_func(.1)
        >>> wait_func(1)
        Traceback (most recent call last):
            ...
        asyncio.exceptions.TimeoutError: Func did not complete successfully in allowed time/number of retries.
    """

    # if use_dask specified, check if there is an active client, otherwise set to false
    if use_dask:
        try:
            dd.get_client()
        except ValueError:
            use_dask = False

    @functools.wraps(func)
    def inner(*args, **kwargs):
        if use_dask:
            # dask version
            with dd.worker_client() as client:
                for try_n in range(n_tries):
                    fut = client.submit(func, *args, **kwargs)
                    try:
                        return fut.result(timeout=retry_freq)
                    except dd.TimeoutError:
                        ...
        else:
            # non-dask version
            def this_func(q):
                args = q.get_nowait()
                kwargs = q.get_nowait()
                out = func(*args, **kwargs)
                q.put(out)

            for try_n in range(n_tries):
                q = queue.Queue()
                p = threading.Thread(target=this_func, args=(q, ))
                q.put_nowait(args)
                q.put_nowait(kwargs)
                p.start()
                p.join(timeout=retry_freq)
                if p.is_alive():
                    del p, q
                    continue
                elif q.qsize() == 0:
                    raise RuntimeError(
                        "Queue is not empty. Something malfunctined in ``func``"
                    )
                return q.get()
        raise dd.TimeoutError(
            "Func did not complete successfully in allowed time/number of retries."
        )

    return inner
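
Since ``func`` is the first positional argument, the wrapper can also be applied directly to an existing callable. A sketch under that assumption; ``slow_read`` is a made-up stand-in for any call that may hang.

# Usage sketch; slow_read is a hypothetical callable that may hang.
import time

def slow_read(path, delay):
    time.sleep(delay)
    return path

# Give each attempt 2 seconds and retry up to 3 times, without dask.
guarded_read = retry_with_timeout(slow_read, retry_freq=2, n_tries=3,
                                  use_dask=False)
guarded_read("data.nc", 0.1)   # returns "data.nc" quickly
# guarded_read("data.nc", 10) would raise dd.TimeoutError after ~6 seconds.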
Example #12
 def __init__(self):
     try:
         self._client = get_client()
     except ValueError:
         assert False, ("Should connect to Dask scheduler before"
                        " initializing this object.")
Example #13
def get_distributed_client():
    try:
        return get_client()
    except ValueError:
        return None