Example No. 1
def test_optimize():
    x = dask.delayed(inc)(1)
    y = dask.delayed(inc)(x)
    z = x + y

    x2, y2, z2, constant = optimize(x, y, z, 1)
    assert constant == 1

    # Same graphs for each
    dsk = dict(x2.dask)
    assert dict(y2.dask) == dsk
    assert dict(z2.dask) == dsk

    # Computationally equivalent
    assert dask.compute(x2, y2, z2) == dask.compute(x, y, z)

    # Applying optimizations before compute and during compute gives
    # same results. Shows optimizations are occurring.
    sols = dask.compute(x, y, z, optimizations=[inc_to_dec])
    x3, y3, z3 = optimize(x, y, z, optimizations=[inc_to_dec])
    assert dask.compute(x3, y3, z3) == sols

    # Optimize respects global optimizations as well
    with dask.config.set(optimizations=[inc_to_dec]):
        x4, y4, z4 = optimize(x, y, z)
    for a, b in zip([x3, y3, z3], [x4, y4, z4]):
        assert dict(a.dask) == dict(b.dask)
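The test above assumes helper functions `inc`, `dec`, and an `inc_to_dec` optimization defined elsewhere in dask's test suite. A minimal sketch of plausible definitions (hypothetical, not the exact upstream code) shows the shape a graph optimization takes:

def inc(x):
    return x + 1

def dec(x):
    return x - 1

def inc_to_dec(dsk, keys, **kwargs):
    # A dask optimization receives a low-level graph plus the output keys
    # and returns a rewritten graph; here every `inc` task becomes `dec`.
    dsk = dict(dsk)
    for key, task in dsk.items():
        if type(task) is tuple and task and task[0] is inc:
            dsk[key] = (dec,) + task[1:]
    return dsk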
Example No. 2
def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes(test_bucket_name+'/test/accounts*',
                           blocksize=blocksize, delimiter=b'\n', s3=s3)
    _, values2 = read_bytes(test_bucket_name+'/test/accounts*',
                            blocksize=blocksize, delimiter=b'foo', s3=s3)
    assert ([a.key for a in concat(values)] !=
            [b.key for b in concat(values2)])

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b'\n') for r in res)
    ourlines = b''.join(res).split(b'\n')
    testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
    assert ourlines == testlines

    # delimiter not at the end
    d = b'}'
    _, values = read_bytes(test_bucket_name+'/test/accounts*',
                           blocksize=blocksize, delimiter=d, s3=s3)
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b'}') for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
Example No. 3
def test_read_bytes_delimited():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            _, values = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'\n')
            _, values2 = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'foo')
            assert ([a.key for a in concat(values)] !=
                    [b.key for b in concat(values2)])

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b'\n') for r in res)
            ourlines = b''.join(res).split(b'\n')
            testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # delimiter not at the end
            d = b'}'
            _, values = read_bytes('.test.accounts*', blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b'}') for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
Example No. 4
def to_json(df, url_path, orient='records', lines=None, storage_options=None,
            compute=True, encoding='utf-8', errors='strict',
            compression=None, **kwargs):
    """Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string and there is more than one
        partition in df, it should include a glob character to expand into
        a set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to use, e.g., "utf-8", and how to respond to
        errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        Passed to pandas; if not specified, lines=True when
        orient='records', False otherwise.
    storage_options: dict
        Passed to the backend file-system implementation.
    compute: bool
        If True, executes immediately. If False, returns a set of delayed
        objects, which can be computed at a later time.
    compression : string or None
        String like 'gzip' or 'xz'.
    """
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    kwargs['orient'] = orient
    kwargs['lines'] = lines and orient == 'records'
    outfiles = open_files(
        url_path, 'wt', encoding=encoding,
        errors=errors,
        name_function=kwargs.pop('name_function', None),
        num=df.npartitions,
        compression=compression,
        **(storage_options or {})
    )
    parts = [dask.delayed(write_json_partition)(d, outfile, kwargs)
             for outfile, d in zip(outfiles, df.to_delayed())]
    if compute:
        dask.compute(parts)
        return [f.path for f in outfiles]
    else:
        return parts
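A minimal usage sketch of the `to_json` writer above, assuming a small dask DataFrame; file names are illustrative, and with compute=False the delayed writes can be batched into one dask.compute call:

import pandas as pd
import dask
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({'a': range(6), 'b': list('abcdef')}),
                     npartitions=2)

# One line-delimited JSON file per partition; '*' expands to the part number.
to_json(ddf, 'records-*.json')

# Deferred variant: collect the delayed writes and run them explicitly.
parts = to_json(ddf, 'records-*.json', compute=False)
dask.compute(parts)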
Example No. 5
def test_write_bytes(s3):
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    values = [delayed(v) for v in files.values()]
    out = core.write_bytes(values, paths)
    compute(*out)
    sample, values = read_bytes('s3://' + test_bucket_name + '/more/test/accounts.*')
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
Example No. 6
def test_inner_compute():
    x = da.ones(10, chunks=(5,)) + 1 + 2 + 3
    a = x.sum()
    y = x * 2 * 3 * 4
    b = y.sum()
    z = x * 2 * 3

    dask.compute(x, a, y, b, z)
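The point of the test above is that a single dask.compute call merges the graphs of all its arguments, so shared intermediates (here `x`) are built only once. A small sketch with dask.delayed makes the sharing observable (names are illustrative):

import dask
from dask import delayed

calls = []

@delayed
def load():
    calls.append('load')        # shared intermediate; runs once per compute
    return list(range(10))

data = load()
total = delayed(sum)(data)
largest = delayed(max)(data)

# One compute call: the two result graphs are merged and `load` executes once.
print(dask.compute(total, largest), len(calls))   # (45, 9) 1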
Example No. 7
def test_enforce_columns():
    blocks = [[b'aa,bb\n1,1.0\n2,2.0', b'10,20\n30,40'],
              [b'AA,bb\n1,1.0\n2,2.0', b'10,20\n30,40']]
    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)
    with pytest.raises(ValueError):
        dfs = read_csv_from_bytes(blocks, b'aa,bb\n', head, {},
                                  collection=False, enforce=True)
        compute(*dfs)
Example No. 8
def test_enforce_columns(reader, blocks):
    # Replace second header with different column name
    blocks = [blocks[0], [blocks[1][0].replace(b'a', b'A'), blocks[1][1]]]
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b'\n')[0] + b'\n'
    with pytest.raises(ValueError):
        dfs = text_blocks_to_pandas(reader, blocks, header, head, {},
                                    collection=False, enforce=True)
        dask.compute(*dfs, scheduler='sync')
Example No. 9
    def _compute_tasks(self, tasks, processes):
        """
        Compute all dask tasks
        """
        if processes is None:
            out = da.compute(*tasks, scheduler="single-threaded")
        else:
            out = da.compute(*tasks, num_workers=processes)
        return out
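A standalone restatement of the same scheduler-selection pattern (the free-function form and name are illustrative): fall back to the synchronous scheduler for debugging, otherwise let dask's default parallel scheduler use the requested number of workers.

import dask

def compute_tasks(tasks, processes=None):
    if processes is None:
        # Synchronous scheduler: easiest to step through with a debugger.
        return dask.compute(*tasks, scheduler='single-threaded')
    return dask.compute(*tasks, num_workers=processes)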
Example No. 10
File: csv.py Project: limx0/dask
def to_csv(df, filename, name_function=None, compression=None, compute=True,
           get=None, **kwargs):
    values = [_to_csv_chunk(d, **kwargs) for d in df.to_delayed()]
    values = write_bytes(values, filename, name_function, compression,
                         encoding=None)

    if compute:
        from dask import compute
        compute(*values, get=get)
    else:
        return values
Example No. 11
def main(input_file, dtypes, output_path):
    """Create plots from data in the input file"""

    data = pd.read_csv(input_file)
    new_file_name = f"{input_file}.parq"
    data.to_parquet(new_file_name)

    data_types = json.load(open(dtypes, "r"))
    plots = create_plots(new_file_name, data_types, output_path)
    with ProgressBar():
        dask.compute(*plots, scheduler="processes", n_workers=22)
Example No. 12
def test_to_textfiles(ext, myopen):
    b = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpdir() as dir:
        c = b.to_textfiles(os.path.join(dir, '*.' + ext), compute=False)
        dask.compute(*c, get=dask.get)
        assert os.path.exists(os.path.join(dir, '1.' + ext))

        f = myopen(os.path.join(dir, '1.' + ext), 'rb')
        text = f.read()
        if hasattr(text, 'decode'):
            text = text.decode()
        assert 'xyz' in text
        f.close()
Example No. 13
def test_read_text(fmt, bs, encoding):
    compress = compression.compress[fmt]
    files2 = dict((k, compress(v.encode(encoding))) for k, v in files.items())
    with filetexts(files2, mode='b'):
        b = read_text('.test.accounts.*.json', compression=fmt, blocksize=bs,
                encoding=encoding)
        L, = compute(b)
        assert ''.join(L) == expected

        blocks = read_text('.test.accounts.*.json', compression=fmt, blocksize=bs,
                encoding=encoding, collection=False)
        L = compute(*blocks)
        assert ''.join(line for block in L for line in block) == expected
Example No. 14
def test_simple_write(tmpdir):
    tmpdir = str(tmpdir)
    make_bytes = lambda: b'000'
    some_bytes = delayed(make_bytes)()
    data = [some_bytes, some_bytes]
    out = write_bytes(data, tmpdir)
    assert len(out) == 2
    compute(*out)
    files = os.listdir(tmpdir)
    assert len(files) == 2
    assert '0.part' in files
    d = open(os.path.join(tmpdir, files[0]), 'rb').read()
    assert d == b'000'
Example No. 15
def test_compressed_write(tmpdir):
    tmpdir = str(tmpdir)
    make_bytes = lambda: b'000'
    some_bytes = delayed(make_bytes)()
    data = [some_bytes, some_bytes]
    out = write_bytes(data, os.path.join(tmpdir, 'bytes-*.gz'),
                      compression='gzip')
    compute(*out)
    files = os.listdir(tmpdir)
    assert len(files) == 2
    assert 'bytes-0.gz' in files
    import gzip
    d = gzip.GzipFile(os.path.join(tmpdir, files[0])).read()
    assert d == b'000'
Example No. 16
def test_registered_read_bytes():
    from dask.bytes.core import read_bytes
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')

        results = compute(*concat(values))
        assert set(results) == set(files.values())
Example No. 17
def test_compression_binary(fmt):
    from dask.bytes.core import open_files
    files2 = valmap(compression.compress[fmt], files)
    with filetexts(files2, mode='b'):
        myfiles = open_files('.test.accounts.*', compression=fmt)
        data = compute(*[file.read() for file in myfiles])
        assert list(data) == [files[k] for k in sorted(files)]
Example No. 18
def test_registered_open_files():
    from dask.bytes.core import open_files
    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*')
        assert len(myfiles) == len(files)
        data = compute(*[file.read() for file in myfiles])
        assert list(data) == [files[k] for k in sorted(files)]
Example No. 19
def test_nout():
    func = delayed(lambda x: (x, -x), nout=2, pure=True)
    x = func(1)
    assert len(x) == 2
    a, b = x
    assert compute(a, b) == (1, -1)
    assert a._length is None
    assert b._length is None
    pytest.raises(TypeError, lambda: len(a))
    pytest.raises(TypeError, lambda: list(a))

    pytest.raises(ValueError, lambda: delayed(add, nout=-1))
    pytest.raises(ValueError, lambda: delayed(add, nout=True))

    func = delayed(add, nout=None)
    a = func(1)
    assert a._length is None
    pytest.raises(TypeError, lambda: list(a))
    pytest.raises(TypeError, lambda: len(a))

    func = delayed(lambda x: (x,), nout=1, pure=True)
    x = func(1)
    assert len(x) == 1
    a, = x
    assert a.compute() == 1
    assert a._length is None
    pytest.raises(TypeError, lambda: len(a))

    func = delayed(lambda x: tuple(), nout=0, pure=True)
    x = func(1)
    assert len(x) == 0
    assert x.compute() == tuple()
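`nout=` tells delayed how many values the wrapped function returns, which is what makes the Delayed result unpackable and gives it a length. A short sketch with an illustrative two-output function:

from dask import compute, delayed

def divmod_(a, b):
    return a // b, a % b

# nout=2 makes the Delayed unpackable into two Delayed objects.
ddivmod = delayed(divmod_, nout=2, pure=True)
q, r = ddivmod(17, 5)
print(compute(q, r))   # (3, 2)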
Example No. 20
def test_custom_delayed():
    x = Tuple({'a': 1, 'b': 2, 'c': (add, 'a', 'b')}, ['a', 'b', 'c'])
    x2 = delayed(add, pure=True)(x, (4, 5, 6))
    n = delayed(len, pure=True)(x)
    assert delayed(len, pure=True)(x).key == n.key
    assert x2.compute() == (1, 2, 3, 4, 5, 6)
    assert compute(n, x2, x) == (3, (1, 2, 3, 4, 5, 6), (1, 2, 3))
Example No. 21
def test_enforce_dtypes(reader, blocks):
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b'\n')[0] + b'\n'
    dfs = text_blocks_to_pandas(reader, blocks, header, head, {},
                                collection=False)
    dfs = dask.compute(*dfs, scheduler='sync')
    assert all(df.dtypes.to_dict() == head.dtypes.to_dict() for df in dfs)
Example No. 22
def test_registered_open_text_files(s3):
    from dask.bytes.core import open_text_files
    myfiles = open_text_files('s3://' + test_bucket_name + '/test/accounts.*.json',
                              s3=s3)
    assert len(myfiles) == len(files)
    data = compute(*[file.read() for file in myfiles])
    assert list(data) == [files[k].decode() for k in sorted(files)]
Example No. 23
File: dsk.py Project: chmp/flowly
    def compute(self, **kwargs):
        items = list(self.items())
        keys = [key for key, _ in items]
        values = [value for _, value in items]
        values = dask.compute(*values, **kwargs)

        return dask_dict(zip(keys, values))
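The method above evaluates every value of a dict-like wrapper in one dask.compute call and rebuilds the mapping. The same idea as a standalone helper for plain dicts (a sketch, not part of the flowly project):

import dask
from dask import delayed

def compute_dict(d, **kwargs):
    # Compute all values of a mapping together, preserving the keys.
    keys = list(d)
    values = dask.compute(*(d[k] for k in keys), **kwargs)
    return dict(zip(keys, values))

lazy = {'a': delayed(sum)([1, 2, 3]), 'b': delayed(max)([1, 2, 3])}
print(compute_dict(lazy))   # {'a': 6, 'b': 3}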
Example No. 24
def test_open_files_write(tmpdir):
    tmpdir = str(tmpdir)
    f = open_file_write([os.path.join(tmpdir, 'test1'),
                         os.path.join(tmpdir, 'test2')])
    assert len(f) == 2
    files = compute(*f)
    assert files[0].mode == 'wb'
Example No. 25
def test_registered_open_files(s3):
    from dask.bytes.core import open_files

    myfiles = open_files("s3://%s/test/accounts.*.json" % test_bucket_name, s3=s3)
    assert len(myfiles) == len(files)
    data = compute(*[file.read() for file in myfiles])
    assert list(data) == [files[k] for k in sorted(files)]
Example No. 26
def test_registered(s3):
    from dask.bytes.core import read_bytes

    sample, values = read_bytes("s3://%s/test/accounts.*.json" % test_bucket_name, s3=s3)

    results = compute(*concat(values))
    assert set(results) == set(files.values())
Example No. 27
def l1_lpsolver_parallel(obs_phase, freqs, sigma_max = np.pi, fout=0.5, solve_cs=True, problem_name="l1_tec_solver",num_threads = None):
    '''Solve the tec and cs for multiple datasets.
    `obs_phase` : `numpy.ndarray`
        the measured phase with shape (num_freqs, num_datasets)
    `freqs` : `numpy.ndarray`
        the frequencies at the datapoints (num_freqs,)
    `sigma_max` : (optional) `float`
        the maximum allowed deviation for outlier detection. default np.pi
    `fout` : (optional) `float`
        The maximum fraction of allowed outliers out of total number of datapoints. default 0.5
    `solve_cs` : (optional) bool
        Whether to solve cs (True)
    `num_threads` : (optional) `int`
        number of parallel threads to run. default None is num_cpu
    `problem_name` : (optional) `str`
        name of problem "l1_tec_solver"
    '''
    from dask import delayed, compute
    from dask.threaded import get
    from functools import partial
    dsk = {}
    assert len(obs_phase.shape) == 2, "obs_phase not dim 2 {}".format(obs_phase.shape)
    N = obs_phase.shape[1]
    values = [delayed(partial(l1_lpsolver, sigma_max=sigma_max, fout=fout,solve_cs=solve_cs, problem_name="{}{:03d}".format(problem_name,i)), pure=True)( obs_phase[:,i], freqs) for i in range(N)]
    #client = Client()
    results = compute(*values, get=get, num_workers=num_threads)
    return results
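The `get=get` keyword used above is the pre-0.19 dask API; current releases select a scheduler by name instead. Assuming the rest of the function is unchanged, the equivalent call today would look like this (a sketch, reusing `values` and `num_threads` from above):

from dask import compute

results = compute(*values, scheduler='threads', num_workers=num_threads)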
Example No. 28
def test_to_textfiles_encoding():
    b = db.from_sequence([u'汽车', u'苹果', u'天气'], npartitions=2)
    for ext, myopen in [('gz', GzipFile), ('bz2', BZ2File), ('', open)]:
        if ext == 'bz2' and PY2:
            continue
        with tmpdir() as dir:
            c = b.to_textfiles(os.path.join(dir, '*.' + ext), encoding='gb18030', compute=False)
            dask.compute(*c)
            assert os.path.exists(os.path.join(dir, '1.' + ext))

            f = myopen(os.path.join(dir, '1.' + ext), 'rb')
            text = f.read()
            if hasattr(text, 'decode'):
                text = text.decode('gb18030')
            assert u'天气' in text
            f.close()
Example No. 29
def compute_with_trace(*args):
    """Do Dask compute(), but with added Eliot tracing.

    Dask is a graph of tasks, but Eliot logs trees.  So we need to emulate a
    graph using a tree.  We do this by making an Eliot action for each task, but
    having it list the tasks it depends on.

    We use the following algorithm:

        1. Create a top-level action.

        2. For each entry in the dask graph, create a child with
           serialize_task_id.  Do this in likely order of execution, so that
           if B depends on A the task level of B is higher than the task level
           of A.

        3. Replace each function with a wrapper that uses the corresponding
           task ID (with Action.continue_task), and while it's at it also
           records which other things this function depends on.

    Known issues:

        1. Retries will confuse Eliot.  Probably need different
           distributed-tree mechanism within Eliot to solve that.
    """
    # 1. Create top-level Eliot Action:
    with start_action(action_type="dask:compute"):
        # In order to reduce logging verbosity, add logging to the already
        # optimized graph:
        optimized = optimize(*args, optimizations=[_add_logging])
        return compute(*optimized, optimize_graph=False)
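The `_add_logging` pass handed to optimize() above follows dask's optimization-callback shape: a callable taking the low-level graph and the output keys and returning a rewritten graph. A minimal pass-through sketch of that shape (it only reports the graph size; `_add_logging` itself wraps each task's callable with Eliot bookkeeping):

def count_tasks(dsk, keys, **kwargs):
    # (graph, keys) -> graph; returning dsk unchanged makes this a no-op pass.
    print('optimizing a graph with %d tasks' % len(dict(dsk)))
    return dsk

# e.g. dask.compute(x, optimizations=[count_tasks])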
Example No. 30
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode='b'):
        sample, vals = read_bytes('.test.account*', blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b'\n')
        testlines = b"".join(files.values()).split(b'\n')
        assert set(ourlines) == set(testlines)
Example No. 31
def run_TPI(p, client=None):
    '''
    Solve for transition path equilibrium of OG-USA.

    Args:
        p (OG-USA Specifications object): model parameters
        client (Dask client object): client

    Returns:
        output (dictionary): dictionary with transition path solution
            results

    '''
    # unpack tuples of parameters
    initial_values, ss_vars, theta, baseline_values = get_initial_SS_values(p)
    (B0, b_sinit, b_splus1init, factor, initial_b, initial_n,
     D0) = initial_values
    (TRbaseline, Gbaseline) = baseline_values

    print('Government spending breakpoints are tG1: ', p.tG1, '; and tG2:',
          p.tG2)

    # Initialize guesses at time paths
    # Make array of initial guesses for labor supply and savings
    guesses_b = utils.get_initial_path(initial_b, ss_vars['bssmat_splus1'], p,
                                       'ratio')
    guesses_n = utils.get_initial_path(initial_n, ss_vars['nssmat'], p,
                                       'ratio')
    b_mat = guesses_b
    n_mat = guesses_n
    ind = np.arange(p.S)

    # Get path for aggregate savings and labor supply
    L_init = np.ones((p.T + p.S, )) * ss_vars['Lss']
    B_init = np.ones((p.T + p.S, )) * ss_vars['Bss']
    L_init[:p.T] = aggr.get_L(n_mat[:p.T], p, 'TPI')
    B_init[1:p.T] = aggr.get_B(b_mat[:p.T], p, 'TPI', False)[:p.T - 1]
    B_init[0] = B0

    if not p.small_open:
        if p.budget_balance:
            K_init = B_init
        else:
            K_init = B_init * ss_vars['Kss'] / ss_vars['Bss']
    else:
        K_init = firm.get_B(L_init, p.firm_r, p, 'TPI')

    K = K_init
    K_d = K_init * ss_vars['K_d_ss'] / ss_vars['Kss']
    K_f = K_init * ss_vars['K_f_ss'] / ss_vars['Kss']

    L = L_init
    B = B_init
    Y = np.zeros_like(K)
    Y[:p.T] = firm.get_Y(K[:p.T], L[:p.T], p, 'TPI')
    Y[p.T:] = ss_vars['Yss']
    r = np.zeros_like(Y)
    if not p.small_open:
        r[:p.T] = firm.get_r(Y[:p.T], K[:p.T], p, 'TPI')
        r[p.T:] = ss_vars['rss']
    else:
        r = p.firm_r
    # compute w
    w = np.zeros_like(r)
    w[:p.T] = firm.get_w_from_r(r[:p.T], p, 'TPI')
    w[p.T:] = ss_vars['wss']
    r_gov = fiscal.get_r_gov(r, p)
    if p.budget_balance:
        r_hh = r
    else:
        r_hh = aggr.get_r_hh(r, r_gov, K, ss_vars['Dss'])
    if p.small_open:
        r_hh = p.hh_r

    BQ0 = aggr.get_BQ(r[0], initial_b, None, p, 'SS', True)
    if not p.use_zeta:
        BQ = np.zeros((p.T + p.S, p.J))
        for j in range(p.J):
            BQ[:, j] = (list(np.linspace(BQ0[j], ss_vars['BQss'][j], p.T)) +
                        [ss_vars['BQss'][j]] * p.S)
        BQ = np.array(BQ)
    else:
        BQ = (list(np.linspace(BQ0, ss_vars['BQss'], p.T)) +
              [ss_vars['BQss']] * p.S)
        BQ = np.array(BQ)
    if p.budget_balance:
        if np.abs(ss_vars['TR_ss']) < 1e-13:
            TR_ss2 = 0.0  # sometimes SS is very small but not zero,
            # even if taxes are zero, this gets rid of the approximation
            # error, which affects the perc changes below
        else:
            TR_ss2 = ss_vars['TR_ss']
        TR = np.ones(p.T + p.S) * TR_ss2
        total_revenue = TR
        G = np.zeros(p.T + p.S)
    elif not p.baseline_spending:
        TR = p.alpha_T * Y
        G = np.ones(p.T + p.S) * ss_vars['Gss']
    elif p.baseline_spending:
        TR = TRbaseline
        TR_new = p.TR  # Need to set TR_new for later reference
        G = Gbaseline
        G_0 = Gbaseline[0]

    # Initialize some starting values
    if p.budget_balance:
        D = np.zeros(p.T + p.S)
    else:
        D = np.ones(p.T + p.S) * ss_vars['Dss']
    if ss_vars['Dss'] == 0:
        D_d = np.zeros(p.T + p.S)
        D_f = np.zeros(p.T + p.S)
    else:
        D_d = D * ss_vars['D_d_ss'] / ss_vars['Dss']
        D_f = D * ss_vars['D_f_ss'] / ss_vars['Dss']
    total_revenue = np.ones(p.T + p.S) * ss_vars['total_revenue_ss']

    TPIiter = 0
    TPIdist = 10
    euler_errors = np.zeros((p.T, 2 * p.S, p.J))
    TPIdist_vec = np.zeros(p.maxiter)

    # TPI loop
    while (TPIiter < p.maxiter) and (TPIdist >= p.mindist_TPI):
        r_gov[:p.T] = fiscal.get_r_gov(r[:p.T], p)
        if p.budget_balance:
            r_hh[:p.T] = r[:p.T]
        else:
            K[:p.T] = firm.get_K_from_Y(Y[:p.T], r[:p.T], p, 'TPI')
            r_hh[:p.T] = aggr.get_r_hh(r[:p.T], r_gov[:p.T], K[:p.T], D[:p.T])
        if p.small_open:
            r_hh[:p.T] = p.hh_r[:p.T]

        outer_loop_vars = (r, w, r_hh, BQ, TR, theta)

        euler_errors = np.zeros((p.T, 2 * p.S, p.J))
        lazy_values = []
        for j in range(p.J):
            guesses = (guesses_b[:, :, j], guesses_n[:, :, j])
            lazy_values.append(
                delayed(inner_loop)(guesses, outer_loop_vars, initial_values,
                                    j, ind, p))
        results = compute(*lazy_values,
                          scheduler=dask.multiprocessing.get,
                          num_workers=p.num_workers)
        for j, result in enumerate(results):
            euler_errors[:, :, j], b_mat[:, :, j], n_mat[:, :, j] = result

        bmat_s = np.zeros((p.T, p.S, p.J))
        bmat_s[0, 1:, :] = initial_b[:-1, :]
        bmat_s[1:, 1:, :] = b_mat[:p.T - 1, :-1, :]
        bmat_splus1 = np.zeros((p.T, p.S, p.J))
        bmat_splus1[:, :, :] = b_mat[:p.T, :, :]

        etr_params_4D = np.tile(
            p.etr_params.reshape(p.T, p.S, 1, p.etr_params.shape[2]),
            (1, 1, p.J, 1))
        bqmat = household.get_bq(BQ, None, p, 'TPI')
        trmat = household.get_tr(TR, None, p, 'TPI')
        tax_mat = tax.total_taxes(r_hh[:p.T], w[:p.T], bmat_s,
                                  n_mat[:p.T, :, :], bqmat[:p.T, :, :], factor,
                                  trmat[:p.T, :, :], theta, 0, None, False,
                                  'TPI', p.e, etr_params_4D, p)
        r_hh_path = utils.to_timepath_shape(r_hh)
        wpath = utils.to_timepath_shape(w)
        c_mat = household.get_cons(r_hh_path[:p.T, :, :], wpath[:p.T, :, :],
                                   bmat_s, bmat_splus1, n_mat[:p.T, :, :],
                                   bqmat[:p.T, :, :], tax_mat, p.e,
                                   p.tau_c[:p.T, :, :], p)
        y_before_tax_mat = (r_hh_path[:p.T, :, :] * bmat_s[:p.T, :, :] +
                            wpath[:p.T, :, :] * p.e * n_mat[:p.T, :, :])

        if not p.baseline_spending and not p.budget_balance:
            Y[:p.T] = TR[:p.T] / p.alpha_T[:p.T]  # maybe unnecessary

            (total_rev, T_Ipath, T_Ppath, T_BQpath,
             T_Wpath, T_Cpath, business_revenue) = aggr.revenue(
                 r_hh[:p.T], w[:p.T], bmat_s, n_mat[:p.T, :, :],
                 bqmat[:p.T, :, :], c_mat[:p.T, :, :], Y[:p.T], L[:p.T],
                 K[:p.T], factor, theta, etr_params_4D, p, 'TPI')
            total_revenue[:p.T] = total_rev
            # set initial debt value
            if p.baseline:
                D0 = p.initial_debt_ratio * Y[0]
            if not p.baseline_spending:
                G_0 = p.alpha_G[0] * Y[0]
            dg_fixed_values = (Y, total_revenue, TR, D0, G_0)
            Dnew, G[:p.T] = fiscal.D_G_path(r_gov, dg_fixed_values, Gbaseline,
                                            p)
            # Fix initial amount of foreign debt holding
            D_f[0] = p.initial_foreign_debt_ratio * Dnew[0]
            for t in range(1, p.T):
                D_f[t + 1] = (D_f[t] / (np.exp(p.g_y) * (1 + p.g_n[t + 1])) +
                              p.zeta_D[t] * (Dnew[t + 1] -
                                             (Dnew[t] / (np.exp(p.g_y) *
                                                         (1 + p.g_n[t + 1])))))
            D_d[:p.T] = Dnew[:p.T] - D_f[:p.T]
        else:  # if budget balance
            Dnew = np.zeros(p.T + 1)
            G[:p.T] = np.zeros(p.T)
            D_f[:p.T] = np.zeros(p.T)
            D_d[:p.T] = np.zeros(p.T)

        L[:p.T] = aggr.get_L(n_mat[:p.T], p, 'TPI')
        B[1:p.T] = aggr.get_B(bmat_splus1[:p.T], p, 'TPI', False)[:p.T - 1]
        K_demand_open = firm.get_K(L[:p.T], p.firm_r[:p.T], p, 'TPI')
        K_d[:p.T] = B[:p.T] - D_d[:p.T]
        if np.any(K_d < 0):
            print('K_d has negative elements. Setting them ' +
                  'positive to prevent NAN.')
            K_d[:p.T] = np.fmax(K_d[:p.T], 0.05 * B[:p.T])
        K_f[:p.T] = p.zeta_K[:p.T] * (K_demand_open - B[:p.T] + D_d[:p.T])
        K = K_f + K_d
        if np.any(B < 0):
            print('B has negative elements. B[0:9]:', B[0:9])
            print('B[T-2:T]:', B[p.T - 2:p.T])
        if p.small_open:
            K[:p.T] = K_demand_open
        Ynew = firm.get_Y(K[:p.T], L[:p.T], p, 'TPI')
        rnew = r.copy()
        if not p.small_open:
            rnew[:p.T] = firm.get_r(Ynew[:p.T], K[:p.T], p, 'TPI')
        else:
            rnew[:p.T] = r[:p.T].copy()
        r_gov_new = fiscal.get_r_gov(rnew, p)
        if p.budget_balance:
            r_hh_new = rnew[:p.T]
        else:
            r_hh_new = aggr.get_r_hh(rnew[:p.T], r_gov_new[:p.T], K[:p.T],
                                     Dnew[:p.T])
        if p.small_open:
            r_hh_new = p.hh_r[:p.T]
        # compute w
        wnew = firm.get_w_from_r(rnew[:p.T], p, 'TPI')

        b_mat_shift = np.append(np.reshape(initial_b, (1, p.S, p.J)),
                                b_mat[:p.T - 1, :, :],
                                axis=0)
        BQnew = aggr.get_BQ(r_hh_new[:p.T], b_mat_shift, None, p, 'TPI', False)
        bqmat_new = household.get_bq(BQnew, None, p, 'TPI')
        (total_rev, T_Ipath, T_Ppath, T_BQpath,
         T_Wpath, T_Cpath, business_revenue) = aggr.revenue(
             r_hh_new[:p.T], wnew[:p.T], bmat_s, n_mat[:p.T, :, :],
             bqmat_new[:p.T, :, :], c_mat[:p.T, :, :], Ynew[:p.T], L[:p.T],
             K[:p.T], factor, theta, etr_params_4D, p, 'TPI')
        total_revenue[:p.T] = total_rev

        if p.budget_balance:
            TR_new = total_revenue
        elif not p.baseline_spending:
            TR_new = p.alpha_T[:p.T] * Ynew[:p.T]
        # If baseline_spending==True, no need to update TR, it's fixed

        # update vars for next iteration
        w[:p.T] = wnew[:p.T]
        r[:p.T] = utils.convex_combo(rnew[:p.T], r[:p.T], p.nu)
        BQ[:p.T] = utils.convex_combo(BQnew[:p.T], BQ[:p.T], p.nu)
        D[:p.T] = Dnew[:p.T]
        Y[:p.T] = utils.convex_combo(Ynew[:p.T], Y[:p.T], p.nu)
        if not p.baseline_spending:
            TR[:p.T] = utils.convex_combo(TR_new[:p.T], TR[:p.T], p.nu)
        guesses_b = utils.convex_combo(b_mat, guesses_b, p.nu)
        guesses_n = utils.convex_combo(n_mat, guesses_n, p.nu)
        print('r diff: ', (rnew[:p.T] - r[:p.T]).max(),
              (rnew[:p.T] - r[:p.T]).min())
        print('BQ diff: ', (BQnew[:p.T] - BQ[:p.T]).max(),
              (BQnew[:p.T] - BQ[:p.T]).min())
        print('TR diff: ', (TR_new[:p.T] - TR[:p.T]).max(),
              (TR_new[:p.T] - TR[:p.T]).min())
        print('Y diff: ', (Ynew[:p.T] - Y[:p.T]).max(),
              (Ynew[:p.T] - Y[:p.T]).min())
        if not p.baseline_spending:
            if TR.all() != 0:
                TPIdist = np.array(
                    list(utils.pct_diff_func(rnew[:p.T], r[:p.T])) + list(
                        utils.pct_diff_func(BQnew[:p.T], BQ[:p.T]).flatten()) +
                    list(utils.pct_diff_func(TR_new[:p.T], TR[:p.T]))).max()
            else:
                TPIdist = np.array(
                    list(utils.pct_diff_func(rnew[:p.T], r[:p.T])) + list(
                        utils.pct_diff_func(BQnew[:p.T], BQ[:p.T]).flatten()) +
                    list(np.abs(TR[:p.T]))).max()
        else:
            TPIdist = np.array(
                list(utils.pct_diff_func(rnew[:p.T], r[:p.T])) +
                list(utils.pct_diff_func(BQnew[:p.T], BQ[:p.T]).flatten()) +
                list(utils.pct_diff_func(Ynew[:p.T], Y[:p.T]))).max()

        TPIdist_vec[TPIiter] = TPIdist
        # After T=10, if cycling occurs, drop the value of nu
        # wait til after T=10 or so, because sometimes there is a jump up
        # in the first couple iterations
        # if TPIiter > 10:
        #     if TPIdist_vec[TPIiter] - TPIdist_vec[TPIiter - 1] > 0:
        #         nu /= 2
        #         print 'New Value of nu:', nu
        TPIiter += 1
        print('Iteration:', TPIiter)
        print('\tDistance:', TPIdist)

    # Compute effective and marginal tax rates for all agents
    mtrx_params_4D = np.tile(
        p.mtrx_params.reshape(p.T, p.S, 1, p.mtrx_params.shape[2]),
        (1, 1, p.J, 1))
    mtry_params_4D = np.tile(
        p.mtry_params.reshape(p.T, p.S, 1, p.mtry_params.shape[2]),
        (1, 1, p.J, 1))

    e_3D = np.tile(p.e.reshape(1, p.S, p.J), (p.T, 1, 1))
    mtry_path = tax.MTR_income(r_hh_path[:p.T], wpath[:p.T],
                               bmat_s[:p.T, :, :], n_mat[:p.T, :, :], factor,
                               True, e_3D, etr_params_4D, mtry_params_4D, p)
    mtrx_path = tax.MTR_income(r_hh_path[:p.T], wpath[:p.T],
                               bmat_s[:p.T, :, :], n_mat[:p.T, :, :], factor,
                               False, e_3D, etr_params_4D, mtrx_params_4D, p)
    etr_path = tax.ETR_income(r_hh_path[:p.T], wpath[:p.T], bmat_s[:p.T, :, :],
                              n_mat[:p.T, :, :], factor, e_3D, etr_params_4D,
                              p)

    C = aggr.get_C(c_mat, p, 'TPI')
    # Note that implicit in this computation is the assumption that
    # immigrants' wealth is all in the form of private capital
    I_d = aggr.get_I(bmat_splus1[:p.T], K_d[1:p.T + 1], K_d[:p.T], p, 'TPI')
    I = aggr.get_I(bmat_splus1[:p.T], K[1:p.T + 1], K[:p.T], p, 'TPI')
    # solve resource constraint
    # net foreign borrowing
    new_borrowing_f = (D_f[1:p.T + 1] * np.exp(p.g_y) *
                       (1 + p.g_n[1:p.T + 1]) - D_f[:p.T])
    debt_service_f = D_f * r_hh
    RC_error = aggr.resource_constraint(Y[:p.T - 1], C[:p.T - 1], G[:p.T - 1],
                                        I_d[:p.T - 1], K_f[:p.T - 1],
                                        new_borrowing_f[:p.T - 1],
                                        debt_service_f[:p.T - 1],
                                        r_hh[:p.T - 1], p)

    # Compute total investment (not just domestic)
    I_total = ((1 + p.g_n[:p.T]) * np.exp(p.g_y) * K[1:p.T + 1] -
               (1.0 - p.delta) * K[:p.T])

    # Compute income tax revenues
    tax_rev = aggr.get_L(T_Ipath, p, 'TPI')
    payroll_tax_revenue = p.frac_tax_payroll[:p.T] * tax_rev[:p.T]
    iit_revenue = tax_rev[:p.T] - payroll_tax_revenue

    # Compute resource constraint error
    rce_max = np.amax(np.abs(RC_error))
    print('Max absolute value resource constraint error:', rce_max)

    print('Checking time path for violations of constraints.')
    for t in range(p.T):
        household.constraint_checker_TPI(b_mat[t], n_mat[t], c_mat[t], t,
                                         p.ltilde)

    eul_savings = euler_errors[:, :p.S, :].max(1).max(1)
    eul_laborleisure = euler_errors[:, p.S:, :].max(1).max(1)

    print('Max Euler error, savings: ', eul_savings)
    print('Max Euler error labor supply: ', eul_laborleisure)
    '''
    ------------------------------------------------------------------------
    Save variables/values so they can be used in other modules
    ------------------------------------------------------------------------
    '''

    output = {
        'Y': Y[:p.T],
        'B': B,
        'K': K,
        'K_f': K_f,
        'K_d': K_d,
        'L': L,
        'C': C,
        'I': I,
        'I_total': I_total,
        'I_d': I_d,
        'BQ': BQ,
        'total_revenue': total_revenue,
        'business_revenue': business_revenue,
        'IITpayroll_revenue': T_Ipath,
        'iit_revenue': iit_revenue,
        'payroll_tax_revenue': payroll_tax_revenue,
        'TR': TR,
        'T_P': T_Ppath,
        'T_BQ': T_BQpath,
        'T_W': T_Wpath,
        'T_C': T_Cpath,
        'G': G,
        'D': D,
        'D_f': D_f,
        'D_d': D_d,
        'r': r,
        'r_gov': r_gov,
        'r_hh': r_hh,
        'w': w,
        'bmat_splus1': bmat_splus1,
        'bmat_s': bmat_s[:p.T, :, :],
        'n_mat': n_mat[:p.T, :, :],
        'c_path': c_mat,
        'bq_path': bqmat,
        'tr_path': trmat,
        'y_before_tax_mat': y_before_tax_mat,
        'tax_path': tax_mat,
        'eul_savings': eul_savings,
        'eul_laborleisure': eul_laborleisure,
        'resource_constraint_error': RC_error,
        'new_borrowing_f': new_borrowing_f,
        'debt_service_f': debt_service_f,
        'etr_path': etr_path,
        'mtrx_path': mtrx_path,
        'mtry_path': mtry_path
    }

    tpi_dir = os.path.join(p.output_base, "TPI")
    utils.mkdirs(tpi_dir)
    tpi_vars = os.path.join(tpi_dir, "TPI_vars.pkl")
    with open(tpi_vars, "wb") as f:
        pickle.dump(output, f)

    if np.any(G < 0):
        print('Government spending is negative along transition path' +
              ' to satisfy budget')

    if (((TPIiter >= p.maxiter) or (np.absolute(TPIdist) > p.mindist_TPI))
            and ENFORCE_SOLUTION_CHECKS):
        raise RuntimeError('Transition path equilibrium not found' +
                           ' (TPIdist)')

    if ((np.any(np.absolute(RC_error) >= p.mindist_TPI * 10))
            and ENFORCE_SOLUTION_CHECKS):
        raise RuntimeError('Transition path equilibrium not found ' +
                           '(RC_error)')

    if ((np.any(np.absolute(eul_savings) >= p.mindist_TPI) or
         (np.any(np.absolute(eul_laborleisure) > p.mindist_TPI)))
            and ENFORCE_SOLUTION_CHECKS):
        raise RuntimeError('Transition path equilibrium not found ' +
                           '(eulers)')

    return output
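The dask-specific step in the loop above is the fan-out over ability types `j`: one delayed inner_loop call per type, computed with the multiprocessing scheduler. `scheduler=dask.multiprocessing.get` is the old spelling; with current dask the same step would be requested by name (a sketch, reusing the names from the function above):

from dask import compute, delayed

lazy_values = [
    delayed(inner_loop)((guesses_b[:, :, j], guesses_n[:, :, j]),
                        outer_loop_vars, initial_values, j, ind, p)
    for j in range(p.J)
]
results = compute(*lazy_values, scheduler='processes',
                  num_workers=p.num_workers)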
Example No. 32
def open_mfdataset(
    paths,
    chunks=None,
    concat_dim="time",
    compat="no_conflicts",
    preprocess=None,
    engine=None,
    lock=None,
    data_vars="all",
    coords="different",
    combine="nested",
    autoclose=None,
    parallel=False,
    join="outer",
    attrs_file=None,
    **kwargs,
):
    """Helper function for opening multiple files as an xarray_ dataset.
	Adapted from upstream implementation_. See docs_ for usage.

	.. todo::

                To be removed when a backend entrypoint_ is implemented.

	.. _implementation: https://github.com/pydata/xarray/blob/484d1ce5ff8969b6ca6fa942b344379725f33b9c/xarray/backends/api.py#L726
	.. _docs: https://xarray.pydata.org/en/v0.15.1/generated/xarray.open_mfdataset.html
	.. _entrypoint: https://github.com/pydata/xarray/pull/3166

	"""
    if isinstance(paths, str):
        paths = sorted(glob(paths))
    else:
        paths = [str(p) if isinstance(p, Path) else p for p in paths]

    if not paths:
        raise OSError("no files to open")

    # If combine='by_coords' then this is unnecessary, but quick.
    # If combine='nested' then this creates a flat list which is easier to
    # iterate over, while saving the originally-supplied structure as "ids"
    if combine == "nested":
        if isinstance(concat_dim, (str, xr.DataArray)) or concat_dim is None:
            concat_dim = [concat_dim]

    open_kwargs = dict()

    if parallel:
        import dask

        # wrap the open_dataset, getattr, and preprocess with delayed
        open_ = dask.delayed(open_dataset)
        if preprocess is not None:
            preprocess = dask.delayed(preprocess)
    else:
        open_ = open_dataset

    datasets = [open_(p, **open_kwargs) for p in paths]
    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    if parallel:
        # calling compute here will return the datasets
        # the underlying datasets will still be stored as dask arrays
        datasets, = dask.compute(datasets)

    # Combine all datasets, closing them in case of a ValueError
    try:
        if combine == "nested":
            # Combined nested list by successive concat and merge operations
            # along each dimension, using structure given by "ids"
            combined = xr.combine_nested(
                datasets,
                concat_dim=concat_dim,
                compat=compat,
                data_vars=data_vars,
                coords=coords,
                join=join,
            )
        elif combine == "by_coords":
            # Redo ordering from coordinates, ignoring how they were ordered
            # previously
            combined = xr.combine_by_coords(datasets,
                                            compat=compat,
                                            data_vars=data_vars,
                                            coords=coords,
                                            join=join)
        else:
            raise ValueError("{} is an invalid option for the keyword argument"
                             " ``combine``".format(combine))
    except ValueError:
        for ds in datasets:
            ds.close()
        raise

    # read global attributes from the attrs_file or from the first dataset
    if attrs_file is not None:
        if isinstance(attrs_file, Path):
            attrs_file = str(attrs_file)
        combined.attrs = datasets[paths.index(attrs_file)].attrs
    else:
        combined.attrs = datasets[0].attrs

    return combined
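With parallel=True the helper above wraps each open_dataset call in dask.delayed and materializes them all in a single dask.compute, so the per-file opens run concurrently. A minimal usage sketch (the glob and dimension name are illustrative):

ds = open_mfdataset(
    'data/model_output_*.nc',   # hypothetical file pattern
    concat_dim='time',
    combine='nested',
    parallel=True,
)
print(ds)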
Example No. 33
def read(filename,
         band_names=None,
         time_names=None,
         bounds=None,
         chunks=256,
         num_workers=1,
         **kwargs):
    """
    Reads a window slice in-memory

    Args:
        filename (str or list): A file name or list of file names to open and read.
        band_names (Optional[list]): A list of names to give the output band dimension.
        time_names (Optional[list]): A list of names to give the time dimension.
        bounds (Optional[1d array-like]): A bounding box to subset to, given as
            [minx, miny, maxx, maxy] or [left, bottom, right, top].
        chunks (Optional[tuple]): The data chunk size.
        num_workers (Optional[int]): The number of parallel ``dask`` workers.
        kwargs (Optional[dict]): Keyword arguments to pass to ``rasterio.write``.

    Returns:
        ``xarray.DataArray``
    """

    # Cannot pass 'chunks' to rasterio
    if 'chunks' in kwargs:
        del kwargs['chunks']

    if isinstance(filename, str):

        with rio.open(filename) as src:

            src_transform = src.gw.transform if hasattr(
                src, 'gw') else src.transform

            if bounds and ('window' not in kwargs):
                kwargs['window'] = from_bounds(*bounds,
                                               transform=src_transform)

            ycoords, xcoords, attrs = get_attrs(src, **kwargs)

        data = dask.compute(read_delayed(filename, chunks, **kwargs),
                            num_workers=num_workers)[0]

        if not band_names:
            band_names = np.arange(1, data.shape[0] + 1)

        if len(band_names) != data.shape[0]:
            logger.exception(
                '  The band names do not match the output dimensions.')
            raise ValueError

        data = xr.DataArray(data,
                            dims=('band', 'y', 'x'),
                            coords={
                                'band': band_names,
                                'y': ycoords[:data.shape[-2]],
                                'x': xcoords[:data.shape[-1]]
                            },
                            attrs=attrs)

    else:

        with rio.open(filename[0]) as src:

            src_transform = src.gw.transform if hasattr(
                src, 'gw') else src.transform

            if bounds and ('window' not in kwargs):
                kwargs['window'] = from_bounds(*bounds,
                                               transform=src_transform)

            ycoords, xcoords, attrs = get_attrs(src, **kwargs)

        data = da.concatenate(dask.compute(read_list(filename, chunks,
                                                     **kwargs),
                                           num_workers=num_workers),
                              axis=0)

        if not band_names:
            band_names = np.arange(1, data.shape[-3] + 1)

        if len(band_names) != data.shape[-3]:
            logger.exception(
                '  The band names do not match the output dimensions.')
            raise ValueError

        if not time_names:
            time_names = np.arange(1, len(filename) + 1)

        if len(time_names) != data.shape[-4]:
            logger.exception(
                '  The time names do not match the output dimensions.')
            raise ValueError

        data = xr.DataArray(data,
                            dims=('time', 'band', 'y', 'x'),
                            coords={
                                'time': time_names,
                                'band': band_names,
                                'y': ycoords[:data.shape[-2]],
                                'x': xcoords[:data.shape[-1]]
                            },
                            attrs=attrs)

    return data
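A short usage sketch of read() above with hypothetical GeoTIFF paths; a single file yields a (band, y, x) array, while a list of files adds a leading time dimension:

single = read('scene_2020.tif', band_names=['red', 'green', 'blue'],
              chunks=512, num_workers=4)

stack = read(['scene_2020.tif', 'scene_2021.tif'],
             band_names=['red', 'green', 'blue'],
             time_names=[2020, 2021], num_workers=4)
print(stack.dims)   # ('time', 'band', 'y', 'x')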
Example No. 34
    def do_work():
        return dask.compute(
            *[self.load_image_label(),
              self.load_image_annotations()]), None
Example No. 35
    initial_states_compartment : pd.DataFrame with index of draws and columns for S, E, I1, I2, R
    use_mechanistic_testing : bool
    test_rate : tests per person per day
    test_positive_rate : fraction of daily tests that test positive (if there are enough infections to do so)

    Returns
    -------
    returns two dicts of pd.DataFrames with columns for counts for S, E, I1, I2, and R
    as well as new infections, and rows for each day of projection; first dict is for
    agent counts, and second dict is for compartment counts
    """
    from dask import delayed, compute

    assert 0 <= mixing_parameter <= 1, 'mixing_parameter must be in interval [0,1]'

    df_agent_count_dict, df_compartment_count_dict = {}, {}

    for draw in np.random.choice(range(1_000), replace=False, size=n_draws):

        df_tuple = delayed(run_one_hybrid_model)(
            draw, n_simulants, mixing_parameter, params, beta_agent,
            beta_compartment, start_time, end_time, initial_states_agent,
            initial_states_compartment, use_mechanistic_testing, test_rate,
            test_positive_rate)

        # append the counts to their dicts
        df_agent_count_dict[draw] = df_tuple[0]
        df_compartment_count_dict[draw] = df_tuple[1]

    return compute(df_agent_count_dict, df_compartment_count_dict)
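The final compute(df_agent_count_dict, df_compartment_count_dict) works because dask.compute traverses nested containers: each dict value is an element of a delayed tuple, and compute returns the same dicts with concrete DataFrames in place of the delayed objects. A minimal sketch of that traversal in isolation:

import dask
from dask import delayed

@delayed
def pair(x):
    return x, -x

lazy = {i: pair(i)[0] for i in range(3)}   # dict of Delayed values
(concrete,) = dask.compute(lazy)           # the dict is traversed
print(concrete)                            # {0: 0, 1: 1, 2: 2}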
Example No. 36
    def test_compute(self):
        """compute_with_trace() runs the same logic as compute()."""
        bag = from_sequence([1, 2, 3])
        bag = bag.map(lambda x: x * 7).map(lambda x: x * 4)
        bag = bag.fold(lambda x, y: x + y)
        self.assertEqual(dask.compute(bag), compute_with_trace(bag))
Example No. 37
import sys
import time

if __name__ == '__main__':
    # spin up the dask client  (NEEDS to be within __main__)
    c = None
    if len(sys.argv) > 1 and int(sys.argv[1]) > 1:
        print(f"Starting dask Client with {sys.argv[1]} processors")
        c = Client(threads_per_worker=1, n_workers=int(sys.argv[1]))
        print("client started...")

    ds = yt.load_sample("snapshot_033")
    sp = ds.sphere(ds.domain_center, (2, 'code_length'))
    ptf = {'PartType0': ['Mass']}
    mock_sphere = gda.MockSphere(sp)
    delayed_reader = gda.delayed_gadget(ds,
                                        ptf,
                                        mock_selector=mock_sphere,
                                        subchunk_size=None)
    # delayed_reader.delayed_chunks[0].compute()
    # data = compute(*delayed_reader.delayed_chunks)
    data_subset = compute(*delayed_reader.masked_chunks)
    # delayed_reader.set_chunk_masks("snapshot_033")
    # masks = compute(*delayed_reader.masks)

    # print(f"\nCompute time (neglecting Client spinup and yt initialization): {select_time}s")
    #
    if c is not None:
        print("\nshutting down dask client")
        c.shutdown()
Example No. 38
    print('ds_in size: ', ds_in[in_vars].nbytes / 1e9)

    full_template = create_template(ds_in['ppt'], in_vars + model_vars)
    for v in aux_vars:
        full_template[v] = ds_in[v]
    full_template = full_template.chunk(chunks)

    out_mapper = get_out_mapper(os.environ["BLOB_ACCOUNT_KEY"])
    print('clearing existing store')
    out_mapper.clear()

    full_template.to_zarr(out_mapper, mode='w', compute=False)

    regions = get_regions(ds_in)
    reg_tasks = []
    for region in regions:
        reg_tasks.append(block_wrapper(region, os.environ['BLOB_ACCOUNT_KEY']))

    return finish_store(out_mapper, reg_tasks)


if __name__ == '__main__':
    from dask.distributed import Client

    with Client(threads_per_worker=1, memory_limit='4 G') as client:
        print(client)
        print(client.dashboard_link)

        task = main()
        dask.compute(task, retries=10)
Example No. 39

d = []

start = 0
incr = 1000
stop = len(files)
ranges = list(range(start, stop, incr))
for i in tqdm_notebook(ranges):
    print(f'Processing {i}')
    d = []
    for file in files[i:i + incr]:
        print(file)
        d.append(process_float(file))

    results = dask.compute(*d)

    t = xr.concat(results, dim='N_PROF', coords='minimal')
    t = t.chunk({'N_PROF': 10000, 'N_LEVELS': 3000})
    print(f'Finished concatenating dataset')

    numcodecs.blosc.use_threads = False
    synchronizer = zarr.ProcessSynchronizer('../../argozarr/argodask2.sync')
    #compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
    zarr_path = '../../argozarr/argo_dask2.zarr'
    #encoding = {vname: {'compressor': compressor} for vname in t.variables}
    d = t.to_zarr(zarr_path,
                  mode='a',
                  synchronizer=synchronizer,
                  compute=True,
                  append_dim='N_PROF')
Example No. 40
def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                   compat='no_conflicts', preprocess=None, engine=None,
                   lock=None, data_vars='all', coords='different',
                   autoclose=False, parallel=False, **kwargs):
    """Open multiple files as a single dataset.

    Requires dask to be installed. See documentation for details on dask [1].
    Attributes from the first dataset file are used for the combined dataset.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an explicit
        list of files to open.  Paths can be given as strings or as pathlib
        Paths.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by chunk
        sizes. In general, these should divide the dimensions of each dataset.
        If int, chunk each dimension by ``chunks``.
        By default, chunks will be chosen to load entire input files into
        memory at once. This has a major impact on performance: please see the
        full documentation for more details [2].
    concat_dim : None, str, DataArray or Index, optional
        Dimension to concatenate files along. This argument is passed on to
        :py:func:`xarray.auto_combine` along with the dataset objects. You only
        need to provide this argument if the dimension along which you want to
        concatenate is not a dimension in the original datasets, e.g., if you
        want to stack a collection of 2D arrays along a third dimension.
        By default, xarray attempts to infer this argument by examining
        component files. Set ``concat_dim=None`` explicitly to disable
        concatenation.
    compat : {'identical', 'equals', 'broadcast_equals',
              'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to concatenation.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
        Engine to use when reading files. If not provided, the default engine
        is chosen based on available dependencies, with a preference for
        'netcdf4'.
    autoclose : bool, optional
        If True, automatically close files to avoid OS Error of too many files
        being open.  However, this option doesn't work with streams, e.g.,
        BytesIO.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.
    data_vars : {'minimal', 'different', 'all' or list of str}, optional
        These data variables will be concatenated together:
          * 'minimal': Only data variables in which the dimension already
            appears are included.
          * 'different': Data variables which are not equal (ignoring
            attributes) across all datasets are also concatenated (as well as
            all for which dimension already appears). Beware: this option may
            load the data payload of data variables into memory if they are not
            already loaded.
          * 'all': All data variables will be concatenated.
          * list of str: The listed data variables will be concatenated, in
            addition to the 'minimal' data variables.
    coords : {'minimal', 'different', 'all' or list of str}, optional
        These coordinate variables will be concatenated together:
          * 'minimal': Only coordinates in which the dimension already appears
            are included.
          * 'different': Coordinates which are not equal (ignoring attributes)
            across all datasets are also concatenated (as well as all for which
            dimension already appears). Beware: this option may load the data
            payload of coordinate variables into memory if they are not already
            loaded.
          * 'all': All coordinate variables will be concatenated, except
            those corresponding to other dimensions.
          * list of str: The listed coordinate variables will be concatenated,
            in addition to the 'minimal' coordinates.
    parallel : bool, optional
        If True, the open and preprocess steps of this function will be
        performed in parallel using ``dask.delayed``. Default is False.
    **kwargs : optional
        Additional arguments passed on to :py:func:`xarray.open_dataset`.

    Returns
    -------
    xarray.Dataset

    See Also
    --------
    auto_combine
    open_dataset

    References
    ----------
    .. [1] http://xarray.pydata.org/en/stable/dask.html
    .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance
    """
    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    else:
        paths = [str(p) if isinstance(p, path_type) else p for p in paths]

    if not paths:
        raise IOError('no files to open')

    if lock is None:
        lock = _default_lock(paths[0], engine)

    open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock,
                       autoclose=autoclose, **kwargs)

    if parallel:
        import dask
        # wrap the open_dataset, getattr, and preprocess with delayed
        open_ = dask.delayed(open_dataset)
        getattr_ = dask.delayed(getattr)
        if preprocess is not None:
            preprocess = dask.delayed(preprocess)
    else:
        open_ = open_dataset
        getattr_ = getattr

    datasets = [open_(p, **open_kwargs) for p in paths]
    file_objs = [getattr_(ds, '_file_obj') for ds in datasets]
    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    if parallel:
        # calling compute here will return the datasets/file_objs lists,
        # the underlying datasets will still be stored as dask arrays
        datasets, file_objs = dask.compute(datasets, file_objs)

    # close datasets in case of a ValueError
    try:
        if concat_dim is _CONCAT_DIM_DEFAULT:
            combined = auto_combine(datasets, compat=compat,
                                    data_vars=data_vars, coords=coords)
        else:
            combined = auto_combine(datasets, concat_dim=concat_dim,
                                    compat=compat,
                                    data_vars=data_vars, coords=coords)
    except ValueError:
        for ds in datasets:
            ds.close()
        raise

    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs
    return combined
Example No. 41
def get_pandas_parallel():
    files = get_files(get_data_dir())
    dfs = dask.compute(
        read_parquet_delayed(file_name) for file_name in files)
    df = pd.concat(dfs[0])
    return df
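dask.compute is handed a generator expression here, which is why the result is then indexed with dfs[0]. A more conventional spelling builds an explicit list of delayed reads and unpacks it, making the return shape one DataFrame per file (a sketch, assuming the same get_files, get_data_dir, and read_parquet_delayed helpers as above):

import dask
import pandas as pd

def get_pandas_parallel_unpacked():
    files = get_files(get_data_dir())
    delayed_frames = [read_parquet_delayed(f) for f in files]
    frames = dask.compute(*delayed_frames)   # one pandas DataFrame per file
    return pd.concat(frames)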
Example No. 42
def init_scalable(X,
                  n_clusters,
                  random_state=None,
                  max_iter=None,
                  oversampling_factor=2):
    """K-Means initialization using k-means||

    This is algorithm 2 in Scalable K-Means++ (2012).
    """

    logger.info("Initializing with k-means||")
    # Step 1: Initialize Centers
    idx = 0
    centers = da.compute(X[idx, np.newaxis])[0]
    c_idx = {idx}

    # Step 2: Initialize cost
    (cost, ) = compute(evaluate_cost(X, centers))

    if cost == 0:
        n_iter = 0
    else:
        n_iter = int(np.round(np.log(cost)))

    if max_iter is not None:
        n_iter = min(max_iter, n_iter)

    # Steps 3 - 6: update candidate Centers
    for i in range(n_iter):
        with _timer(
                "init iteration %2d/%2d , %2d centers" %
            (i + 1, n_iter, len(c_idx)),
                _logger=logger,
        ):
            new_idxs = _sample_points(X, centers, oversampling_factor,
                                      random_state)
            new_idxs = set(*compute(new_idxs))
            c_idx |= new_idxs

        # Sort before slicing, for better performance / memory
        # usage with the scheduler.
        # See https://github.com/dask/dask-ml/issues/39
        centers = X[sorted(c_idx)].compute()

    # XXX: scikit-learn doesn't have weighted k-means.
    # The paper weights each center by the number of points closest to it.
    # https://stackoverflow.com/a/37198799/1889400 claims you can scale the
    # features before clustering, but that doesn't seem right.
    # I think that replicating the *points*, proportional to the number of
    # original points closest to the candidate centers, would be a better way
    # to do that.

    if len(centers) < n_clusters:
        logger.warning("Found fewer than %d clusters in init.", n_clusters)
        # supplement with random
        need = n_clusters - len(centers)
        locs = sorted(
            random_state.choice(np.arange(0, len(X)),
                                size=need,
                                replace=False,
                                chunks=len(X)))
        extra = X[locs].compute()
        return np.vstack([centers, extra])
    else:
        # Step 7, 8 without weights
        # dask RandomState objects aren't valid for scikit-learn
        rng2 = (random_state.randint(
            0, np.iinfo("i4").max - 1,
            chunks=()).compute(scheduler="single-threaded").item())
        km = sklearn.cluster.KMeans(n_clusters, random_state=rng2)
        km.fit(centers)

    return km.cluster_centers_
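
# A hypothetical usage sketch for the k-means|| initializer above (assuming it
# is importable, e.g. from dask_ml.cluster.k_means): random_state has to be a
# dask.array RandomState, since the routine calls .choice(..., chunks=...) and
# .randint(..., chunks=()) on it.
import dask.array as da

X = da.random.random((10_000, 4), chunks=(1_000, 4))
centers = init_scalable(X, n_clusters=8,
                        random_state=da.random.RandomState(0), max_iter=2)
print(centers.shape)  # (8, 4)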
Exemplo n.º 43
0
def compute_overview(df: dd.DataFrame, cfg: Config, dtype: Optional[DTypeDef]) -> Intermediate:
    """
    Compute functions for plot(df)

    Parameters
    ----------
    df
        DataFrame from which visualizations are generated
    cfg
        Config instance
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous"
    """
    # pylint: disable=too-many-branches

    if cfg.bar.enable or cfg.insight.enable:
        # extract the first rows to check if a column contains a mutable type
        head: pd.DataFrame = df.head()  # head triggers a (small) data read

    data: List[Tuple[str, DType, Any]] = []
    for col in df.columns:
        col_dtype = detect_dtype(df[col], dtype)
        if is_dtype(col_dtype, Continuous()) and (cfg.hist.enable or cfg.insight.enable):
            data.append((col, Continuous(), _cont_calcs(df[col].dropna(), cfg)))
        elif is_dtype(col_dtype, Nominal()) and (cfg.bar.enable or cfg.insight.enable):
            # An object column with some numerical cells raises an error when
            # processed, so cast the column to string first.
            df[col] = df[col].astype(str)
            data.append((col, Nominal(), _nom_calcs(df[col].dropna(), head[col], cfg)))
        elif is_dtype(col_dtype, DateTime()) and (cfg.line.enable or cfg.insight.enable):
            data.append((col, DateTime(), dask.delayed(_calc_line_dt)(df[[col]], cfg.line.unit)))

    ov_stats = calc_stats(df, cfg, dtype)  # overview statistics
    data, ov_stats = dask.compute(data, ov_stats)

    # extract the plotting data, and detect and format the insights
    plot_data: List[Tuple[str, DType, Any]] = []
    col_insights: Dict[str, List[str]] = {}
    all_ins = _format_ov_ins(ov_stats, cfg) if cfg.insight.enable else []

    for col, dtp, dat in data:
        if is_dtype(dtp, Continuous()):
            if cfg.insight.enable:
                col_ins, ins = _format_cont_ins(col, dat, ov_stats["nrows"], cfg)
            if cfg.hist.enable:
                plot_data.append((col, dtp, dat["hist"]))
        elif is_dtype(dtp, Nominal()):
            if cfg.insight.enable:
                col_ins, ins = _format_nom_ins(col, dat, ov_stats["nrows"], cfg)
            if cfg.bar.enable:
                plot_data.append((col, dtp, (dat["bar"].to_frame(), dat["nuniq"])))
        elif is_dtype(dtp, DateTime()):
            plot_data.append((col, dtp, dat))
            continue

        if cfg.insight.enable:
            if col_ins:
                col_insights[col] = col_ins
            all_ins += ins

    return Intermediate(
        data=plot_data,
        stats=ov_stats,
        column_insights=col_insights,
        overview_insights=_insight_pagination(all_ins),
        visual_type="distribution_grid",
    )
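
# A minimal sketch of the pattern used above, where dask.compute(data, ov_stats)
# materializes everything in one pass: dask.compute traverses nested Python
# containers, so a list mixing plain values and dask.delayed objects comes back
# with the delayed entries replaced by their results.
import dask

@dask.delayed
def slow_square(x):
    return x * x

mixed = [("a", 3), ("b", slow_square(4))]
(computed,) = dask.compute(mixed)
print(computed)  # [('a', 3), ('b', 16)]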
Exemplo n.º 44
0
def write(self):
    writes = xds_to_table(self.datasets, self.table_name, columns="ALL")
    dask.compute(writes)
Exemplo n.º 45
0
def ms_create(ms_table_name, info, ant_pos, vis_array, baselines, timestamps, pol_feeds, sources):
    ''' Create a Measurement Set from some TART observations
    
    Parameters
    ----------
    
    ms_table_name : string
        The name of the MS top level directory. I think this only works in
        the local directory.
    
    info : JSON
        "info": {
            "info": {
                "L0_frequency": 1571328000.0,
                "bandwidth": 2500000.0,
                "baseband_frequency": 4092000.0,
                "location": {
                    "alt": 270.0,
                    "lat": -45.85177,
                    "lon": 170.5456
                },
                "name": "Signal Hill - Dunedin",
                "num_antenna": 24,
                "operating_frequency": 1575420000.0,
                "sampling_frequency": 16368000.0
            }
        },

    Returns
    -------
    None
      
    '''

    epoch_s = timestamp_to_ms_epoch(timestamps)
    LOGGER.info("Time {}".format(epoch_s))

    try:
        loc = info['location']
    except:
        loc = info
    # Sort out the coordinate frames using astropy
    # https://casa.nrao.edu/casadocs/casa-5.4.1/reference-material/coordinate-frames
    iers.conf.iers_auto_url = 'https://astroconda.org/aux/astropy_mirror/iers_a_1/finals2000A.all' 
    iers.conf.auto_max_age = None 

    location = EarthLocation.from_geodetic(lon=loc['lon']*u.deg,
                                           lat=loc['lat']*u.deg,
                                           height=loc['alt']*u.m,
                                           ellipsoid='WGS84')
    obstime = Time(timestamps)
    local_frame = AltAz(obstime=obstime, location=location)

    phase_altaz = SkyCoord(alt=90.0*u.deg, az=0.0*u.deg, obstime = obstime, frame = 'altaz', location = location)
    phase_j2000 = phase_altaz.transform_to('fk5')

    # Get the stokes enums for the polarization types
    corr_types = [[MS_STOKES_ENUMS[p_f] for p_f in pol_feeds]]

    LOGGER.info("Pol Feeds {}".format(pol_feeds))
    LOGGER.info("Correlation Types {}".format(corr_types))
    num_freq_channels = [1]

    ant_table = MSTable(ms_table_name, 'ANTENNA')
    feed_table = MSTable(ms_table_name, 'FEED')
    field_table = MSTable(ms_table_name, 'FIELD')
    pol_table = MSTable(ms_table_name, 'POLARIZATION')
    obs_table = MSTable(ms_table_name, 'OBSERVATION')
    # SOURCE is an optional MS sub-table
    src_table = MSTable(ms_table_name, 'SOURCE')
    
    ddid_table_name = "::".join((ms_table_name, "DATA_DESCRIPTION"))
    spw_table_name = "::".join((ms_table_name, "SPECTRAL_WINDOW"))

    ms_datasets = []
    ddid_datasets = []
    spw_datasets = []

    # Create ANTENNA dataset
    # Each column in the ANTENNA has a fixed shape so we
    # can represent all rows with one dataset
    num_ant = len(ant_pos)
    position = da.asarray(ant_pos)
    diameter = da.ones(num_ant) * 0.025
    offset = da.zeros((num_ant, 3))
    names = np.array(['ANTENNA-%d' % i for i in range(num_ant)], dtype=object)
    stations = np.array([info['name'] for i in range(num_ant)], dtype=object)

    dataset = Dataset({
        'POSITION': (("row", "xyz"), position),
        'OFFSET': (("row", "xyz"), offset),
        'DISH_DIAMETER': (("row",), diameter),
        'NAME': (("row",), da.from_array(names, chunks=num_ant)),
        'STATION': (("row",), da.from_array(stations, chunks=num_ant)),
    })
    ant_table.append(dataset)

    ###################  Create a FEED dataset. ###################################
    # There is one feed per antenna, so this should be quite similar to the ANTENNA
    num_pols = len(pol_feeds)
    pol_types = pol_feeds
    pol_responses = [POL_RESPONSES[ct] for ct in pol_feeds]

    LOGGER.info("Pol Types {}".format(pol_types))
    LOGGER.info("Pol Responses {}".format(pol_responses))

    antenna_ids = da.asarray(range(num_ant))
    feed_ids = da.zeros(num_ant)
    num_receptors = da.zeros(num_ant) + num_pols
    polarization_types = np.array([pol_types for i in range(num_ant)], dtype=object)
    receptor_angles = np.array([[0.0] for i in range(num_ant)])
    pol_response = np.array([pol_responses for i in range(num_ant)])

    beam_offset = np.array([[[0.0, 0.0]] for i in range(num_ant)])

    dataset = Dataset({
        'ANTENNA_ID': (("row",), antenna_ids),
        'FEED_ID': (("row",), feed_ids),
        'NUM_RECEPTORS': (("row",), num_receptors),
        'POLARIZATION_TYPE': (("row", "receptors",),
                              da.from_array(polarization_types, chunks=num_ant)),
        'RECEPTOR_ANGLE': (("row", "receptors",),
                           da.from_array(receptor_angles, chunks=num_ant)),
        'POL_RESPONSE': (("row", "receptors", "receptors-2"),
                         da.from_array(pol_response, chunks=num_ant)),
        'BEAM_OFFSET': (("row", "receptors", "radec"),
                        da.from_array(beam_offset, chunks=num_ant)),
    })
    feed_table.append(dataset)


    ####################### FIELD dataset #########################################
    
    direction = [[phase_j2000.ra.radian, phase_j2000.dec.radian]]
    field_direction = da.asarray(direction)[None, :]
    field_name = da.asarray(np.asarray(['up'], dtype=object), chunks=1)
    field_num_poly = da.zeros(1) # Zero order polynomial in time for phase center.

    dir_dims = ("row", 'field-poly', 'field-dir',)

    dataset = Dataset({
        'PHASE_DIR': (dir_dims, field_direction),
        'DELAY_DIR': (dir_dims, field_direction),
        'REFERENCE_DIR': (dir_dims, field_direction),
        'NUM_POLY': (("row", ), field_num_poly),
        'NAME': (("row", ), field_name),
    })
    field_table.append(dataset)

   ######################### OBSERVATION dataset #####################################

    dataset = Dataset({
        'TELESCOPE_NAME': (("row",), da.asarray(np.asarray(['TART'], dtype=object), chunks=1)),
        'OBSERVER': (("row",), da.asarray(np.asarray(['Tim'], dtype=object), chunks=1)),
        "TIME_RANGE": (("row","obs-exts"), da.asarray(np.array([[epoch_s, epoch_s+1]]), chunks=1)),
    })
    obs_table.append(dataset)

    ######################## SOURCE datasets ########################################
    for src in sources:
        name = src['name']
        # Convert to J2000 
        dir_altaz = SkyCoord(alt=src['el']*u.deg, az=src['az']*u.deg, obstime = obstime,
                             frame = 'altaz', location = location)
        dir_j2000 = dir_altaz.transform_to('fk5')
        direction = [dir_j2000.ra.radian, dir_j2000.dec.radian]
        #LOGGER.info("SOURCE: {}, timestamp: {}".format(name, timestamps))
        dask_num_lines = da.full((1,), 1, dtype=np.int32)
        dask_direction = da.asarray(direction)[None, :]
        dask_name = da.asarray(np.asarray([name], dtype=object), chunks=1)
        dask_time = da.asarray(np.array([epoch_s]))
        dataset = Dataset({
            "NUM_LINES": (("row",), dask_num_lines),
            "NAME": (("row",), dask_name),
            "TIME": (("row",), dask_time),
            "DIRECTION": (("row", "dir"), dask_direction),
            })
        src_table.append(dataset)

    # Create POLARISATION datasets.
    # Dataset per output row required because column shapes are variable

    for corr_type in corr_types:
        corr_prod = [[i, i] for i in range(len(corr_type))]

        corr_prod = np.array(corr_prod)
        LOGGER.info("Corr Prod {}".format(corr_prod))
        LOGGER.info("Corr Type {}".format(corr_type))

        dask_num_corr = da.full((1,), len(corr_type), dtype=np.int32)
        LOGGER.info("NUM_CORR {}".format(dask_num_corr))
        dask_corr_type = da.from_array(corr_type,
                                       chunks=len(corr_type))[None, :]
        dask_corr_product = da.asarray(corr_prod)[None, :]
        LOGGER.info("Dask Corr Prod {}".format(dask_corr_product.shape))
        LOGGER.info("Dask Corr Type {}".format(dask_corr_type.shape))
        dataset = Dataset({
            "NUM_CORR": (("row",), dask_num_corr),
            "CORR_TYPE": (("row", "corr"), dask_corr_type),
            "CORR_PRODUCT": (("row", "corr", "corrprod_idx"), dask_corr_product),
        })

        pol_table.append(dataset)

    # Create multiple SPECTRAL_WINDOW datasets
    # Dataset per output row required because column shapes are variable

    for num_chan in num_freq_channels:
        dask_num_chan = da.full((1,), num_chan, dtype=np.int32)
        dask_chan_freq = da.asarray([[info['operating_frequency']]])
        dask_chan_width = da.full((1, num_chan), 2.5e6/num_chan)

        dataset = Dataset({
            "NUM_CHAN": (("row",), dask_num_chan),
            "CHAN_FREQ": (("row", "chan"), dask_chan_freq),
            "CHAN_WIDTH": (("row", "chan"), dask_chan_width),
            "EFFECTIVE_BW": (("row", "chan"), dask_chan_width),
            "RESOLUTION": (("row", "chan"), dask_chan_width),
        })

        spw_datasets.append(dataset)

    # For each cartesian product of SPECTRAL_WINDOW and POLARIZATION
    # create a corresponding DATA_DESCRIPTION.
    # Each column has fixed shape so we handle all rows at once
    spw_ids, pol_ids = zip(*product(range(len(num_freq_channels)),
                                    range(len(corr_types))))
    dask_spw_ids = da.asarray(np.asarray(spw_ids, dtype=np.int32))
    dask_pol_ids = da.asarray(np.asarray(pol_ids, dtype=np.int32))
    ddid_datasets.append(Dataset({
        "SPECTRAL_WINDOW_ID": (("row",), dask_spw_ids),
        "POLARIZATION_ID": (("row",), dask_pol_ids),
    }))

    # Now create the associated MS dataset

    #vis_data, baselines = cal_vis.get_all_visibility()
    #vis_array = np.array(vis_data, dtype=np.complex64)
    chunks = {
        "row": (vis_array.shape[0],),
    }
    baselines = np.array(baselines)
    #LOGGER.info(f"baselines {baselines}")
    bl_pos = np.array(ant_pos)[baselines]
    uu_a, vv_a, ww_a = -(bl_pos[:, 1] - bl_pos[:, 0]).T #/constants.L1_WAVELENGTH
    # Use the - sign to get the same orientation as our tart projections.

    uvw_array = np.array([uu_a, vv_a, ww_a]).T

    for ddid, (spw_id, pol_id) in enumerate(zip(spw_ids, pol_ids)):
        # Infer row, chan and correlation shape
        #LOGGER.info("ddid:{} ({}, {})".format(ddid, spw_id, pol_id))
        row = sum(chunks['row'])
        chan = spw_datasets[spw_id].CHAN_FREQ.shape[1]
        corr = pol_table.datasets[pol_id].CORR_TYPE.shape[1]

        # Create some dask vis data
        dims = ("row", "chan", "corr")
        LOGGER.info("Data size %s %s %s" % (row, chan, corr))

        #np_data = vis_array.reshape((row, chan, corr))
        np_data = np.zeros((row, chan, corr), dtype=np.complex128)
        for i in range(corr):
            np_data[:, :, i] = vis_array.reshape((row, chan))
        #np_data = np.array([vis_array.reshape((row, chan, 1)) for i in range(corr)])
        np_uvw = uvw_array.reshape((row, 3))

        data_chunks = tuple((chunks['row'], chan, corr))
        dask_data = da.from_array(np_data, chunks=data_chunks)
        
        flag_categories = da.from_array(0.05*np.ones((row, chan, corr, 1)))
        flag_data = np.zeros((row, chan, corr), dtype=np.bool_)

        uvw_data = da.from_array(np_uvw)
        # Create dask ddid column
        dask_ddid = da.full(row, ddid, chunks=chunks['row'], dtype=np.int32)
        dataset = Dataset({
            'DATA': (dims, dask_data),
            'FLAG': (dims, da.from_array(flag_data)),
            'TIME': (("row", "corr"), da.from_array(epoch_s*np.ones((row, corr)))),
            'TIME_CENTROID': (("row", "corr"), da.from_array(epoch_s*np.ones((row, corr)))),
            'WEIGHT': (("row", "corr"), da.from_array(0.95*np.ones((row, corr)))),
            'WEIGHT_SPECTRUM': (dims, da.from_array(0.95*np.ones_like(np_data, dtype=np.float64))),
            'SIGMA_SPECTRUM': (dims, da.from_array(np.ones_like(np_data, dtype=np.float64)*0.05)),
            'SIGMA': (("row", "corr"), da.from_array(0.05*np.ones((row, corr)))),
            'UVW': (("row", "uvw",), uvw_data),
            'FLAG_CATEGORY': (('row', 'flagcat', 'chan', 'corr'), flag_categories), # {'dims': ('flagcat', 'chan', 'corr')}
            'ANTENNA1': (("row",), da.from_array(baselines[:, 0])),
            'ANTENNA2': (("row",), da.from_array(baselines[:, 1])),
            'FEED1': (("row",), da.from_array(baselines[:, 0])),
            'FEED2': (("row",), da.from_array(baselines[:, 1])),
            'DATA_DESC_ID': (("row",), dask_ddid)
        })
        ms_datasets.append(dataset)

    ms_writes = xds_to_table(ms_datasets, ms_table_name, columns="ALL")
    spw_writes = xds_to_table(spw_datasets, spw_table_name, columns="ALL")
    ddid_writes = xds_to_table(ddid_datasets, ddid_table_name, columns="ALL")

    dask.compute(ms_writes)

    ant_table.write()
    feed_table.write()
    field_table.write()
    pol_table.write()
    obs_table.write()
    src_table.write()

    dask.compute(spw_writes)
    dask.compute(ddid_writes)
Exemplo n.º 46
0
def calc_cog_sog(obj):
    """
    This function calculates the course and speed over ground of a moving
    platform using the lat/lon. Note, data are resampled to 1 minute in
    order to provide a better estimate of speed/course compared with 1 second.

    Function is set up to use dask for the calculations in order to improve
    efficiency. Data are then resampled to 1 second to match native format.
    This assumes that the input data are 1 second. See this `example
    <https://ARM-DOE.github.io/ACT/source/auto_examples/correct_ship_wind_data.html
    #sphx-glr-source-auto-examples-correct-ship-wind-data-py>`_.

    Parameters
    ----------
    obj : ACT Dataset
        ACT Dataset to calculate COG/SOG from.  Assumes lat/lon are variables and
        that it's 1-second data.

    Returns
    -------
    obj : ACT Dataset
        Returns object with course_over_ground and speed_over_ground variables.

    """
    # Convert data to 1 minute in order to get proper values
    new_obj = obj.resample(time='1min').nearest()

    # Get lat and lon data
    if 'lat' in new_obj:
        lat = new_obj['lat']
    elif 'latitude' in new_obj:
        lat = new_obj['latitude']
    else:
        return new_obj

    if 'lon' in new_obj:
        lon = new_obj['lon']
    elif 'longitude' in new_obj:
        lon = new_obj['longitude']
    else:
        return new_obj

    # Set pyproj Geod
    _GEOD = pyproj.Geod(ellps='WGS84')

    # Set up delayed tasks for dask
    task = []
    time = new_obj['time'].values
    for i in range(len(lat) - 1):
        task.append(
            dask.delayed(proc_scog)(_GEOD, lon[i + 1], lat[i + 1], lon[i],
                                    lat[i], time[i], time[i + 1]))

    # Compute and process results, adding 2 values
    # to the end to make up for the missing times
    results = dask.compute(*task)
    sog = [r[0] for r in results]
    sog.append(sog[-1])
    sog.append(sog[-1])
    cog = [r[1] for r in results]
    cog.append(cog[-1])
    cog.append(cog[-1])
    time = np.append(time, time[-1] + np.timedelta64(1, 'm'))

    atts = {'long_name': 'Speed over ground', 'units': 'm/s'}
    sog_da = xr.DataArray(sog,
                          coords={'time': time},
                          dims=['time'],
                          attrs=atts)
    sog_da = sog_da.resample(time='1s').nearest()

    atts = {'long_name': 'Course over ground', 'units': 'deg'}
    cog_da = xr.DataArray(cog,
                          coords={'time': time},
                          dims=['time'],
                          attrs=atts)
    cog_da = cog_da.resample(time='1s').nearest()

    obj['course_over_ground'] = cog_da
    obj['speed_over_ground'] = sog_da

    return obj
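
# A minimal sketch of the delayed-loop pattern used above, with proc_scog and
# the lat/lon inputs replaced by a toy pairwise function: build a list of
# dask.delayed tasks, then run them all with a single dask.compute(*tasks).
import dask

def pair_diff(a, b):
    return b - a

values = [0, 3, 7, 12]
tasks = [dask.delayed(pair_diff)(values[i], values[i + 1])
         for i in range(len(values) - 1)]
print(dask.compute(*tasks))  # (3, 4, 5)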
Exemplo n.º 47
0
def test_custom_collection():
    # Arbitrary hashables
    h1 = object()
    h2 = object()

    dsk = {("x", h1): 1, ("x", h2): 2}
    dsk2 = {
        ("y", h1): (add, ("x", h1), ("x", h2)),
        ("y", h2): (add, ("y", h1), 1)
    }
    dsk2.update(dsk)
    dsk3 = {"z": (add, ("y", h1), ("y", h2))}
    dsk3.update(dsk2)

    w = Tuple({}, [])  # A collection can have no keys at all
    x = Tuple(dsk, [("x", h1), ("x", h2)])
    y = Tuple(dsk2, [("y", h1), ("y", h2)])
    z = Tuple(dsk3, ["z"])
    # Collection with multiple names
    t = w + x + y + z

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(w)
    assert is_dask_collection(x)
    assert is_dask_collection(y)
    assert is_dask_collection(z)
    assert is_dask_collection(t)

    # tokenize
    assert tokenize(w) == tokenize(w)
    assert tokenize(x) == tokenize(x)
    assert tokenize(y) == tokenize(y)
    assert tokenize(z) == tokenize(z)
    assert tokenize(t) == tokenize(t)
    # All tokens are unique
    assert len({tokenize(coll) for coll in (w, x, y, z, t)}) == 5

    # get_collection_names
    assert get_collection_names(w) == set()
    assert get_collection_names(x) == {"x"}
    assert get_collection_names(y) == {"y"}
    assert get_collection_names(z) == {"z"}
    assert get_collection_names(t) == {"x", "y", "z"}

    # compute
    assert w.compute() == ()
    assert x.compute() == (1, 2)
    assert y.compute() == (3, 4)
    assert z.compute() == (7, )
    assert dask.compute(w, [{"x": x}, y, z]) == ((), [{"x": (1, 2)}, (3, 4), (7,)])
    assert t.compute() == (1, 2, 3, 4, 7)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._keys == t._keys
    assert sorted(t2._dask.values()) == [1, 2, 3, 4, 7]
    assert t2.compute() == (1, 2, 3, 4, 7)

    w2, x2, y2, z2 = dask.persist(w, x, y, z)
    assert y2._keys == y._keys
    assert y2._dask == {("y", h1): 3, ("y", h2): 4}
    assert y2.compute() == (3, 4)

    t3 = x2 + y2 + z2
    assert t3.compute() == (1, 2, 3, 4, 7)

    # __dask_postpersist__ with name change
    rebuild, args = w.__dask_postpersist__()
    w3 = rebuild({}, *args, rename={"w": "w3"})
    assert w3.compute() == ()

    rebuild, args = x.__dask_postpersist__()
    x3 = rebuild({("x3", h1): 10, ("x3", h2): 20}, *args, rename={"x": "x3"})
    assert x3.compute() == (10, 20)

    rebuild, args = z.__dask_postpersist__()
    z3 = rebuild({"z3": 70}, *args, rename={"z": "z3"})
    assert z3.compute() == (70, )
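
# A minimal sketch of what the Tuple collection exercised above might look like,
# modelled on Dask's custom-collections protocol (this is an assumption for
# illustration, not the test suite's actual definition): a collection needs
# __dask_graph__, __dask_keys__, a default scheduler, and the post-compute /
# post-persist hooks used by dask.compute and dask.persist.
import dask
from dask.base import DaskMethodsMixin, replace_name_in_key
from dask.threaded import get as threaded_get


class Tuple(DaskMethodsMixin):
    __slots__ = ("_dask", "_keys")

    def __init__(self, dsk, keys):
        self._dask = dsk
        self._keys = keys

    def __dask_graph__(self):
        return self._dask

    def __dask_keys__(self):
        return self._keys

    __dask_scheduler__ = staticmethod(threaded_get)

    def __dask_postcompute__(self):
        # The computed values for self._keys are packed into a plain tuple
        return tuple, ()

    def __dask_postpersist__(self):
        return Tuple._rebuild, (self._keys,)

    @staticmethod
    def _rebuild(dsk, keys, *, rename=None):
        if rename:
            keys = [replace_name_in_key(k, rename) for k in keys]
        return Tuple(dsk, keys)

    def __dask_tokenize__(self):
        return self._keys

    def __add__(self, other):
        if isinstance(other, Tuple):
            return Tuple({**self._dask, **other._dask}, self._keys + other._keys)
        return NotImplemented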
Exemplo n.º 48
0
def do_work():
    return dask.compute(
        *[self.load_images(),
          self.load_models(),
          self.load_labels()]), None
Exemplo n.º 49
0
def parLapply(CORE_NUM, iterable, func, *args, **kwargs):
    with dask.config.set(scheduler='processes', num_workers=CORE_NUM):
        f_par = functools.partial(func, *args, **kwargs)
        result = compute([delayed(f_par)(item) for item in iterable])[0]
        return result
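
# A hypothetical usage sketch for the helper above: the extra positional
# arguments are bound first via functools.partial, then the partial is applied
# to every item of the iterable across four worker processes.
import operator

result = parLapply(4, range(5), operator.add, 100)
print(result)  # [100, 101, 102, 103, 104]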
Exemplo n.º 50
0
def check_parts(df, sol):
    assert all((p.dtypes == sol.dtypes).all()
               for p in dask.compute(*df.to_delayed()))
Exemplo n.º 51
0
    def generate_product(
        self,
        dc,
        path_prefix,
        aoi,
        output_projection,
        start_date,
        end_date,
        platform,
        res,
        aoi_crs,
        **kwargs,
    ):

        ## Create datacube query

        dask_chunks = dict(time=10, x=1000, y=1000)

        query = create_base_query(aoi, res, output_projection, aoi_crs,
                                  dask_chunks)

        all_measurements = ["green", "red", "blue", "nir", "swir1", "swir2"]
        product, measurement, water_product = create_product_measurement(
            platform, all_measurements)

        time = (start_date, end_date)

        ## Create dask graph

        ds = dc.load(
            time=time,
            platform=platform,
            product=product,
            measurements=measurement,
            **query,
        )

        if is_dataset_empty(ds):
            raise Exception(
                "DataCube Load returned an empty Dataset." +
                "Please check load parameters for Baseline Dataset!")

        water_scenes = dc.load(
            product=water_product,
            measurements=["water_classification"],
            time=time,
            **query,
        )

        # Set land to no_data
        water_dataset = water_scenes.where(water_scenes > 0)

        good_quality = mask_good_quality(ds, product)
        ds_clear = ds.where(good_quality)
        ds_clear_land = ds_clear.where(water_dataset.water_classification > 0)
        tsm_dataset = xr.map_blocks(tsm, ds_clear_land)

        mean_tsm = tsm_dataset.mean(dim=["time"])
        max_tsm = tsm_dataset.max(dim=["time"])
        min_tsm = tsm_dataset.min(dim=["time"])

        ## Compute

        mean_tsm, max_tsm, min_tsm = dask.compute(mean_tsm, max_tsm, min_tsm)

        ## Write files

        result = []

        file_name = path.join(path_prefix, "mean_tsm.tiff")
        import_export.export_xarray_to_geotiff(
            mean_tsm,
            file_name,
            crs=output_projection,
            x_coord="x",
            y_coord="y",
        )
        result.append(file_name)

        file_name = path.join(path_prefix, "min_tsm.tiff")
        import_export.export_xarray_to_geotiff(
            min_tsm,
            file_name,
            crs=output_projection,
            x_coord="x",
            y_coord="y",
        )
        result.append(file_name)

        file_name = path.join(path_prefix, "max_tsm.tiff")
        import_export.export_xarray_to_geotiff(
            max_tsm,
            file_name,
            crs=output_projection,
            x_coord="x",
            y_coord="y",
        )
        result.append(file_name)

        return result
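
# A minimal sketch of the shared-graph pattern used above: requesting several
# aggregations in one dask.compute call lets Dask reuse the common upstream
# tasks (here the random blocks; in the product above, the loaded and masked
# scenes) instead of recomputing them once per statistic.
import dask
import dask.array as da

x = da.random.random((1_000, 1_000), chunks=(250, 250))
mean_x, max_x, min_x = dask.compute(x.mean(), x.max(), x.min())
print(mean_x, max_x, min_x)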
Exemplo n.º 52
0
def compute(self, **kwargs) -> Any:
    return dask.compute(self, **kwargs)
Exemplo n.º 53
0
def _kmeans_single_lloyd(
    X,
    n_clusters,
    max_iter=300,
    init="k-means||",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
    oversampling_factor=2,
    init_max_iter=None,
):
    centers = k_init(
        X,
        n_clusters,
        init=init,
        oversampling_factor=oversampling_factor,
        random_state=random_state,
        max_iter=init_max_iter,
    )
    dt = X.dtype
    P = X.shape[1]
    for i in range(max_iter):
        with _timer("Lloyd loop %2d." % i, _logger=logger):
            labels, distances = pairwise_distances_argmin_min(
                X,
                centers,
                metric="euclidean",
                metric_kwargs={"squared": True})

            labels = labels.astype(np.int32)
            # distances is always float64, but we need it to match X.dtype
            # for centers_dense, but remain float64 for inertia
            r = blockwise(
                _centers_dense,
                "ij",
                X,
                "ij",
                labels,
                "i",
                n_clusters,
                None,
                "i",
                adjust_chunks={
                    "i": n_clusters,
                    "j": P
                },
                dtype=X.dtype,
            )
            new_centers = da.from_delayed(sum(r.to_delayed().flatten()),
                                          (n_clusters, P), X.dtype)
            counts = da.bincount(labels, minlength=n_clusters)
            # Require at least one per bucket, to avoid division by 0.
            counts = da.maximum(counts, 1)
            new_centers = new_centers / counts[:, None]
            (new_centers, ) = compute(new_centers)

            # Convergence check
            shift = squared_norm(centers - new_centers)

            logger.info("Shift: %0.4f", shift)
            if shift < tol:
                break
            centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)
        labels = labels.astype(np.int32)

    inertia = distances.sum()
    centers = centers.astype(dt)

    return labels, inertia, centers, i + 1
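
# A minimal sketch of the per-cluster centre update that the blockwise /
# _centers_dense machinery above performs, written with plain dask.array ops on
# a toy problem: sum the points assigned to each cluster and divide by the
# clipped per-cluster counts, mirroring the da.bincount / da.maximum step above.
import dask.array as da

n_clusters = 3
X = da.random.random((1_000, 2), chunks=(250, 2))
labels = da.random.randint(0, n_clusters, size=1_000, chunks=250)

counts = da.maximum(da.bincount(labels, minlength=n_clusters), 1)
one_hot = labels[:, None] == da.arange(n_clusters)[None, :]   # (n_points, n_clusters)
sums = one_hot.astype(X.dtype).T @ X                          # (n_clusters, n_features)
new_centers = (sums / counts[:, None]).compute()
print(new_centers.shape)  # (3, 2)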
Exemplo n.º 54
0
def to_csv(
    df,
    filename,
    single_file=False,
    encoding="utf-8",
    mode="wt",
    name_function=None,
    compression=None,
    compute=True,
    scheduler=None,
    storage_options=None,
    header_first_partition_only=None,
    compute_kwargs=None,
    **kwargs,
):
    """
    Store Dask DataFrame to CSV files

    One filename per partition will be created. You can specify the
    filenames in a variety of ways.

    Use a globstring::

    >>> df.to_csv('/path/to/data/export-*.csv')  # doctest: +SKIP

    The * will be replaced by the increasing sequence 0, 1, 2, ...

    ::

        /path/to/data/export-0.csv
        /path/to/data/export-1.csv

    Use a globstring and a ``name_function=`` keyword argument.  The
    name_function function should expect an integer and produce a string.
    Strings produced by name_function must preserve the order of their
    respective partition indices.

    >>> from datetime import date, timedelta
    >>> def name(i):
    ...     return str(date(2015, 1, 1) + i * timedelta(days=1))

    >>> name(0)
    '2015-01-01'
    >>> name(15)
    '2015-01-16'

    >>> df.to_csv('/path/to/data/export-*.csv', name_function=name)  # doctest: +SKIP

    ::

        /path/to/data/export-2015-01-01.csv
        /path/to/data/export-2015-01-02.csv
        ...

    You can also provide an explicit list of paths::

    >>> paths = ['/path/to/data/alice.csv', '/path/to/data/bob.csv', ...]  # doctest: +SKIP
    >>> df.to_csv(paths) # doctest: +SKIP

    Parameters
    ----------
    df : dask.DataFrame
        Data to save
    filename : string
        Path glob indicating the naming scheme for the output files
    single_file : bool, default False
        Whether to save everything into a single CSV file. Under the
        single file mode, each partition is appended at the end of the
        specified CSV file. Note that not all filesystems support the
        append mode and thus the single file mode, especially on cloud
        storage systems such as S3 or GCS. A warning will be issued when
        writing to a file that is not backed by a local filesystem.
    encoding : string, optional
        A string representing the encoding to use in the output file,
        defaults to 'utf-8'.
    mode : str
        Python write mode, default 'wt'
    name_function : callable, default None
        Function accepting an integer (partition index) and producing a
        string to replace the asterisk in the given filename globstring.
        Should preserve the lexicographic order of partitions. Not
        supported when `single_file` is `True`.
    compression : string, optional
        a string representing the compression to use in the output file,
        allowed values are 'gzip', 'bz2', 'xz',
        only used when the first argument is a filename
    compute : bool
        If True, immediately executes. If False, returns a list of delayed
        objects, which can be computed at a later time.
    storage_options : dict
        Parameters passed on to the backend filesystem class.
    header_first_partition_only : boolean, default None
        If set to `True`, only write the header row in the first output
        file. By default, headers are written to all partitions under
        the multiple file mode (`single_file` is `False`) and written
        only once under the single file mode (`single_file` is `True`).
        It must not be `False` under the single file mode.
    compute_kwargs : dict, optional
        Options to be passed in to the compute method
    kwargs : dict, optional
        Additional parameters to pass to `pd.DataFrame.to_csv()`

    Returns
    -------
    The names of the files written if they were computed right away.
    If not, the delayed tasks associated with writing the files.

    Raises
    ------
    ValueError
        If `header_first_partition_only` is set to `False` or
        `name_function` is specified when `single_file` is `True`.
    """
    if single_file and name_function is not None:
        raise ValueError(
            "name_function is not supported under the single file mode")
    if header_first_partition_only is None:
        header_first_partition_only = single_file
    elif not header_first_partition_only and single_file:
        raise ValueError(
            "header_first_partition_only cannot be False in the single file mode."
        )
    file_options = dict(
        compression=compression,
        encoding=encoding,
        newline="",
        **(storage_options or {}),
    )
    to_csv_chunk = delayed(_write_csv, pure=False)
    dfs = df.to_delayed()
    if single_file:
        first_file = open_file(filename, mode=mode, **file_options)
        if not isinstance(first_file.fs,
                          fsspec.implementations.local.LocalFileSystem):
            warn("Appending data to a network storage system may not work.")
        value = to_csv_chunk(dfs[0], first_file, **kwargs)
        append_mode = mode.replace("w", "") + "a"
        append_file = open_file(filename, mode=append_mode, **file_options)
        kwargs["header"] = False
        for d in dfs[1:]:
            value = to_csv_chunk(d, append_file, depend_on=value, **kwargs)
        values = [value]
        files = [first_file]
    else:
        files = open_files(
            filename,
            mode=mode,
            name_function=name_function,
            num=df.npartitions,
            **file_options,
        )
        values = [to_csv_chunk(dfs[0], files[0], **kwargs)]
        if header_first_partition_only:
            kwargs["header"] = False
        values.extend(
            [to_csv_chunk(d, f, **kwargs) for d, f in zip(dfs[1:], files[1:])])
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()

        if scheduler is not None:
            warn(
                "The 'scheduler' keyword argument for `to_csv()` is deprecated and"
                "will be removed in a future version. "
                "Please use the `compute_kwargs` argument instead. "
                f"For example, df.to_csv(..., compute_kwargs={{scheduler: {scheduler}}})",
                FutureWarning,
            )

        if (scheduler is not None
                and compute_kwargs.get("scheduler") is not None
                and compute_kwargs.get("scheduler") != scheduler):
            raise ValueError(
                f"Differing values for 'scheduler' have been passed in.\n"
                f"scheduler argument: {scheduler}\n"
                f"via compute_kwargs: {compute_kwargs.get('scheduler')}")

        if scheduler is not None and compute_kwargs.get("scheduler") is None:
            compute_kwargs["scheduler"] = scheduler

        import dask

        dask.compute(*values, **compute_kwargs)
        return [f.path for f in files]
    else:
        return values
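
# A minimal sketch (output paths are hypothetical) of the deferred-write
# behaviour described above: with compute=False the delayed per-partition
# writes are returned instead of executed, and can be run later in a single
# dask.compute call, possibly together with other work.
import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
writes = ddf.to_csv("export-*.csv", compute=False)
dask.compute(*writes)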
Exemplo n.º 55
0
def forecast(
    R,
    metadata,
    V,
    n_timesteps,
    n_ens_members=24,
    n_cascade_levels=6,
    win_size=256,
    overlap=0.1,
    war_thr=0.1,
    extrap_method="semilagrangian",
    decomp_method="fft",
    bandpass_filter_method="gaussian",
    noise_method="ssft",
    ar_order=2,
    vel_pert_method=None,
    probmatching_method="cdf",
    mask_method="incremental",
    callback=None,
    fft_method="numpy",
    return_output=True,
    seed=None,
    num_workers=1,
    extrap_kwargs=None,
    filter_kwargs=None,
    noise_kwargs=None,
    vel_pert_kwargs=None,
    mask_kwargs=None,
    measure_time=False,
):
    """
    Generate a nowcast ensemble by using the Short-space ensemble prediction
    system (SSEPS) method.
    This is an experimental version of STEPS which allows for localization
    by means of a window function.

    Parameters
    ----------
    R : array-like
        Array of shape (ar_order+1,m,n) containing the input precipitation fields
        ordered by timestamp from oldest to newest. The time steps between the inputs
        are assumed to be regular, and the inputs are required to have finite values.
    metadata : dict
        Metadata dictionary containing the accutime, xpixelsize, threshold and
        zerovalue attributes as described in the documentation of
        :py:mod:`pysteps.io.importers`. xpixelsize is assumed to be in meters.
    V : array-like
        Array of shape (2,m,n) containing the x- and y-components of the advection
        field. The velocities are assumed to represent one time step between the
        inputs. All values are required to be finite.
    win_size : int or two-element sequence of ints
        Size-length of the localization window.
    overlap : float [0,1[
        A float between 0 and 1 prescribing the level of overlap between
        successive windows. If set to 0, no overlap is used.
    war_thr : float
        Threshold for the minimum fraction of rain in a given window.
    n_timesteps : int
        Number of time steps to forecast.
    n_ens_members : int
        The number of ensemble members to generate.
    n_cascade_levels : int
        The number of cascade levels to use.

    extrap_method : {'semilagrangian'}
        Name of the extrapolation method to use. See the documentation of
        pysteps.extrapolation.interface.
    decomp_method : {'fft'}
        Name of the cascade decomposition method to use. See the documentation
        of pysteps.cascade.interface.
    bandpass_filter_method : {'gaussian', 'uniform'}
        Name of the bandpass filter method to use with the cascade
        decomposition.
    noise_method : {'parametric','nonparametric','ssft','nested',None}
        Name of the noise generator to use for perturbating the precipitation
        field. See the documentation of pysteps.noise.interface. If set to None,
        no noise is generated.
    ar_order: int
        The order of the autoregressive model to use. Must be >= 1.
    vel_pert_method: {'bps',None}
        Name of the noise generator to use for perturbing the advection field.
        See the documentation of pysteps.noise.interface. If set to None,
        the advection field is not perturbed.
    mask_method : {'incremental', None}
        The method to use for masking no precipitation areas in the forecast
        field. The masked pixels are set to the minimum value of the
        observations. 'incremental' = iteratively buffer the mask with a
        certain rate (currently it is 1 km/min), None=no masking.
    probmatching_method : {'cdf', None}
        Method for matching the statistics of the forecast field with those of
        the most recently observed one. 'cdf'=map the forecast CDF to the
        observed one, None=no matching applied. Using 'mean' requires
        that mask_method is not None.
    callback : function
        Optional function that is called after computation of each time step of
        the nowcast. The function takes one argument: a three-dimensional array
        of shape (n_ens_members,h,w), where h and w are the height and width
        of the input field R, respectively. This can be used, for instance,
        writing the outputs into files.
    return_output : bool
        Set to False to disable returning the outputs as numpy arrays. This can
        save memory if the intermediate results are written to output files
        using the callback function.
    seed : int
        Optional seed number for the random generators.
    num_workers : int
        The number of workers to use for parallel computation. Applicable if
        dask is enabled or pyFFTW is used for computing the FFT.
        When num_workers>1, it is advisable to disable OpenMP by setting the
        environment variable OMP_NUM_THREADS to 1.
        This avoids slowdown caused by too many simultaneous threads.
    fft_method : str
        A string defining the FFT method to use (see utils.fft.get_method).
        Defaults to 'numpy' for compatibility reasons. If pyFFTW is installed,
        the recommended method is 'pyfftw'.
    extrap_kwargs : dict
        Optional dictionary containing keyword arguments for the extrapolation
        method. See the documentation of pysteps.extrapolation.
    filter_kwargs : dict
        Optional dictionary containing keyword arguments for the filter method.
        See the documentation of pysteps.cascade.bandpass_filters.py.
    noise_kwargs : dict
        Optional dictionary containing keyword arguments for the initializer of
        the noise generator. See the documentation of
        pysteps.noise.fftgenerators.
    vel_pert_kwargs : dict
        Optional dictionary containing keyword arguments "p_pert_par" and
        "p_pert_perp" for the initializer of the velocity perturbator.
        See the documentation of pysteps.noise.motion.
    mask_kwargs : dict
        Optional dictionary containing mask keyword arguments 'mask_f' and
        'mask_rim', the factor defining the mask increment and the rim size,
        respectively.
        The mask increment is defined as mask_f*timestep/kmperpixel.
    measure_time : bool
        If set to True, measure, print and return the computation time.

    Returns
    -------
    out : ndarray
        If return_output is True, a four-dimensional array of shape
        (n_ens_members,n_timesteps,m,n) containing a time series of forecast
        precipitation fields for each ensemble member. Otherwise, a None value
        is returned. The time series starts from t0+timestep, where timestep is
        taken from the input precipitation fields R.

    See also
    --------
    pysteps.extrapolation.interface, pysteps.cascade.interface,
    pysteps.noise.interface, pysteps.noise.utils.compute_noise_stddev_adjs

    Notes
    -----
    Please be aware that this represents a (very) experimental implementation.

    References
    ----------
    :cite:`Seed2003`, :cite:`BPS2006`, :cite:`SPN2013`, :cite:`NBSG2017`

    """
    _check_inputs(R, V, ar_order)

    if extrap_kwargs is None:
        extrap_kwargs = dict()

    if filter_kwargs is None:
        filter_kwargs = dict()

    if noise_kwargs is None:
        noise_kwargs = dict()

    if vel_pert_kwargs is None:
        vel_pert_kwargs = dict()

    if mask_kwargs is None:
        mask_kwargs = dict()

    if np.any(~np.isfinite(R)):
        raise ValueError("R contains non-finite values")

    if np.any(~np.isfinite(V)):
        raise ValueError("V contains non-finite values")

    if mask_method not in ["incremental", None]:
        raise ValueError(
            "unknown mask method %s: must be 'incremental' or None" %
            mask_method)

    if np.isscalar(win_size):
        win_size = (int(win_size), int(win_size))
    else:
        win_size = tuple([int(win_size[i]) for i in range(2)])

    timestep = metadata["accutime"]
    kmperpixel = metadata["xpixelsize"] / 1000

    print("Computing SSEPS nowcast:")
    print("------------------------")
    print("")

    print("Inputs:")
    print("-------")
    print("input dimensions: %dx%d" % (R.shape[1], R.shape[2]))
    print("km/pixel:         %g" % kmperpixel)
    print("time step:        %d minutes" % timestep)
    print("")

    print("Methods:")
    print("--------")
    print("extrapolation:          %s" % extrap_method)
    print("bandpass filter:        %s" % bandpass_filter_method)
    print("decomposition:          %s" % decomp_method)
    print("noise generator:        %s" % noise_method)
    print("velocity perturbator:   %s" % vel_pert_method)
    print("precip. mask method:    %s" % mask_method)
    print("probability matching:   %s" % probmatching_method)
    print("FFT method:             %s" % fft_method)
    print("")

    print("Parameters:")
    print("-----------")
    print("localization window:      %dx%d" % (win_size[0], win_size[1]))
    print("overlap:                  %.1f" % overlap)
    print("war thr:                  %.2f" % war_thr)
    print("number of time steps:     %d" % n_timesteps)
    print("ensemble size:            %d" % n_ens_members)
    print("number of cascade levels: %d" % n_cascade_levels)
    print("order of the AR(p) model: %d" % ar_order)
    print("dask imported:            %s" % ("yes" if dask_imported else "no"))
    print("num workers:              %d" % num_workers)

    if vel_pert_method == "bps":
        vp_par = vel_pert_kwargs.get("p_pert_par",
                                     noise.motion.get_default_params_bps_par())
        vp_perp = vel_pert_kwargs.get(
            "p_pert_perp", noise.motion.get_default_params_bps_perp())
        print("velocity perturbations, parallel:      %g,%g,%g" %
              (vp_par[0], vp_par[1], vp_par[2]))
        print("velocity perturbations, perpendicular: %g,%g,%g" %
              (vp_perp[0], vp_perp[1], vp_perp[2]))

    R_thr = metadata["threshold"]
    R_min = metadata["zerovalue"]

    num_ensemble_workers = n_ens_members if num_workers > n_ens_members else num_workers

    if measure_time:
        starttime_init = time.time()

    # get methods
    extrapolator_method = extrapolation.get_method(extrap_method)

    x_values, y_values = np.meshgrid(np.arange(R.shape[2]),
                                     np.arange(R.shape[1]))

    xy_coords = np.stack([x_values, y_values])

    decomp_method, __ = cascade.get_method(decomp_method)
    filter_method = cascade.get_method(bandpass_filter_method)
    if noise_method is not None:
        init_noise, generate_noise = noise.get_method(noise_method)

    # advect the previous precipitation fields to the same position with the
    # most recent one (i.e. transform them into the Lagrangian coordinates)
    R = R[-(ar_order + 1):, :, :].copy()
    extrap_kwargs = extrap_kwargs.copy()
    extrap_kwargs["xy_coords"] = xy_coords
    res = []
    f = lambda R, i: extrapolator_method(R[i, :, :], V, ar_order - i, "min", **
                                         extrap_kwargs)[-1]
    for i in range(ar_order):
        if not dask_imported:
            R[i, :, :] = f(R, i)
        else:
            res.append(dask.delayed(f)(R, i))

    if dask_imported:
        num_workers_ = len(res) if num_workers > len(res) else num_workers
        R = np.stack(
            list(dask.compute(*res, num_workers=num_workers_)) + [R[-1, :, :]])

    if mask_method == "incremental":
        # get mask parameters
        mask_rim = mask_kwargs.get("mask_rim", 10)
        mask_f = mask_kwargs.get("mask_f", 1.0)
        # initialize the structuring element
        struct = scipy.ndimage.generate_binary_structure(2, 1)
        # iterate it to expand it nxn
        n = mask_f * timestep / kmperpixel
        struct = scipy.ndimage.iterate_structure(struct, int((n - 1) / 2.0))

    noise_kwargs.update({
        "win_size": win_size,
        "overlap": overlap,
        "war_thr": war_thr,
        "rm_rdisc": True,
        "donorm": True,
    })

    print("Estimating nowcast parameters...", end="")

    def estimator(R, parsglob=None, idxm=None, idxn=None):

        pars = {}

        # initialize the perturbation generator for the precipitation field
        if noise_method is not None and parsglob is None:
            P = init_noise(R, fft_method=fft_method, **noise_kwargs)
        else:
            P = None
        pars["P"] = P

        # initialize the band-pass filter
        if parsglob is None:
            filter = filter_method(R.shape[1:], n_cascade_levels,
                                   **filter_kwargs)
            pars["filter"] = filter
        else:
            pars["filter"] = None

        # compute the cascade decompositions of the input precipitation fields
        if parsglob is None:
            R_d = []
            for i in range(ar_order + 1):
                R_d_ = decomp_method(
                    R[i, :, :],
                    filter,
                    fft_method=fft_method,
                    normalize=True,
                    compute_stats=True,
                )
                R_d.append(R_d_)
            R_d_ = None

        # normalize the cascades and rearrange them into a four-dimensional array
        # of shape (n_cascade_levels,ar_order+1,m,n) for the autoregressive model
        if parsglob is None:
            R_c = nowcast_utils.stack_cascades(R_d, n_cascade_levels)
            mu = R_d[-1]["means"]
            sigma = R_d[-1]["stds"]
            R_d = None

        else:
            R_c = parsglob["R_c"][0][:, :,
                                     idxm.item(0):idxm.item(1),
                                     idxn.item(0):idxn.item(1)].copy()
            mu = np.mean(R_c, axis=(2, 3))
            sigma = np.std(R_c, axis=(2, 3))

            R_c = (R_c - mu[:, :, None, None]) / sigma[:, :, None, None]

            mu = mu[:, -1]
            sigma = sigma[:, -1]

        pars["mu"] = mu
        pars["sigma"] = sigma

        # compute lag-l temporal autocorrelation coefficients for each cascade level
        GAMMA = np.empty((n_cascade_levels, ar_order))
        for i in range(n_cascade_levels):
            R_c_ = np.stack([R_c[i, j, :, :] for j in range(ar_order + 1)])
            GAMMA[i, :] = correlation.temporal_autocorrelation(R_c_)
        R_c_ = None

        if ar_order == 2:
            # adjust the local lag-2 correlation coefficient to ensure that the AR(p)
            # process is stationary
            for i in range(n_cascade_levels):
                GAMMA[i, 1] = autoregression.adjust_lag2_corrcoef2(
                    GAMMA[i, 0], GAMMA[i, 1])

        # estimate the parameters of the AR(p) model from the autocorrelation
        # coefficients
        PHI = np.empty((n_cascade_levels, ar_order + 1))
        for i in range(n_cascade_levels):
            PHI[i, :] = autoregression.estimate_ar_params_yw(GAMMA[i, :])
        pars["PHI"] = PHI

        # stack the cascades into a five-dimensional array containing all ensemble
        # members
        R_c = [R_c.copy() for i in range(n_ens_members)]
        pars["R_c"] = R_c

        if mask_method is not None and parsglob is None:
            MASK_prec = R[-1, :, :] >= R_thr
            if mask_method == "incremental":
                # initialize precip mask for each member
                MASK_prec = _compute_incremental_mask(MASK_prec, struct,
                                                      mask_rim)
                MASK_prec = [MASK_prec.copy() for j in range(n_ens_members)]
        else:
            MASK_prec = None
        pars["MASK_prec"] = MASK_prec

        return pars

    # prepare windows
    M, N = R.shape[1:]
    n_windows_M = np.ceil(1.0 * M / win_size[0]).astype(int)
    n_windows_N = np.ceil(1.0 * N / win_size[1]).astype(int)
    idxm = np.zeros((2, 1), dtype=int)
    idxn = np.zeros((2, 1), dtype=int)

    sys.stdout.flush()
    if measure_time:
        starttime = time.time()

    # compute global parameters to be used as defaults
    parsglob = estimator(R)

    # loop windows
    if n_windows_M > 1 or n_windows_N > 1:
        war = np.empty((n_windows_M, n_windows_N))
        PHI = np.empty(
            (n_windows_M, n_windows_N, n_cascade_levels, ar_order + 1))
        mu = np.empty((n_windows_M, n_windows_N, n_cascade_levels))
        sigma = np.empty((n_windows_M, n_windows_N, n_cascade_levels))
        ff = []
        rc = []
        pp = []
        mm = []
        for m in range(n_windows_M):
            ff_ = []
            pp_ = []
            rc_ = []
            mm_ = []
            for n in range(n_windows_N):

                # compute indices of local window
                idxm[0] = int(
                    np.max((m * win_size[0] - overlap * win_size[0], 0)))
                idxm[1] = int(
                    np.min((idxm[0] + win_size[0] + overlap * win_size[0], M)))
                idxn[0] = int(
                    np.max((n * win_size[1] - overlap * win_size[1], 0)))
                idxn[1] = int(
                    np.min((idxn[0] + win_size[1] + overlap * win_size[1], N)))

                mask = np.zeros((M, N), dtype=bool)
                mask[idxm.item(0):idxm.item(1),
                     idxn.item(0):idxn.item(1)] = True

                R_ = R[:, idxm.item(0):idxm.item(1), idxn.item(0):idxn.item(1)]

                war[m, n] = np.sum(R_[-1, :, :] >= R_thr) / R_[-1, :, :].size
                if war[m, n] > war_thr:

                    # estimate local parameters
                    pars = estimator(R, parsglob, idxm, idxn)
                    ff_.append(pars["filter"])
                    pp_.append(pars["P"])
                    rc_.append(pars["R_c"])
                    mm_.append(pars["MASK_prec"])
                    mu[m, n, :] = pars["mu"]
                    sigma[m, n, :] = pars["sigma"]
                    PHI[m, n, :, :] = pars["PHI"]

                else:
                    # dry window
                    ff_.append(None)
                    pp_.append(None)
                    rc_.append(None)
                    mm_.append(None)

            ff.append(ff_)
            pp.append(pp_)
            rc.append(rc_)
            mm.append(mm_)

        # remove unnecessary variables
        ff_ = None
        pp_ = None
        rc_ = None
        mm_ = None
        pars = None

    if measure_time:
        print("%.2f seconds." % (time.time() - starttime))
    else:
        print(" done.")

    # initialize the random generators
    if noise_method is not None:
        randgen_prec = []
        randgen_motion = []
        np.random.seed(seed)
        for j in range(n_ens_members):
            rs = np.random.RandomState(seed)
            randgen_prec.append(rs)
            seed = rs.randint(0, high=1e9)
            rs = np.random.RandomState(seed)
            randgen_motion.append(rs)
            seed = rs.randint(0, high=1e9)

    if vel_pert_method is not None:
        init_vel_noise, generate_vel_noise = noise.get_method(vel_pert_method)

        # initialize the perturbation generators for the motion field
        vps = []
        for j in range(n_ens_members):
            kwargs = {
                "randstate": randgen_motion[j],
                "p_par": vp_par,
                "p_perp": vp_perp,
            }
            vp_ = init_vel_noise(V, 1.0 / kmperpixel, timestep, **kwargs)
            vps.append(vp_)

    D = [None for j in range(n_ens_members)]
    R_f = [[] for j in range(n_ens_members)]

    if measure_time:
        init_time = time.time() - starttime_init

    R = R[-1, :, :]

    print("Starting nowcast computation.")

    if measure_time:
        starttime_mainloop = time.time()

    # iterate each time step
    for t in range(n_timesteps):
        print("Computing nowcast for time step %d... " % (t + 1), end="")
        sys.stdout.flush()
        if measure_time:
            starttime = time.time()

        # iterate each ensemble member
        def worker(j):

            # first the global step

            if noise_method is not None:
                # generate noise field
                EPS = generate_noise(parsglob["P"],
                                     randstate=randgen_prec[j],
                                     fft_method=fft_method)
                # decompose the noise field into a cascade
                EPS_d = decomp_method(
                    EPS,
                    parsglob["filter"],
                    fft_method=fft_method,
                    normalize=True,
                    compute_stats=True,
                )
            else:
                EPS_d = None

            # iterate the AR(p) model for each cascade level
            R_c = parsglob["R_c"][j].copy()
            if R_c.shape[1] >= ar_order:
                R_c = R_c[:, -ar_order:, :, :].copy()
            for i in range(n_cascade_levels):
                # normalize the noise cascade
                if EPS_d is not None:
                    EPS_ = (EPS_d["cascade_levels"][i, :, :] -
                            EPS_d["means"][i]) / EPS_d["stds"][i]
                else:
                    EPS_ = None
                # apply AR(p) process to cascade level
                R_c[i, :, :, :] = autoregression.iterate_ar_model(
                    R_c[i, :, :, :], parsglob["PHI"][i, :], eps=EPS_)
                EPS_ = None
            parsglob["R_c"][j] = R_c.copy()
            EPS = None

            # compute the recomposed precipitation field(s) from the cascades
            # obtained from the AR(p) model(s)
            R_c_ = _recompose_cascade(R_c, parsglob["mu"], parsglob["sigma"])
            R_c = None

            # then the local steps
            if n_windows_M > 1 or n_windows_N > 1:
                idxm = np.zeros((2, 1), dtype=int)
                idxn = np.zeros((2, 1), dtype=int)
                R_l = np.zeros((M, N), dtype=float)
                M_s = np.zeros((M, N), dtype=float)
                for m in range(n_windows_M):
                    for n in range(n_windows_N):

                        # compute indices of local window
                        idxm[0] = int(np.max((m * win_size[0] - overlap * win_size[0], 0)))
                        idxm[1] = int(
                            np.min((idxm[0] + win_size[0] + overlap * win_size[0], M)))
                        idxn[0] = int(np.max((n * win_size[1] - overlap * win_size[1], 0)))
                        idxn[1] = int(
                            np.min((idxn[0] + win_size[1] + overlap * win_size[1], N)))

                        # build localization mask
                        mask = _get_mask((M, N), idxm, idxn)
                        mask_l = mask[idxm.item(0):idxm.item(1),
                                      idxn.item(0):idxn.item(1)]
                        M_s += mask

                        # skip if dry
                        if war[m, n] > war_thr:

                            R_c = rc[m][n][j].copy()
                            if R_c.shape[1] >= ar_order:
                                R_c = R_c[:, -ar_order:, :, :]
                            if noise_method is not None:
                                # extract noise field
                                EPS_d_l = EPS_d["cascade_levels"][
                                    :,
                                    idxm.item(0):idxm.item(1),
                                    idxn.item(0):idxn.item(1),
                                ].copy()
                                mu_ = np.mean(EPS_d_l, axis=(1, 2))
                                sigma_ = np.std(EPS_d_l, axis=(1, 2))
                            else:
                                EPS_d_l = None

                            # iterate the AR(p) model for each cascade level
                            for i in range(n_cascade_levels):
                                # normalize the noise cascade
                                if EPS_d_l is not None:
                                    EPS_ = (EPS_d_l[i, :, :] -
                                            mu_[i, None,
                                                None]) / sigma_[i, None, None]
                                else:
                                    EPS_ = None
                                # apply AR(p) process to cascade level
                                R_c[i, :, :, :] = autoregression.iterate_ar_model(
                                    R_c[i, :, :, :], PHI[m, n, i, :], eps=EPS_)
                                EPS_ = None
                            rc[m][n][j] = R_c.copy()
                            EPS_d_l = mu_ = sigma_ = None

                            # compute the recomposed precipitation field(s) from the cascades
                            # obtained from the AR(p) model(s)
                            mu_ = mu[m, n, :]
                            sigma_ = sigma[m, n, :]
                            R_c = [((R_c[i, -1, :, :] * sigma_[i]) + mu_[i]) *
                                   parsglob["sigma"][i] + parsglob["mu"][i]
                                   for i in range(len(mu_))]
                            R_l_ = np.sum(np.stack(R_c), axis=0)
                            R_c = mu_ = sigma_ = None
                            # R_l_ = _recompose_cascade(R_c[:, :, :], mu[m, n, :], sigma[m, n, :])
                        else:
                            R_l_ = R_c_[idxm.item(0):idxm.item(1),
                                        idxn.item(0):idxn.item(1)].copy()

                        if probmatching_method == "cdf":
                            # adjust the CDF of the forecast to match the most recently
                            # observed precipitation field
                            R_ = R[idxm.item(0):idxm.item(1),
                                   idxn.item(0):idxn.item(1)].copy()
                            R_l_ = probmatching.nonparam_match_empirical_cdf(
                                R_l_, R_)
                            R_ = None

                        R_l[idxm.item(0):idxm.item(1),
                            idxn.item(0):idxn.item(1)] += (R_l_ * mask_l)
                        R_l_ = None

                ind = M_s > 0
                R_l[ind] *= 1 / M_s[ind]
                R_l[~ind] = R_min

                R_c_ = R_l.copy()
                R_l = None

            if probmatching_method == "cdf":
                # adjust the CDF of the forecast to match the most recently
                # observed precipitation field
                R_c_[R_c_ < R_thr] = R_min
                R_c_ = probmatching.nonparam_match_empirical_cdf(R_c_, R)

            if mask_method is not None:
                # apply the precipitation mask to prevent generation of new
                # precipitation into areas where it was not originally
                # observed
                if mask_method == "incremental":
                    MASK_prec = parsglob["MASK_prec"][j].copy()
                    R_c_ = R_c_.min() + (R_c_ - R_c_.min()) * MASK_prec
                    MASK_prec = None

            if mask_method == "incremental":
                parsglob["MASK_prec"][j] = _compute_incremental_mask(
                    R_c_ >= R_thr, struct, mask_rim)

            # compute the perturbed motion field
            if vel_pert_method is not None:
                V_ = V + generate_vel_noise(vps[j], (t + 1) * timestep)
            else:
                V_ = V

            # advect the recomposed precipitation field to obtain the forecast
            # for time step t
            extrap_kwargs.update({
                "displacement_prev": D[j],
                "return_displacement": True
            })
            R_f_, D_ = extrapolator_method(R_c_, V_, 1, **extrap_kwargs)
            D[j] = D_
            R_f_ = R_f_[0]

            R_f_[R_f_ < R_thr] = R_min

            return R_f_

        res = []
        for j in range(n_ens_members):
            if not dask_imported or n_ens_members == 1:
                res.append(worker(j))
            else:
                res.append(dask.delayed(worker)(j))

        R_f_ = (dask.compute(*res, num_workers=num_ensemble_workers)
                if dask_imported and n_ens_members > 1 else res)
        res = None

        if measure_time:
            print("%.2f seconds." % (time.time() - starttime))
        else:
            print("done.")

        if callback is not None:
            callback(np.stack(R_f_))

        if return_output:
            for j in range(n_ens_members):
                R_f[j].append(R_f_[j])

        # free the per-step results only after both the callback and the
        # output accumulation have used them
        R_f_ = None

    if measure_time:
        mainloop_time = time.time() - starttime_mainloop

    if return_output:
        outarr = np.stack([np.stack(R_f[j]) for j in range(n_ens_members)])
        if measure_time:
            return outarr, init_time, mainloop_time
        else:
            return outarr
    else:
        return None
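
The ensemble loop above fans the per-member worker out with dask.delayed and gathers the results with a single dask.compute call, capping the worker count at the ensemble size and falling back to a plain Python loop when dask is unavailable or only one member is requested. Below is a minimal, self-contained sketch of that pattern; member_forecast and run_ensemble are illustrative stand-ins, not names taken from the code above.

import numpy as np

try:
    import dask
    DASK_IMPORTED = True
except ImportError:
    DASK_IMPORTED = False


def member_forecast(j, field):
    # stand-in for the per-member worker: add member-specific perturbations
    rng = np.random.RandomState(j)
    return field + 0.1 * rng.standard_normal(field.shape)


def run_ensemble(field, n_ens_members=4, num_workers=2):
    # cap the dask worker count at the number of ensemble members
    num_ensemble_workers = min(n_ens_members, num_workers)
    res = []
    for j in range(n_ens_members):
        if not DASK_IMPORTED or n_ens_members == 1:
            res.append(member_forecast(j, field))  # serial fallback
        else:
            res.append(dask.delayed(member_forecast)(j, field))
    return (
        dask.compute(*res, num_workers=num_ensemble_workers)
        if DASK_IMPORTED and n_ens_members > 1
        else res
    )


ensemble = run_ensemble(np.zeros((8, 8)))
print(len(ensemble))  # 4
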
Exemplo n.º 56
0
def test_wait_on_many(layers):
    t1, t2, cnt = demo_tuples(layers)
    out = wait_on(t1, {"x": [t2]})
    assert dask.compute(*out, scheduler="sync") == ((1, 2, 3), {"x": [(4, 5)]})
    assert cnt.n == 5
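
wait_on comes from dask.graph_manipulation (available in dask 2021.x and later) and blocks every downstream task until all of its inputs have been fully computed; as the test above shows, it also accepts nested Python containers of dask collections. A minimal sketch with plain delayed objects, assuming only dask itself; inc is an illustrative helper.

import dask
from dask.graph_manipulation import wait_on


@dask.delayed
def inc(x):
    return x + 1


a, b = inc(1), inc(10)
# neither result is handed downstream until both a and b have been computed
a2, b2 = wait_on(a, b)
print(dask.compute(a2, b2, scheduler="sync"))  # (2, 11)
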
Exemplo n.º 57
0
# subprocess.Popen(['mpirun', '--np', '4', 'dask-mpi'], stdin=subprocess.DEVNULL)

# models to run
model_names = [
    "casing", "background", "permeable", "approx_casing", "approx_permeable",
    "approx_permeable2"
]


# Set up the simulation
@dask.delayed
def run_simulation(m):
    sim = casingSimulations.run.SimulationTDEM(
        modelParameters=m + ".json",
        meshGenerator='MeshParameters.json',
        srcList='sources.json',
        fields_filename=m + "_fields.npy")
    fields = sim.run(verbose=True)
    return fields[:, '{}Solution'.format(sim.formulation), :]


f = {}
for m in model_names:
    f[m] = run_simulation(m)

dask.compute(f, num_workers=3)  #, scheduler='distributed')

# # run the simulation
# fields = sim.run(verbose=True)
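
The commented-out mpirun line and the scheduler='distributed' hint suggest this script was also meant to run against a dask.distributed cluster. Here is a sketch of the same delayed-dictionary pattern pointed at a local distributed cluster, assuming dask.distributed is installed; run_simulation below is a lightweight stand-in for the casingSimulations call above, and LocalCluster stands in for the MPI-launched cluster.

import dask
from dask.distributed import Client, LocalCluster


@dask.delayed
def run_simulation(m):
    # stand-in for the real casingSimulations.run.SimulationTDEM call above
    return {"model": m, "status": "ok"}


model_names = ["casing", "background", "permeable"]

if __name__ == "__main__":
    # a LocalCluster stands in for the MPI-launched cluster in the comment above
    with LocalCluster(n_workers=3, threads_per_worker=1) as cluster, Client(cluster) as client:
        # once a Client exists it becomes dask's default scheduler
        f = {m: run_simulation(m) for m in model_names}
        results = dask.compute(f)[0]  # compute returns a tuple; take the materialized dict
        print(results["casing"]["status"])
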
Exemplo n.º 58
0
def persister(bag):
    [bag] = persist_with_trace(bag)
    return dask.compute(bag)
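
persist_with_trace is not shown here, but the helper follows the standard persist-then-compute pattern: dask.persist materializes the collection's partitions while keeping it lazy, and the subsequent dask.compute only gathers the already-computed results. A minimal sketch with plain dask.persist and dask.bag, assuming nothing beyond dask itself.

import dask
import dask.bag as db

bag = db.from_sequence(range(10)).map(lambda x: x * x)

# persist materializes the bag's partitions (in memory for the local schedulers,
# on the workers for a distributed cluster) while keeping it a lazy collection
[bag] = dask.persist(bag)

# compute now only gathers the already-materialized partitions
(result,) = dask.compute(bag)
print(sum(result))  # 285
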
Exemplo n.º 59
0
def run_hpo(daskClient,
            nTimesteps,
            nParticles,
            nWorkers,
            paramRanges,
            trainData_cDF,
            trainLabels_cDF,
            testData_cDF,
            testLabels_cDF,
            randomSeed=0):

    pandasTestLabels = testLabels_cDF.to_pandas()

    if daskClient is not None:
        scatteredData_future = daskClient.scatter(
            [trainData_cDF, trainLabels_cDF, testData_cDF, testLabels_cDF],
            broadcast=True)

        trainData_cDF_future = scatteredData_future[0]
        trainLabels_cDF_future = scatteredData_future[1]
        testData_cDF_future = scatteredData_future[2]
        testLabels_cDF_future = scatteredData_future[3]
    else:
        # without a dask client, fall back to pandas copies of the cuDF data
        # for the single-process training path below
        pandasTrainData = trainData_cDF.to_pandas()
        pandasTrainLabels = trainLabels_cDF.to_pandas()
        pandasTestData = testData_cDF.to_pandas()

    particles, velocities, accuracies, bestParticleIndex, \
        globalBestParticleParams, particleBoostingRounds, particleColors = initalize_hpo ( nTimesteps = nTimesteps,
                                                                                           nParticles = nParticles,
                                                                                           nWorkers = nWorkers,
                                                                                           paramRanges = paramRanges)
    globalBestAccuracy = 0

    trainingTimes = np.zeros((nTimesteps, nParticles))
    startTime = time.time()

    predictionHistory = np.zeros(
        (nTimesteps, nParticles, testData_cDF.shape[0]))

    for iTimestep in range(0, nTimesteps):
        if daskClient is not None:
            # [ delayed ] train xgboost models on train data
            delayedParticleTrain = [
                delayed(train_model_hpo)(trainData_cDF_future,
                                         trainLabels_cDF_future,
                                         testData_cDF_future,
                                         testLabels_cDF_future,
                                         particles[iTimestep, iParticle, :],
                                         iParticle, iTimestep)
                for iParticle in range(nParticles)
            ]

            # [ delayed ] determine the number of trees/training rounds returned by early stopping -- used to set particle sizes
            delayedParticleRounds = [
                iParticle[0].best_iteration
                for iParticle in delayedParticleTrain
            ]

            # [ delayed ] evaluate trained models on test/validation data
            delayedParticlePredictions = [
                delayed(test_model_hpo)(iParticle[0], iParticle[1],
                                        testData_cDF_future,
                                        testLabels_cDF_future)
                for iParticle in delayedParticleTrain
            ]

            # execute delayed
            particlePredictions = dask.compute(delayedParticlePredictions)[0]

            for iParticle in range(nParticles):
                predictionHistory[
                    iTimestep,
                    iParticle, :] = particlePredictions[iParticle][0]

            # compute accuracies of predictions
            accuracies[iTimestep, :] = [
                accuracy_score(pandasTestLabels, iParticle[0])
                for iParticle in particlePredictions
            ]
            particleBoostingRounds[iTimestep, :] = [
                iParticle[1] for iParticle in particlePredictions
            ]
            trainingTimes[iTimestep, :] = [
                iParticle[2] for iParticle in particlePredictions
            ]
            del particlePredictions
        else:
            for iParticle in range(nParticles):
                trainedModels, _ = train_model_hpo(
                    pandasTrainData, pandasTrainLabels,
                    particles[iTimestep, iParticle, :], iParticle, iTimestep)
                predictions, _ = test_model_hpo(trainedModels, pandasTestData,
                                                pandasTestLabels)
                accuracies[iTimestep,
                           iParticle] = accuracy_score(pandasTestLabels,
                                                       predictions)

        bestParticleIndex[iTimestep + 1] = np.argmax(accuracies[iTimestep, :])
        currentBestAccuracy = np.max(accuracies[iTimestep, :])

        print('@ hpo timestep : {}, best accuracy is {}'.format(
            iTimestep, np.max(accuracies[iTimestep, :])))
        if iTimestep + 1 < nTimesteps:
            if currentBestAccuracy > globalBestAccuracy:
                print('\t updating best GLOBAL accuracy')
                globalBestAccuracy = currentBestAccuracy
                globalBestParticleParams[iTimestep + 1] = particles[
                    iTimestep, bestParticleIndex[iTimestep + 1], :]
            else:
                globalBestParticleParams[
                    iTimestep +
                    1] = globalBestParticleParams[iTimestep].copy()

            particles[iTimestep + 1, :, :], velocities[
                iTimestep + 1, :, :] = update_particles(
                    paramRanges,
                    particles[iTimestep, :, :].copy(),
                    velocities[iTimestep, :, :].copy(),
                    bestParticleIndex[iTimestep + 1],
                    globalBestParticleParams[iTimestep + 1],
                    randomSeed=iTimestep)

    particleSizes = particleBoostingRounds / np.max(
        particleBoostingRounds) * 10 + 2
    elapsedTime = time.time() - startTime
    print('elapsed time : {}'.format(elapsedTime))

    return accuracies, particles, velocities, particleSizes, particleColors, bestParticleIndex, particleBoostingRounds, trainingTimes, predictionHistory, elapsedTime
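
run_hpo scatters the training and test frames to the cluster once with broadcast=True and then builds one delayed training task per particle, executing each timestep's batch with a single dask.compute call. Below is a stripped-down sketch of that scatter-plus-delayed-batch pattern, assuming dask.distributed is installed; train_particle is an illustrative stand-in for train_model_hpo and the data are synthetic.

import numpy as np
import dask
from dask import delayed
from dask.distributed import Client


def train_particle(data, labels, params):
    # stand-in for train_model_hpo: "score" one hyperparameter particle
    return float(np.mean(data) * params[0] + np.mean(labels) * params[1])


client = Client(processes=False)  # in-process cluster, just for illustration

data = np.random.rand(1000, 5)
labels = np.random.randint(0, 2, size=1000).astype(float)

# scatter the (potentially large) arrays once; every task reuses the same futures
data_future, labels_future = client.scatter([data, labels], broadcast=True)

particles = np.random.rand(8, 2)  # 8 particles, 2 hyperparameters each
tasks = [delayed(train_particle)(data_future, labels_future, p) for p in particles]
scores = dask.compute(*tasks)
print(max(scores))

client.close()
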
Exemplo n.º 60
0
def forecast(
    R,
    V,
    timesteps,
    n_ens_members=24,
    n_cascade_levels=6,
    R_thr=None,
    kmperpixel=None,
    timestep=None,
    extrap_method="semilagrangian",
    decomp_method="fft",
    bandpass_filter_method="gaussian",
    noise_method="nonparametric",
    noise_stddev_adj=None,
    ar_order=2,
    vel_pert_method="bps",
    conditional=False,
    probmatching_method="cdf",
    mask_method="incremental",
    callback=None,
    return_output=True,
    seed=None,
    num_workers=1,
    fft_method="numpy",
    domain="spatial",
    extrap_kwargs=None,
    filter_kwargs=None,
    noise_kwargs=None,
    vel_pert_kwargs=None,
    mask_kwargs=None,
    measure_time=False,
):
    """Generate a nowcast ensemble by using the Short-Term Ensemble Prediction
    System (STEPS) method.

    Parameters
    ----------
    R: array-like
      Array of shape (ar_order+1,m,n) containing the input precipitation fields
      ordered by timestamp from oldest to newest. The time steps between the
      inputs are assumed to be regular.
    V: array-like
      Array of shape (2,m,n) containing the x- and y-components of the advection
      field. The velocities are assumed to represent one time step between the
      inputs. All values are required to be finite.
    timesteps: int or list of floats
      Number of time steps to forecast or a list of time steps for which the
      forecasts are computed (relative to the input time step). The elements of
      the list are required to be in ascending order.
    n_ens_members: int, optional
      The number of ensemble members to generate.
    n_cascade_levels: int, optional
      The number of cascade levels to use.
    R_thr: float, optional
      Specifies the threshold value for minimum observable precipitation
      intensity. Required if mask_method is not None or conditional is True.
    kmperpixel: float, optional
      Spatial resolution of the input data (kilometers/pixel). Required if
      vel_pert_method is not None or mask_method is 'incremental'.
    timestep: float, optional
      Time step of the motion vectors (minutes). Required if vel_pert_method is
      not None or mask_method is 'incremental'.
    extrap_method: str, optional
      Name of the extrapolation method to use. See the documentation of
      pysteps.extrapolation.interface.
    decomp_method: {'fft'}, optional
      Name of the cascade decomposition method to use. See the documentation
      of pysteps.cascade.interface.
    bandpass_filter_method: {'gaussian', 'uniform'}, optional
      Name of the bandpass filter method to use with the cascade decomposition.
      See the documentation of pysteps.cascade.interface.
    noise_method: {'parametric','nonparametric','ssft','nested',None}, optional
      Name of the noise generator to use for perturbating the precipitation
      field. See the documentation of pysteps.noise.interface. If set to None,
      no noise is generated.
    noise_stddev_adj: {'auto','fixed',None}, optional
      Optional adjustment for the standard deviations of the noise fields added
      to each cascade level. This is done to compensate for incorrect std. dev.
      estimates of cascade levels due to the presence of no-rain areas. 'auto'=use
      the method implemented in pysteps.noise.utils.compute_noise_stddev_adjs.
      'fixed'=use the formula given in :cite:`BPS2006` (eq. 6), None=disable
      noise std. dev. adjustment.
    ar_order: int, optional
      The order of the autoregressive model to use. Must be >= 1.
    vel_pert_method: {'bps',None}, optional
      Name of the noise generator to use for perturbing the advection field. See
      the documentation of pysteps.noise.interface. If set to None, the advection
      field is not perturbed.
    conditional: bool, optional
      If set to True, compute the statistics of the precipitation field
      conditionally by excluding pixels where the values are below the threshold
      R_thr.
    mask_method: {'obs','sprog','incremental',None}, optional
      The method to use for masking no precipitation areas in the forecast field.
      The masked pixels are set to the minimum value of the observations.
      'obs' = apply R_thr to the most recently observed precipitation intensity
      field, 'sprog' = use the smoothed forecast field from S-PROG, where the
      AR(p) model has been applied, 'incremental' = iteratively buffer the mask
      with a certain rate (currently it is 1 km/min), None=no masking.
    probmatching_method: {'cdf','mean',None}, optional
      Method for matching the statistics of the forecast field with those of
      the most recently observed one. 'cdf'=map the forecast CDF to the observed
      one, 'mean'=adjust only the conditional mean value of the forecast field
      in precipitation areas, None=no matching applied. Using 'mean' requires
      that mask_method is not None.
    callback: function, optional
      Optional function that is called after computation of each time step of
      the nowcast. The function takes one argument: a three-dimensional array
      of shape (n_ens_members,h,w), where h and w are the height and width
      of the input field R, respectively. This can be used, for instance, for
      writing the outputs into files.
    return_output: bool, optional
      Set to False to disable returning the outputs as numpy arrays. This can
      save memory if the intermediate results are written to output files using
      the callback function.
    seed: int, optional
      Optional seed number for the random generators.
    num_workers: int, optional
      The number of workers to use for parallel computation. Applicable if dask
      is enabled or pyFFTW is used for computing the FFT. When num_workers>1, it
      is advisable to disable OpenMP by setting the environment variable
      OMP_NUM_THREADS to 1. This avoids slowdown caused by too many simultaneous
      threads.
    fft_method: str, optional
      A string defining the FFT method to use (see utils.fft.get_method).
      Defaults to 'numpy' for compatibility reasons. If pyFFTW is installed,
      the recommended method is 'pyfftw'.
    domain: {"spatial", "spectral"}
      If "spatial", all computations are done in the spatial domain (the
      classical STEPS model). If "spectral", the AR(2) models and stochastic
      perturbations are applied directly in the spectral domain to reduce
      memory footprint and improve performance :cite:`PCH2019b`.
    extrap_kwargs: dict, optional
      Optional dictionary containing keyword arguments for the extrapolation
      method. See the documentation of pysteps.extrapolation.
    filter_kwargs: dict, optional
      Optional dictionary containing keyword arguments for the filter method.
      See the documentation of pysteps.cascade.bandpass_filters.py.
    noise_kwargs: dict, optional
      Optional dictionary containing keyword arguments for the initializer of
      the noise generator. See the documentation of pysteps.noise.fftgenerators.
    vel_pert_kwargs: dict, optional
      Optional dictionary containing keyword arguments 'p_par' and 'p_perp' for
      the initializer of the velocity perturbator. The choice of the optimal
      parameters depends on the domain and the used optical flow method.

      Default parameters from :cite:`BPS2006`:
      p_par  = [10.88, 0.23, -7.68]
      p_perp = [5.76, 0.31, -2.72]

      Parameters fitted to the data (optical flow/domain):

      darts/fmi:
      p_par  = [13.71259667, 0.15658963, -16.24368207]
      p_perp = [8.26550355, 0.17820458, -9.54107834]

      darts/mch:
      p_par  = [24.27562298, 0.11297186, -27.30087471]
      p_perp = [-7.80797846e+01, -3.38641048e-02, 7.56715304e+01]

      darts/fmi+mch:
      p_par  = [16.55447057, 0.14160448, -19.24613059]
      p_perp = [14.75343395, 0.11785398, -16.26151612]

      lucaskanade/fmi:
      p_par  = [2.20837526, 0.33887032, -2.48995355]
      p_perp = [2.21722634, 0.32359621, -2.57402761]

      lucaskanade/mch:
      p_par  = [2.56338484, 0.3330941, -2.99714349]
      p_perp = [1.31204508, 0.3578426, -1.02499891]

      lucaskanade/fmi+mch:
      p_par  = [2.31970635, 0.33734287, -2.64972861]
      p_perp = [1.90769947, 0.33446594, -2.06603662]

      vet/fmi:
      p_par  = [0.25337388, 0.67542291, 11.04895538]
      p_perp = [0.02432118, 0.99613295, 7.40146505]

      vet/mch:
      p_par  = [0.5075159, 0.53895212, 7.90331791]
      p_perp = [0.68025501, 0.41761289, 4.73793581]

      vet/fmi+mch:
      p_par  = [0.29495222, 0.62429207, 8.6804131 ]
      p_perp = [0.23127377, 0.59010281, 5.98180004]

      fmi=Finland, mch=Switzerland, fmi+mch=both pooled into the same data set

      The above parameters have been fitted using run_vel_pert_analysis.py
      and fit_vel_pert_params.py located in the scripts directory.

      See pysteps.noise.motion for additional documentation.
    mask_kwargs: dict
      Optional dictionary containing mask keyword arguments 'mask_f' and
      'mask_rim', the factor defining the mask increment and the rim size,
      respectively.
      The mask increment is defined as mask_f*timestep/kmperpixel.
    measure_time: bool
      If set to True, measure, print and return the computation time.

    Returns
    -------
    out: ndarray
      If return_output is True, a four-dimensional array of shape
      (n_ens_members,num_timesteps,m,n) containing a time series of forecast
      precipitation fields for each ensemble member. Otherwise, a None value
      is returned. The time series starts from t0+timestep, where timestep is
      taken from the input precipitation fields R. If measure_time is True, the
      return value is a three-element tuple containing the nowcast array, the
      initialization time of the nowcast generator and the time used in the
      main loop (seconds).

    See also
    --------
    pysteps.extrapolation.interface, pysteps.cascade.interface,
    pysteps.noise.interface, pysteps.noise.utils.compute_noise_stddev_adjs

    References
    ----------
    :cite:`Seed2003`, :cite:`BPS2006`, :cite:`SPN2013`, :cite:`PCH2019b`
    """

    _check_inputs(R, V, timesteps, ar_order)

    if extrap_kwargs is None:
        extrap_kwargs = dict()

    if filter_kwargs is None:
        filter_kwargs = dict()

    if noise_kwargs is None:
        noise_kwargs = dict()

    if vel_pert_kwargs is None:
        vel_pert_kwargs = dict()

    if mask_kwargs is None:
        mask_kwargs = dict()

    if np.any(~np.isfinite(V)):
        raise ValueError("V contains non-finite values")

    if mask_method not in ["obs", "sprog", "incremental", None]:
        raise ValueError(
            "unknown mask method %s: must be 'obs', 'sprog' or 'incremental' or None"
            % mask_method
        )

    if conditional and R_thr is None:
        raise ValueError("conditional=True but R_thr is not set")

    if mask_method is not None and R_thr is None:
        raise ValueError("mask_method!=None but R_thr=None")

    if noise_stddev_adj not in ["auto", "fixed", None]:
        raise ValueError(
            "unknown noise_std_dev_adj method %s: must be 'auto', 'fixed', or None"
            % noise_stddev_adj
        )

    if kmperpixel is None:
        if vel_pert_method is not None:
            raise ValueError("vel_pert_method is set but kmperpixel=None")
        if mask_method == "incremental":
            raise ValueError("mask_method='incremental' but kmperpixel=None")

    if timestep is None:
        if vel_pert_method is not None:
            raise ValueError("vel_pert_method is set but timestep=None")
        if mask_method == "incremental":
            raise ValueError("mask_method='incremental' but timestep=None")

    print("Computing STEPS nowcast:")
    print("------------------------")
    print("")

    print("Inputs:")
    print("-------")
    print("input dimensions: %dx%d" % (R.shape[1], R.shape[2]))
    if kmperpixel is not None:
        print("km/pixel:         %g" % kmperpixel)
    if timestep is not None:
        print("time step:        %d minutes" % timestep)
    print("")

    print("Methods:")
    print("--------")
    print("extrapolation:          %s" % extrap_method)
    print("bandpass filter:        %s" % bandpass_filter_method)
    print("decomposition:          %s" % decomp_method)
    print("noise generator:        %s" % noise_method)
    print("noise adjustment:       %s" % ("yes" if noise_stddev_adj else "no"))
    print("velocity perturbator:   %s" % vel_pert_method)
    print("conditional statistics: %s" % ("yes" if conditional else "no"))
    print("precip. mask method:    %s" % mask_method)
    print("probability matching:   %s" % probmatching_method)
    print("FFT method:             %s" % fft_method)
    print("domain:                 %s" % domain)
    print("")

    print("Parameters:")
    print("-----------")
    if isinstance(timesteps, int):
        print("number of time steps:     %d" % timesteps)
    else:
        print("time steps:               %s" % timesteps)
    print("ensemble size:            %d" % n_ens_members)
    print("parallel threads:         %d" % num_workers)
    print("number of cascade levels: %d" % n_cascade_levels)
    print("order of the AR(p) model: %d" % ar_order)
    if vel_pert_method == "bps":
        vp_par = vel_pert_kwargs.get("p_par", noise.motion.get_default_params_bps_par())
        vp_perp = vel_pert_kwargs.get(
            "p_perp", noise.motion.get_default_params_bps_perp()
        )
        print(
            "velocity perturbations, parallel:      %g,%g,%g"
            % (vp_par[0], vp_par[1], vp_par[2])
        )
        print(
            "velocity perturbations, perpendicular: %g,%g,%g"
            % (vp_perp[0], vp_perp[1], vp_perp[2])
        )

    if conditional or mask_method is not None:
        print("precip. intensity threshold: %g" % R_thr)

    num_ensemble_workers = n_ens_members if num_workers > n_ens_members else num_workers

    if measure_time:
        starttime_init = time.time()

    fft = utils.get_method(fft_method, shape=R.shape[1:], n_threads=num_workers)

    M, N = R.shape[1:]

    # initialize the band-pass filter
    filter_method = cascade.get_method(bandpass_filter_method)
    filter = filter_method((M, N), n_cascade_levels, **filter_kwargs)

    decomp_method, recomp_method = cascade.get_method(decomp_method)

    extrapolator_method = extrapolation.get_method(extrap_method)

    x_values, y_values = np.meshgrid(np.arange(R.shape[2]), np.arange(R.shape[1]))

    xy_coords = np.stack([x_values, y_values])

    R = R[-(ar_order + 1) :, :, :].copy()

    # determine the domain mask from non-finite values
    domain_mask = np.logical_or.reduce(
        [~np.isfinite(R[i, :]) for i in range(R.shape[0])]
    )

    # determine the precipitation threshold mask
    if conditional:
        MASK_thr = np.logical_and.reduce(
            [R[i, :, :] >= R_thr for i in range(R.shape[0])]
        )
    else:
        MASK_thr = None

    # advect the previous precipitation fields to the same position with the
    # most recent one (i.e. transform them into the Lagrangian coordinates)
    extrap_kwargs = extrap_kwargs.copy()
    extrap_kwargs["xy_coords"] = xy_coords
    extrap_kwargs["allow_nonfinite_values"] = True
    res = list()

    def f(R, i):
        return extrapolator_method(R[i, :, :], V, ar_order - i, "min", **extrap_kwargs)[
            -1
        ]

    for i in range(ar_order):
        if not DASK_IMPORTED:
            R[i, :, :] = f(R, i)
        else:
            res.append(dask.delayed(f)(R, i))

    if DASK_IMPORTED:
        num_workers_ = len(res) if num_workers > len(res) else num_workers
        R = np.stack(list(dask.compute(*res, num_workers=num_workers_)) + [R[-1, :, :]])

    # replace non-finite values with the minimum value
    R = R.copy()
    for i in range(R.shape[0]):
        R[i, ~np.isfinite(R[i, :])] = np.nanmin(R[i, :])

    if noise_method is not None:
        # get methods for perturbations
        init_noise, generate_noise = noise.get_method(noise_method)

        # initialize the perturbation generator for the precipitation field
        pp = init_noise(R, fft_method=fft, **noise_kwargs)

        if noise_stddev_adj == "auto":
            print("Computing noise adjustment coefficients... ", end="", flush=True)
            if measure_time:
                starttime = time.time()

            R_min = np.min(R)
            noise_std_coeffs = noise.utils.compute_noise_stddev_adjs(
                R[-1, :, :],
                R_thr,
                R_min,
                filter,
                decomp_method,
                pp,
                generate_noise,
                20,
                conditional=True,
                num_workers=num_workers,
            )

            if measure_time:
                print("%.2f seconds." % (time.time() - starttime))
            else:
                print("done.")
        elif noise_stddev_adj == "fixed":
            f = lambda k: 1.0 / (0.75 + 0.09 * k)
            noise_std_coeffs = [f(k) for k in range(1, n_cascade_levels + 1)]
        else:
            noise_std_coeffs = np.ones(n_cascade_levels)

        if noise_stddev_adj is not None:
            print("noise std. dev. coeffs:   %s" % str(noise_std_coeffs))

    # compute the cascade decompositions of the input precipitation fields
    R_d = []
    for i in range(ar_order + 1):
        R_ = decomp_method(
            R[i, :, :],
            filter,
            mask=MASK_thr,
            fft_method=fft,
            output_domain=domain,
            normalize=True,
            compute_stats=True,
            compact_output=True,
        )
        R_d.append(R_)

    # normalize the cascades and rearrange them into a four-dimensional array
    # of shape (n_cascade_levels,ar_order+1,m,n) for the autoregressive model
    R_c = nowcast_utils.stack_cascades(R_d, n_cascade_levels)

    R_d = R_d[-1]
    R_d = [R_d.copy() for j in range(n_ens_members)]

    # compute lag-l temporal autocorrelation coefficients for each cascade level
    GAMMA = np.empty((n_cascade_levels, ar_order))
    for i in range(n_cascade_levels):
        GAMMA[i, :] = correlation.temporal_autocorrelation(R_c[i], mask=MASK_thr)

    nowcast_utils.print_corrcoefs(GAMMA)

    if ar_order == 2:
        # adjust the lag-2 correlation coefficient to ensure that the AR(p)
        # process is stationary
        for i in range(n_cascade_levels):
            GAMMA[i, 1] = autoregression.adjust_lag2_corrcoef2(GAMMA[i, 0], GAMMA[i, 1])

    # estimate the parameters of the AR(p) model from the autocorrelation
    # coefficients
    PHI = np.empty((n_cascade_levels, ar_order + 1))
    for i in range(n_cascade_levels):
        PHI[i, :] = autoregression.estimate_ar_params_yw(GAMMA[i, :])

    nowcast_utils.print_ar_params(PHI)

    # discard all except the p-1 last cascades because they are not needed for
    # the AR(p) model
    R_c = [R_c[i][-ar_order:] for i in range(n_cascade_levels)]

    # stack the cascades into a list containing all ensemble members
    R_c = [
        [R_c[j].copy() for j in range(n_cascade_levels)] for i in range(n_ens_members)
    ]

    # initialize the random generators
    if noise_method is not None:
        randgen_prec = []
        randgen_motion = []
        np.random.seed(seed)
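        # chain the seeds: each member's RandomState is seeded from a value drawn
        # by the previous one, so the precipitation and motion perturbations get
        # reproducible but distinct random streams per ensemble member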
        for j in range(n_ens_members):
            rs = np.random.RandomState(seed)
            randgen_prec.append(rs)
            seed = rs.randint(0, high=1e9)
            rs = np.random.RandomState(seed)
            randgen_motion.append(rs)
            seed = rs.randint(0, high=1e9)

    if vel_pert_method is not None:
        init_vel_noise, generate_vel_noise = noise.get_method(vel_pert_method)

        # initialize the perturbation generators for the motion field
        vps = []
        for j in range(n_ens_members):
            kwargs = {
                "randstate": randgen_motion[j],
                "p_par": vp_par,
                "p_perp": vp_perp,
            }
            vp_ = init_vel_noise(V, 1.0 / kmperpixel, timestep, **kwargs)
            vps.append(vp_)

    D = [None for j in range(n_ens_members)]
    R_f = [[] for j in range(n_ens_members)]

    if probmatching_method == "mean":
        mu_0 = np.mean(R[-1, :, :][R[-1, :, :] >= R_thr])

    R_m = None

    if mask_method is not None:
        MASK_prec = R[-1, :, :] >= R_thr

        if mask_method == "obs":
            pass
        elif mask_method == "sprog":
            # compute the wet area ratio and the precipitation mask
            war = 1.0 * np.sum(MASK_prec) / (R.shape[1] * R.shape[2])
            R_m = [R_c[0][i].copy() for i in range(n_cascade_levels)]
            R_m_d = R_d[0].copy()
        elif mask_method == "incremental":
            # get mask parameters
            mask_rim = mask_kwargs.get("mask_rim", 10)
            mask_f = mask_kwargs.get("mask_f", 1.0)
            # initialize the structuring element
            struct = scipy.ndimage.generate_binary_structure(2, 1)
            # iterate it to expand it nxn
            n = mask_f * timestep / kmperpixel
            struct = scipy.ndimage.iterate_structure(struct, int((n - 1) / 2.0))
            # initialize precip mask for each member
            MASK_prec = _compute_incremental_mask(MASK_prec, struct, mask_rim)
            MASK_prec = [MASK_prec.copy() for j in range(n_ens_members)]

    if noise_method is None and R_m is None:
        R_m = [R_c[0][i].copy() for i in range(n_cascade_levels)]

    fft_objs = []
    for i in range(n_ens_members):
        fft_objs.append(utils.get_method(fft_method, shape=R.shape[1:]))

    if measure_time:
        init_time = time.time() - starttime_init

    R = R[-1, :, :]

    print("Starting nowcast computation.")

    if measure_time:
        starttime_mainloop = time.time()

    if isinstance(timesteps, int):
        timesteps = range(timesteps + 1)
        timestep_type = "int"
    else:
        original_timesteps = [0] + list(timesteps)
        timesteps = nowcast_utils.binned_timesteps(original_timesteps)
        timestep_type = "list"

    extrap_kwargs["return_displacement"] = True
    R_f_prev = [R for i in range(n_ens_members)]
    t_prev = [0.0 for j in range(n_ens_members)]
    t_total = [0.0 for j in range(n_ens_members)]

    # iterate each time step
    for t, subtimestep_idx in enumerate(timesteps):
        if timestep_type == "list":
            subtimesteps = [original_timesteps[t_] for t_ in subtimestep_idx]
        else:
            subtimesteps = [t]

        if (timestep_type == "list" and subtimesteps) or (
            timestep_type == "int" and t > 0
        ):
            is_nowcast_time_step = True
        else:
            is_nowcast_time_step = False

        if is_nowcast_time_step:
            print(
                "Computing nowcast for time step %d... " % t,
                end="",
                flush=True,
            )

        if measure_time:
            starttime = time.time()

        if noise_method is None or mask_method == "sprog":
            for i in range(n_cascade_levels):
                # use a separate AR(p) model for the non-perturbed forecast,
                # from which the mask is obtained
                R_m[i] = autoregression.iterate_ar_model(R_m[i], PHI[i, :])

            R_m_d["cascade_levels"] = [R_m[i][-1] for i in range(n_cascade_levels)]
            if domain == "spatial":
                R_m_d["cascade_levels"] = np.stack(R_m_d["cascade_levels"])
            R_m_ = recomp_method(R_m_d)
            if domain == "spectral":
                R_m_ = fft.irfft2(R_m_)

            if mask_method == "sprog":
                MASK_prec = _compute_sprog_mask(R_m_, war)

        # the nowcast iteration for each ensemble member
        def worker(j):
            if noise_method is not None:
                # generate noise field
                EPS = generate_noise(
                    pp, randstate=randgen_prec[j], fft_method=fft_objs[j], domain=domain
                )

                # decompose the noise field into a cascade
                EPS = decomp_method(
                    EPS,
                    filter,
                    fft_method=fft_objs[j],
                    input_domain=domain,
                    output_domain=domain,
                    compute_stats=True,
                    normalize=True,
                    compact_output=True,
                )
            else:
                EPS = None

            # iterate the AR(p) model for each cascade level
            for i in range(n_cascade_levels):
                # normalize the noise cascade
                if EPS is not None:
                    EPS_ = EPS["cascade_levels"][i]
                    EPS_ *= noise_std_coeffs[i]
                else:
                    EPS_ = None
                # apply AR(p) process to cascade level
                if EPS is not None or vel_pert_method is not None:
                    R_c[j][i] = autoregression.iterate_ar_model(
                        R_c[j][i], PHI[i, :], eps=EPS_
                    )
                else:
                    # use the deterministic AR(p) model computed above if
                    # perturbations are disabled
                    R_c[j][i] = R_m[i]

            EPS = None
            EPS_ = None

            # compute the recomposed precipitation field(s) from the cascades
            # obtained from the AR(p) model(s)
            R_d[j]["cascade_levels"] = [
                R_c[j][i][-1, :] for i in range(n_cascade_levels)
            ]
            if domain == "spatial":
                R_d[j]["cascade_levels"] = np.stack(R_d[j]["cascade_levels"])
            R_f_new = recomp_method(R_d[j])

            if domain == "spectral":
                R_f_new = fft_objs[j].irfft2(R_f_new)

            if mask_method is not None:
                # apply the precipitation mask to prevent generation of new
                # precipitation into areas where it was not originally
                # observed
                R_cmin = R_f_new.min()
                if mask_method == "incremental":
                    R_f_new = R_cmin + (R_f_new - R_cmin) * MASK_prec[j]
                    MASK_prec_ = R_f_new > R_cmin
                else:
                    MASK_prec_ = MASK_prec

                # Set to min value outside of mask
                R_f_new[~MASK_prec_] = R_cmin

            if probmatching_method == "cdf":
                # adjust the CDF of the forecast to match the most recently
                # observed precipitation field
                R_f_new = probmatching.nonparam_match_empirical_cdf(R_f_new, R)
            elif probmatching_method == "mean":
                MASK = R_f_new >= R_thr
                mu_fct = np.mean(R_f_new[MASK])
                R_f_new[MASK] = R_f_new[MASK] - mu_fct + mu_0

            if mask_method == "incremental":
                MASK_prec[j] = _compute_incremental_mask(
                    R_f_new >= R_thr, struct, mask_rim
                )

            R_f_new[domain_mask] = np.nan

            R_f_out = []
            extrap_kwargs_ = extrap_kwargs.copy()

            V_pert = V

            # advect the recomposed precipitation field to obtain the forecast for
            # the current time step (or subtimesteps if non-integer time steps are
            # given)
            for t_sub in subtimesteps:
                if t_sub > 0:
                    t_diff_prev_int = t_sub - int(t_sub)
                    if t_diff_prev_int > 0.0:
                        R_f_ip = (1.0 - t_diff_prev_int) * R_f_prev[
                            j
                        ] + t_diff_prev_int * R_f_new
                    else:
                        R_f_ip = R_f_prev[j]

                    t_diff_prev = t_sub - t_prev[j]
                    t_total[j] += t_diff_prev

                    # compute the perturbed motion field
                    if vel_pert_method is not None:
                        V_pert = V + generate_vel_noise(vps[j], t_total[j] * timestep)

                    extrap_kwargs_["displacement_prev"] = D[j]
                    R_f_ep, D[j] = extrapolator_method(
                        R_f_ip,
                        V_pert,
                        [t_diff_prev],
                        **extrap_kwargs_,
                    )
                    R_f_out.append(R_f_ep[0])
                    t_prev[j] = t_sub

            # advect the forecast field by one time step if no subtimesteps in the
            # current interval were found
            if not subtimesteps:
                t_diff_prev = t + 1 - t_prev[j]
                t_total[j] += t_diff_prev

                # compute the perturbed motion field
                if vel_pert_method is not None:
                    V_pert = V + generate_vel_noise(vps[j], t_total[j] * timestep)

                extrap_kwargs_["displacement_prev"] = D[j]
                _, D[j] = extrapolator_method(
                    None,
                    V_pert,
                    [t_diff_prev],
                    **extrap_kwargs_,
                )
                t_prev[j] = t + 1

            R_f_prev[j] = R_f_new

            return R_f_out

        res = []
        for j in range(n_ens_members):
            if not DASK_IMPORTED or n_ens_members == 1:
                res.append(worker(j))
            else:
                res.append(dask.delayed(worker)(j))

        R_f_ = (
            dask.compute(*res, num_workers=num_ensemble_workers)
            if DASK_IMPORTED and n_ens_members > 1
            else res
        )
        res = None

        if is_nowcast_time_step:
            if measure_time:
                print("%.2f seconds." % (time.time() - starttime))
            else:
                print("done.")

        if callback is not None:
            callback(np.stack(R_f_))

        if return_output:
            for j in range(n_ens_members):
                R_f[j].extend(R_f_[j])

        # free the per-step results only after both the callback and the
        # output accumulation have used them
        R_f_ = None

    if measure_time:
        mainloop_time = time.time() - starttime_mainloop

    if return_output:
        outarr = np.stack([np.stack(R_f[j]) for j in range(n_ens_members)])
        if measure_time:
            return outarr, init_time, mainloop_time
        else:
            return outarr
    else:
        return None
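
Assuming the function above is the STEPS implementation shipped with pysteps (pysteps.nowcasts.steps.forecast), a minimal driver looks as follows. The synthetic fields only exercise the call signature and the two accepted forms of timesteps (an integer count and a list of possibly fractional lead times); they do not yield a meteorologically meaningful nowcast, and the exact module path may differ between pysteps releases.

import numpy as np
from pysteps.nowcasts.steps import forecast

# three most recent fields (ar_order + 1 = 3) and a constant advection field;
# the values are placeholders for dB-transformed rain rates
R = 10.0 * np.random.rand(3, 64, 64) - 10.0
V = np.ones((2, 64, 64))

# integer form: forecast 6 regular time steps ahead
R_f = forecast(
    R, V, 6,
    n_ens_members=2,
    R_thr=-5.0,       # threshold in the same (dB) units as R
    kmperpixel=1.0,
    timestep=5,
    seed=42,
)
print(R_f.shape)  # (n_ens_members, 6, 64, 64)

# list form: fractional lead times relative to the input time step
R_f_sub = forecast(
    R, V, [0.5, 1.0, 2.5],
    n_ens_members=2, R_thr=-5.0, kmperpixel=1.0, timestep=5, seed=42,
)
print(R_f_sub.shape)  # (n_ens_members, 3, 64, 64)

If the surrounding package exposes its method registry, the same routine is usually also reachable via pysteps.nowcasts.get_method("steps").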