def test_optimize():
    x = dask.delayed(inc)(1)
    y = dask.delayed(inc)(x)
    z = x + y
    x2, y2, z2, constant = optimize(x, y, z, 1)

    assert constant == 1

    # Same graphs for each
    dsk = dict(x2.dask)
    assert dict(y2.dask) == dsk
    assert dict(z2.dask) == dsk

    # Computationally equivalent
    assert dask.compute(x2, y2, z2) == dask.compute(x, y, z)

    # Applying optimizations before compute and during compute gives
    # same results. Shows optimizations are occurring.
    sols = dask.compute(x, y, z, optimizations=[inc_to_dec])
    x3, y3, z3 = optimize(x, y, z, optimizations=[inc_to_dec])
    assert dask.compute(x3, y3, z3) == sols

    # Optimize respects global optimizations as well
    with dask.config.set(optimizations=[inc_to_dec]):
        x4, y4, z4 = optimize(x, y, z)
    for a, b in zip([x3, y3, z3], [x4, y4, z4]):
        assert dict(a.dask) == dict(b.dask)
def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes(test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=b'\n', s3=s3)
    _, values2 = read_bytes(test_bucket_name + '/test/accounts*',
                            blocksize=blocksize, delimiter=b'foo', s3=s3)
    assert ([a.key for a in concat(values)] !=
            [b.key for b in concat(values2)])

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b'\n') for r in res)
    ourlines = b''.join(res).split(b'\n')
    testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
    assert ourlines == testlines

    # delimiter not at the end
    d = b'}'
    _, values = read_bytes(test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=d, s3=s3)
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b'}') for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
def test_read_bytes_delimited():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            _, values = read_bytes('.test.accounts*',
                                   blocksize=bs, delimiter=b'\n')
            _, values2 = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'foo')
            assert ([a.key for a in concat(values)] !=
                    [b.key for b in concat(values2)])

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b'\n') for r in res)
            ourlines = b''.join(res).split(b'\n')
            testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # delimiter not at the end
            d = b'}'
            _, values = read_bytes('.test.accounts*', blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b'}') for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def to_json(df, url_path, orient='records', lines=None, storage_options=None,
            compute=True, encoding='utf-8', errors='strict',
            compression=None, **kwargs):
    """Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there is more than one
        partition in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8", and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If True, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    compression : string or None
        String like 'gzip' or 'xz'.
    """
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    kwargs['orient'] = orient
    kwargs['lines'] = lines and orient == 'records'
    outfiles = open_files(
        url_path, 'wt',
        encoding=encoding,
        errors=errors,
        name_function=kwargs.pop('name_function', None),
        num=df.npartitions,
        compression=compression,
        **(storage_options or {})
    )
    parts = [dask.delayed(write_json_partition)(d, outfile, kwargs)
             for outfile, d in zip(outfiles, df.to_delayed())]
    if compute:
        dask.compute(parts)
        return [f.path for f in outfiles]
    else:
        return parts
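# A minimal usage sketch for the writer above (hedged: assumes ``to_json`` is
# importable in scope and the output location is writable; the frame contents
# and path glob are illustrative only).
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"a": range(6), "b": list("abcdef")})
ddf = dd.from_pandas(pdf, npartitions=2)

# compute=True (the default) writes the files and returns their paths;
# compute=False would instead return the delayed write tasks.
written = to_json(ddf, "out/records-*.json", orient="records", lines=True)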
def test_write_bytes(s3):
    paths = ['s3://' + test_bucket_name + '/more/' + f for f in files]
    values = [delayed(v) for v in files.values()]
    out = core.write_bytes(values, paths)
    compute(*out)

    sample, values = read_bytes('s3://' + test_bucket_name + '/more/test/accounts.*')
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def test_inner_compute():
    x = da.ones(10, chunks=(5,)) + 1 + 2 + 3
    a = x.sum()
    y = x * 2 * 3 * 4
    b = y.sum()
    z = x * 2 * 3

    dask.compute(x, a, y, b, z)
def test_enforce_columns():
    blocks = [[b'aa,bb\n1,1.0\n2,2.0', b'10,20\n30,40'],
              [b'AA,bb\n1,1.0\n2,2.0', b'10,20\n30,40']]
    head = pd.read_csv(BytesIO(blocks[0][0]), header=0)
    with pytest.raises(ValueError):
        dfs = read_csv_from_bytes(blocks, b'aa,bb\n', head, {},
                                  collection=False, enforce=True)
        compute(*dfs)
def test_enforce_columns(reader, blocks):
    # Replace second header with different column name
    blocks = [blocks[0], [blocks[1][0].replace(b'a', b'A'), blocks[1][1]]]
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b'\n')[0] + b'\n'
    with pytest.raises(ValueError):
        dfs = text_blocks_to_pandas(reader, blocks, header, head, {},
                                    collection=False, enforce=True)
        dask.compute(*dfs, scheduler='sync')
def _compute_tasks(self, tasks, processes):
    """Compute all dask tasks"""
    if processes is None:
        out = da.compute(*tasks, scheduler="single-threaded")
    else:
        out = da.compute(*tasks, num_workers=processes)

    return out
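# Standalone sketch of the scheduler switch used above, with toy delayed
# tasks (names and values are illustrative).
import dask
from dask import delayed

tasks = [delayed(pow)(i, 2) for i in range(4)]

# Serial, in-process execution, handy for debugging:
serial = dask.compute(*tasks, scheduler="single-threaded")

# Default (threaded) scheduler with an explicit worker cap:
parallel = dask.compute(*tasks, num_workers=2)

assert serial == parallel == (0, 1, 4, 9)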
def to_csv(df, filename, name_function=None, compression=None, compute=True,
           get=None, **kwargs):
    values = [_to_csv_chunk(d, **kwargs) for d in df.to_delayed()]
    values = write_bytes(values, filename, name_function, compression,
                         encoding=None)

    if compute:
        from dask import compute
        compute(*values, get=get)
    else:
        return values
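# Note: ``get=`` is the legacy scheduler hook; current dask releases take
# ``scheduler=`` instead. A self-contained sketch of the equivalent call
# (toy values, illustrative only):
import dask
from dask import delayed

values = [delayed(len)(s) for s in ("a", "bb", "ccc")]
assert dask.compute(*values, scheduler="threads") == (1, 2, 3)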
def main(input_file, dtypes, output_path):
    """Create plots from data in the input file."""
    data = pd.read_csv(input_file)
    new_file_name = f"{input_file}.parq"
    data.to_parquet(new_file_name)
    data_types = json.load(open(dtypes, "r"))
    plots = create_plots(new_file_name, data_types, output_path)
    with ProgressBar():
        dask.compute(*plots, scheduler="processes", num_workers=22)
def test_to_textfiles(ext, myopen):
    b = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpdir() as dir:
        c = b.to_textfiles(os.path.join(dir, '*.' + ext), compute=False)
        dask.compute(*c, get=dask.get)
        assert os.path.exists(os.path.join(dir, '1.' + ext))
        f = myopen(os.path.join(dir, '1.' + ext), 'rb')
        text = f.read()
        if hasattr(text, 'decode'):
            text = text.decode()
        assert 'xyz' in text
        f.close()
def test_read_text(fmt, bs, encoding):
    compress = compression.compress[fmt]
    files2 = dict((k, compress(v.encode(encoding))) for k, v in files.items())
    with filetexts(files2, mode='b'):
        b = read_text('.test.accounts.*.json', compression=fmt, blocksize=bs,
                      encoding=encoding)
        L, = compute(b)
        assert ''.join(L) == expected

        blocks = read_text('.test.accounts.*.json', compression=fmt,
                           blocksize=bs, encoding=encoding, collection=False)
        L = compute(*blocks)
        assert ''.join(line for block in L for line in block) == expected
def test_simple_write(tmpdir):
    tmpdir = str(tmpdir)
    make_bytes = lambda: b'000'
    some_bytes = delayed(make_bytes)()
    data = [some_bytes, some_bytes]
    out = write_bytes(data, tmpdir)
    assert len(out) == 2
    compute(*out)
    files = os.listdir(tmpdir)
    assert len(files) == 2
    assert '0.part' in files
    d = open(os.path.join(tmpdir, files[0]), 'rb').read()
    assert d == b'000'
def test_compressed_write(tmpdir):
    tmpdir = str(tmpdir)
    make_bytes = lambda: b'000'
    some_bytes = delayed(make_bytes)()
    data = [some_bytes, some_bytes]
    out = write_bytes(data, os.path.join(tmpdir, 'bytes-*.gz'),
                      compression='gzip')
    compute(*out)
    files = os.listdir(tmpdir)
    assert len(files) == 2
    assert 'bytes-0.gz' in files
    import gzip
    d = gzip.GzipFile(os.path.join(tmpdir, files[0])).read()
    assert d == b'000'
def test_registered_read_bytes():
    from dask.bytes.core import read_bytes
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')

        results = compute(*concat(values))
        assert set(results) == set(files.values())
def test_compression_binary(fmt):
    from dask.bytes.core import open_files
    files2 = valmap(compression.compress[fmt], files)
    with filetexts(files2, mode='b'):
        myfiles = open_files('.test.accounts.*', compression=fmt)
        data = compute(*[file.read() for file in myfiles])
        assert list(data) == [files[k] for k in sorted(files)]
def test_registered_open_files():
    from dask.bytes.core import open_files
    with filetexts(files, mode='b'):
        myfiles = open_files('.test.accounts.*')
        assert len(myfiles) == len(files)
        data = compute(*[file.read() for file in myfiles])
        assert list(data) == [files[k] for k in sorted(files)]
def test_nout():
    func = delayed(lambda x: (x, -x), nout=2, pure=True)
    x = func(1)
    assert len(x) == 2
    a, b = x
    assert compute(a, b) == (1, -1)
    assert a._length is None
    assert b._length is None
    pytest.raises(TypeError, lambda: len(a))
    pytest.raises(TypeError, lambda: list(a))

    pytest.raises(ValueError, lambda: delayed(add, nout=-1))
    pytest.raises(ValueError, lambda: delayed(add, nout=True))

    func = delayed(add, nout=None)
    a = func(1)
    assert a._length is None
    pytest.raises(TypeError, lambda: list(a))
    pytest.raises(TypeError, lambda: len(a))

    func = delayed(lambda x: (x,), nout=1, pure=True)
    x = func(1)
    assert len(x) == 1
    a, = x
    assert a.compute() == 1
    assert a._length is None
    pytest.raises(TypeError, lambda: len(a))

    func = delayed(lambda x: tuple(), nout=0, pure=True)
    x = func(1)
    assert len(x) == 0
    assert x.compute() == tuple()
def test_custom_delayed():
    x = Tuple({'a': 1, 'b': 2, 'c': (add, 'a', 'b')}, ['a', 'b', 'c'])
    x2 = delayed(add, pure=True)(x, (4, 5, 6))
    n = delayed(len, pure=True)(x)
    assert delayed(len, pure=True)(x).key == n.key
    assert x2.compute() == (1, 2, 3, 4, 5, 6)
    assert compute(n, x2, x) == (3, (1, 2, 3, 4, 5, 6), (1, 2, 3))
def test_enforce_dtypes(reader, blocks):
    head = reader(BytesIO(blocks[0][0]), header=0)
    header = blocks[0][0].split(b'\n')[0] + b'\n'
    dfs = text_blocks_to_pandas(reader, blocks, header, head, {},
                                collection=False)
    dfs = dask.compute(*dfs, scheduler='sync')
    assert all(df.dtypes.to_dict() == head.dtypes.to_dict() for df in dfs)
def test_registered_open_text_files(s3):
    from dask.bytes.core import open_text_files
    myfiles = open_text_files('s3://' + test_bucket_name + '/test/accounts.*.json',
                              s3=s3)
    assert len(myfiles) == len(files)
    data = compute(*[file.read() for file in myfiles])
    assert list(data) == [files[k].decode() for k in sorted(files)]
def compute(self, **kwargs):
    items = list(self.items())
    keys = [key for key, _ in items]
    values = [value for _, value in items]
    values = dask.compute(*values, **kwargs)
    return dask_dict(zip(keys, values))
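# Sketch of the same pattern with a plain dict of delayed values (hedged:
# ``dask_dict`` above is assumed to be a dict subclass; here an ordinary
# dict stands in for it).
import dask
from dask import delayed

lazy = {"a": delayed(sum)([1, 2, 3]), "b": delayed(max)([4, 5])}
keys, values = zip(*lazy.items())
computed = dict(zip(keys, dask.compute(*values)))
assert computed == {"a": 6, "b": 5}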
def test_open_files_write(tmpdir):
    tmpdir = str(tmpdir)
    f = open_file_write([os.path.join(tmpdir, 'test1'),
                         os.path.join(tmpdir, 'test2')])
    assert len(f) == 2
    files = compute(*f)
    assert files[0].mode == 'wb'
def test_registered_open_files(s3):
    from dask.bytes.core import open_files
    myfiles = open_files("s3://%s/test/accounts.*.json" % test_bucket_name, s3=s3)
    assert len(myfiles) == len(files)
    data = compute(*[file.read() for file in myfiles])
    assert list(data) == [files[k] for k in sorted(files)]
def test_registered(s3):
    from dask.bytes.core import read_bytes
    sample, values = read_bytes("s3://%s/test/accounts.*.json" % test_bucket_name,
                                s3=s3)
    results = compute(*concat(values))
    assert set(results) == set(files.values())
def l1_lpsolver_parallel(obs_phase, freqs, sigma_max=np.pi, fout=0.5,
                         solve_cs=True, problem_name="l1_tec_solver",
                         num_threads=None):
    '''Solve the tec and cs for multiple datasets.

    `obs_phase` : `numpy.ndarray`
        the measured phase with shape (num_freqs, num_datasets)
    `freqs` : `numpy.ndarray`
        the frequencies at the datapoints (num_freqs,)
    `sigma_max` : (optional) `float`
        the maximum allowed deviation for outlier detection. default np.pi
    `fout` : (optional) `float`
        The maximum fraction of allowed outliers out of total number of
        datapoints. default 0.5
    `solve_cs` : (optional) bool
        Whether to solve cs (True)
    `num_threads` : (optional) `int`
        number of parallel threads to run. default None is num_cpu
    `problem_name` : (optional) `str`
        name of problem "l1_tec_solver"
    '''
    from dask import delayed, compute
    from dask.threaded import get
    from functools import partial

    dsk = {}
    assert len(obs_phase.shape) == 2, "obs_phase not dim 2 {}".format(obs_phase.shape)
    N = obs_phase.shape[1]
    values = [delayed(partial(l1_lpsolver, sigma_max=sigma_max, fout=fout,
                              solve_cs=solve_cs,
                              problem_name="{}{:03d}".format(problem_name, i)),
                      pure=True)(obs_phase[:, i], freqs)
              for i in range(N)]
    #client = Client()
    results = compute(*values, get=get, num_workers=num_threads)
    return results
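# The fan-out pattern above (delayed over a partial, one task per column)
# shown with a toy solver; this assumes nothing about the real
# ``l1_lpsolver`` signature and uses the modern ``scheduler=`` keyword in
# place of the legacy ``get=``.
from functools import partial
import numpy as np
from dask import delayed, compute

def toy_solver(column, freqs, scale=1.0):
    # stand-in for a per-dataset solve
    return scale * float(np.dot(column, freqs))

obs = np.random.rand(5, 3)               # (num_freqs, num_datasets)
freqs = np.linspace(100e6, 200e6, 5)

tasks = [delayed(partial(toy_solver, scale=2.0), pure=True)(obs[:, i], freqs)
         for i in range(obs.shape[1])]
results = compute(*tasks, scheduler="threads", num_workers=2)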
def test_to_textfiles_encoding():
    b = db.from_sequence([u'汽车', u'苹果', u'天气'], npartitions=2)
    for ext, myopen in [('gz', GzipFile), ('bz2', BZ2File), ('', open)]:
        if ext == 'bz2' and PY2:
            continue
        with tmpdir() as dir:
            c = b.to_textfiles(os.path.join(dir, '*.' + ext),
                               encoding='gb18030', compute=False)
            dask.compute(*c)
            assert os.path.exists(os.path.join(dir, '1.' + ext))
            f = myopen(os.path.join(dir, '1.' + ext), 'rb')
            text = f.read()
            if hasattr(text, 'decode'):
                text = text.decode('gb18030')
            assert u'天气' in text
            f.close()
def compute_with_trace(*args):
    """Do Dask compute(), but with added Eliot tracing.

    Dask is a graph of tasks, but Eliot logs trees. So we need to emulate a
    graph using a tree. We do this by making an Eliot action for each task,
    but having it list the tasks it depends on.

    We use the following algorithm:

        1. Create a top-level action.

        2. For each entry in the dask graph, create a child with
           serialize_task_id. Do this in likely order of execution, so that
           if B depends on A the task level of B is higher than the task
           level of A.

        3. Replace each function with a wrapper that uses the corresponding
           task ID (with Action.continue_task), and while it's at it also
           records which other things this function depends on.

    Known issues:

        1. Retries will confuse Eliot. Probably need different
           distributed-tree mechanism within Eliot to solve that.
    """
    # 1. Create top-level Eliot Action:
    with start_action(action_type="dask:compute"):
        # In order to reduce logging verbosity, add logging to the already
        # optimized graph:
        optimized = optimize(*args, optimizations=[_add_logging])
        return compute(*optimized, optimize_graph=False)
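# Illustrative call on a tiny delayed graph (Eliot logging destinations are
# assumed to be configured elsewhere, e.g. with ``eliot.to_file``).
from dask import delayed

total = delayed(sum)([delayed(abs)(-i) for i in range(3)])
(result,) = compute_with_trace(total)
assert result == 3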
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode='b'):
        sample, vals = read_bytes('.test.account*', blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b'\n')
        testlines = b"".join(files.values()).split(b'\n')
        assert set(ourlines) == set(testlines)
def run_TPI(p, client=None): ''' Solve for transition path equilibrium of OG-USA. Args: p (OG-USA Specifications object): model parameters client (Dask client object): client Returns: output (dictionary): dictionary with transition path solution results ''' # unpack tuples of parameters initial_values, ss_vars, theta, baseline_values = get_initial_SS_values(p) (B0, b_sinit, b_splus1init, factor, initial_b, initial_n, D0) = initial_values (TRbaseline, Gbaseline) = baseline_values print('Government spending breakpoints are tG1: ', p.tG1, '; and tG2:', p.tG2) # Initialize guesses at time paths # Make array of initial guesses for labor supply and savings guesses_b = utils.get_initial_path(initial_b, ss_vars['bssmat_splus1'], p, 'ratio') guesses_n = utils.get_initial_path(initial_n, ss_vars['nssmat'], p, 'ratio') b_mat = guesses_b n_mat = guesses_n ind = np.arange(p.S) # Get path for aggregate savings and labor supply` L_init = np.ones((p.T + p.S, )) * ss_vars['Lss'] B_init = np.ones((p.T + p.S, )) * ss_vars['Bss'] L_init[:p.T] = aggr.get_L(n_mat[:p.T], p, 'TPI') B_init[1:p.T] = aggr.get_B(b_mat[:p.T], p, 'TPI', False)[:p.T - 1] B_init[0] = B0 if not p.small_open: if p.budget_balance: K_init = B_init else: K_init = B_init * ss_vars['Kss'] / ss_vars['Bss'] else: K_init = firm.get_B(L_init, p.firm_r, p, 'TPI') K = K_init K_d = K_init * ss_vars['K_d_ss'] / ss_vars['Kss'] K_f = K_init * ss_vars['K_f_ss'] / ss_vars['Kss'] L = L_init B = B_init Y = np.zeros_like(K) Y[:p.T] = firm.get_Y(K[:p.T], L[:p.T], p, 'TPI') Y[p.T:] = ss_vars['Yss'] r = np.zeros_like(Y) if not p.small_open: r[:p.T] = firm.get_r(Y[:p.T], K[:p.T], p, 'TPI') r[p.T:] = ss_vars['rss'] else: r = p.firm_r # compute w w = np.zeros_like(r) w[:p.T] = firm.get_w_from_r(r[:p.T], p, 'TPI') w[p.T:] = ss_vars['wss'] r_gov = fiscal.get_r_gov(r, p) if p.budget_balance: r_hh = r else: r_hh = aggr.get_r_hh(r, r_gov, K, ss_vars['Dss']) if p.small_open: r_hh = p.hh_r BQ0 = aggr.get_BQ(r[0], initial_b, None, p, 'SS', True) if not p.use_zeta: BQ = np.zeros((p.T + p.S, p.J)) for j in range(p.J): BQ[:, j] = (list(np.linspace(BQ0[j], ss_vars['BQss'][j], p.T)) + [ss_vars['BQss'][j]] * p.S) BQ = np.array(BQ) else: BQ = (list(np.linspace(BQ0, ss_vars['BQss'], p.T)) + [ss_vars['BQss']] * p.S) BQ = np.array(BQ) if p.budget_balance: if np.abs(ss_vars['TR_ss']) < 1e-13: TR_ss2 = 0.0 # sometimes SS is very small but not zero, # even if taxes are zero, this get's rid of the approximation # error, which affects the perc changes below else: TR_ss2 = ss_vars['TR_ss'] TR = np.ones(p.T + p.S) * TR_ss2 total_revenue = TR G = np.zeros(p.T + p.S) elif not p.baseline_spending: TR = p.alpha_T * Y G = np.ones(p.T + p.S) * ss_vars['Gss'] elif p.baseline_spending: TR = TRbaseline TR_new = p.TR # Need to set TR_new for later reference G = Gbaseline G_0 = Gbaseline[0] # Initialize some starting values if p.budget_balance: D = np.zeros(p.T + p.S) else: D = np.ones(p.T + p.S) * ss_vars['Dss'] if ss_vars['Dss'] == 0: D_d = np.zeros(p.T + p.S) D_f = np.zeros(p.T + p.S) else: D_d = D * ss_vars['D_d_ss'] / ss_vars['Dss'] D_f = D * ss_vars['D_f_ss'] / ss_vars['Dss'] total_revenue = np.ones(p.T + p.S) * ss_vars['total_revenue_ss'] TPIiter = 0 TPIdist = 10 euler_errors = np.zeros((p.T, 2 * p.S, p.J)) TPIdist_vec = np.zeros(p.maxiter) # TPI loop while (TPIiter < p.maxiter) and (TPIdist >= p.mindist_TPI): r_gov[:p.T] = fiscal.get_r_gov(r[:p.T], p) if p.budget_balance: r_hh[:p.T] = r[:p.T] else: K[:p.T] = firm.get_K_from_Y(Y[:p.T], r[:p.T], p, 'TPI') r_hh[:p.T] = aggr.get_r_hh(r[:p.T], 
r_gov[:p.T], K[:p.T], D[:p.T]) if p.small_open: r_hh[:p.T] = p.hh_r[:p.T] outer_loop_vars = (r, w, r_hh, BQ, TR, theta) euler_errors = np.zeros((p.T, 2 * p.S, p.J)) lazy_values = [] for j in range(p.J): guesses = (guesses_b[:, :, j], guesses_n[:, :, j]) lazy_values.append( delayed(inner_loop)(guesses, outer_loop_vars, initial_values, j, ind, p)) results = compute(*lazy_values, scheduler=dask.multiprocessing.get, num_workers=p.num_workers) for j, result in enumerate(results): euler_errors[:, :, j], b_mat[:, :, j], n_mat[:, :, j] = result bmat_s = np.zeros((p.T, p.S, p.J)) bmat_s[0, 1:, :] = initial_b[:-1, :] bmat_s[1:, 1:, :] = b_mat[:p.T - 1, :-1, :] bmat_splus1 = np.zeros((p.T, p.S, p.J)) bmat_splus1[:, :, :] = b_mat[:p.T, :, :] etr_params_4D = np.tile( p.etr_params.reshape(p.T, p.S, 1, p.etr_params.shape[2]), (1, 1, p.J, 1)) bqmat = household.get_bq(BQ, None, p, 'TPI') trmat = household.get_tr(TR, None, p, 'TPI') tax_mat = tax.total_taxes(r_hh[:p.T], w[:p.T], bmat_s, n_mat[:p.T, :, :], bqmat[:p.T, :, :], factor, trmat[:p.T, :, :], theta, 0, None, False, 'TPI', p.e, etr_params_4D, p) r_hh_path = utils.to_timepath_shape(r_hh) wpath = utils.to_timepath_shape(w) c_mat = household.get_cons(r_hh_path[:p.T, :, :], wpath[:p.T, :, :], bmat_s, bmat_splus1, n_mat[:p.T, :, :], bqmat[:p.T, :, :], tax_mat, p.e, p.tau_c[:p.T, :, :], p) y_before_tax_mat = (r_hh_path[:p.T, :, :] * bmat_s[:p.T, :, :] + wpath[:p.T, :, :] * p.e * n_mat[:p.T, :, :]) if not p.baseline_spending and not p.budget_balance: Y[:p.T] = TR[:p.T] / p.alpha_T[:p.T] # maybe unecessary (total_rev, T_Ipath, T_Ppath, T_BQpath, T_Wpath, T_Cpath, business_revenue) = aggr.revenue( r_hh[:p.T], w[:p.T], bmat_s, n_mat[:p.T, :, :], bqmat[:p.T, :, :], c_mat[:p.T, :, :], Y[:p.T], L[:p.T], K[:p.T], factor, theta, etr_params_4D, p, 'TPI') total_revenue[:p.T] = total_rev # set intial debt value if p.baseline: D0 = p.initial_debt_ratio * Y[0] if not p.baseline_spending: G_0 = p.alpha_G[0] * Y[0] dg_fixed_values = (Y, total_revenue, TR, D0, G_0) Dnew, G[:p.T] = fiscal.D_G_path(r_gov, dg_fixed_values, Gbaseline, p) # Fix initial amount of foreign debt holding D_f[0] = p.initial_foreign_debt_ratio * Dnew[0] for t in range(1, p.T): D_f[t + 1] = (D_f[t] / (np.exp(p.g_y) * (1 + p.g_n[t + 1])) + p.zeta_D[t] * (Dnew[t + 1] - (Dnew[t] / (np.exp(p.g_y) * (1 + p.g_n[t + 1]))))) D_d[:p.T] = Dnew[:p.T] - D_f[:p.T] else: # if budget balance Dnew = np.zeros(p.T + 1) G[:p.T] = np.zeros(p.T) D_f[:p.T] = np.zeros(p.T) D_d[:p.T] = np.zeros(p.T) L[:p.T] = aggr.get_L(n_mat[:p.T], p, 'TPI') B[1:p.T] = aggr.get_B(bmat_splus1[:p.T], p, 'TPI', False)[:p.T - 1] K_demand_open = firm.get_K(L[:p.T], p.firm_r[:p.T], p, 'TPI') K_d[:p.T] = B[:p.T] - D_d[:p.T] if np.any(K_d < 0): print('K_d has negative elements. Setting them ' + 'positive to prevent NAN.') K_d[:p.T] = np.fmax(K_d[:p.T], 0.05 * B[:p.T]) K_f[:p.T] = p.zeta_K[:p.T] * (K_demand_open - B[:p.T] + D_d[:p.T]) K = K_f + K_d if np.any(B) < 0: print('B has negative elements. 
B[0:9]:', B[0:9]) print('B[T-2:T]:', B[p.T - 2, p.T]) if p.small_open: K[:p.T] = K_demand_open Ynew = firm.get_Y(K[:p.T], L[:p.T], p, 'TPI') rnew = r.copy() if not p.small_open: rnew[:p.T] = firm.get_r(Ynew[:p.T], K[:p.T], p, 'TPI') else: rnew[:p.T] = r[:p.T].copy() r_gov_new = fiscal.get_r_gov(rnew, p) if p.budget_balance: r_hh_new = rnew[:p.T] else: r_hh_new = aggr.get_r_hh(rnew[:p.T], r_gov_new[:p.T], K[:p.T], Dnew[:p.T]) if p.small_open: r_hh_new = p.hh_r[:p.T] # compute w wnew = firm.get_w_from_r(rnew[:p.T], p, 'TPI') b_mat_shift = np.append(np.reshape(initial_b, (1, p.S, p.J)), b_mat[:p.T - 1, :, :], axis=0) BQnew = aggr.get_BQ(r_hh_new[:p.T], b_mat_shift, None, p, 'TPI', False) bqmat_new = household.get_bq(BQnew, None, p, 'TPI') (total_rev, T_Ipath, T_Ppath, T_BQpath, T_Wpath, T_Cpath, business_revenue) = aggr.revenue( r_hh_new[:p.T], wnew[:p.T], bmat_s, n_mat[:p.T, :, :], bqmat_new[:p.T, :, :], c_mat[:p.T, :, :], Ynew[:p.T], L[:p.T], K[:p.T], factor, theta, etr_params_4D, p, 'TPI') total_revenue[:p.T] = total_rev if p.budget_balance: TR_new = total_revenue elif not p.baseline_spending: TR_new = p.alpha_T[:p.T] * Ynew[:p.T] # If baseline_spending==True, no need to update TR, it's fixed # update vars for next iteration w[:p.T] = wnew[:p.T] r[:p.T] = utils.convex_combo(rnew[:p.T], r[:p.T], p.nu) BQ[:p.T] = utils.convex_combo(BQnew[:p.T], BQ[:p.T], p.nu) D[:p.T] = Dnew[:p.T] Y[:p.T] = utils.convex_combo(Ynew[:p.T], Y[:p.T], p.nu) if not p.baseline_spending: TR[:p.T] = utils.convex_combo(TR_new[:p.T], TR[:p.T], p.nu) guesses_b = utils.convex_combo(b_mat, guesses_b, p.nu) guesses_n = utils.convex_combo(n_mat, guesses_n, p.nu) print('r diff: ', (rnew[:p.T] - r[:p.T]).max(), (rnew[:p.T] - r[:p.T]).min()) print('BQ diff: ', (BQnew[:p.T] - BQ[:p.T]).max(), (BQnew[:p.T] - BQ[:p.T]).min()) print('TR diff: ', (TR_new[:p.T] - TR[:p.T]).max(), (TR_new[:p.T] - TR[:p.T]).min()) print('Y diff: ', (Ynew[:p.T] - Y[:p.T]).max(), (Ynew[:p.T] - Y[:p.T]).min()) if not p.baseline_spending: if TR.all() != 0: TPIdist = np.array( list(utils.pct_diff_func(rnew[:p.T], r[:p.T])) + list( utils.pct_diff_func(BQnew[:p.T], BQ[:p.T]).flatten()) + list(utils.pct_diff_func(TR_new[:p.T], TR[:p.T]))).max() else: TPIdist = np.array( list(utils.pct_diff_func(rnew[:p.T], r[:p.T])) + list( utils.pct_diff_func(BQnew[:p.T], BQ[:p.T]).flatten()) + list(np.abs(TR[:p.T]))).max() else: TPIdist = np.array( list(utils.pct_diff_func(rnew[:p.T], r[:p.T])) + list(utils.pct_diff_func(BQnew[:p.T], BQ[:p.T]).flatten()) + list(utils.pct_diff_func(Ynew[:p.T], Y[:p.T]))).max() TPIdist_vec[TPIiter] = TPIdist # After T=10, if cycling occurs, drop the value of nu # wait til after T=10 or so, because sometimes there is a jump up # in the first couple iterations # if TPIiter > 10: # if TPIdist_vec[TPIiter] - TPIdist_vec[TPIiter - 1] > 0: # nu /= 2 # print 'New Value of nu:', nu TPIiter += 1 print('Iteration:', TPIiter) print('\tDistance:', TPIdist) # Compute effective and marginal tax rates for all agents mtrx_params_4D = np.tile( p.mtrx_params.reshape(p.T, p.S, 1, p.mtrx_params.shape[2]), (1, 1, p.J, 1)) mtry_params_4D = np.tile( p.mtry_params.reshape(p.T, p.S, 1, p.mtry_params.shape[2]), (1, 1, p.J, 1)) e_3D = np.tile(p.e.reshape(1, p.S, p.J), (p.T, 1, 1)) mtry_path = tax.MTR_income(r_hh_path[:p.T], wpath[:p.T], bmat_s[:p.T, :, :], n_mat[:p.T, :, :], factor, True, e_3D, etr_params_4D, mtry_params_4D, p) mtrx_path = tax.MTR_income(r_hh_path[:p.T], wpath[:p.T], bmat_s[:p.T, :, :], n_mat[:p.T, :, :], factor, False, e_3D, etr_params_4D, 
mtrx_params_4D, p) etr_path = tax.ETR_income(r_hh_path[:p.T], wpath[:p.T], bmat_s[:p.T, :, :], n_mat[:p.T, :, :], factor, e_3D, etr_params_4D, p) C = aggr.get_C(c_mat, p, 'TPI') # Note that implicity in this computation is that immigrants' # wealth is all in the form of private capital I_d = aggr.get_I(bmat_splus1[:p.T], K_d[1:p.T + 1], K_d[:p.T], p, 'TPI') I = aggr.get_I(bmat_splus1[:p.T], K[1:p.T + 1], K[:p.T], p, 'TPI') # solve resource constraint # net foreign borrowing new_borrowing_f = (D_f[1:p.T + 1] * np.exp(p.g_y) * (1 + p.g_n[1:p.T + 1]) - D_f[:p.T]) debt_service_f = D_f * r_hh RC_error = aggr.resource_constraint(Y[:p.T - 1], C[:p.T - 1], G[:p.T - 1], I_d[:p.T - 1], K_f[:p.T - 1], new_borrowing_f[:p.T - 1], debt_service_f[:p.T - 1], r_hh[:p.T - 1], p) # Compute total investment (not just domestic) I_total = ((1 + p.g_n[:p.T]) * np.exp(p.g_y) * K[1:p.T + 1] - (1.0 - p.delta) * K[:p.T]) # Compute income tax revenues tax_rev = aggr.get_L(T_Ipath, p, 'TPI') payroll_tax_revenue = p.frac_tax_payroll[:p.T] * tax_rev[:p.T] iit_revenue = tax_rev[:p.T] - payroll_tax_revenue # Compute resource constraint error rce_max = np.amax(np.abs(RC_error)) print('Max absolute value resource constraint error:', rce_max) print('Checking time path for violations of constraints.') for t in range(p.T): household.constraint_checker_TPI(b_mat[t], n_mat[t], c_mat[t], t, p.ltilde) eul_savings = euler_errors[:, :p.S, :].max(1).max(1) eul_laborleisure = euler_errors[:, p.S:, :].max(1).max(1) print('Max Euler error, savings: ', eul_savings) print('Max Euler error labor supply: ', eul_laborleisure) ''' ------------------------------------------------------------------------ Save variables/values so they can be used in other modules ------------------------------------------------------------------------ ''' output = { 'Y': Y[:p.T], 'B': B, 'K': K, 'K_f': K_f, 'K_d': K_d, 'L': L, 'C': C, 'I': I, 'I_total': I_total, 'I_d': I_d, 'BQ': BQ, 'total_revenue': total_revenue, 'business_revenue': business_revenue, 'IITpayroll_revenue': T_Ipath, 'iit_revenue': iit_revenue, 'payroll_tax_revenue': payroll_tax_revenue, 'TR': TR, 'T_P': T_Ppath, 'T_BQ': T_BQpath, 'T_W': T_Wpath, 'T_C': T_Cpath, 'G': G, 'D': D, 'D_f': D_f, 'D_d': D_d, 'r': r, 'r_gov': r_gov, 'r_hh': r_hh, 'w': w, 'bmat_splus1': bmat_splus1, 'bmat_s': bmat_s[:p.T, :, :], 'n_mat': n_mat[:p.T, :, :], 'c_path': c_mat, 'bq_path': bqmat, 'tr_path': trmat, 'y_before_tax_mat': y_before_tax_mat, 'tax_path': tax_mat, 'eul_savings': eul_savings, 'eul_laborleisure': eul_laborleisure, 'resource_constraint_error': RC_error, 'new_borrowing_f': new_borrowing_f, 'debt_service_f': debt_service_f, 'etr_path': etr_path, 'mtrx_path': mtrx_path, 'mtry_path': mtry_path } tpi_dir = os.path.join(p.output_base, "TPI") utils.mkdirs(tpi_dir) tpi_vars = os.path.join(tpi_dir, "TPI_vars.pkl") with open(tpi_vars, "wb") as f: pickle.dump(output, f) if np.any(G) < 0: print('Government spending is negative along transition path' + ' to satisfy budget') if (((TPIiter >= p.maxiter) or (np.absolute(TPIdist) > p.mindist_TPI)) and ENFORCE_SOLUTION_CHECKS): raise RuntimeError('Transition path equlibrium not found' + ' (TPIdist)') if ((np.any(np.absolute(RC_error) >= p.mindist_TPI * 10)) and ENFORCE_SOLUTION_CHECKS): raise RuntimeError('Transition path equlibrium not found ' + '(RC_error)') if ((np.any(np.absolute(eul_savings) >= p.mindist_TPI) or (np.any(np.absolute(eul_laborleisure) > p.mindist_TPI))) and ENFORCE_SOLUTION_CHECKS): raise RuntimeError('Transition path equlibrium not found ' + '(eulers)') 
return output
def open_mfdataset(
    paths,
    chunks=None,
    concat_dim="time",
    compat="no_conflicts",
    preprocess=None,
    engine=None,
    lock=None,
    data_vars="all",
    coords="different",
    combine="nested",
    autoclose=None,
    parallel=False,
    join="outer",
    attrs_file=None,
    **kwargs,
):
    """Helper function for opening multiple files as an xarray_ dataset.
    Adapted from upstream implementation_. See docs_ for usage.

    .. todo::

        To be removed when a backend entrypoint_ is implemented.

    .. _implementation: https://github.com/pydata/xarray/blob/484d1ce5ff8969b6ca6fa942b344379725f33b9c/xarray/backends/api.py#L726
    .. _docs: https://xarray.pydata.org/en/v0.15.1/generated/xarray.open_mfdataset.html
    .. _entrypoint: https://github.com/pydata/xarray/pull/3166
    """
    if isinstance(paths, str):
        paths = sorted(glob(paths))
    else:
        paths = [str(p) if isinstance(p, Path) else p for p in paths]

    if not paths:
        raise OSError("no files to open")

    # If combine='by_coords' then this is unnecessary, but quick.
    # If combine='nested' then this creates a flat list which is easier to
    # iterate over, while saving the originally-supplied structure as "ids"
    if combine == "nested":
        if isinstance(concat_dim, (str, xr.DataArray)) or concat_dim is None:
            concat_dim = [concat_dim]

    open_kwargs = dict()

    if parallel:
        import dask

        # wrap the open_dataset, getattr, and preprocess with delayed
        open_ = dask.delayed(open_dataset)
        if preprocess is not None:
            preprocess = dask.delayed(preprocess)
    else:
        open_ = open_dataset

    datasets = [open_(p, **open_kwargs) for p in paths]
    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    if parallel:
        # calling compute here will return the datasets
        # the underlying datasets will still be stored as dask arrays
        datasets, = dask.compute(datasets)

    # Combine all datasets, closing them in case of a ValueError
    try:
        if combine == "nested":
            # Combined nested list by successive concat and merge operations
            # along each dimension, using structure given by "ids"
            combined = xr.combine_nested(
                datasets,
                concat_dim=concat_dim,
                compat=compat,
                data_vars=data_vars,
                coords=coords,
                join=join,
            )
        elif combine == "by_coords":
            # Redo ordering from coordinates, ignoring how they were ordered
            # previously
            combined = xr.combine_by_coords(
                datasets, compat=compat, data_vars=data_vars,
                coords=coords, join=join
            )
        else:
            raise ValueError(
                "{} is an invalid option for the keyword argument"
                " ``combine``".format(combine)
            )
    except ValueError:
        for ds in datasets:
            ds.close()
        raise

    # read global attributes from the attrs_file or from the first dataset
    if attrs_file is not None:
        if isinstance(attrs_file, Path):
            attrs_file = str(attrs_file)
        combined.attrs = datasets[paths.index(attrs_file)].attrs
    else:
        combined.attrs = datasets[0].attrs

    return combined
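# Hedged usage sketch: open a set of NetCDF files and concatenate them along
# "time", with the per-file open step parallelised through dask.delayed.
# The glob below is a placeholder.
ds = open_mfdataset(
    "data/model_output_*.nc",
    combine="nested",
    concat_dim="time",
    parallel=True,
)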
def read(filename,
         band_names=None,
         time_names=None,
         bounds=None,
         chunks=256,
         num_workers=1,
         **kwargs):
    """
    Reads a window slice in-memory

    Args:
        filename (str or list): A file name or list of file names to open and read.
        band_names (Optional[list]): A list of names to give the output band dimension.
        time_names (Optional[list]): A list of names to give the time dimension.
        bounds (Optional[1d array-like]): A bounding box to subset to, given as
            [minx, miny, maxx, maxy] or [left, bottom, right, top].
        chunks (Optional[tuple]): The data chunk size.
        num_workers (Optional[int]): The number of parallel ``dask`` workers.
        kwargs (Optional[dict]): Keyword arguments to pass to ``rasterio.write``.

    Returns:
        ``xarray.DataArray``
    """
    # Cannot pass 'chunks' to rasterio
    if 'chunks' in kwargs:
        del kwargs['chunks']

    if isinstance(filename, str):

        with rio.open(filename) as src:

            src_transform = src.gw.transform if hasattr(src, 'gw') else src.transform

            if bounds and ('window' not in kwargs):
                kwargs['window'] = from_bounds(*bounds, transform=src_transform)

            ycoords, xcoords, attrs = get_attrs(src, **kwargs)

        data = dask.compute(read_delayed(filename, chunks, **kwargs),
                            num_workers=num_workers)[0]

        if not band_names:
            band_names = np.arange(1, data.shape[0] + 1)

        if len(band_names) != data.shape[0]:
            logger.exception(' The band names do not match the output dimensions.')
            raise ValueError

        data = xr.DataArray(data,
                            dims=('band', 'y', 'x'),
                            coords={'band': band_names,
                                    'y': ycoords[:data.shape[-2]],
                                    'x': xcoords[:data.shape[-1]]},
                            attrs=attrs)

    else:

        with rio.open(filename[0]) as src:

            src_transform = src.gw.transform if hasattr(src, 'gw') else src.transform

            if bounds and ('window' not in kwargs):
                kwargs['window'] = from_bounds(*bounds, transform=src_transform)

            ycoords, xcoords, attrs = get_attrs(src, **kwargs)

        data = da.concatenate(dask.compute(read_list(filename, chunks, **kwargs),
                                           num_workers=num_workers),
                              axis=0)

        if not band_names:
            band_names = np.arange(1, data.shape[-3] + 1)

        if len(band_names) != data.shape[-3]:
            logger.exception(' The band names do not match the output dimensions.')
            raise ValueError

        if not time_names:
            time_names = np.arange(1, len(filename) + 1)

        if len(time_names) != data.shape[-4]:
            logger.exception(' The time names do not match the output dimensions.')
            raise ValueError

        data = xr.DataArray(data,
                            dims=('time', 'band', 'y', 'x'),
                            coords={'time': time_names,
                                    'band': band_names,
                                    'y': ycoords[:data.shape[-2]],
                                    'x': xcoords[:data.shape[-1]]},
                            attrs=attrs)

    return data
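# Illustrative call of ``read`` for a small time stack (file names, band
# names and chunk size are placeholders).
stack = read(
    ["scene_t1.tif", "scene_t2.tif"],
    band_names=["red", "green", "blue"],
    time_names=["t1", "t2"],
    chunks=512,
    num_workers=4,
)
print(stack.dims)  # ('time', 'band', 'y', 'x')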
def do_work():
    return dask.compute(*[self.load_image_label(),
                          self.load_image_annotations()]), None
    initial_states_compartment : pd.DataFrame with index of draws and columns
                                 for S, E, I1, I2, R
    use_mechanistic_testing : bool
    test_rate : tests per person per day
    test_positive_rate : fraction of daily tests that test positive (if there
                         are enough infections to do so)

    Results
    -------
    returns two dicts of pd.DataFrames with columns for counts for S, E, I1,
    I2, and R as well as new infections, and rows for each day of projection;
    first dict is for agent counts, and second dict is for compartment counts
    """
    from dask import delayed, compute

    assert 0 <= mixing_parameter <= 1, 'mixing_parameter must be in interval [0,1]'

    df_agent_count_dict, df_compartment_count_dict = {}, {}
    for draw in np.random.choice(range(1_000), replace=False, size=n_draws):
        df_tuple = delayed(run_one_hybrid_model)(
            draw, n_simulants, mixing_parameter, params, beta_agent,
            beta_compartment, start_time, end_time, initial_states_agent,
            initial_states_compartment, use_mechanistic_testing, test_rate,
            test_positive_rate)

        # append the counts to their dicts
        df_agent_count_dict[draw] = df_tuple[0]
        df_compartment_count_dict[draw] = df_tuple[1]

    return compute(df_agent_count_dict, df_compartment_count_dict)
def test_compute(self):
    """compute_with_trace() runs the same logic as compute()."""
    bag = from_sequence([1, 2, 3])
    bag = bag.map(lambda x: x * 7).map(lambda x: x * 4)
    bag = bag.fold(lambda x, y: x + y)
    self.assertEqual(dask.compute(bag), compute_with_trace(bag))
import sys
import time

if __name__ == '__main__':

    # spin up the dask client (NEEDS to be within __main__)
    c = None
    if len(sys.argv) > 1 and int(sys.argv[1]) > 1:
        print(f"Starting dask Client with {sys.argv[1]} processors")
        c = Client(threads_per_worker=1, n_workers=int(sys.argv[1]))
        print("client started...")

    ds = yt.load_sample("snapshot_033")
    sp = ds.sphere(ds.domain_center, (2, 'code_length'))
    ptf = {'PartType0': ['Mass']}

    mock_sphere = gda.MockSphere(sp)
    delayed_reader = gda.delayed_gadget(ds, ptf, mock_selector=mock_sphere,
                                        subchunk_size=None)

    # delayed_reader.delayed_chunks[0].compute()
    # data = compute(*delayed_reader.delayed_chunks)
    data_subset = compute(*delayed_reader.masked_chunks)

    # delayed_reader.set_chunk_masks("snapshot_033")
    # masks = compute(*delayed_reader.masks)

    # print(f"\nCompute time (neglecting Client spinup and yt initialization): {select_time}s")

    if c is not None:
        print("\nshutting down dask client")
        c.shutdown()
    print('ds_in size: ', ds_in[in_vars].nbytes / 1e9)

    full_template = create_template(ds_in['ppt'], in_vars + model_vars)
    for v in aux_vars:
        full_template[v] = ds_in[v]
    full_template = full_template.chunk(chunks)

    out_mapper = get_out_mapper(os.environ["BLOB_ACCOUNT_KEY"])
    print('clearing existing store')
    out_mapper.clear()
    full_template.to_zarr(out_mapper, mode='w', compute=False)

    regions = get_regions(ds_in)

    reg_tasks = []
    for region in regions:
        reg_tasks.append(block_wrapper(region, os.environ['BLOB_ACCOUNT_KEY']))

    return finish_store(out_mapper, reg_tasks)


if __name__ == '__main__':
    from dask.distributed import Client

    with Client(threads_per_worker=1, memory_limit='4 G') as client:
        print(client)
        print(client.dashboard_link)

        task = main()
        dask.compute(task, retries=10)
d = []
start = 0
incr = 1000
stop = len(files)
ranges = list(range(start, stop, incr))

for i in tqdm_notebook(ranges):
    print(f'Processing {i}')
    d = []
    for file in files[i:i + incr]:
        print(file)
        d.append(process_float(file))
    results = dask.compute(*d)
    t = xr.concat(results, dim='N_PROF', coords='minimal')
    t = t.chunk({'N_PROF': 10000, 'N_LEVELS': 3000})
    print(f'Finished concatenating dataset')

    numcodecs.blosc.use_threads = False
    synchronizer = zarr.ProcessSynchronizer('../../argozarr/argodask2.sync')
    #compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
    zarr_path = '../../argozarr/argo_dask2.zarr'
    #encoding = {vname: {'compressor': compressor} for vname in t.variables}
    d = t.to_zarr(zarr_path, mode='a', synchronizer=synchronizer,
                  compute=True, append_dim='N_PROF')
def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, compat='no_conflicts', preprocess=None, engine=None, lock=None, data_vars='all', coords='different', autoclose=False, parallel=False, **kwargs): """Open multiple files as a single dataset. Requires dask to be installed. See documentation for details on dask [1]. Attributes from the first dataset file are used for the combined dataset. Parameters ---------- paths : str or sequence Either a string glob in the form "path/to/my/files/*.nc" or an explicit list of files to open. Paths can be given as strings or as pathlib Paths. chunks : int or dict, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. If int, chunk each dimension by ``chunks``. By default, chunks will be chosen to load entire input files into memory at once. This has a major impact on performance: please see the full documentation for more details [2]. concat_dim : None, str, DataArray or Index, optional Dimension to concatenate files along. This argument is passed on to :py:func:`xarray.auto_combine` along with the dataset objects. You only need to provide this argument if the dimension along which you want to concatenate is not a dimension in the original datasets, e.g., if you want to stack a collection of 2D arrays along a third dimension. By default, xarray attempts to infer this argument by examining component files. Set ``concat_dim=None`` explicitly to disable concatenation. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional String indicating how to compare variables of the same name for potential conflicts when merging: - 'broadcast_equals': all values must be equal when variables are broadcast against each other to ensure common dimensions. - 'equals': all values and dimensions must be the same. - 'identical': all values, dimensions and attributes must be the same. - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. preprocess : callable, optional If provided, call this function on each dataset prior to concatenation. engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for 'netcdf4'. autoclose : bool, optional If True, automatically close files to avoid OS Error of too many files being open. However, this option doesn't work with streams, e.g., BytesIO. lock : False, True or threading.Lock, optional This argument is passed on to :py:func:`dask.array.from_array`. By default, a per-variable lock is used when reading data from netCDF files with the netcdf4 and h5netcdf engines to avoid issues with concurrent access when using dask's multithreaded backend. data_vars : {'minimal', 'different', 'all' or list of str}, optional These data variables will be concatenated together: * 'minimal': Only data variables in which the dimension already appears are included. * 'different': Data variables which are not equal (ignoring attributes) across all datasets are also concatenated (as well as all for which dimension already appears). Beware: this option may load the data payload of data variables into memory if they are not already loaded. * 'all': All data variables will be concatenated. 
* list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. coords : {'minimal', 'different', 'all' o list of str}, optional These coordinate variables will be concatenated together: * 'minimal': Only coordinates in which the dimension already appears are included. * 'different': Coordinates which are not equal (ignoring attributes) across all datasets are also concatenated (as well as all for which dimension already appears). Beware: this option may load the data payload of coordinate variables into memory if they are not already loaded. * 'all': All coordinate variables will be concatenated, except those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition the 'minimal' coordinates. parallel : bool, optional If True, the open and preprocess steps of this function will be performed in parallel using ``dask.delayed``. Default is False. **kwargs : optional Additional arguments passed on to :py:func:`xarray.open_dataset`. Returns ------- xarray.Dataset See Also -------- auto_combine open_dataset References ---------- .. [1] http://xarray.pydata.org/en/stable/dask.html .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance """ if isinstance(paths, basestring): paths = sorted(glob(paths)) else: paths = [str(p) if isinstance(p, path_type) else p for p in paths] if not paths: raise IOError('no files to open') if lock is None: lock = _default_lock(paths[0], engine) open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, autoclose=autoclose, **kwargs) if parallel: import dask # wrap the open_dataset, getattr, and preprocess with delayed open_ = dask.delayed(open_dataset) getattr_ = dask.delayed(getattr) if preprocess is not None: preprocess = dask.delayed(preprocess) else: open_ = open_dataset getattr_ = getattr datasets = [open_(p, **open_kwargs) for p in paths] file_objs = [getattr_(ds, '_file_obj') for ds in datasets] if preprocess is not None: datasets = [preprocess(ds) for ds in datasets] if parallel: # calling compute here will return the datasets/file_objs lists, # the underlying datasets will still be stored as dask arrays datasets, file_objs = dask.compute(datasets, file_objs) # close datasets in case of a ValueError try: if concat_dim is _CONCAT_DIM_DEFAULT: combined = auto_combine(datasets, compat=compat, data_vars=data_vars, coords=coords) else: combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat, data_vars=data_vars, coords=coords) except ValueError: for ds in datasets: ds.close() raise combined._file_obj = _MultiFileCloser(file_objs) combined.attrs = datasets[0].attrs return combined
def get_pandas_parallel():
    files = get_files(get_data_dir())
    dfs = dask.compute(read_parquet_delayed(file_name) for file_name in files)
    df = pd.concat(dfs[0])
    return df
def init_scalable(X, n_clusters, random_state=None, max_iter=None,
                  oversampling_factor=2):
    """K-Means initialization using k-means||

    This is algorithm 2 in Scalable K-Means++ (2012).
    """
    logger.info("Initializing with k-means||")
    # Step 1: Initialize Centers
    idx = 0
    centers = da.compute(X[idx, np.newaxis])[0]
    c_idx = {idx}

    # Step 2: Initialize cost
    (cost,) = compute(evaluate_cost(X, centers))

    if cost == 0:
        n_iter = 0
    else:
        n_iter = int(np.round(np.log(cost)))

    if max_iter is not None:
        n_iter = min(max_iter, n_iter)

    # Steps 3 - 6: update candidate Centers
    for i in range(n_iter):
        with _timer(
            "init iteration %2d/%2d , %2d centers" % (i + 1, n_iter, len(c_idx)),
            _logger=logger,
        ):
            new_idxs = _sample_points(X, centers, oversampling_factor, random_state)
            new_idxs = set(*compute(new_idxs))
            c_idx |= new_idxs

        # Sort before slicing, for better performance / memory
        # usage with the scheduler.
        # See https://github.com/dask/dask-ml/issues/39
        centers = X[sorted(c_idx)].compute()

    # XXX: scikit-learn doesn't have weighted k-means.
    # The paper weights each center by the number of points closest to it.
    # https://stackoverflow.com/a/37198799/1889400 claims you can scale the
    # features before clustering, but that doesn't seem right.
    # I think that replicating the *points*, proportional to the number of
    # original points closest to the candidate centers, would be a better way
    # to do that.

    if len(centers) < n_clusters:
        logger.warning("Found fewer than %d clusters in init.", n_clusters)
        # supplement with random
        need = n_clusters - len(centers)
        locs = sorted(
            random_state.choice(
                np.arange(0, len(X)), size=need, replace=False, chunks=len(X)
            )
        )
        extra = X[locs].compute()
        return np.vstack([centers, extra])
    else:
        # Step 7, 8 without weights
        # dask RandomState objects aren't valid for scikit-learn
        rng2 = (
            random_state.randint(0, np.iinfo("i4").max - 1, chunks=())
            .compute(scheduler="single-threaded")
            .item()
        )
        km = sklearn.cluster.KMeans(n_clusters, random_state=rng2)
        km.fit(centers)
        return km.cluster_centers_
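# Hedged usage sketch with a small dask array (assumes the helpers used by
# ``init_scalable`` -- evaluate_cost, _sample_points, _timer -- are available
# from the same module; the data, cluster count, and seed are illustrative).
import dask.array as da

X = da.random.random((1000, 2), chunks=(250, 2))
rs = da.random.RandomState(0)
centers = init_scalable(X, n_clusters=4, random_state=rs)
print(centers.shape)  # expected (4, 2)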
def compute_overview(df: dd.DataFrame, cfg: Config, dtype: Optional[DTypeDef]) -> Intermediate:
    """
    Compute functions for plot(df)

    Parameters
    ----------
    df
        DataFrame from which visualizations are generated
    cfg
        Config instance
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g. dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous()
    """
    # pylint: disable=too-many-branches
    if cfg.bar.enable or cfg.insight.enable:
        # extract the first rows to check if a column contains a mutable type
        head: pd.DataFrame = df.head()  # head triggers a (small) data read

    data: List[Tuple[str, DType, Any]] = []
    for col in df.columns:
        col_dtype = detect_dtype(df[col], dtype)
        if is_dtype(col_dtype, Continuous()) and (cfg.hist.enable or cfg.insight.enable):
            data.append((col, Continuous(), _cont_calcs(df[col].dropna(), cfg)))
        elif is_dtype(col_dtype, Nominal()) and (cfg.bar.enable or cfg.insight.enable):
            # Since it will throw error if column is object while some cells are
            # numerical, we transform column to string first.
            df[col] = df[col].astype(str)
            data.append((col, Nominal(), _nom_calcs(df[col].dropna(), head[col], cfg)))
        elif is_dtype(col_dtype, DateTime()) and (cfg.line.enable or cfg.insight.enable):
            data.append((col, DateTime(), dask.delayed(_calc_line_dt)(df[[col]], cfg.line.unit)))

    ov_stats = calc_stats(df, cfg, dtype)  # overview statistics
    data, ov_stats = dask.compute(data, ov_stats)

    # extract the plotting data, and detect and format the insights
    plot_data: List[Tuple[str, DType, Any]] = []
    col_insights: Dict[str, List[str]] = {}
    all_ins = _format_ov_ins(ov_stats, cfg) if cfg.insight.enable else []

    for col, dtp, dat in data:
        if is_dtype(dtp, Continuous()):
            if cfg.insight.enable:
                col_ins, ins = _format_cont_ins(col, dat, ov_stats["nrows"], cfg)
            if cfg.hist.enable:
                plot_data.append((col, dtp, dat["hist"]))
        elif is_dtype(dtp, Nominal()):
            if cfg.insight.enable:
                col_ins, ins = _format_nom_ins(col, dat, ov_stats["nrows"], cfg)
            if cfg.bar.enable:
                plot_data.append((col, dtp, (dat["bar"].to_frame(), dat["nuniq"])))
        elif is_dtype(dtp, DateTime()):
            plot_data.append((col, dtp, dat))
            continue

        if cfg.insight.enable:
            if col_ins:
                col_insights[col] = col_ins
            all_ins += ins

    return Intermediate(
        data=plot_data,
        stats=ov_stats,
        column_insights=col_insights,
        overview_insights=_insight_pagination(all_ins),
        visual_type="distribution_grid",
    )
def write(self):
    writes = xds_to_table(self.datasets, self.table_name, columns="ALL")
    dask.compute(writes)
def ms_create(ms_table_name, info, ant_pos, vis_array, baselines, timestamps, pol_feeds, sources): ''' Create a Measurement Set from some TART observations Parameters ---------- ms_table_name : string The name of the MS top level directory. I think this only workds in the local directory. info : JSON "info": { "info": { "L0_frequency": 1571328000.0, "bandwidth": 2500000.0, "baseband_frequency": 4092000.0, "location": { "alt": 270.0, "lat": -45.85177, "lon": 170.5456 }, "name": "Signal Hill - Dunedin", "num_antenna": 24, "operating_frequency": 1575420000.0, "sampling_frequency": 16368000.0 } }, Returns ------- None ''' epoch_s = timestamp_to_ms_epoch(timestamps) LOGGER.info("Time {}".format(epoch_s)) try: loc = info['location'] except: loc = info # Sort out the coordinate frames using astropy # https://casa.nrao.edu/casadocs/casa-5.4.1/reference-material/coordinate-frames iers.conf.iers_auto_url = 'https://astroconda.org/aux/astropy_mirror/iers_a_1/finals2000A.all' iers.conf.auto_max_age = None location = EarthLocation.from_geodetic(lon=loc['lon']*u.deg, lat=loc['lat']*u.deg, height=loc['alt']*u.m, ellipsoid='WGS84') obstime = Time(timestamps) local_frame = AltAz(obstime=obstime, location=location) phase_altaz = SkyCoord(alt=90.0*u.deg, az=0.0*u.deg, obstime = obstime, frame = 'altaz', location = location) phase_j2000 = phase_altaz.transform_to('fk5') # Get the stokes enums for the polarization types corr_types = [[MS_STOKES_ENUMS[p_f] for p_f in pol_feeds]] LOGGER.info("Pol Feeds {}".format(pol_feeds)) LOGGER.info("Correlation Types {}".format(corr_types)) num_freq_channels = [1] ant_table = MSTable(ms_table_name, 'ANTENNA') feed_table = MSTable(ms_table_name, 'FEED') field_table = MSTable(ms_table_name, 'FIELD') pol_table = MSTable(ms_table_name, 'POLARIZATION') obs_table = MSTable(ms_table_name, 'OBSERVATION') # SOURCE is an optional MS sub-table src_table = MSTable(ms_table_name, 'SOURCE') ddid_table_name = "::".join((ms_table_name, "DATA_DESCRIPTION")) spw_table_name = "::".join((ms_table_name, "SPECTRAL_WINDOW")) ms_datasets = [] ddid_datasets = [] spw_datasets = [] # Create ANTENNA dataset # Each column in the ANTENNA has a fixed shape so we # can represent all rows with one dataset num_ant = len(ant_pos) position = da.asarray(ant_pos) diameter = da.ones(num_ant) * 0.025 offset = da.zeros((num_ant, 3)) names = np.array(['ANTENNA-%d' % i for i in range(num_ant)], dtype=np.object) stations = np.array([info['name'] for i in range(num_ant)], dtype=np.object) dataset = Dataset({ 'POSITION': (("row", "xyz"), position), 'OFFSET': (("row", "xyz"), offset), 'DISH_DIAMETER': (("row",), diameter), 'NAME': (("row",), da.from_array(names, chunks=num_ant)), 'STATION': (("row",), da.from_array(stations, chunks=num_ant)), }) ant_table.append(dataset) ################### Create a FEED dataset. 
################################### # There is one feed per antenna, so this should be quite similar to the ANTENNA num_pols = len(pol_feeds) pol_types = pol_feeds pol_responses = [POL_RESPONSES[ct] for ct in pol_feeds] LOGGER.info("Pol Types {}".format(pol_types)) LOGGER.info("Pol Responses {}".format(pol_responses)) antenna_ids = da.asarray(range(num_ant)) feed_ids = da.zeros(num_ant) num_receptors = da.zeros(num_ant) + num_pols polarization_types = np.array([pol_types for i in range(num_ant)], dtype=np.object) receptor_angles = np.array([[0.0] for i in range(num_ant)]) pol_response = np.array([pol_responses for i in range(num_ant)]) beam_offset = np.array([[[0.0, 0.0]] for i in range(num_ant)]) dataset = Dataset({ 'ANTENNA_ID': (("row",), antenna_ids), 'FEED_ID': (("row",), feed_ids), 'NUM_RECEPTORS': (("row",), num_receptors), 'POLARIZATION_TYPE': (("row", "receptors",), da.from_array(polarization_types, chunks=num_ant)), 'RECEPTOR_ANGLE': (("row", "receptors",), da.from_array(receptor_angles, chunks=num_ant)), 'POL_RESPONSE': (("row", "receptors", "receptors-2"), da.from_array(pol_response, chunks=num_ant)), 'BEAM_OFFSET': (("row", "receptors", "radec"), da.from_array(beam_offset, chunks=num_ant)), }) feed_table.append(dataset) ####################### FIELD dataset ######################################### direction = [[phase_j2000.ra.radian, phase_j2000.dec.radian]] field_direction = da.asarray(direction)[None, :] field_name = da.asarray(np.asarray(['up'], dtype=np.object), chunks=1) field_num_poly = da.zeros(1) # Zero order polynomial in time for phase center. dir_dims = ("row", 'field-poly', 'field-dir',) dataset = Dataset({ 'PHASE_DIR': (dir_dims, field_direction), 'DELAY_DIR': (dir_dims, field_direction), 'REFERENCE_DIR': (dir_dims, field_direction), 'NUM_POLY': (("row", ), field_num_poly), 'NAME': (("row", ), field_name), }) field_table.append(dataset) ######################### OBSERVATION dataset ##################################### dataset = Dataset({ 'TELESCOPE_NAME': (("row",), da.asarray(np.asarray(['TART'], dtype=np.object), chunks=1)), 'OBSERVER': (("row",), da.asarray(np.asarray(['Tim'], dtype=np.object), chunks=1)), "TIME_RANGE": (("row","obs-exts"), da.asarray(np.array([[epoch_s, epoch_s+1]]), chunks=1)), }) obs_table.append(dataset) ######################## SOURCE datasets ######################################## for src in sources: name = src['name'] # Convert to J2000 dir_altaz = SkyCoord(alt=src['el']*u.deg, az=src['az']*u.deg, obstime = obstime, frame = 'altaz', location = location) dir_j2000 = dir_altaz.transform_to('fk5') direction = [dir_j2000.ra.radian, dir_j2000.dec.radian] #LOGGER.info("SOURCE: {}, timestamp: {}".format(name, timestamps)) dask_num_lines = da.full((1,), 1, dtype=np.int32) dask_direction = da.asarray(direction)[None, :] dask_name = da.asarray(np.asarray([name], dtype=np.object), chunks=1) dask_time = da.asarray(np.array([epoch_s])) dataset = Dataset({ "NUM_LINES": (("row",), dask_num_lines), "NAME": (("row",), dask_name), "TIME": (("row",), dask_time), "DIRECTION": (("row", "dir"), dask_direction), }) src_table.append(dataset) # Create POLARISATION datasets. 
# Dataset per output row required because column shapes are variable for corr_type in corr_types: corr_prod = [[i, i] for i in range(len(corr_type))] corr_prod = np.array(corr_prod) LOGGER.info("Corr Prod {}".format(corr_prod)) LOGGER.info("Corr Type {}".format(corr_type)) dask_num_corr = da.full((1,), len(corr_type), dtype=np.int32) LOGGER.info("NUM_CORR {}".format(dask_num_corr)) dask_corr_type = da.from_array(corr_type, chunks=len(corr_type))[None, :] dask_corr_product = da.asarray(corr_prod)[None, :] LOGGER.info("Dask Corr Prod {}".format(dask_corr_product.shape)) LOGGER.info("Dask Corr Type {}".format(dask_corr_type.shape)) dataset = Dataset({ "NUM_CORR": (("row",), dask_num_corr), "CORR_TYPE": (("row", "corr"), dask_corr_type), "CORR_PRODUCT": (("row", "corr", "corrprod_idx"), dask_corr_product), }) pol_table.append(dataset) # Create multiple SPECTRAL_WINDOW datasets # Dataset per output row required because column shapes are variable for num_chan in num_freq_channels: dask_num_chan = da.full((1,), num_chan, dtype=np.int32) dask_chan_freq = da.asarray([[info['operating_frequency']]]) dask_chan_width = da.full((1, num_chan), 2.5e6/num_chan) dataset = Dataset({ "NUM_CHAN": (("row",), dask_num_chan), "CHAN_FREQ": (("row", "chan"), dask_chan_freq), "CHAN_WIDTH": (("row", "chan"), dask_chan_width), "EFFECTIVE_BW": (("row", "chan"), dask_chan_width), "RESOLUTION": (("row", "chan"), dask_chan_width), }) spw_datasets.append(dataset) # For each cartesian product of SPECTRAL_WINDOW and POLARIZATION # create a corresponding DATA_DESCRIPTION. # Each column has fixed shape so we handle all rows at once spw_ids, pol_ids = zip(*product(range(len(num_freq_channels)), range(len(corr_types)))) dask_spw_ids = da.asarray(np.asarray(spw_ids, dtype=np.int32)) dask_pol_ids = da.asarray(np.asarray(pol_ids, dtype=np.int32)) ddid_datasets.append(Dataset({ "SPECTRAL_WINDOW_ID": (("row",), dask_spw_ids), "POLARIZATION_ID": (("row",), dask_pol_ids), })) # Now create the associated MS dataset #vis_data, baselines = cal_vis.get_all_visibility() #vis_array = np.array(vis_data, dtype=np.complex64) chunks = { "row": (vis_array.shape[0],), } baselines = np.array(baselines) #LOGGER.info(f"baselines {baselines}") bl_pos = np.array(ant_pos)[baselines] uu_a, vv_a, ww_a = -(bl_pos[:, 1] - bl_pos[:, 0]).T #/constants.L1_WAVELENGTH # Use the - sign to get the same orientation as our tart projections. 
uvw_array = np.array([uu_a, vv_a, ww_a]).T for ddid, (spw_id, pol_id) in enumerate(zip(spw_ids, pol_ids)): # Infer row, chan and correlation shape #LOGGER.info("ddid:{} ({}, {})".format(ddid, spw_id, pol_id)) row = sum(chunks['row']) chan = spw_datasets[spw_id].CHAN_FREQ.shape[1] corr = pol_table.datasets[pol_id].CORR_TYPE.shape[1] # Create some dask vis data dims = ("row", "chan", "corr") LOGGER.info("Data size %s %s %s" % (row, chan, corr)) #np_data = vis_array.reshape((row, chan, corr)) np_data = np.zeros((row, chan, corr), dtype=np.complex128) for i in range(corr): np_data[:, :, i] = vis_array.reshape((row, chan)) #np_data = np.array([vis_array.reshape((row, chan, 1)) for i in range(corr)]) np_uvw = uvw_array.reshape((row, 3)) data_chunks = tuple((chunks['row'], chan, corr)) dask_data = da.from_array(np_data, chunks=data_chunks) flag_categories = da.from_array(0.05*np.ones((row, chan, corr, 1))) flag_data = np.zeros((row, chan, corr), dtype=np.bool_) uvw_data = da.from_array(np_uvw) # Create dask ddid column dask_ddid = da.full(row, ddid, chunks=chunks['row'], dtype=np.int32) dataset = Dataset({ 'DATA': (dims, dask_data), 'FLAG': (dims, da.from_array(flag_data)), 'TIME': (("row", "corr"), da.from_array(epoch_s*np.ones((row, corr)))), 'TIME_CENTROID': (("row", "corr"), da.from_array(epoch_s*np.ones((row, corr)))), 'WEIGHT': (("row", "corr"), da.from_array(0.95*np.ones((row, corr)))), 'WEIGHT_SPECTRUM': (dims, da.from_array(0.95*np.ones_like(np_data, dtype=np.float64))), 'SIGMA_SPECTRUM': (dims, da.from_array(np.ones_like(np_data, dtype=np.float64)*0.05)), 'SIGMA': (("row", "corr"), da.from_array(0.05*np.ones((row, corr)))), 'UVW': (("row", "uvw",), uvw_data), 'FLAG_CATEGORY': (('row', 'flagcat', 'chan', 'corr'), flag_categories), # {'dims': ('flagcat', 'chan', 'corr')} 'ANTENNA1': (("row",), da.from_array(baselines[:, 0])), 'ANTENNA2': (("row",), da.from_array(baselines[:, 1])), 'FEED1': (("row",), da.from_array(baselines[:, 0])), 'FEED2': (("row",), da.from_array(baselines[:, 1])), 'DATA_DESC_ID': (("row",), dask_ddid) }) ms_datasets.append(dataset) ms_writes = xds_to_table(ms_datasets, ms_table_name, columns="ALL") spw_writes = xds_to_table(spw_datasets, spw_table_name, columns="ALL") ddid_writes = xds_to_table(ddid_datasets, ddid_table_name, columns="ALL") dask.compute(ms_writes) ant_table.write() feed_table.write() field_table.write() pol_table.write() obs_table.write() src_table.write() dask.compute(spw_writes) dask.compute(ddid_writes)
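A minimal sketch of the lazy write pattern used above, assuming dask-ms (the daskms package) and python-casacore are available. The table path "example.ms::FEED" and its two columns are placeholders; xds_to_table only assembles a task graph, and nothing touches disk until dask.compute() runs.

import dask
import dask.array as da
import numpy as np
from daskms import Dataset, xds_to_table

rows = 4
feed_ds = Dataset({
    'ANTENNA_ID': (("row",), da.arange(rows, dtype=np.int32)),
    'FEED_ID': (("row",), da.zeros(rows, dtype=np.int32)),
})
# Build the (lazy) table writes, then trigger the I/O explicitly.
writes = xds_to_table([feed_ds], "example.ms::FEED", columns="ALL")
dask.compute(writes)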
def calc_cog_sog(obj):
    """
    This function calculates the course and speed over ground of a moving
    platform using the lat/lon. Note, data are resampled to 1 minute in
    order to provide a better estimate of speed/course compared with
    1 second.

    Function is set up to use dask for the calculations in order to improve
    efficiency.

    Data are then resampled to 1 second to match the native format. This
    assumes that the input data are 1 second.

    See this `example
    <https://ARM-DOE.github.io/ACT/source/auto_examples/correct_ship_wind_data.html#sphx-glr-source-auto-examples-correct-ship-wind-data-py>`_.

    Parameters
    ----------
    obj : ACT Dataset
        ACT Dataset to calculate COG/SOG from. Assumes lat/lon are variables
        and that it's 1-second data.

    Returns
    -------
    obj : ACT Dataset
        Returns object with course_over_ground and speed_over_ground
        variables.

    """
    # Convert data to 1 minute in order to get proper values
    new_obj = obj.resample(time='1min').nearest()

    # Get lat and lon data
    if 'lat' in new_obj:
        lat = new_obj['lat']
    elif 'latitude' in new_obj:
        lat = new_obj['latitude']
    else:
        return new_obj

    if 'lon' in new_obj:
        lon = new_obj['lon']
    elif 'longitude' in new_obj:
        lon = new_obj['longitude']
    else:
        return new_obj

    # Set pyproj Geod
    _GEOD = pyproj.Geod(ellps='WGS84')

    # Set up delayed tasks for dask
    task = []
    time = new_obj['time'].values
    for i in range(len(lat) - 1):
        task.append(
            dask.delayed(proc_scog)(_GEOD, lon[i + 1], lat[i + 1], lon[i],
                                    lat[i], time[i], time[i + 1]))

    # Compute and process results, adding two values to the end
    # to make up for the missing times
    results = dask.compute(*task)
    sog = [r[0] for r in results]
    sog.append(sog[-1])
    sog.append(sog[-1])
    cog = [r[1] for r in results]
    cog.append(cog[-1])
    cog.append(cog[-1])
    time = np.append(time, time[-1] + np.timedelta64(1, 'm'))

    atts = {'long_name': 'Speed over ground', 'units': 'm/s'}
    sog_da = xr.DataArray(sog, coords={'time': time}, dims=['time'], attrs=atts)
    sog_da = sog_da.resample(time='1s').nearest()

    atts = {'long_name': 'Course over ground', 'units': 'deg'}
    cog_da = xr.DataArray(cog, coords={'time': time}, dims=['time'], attrs=atts)
    cog_da = cog_da.resample(time='1s').nearest()

    obj['course_over_ground'] = cog_da
    obj['speed_over_ground'] = sog_da

    return obj
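The helper proc_scog is not shown above; the following is a hedged sketch of what such a per-pair worker could look like, using pyproj's Geod.inv to obtain distance and forward azimuth between two fixes. The _scog_pair name and the sample coordinates are made up.

import dask
import numpy as np
import pyproj


def _scog_pair(geod, lon1, lat1, lon0, lat0, t0, t1):
    # Forward azimuth and distance from the older fix to the newer one
    az12, _, dist = geod.inv(lon0, lat0, lon1, lat1)
    dt = (t1 - t0) / np.timedelta64(1, 's')
    sog = dist / dt if dt > 0 else 0.0   # m/s
    cog = az12 % 360.0                   # degrees clockwise from north
    return sog, cog


geod = pyproj.Geod(ellps='WGS84')
task = dask.delayed(_scog_pair)(
    geod, -97.48, 36.61, -97.49, 36.60,
    np.datetime64('2021-01-01T00:00:00'), np.datetime64('2021-01-01T00:01:00'))
print(dask.compute(task))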
def test_custom_collection():
    # Arbitrary hashables
    h1 = object()
    h2 = object()

    dsk = {("x", h1): 1, ("x", h2): 2}
    dsk2 = {
        ("y", h1): (add, ("x", h1), ("x", h2)),
        ("y", h2): (add, ("y", h1), 1),
    }
    dsk2.update(dsk)
    dsk3 = {"z": (add, ("y", h1), ("y", h2))}
    dsk3.update(dsk2)

    w = Tuple({}, [])  # A collection can have no keys at all
    x = Tuple(dsk, [("x", h1), ("x", h2)])
    y = Tuple(dsk2, [("y", h1), ("y", h2)])
    z = Tuple(dsk3, ["z"])
    # Collection with multiple names
    t = w + x + y + z

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(w)
    assert is_dask_collection(x)
    assert is_dask_collection(y)
    assert is_dask_collection(z)
    assert is_dask_collection(t)

    # tokenize
    assert tokenize(w) == tokenize(w)
    assert tokenize(x) == tokenize(x)
    assert tokenize(y) == tokenize(y)
    assert tokenize(z) == tokenize(z)
    assert tokenize(t) == tokenize(t)
    # All tokens are unique
    assert len({tokenize(coll) for coll in (w, x, y, z, t)}) == 5

    # get_collection_names
    assert get_collection_names(w) == set()
    assert get_collection_names(x) == {"x"}
    assert get_collection_names(y) == {"y"}
    assert get_collection_names(z) == {"z"}
    assert get_collection_names(t) == {"x", "y", "z"}

    # compute
    assert w.compute() == ()
    assert x.compute() == (1, 2)
    assert y.compute() == (3, 4)
    assert z.compute() == (7,)
    assert dask.compute(w, [{"x": x}, y, z]) == ((), [{"x": (1, 2)}, (3, 4), (7,)])
    assert t.compute() == (1, 2, 3, 4, 7)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._keys == t._keys
    assert sorted(t2._dask.values()) == [1, 2, 3, 4, 7]
    assert t2.compute() == (1, 2, 3, 4, 7)
    w2, x2, y2, z2 = dask.persist(w, x, y, z)
    assert y2._keys == y._keys
    assert y2._dask == {("y", h1): 3, ("y", h2): 4}
    assert y2.compute() == (3, 4)
    t3 = x2 + y2 + z2
    assert t3.compute() == (1, 2, 3, 4, 7)

    # __dask_postpersist__ with name change
    rebuild, args = w.__dask_postpersist__()
    w3 = rebuild({}, *args, rename={"w": "w3"})
    assert w3.compute() == ()

    rebuild, args = x.__dask_postpersist__()
    x3 = rebuild({("x3", h1): 10, ("x3", h2): 20}, *args, rename={"x": "x3"})
    assert x3.compute() == (10, 20)

    rebuild, args = z.__dask_postpersist__()
    z3 = rebuild({"z3": 70}, *args, rename={"z": "z3"})
    assert z3.compute() == (70,)
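For reference, a compressed sketch of the collection protocol this test exercises. MiniTuple is a stand-in, not the Tuple fixture used above; it implements just enough of __dask_graph__ / __dask_keys__ / __dask_postcompute__ / __dask_scheduler__ for dask.compute() to work.

from operator import add

import dask


class MiniTuple:
    def __init__(self, dsk, keys):
        self._dask = dsk    # plain-dict task graph
        self._keys = keys   # flat list of output keys

    def __dask_graph__(self):
        return self._dask

    def __dask_keys__(self):
        return self._keys

    def __dask_postcompute__(self):
        # The scheduler returns one result per key; pack them into a tuple.
        return tuple, ()

    def __dask_tokenize__(self):
        return self._keys

    # default scheduler used when none is configured
    __dask_scheduler__ = staticmethod(dask.get)


dsk = {"a": 1, "b": 2, "c": (add, "a", "b")}
assert dask.compute(MiniTuple(dsk, ["a", "b", "c"]))[0] == (1, 2, 3)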
def do_work():
    return dask.compute(
        *[self.load_images(), self.load_models(), self.load_labels()]), None
import functools

import dask
from dask import compute, delayed


def parLapply(CORE_NUM, iterable, func, *args, **kwargs):
    with dask.config.set(scheduler='processes', num_workers=CORE_NUM):
        f_par = functools.partial(func, *args, **kwargs)
        result = compute([delayed(f_par)(item) for item in iterable])[0]
        return result
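A small usage example for parLapply; the worker function and inputs below are made up. The extra keyword is bound through functools.partial inside parLapply before the delayed tasks are built.

def scaled_square(x, scale=1.0):
    return scale * x * x


# Runs on four worker processes; returns [0.0, 2.0, 8.0, 18.0]
result = parLapply(4, range(4), scaled_square, scale=2.0)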
def check_parts(df, sol):
    assert all((p.dtypes == sol.dtypes).all()
               for p in dask.compute(*df.to_delayed()))
def generate_product( self, dc, path_prefix, aoi, output_projection, start_date, end_date, platform, res, aoi_crs, **kwargs, ): ## Create datacube query dask_chunks = dict(time=10, x=1000, y=1000) query = create_base_query(aoi, res, output_projection, aoi_crs, dask_chunks) all_measurements = ["green", "red", "blue", "nir", "swir1", "swir2"] product, measurement, water_product = create_product_measurement( platform, all_measurements) time = (start_date, end_date) ## Create dask graph ds = dc.load( time=time, platform=platform, product=product, measurements=measurement, **query, ) if is_dataset_empty(ds): raise Exception( "DataCube Load returned an empty Dataset." + "Please check load parameters for Baseline Dataset!") water_scenes = dc.load( product=water_product, measurements=["water_classification"], time=time, **query, ) # Set land to no_data water_dataset = water_scenes.where(water_scenes > 0) good_quality = mask_good_quality(ds, product) ds_clear = ds.where(good_quality) ds_clear_land = ds_clear.where(water_dataset.water_classification > 0) tsm_dataset = xr.map_blocks(tsm, ds_clear_land) mean_tsm = tsm_dataset.mean(dim=["time"]) max_tsm = tsm_dataset.max(dim=["time"]) min_tsm = tsm_dataset.min(dim=["time"]) ## Compute mean_tsm, max_tsm, min_tsm = dask.compute(mean_tsm, max_tsm, min_tsm) ## Write files result = [] file_name = path.join(path_prefix, "mean_tsm.tiff") import_export.export_xarray_to_geotiff( mean_tsm, file_name, crs=output_projection, x_coord="x", y_coord="y", ) result.append(file_name) file_name = path.join(path_prefix, "min_tsm.tiff") import_export.export_xarray_to_geotiff( min_tsm, file_name, crs=output_projection, x_coord="x", y_coord="y", ) result.append(file_name) file_name = path.join(path_prefix, "max_tsm.tiff") import_export.export_xarray_to_geotiff( max_tsm, file_name, crs=output_projection, x_coord="x", y_coord="y", ) result.append(file_name) return result
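A toy sketch of the reduction pattern generate_product relies on, with random data standing in for the datacube load: several lazy aggregations over a chunked xarray Dataset are evaluated in a single dask.compute call so the chunks are only processed once.

import dask
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"tsm": (("time", "y", "x"), np.random.rand(4, 50, 50))}
).chunk({"time": 2})

mean_tsm = ds.mean(dim=["time"])
max_tsm = ds.max(dim=["time"])
min_tsm = ds.min(dim=["time"])

# One compute call shares the underlying chunks between the three reductions.
mean_tsm, max_tsm, min_tsm = dask.compute(mean_tsm, max_tsm, min_tsm)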
def compute(self, **kwargs) -> Any: return dask.compute(self, **kwargs)
def _kmeans_single_lloyd( X, n_clusters, max_iter=300, init="k-means||", verbose=False, x_squared_norms=None, random_state=None, tol=1e-4, precompute_distances=True, oversampling_factor=2, init_max_iter=None, ): centers = k_init( X, n_clusters, init=init, oversampling_factor=oversampling_factor, random_state=random_state, max_iter=init_max_iter, ) dt = X.dtype P = X.shape[1] for i in range(max_iter): with _timer("Lloyd loop %2d." % i, _logger=logger): labels, distances = pairwise_distances_argmin_min( X, centers, metric="euclidean", metric_kwargs={"squared": True}) labels = labels.astype(np.int32) # distances is always float64, but we need it to match X.dtype # for centers_dense, but remain float64 for inertia r = blockwise( _centers_dense, "ij", X, "ij", labels, "i", n_clusters, None, "i", adjust_chunks={ "i": n_clusters, "j": P }, dtype=X.dtype, ) new_centers = da.from_delayed(sum(r.to_delayed().flatten()), (n_clusters, P), X.dtype) counts = da.bincount(labels, minlength=n_clusters) # Require at least one per bucket, to avoid division by 0. counts = da.maximum(counts, 1) new_centers = new_centers / counts[:, None] (new_centers, ) = compute(new_centers) # Convergence check shift = squared_norm(centers - new_centers) logger.info("Shift: %0.4f", shift) if shift < tol: break centers = new_centers if shift > 1e-7: labels, distances = pairwise_distances_argmin_min(X, centers) labels = labels.astype(np.int32) inertia = distances.sum() centers = centers.astype(dt) return labels, inertia, centers, i + 1
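A small numerical illustration of the centroid-update guard used above (toy arrays, not the k-means code itself): clamping the per-cluster counts to one prevents a divide-by-zero when a cluster receives no points.

import dask.array as da
import numpy as np

labels = da.from_array(np.array([0, 0, 2, 2, 2], dtype=np.int32), chunks=5)
summed = da.from_array(np.array([[2.0], [0.0], [9.0]]), chunks=3)  # per-cluster sums
counts = da.maximum(da.bincount(labels, minlength=3), 1)           # [2, 1, 3]
new_centers = summed / counts[:, None]
print(new_centers.compute())  # [[1.], [0.], [3.]]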
def to_csv( df, filename, single_file=False, encoding="utf-8", mode="wt", name_function=None, compression=None, compute=True, scheduler=None, storage_options=None, header_first_partition_only=None, compute_kwargs=None, **kwargs, ): """ Store Dask DataFrame to CSV files One filename per partition will be created. You can specify the filenames in a variety of ways. Use a globstring:: >>> df.to_csv('/path/to/data/export-*.csv') # doctest: +SKIP The * will be replaced by the increasing sequence 0, 1, 2, ... :: /path/to/data/export-0.csv /path/to/data/export-1.csv Use a globstring and a ``name_function=`` keyword argument. The name_function function should expect an integer and produce a string. Strings produced by name_function must preserve the order of their respective partition indices. >>> from datetime import date, timedelta >>> def name(i): ... return str(date(2015, 1, 1) + i * timedelta(days=1)) >>> name(0) '2015-01-01' >>> name(15) '2015-01-16' >>> df.to_csv('/path/to/data/export-*.csv', name_function=name) # doctest: +SKIP :: /path/to/data/export-2015-01-01.csv /path/to/data/export-2015-01-02.csv ... You can also provide an explicit list of paths:: >>> paths = ['/path/to/data/alice.csv', '/path/to/data/bob.csv', ...] # doctest: +SKIP >>> df.to_csv(paths) # doctest: +SKIP Parameters ---------- df : dask.DataFrame Data to save filename : string Path glob indicating the naming scheme for the output files single_file : bool, default False Whether to save everything into a single CSV file. Under the single file mode, each partition is appended at the end of the specified CSV file. Note that not all filesystems support the append mode and thus the single file mode, especially on cloud storage systems such as S3 or GCS. A warning will be issued when writing to a file that is not backed by a local filesystem. encoding : string, optional A string representing the encoding to use in the output file, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. mode : str Python write mode, default 'w' name_function : callable, default None Function accepting an integer (partition index) and producing a string to replace the asterisk in the given filename globstring. Should preserve the lexicographic order of partitions. Not supported when `single_file` is `True`. compression : string, optional a string representing the compression to use in the output file, allowed values are 'gzip', 'bz2', 'xz', only used when the first argument is a filename compute : bool If true, immediately executes. If False, returns a set of delayed objects, which can be computed at a later time. storage_options : dict Parameters passed on to the backend filesystem class. header_first_partition_only : boolean, default None If set to `True`, only write the header row in the first output file. By default, headers are written to all partitions under the multiple file mode (`single_file` is `False`) and written only once under the single file mode (`single_file` is `True`). It must not be `False` under the single file mode. compute_kwargs : dict, optional Options to be passed in to the compute method kwargs : dict, optional Additional parameters to pass to `pd.DataFrame.to_csv()` Returns ------- The names of the file written if they were computed right away If not, the delayed tasks associated to the writing of the files Raises ------ ValueError If `header_first_partition_only` is set to `False` or `name_function` is specified when `single_file` is `True`. 
""" if single_file and name_function is not None: raise ValueError( "name_function is not supported under the single file mode") if header_first_partition_only is None: header_first_partition_only = single_file elif not header_first_partition_only and single_file: raise ValueError( "header_first_partition_only cannot be False in the single file mode." ) file_options = dict( compression=compression, encoding=encoding, newline="", **(storage_options or {}), ) to_csv_chunk = delayed(_write_csv, pure=False) dfs = df.to_delayed() if single_file: first_file = open_file(filename, mode=mode, **file_options) if not isinstance(first_file.fs, fsspec.implementations.local.LocalFileSystem): warn("Appending data to a network storage system may not work.") value = to_csv_chunk(dfs[0], first_file, **kwargs) append_mode = mode.replace("w", "") + "a" append_file = open_file(filename, mode=append_mode, **file_options) kwargs["header"] = False for d in dfs[1:]: value = to_csv_chunk(d, append_file, depend_on=value, **kwargs) values = [value] files = [first_file] else: files = open_files( filename, mode=mode, name_function=name_function, num=df.npartitions, **file_options, ) values = [to_csv_chunk(dfs[0], files[0], **kwargs)] if header_first_partition_only: kwargs["header"] = False values.extend( [to_csv_chunk(d, f, **kwargs) for d, f in zip(dfs[1:], files[1:])]) if compute: if compute_kwargs is None: compute_kwargs = dict() if scheduler is not None: warn( "The 'scheduler' keyword argument for `to_csv()` is deprecated and" "will be removed in a future version. " "Please use the `compute_kwargs` argument instead. " f"For example, df.to_csv(..., compute_kwargs={{scheduler: {scheduler}}})", FutureWarning, ) if (scheduler is not None and compute_kwargs.get("scheduler") is not None and compute_kwargs.get("scheduler") != scheduler): raise ValueError( f"Differing values for 'scheduler' have been passed in.\n" f"scheduler argument: {scheduler}\n" f"via compute_kwargs: {compute_kwargs.get('scheduler')}") if scheduler is not None and compute_kwargs.get("scheduler") is None: compute_kwargs["scheduler"] = scheduler import dask dask.compute(*values, **compute_kwargs) return [f.path for f in files] else: return values
def forecast( R, metadata, V, n_timesteps, n_ens_members=24, n_cascade_levels=6, win_size=256, overlap=0.1, war_thr=0.1, extrap_method="semilagrangian", decomp_method="fft", bandpass_filter_method="gaussian", noise_method="ssft", ar_order=2, vel_pert_method=None, probmatching_method="cdf", mask_method="incremental", callback=None, fft_method="numpy", return_output=True, seed=None, num_workers=1, extrap_kwargs=None, filter_kwargs=None, noise_kwargs=None, vel_pert_kwargs=None, mask_kwargs=None, measure_time=False, ): """ Generate a nowcast ensemble by using the Short-space ensemble prediction system (SSEPS) method. This is an experimental version of STEPS which allows for localization by means of a window function. Parameters ---------- R : array-like Array of shape (ar_order+1,m,n) containing the input precipitation fields ordered by timestamp from oldest to newest. The time steps between the inputs are assumed to be regular, and the inputs are required to have finite values. metadata : dict Metadata dictionary containing the accutime, xpixelsize, threshold and zerovalue attributes as described in the documentation of :py:mod:`pysteps.io.importers`. xpixelsize is assumed to be in meters. V : array-like Array of shape (2,m,n) containing the x- and y-components of the advection field. The velocities are assumed to represent one time step between the inputs. All values are required to be finite. win_size : int or two-element sequence of ints Size-length of the localization window. overlap : float [0,1[ A float between 0 and 1 prescribing the level of overlap between successive windows. If set to 0, no overlap is used. war_thr : float Threshold for the minimum fraction of rain in a given window. n_timesteps : int Number of time steps to forecast. n_ens_members : int The number of ensemble members to generate. n_cascade_levels : int The number of cascade levels to use. extrap_method : {'semilagrangian'} Name of the extrapolation method to use. See the documentation of pysteps.extrapolation.interface. decomp_method : {'fft'} Name of the cascade decomposition method to use. See the documentation of pysteps.cascade.interface. bandpass_filter_method : {'gaussian', 'uniform'} Name of the bandpass filter method to use with the cascade decomposition. noise_method : {'parametric','nonparametric','ssft','nested',None} Name of the noise generator to use for perturbating the precipitation field. See the documentation of pysteps.noise.interface. If set to None, no noise is generated. ar_order: int The order of the autoregressive model to use. Must be >= 1. vel_pert_method: {'bps',None} Name of the noise generator to use for perturbing the advection field. See the documentation of pysteps.noise.interface. If set to None, the advection field is not perturbed. mask_method : {'incremental', None} The method to use for masking no precipitation areas in the forecast field. The masked pixels are set to the minimum value of the observations. 'incremental' = iteratively buffer the mask with a certain rate (currently it is 1 km/min), None=no masking. probmatching_method : {'cdf', None} Method for matching the statistics of the forecast field with those of the most recently observed one. 'cdf'=map the forecast CDF to the observed one, None=no matching applied. Using 'mean' requires that mask_method is not None. callback : function Optional function that is called after computation of each time step of the nowcast. 
The function takes one argument: a three-dimensional array of shape (n_ens_members,h,w), where h and w are the height and width of the input field R, respectively. This can be used, for instance, writing the outputs into files. return_output : bool Set to False to disable returning the outputs as numpy arrays. This can save memory if the intermediate results are written to output files using the callback function. seed : int Optional seed number for the random generators. num_workers : int The number of workers to use for parallel computation. Applicable if dask is enabled or pyFFTW is used for computing the FFT. When num_workers>1, it is advisable to disable OpenMP by setting the environment variable OMP_NUM_THREADS to 1. This avoids slowdown caused by too many simultaneous threads. fft_method : str A string defining the FFT method to use (see utils.fft.get_method). Defaults to 'numpy' for compatibility reasons. If pyFFTW is installed, the recommended method is 'pyfftw'. extrap_kwargs : dict Optional dictionary containing keyword arguments for the extrapolation method. See the documentation of pysteps.extrapolation. filter_kwargs : dict Optional dictionary containing keyword arguments for the filter method. See the documentation of pysteps.cascade.bandpass_filters.py. noise_kwargs : dict Optional dictionary containing keyword arguments for the initializer of the noise generator. See the documentation of pysteps.noise.fftgenerators. vel_pert_kwargs : dict Optional dictionary containing keyword arguments "p_pert_par" and "p_pert_perp" for the initializer of the velocity perturbator. See the documentation of pysteps.noise.motion. mask_kwargs : dict Optional dictionary containing mask keyword arguments 'mask_f' and 'mask_rim', the factor defining the the mask increment and the rim size, respectively. The mask increment is defined as mask_f*timestep/kmperpixel. measure_time : bool If set to True, measure, print and return the computation time. Returns ------- out : ndarray If return_output is True, a four-dimensional array of shape (n_ens_members,n_timesteps,m,n) containing a time series of forecast precipitation fields for each ensemble member. Otherwise, a None value is returned. The time series starts from t0+timestep, where timestep is taken from the input precipitation fields R. See also -------- pysteps.extrapolation.interface, pysteps.cascade.interface, pysteps.noise.interface, pysteps.noise.utils.compute_noise_stddev_adjs Notes ----- Please be aware that this represents a (very) experimental implementation. 
References ---------- :cite:`Seed2003`, :cite:`BPS2006`, :cite:`SPN2013`, :cite:`NBSG2017` """ _check_inputs(R, V, ar_order) if extrap_kwargs is None: extrap_kwargs = dict() if filter_kwargs is None: filter_kwargs = dict() if noise_kwargs is None: noise_kwargs = dict() if vel_pert_kwargs is None: vel_pert_kwargs = dict() if mask_kwargs is None: mask_kwargs = dict() if np.any(~np.isfinite(R)): raise ValueError("R contains non-finite values") if np.any(~np.isfinite(V)): raise ValueError("V contains non-finite values") if mask_method not in ["incremental", None]: raise ValueError( "unknown mask method %s: must be 'incremental' or None" % mask_method) if np.isscalar(win_size): win_size = (np.int(win_size), np.int(win_size)) else: win_size = tuple([np.int(win_size[i]) for i in range(2)]) timestep = metadata["accutime"] kmperpixel = metadata["xpixelsize"] / 1000 print("Computing SSEPS nowcast:") print("------------------------") print("") print("Inputs:") print("-------") print("input dimensions: %dx%d" % (R.shape[1], R.shape[2])) print("km/pixel: %g" % kmperpixel) print("time step: %d minutes" % timestep) print("") print("Methods:") print("--------") print("extrapolation: %s" % extrap_method) print("bandpass filter: %s" % bandpass_filter_method) print("decomposition: %s" % decomp_method) print("noise generator: %s" % noise_method) print("velocity perturbator: %s" % vel_pert_method) print("precip. mask method: %s" % mask_method) print("probability matching: %s" % probmatching_method) print("FFT method: %s" % fft_method) print("") print("Parameters:") print("-----------") print("localization window: %dx%d" % (win_size[0], win_size[1])) print("overlap: %.1f" % overlap) print("war thr: %.2f" % war_thr) print("number of time steps: %d" % n_timesteps) print("ensemble size: %d" % n_ens_members) print("number of cascade levels: %d" % n_cascade_levels) print("order of the AR(p) model: %d" % ar_order) print("dask imported: %s" % ("yes" if dask_imported else "no")) print("num workers: %d" % num_workers) if vel_pert_method == "bps": vp_par = vel_pert_kwargs.get("p_pert_par", noise.motion.get_default_params_bps_par()) vp_perp = vel_pert_kwargs.get( "p_pert_perp", noise.motion.get_default_params_bps_perp()) print("velocity perturbations, parallel: %g,%g,%g" % (vp_par[0], vp_par[1], vp_par[2])) print("velocity perturbations, perpendicular: %g,%g,%g" % (vp_perp[0], vp_perp[1], vp_perp[2])) R_thr = metadata["threshold"] R_min = metadata["zerovalue"] num_ensemble_workers = n_ens_members if num_workers > n_ens_members else num_workers if measure_time: starttime_init = time.time() # get methods extrapolator_method = extrapolation.get_method(extrap_method) x_values, y_values = np.meshgrid(np.arange(R.shape[2]), np.arange(R.shape[1])) xy_coords = np.stack([x_values, y_values]) decomp_method, __ = cascade.get_method(decomp_method) filter_method = cascade.get_method(bandpass_filter_method) if noise_method is not None: init_noise, generate_noise = noise.get_method(noise_method) # advect the previous precipitation fields to the same position with the # most recent one (i.e. 
transform them into the Lagrangian coordinates) R = R[-(ar_order + 1):, :, :].copy() extrap_kwargs = extrap_kwargs.copy() extrap_kwargs["xy_coords"] = xy_coords res = [] f = lambda R, i: extrapolator_method(R[i, :, :], V, ar_order - i, "min", ** extrap_kwargs)[-1] for i in range(ar_order): if not dask_imported: R[i, :, :] = f(R, i) else: res.append(dask.delayed(f)(R, i)) if dask_imported: num_workers_ = len(res) if num_workers > len(res) else num_workers R = np.stack( list(dask.compute(*res, num_workers=num_workers_)) + [R[-1, :, :]]) if mask_method == "incremental": # get mask parameters mask_rim = mask_kwargs.get("mask_rim", 10) mask_f = mask_kwargs.get("mask_f", 1.0) # initialize the structuring element struct = scipy.ndimage.generate_binary_structure(2, 1) # iterate it to expand it nxn n = mask_f * timestep / kmperpixel struct = scipy.ndimage.iterate_structure(struct, int((n - 1) / 2.0)) noise_kwargs.update({ "win_size": win_size, "overlap": overlap, "war_thr": war_thr, "rm_rdisc": True, "donorm": True, }) print("Estimating nowcast parameters...", end="") def estimator(R, parsglob=None, idxm=None, idxn=None): pars = {} # initialize the perturbation generator for the precipitation field if noise_method is not None and parsglob is None: P = init_noise(R, fft_method=fft_method, **noise_kwargs) else: P = None pars["P"] = P # initialize the band-pass filter if parsglob is None: filter = filter_method(R.shape[1:], n_cascade_levels, **filter_kwargs) pars["filter"] = filter else: pars["filter"] = None # compute the cascade decompositions of the input precipitation fields if parsglob is None: R_d = [] for i in range(ar_order + 1): R_d_ = decomp_method( R[i, :, :], filter, fft_method=fft_method, normalize=True, compute_stats=True, ) R_d.append(R_d_) R_d_ = None # normalize the cascades and rearrange them into a four-dimensional array # of shape (n_cascade_levels,ar_order+1,m,n) for the autoregressive model if parsglob is None: R_c = nowcast_utils.stack_cascades(R_d, n_cascade_levels) mu = R_d[-1]["means"] sigma = R_d[-1]["stds"] R_d = None else: R_c = parsglob["R_c"][0][:, :, idxm.item(0):idxm.item(1), idxn.item(0):idxn.item(1)].copy() mu = np.mean(R_c, axis=(2, 3)) sigma = np.std(R_c, axis=(2, 3)) R_c = (R_c - mu[:, :, None, None]) / sigma[:, :, None, None] mu = mu[:, -1] sigma = sigma[:, -1] pars["mu"] = mu pars["sigma"] = sigma # compute lag-l temporal autocorrelation coefficients for each cascade level GAMMA = np.empty((n_cascade_levels, ar_order)) for i in range(n_cascade_levels): R_c_ = np.stack([R_c[i, j, :, :] for j in range(ar_order + 1)]) GAMMA[i, :] = correlation.temporal_autocorrelation(R_c_) R_c_ = None if ar_order == 2: # adjust the local lag-2 correlation coefficient to ensure that the AR(p) # process is stationary for i in range(n_cascade_levels): GAMMA[i, 1] = autoregression.adjust_lag2_corrcoef2( GAMMA[i, 0], GAMMA[i, 1]) # estimate the parameters of the AR(p) model from the autocorrelation # coefficients PHI = np.empty((n_cascade_levels, ar_order + 1)) for i in range(n_cascade_levels): PHI[i, :] = autoregression.estimate_ar_params_yw(GAMMA[i, :]) pars["PHI"] = PHI # stack the cascades into a five-dimensional array containing all ensemble # members R_c = [R_c.copy() for i in range(n_ens_members)] pars["R_c"] = R_c if mask_method is not None and parsglob is None: MASK_prec = R[-1, :, :] >= R_thr if mask_method == "incremental": # initialize precip mask for each member MASK_prec = _compute_incremental_mask(MASK_prec, struct, mask_rim) MASK_prec = [MASK_prec.copy() for j in 
range(n_ens_members)] else: MASK_prec = None pars["MASK_prec"] = MASK_prec return pars # prepare windows M, N = R.shape[1:] n_windows_M = np.ceil(1.0 * M / win_size[0]).astype(int) n_windows_N = np.ceil(1.0 * N / win_size[1]).astype(int) idxm = np.zeros((2, 1), dtype=int) idxn = np.zeros((2, 1), dtype=int) sys.stdout.flush() if measure_time: starttime = time.time() # compute global parameters to be used as defaults parsglob = estimator(R) # loop windows if n_windows_M > 1 or n_windows_N > 1: war = np.empty((n_windows_M, n_windows_N)) PHI = np.empty( (n_windows_M, n_windows_N, n_cascade_levels, ar_order + 1)) mu = np.empty((n_windows_M, n_windows_N, n_cascade_levels)) sigma = np.empty((n_windows_M, n_windows_N, n_cascade_levels)) ff = [] rc = [] pp = [] mm = [] for m in range(n_windows_M): ff_ = [] pp_ = [] rc_ = [] mm_ = [] for n in range(n_windows_N): # compute indices of local window idxm[0] = int( np.max((m * win_size[0] - overlap * win_size[0], 0))) idxm[1] = int( np.min((idxm[0] + win_size[0] + overlap * win_size[0], M))) idxn[0] = int( np.max((n * win_size[1] - overlap * win_size[1], 0))) idxn[1] = int( np.min((idxn[0] + win_size[1] + overlap * win_size[1], N))) mask = np.zeros((M, N), dtype=bool) mask[idxm.item(0):idxm.item(1), idxn.item(0):idxn.item(1)] = True R_ = R[:, idxm.item(0):idxm.item(1), idxn.item(0):idxn.item(1)] war[m, n] = np.sum(R_[-1, :, :] >= R_thr) / R_[-1, :, :].size if war[m, n] > war_thr: # estimate local parameters pars = estimator(R, parsglob, idxm, idxn) ff_.append(pars["filter"]) pp_.append(pars["P"]) rc_.append(pars["R_c"]) mm_.append(pars["MASK_prec"]) mu[m, n, :] = pars["mu"] sigma[m, n, :] = pars["sigma"] PHI[m, n, :, :] = pars["PHI"] else: # dry window ff_.append(None) pp_.append(None) rc_.append(None) mm_.append(None) ff.append(ff_) pp.append(pp_) rc.append(rc_) mm.append(mm_) # remove unnecessary variables ff_ = None pp_ = None rc_ = None mm_ = None pars = None if measure_time: print("%.2f seconds." % (time.time() - starttime)) else: print(" done.") # initialize the random generators if noise_method is not None: randgen_prec = [] randgen_motion = [] np.random.seed(seed) for j in range(n_ens_members): rs = np.random.RandomState(seed) randgen_prec.append(rs) seed = rs.randint(0, high=1e9) rs = np.random.RandomState(seed) randgen_motion.append(rs) seed = rs.randint(0, high=1e9) if vel_pert_method is not None: init_vel_noise, generate_vel_noise = noise.get_method(vel_pert_method) # initialize the perturbation generators for the motion field vps = [] for j in range(n_ens_members): kwargs = { "randstate": randgen_motion[j], "p_par": vp_par, "p_perp": vp_perp, } vp_ = init_vel_noise(V, 1.0 / kmperpixel, timestep, **kwargs) vps.append(vp_) D = [None for j in range(n_ens_members)] R_f = [[] for j in range(n_ens_members)] if measure_time: init_time = time.time() - starttime_init R = R[-1, :, :] print("Starting nowcast computation.") if measure_time: starttime_mainloop = time.time() # iterate each time step for t in range(n_timesteps): print("Computing nowcast for time step %d... 
" % (t + 1), end="") sys.stdout.flush() if measure_time: starttime = time.time() # iterate each ensemble member def worker(j): # first the global step if noise_method is not None: # generate noise field EPS = generate_noise(parsglob["P"], randstate=randgen_prec[j], fft_method=fft_method) # decompose the noise field into a cascade EPS_d = decomp_method( EPS, parsglob["filter"], fft_method=fft_method, normalize=True, compute_stats=True, ) else: EPS_d = None # iterate the AR(p) model for each cascade level R_c = parsglob["R_c"][j].copy() if R_c.shape[1] >= ar_order: R_c = R_c[:, -ar_order:, :, :].copy() for i in range(n_cascade_levels): # normalize the noise cascade if EPS_d is not None: EPS_ = (EPS_d["cascade_levels"][i, :, :] - EPS_d["means"][i]) / EPS_d["stds"][i] else: EPS_ = None # apply AR(p) process to cascade level R_c[i, :, :, :] = autoregression.iterate_ar_model( R_c[i, :, :, :], parsglob["PHI"][i, :], eps=EPS_) EPS_ = None parsglob["R_c"][j] = R_c.copy() EPS = None # compute the recomposed precipitation field(s) from the cascades # obtained from the AR(p) model(s) R_c_ = _recompose_cascade(R_c, parsglob["mu"], parsglob["sigma"]) R_c = None # then the local steps if n_windows_M > 1 or n_windows_N > 1: idxm = np.zeros((2, 1), dtype=int) idxn = np.zeros((2, 1), dtype=int) R_l = np.zeros((M, N), dtype=float) M_s = np.zeros((M, N), dtype=float) for m in range(n_windows_M): for n in range(n_windows_N): # compute indices of local window idxm[0] = int( np.max( (m * win_size[0] - overlap * win_size[0], 0))) idxm[1] = int( np.min( (idxm[0] + win_size[0] + overlap * win_size[0], M))) idxn[0] = int( np.max( (n * win_size[1] - overlap * win_size[1], 0))) idxn[1] = int( np.min( (idxn[0] + win_size[1] + overlap * win_size[1], N))) # build localization mask mask = _get_mask((M, N), idxm, idxn) mask_l = mask[idxm.item(0):idxm.item(1), idxn.item(0):idxn.item(1)] M_s += mask # skip if dry if war[m, n] > war_thr: R_c = rc[m][n][j].copy() if R_c.shape[1] >= ar_order: R_c = R_c[:, -ar_order:, :, :] if noise_method is not None: # extract noise field EPS_d_l = EPS_d[ "cascade_levels"][:, idxm.item(0):idxm.item(1 ), idxn.item(0):idxn. 
item(1), ].copy() mu_ = np.mean(EPS_d_l, axis=(1, 2)) sigma_ = np.std(EPS_d_l, axis=(1, 2)) else: EPS_d_l = None # iterate the AR(p) model for each cascade level for i in range(n_cascade_levels): # normalize the noise cascade if EPS_d_l is not None: EPS_ = (EPS_d_l[i, :, :] - mu_[i, None, None]) / sigma_[i, None, None] else: EPS_ = None # apply AR(p) process to cascade level R_c[i, :, :, :] = autoregression.iterate_ar_model( R_c[i, :, :, :], PHI[m, n, i, :], eps=EPS_) EPS_ = None rc[m][n][j] = R_c.copy() EPS_d_l = mu_ = sigma_ = None # compute the recomposed precipitation field(s) from the cascades # obtained from the AR(p) model(s) mu_ = mu[m, n, :] sigma_ = sigma[m, n, :] R_c = [((R_c[i, -1, :, :] * sigma_[i]) + mu_[i]) * parsglob["sigma"][i] + parsglob["mu"][i] for i in range(len(mu_))] R_l_ = np.sum(np.stack(R_c), axis=0) R_c = mu_ = sigma_ = None # R_l_ = _recompose_cascade(R_c[:, :, :], mu[m, n, :], sigma[m, n, :]) else: R_l_ = R_c_[idxm.item(0):idxm.item(1), idxn.item(0):idxn.item(1)].copy() if probmatching_method == "cdf": # adjust the CDF of the forecast to match the most recently # observed precipitation field R_ = R[idxm.item(0):idxm.item(1), idxn.item(0):idxn.item(1)].copy() R_l_ = probmatching.nonparam_match_empirical_cdf( R_l_, R_) R_ = None R_l[idxm.item(0):idxm.item(1), idxn.item(0):idxn.item(1)] += (R_l_ * mask_l) R_l_ = None ind = M_s > 0 R_l[ind] *= 1 / M_s[ind] R_l[~ind] = R_min R_c_ = R_l.copy() R_l = None if probmatching_method == "cdf": # adjust the CDF of the forecast to match the most recently # observed precipitation field R_c_[R_c_ < R_thr] = R_min R_c_ = probmatching.nonparam_match_empirical_cdf(R_c_, R) if mask_method is not None: # apply the precipitation mask to prevent generation of new # precipitation into areas where it was not originally # observed if mask_method == "incremental": MASK_prec = parsglob["MASK_prec"][j].copy() R_c_ = R_c_.min() + (R_c_ - R_c_.min()) * MASK_prec MASK_prec = None if mask_method == "incremental": parsglob["MASK_prec"][j] = _compute_incremental_mask( R_c_ >= R_thr, struct, mask_rim) # compute the perturbed motion field if vel_pert_method is not None: V_ = V + generate_vel_noise(vps[j], (t + 1) * timestep) else: V_ = V # advect the recomposed precipitation field to obtain the forecast # for time step t extrap_kwargs.update({ "displacement_prev": D[j], "return_displacement": True }) R_f_, D_ = extrapolator_method(R_c_, V_, 1, **extrap_kwargs) D[j] = D_ R_f_ = R_f_[0] R_f_[R_f_ < R_thr] = R_min return R_f_ res = [] for j in range(n_ens_members): if not dask_imported or n_ens_members == 1: res.append(worker(j)) else: res.append(dask.delayed(worker)(j)) R_f_ = (dask.compute(*res, num_workers=num_ensemble_workers) if dask_imported and n_ens_members > 1 else res) res = None if measure_time: print("%.2f seconds." % (time.time() - starttime)) else: print("done.") if callback is not None: callback(np.stack(R_f_)) R_f_ = None if return_output: for j in range(n_ens_members): R_f[j].append(R_f_[j]) if measure_time: mainloop_time = time.time() - starttime_mainloop if return_output: outarr = np.stack([np.stack(R_f[j]) for j in range(n_ens_members)]) if measure_time: return outarr, init_time, mainloop_time else: return outarr else: return None
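The per-member fan-out in the time loop above reduces to the following pattern; the worker body here is a placeholder, not the nowcast update. One delayed task is created per ensemble member and all of them are evaluated together with a bounded worker count.

import dask
import numpy as np

n_ens_members = 4
num_ensemble_workers = 2


def worker(j):
    # stand-in for the per-member advection / AR(p) update
    return np.full((8, 8), float(j))


res = [dask.delayed(worker)(j) for j in range(n_ens_members)]
R_f_ = dask.compute(*res, num_workers=num_ensemble_workers)
assert len(R_f_) == n_ens_members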
def test_wait_on_many(layers):
    t1, t2, cnt = demo_tuples(layers)
    out = wait_on(t1, {"x": [t2]})
    assert dask.compute(*out, scheduler="sync") == ((1, 2, 3), {"x": [(4, 5)]})
    assert cnt.n == 5
# subprocess.Popen(['mpirun', '--np', '4', 'dask-mpi'], stdin=subprocess.DEVNULL)

# models to run
model_names = [
    "casing", "background", "permeable", "approx_casing",
    "approx_permeable", "approx_permeable2"
]


# Set up the simulation
@dask.delayed
def run_simulation(m):
    sim = casingSimulations.run.SimulationTDEM(
        modelParameters=m + ".json",
        meshGenerator='MeshParameters.json',
        srcList='sources.json',
        fields_filename=m + "_fields.npy")
    fields = sim.run(verbose=True)
    return fields[:, '{}Solution'.format(sim.formulation), :]


f = {}
for m in model_names:
    f[m] = run_simulation(m)

dask.compute(f, num_workers=3)  # , scheduler='distributed')

# # run the simulation
# fields = sim.run(verbose=True)
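dask.compute traverses the dict f above and returns it with each delayed value replaced by its result. A self-contained illustration with a stand-in simulation function:

import dask


@dask.delayed
def fake_simulation(name):
    return "fields for " + name


f = {m: fake_simulation(m) for m in ["casing", "background"]}
(results,) = dask.compute(f, num_workers=2)
# results == {'casing': 'fields for casing', 'background': 'fields for background'}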
def persister(bag):
    [bag] = persist_with_trace(bag)
    return dask.compute(bag)
def run_hpo(daskClient, nTimesteps, nParticles, nWorkers, paramRanges, trainData_cDF, trainLabels_cDF, testData_cDF, testLabels_cDF, randomSeed=0): pandasTestLabels = testLabels_cDF.to_pandas() if daskClient is not None: scatteredData_future = daskClient.scatter( [trainData_cDF, trainLabels_cDF, testData_cDF, testLabels_cDF], broadcast=True) trainData_cDF_future = scatteredData_future[0] trainLabels_cDF_future = scatteredData_future[1] testData_cDF_future = scatteredData_future[2] testLabels_cDF_future = scatteredData_future[3] particles, velocities, accuracies, bestParticleIndex, \ globalBestParticleParams, particleBoostingRounds, particleColors = initalize_hpo ( nTimesteps = nTimesteps, nParticles = nParticles, nWorkers = nWorkers, paramRanges = paramRanges) globalBestAccuracy = 0 trainingTimes = np.zeros((nTimesteps, nParticles)) startTime = time.time() predictionHistory = np.zeros( (nTimesteps, nParticles, testData_cDF.shape[0])) for iTimestep in range(0, nTimesteps): if daskClient is not None: # [ delayed ] train xgboost models on train data delayedParticleTrain = [ delayed(train_model_hpo)(trainData_cDF_future, trainLabels_cDF_future, testData_cDF_future, testLabels_cDF_future, particles[iTimestep, iParticle, :], iParticle, iTimestep) for iParticle in range(nParticles) ] # [ delayed ] determine number of trees/training-rounds returned early stopping -- used to set particle sizes delayedParticleRounds = [ iParticle[0].best_iteration for iParticle in delayedParticleTrain ] # [delayed ] eval trained models on test/validation data delayedParticlePredictions = [ delayed(test_model_hpo)(iParticle[0], iParticle[1], testData_cDF_future, testLabels_cDF_future) for iParticle in delayedParticleTrain ] # execute delayed particlePredictions = dask.compute(delayedParticlePredictions)[0] for iParticle in range(nParticles): predictionHistory[ iTimestep, iParticle, :] = particlePredictions[iParticle][0] #import pdb; pdb.set_trace() # compute accuracies of predictions accuracies[iTimestep, :] = [ accuracy_score(pandasTestLabels, iParticle[0]) for iParticle in particlePredictions ] particleBoostingRounds[iTimestep, :] = [ iParticle[1] for iParticle in particlePredictions ] trainingTimes[iTimestep, :] = [ iParticle[2] for iParticle in particlePredictions ] del particlePredictions else: for iParticle in range(nParticles): trainedModels, _ = train_model_hpo( pandasTrainData, pandasTrainLabels, particles[iTimestep, iParticle, :], iParticle, iTimestep) predictions, _ = test_model_hpo(trainedModels, pandasTestData, pandasTestLabels) accuracies[iTimestep, iParticle] = accuracy_score(pandasTestLabels, predictions) bestParticleIndex[iTimestep + 1] = np.argmax(accuracies[iTimestep, :]) currentBestAccuracy = np.max(accuracies[iTimestep, :]) print('@ hpo timestep : {}, best accuracy is {}'.format( iTimestep, np.max(accuracies[iTimestep, :]))) if iTimestep + 1 < nTimesteps: if currentBestAccuracy > globalBestAccuracy: print('\t updating best GLOBAL accuracy') globalBestAccuracy = currentBestAccuracy globalBestParticleParams[iTimestep + 1] = particles[ iTimestep, bestParticleIndex[iTimestep + 1], :] else: globalBestParticleParams[ iTimestep + 1] = globalBestParticleParams[iTimestep].copy() particles[iTimestep + 1, :, :], velocities[ iTimestep + 1, :, :] = update_particles( paramRanges, particles[iTimestep, :, :].copy(), velocities[iTimestep, :, :].copy(), bestParticleIndex[iTimestep + 1], globalBestParticleParams[iTimestep + 1], randomSeed=iTimestep) particleSizes = particleBoostingRounds / np.max( 
particleBoostingRounds) * 10 + 2 elapsedTime = time.time() - startTime print('elapsed time : {}'.format(elapsedTime)) return accuracies, particles, velocities, particleSizes, particleColors, bestParticleIndex, particleBoostingRounds, trainingTimes, predictionHistory, elapsedTime
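Condensed sketch of the delayed train-then-score chain built per particle inside run_hpo; train and score below are placeholders for train_model_hpo and test_model_hpo. The whole swarm is evaluated with one dask.compute call per timestep.

import dask
from dask import delayed


def train(params):
    return {"params": params}            # stand-in for train_model_hpo


def score(model):
    return 1.0 - abs(model["params"])    # stand-in for test_model_hpo


particles = [0.1, 0.2, 0.3]
delayed_scores = [delayed(score)(delayed(train)(p)) for p in particles]
accuracies = dask.compute(delayed_scores)[0]  # roughly [0.9, 0.8, 0.7]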
def forecast( R, V, timesteps, n_ens_members=24, n_cascade_levels=6, R_thr=None, kmperpixel=None, timestep=None, extrap_method="semilagrangian", decomp_method="fft", bandpass_filter_method="gaussian", noise_method="nonparametric", noise_stddev_adj=None, ar_order=2, vel_pert_method="bps", conditional=False, probmatching_method="cdf", mask_method="incremental", callback=None, return_output=True, seed=None, num_workers=1, fft_method="numpy", domain="spatial", extrap_kwargs=None, filter_kwargs=None, noise_kwargs=None, vel_pert_kwargs=None, mask_kwargs=None, measure_time=False, ): """Generate a nowcast ensemble by using the Short-Term Ensemble Prediction System (STEPS) method. Parameters ---------- R: array-like Array of shape (ar_order+1,m,n) containing the input precipitation fields ordered by timestamp from oldest to newest. The time steps between the inputs are assumed to be regular. V: array-like Array of shape (2,m,n) containing the x- and y-components of the advection field. The velocities are assumed to represent one time step between the inputs. All values are required to be finite. timesteps: int or list of floats Number of time steps to forecast or a list of time steps for which the forecasts are computed (relative to the input time step). The elements of the list are required to be in ascending order. n_ens_members: int, optional The number of ensemble members to generate. n_cascade_levels: int, optional The number of cascade levels to use. R_thr: float, optional Specifies the threshold value for minimum observable precipitation intensity. Required if mask_method is not None or conditional is True. kmperpixel: float, optional Spatial resolution of the input data (kilometers/pixel). Required if vel_pert_method is not None or mask_method is 'incremental'. timestep: float, optional Time step of the motion vectors (minutes). Required if vel_pert_method is not None or mask_method is 'incremental'. extrap_method: str, optional Name of the extrapolation method to use. See the documentation of pysteps.extrapolation.interface. decomp_method: {'fft'}, optional Name of the cascade decomposition method to use. See the documentation of pysteps.cascade.interface. bandpass_filter_method: {'gaussian', 'uniform'}, optional Name of the bandpass filter method to use with the cascade decomposition. See the documentation of pysteps.cascade.interface. noise_method: {'parametric','nonparametric','ssft','nested',None}, optional Name of the noise generator to use for perturbating the precipitation field. See the documentation of pysteps.noise.interface. If set to None, no noise is generated. noise_stddev_adj: {'auto','fixed',None}, optional Optional adjustment for the standard deviations of the noise fields added to each cascade level. This is done to compensate incorrect std. dev. estimates of casace levels due to presence of no-rain areas. 'auto'=use the method implemented in pysteps.noise.utils.compute_noise_stddev_adjs. 'fixed'= use the formula given in :cite:`BPS2006` (eq. 6), None=disable noise std. dev adjustment. ar_order: int, optional The order of the autoregressive model to use. Must be >= 1. vel_pert_method: {'bps',None}, optional Name of the noise generator to use for perturbing the advection field. See the documentation of pysteps.noise.interface. If set to None, the advection field is not perturbed. conditional: bool, optional If set to True, compute the statistics of the precipitation field conditionally by excluding pixels where the values are below the threshold R_thr. 
mask_method: {'obs','sprog','incremental',None}, optional The method to use for masking no precipitation areas in the forecast field. The masked pixels are set to the minimum value of the observations. 'obs' = apply R_thr to the most recently observed precipitation intensity field, 'sprog' = use the smoothed forecast field from S-PROG, where the AR(p) model has been applied, 'incremental' = iteratively buffer the mask with a certain rate (currently it is 1 km/min), None=no masking. probmatching_method: {'cdf','mean',None}, optional Method for matching the statistics of the forecast field with those of the most recently observed one. 'cdf'=map the forecast CDF to the observed one, 'mean'=adjust only the conditional mean value of the forecast field in precipitation areas, None=no matching applied. Using 'mean' requires that mask_method is not None. callback: function, optional Optional function that is called after computation of each time step of the nowcast. The function takes one argument: a three-dimensional array of shape (n_ens_members,h,w), where h and w are the height and width of the input field R, respectively. This can be used, for instance, writing the outputs into files. return_output: bool, optional Set to False to disable returning the outputs as numpy arrays. This can save memory if the intermediate results are written to output files using the callback function. seed: int, optional Optional seed number for the random generators. num_workers: int, optional The number of workers to use for parallel computation. Applicable if dask is enabled or pyFFTW is used for computing the FFT. When num_workers>1, it is advisable to disable OpenMP by setting the environment variable OMP_NUM_THREADS to 1. This avoids slowdown caused by too many simultaneous threads. fft_method: str, optional A string defining the FFT method to use (see utils.fft.get_method). Defaults to 'numpy' for compatibility reasons. If pyFFTW is installed, the recommended method is 'pyfftw'. domain: {"spatial", "spectral"} If "spatial", all computations are done in the spatial domain (the classical STEPS model). If "spectral", the AR(2) models and stochastic perturbations are applied directly in the spectral domain to reduce memory footprint and improve performance :cite:`PCH2019b`. extrap_kwargs: dict, optional Optional dictionary containing keyword arguments for the extrapolation method. See the documentation of pysteps.extrapolation. filter_kwargs: dict, optional Optional dictionary containing keyword arguments for the filter method. See the documentation of pysteps.cascade.bandpass_filters.py. noise_kwargs: dict, optional Optional dictionary containing keyword arguments for the initializer of the noise generator. See the documentation of pysteps.noise.fftgenerators. vel_pert_kwargs: dict, optional Optional dictionary containing keyword arguments 'p_par' and 'p_perp' for the initializer of the velocity perturbator. The choice of the optimal parameters depends on the domain and the used optical flow method. 
Default parameters from :cite:`BPS2006`: p_par = [10.88, 0.23, -7.68] p_perp = [5.76, 0.31, -2.72] Parameters fitted to the data (optical flow/domain): darts/fmi: p_par = [13.71259667, 0.15658963, -16.24368207] p_perp = [8.26550355, 0.17820458, -9.54107834] darts/mch: p_par = [24.27562298, 0.11297186, -27.30087471] p_perp = [-7.80797846e+01, -3.38641048e-02, 7.56715304e+01] darts/fmi+mch: p_par = [16.55447057, 0.14160448, -19.24613059] p_perp = [14.75343395, 0.11785398, -16.26151612] lucaskanade/fmi: p_par = [2.20837526, 0.33887032, -2.48995355] p_perp = [2.21722634, 0.32359621, -2.57402761] lucaskanade/mch: p_par = [2.56338484, 0.3330941, -2.99714349] p_perp = [1.31204508, 0.3578426, -1.02499891] lucaskanade/fmi+mch: p_par = [2.31970635, 0.33734287, -2.64972861] p_perp = [1.90769947, 0.33446594, -2.06603662] vet/fmi: p_par = [0.25337388, 0.67542291, 11.04895538] p_perp = [0.02432118, 0.99613295, 7.40146505] vet/mch: p_par = [0.5075159, 0.53895212, 7.90331791] p_perp = [0.68025501, 0.41761289, 4.73793581] vet/fmi+mch: p_par = [0.29495222, 0.62429207, 8.6804131 ] p_perp = [0.23127377, 0.59010281, 5.98180004] fmi=Finland, mch=Switzerland, fmi+mch=both pooled into the same data set The above parameters have been fitten by using run_vel_pert_analysis.py and fit_vel_pert_params.py located in the scripts directory. See pysteps.noise.motion for additional documentation. mask_kwargs: dict Optional dictionary containing mask keyword arguments 'mask_f' and 'mask_rim', the factor defining the the mask increment and the rim size, respectively. The mask increment is defined as mask_f*timestep/kmperpixel. measure_time: bool If set to True, measure, print and return the computation time. Returns ------- out: ndarray If return_output is True, a four-dimensional array of shape (n_ens_members,num_timesteps,m,n) containing a time series of forecast precipitation fields for each ensemble member. Otherwise, a None value is returned. The time series starts from t0+timestep, where timestep is taken from the input precipitation fields R. If measure_time is True, the return value is a three-element tuple containing the nowcast array, the initialization time of the nowcast generator and the time used in the main loop (seconds). 
    See also
    --------
    pysteps.extrapolation.interface, pysteps.cascade.interface,
    pysteps.noise.interface, pysteps.noise.utils.compute_noise_stddev_adjs

    References
    ----------
    :cite:`Seed2003`, :cite:`BPS2006`, :cite:`SPN2013`, :cite:`PCH2019b`
    """
    _check_inputs(R, V, timesteps, ar_order)

    if extrap_kwargs is None:
        extrap_kwargs = dict()

    if filter_kwargs is None:
        filter_kwargs = dict()

    if noise_kwargs is None:
        noise_kwargs = dict()

    if vel_pert_kwargs is None:
        vel_pert_kwargs = dict()

    if mask_kwargs is None:
        mask_kwargs = dict()

    if np.any(~np.isfinite(V)):
        raise ValueError("V contains non-finite values")

    if mask_method not in ["obs", "sprog", "incremental", None]:
        raise ValueError(
            "unknown mask method %s: must be 'obs', 'sprog' or 'incremental' or None"
            % mask_method
        )

    if conditional and R_thr is None:
        raise ValueError("conditional=True but R_thr is not set")

    if mask_method is not None and R_thr is None:
        raise ValueError("mask_method!=None but R_thr=None")

    if noise_stddev_adj not in ["auto", "fixed", None]:
        raise ValueError(
            "unknown noise_std_dev_adj method %s: must be 'auto', 'fixed', or None"
            % noise_stddev_adj
        )

    if kmperpixel is None:
        if vel_pert_method is not None:
            raise ValueError("vel_pert_method is set but kmperpixel=None")
        if mask_method == "incremental":
            raise ValueError("mask_method='incremental' but kmperpixel=None")

    if timestep is None:
        if vel_pert_method is not None:
            raise ValueError("vel_pert_method is set but timestep=None")
        if mask_method == "incremental":
            raise ValueError("mask_method='incremental' but timestep=None")

    print("Computing STEPS nowcast:")
    print("------------------------")
    print("")

    print("Inputs:")
    print("-------")
    print("input dimensions: %dx%d" % (R.shape[1], R.shape[2]))
    if kmperpixel is not None:
        print("km/pixel: %g" % kmperpixel)
    if timestep is not None:
        print("time step: %d minutes" % timestep)
    print("")

    print("Methods:")
    print("--------")
    print("extrapolation: %s" % extrap_method)
    print("bandpass filter: %s" % bandpass_filter_method)
    print("decomposition: %s" % decomp_method)
    print("noise generator: %s" % noise_method)
    print("noise adjustment: %s" % ("yes" if noise_stddev_adj else "no"))
    print("velocity perturbator: %s" % vel_pert_method)
    print("conditional statistics: %s" % ("yes" if conditional else "no"))
    print("precip. mask method: %s" % mask_method)
    print("probability matching: %s" % probmatching_method)
    print("FFT method: %s" % fft_method)
    print("domain: %s" % domain)
    print("")

    print("Parameters:")
    print("-----------")
    if isinstance(timesteps, int):
        print("number of time steps: %d" % timesteps)
    else:
        print("time steps: %s" % timesteps)
    print("ensemble size: %d" % n_ens_members)
    print("parallel threads: %d" % num_workers)
    print("number of cascade levels: %d" % n_cascade_levels)
    print("order of the AR(p) model: %d" % ar_order)

    if vel_pert_method == "bps":
        vp_par = vel_pert_kwargs.get("p_par", noise.motion.get_default_params_bps_par())
        vp_perp = vel_pert_kwargs.get(
            "p_perp", noise.motion.get_default_params_bps_perp()
        )
        print(
            "velocity perturbations, parallel: %g,%g,%g"
            % (vp_par[0], vp_par[1], vp_par[2])
        )
        print(
            "velocity perturbations, perpendicular: %g,%g,%g"
            % (vp_perp[0], vp_perp[1], vp_perp[2])
        )

    if conditional or mask_method is not None:
        print("precip. intensity threshold: %g" % R_thr)

    num_ensemble_workers = n_ens_members if num_workers > n_ens_members else num_workers

    if measure_time:
        starttime_init = time.time()

    fft = utils.get_method(fft_method, shape=R.shape[1:], n_threads=num_workers)

    M, N = R.shape[1:]

    # initialize the band-pass filter
    filter_method = cascade.get_method(bandpass_filter_method)
    filter = filter_method((M, N), n_cascade_levels, **filter_kwargs)

    decomp_method, recomp_method = cascade.get_method(decomp_method)

    extrapolator_method = extrapolation.get_method(extrap_method)

    x_values, y_values = np.meshgrid(np.arange(R.shape[2]), np.arange(R.shape[1]))

    xy_coords = np.stack([x_values, y_values])

    R = R[-(ar_order + 1) :, :, :].copy()

    # determine the domain mask from non-finite values
    domain_mask = np.logical_or.reduce(
        [~np.isfinite(R[i, :]) for i in range(R.shape[0])]
    )

    # determine the precipitation threshold mask
    if conditional:
        MASK_thr = np.logical_and.reduce(
            [R[i, :, :] >= R_thr for i in range(R.shape[0])]
        )
    else:
        MASK_thr = None

    # advect the previous precipitation fields to the same position with the
    # most recent one (i.e. transform them into the Lagrangian coordinates)
    extrap_kwargs = extrap_kwargs.copy()
    extrap_kwargs["xy_coords"] = xy_coords
    extrap_kwargs["allow_nonfinite_values"] = True
    res = list()

    def f(R, i):
        return extrapolator_method(R[i, :, :], V, ar_order - i, "min", **extrap_kwargs)[
            -1
        ]

    for i in range(ar_order):
        if not DASK_IMPORTED:
            R[i, :, :] = f(R, i)
        else:
            res.append(dask.delayed(f)(R, i))

    if DASK_IMPORTED:
        num_workers_ = len(res) if num_workers > len(res) else num_workers
        R = np.stack(list(dask.compute(*res, num_workers=num_workers_)) + [R[-1, :, :]])

    # replace non-finite values with the minimum value
    R = R.copy()
    for i in range(R.shape[0]):
        R[i, ~np.isfinite(R[i, :])] = np.nanmin(R[i, :])

    if noise_method is not None:
        # get methods for perturbations
        init_noise, generate_noise = noise.get_method(noise_method)

        # initialize the perturbation generator for the precipitation field
        pp = init_noise(R, fft_method=fft, **noise_kwargs)

        if noise_stddev_adj == "auto":
            print("Computing noise adjustment coefficients... ", end="", flush=True)
            if measure_time:
                starttime = time.time()

            R_min = np.min(R)
            noise_std_coeffs = noise.utils.compute_noise_stddev_adjs(
                R[-1, :, :],
                R_thr,
                R_min,
                filter,
                decomp_method,
                pp,
                generate_noise,
                20,
                conditional=True,
                num_workers=num_workers,
            )

            if measure_time:
                print("%.2f seconds." % (time.time() - starttime))
            else:
                print("done.")
        elif noise_stddev_adj == "fixed":
            f = lambda k: 1.0 / (0.75 + 0.09 * k)
            noise_std_coeffs = [f(k) for k in range(1, n_cascade_levels + 1)]
        else:
            noise_std_coeffs = np.ones(n_cascade_levels)

        if noise_stddev_adj is not None:
            print("noise std. dev. coeffs: %s" % str(noise_std_coeffs))

    # compute the cascade decompositions of the input precipitation fields
    R_d = []
    for i in range(ar_order + 1):
        R_ = decomp_method(
            R[i, :, :],
            filter,
            mask=MASK_thr,
            fft_method=fft,
            output_domain=domain,
            normalize=True,
            compute_stats=True,
            compact_output=True,
        )
        R_d.append(R_)

    # normalize the cascades and rearrange them into a four-dimensional array
    # of shape (n_cascade_levels,ar_order+1,m,n) for the autoregressive model
    R_c = nowcast_utils.stack_cascades(R_d, n_cascade_levels)

    R_d = R_d[-1]
    R_d = [R_d.copy() for j in range(n_ens_members)]

    # compute lag-l temporal autocorrelation coefficients for each cascade level
    GAMMA = np.empty((n_cascade_levels, ar_order))
    for i in range(n_cascade_levels):
        GAMMA[i, :] = correlation.temporal_autocorrelation(R_c[i], mask=MASK_thr)

    nowcast_utils.print_corrcoefs(GAMMA)

    if ar_order == 2:
        # adjust the lag-2 correlation coefficient to ensure that the AR(p)
        # process is stationary
        for i in range(n_cascade_levels):
            GAMMA[i, 1] = autoregression.adjust_lag2_corrcoef2(GAMMA[i, 0], GAMMA[i, 1])

    # estimate the parameters of the AR(p) model from the autocorrelation
    # coefficients
    PHI = np.empty((n_cascade_levels, ar_order + 1))
    for i in range(n_cascade_levels):
        PHI[i, :] = autoregression.estimate_ar_params_yw(GAMMA[i, :])

    nowcast_utils.print_ar_params(PHI)

    # discard all except the p-1 last cascades because they are not needed for
    # the AR(p) model
    R_c = [R_c[i][-ar_order:] for i in range(n_cascade_levels)]

    # stack the cascades into a list containing all ensemble members
    R_c = [
        [R_c[j].copy() for j in range(n_cascade_levels)] for i in range(n_ens_members)
    ]

    # initialize the random generators
    if noise_method is not None:
        randgen_prec = []
        randgen_motion = []
        np.random.seed(seed)
        for j in range(n_ens_members):
            rs = np.random.RandomState(seed)
            randgen_prec.append(rs)
            seed = rs.randint(0, high=1e9)
            rs = np.random.RandomState(seed)
            randgen_motion.append(rs)
            seed = rs.randint(0, high=1e9)

    if vel_pert_method is not None:
        init_vel_noise, generate_vel_noise = noise.get_method(vel_pert_method)

        # initialize the perturbation generators for the motion field
        vps = []
        for j in range(n_ens_members):
            kwargs = {
                "randstate": randgen_motion[j],
                "p_par": vp_par,
                "p_perp": vp_perp,
            }
            vp_ = init_vel_noise(V, 1.0 / kmperpixel, timestep, **kwargs)
            vps.append(vp_)

    D = [None for j in range(n_ens_members)]
    R_f = [[] for j in range(n_ens_members)]

    if probmatching_method == "mean":
        mu_0 = np.mean(R[-1, :, :][R[-1, :, :] >= R_thr])

    R_m = None

    if mask_method is not None:
        MASK_prec = R[-1, :, :] >= R_thr

        if mask_method == "obs":
            pass
        elif mask_method == "sprog":
            # compute the wet area ratio and the precipitation mask
            war = 1.0 * np.sum(MASK_prec) / (R.shape[1] * R.shape[2])
            R_m = [R_c[0][i].copy() for i in range(n_cascade_levels)]
            R_m_d = R_d[0].copy()
        elif mask_method == "incremental":
            # get mask parameters
            mask_rim = mask_kwargs.get("mask_rim", 10)
            mask_f = mask_kwargs.get("mask_f", 1.0)
            # initialize the structuring element
            struct = scipy.ndimage.generate_binary_structure(2, 1)
            # iterate it to expand it nxn
            n = mask_f * timestep / kmperpixel
            struct = scipy.ndimage.iterate_structure(struct, int((n - 1) / 2.0))
            # initialize precip mask for each member
            MASK_prec = _compute_incremental_mask(MASK_prec, struct, mask_rim)
            MASK_prec = [MASK_prec.copy() for j in range(n_ens_members)]

    if noise_method is None and R_m is None:
        R_m = [R_c[0][i].copy() for i in range(n_cascade_levels)]

    fft_objs = []
    for i in range(n_ens_members):
        fft_objs.append(utils.get_method(fft_method, shape=R.shape[1:]))

    if measure_time:
        init_time = time.time() - starttime_init

    R = R[-1, :, :]

    print("Starting nowcast computation.")

    if measure_time:
        starttime_mainloop = time.time()

    if isinstance(timesteps, int):
        timesteps = range(timesteps + 1)
        timestep_type = "int"
    else:
        original_timesteps = [0] + list(timesteps)
        timesteps = nowcast_utils.binned_timesteps(original_timesteps)
        timestep_type = "list"

    extrap_kwargs["return_displacement"] = True
    R_f_prev = [R for i in range(n_ens_members)]
    t_prev = [0.0 for j in range(n_ens_members)]
    t_total = [0.0 for j in range(n_ens_members)]

    # iterate each time step
    for t, subtimestep_idx in enumerate(timesteps):
        if timestep_type == "list":
            subtimesteps = [original_timesteps[t_] for t_ in subtimestep_idx]
        else:
            subtimesteps = [t]

        if (timestep_type == "list" and subtimesteps) or (
            timestep_type == "int" and t > 0
        ):
            is_nowcast_time_step = True
        else:
            is_nowcast_time_step = False

        if is_nowcast_time_step:
            print(
                "Computing nowcast for time step %d... " % t,
                end="",
                flush=True,
            )

            if measure_time:
                starttime = time.time()

        if noise_method is None or mask_method == "sprog":
            for i in range(n_cascade_levels):
                # use a separate AR(p) model for the non-perturbed forecast,
                # from which the mask is obtained
                R_m[i] = autoregression.iterate_ar_model(R_m[i], PHI[i, :])

            R_m_d["cascade_levels"] = [R_m[i][-1] for i in range(n_cascade_levels)]
            if domain == "spatial":
                R_m_d["cascade_levels"] = np.stack(R_m_d["cascade_levels"])
            R_m_ = recomp_method(R_m_d)
            if domain == "spectral":
                R_m_ = fft.irfft2(R_m_)

            if mask_method == "sprog":
                MASK_prec = _compute_sprog_mask(R_m_, war)

        # the nowcast iteration for each ensemble member
        def worker(j):
            if noise_method is not None:
                # generate noise field
                EPS = generate_noise(
                    pp, randstate=randgen_prec[j], fft_method=fft_objs[j], domain=domain
                )

                # decompose the noise field into a cascade
                EPS = decomp_method(
                    EPS,
                    filter,
                    fft_method=fft_objs[j],
                    input_domain=domain,
                    output_domain=domain,
                    compute_stats=True,
                    normalize=True,
                    compact_output=True,
                )
            else:
                EPS = None

            # iterate the AR(p) model for each cascade level
            for i in range(n_cascade_levels):
                # normalize the noise cascade
                if EPS is not None:
                    EPS_ = EPS["cascade_levels"][i]
                    EPS_ *= noise_std_coeffs[i]
                else:
                    EPS_ = None
                # apply AR(p) process to cascade level
                if EPS is not None or vel_pert_method is not None:
                    R_c[j][i] = autoregression.iterate_ar_model(
                        R_c[j][i], PHI[i, :], eps=EPS_
                    )
                else:
                    # use the deterministic AR(p) model computed above if
                    # perturbations are disabled
                    R_c[j][i] = R_m[i]

            EPS = None
            EPS_ = None

            # compute the recomposed precipitation field(s) from the cascades
            # obtained from the AR(p) model(s)
            R_d[j]["cascade_levels"] = [
                R_c[j][i][-1, :] for i in range(n_cascade_levels)
            ]
            if domain == "spatial":
                R_d[j]["cascade_levels"] = np.stack(R_d[j]["cascade_levels"])
            R_f_new = recomp_method(R_d[j])

            if domain == "spectral":
                R_f_new = fft_objs[j].irfft2(R_f_new)

            if mask_method is not None:
                # apply the precipitation mask to prevent generation of new
                # precipitation into areas where it was not originally
                # observed
                R_cmin = R_f_new.min()
                if mask_method == "incremental":
                    R_f_new = R_cmin + (R_f_new - R_cmin) * MASK_prec[j]
                    MASK_prec_ = R_f_new > R_cmin
                else:
                    MASK_prec_ = MASK_prec

                # Set to min value outside of mask
                R_f_new[~MASK_prec_] = R_cmin

            if probmatching_method == "cdf":
                # adjust the CDF of the forecast to match the most recently
                # observed precipitation field
                R_f_new = probmatching.nonparam_match_empirical_cdf(R_f_new, R)
            elif probmatching_method == "mean":
                MASK = R_f_new >= R_thr
                mu_fct = np.mean(R_f_new[MASK])
                R_f_new[MASK] = R_f_new[MASK] - mu_fct + mu_0

            if mask_method == "incremental":
                MASK_prec[j] = _compute_incremental_mask(
                    R_f_new >= R_thr, struct, mask_rim
                )

            R_f_new[domain_mask] = np.nan

            R_f_out = []
            extrap_kwargs_ = extrap_kwargs.copy()

            V_pert = V

            # advect the recomposed precipitation field to obtain the forecast for
            # the current time step (or subtimesteps if non-integer time steps are
            # given)
            for t_sub in subtimesteps:
                if t_sub > 0:
                    t_diff_prev_int = t_sub - int(t_sub)
                    if t_diff_prev_int > 0.0:
                        R_f_ip = (1.0 - t_diff_prev_int) * R_f_prev[
                            j
                        ] + t_diff_prev_int * R_f_new
                    else:
                        R_f_ip = R_f_prev[j]

                    t_diff_prev = t_sub - t_prev[j]
                    t_total[j] += t_diff_prev

                    # compute the perturbed motion field
                    if vel_pert_method is not None:
                        V_pert = V + generate_vel_noise(vps[j], t_total[j] * timestep)

                    extrap_kwargs_["displacement_prev"] = D[j]
                    R_f_ep, D[j] = extrapolator_method(
                        R_f_ip,
                        V_pert,
                        [t_diff_prev],
                        **extrap_kwargs_,
                    )
                    R_f_out.append(R_f_ep[0])
                    t_prev[j] = t_sub

            # advect the forecast field by one time step if no subtimesteps in the
            # current interval were found
            if not subtimesteps:
                t_diff_prev = t + 1 - t_prev[j]
                t_total[j] += t_diff_prev

                # compute the perturbed motion field
                if vel_pert_method is not None:
                    V_pert = V + generate_vel_noise(vps[j], t_total[j] * timestep)

                extrap_kwargs_["displacement_prev"] = D[j]
                _, D[j] = extrapolator_method(
                    None,
                    V_pert,
                    [t_diff_prev],
                    **extrap_kwargs_,
                )
                t_prev[j] = t + 1

            R_f_prev[j] = R_f_new

            return R_f_out

        res = []
        for j in range(n_ens_members):
            if not DASK_IMPORTED or n_ens_members == 1:
                res.append(worker(j))
            else:
                res.append(dask.delayed(worker)(j))

        R_f_ = (
            dask.compute(*res, num_workers=num_ensemble_workers)
            if DASK_IMPORTED and n_ens_members > 1
            else res
        )
        res = None

        if is_nowcast_time_step:
            if measure_time:
                print("%.2f seconds." % (time.time() - starttime))
            else:
                print("done.")

        if callback is not None:
            callback(np.stack(R_f_))
            R_f_ = None

        if return_output:
            for j in range(n_ens_members):
                R_f[j].extend(R_f_[j])

    if measure_time:
        mainloop_time = time.time() - starttime_mainloop

    if return_output:
        outarr = np.stack([np.stack(R_f[j]) for j in range(n_ens_members)])
        if measure_time:
            return outarr, init_time, mainloop_time
        else:
            return outarr
    else:
        return None
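

# Illustrative usage sketch, not part of the original module: a minimal,
# hedged example of driving the forecast above through the public pysteps
# interface, assuming it is the nowcast registered under the name "steps" in
# pysteps.nowcasts.get_method(). The synthetic gamma-distributed input fields
# and the zero motion field are placeholders chosen only to keep the snippet
# self-contained; with real data one would pass dB-transformed radar fields
# and a motion field estimated with, e.g., pysteps.motion.get_method(...).
def _example_steps_forecast():
    import numpy as np
    from pysteps import nowcasts

    rng = np.random.default_rng(42)
    # ar_order + 1 = 3 most recent fields of shape (m, n), plus a (2, m, n) motion field
    R_input = rng.gamma(shape=1.0, scale=2.0, size=(3, 128, 128))
    V_field = np.zeros((2, 128, 128))  # zero advection, purely for illustration

    steps_forecast = nowcasts.get_method("steps")
    R_ens = steps_forecast(
        R_input,
        V_field,
        timesteps=6,
        n_ens_members=4,
        R_thr=0.1,  # precip. intensity threshold, in the units of R_input
        kmperpixel=1.0,
        timestep=5,  # minutes between the input fields
        noise_method="nonparametric",
        vel_pert_method="bps",
        mask_method="incremental",
        seed=24,
    )
    # with return_output=True (the default) and integer timesteps, the result
    # is one stacked array of shape (n_ens_members, n_timesteps, m, n)
    return R_ens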