def test_from_dask_array_compat_numpy_array_1d():
    """Check that 1-d dask and numpy inputs give equivalent collections.

    ``dd.from_dask_array`` (dask input) and ``dd.from_array`` (numpy input)
    are exercised with no ``columns``, a string ``columns`` (a Series is
    expected in both cases) and a list ``columns`` (a DataFrame is expected).
    """
    x = da.ones(10, chunks=3)

    d1 = dd.from_dask_array(x)  # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name is None

    d2 = dd.from_array(x.compute())  # numpy
    # Check d2 itself; the type check used to be repeated on d1 by mistake.
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name is None

    d1 = dd.from_dask_array(x, columns='name')  # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == x.compute()).all()
    assert d1.name == 'name'

    d2 = dd.from_array(x.compute(), columns='name')  # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == x.compute()).all()
    assert d2.name == 'name'

    # passing list via columns results in DataFrame
    d1 = dd.from_dask_array(x, columns=['name'])  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(['name']))

    d2 = dd.from_array(x.compute(), columns=['name'])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(['name']))
def test_dask_dataframe(client):
    """Run ``run_create_dmatrix`` through ``xgb.dask.run`` on random frames.

    Features, labels and weights are random numpy arrays wrapped with
    ``dd.from_array`` using 25 rows per chunk.
    """
    n_rows, n_features = 100, 10
    rows_per_chunk = 25

    features = dd.from_array(
        np.random.random((n_rows, n_features)), rows_per_chunk
    )
    labels = dd.from_array(np.random.random(n_rows), rows_per_chunk)
    sample_weights = dd.from_array(np.random.random(n_rows), rows_per_chunk)

    xgb.dask.run(client, run_create_dmatrix, features, labels, sample_weights)
def test_from_array():
    """``dd.from_array`` on a 2-d array, with default and explicit columns."""
    source = np.arange(10 * 3).reshape(10, 3)
    cases = (
        ({}, ["0", "1", "2"]),
        ({"columns": list("abc")}, ["a", "b", "c"]),
    )
    for extra_kwargs, expected_columns in cases:
        frame = dd.from_array(source, chunksize=4, **extra_kwargs)
        assert list(frame.columns) == expected_columns
        assert frame.divisions == (0, 4, 8, 9)
        assert (frame.compute().values == source).all()

    # Arrays with more than two dimensions are rejected.
    with pytest.raises(ValueError):
        dd.from_array(np.ones(shape=(10, 10, 10)))
def test_from_array():
    """Check columns, divisions and values of ``dd.from_array`` output."""
    grid = np.arange(10 * 3).reshape(10, 3)

    unnamed = dd.from_array(grid, chunksize=4)
    assert list(unnamed.columns) == ['0', '1', '2']
    assert unnamed.divisions == (0, 4, 8, 9)
    assert (unnamed.compute().values == grid).all()

    named = dd.from_array(grid, chunksize=4, columns=list('abc'))
    assert list(named.columns) == ['a', 'b', 'c']
    assert named.divisions == (0, 4, 8, 9)
    assert (named.compute().values == grid).all()

    # A 3-d input must raise.
    with pytest.raises(ValueError):
        dd.from_array(np.ones(shape=(10, 10, 10)))
def test_from_array():
    """``dd.from_array`` builds a DataFrame with the expected column index."""
    source = np.arange(10 * 3).reshape(10, 3)
    cases = (
        ({}, pd.Index([0, 1, 2])),
        ({'columns': list('abc')}, pd.Index(['a', 'b', 'c'])),
    )
    for extra_kwargs, expected_index in cases:
        frame = dd.from_array(source, chunksize=4, **extra_kwargs)
        assert isinstance(frame, dd.DataFrame)
        tm.assert_index_equal(frame.columns, expected_index)
        assert frame.divisions == (0, 4, 8, 9)
        assert (frame.compute().values == source).all()

    with pytest.raises(ValueError):
        dd.from_array(np.ones(shape=(10, 10, 10)))
def test_from_array_with_record_dtype():
    """A structured array yields columns named after its record fields."""
    record_dtype = [("a", "i4"), ("b", "i4")]
    records = np.array([(i, i * 10) for i in range(10)], dtype=record_dtype)

    frame = dd.from_array(records, chunksize=4)

    assert list(frame.columns) == ["a", "b"]
    assert frame.divisions == (0, 4, 8, 9)
    round_tripped = frame.compute().to_records(index=False)
    assert (round_tripped == records).all()
def test_from_array_1d_with_column_names():
    """A string ``columns`` names the Series built from a 1-d array."""
    dask_ones = da.ones(10, chunks=3)
    numpy_ones = np.ones(10)

    from_dask = dd.from_dask_array(dask_ones, columns="name")  # dask
    expected = pd.Series(numpy_ones, name="name")
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(dask_ones.compute(), columns="name")  # numpy
    assert_eq(from_numpy, from_dask)
def test_from_array_with_record_dtype():
    """Record-dtype input: field names become columns and values round-trip."""
    fields = [('a', 'i4'), ('b', 'i4')]
    rows = [(i, i * 10) for i in range(10)]
    structured = np.array(rows, dtype=fields)

    ddf = dd.from_array(structured, chunksize=4)

    assert list(ddf.columns) == ['a', 'b']
    assert ddf.divisions == (0, 4, 8, 9)
    assert (ddf.compute().to_records(index=False) == structured).all()
def create_diffdiff():
    """Build the spreads-vs-spreads table and save it as ``diff_diff.csv``.

    Reads the module-level ``dateless_df``, ``date_col`` and ``starttime``.
    ``triu`` is applied twice (pandas mode first, then dask mode), the date
    column is attached, the table is melted to long form, and the result is
    written to ``<cwd>/cleaned_data/diff_diff.csv`` as a single file.
    """
    print("\ngenerating spreads vs spreads...")
    spreads = triu(dateless_df, True)

    # Merge the abbreviations together, e.g.
    # "XXXX (PBF) - XXXX (SSF)" becomes "XXXX - XXXX (PBF-SSF)".
    merged_headers = []
    for header in spreads.columns:
        pieces = header.split(" (")
        middle = pieces[1].split(")")
        merged_headers.append(
            f"{pieces[0]}{middle[1]} ({middle[0]}-{pieces[2]}"
        )
    spreads.columns = merged_headers

    table = triu(spreads, False)
    table = table.repartition(npartitions=200)
    table = table.reset_index(drop=True)

    # Give the date column the same partitioning and a fresh index before
    # attaching it to the table.
    dates = dd.from_array(date_col)
    dates = dates.repartition(npartitions=200)
    dates = dates.reset_index(drop=True)
    table = table.assign(date=dates)

    melted = dd.melt(
        table,
        id_vars="date",
        var_name="product_diff",
        value_name="price_diff",
    )
    table = melted.dropna().reset_index(drop=True)

    table["product_diff"] = table["product_diff"].astype("category")
    table["differential_A"] = table["product_diff"].str.partition(" - ")[0]
    table["differential_B"] = table["product_diff"].str.partition(" - ")[2]

    print(f"\nsaving file... ({round((time.time() - starttime), 2)}s)")
    output_path = os.path.join(os.getcwd(), "cleaned_data", "diff_diff.csv")
    dd.to_csv(
        df=table,
        filename=output_path,
        index=False,
        single_file=True,
        encoding="utf-8-sig",
        chunksize=10000,
    )
    print(
        f"[diff_diff.csv] saved successfully... ({round((time.time() - starttime), 2)}s)"
    )
def triu(df, pandas_mode):
    """Return the difference of every upper-triangle pair of columns.

    For each column pair ``(i, j)`` with ``i < j`` the result holds
    ``df[i] - df[j]`` under a name made of both column labels joined by a
    separator: an en dash in pandas mode, a hyphen otherwise.

    Args:
        df: frame whose ``.values`` is 2-d and whose column labels support
            ``+`` with ``str``.
        pandas_mode: when true a ``pandas.DataFrame`` is returned; otherwise
            the result is built with ``dd.from_array``.
    """
    values = df.values
    left, right = np.triu_indices(values.shape[1], 1)
    labels = df.columns

    if pandas_mode:
        separator = " – "
    else:
        separator = " - "

    names = [labels[a] + separator + labels[b] for a, b in zip(left, right)]
    differences = values[:, left] - values[:, right]

    if pandas_mode:
        return pd.DataFrame(differences, columns=names)
    return dd.from_array(differences, columns=names)
def test_dask_predict_shape_infer() -> None:
    """The lazy prediction's shape matches the shape it computes to."""
    with LocalCluster(n_workers=kWorkers) as cluster, Client(cluster) as client:
        features, labels = make_classification(
            n_samples=1000, n_informative=5, n_classes=3
        )
        dask_features = dd.from_array(features, chunksize=100)
        dask_labels = dd.from_array(labels, chunksize=100)
        dtrain = xgb.dask.DaskDMatrix(
            client, data=dask_features, label=dask_labels
        )

        params = {"objective": "multi:softprob", "num_class": 3}
        model = xgb.dask.train(client, params, dtrain=dtrain)

        preds = xgb.dask.predict(client, model, dtrain)
        for axis in (0, 1):
            assert preds.shape[axis] == preds.compute().shape[axis]
def test_from_array_with_record_dtype():
    """A structured array becomes a DataFrame with one column per field."""
    record_dtype = [("a", "i4"), ("b", "i4")]
    records = np.array([(i, i * 10) for i in range(10)], dtype=record_dtype)

    frame = dd.from_array(records, chunksize=4)

    assert isinstance(frame, dd.DataFrame)
    assert list(frame.columns) == ["a", "b"]
    assert frame.divisions == (0, 4, 8, 9)
    round_tripped = frame.compute().to_records(index=False)
    assert (round_tripped == records).all()
def fit(self, data, args):
    """Train on a local dask cluster with one worker per selected GPU.

    Args:
        data: object exposing ``X_train`` and ``y_train``; these are wrapped
            with ``dd.from_array`` and, if that raises ``ValueError``, with
            ``dd.from_pandas`` instead.
        args: object exposing ``gpus`` (a negative value means "up to 32
            available devices") and ``cpus`` (split evenly between workers).

    Returns:
        The training time reported by the first worker result.

    Side effects:
        Stores the trained model from the first worker result on
        ``self.model``.
    """
    params = self.configure(data, args)
    devices = GPUtil.getAvailable(limit=32 if args.gpus < 0 else args.gpus)
    partition_size = 100000

    # Context managers close the client and the cluster even when training
    # raises; previously the cluster was never closed and the client leaked
    # on any exception.
    with LocalCluster(n_workers=len(devices),
                      threads_per_worker=args.cpus // len(devices),
                      local_dir="/opt/gbm-datasets") as cluster, \
            Client(cluster) as client:
        try:
            X = dd.from_array(data.X_train, partition_size)
            y = dd.from_array(data.y_train, partition_size)
        except ValueError:
            # The second positional parameter of from_pandas is npartitions,
            # not chunksize, so the partition size is passed by keyword to
            # keep the same rows-per-partition meaning as the array path.
            X = dd.from_pandas(data.X_train, chunksize=partition_size)
            y = dd.from_pandas(data.y_train, chunksize=partition_size)
        result = xgb.dask.run(client, self.train, X, y, params, devices, args)
        self.model, train_time = next(iter(result.values()))
    return train_time
def test_from_array_with_record_dtype():
    """Record-dtype input gives a DataFrame whose values round-trip."""
    fields = [('a', 'i4'), ('b', 'i4')]
    rows = [(i, i * 10) for i in range(10)]
    structured = np.array(rows, dtype=fields)

    ddf = dd.from_array(structured, chunksize=4)

    assert isinstance(ddf, dd.DataFrame)
    assert list(ddf.columns) == ['a', 'b']
    assert ddf.divisions == (0, 4, 8, 9)
    assert (ddf.compute().to_records(index=False) == structured).all()
def test_from_array():
    """``dd.from_array`` yields a DataFrame with known dtypes and divisions."""
    source = np.arange(10 * 3).reshape(10, 3)
    cases = (
        ({}, pd.Index([0, 1, 2])),
        ({'columns': list('abc')}, pd.Index(['a', 'b', 'c'])),
    )
    for extra_kwargs, expected_index in cases:
        frame = dd.from_array(source, chunksize=4, **extra_kwargs)
        assert isinstance(frame, dd.DataFrame)
        assert frame._known_dtype
        tm.assert_index_equal(frame.columns, expected_index)
        assert frame.divisions == (0, 4, 8, 9)
        assert (frame.compute().values == source).all()

    with pytest.raises(ValueError):
        dd.from_array(np.ones(shape=(10, 10, 10)))
def test_from_array_with_column_names():
    """Explicit column names are applied for both dask and numpy inputs."""
    names = ["a", "b", "c"]
    dask_ones = da.ones((10, 3), chunks=(3, 3))
    numpy_ones = np.ones((10, 3))

    from_dask = dd.from_dask_array(dask_ones, columns=list(names))  # dask
    expected = pd.DataFrame(numpy_ones, columns=list(names))
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(numpy_ones, columns=list(names))  # numpy
    assert_eq(from_dask, from_numpy)
def test_from_array():
    """Structured-array input: field names as columns, divisions ``(4, 8)``."""
    record_dtype = [('a', 'i4'), ('b', 'i4')]
    records = np.array([(i, i * 10) for i in range(10)], dtype=record_dtype)

    frame = dd.from_array(records, chunksize=4)

    assert list(frame.columns) == ['a', 'b']
    assert frame.divisions == (4, 8)
    round_tripped = frame.compute().to_records(index=False)
    assert (round_tripped == records).all()
def test_n_workers(self) -> None:
    """Training and validation data pinned to different workers merge to two.

    The training matrix is built from dask arrays created on the first
    worker; the validation matrix from dask dataframes persisted on the
    second. ``_get_workers_from_data`` must then report both workers.
    """
    from sklearn.datasets import load_breast_cancer

    with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
        workers = list(_get_client_workers(client).keys())
        first_worker, second_worker = workers[0], workers[1]

        X, y = load_breast_cancer(return_X_y=True)

        train_X = client.submit(
            da.from_array, X, workers=[first_worker]
        ).result()
        train_y = client.submit(
            da.from_array, y, workers=[first_worker]
        ).result()
        train = xgb.dask.DaskDMatrix(client, train_X, train_y)

        valid_X = dd.from_array(X)
        valid_X = client.persist(valid_X, workers={valid_X: second_worker})
        valid_y = dd.from_array(y)
        valid_y = client.persist(valid_y, workers={valid_y: second_worker})
        valid = xgb.dask.DaskDMatrix(client, valid_X, valid_y)

        merged = xgb.dask._get_workers_from_data(
            train, evals=[(valid, 'Valid')]
        )
        assert len(merged) == 2
def test_from_dask_array_compat_numpy_array():
    """2-d dask and numpy inputs produce equal DataFrames."""
    dask_ones = da.ones((10, 3), chunks=(3, 3))
    numpy_ones = np.ones((10, 3))

    from_dask = dd.from_dask_array(dask_ones)  # dask
    expected = pd.DataFrame(numpy_ones)
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(numpy_ones)  # numpy
    assert_eq(from_numpy, from_dask)
def sim_rms_results(rmsdata, sims=100):
    '''Simulate losses from the RMS data and return them as an indexed frame.

    Args:
        rmsdata: frame read by column name; the columns used here are
            ``line``, ``state``, ``eventid``, ``rate``, ``meanvalue``,
            ``stddevi``, ``stddevc`` and ``exposure``.
        sims: forwarded to ``simulate_state_lines_losses``; presumably the
            number of simulations to run -- confirm against that helper.

    Returns:
        The simulated events merged with the RMS lookup table, with a
        ``loss`` column added and the index set to
        ``['line', 'state', 'simulation', 'eventid', 'eventseq']``.
    '''
    logger = logging.getLogger(__name__)

    # Unique lines and states, each as a single-column dask frame.
    logger.info('set lines')
    lines = dd.from_array(rmsdata["line"].unique(), columns=["line"])
    logger.info('set states')
    states = dd.from_array(rmsdata["state"].unique(), columns=["state"])
    #rmsdf = pd.pivot_table(rmsdata, values=['PERSPVALUE', 'STDDEVI', 'STDDEVC', 'EXPVALUE'], index=['LOBNAME', 'STATE', 'EVENTID'])

    # One row per event id holding its rate; the column sum is passed on as
    # ``freq_mean``.
    logger.info('pivot rms data for events')
    rmsevents = pd.pivot_table(rmsdata, values='rate', index='eventid')
    freq_mean = rmsevents.sum()
    #EVENTID RATE PERSPCODE STATE LOBNAME PERSPVALUE STDDEVI STDDEVC EXPVALUE
    #catindex = [rmsdata['LOBNAME'].values, rmsdata['STATE'].values, rmsdata['EVENTID'].values]

    logger.info('create rmslookup table')
    # NOTE(review): a dict of arrays is passed to dd.from_array here; confirm
    # the installed dask accepts this, or whether a pandas DataFrame wrapped
    # with dd.from_pandas was intended.
    rmslookup = dd.from_array({
        'line': rmsdata['line'].values,
        'state': rmsdata['state'].values,
        'eventid': rmsdata['eventid'].values,
        'meanvalue': rmsdata['meanvalue'].values,
        'stddevi': rmsdata['stddevi'].values,
        'stddevc': rmsdata['stddevc'].values,
        'exposure': rmsdata['exposure'].values
    })
    #rmslookup.index.names = ['lob', 'state', 'eventid']
    #eventlookup = pd.DataFrame(rmsevents.index)

    simulated_events = simulate_state_lines_losses(rmsevents, freq_mean, states, lines, sims)
    simulated_events = simulated_events.to_dataframe()

    # Attach the per-event parameters to each simulated event.
    simulated_cats = dd.merge(simulated_events, rmslookup, how='inner', left_on=['line', 'state', 'eventid'], right_on=['line', 'state', 'eventid'])
    #simulated_cats['rand']= np.random.uniform(0,1, len(simulated_cats))
    simulated_cats['loss'] = calculate_rms_loss(simulated_cats, "Beta")

    logger.info("set index to simulated cats")
    simulated_cats = simulated_cats.set_index(
        ['line', 'state', 'simulation', 'eventid', 'eventseq'])
    #results = results.reset_index()
    logger.info("Completed simulating cats")
    return simulated_cats
def _create_data(objective, n_samples=100, output='array', chunk_size=50):
    """Create a synthetic dataset plus dask versions of it.

    Args:
        objective: ``'binary-classification'``,
            ``'multiclass-classification'`` or ``'regression'``.
        n_samples: number of rows to generate.
        output: ``'array'``, ``'scipy_csr_matrix'`` or a value starting with
            ``'dataframe'`` (``'dataframe-with-categorical'`` adds five
            categorical columns).
        chunk_size: rows per dask chunk / partition.

    Returns:
        ``(X, y, weights, dX, dy, dw)``: the local data followed by its dask
        counterparts.

    Raises:
        ValueError: for an unknown objective or output type.
    """
    blob_centers = {
        'binary-classification': [[-4, -4], [4, 4]],
        'multiclass-classification': [[-4, -4], [4, 4], [-4, 4]],
    }
    if objective.endswith('classification'):
        if objective not in blob_centers:
            raise ValueError(f"Unknown classification task '{objective}'")
        X, y = make_blobs(
            n_samples=n_samples,
            centers=blob_centers[objective],
            random_state=42,
        )
    elif objective == 'regression':
        X, y = make_regression(n_samples=n_samples, random_state=42)
    else:
        raise ValueError("Unknown objective '%s'" % objective)

    rnd = np.random.RandomState(42)
    weights = rnd.random(X.shape[0]) * 0.01

    if output == 'array':
        dX = da.from_array(X, (chunk_size, X.shape[1]))
        dy = da.from_array(y, chunk_size)
        dw = da.from_array(weights, chunk_size)
    elif output.startswith('dataframe'):
        feature_names = ['feature_%d' % i for i in range(X.shape[1])]
        X_df = pd.DataFrame(X, columns=feature_names)
        if output == 'dataframe-with-categorical':
            num_cat_cols = 5
            for cat_index in range(num_cat_cols):
                raw_categories = rnd.choice(['a', 'b'], X.shape[0])
                categorical = pd.Series(raw_categories, dtype='category')
                X_df["cat_col" + str(cat_index)] = categorical
                # Mirror the new column in the plain array via its codes.
                code_column = categorical.cat.codes.values.reshape(-1, 1)
                X = np.hstack((X, code_column))

            # For the small data sizes used in tests, it's hard to get
            # LGBMRegressor to choose categorical features for splits. So for
            # regression tests with categorical features, _create_data()
            # returns a DataFrame with ONLY categorical features.
            if objective == 'regression':
                X_df = X_df[
                    [name for name in X_df.columns
                     if name.startswith('cat_col')]
                ]
                X = X[:, -num_cat_cols:]
        y_df = pd.Series(y, name='target')
        dX = dd.from_pandas(X_df, chunksize=chunk_size)
        dy = dd.from_pandas(y_df, chunksize=chunk_size)
        dw = dd.from_array(weights, chunksize=chunk_size)
    elif output == 'scipy_csr_matrix':
        dense = da.from_array(X, chunks=(chunk_size, X.shape[1]))
        dX = dense.map_blocks(csr_matrix)
        dy = da.from_array(y, chunks=chunk_size)
        dw = da.from_array(weights, chunk_size)
    else:
        raise ValueError("Unknown output type '%s'" % output)

    return X, y, weights, dX, dy, dw
def predict_xgboost_gpu(xgb_model,
                        X,
                        data_chunksize=None,
                        n_gpus=None,
                        n_threads_per_gpu=1,
                        gpu_cluster=None,
                        client=None):
    '''
    Predicts the output for the input features X using the 'xgb_model' running on the GPU.

    :param xgb_model: a dask XGBoost model to use for predictions
    :param X: the input features to use for predictions, must be either a numpy ndarray or a pandas DataFrame
    :param data_chunksize: chunk size to be used on the dask dataframe; leave the default value None to
        use the number of rows divided by the number of visible CUDA devices (at least 1)
    :param n_gpus: number of GPUs to be used. Default value None selects all available devices;
    :param n_threads_per_gpu: number of threads per GPU;
    :param gpu_cluster: an existing dask cluster object to use. This param should be used if you call
        this method many times in quick succession. Note that this function doesn't close an
        externally created cluster.
    :param client: an existing dask client object to use. This param should be used if you call
        this method many times in quick succession. Note that this function doesn't close an
        externally created client.
    :return: If the input features X is a pandas DataFrame, returns a pandas DataFrame containing the
        predictions; otherwise returns the same predictions converted with ``DataFrame.to_numpy()``.
    '''
    owns_cluster = gpu_cluster is None
    owns_client = client is None

    if owns_cluster:
        local_gpus = LocalCUDACluster(n_workers=n_gpus,
                                      threads_per_worker=n_threads_per_gpu)
    else:
        local_gpus = gpu_cluster

    local_dask_client = None
    try:
        local_dask_client = Client(local_gpus) if owns_client else client

        if data_chunksize is None:
            # Never fall to 0 when there are fewer rows than devices; a zero
            # chunk size cannot partition the data.
            data_chunksize = max(
                1, X.shape[0] // len(local_gpus.cuda_visible_devices))

        ndarray = not isinstance(X, pd.DataFrame)
        if ndarray:
            X = from_array(X, chunksize=data_chunksize)
        else:
            X = from_pandas(X, chunksize=data_chunksize)

        y_predicted = dask_xgboost_predict(local_dask_client, xgb_model, X)
        y_predicted = pd.DataFrame(y_predicted)
    finally:
        # Release only what this call created, and do so even when the
        # prediction fails, so repeated calls do not leak clients/clusters.
        if owns_client and local_dask_client is not None:
            local_dask_client.close()
        if owns_cluster:
            local_gpus.close()

    if ndarray:
        return y_predicted.to_numpy()
    return y_predicted
def run_empty_dmatrix_cls(client: "Client", parameters: dict) -> None:
    """Train and predict a 4-class model on one- and two-row datasets.

    ``parameters`` is updated in place with the ``multi:softprob``
    objective, the ``merror`` metric and the class count. The second round
    trains on two rows while evaluating and predicting on the one-row
    matrix.
    """
    n_classes = 4
    n_features = 97

    def _check_outputs(out: xgb.dask.TrainReturnT,
                       predictions: np.ndarray) -> None:
        assert isinstance(out['booster'], xgb.dask.Booster)
        assert len(out['history']['validation']['merror']) == 2
        assert isinstance(predictions, np.ndarray)
        assert predictions.shape[1] == n_classes, predictions.shape

    def _random_dmatrix(n_rows: int) -> xgb.dask.DaskDMatrix:
        # Features are drawn before labels, as two separate random calls.
        features = dd.from_array(np.random.randn(n_rows, n_features))
        labels = dd.from_array(
            np.random.randint(low=0, high=n_classes, size=n_rows)
        )
        return xgb.dask.DaskDMatrix(client, features, labels)

    one_row = _random_dmatrix(1)

    parameters['objective'] = 'multi:softprob'
    parameters['eval_metric'] = 'merror'
    parameters['num_class'] = n_classes

    out = xgb.dask.train(
        client,
        parameters,
        dtrain=one_row,
        evals=[(one_row, 'validation')],
        num_boost_round=2,
    )
    lazy_predictions = xgb.dask.predict(client=client, model=out, data=one_row)
    assert lazy_predictions.shape[1] == n_classes
    _check_outputs(out, lazy_predictions.compute())

    # train has more rows than evals
    two_rows = _random_dmatrix(2)
    out = xgb.dask.train(
        client,
        parameters,
        dtrain=two_rows,
        evals=[(one_row, 'validation')],
        num_boost_round=2,
    )
    predictions = xgb.dask.predict(
        client=client, model=out, data=one_row
    ).compute()
    _check_outputs(out, predictions)
def rstr_dk(self, T=504, L=21, half_life=126):
    """Compute a weighted sum of excess log returns over a lagged window.

    For each ``t`` from ``T + L`` to ``self.length`` the returns in
    ``[t - T - L, t - L)`` are log-transformed, reduced by the log of a
    fixed rate, multiplied by ``halflife(half_life, length=T)`` and summed
    into position ``t - 1``. Earlier positions stay NaN.

    Returns:
        The computed result of ``dd.from_array`` on a NaN array of length
        ``self.length``, filled as described above.
    """
    nan_values = np.tile(np.nan, self.length)
    rstr = dd.from_array(nan_values).compute()

    close = self.quota_ddf.close
    rt_whole = (close.diff() / close.shift()).compute()

    rft = 9.85e-5
    for t in np.arange(T + L, self.length + 1):
        window_start = t - T - L
        window_stop = t - L
        rt = rt_whole[window_start:window_stop]
        excess_log_returns = np.log(1 + rt.values) - np.log(1 + rft)
        rstr[t - 1] = sum(excess_log_returns * halflife(half_life, length=T))
    return rstr
def test_from_array_1d_list_of_columns_gives_dataframe():
    """Passing a list via ``columns`` turns a 1-d input into a DataFrame."""
    dask_ones = da.ones(10, chunks=3)
    numpy_ones = np.ones(10)

    from_dask = dd.from_dask_array(dask_ones, columns=["name"])  # dask
    expected = pd.DataFrame(numpy_ones, columns=["name"])
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(numpy_ones, columns=["name"])  # numpy
    assert_eq(from_numpy, from_dask)
def test_from_dask_array_compat_numpy_array_1d():
    """1-d dask and numpy inputs produce equal Series."""
    dask_ones = da.ones(10, chunks=3)
    numpy_ones = np.ones(10)

    from_dask = dd.from_dask_array(dask_ones)  # dask
    expected = pd.Series(numpy_ones)
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(numpy_ones)  # numpy
    assert_eq(from_numpy, from_dask)
def get_cat_data(self):
    """Return the categorical columns as a new dask dataframe.

    The columns listed in ``self.cat_labels`` are selected from
    ``self.dask_df``, converted to a numpy array and wrapped again with
    ``dd.from_array`` using chunks of 200000 rows.

    Returns:
        A dask dataframe with the categorical columns.
    """
    labels = self.cat_labels
    selected = self.dask_df[labels]
    as_numpy = np.array(selected)
    return dd.from_array(as_numpy, chunksize=200000, columns=labels)
def execute_between_time(op, data, lower, upper, **kwargs):
    """Return a boolean dask collection marking times within the bounds.

    The time-of-day of ``data`` is compared as a string against ``lower``
    and ``upper`` (both inclusive); matching positions are set to ``True``
    in an otherwise all-``False`` array.
    """
    # TODO - Can this be done better?
    not_before_lower = data.dt.time.astype(str) >= lower
    not_after_upper = data.dt.time.astype(str) <= upper
    indexer = (not_before_lower & not_after_upper).to_dask_array(True)

    flags = da.zeros(len(data), dtype=np.bool_)
    flags[indexer] = True
    return dd.from_array(flags)
def get_client_id_data(self):
    """Return the ``'client_id'`` column as a new dask dataframe.

    The column is taken from ``self.dask_df``, converted to a numpy array
    and wrapped with ``dd.from_array`` using chunks of 200000 rows.

    Returns:
        A dask dataframe with the 'client_id' column.
    """
    client_ids = self.dask_df['client_id']
    as_numpy = np.array(client_ids)
    return dd.from_array(as_numpy, chunksize=200000, columns=['client_id'])
def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    """Compute one column of a projection and return it under its name.

    A plain table-column selection is served straight from ``data``. Any
    other expression is executed against a scope built from the
    expression's root tables; a scalar result is expanded to a series with
    one entry per row of ``data``.

    Raises:
        KeyError: if a selected column is not in ``data`` and the parent
            table operation is not a join.
    """
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        # The column is missing under its plain name, so look it up with the
        # join suffix that belongs to the side of the join it comes from.
        (root_table, ) = op.root_tables()
        left_root, right_root = ops.distinct_roots(parent_table_op.left, parent_table_op.right)
        suffixes = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        return data.loc[:, name + suffixes[root_table]].rename(result_name or name)

    data_columns = frozenset(data.columns)

    # Merge one scope per root table of the expression into the incoming
    # scope; each maps the root table to ``data`` after column remapping.
    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(parent_table_op, t, data_columns),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables())

    result = execute(expr, scope=scope, timecontext=timecontext, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    if np.isscalar(result):
        # Repeat the scalar once per row and give it the index of ``data``.
        series = dd.from_array(np.repeat(result, len(data.index)))
        series.name = result_name
        series.index = data.index
        return series
    return result.rename(result_name)
def _try_read_hdf(file, key, use_dask):
    """Read ``key`` from an HDF file, falling back to an empty frame.

    With ``use_dask`` the file is read through dask and a ``ValueError``
    yields ``dd.from_array`` of an empty 2-d array; otherwise it is read
    through pandas and a ``KeyError`` yields an empty ``pandas.DataFrame``.
    """
    if not use_dask:
        try:
            return pd.read_hdf(file, key=key)
        except KeyError:
            return pd.DataFrame([])

    try:
        return dd.read_hdf(file, key=key)
    except ValueError:
        return dd.from_array(np.array([[]]))
def test_DataFrame_from_dask_array():
    """``from_dask_array`` and ``from_array`` agree and stay unmaterialized."""
    names = ["a", "b", "c"]
    dask_ones = da.ones((10, 3), chunks=(4, 2))
    expected = pd.DataFrame(np.ones((10, 3)), columns=list(names))

    direct = dd.from_dask_array(dask_ones, list(names))
    assert not hlg_layer_topological(direct.dask, -1).is_materialized()
    assert_eq(direct, expected)

    # dd.from_array should re-route to from_dask_array
    rerouted = dd.from_array(dask_ones, columns=list(names))
    assert not hlg_layer_topological(rerouted.dask, -1).is_materialized()
    assert_eq(direct, rerouted)
def simple_example():
    """Fit a logistic regression on generated data and print its outputs.

    Predictions, probabilities and the score are computed on the same data
    the model was fitted on, and each is printed.
    """
    raw_features, raw_labels = make_classification(
        n_samples=10000, n_features=2, chunks=50
    )
    features = dd.from_dask_array(raw_features, columns=["a", "b"])
    labels = dd.from_array(raw_labels)

    model = LogisticRegression()
    model.fit(features.values, labels.values)

    predictions = model.predict(features.values).compute()
    print('Predictions =', predictions)
    probabilities = model.predict_proba(features.values).compute()
    print('Probabilities =', probabilities)
    score = model.score(features.values, labels.values).compute()
    print('Scores =', score)
def test_DataFrame_from_dask_array():
    """``from_dask_array`` builds the expected frame; ``from_array`` matches.

    Checks column names, divisions and values of the frame built from a
    2-d dask array, then that ``dd.from_array`` on the same dask array
    yields the same columns and divisions.
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = from_dask_array(x, ["a", "b", "c"])
    assert list(df.columns) == ["a", "b", "c"]
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=["a", "b", "c"])
    # Compare as lists: ``==`` on pandas Index objects is elementwise and
    # cannot be used directly as an assertion.
    assert list(df2.columns) == list(df.columns)
    assert df2.divisions == df.divisions
def test_DataFrame_from_dask_array():
    """``from_dask_array`` builds the expected frame; ``from_array`` matches.

    Checks column names, divisions and values of the frame built from a
    2-d dask array, then that ``dd.from_array`` on the same dask array
    yields the same columns and divisions.
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = from_dask_array(x, ['a', 'b', 'c'])
    assert list(df.columns) == ['a', 'b', 'c']
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=['a', 'b', 'c'])
    # Compare as lists: ``==`` on pandas Index objects is elementwise and
    # cannot be used directly as an assertion.
    assert list(df2.columns) == list(df.columns)
    assert df2.divisions == df.divisions
def test_times_ops_with_tz(t, df, tz, rconstruct, column):
    """``between`` on a time column agrees with the expected boolean series.

    The comparison is made once with the ``timezone`` keyword and once after
    casting the column to a timestamp in that timezone.
    """
    expected = dd.from_array(rconstruct(len(df), dtype=bool))
    lower, upper = '01:00', '02:00'

    with_kwarg = t[column].time().between(lower, upper, timezone=tz)
    compiled = with_kwarg.compile()
    tm.assert_series_equal(compiled.compute(), expected.compute())

    # Test that casting behavior is the same as using the timezone kwarg
    casted = t[column].cast(dt.Timestamp(timezone=tz))
    with_cast = casted.time().between(lower, upper)
    compiled = with_cast.compile()
    tm.assert_series_equal(compiled.compute(), expected.compute())
def test_from_dask_array_compat_numpy_array():
    """2-d dask and numpy inputs behave alike; invalid inputs raise.

    Covers 3-d input (rejected), default columns, a column list of the
    wrong length (rejected) and a matching column list, for both
    ``dd.from_dask_array`` and ``dd.from_array``.
    """
    x = da.ones((3, 3, 3), chunks=2)

    with pytest.raises(ValueError):
        dd.from_dask_array(x)  # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute())  # numpy

    x = da.ones((10, 3), chunks=(3, 3))
    d1 = dd.from_dask_array(x)  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index([0, 1, 2]))

    d2 = dd.from_array(x.compute())  # numpy
    # Check d2 itself; the type check used to be repeated on d1 by mistake.
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index([0, 1, 2]))

    with pytest.raises(ValueError):
        dd.from_dask_array(x, columns=['a'])  # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute(), columns=['a'])  # numpy

    d1 = dd.from_dask_array(x, columns=['a', 'b', 'c'])  # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(['a', 'b', 'c']))

    d2 = dd.from_array(x.compute(), columns=['a', 'b', 'c'])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(['a', 'b', 'c']))
def log_group_agg(ddf):
    """Aggregate the log columns per ``msno`` and flatten the column names.

    ``date`` gets min/max/count; every other listed column gets
    min/max/mean/std/sum. The aggregated values are re-wrapped with
    ``dd.from_array`` under flat ``<stat>_<column>`` names and the group key
    is attached as the ``msno`` column.
    """
    numeric_stats = ['min', 'max', 'mean', 'std', 'sum']
    numeric_columns = [
        'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq',
        'total_secs',
        'skip_ratio', 'skip25_ratio', 'skip50_ratio', 'skip75_ratio',
        'skip985_ratio', 'num100_ratio', 'unq_ratio',
        'unq_secs_ratio', 'num100_secs_ratio',
        'skip_secs_ratio', 'skip25_secs_ratio', 'skip50_secs_ratio',
        'skip75_secs_ratio', 'skip985_secs_ratio',
        'daily_listening_ratio',
    ]

    # Column order matters: the flat names below are matched positionally
    # against the aggregated values.
    agg_spec = {'date': ['min', 'max', 'count']}
    for column in numeric_columns:
        agg_spec[column] = list(numeric_stats)

    grouped_col_names = [
        stat + '_' + column
        for column, stats in agg_spec.items()
        for stat in stats
    ]

    ddf_grouped = ddf.groupby(by="msno").agg(agg_spec)
    ddf_agg = dd.from_array(
        ddf_grouped.values, chunksize=25, columns=grouped_col_names
    )
    ddf_agg['msno'] = ddf_grouped.index
    return ddf_agg
def testFromDaskDfArray(self):
    """Create a matrix from a dask dataframe (features) and dask array (labels).

    Skipped when dask is not installed.
    """
    from xgboost_ray.data_sources.dask import DASK_INSTALLED

    if not DASK_INSTALLED:
        self.skipTest("Dask not installed.")
        return

    # Imported only after the availability check above.
    import dask.dataframe as dd
    import dask.array as da

    features = dd.from_array(self.x)
    labels = da.from_array(self.y)
    self._testMatrixCreation(features, labels, distributed=False)
def test_DataFrame_from_dask_array():
    """``from_dask_array`` builds the expected frame; ``from_array`` matches.

    Checks type, columns, divisions and values of the frame built from a
    2-d dask array, then that ``dd.from_array`` on the same dask array
    yields a DataFrame with the same columns and divisions.
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = dd.from_dask_array(x, ['a', 'b', 'c'])
    assert isinstance(df, dd.DataFrame)
    tm.assert_index_equal(df.columns, pd.Index(['a', 'b', 'c']))
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(scheduler='sync').values ==
            x.compute(scheduler='sync')).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=['a', 'b', 'c'])
    # Check df2 itself; the type check used to be repeated on df by mistake.
    assert isinstance(df2, dd.DataFrame)
    tm.assert_index_equal(df2.columns, df.columns)
    assert df2.divisions == df.divisions
def test_DataFrame_from_dask_array():
    """``from_dask_array`` builds the expected frame; ``from_array`` matches.

    Checks type, columns, divisions and values of the frame built from a
    2-d dask array, then that ``dd.from_array`` on the same dask array
    yields a DataFrame with the same columns and divisions.
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = from_dask_array(x, ['a', 'b', 'c'])
    assert isinstance(df, dd.DataFrame)
    assert df.columns == ('a', 'b', 'c')
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=['a', 'b', 'c'])
    # Check df2 itself; the type check used to be repeated on df by mistake.
    assert isinstance(df2, dd.DataFrame)
    assert df2.columns == df.columns
    assert df2.divisions == df.divisions
def test_Series_from_dask_array():
    """1-d dask arrays become Series, with or without a name."""
    dask_ones = da.ones(10, chunks=4)

    named = from_dask_array(dask_ones, "a")
    assert named.name == "a"
    assert list(named.divisions) == [0, 4, 8, 9]
    computed_series = named.compute(get=get_sync).values
    computed_array = dask_ones.compute(get=get_sync)
    assert (computed_series == computed_array).all()

    unnamed = from_dask_array(dask_ones)
    assert unnamed.name is None

    # dd.from_array should re-route to from_dask_array
    rerouted = dd.from_array(dask_ones)
    assert eq(unnamed, rerouted)
def test_Series_from_dask_array():
    """1-d dask arrays become Series, with or without a name."""
    dask_ones = da.ones(10, chunks=4)

    named = dd.from_dask_array(dask_ones, 'a')
    assert isinstance(named, dd.Series)
    assert named.name == 'a'
    assert list(named.divisions) == [0, 4, 8, 9]
    computed_series = named.compute(scheduler='sync').values
    computed_array = dask_ones.compute(scheduler='sync')
    assert (computed_series == computed_array).all()

    unnamed = dd.from_dask_array(dask_ones)
    assert isinstance(unnamed, dd.Series)
    assert unnamed.name is None

    # dd.from_array should re-route to from_dask_array
    rerouted = dd.from_array(dask_ones)
    assert isinstance(rerouted, dd.Series)
    assert_eq(unnamed, rerouted)
def test_meta_from_array():
    """``_meta_from_array`` keeps dtypes and columns; chunking gives 5 parts."""
    int_values = np.array([[1, 2], [3, 4]], dtype=np.int64)
    int_meta = _meta_from_array(int_values)
    assert isinstance(int_meta, pd.DataFrame)
    for label in (0, 1):
        assert int_meta[label].dtype == np.int64
    tm.assert_index_equal(int_meta.columns, pd.Index([0, 1]))

    float_values = np.array([[1., 2.], [3., 4.]], dtype=np.float64)
    float_meta = _meta_from_array(float_values, columns=['a', 'b'])
    assert isinstance(float_meta, pd.DataFrame)
    for label in ('a', 'b'):
        assert float_meta[label].dtype == np.float64
    tm.assert_index_equal(float_meta.columns, pd.Index(['a', 'b']))

    # More column names than array columns must be rejected.
    with pytest.raises(ValueError):
        _meta_from_array(float_values, columns=['a', 'b', 'c'])

    np.random.seed(42)
    random_values = np.random.rand(201, 2)
    frame = dd.from_array(random_values, chunksize=50, columns=['a', 'b'])
    assert len(frame.divisions) == 6  # Should be 5 partitions and the end
data = [[]]*len(flist) bar = progressbar.ProgressBar() with h5py.File(fl,'r') as hf: groups = list(filter(lambda x: type(x[1])==h5py._hl.group.Group, hf.items())) print 'every '+str(setsgroup)+' groups' for gr in bar(groups[::setsgroup]): g=gr[1] if run: shead = g.attrs['spechead'].split(',') fhead = g.attrs['fluxhead'].split(',') spec = dd.from_array(g.get('spec')[1:,:],chunksize=50000, columns = shead) flux = dd.from_array(g.get('flux')[1:,:],chunksize=50000,columns = fhead) if run: #spec = spec.set_index('TIME', sorted=True) M = spec.M.mean() spec = spec/M #flux = flux.set_index('TIME', sorted=True) shead = g.attrs['spechead'].split(',') fcol = g.attrs['fluxhead'] products = [i.split('+') for i in re.findall(r'-->([A-z0-9+]*)',fcol)]
def __init__(self,h5file,groupid=False):
    """Load one group of an HDF5 results file into dask dataframes.

    h5file  - path of the HDF5 file to read
    groupid - position of the group to load (anything ``int()`` accepts);
              a falsy value selects the first group

    Attributes set: ``origin``, ``groups`` (group name -> position),
    ``groupkeys``, ``groupname``, ``wall``, ``timesteps``, ``ts``, ``M``,
    the TIME-indexed frames ``spec`` (divided by the mean of column M),
    ``vdot``, ``jacsp``, ``rate`` and ``flux``, plus ``products``,
    ``reactants`` and ``prodloss`` derived from the flux column names.

    Raises whatever h5py raises for an unreadable file, ``IndexError`` for
    an out-of-range ``groupid`` and ``KeyError`` for a missing attribute.
    """
    self.origin = h5file

    def _load(key, head, warn=False):
        """Return (dask frame, header) for dataset `key`, summing duplicate-named columns."""
        # The first row of every dataset is skipped ([1:, :]).
        if len(head) != len(set(head)):
            if warn:
                print('Duplicates detected, please parse mecnahisms in future to prevent this')
            merged = pd.DataFrame(g.get(key)[1:,:], columns=head)
            merged = merged.groupby(merged.columns, axis=1).sum()
            return dd.from_pandas(merged, chunksize=50000), merged.columns
        return dd.from_array(g.get(key)[1:,:], chunksize=50000, columns=head), head

    # The context manager closes the file; no explicit hf.close() is needed.
    with h5py.File(h5file,'r') as hf:
        groups = list(filter(lambda x: type(x[1])==h5py._hl.group.Group, hf.items()))
        self.groups = dict([[entry[0], pos] for pos, entry in enumerate(groups)])
        # Materialise the keys so they stay usable after the file is closed.
        self.groupkeys = list(groups[0][1].attrs.keys())
        self.flux = False

        # Record the name of the group actually loaded (previously this was
        # always the first group's name, whatever `groupid` selected).
        self.groupname, g = groups[int(groupid) if groupid else 0]
        self.wall = g.attrs['wall']

        # Column names are stored as comma-separated attribute strings.
        shead = g.attrs['spechead'].split(',')
        rhead = g.attrs['ratehead'].split(',')
        fhead = g.attrs['fluxhead'].split(',')
        vhead = g.attrs['vdothead'].split(',')
        jhead = g.attrs['jacsphead'].split(',')

        spec = dd.from_array(g.get('spec')[1:,:], chunksize=50000, columns=shead)
        vdot = dd.from_array(g.get('vdot')[1:,:], chunksize=50000, columns=vhead)
        jacsp = dd.from_array(g.get('jacsp')[1:,:], chunksize=50000, columns=jhead)
        rate, rhead = _load('rate', rhead, warn=True)

        print(g.get('flux').shape)
        print(len(fhead))
        flux, fhead = _load('flux', fhead)

        # All frames share the time axis taken from the spec TIME column.
        self.timesteps = spec['TIME'].astype('M8[s]').compute()
        for frame in (spec, vdot, rate, flux, jacsp):
            frame['TIME'] = self.timesteps
        self.ts = np.array(self.timesteps)

        spec = spec.set_index('TIME', sorted=True)
        self.M = spec.M.mean()
        self.spec = spec/self.M
        self.vdot = vdot.set_index('TIME', sorted=True)
        self.jacsp = jacsp.set_index('TIME', sorted=True)
        self.rate = rate.set_index('TIME', sorted=True)
        self.flux = flux.set_index('TIME', sorted=True)

    # Flux column names look like 'A+B-->C+D'; split each side on '+'.
    fcol = ','.join(fhead)
    self.products = [i.split('+') for i in re.findall(r'-->([A-z0-9+]*)',fcol)]
    self.reactants = np.array([j.split('+') for j in re.findall(r'([A-z0-9+]{1,60})-->',fcol)])

    # For every species column: indices of reactions that consume ('loss')
    # or produce ('prod') it. Names that are not spec columns are skipped.
    self.prodloss = {k: {'loss':[],'prod':[]} for k in shead}
    for idx, consumed in enumerate(self.reactants):
        for name in consumed:
            if name in self.prodloss:
                self.prodloss[name]['loss'].append(idx)
        for name in self.products[idx]:
            if name in self.prodloss:
                self.prodloss[name]['prod'].append(idx)
def __init__(self, h5file, groupid=False,selection = 'spec,rate,flux,vdot,jacsp'.split(','), prodloss = True,ts = 600):
    '''
    Load one group of an HDF5 results file into dask dataframes.

    h5file - filename
    groupid - select a specific group entry: an int position or a str
              group name; anything else (including the default False)
              selects the first group
    selection - which data sections to provide in the class
                ('rate', 'flux', 'vdot', 'jacsp'); 'spec' is always loaded
    prodloss - create rxn/ropa dictionaries
               (self.products, self.reactants, self.prodloss)
    ts - divisor applied to the maximum of the SPINUP column to obtain the
         position in the time axis stored as self.spinup
         (presumably the output timestep in seconds - confirm)
    '''
    self.origin = h5file
    # Keep a private copy: the default list is created once, at definition
    # time, and would otherwise be shared by every instance.
    if isinstance(selection, list):
        selection = list(selection)
    self.selection = selection

    def _header(attr):
        '''Return the comma-separated header attribute `attr` as a list of names.'''
        raw = g.attrs[attr]
        # h5py may hand string attributes back as bytes or as str.
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8")
        return raw.split(',')

    def _load(key, head, warn=False):
        '''Return (dask frame, header) for dataset `key`, summing duplicate-named columns.'''
        if len(head) != len(set(head)):
            if warn:
                print ('Duplicates detected, please parse mecnahisms in future to prevent this')
            merged = pd.DataFrame(g.get(key)[:,:], columns=head)
            merged = merged.groupby(merged.columns, axis=1).sum()
            return dd.from_pandas(merged, chunksize=50000), merged.columns
        return dd.from_array(g.get(key)[:,:], chunksize=50000, columns=head), head

    def _time_indexed(frame):
        '''Attach the shared time axis to `frame` and index by it.'''
        frame['TIME'] = self.timesteps
        return frame.set_index('TIME', sorted=True)

    # The context manager closes the file; no explicit hf.close() is needed.
    with h5py.File(h5file,'r') as hf:
        groups = list(filter(lambda x: type(x[1])==h5py._hl.group.Group, hf.items()))
        self.groups = dict([[entry[0], pos] for pos, entry in enumerate(groups)])
        # Materialise the keys so they stay usable after the file is closed.
        self.groupkeys = list(groups[0][1].attrs.keys())
        self.flux = False

        # bool is excluded so the default False still means "first group".
        if isinstance(groupid, int) and not isinstance(groupid, bool):
            entry = groups[groupid]
        elif isinstance(groupid, str):
            entry = groups[self.groups[groupid]]
        else:
            entry = groups[0]
        self.groupname, g = entry
        self.wall = g.attrs['wall']

        # spec MUST always be included: it provides the time axis, M and the
        # species names used below.
        shead = _header('spechead')
        spec = dd.from_array(g.get('spec')[:,:], chunksize=50000, columns=shead)
        self.timesteps = spec['TIME'].compute().astype('M8[s]')
        self.ts = np.array(self.timesteps)
        spec['TIME'] = self.timesteps
        spec = spec.set_index('TIME', sorted=True)
        self.spinup = self.ts[int( (spec.SPINUP.max()/ts).compute() ) ]
        self.M = spec.M.mean()
        self.spec = spec/self.M

        # The flux header is needed for prodloss even when flux is not loaded.
        fhead = _header('fluxhead')

        if 'rate' in selection:
            rate, rhead = _load('rate', _header('ratehead'), warn=True)
            self.rate = _time_indexed(rate)

        if 'flux' in selection:
            flux, fhead = _load('flux', fhead)
            self.flux = _time_indexed(flux)

        if 'vdot' in selection:
            vdot = dd.from_array(g.get('vdot')[:,:], chunksize=50000, columns=_header('vdothead'))
            self.vdot = _time_indexed(vdot)

        if 'jacsp' in selection:
            jacsp = dd.from_array(g.get('jacsp')[:,:], chunksize=50000, columns=_header('jacsphead'))
            self.jacsp = _time_indexed(jacsp)

    if prodloss:
        # Flux column names look like 'A+B-->C+D'; split each side on '+'.
        fcol = ','.join(fhead)
        self.products = [i.split('+') for i in re.findall(r'-->([A-z0-9+]*)',fcol)]
        self.reactants = np.array([j.split('+') for j in re.findall(r'([A-z0-9+]{1,60})-->',fcol)])

        # For every species column: indices of reactions that consume
        # ('loss') or produce ('prod') it. Names that are not spec columns
        # are skipped.
        self.prodloss = {k: {'loss':[],'prod':[]} for k in shead}
        for idx, consumed in enumerate(self.reactants):
            for name in consumed:
                if name in self.prodloss:
                    self.prodloss[name]['loss'].append(idx)
            for name in self.products[idx]:
                if name in self.prodloss:
                    self.prodloss[name]['prod'].append(idx)