Python from_array 예제들, dask.dataframe.from_array Python 예제들

예제 #1

0

파일 보기

파일: test_io.py 프로젝트: floriango/dask

def test_from_dask_array_compat_numpy_array_1d():
    """1-D input: ``from_dask_array`` and ``from_array`` must behave alike.

    Covers three ways of passing ``columns``: omitted (unnamed Series),
    a string (named Series) and a one-element list (single-column DataFrame).
    Each result built from the NumPy array is type-checked itself, rather
    than re-checking the dask-array result a second time.
    """
    x = da.ones(10, chunks=3)
    expected = x.compute()

    # No columns: both constructors give an unnamed Series.
    d1 = dd.from_dask_array(x)       # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == expected).all()
    assert d1.name is None

    d2 = dd.from_array(expected)     # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == expected).all()
    assert d2.name is None

    # A string passed via columns becomes the Series name.
    d1 = dd.from_dask_array(x, columns='name')       # dask
    assert isinstance(d1, dd.Series)
    assert (d1.compute().values == expected).all()
    assert d1.name == 'name'

    d2 = dd.from_array(expected, columns='name')     # numpy
    assert isinstance(d2, dd.Series)
    assert (d2.compute().values == expected).all()
    assert d2.name == 'name'

    # passing list via columns results in DataFrame
    d1 = dd.from_dask_array(x, columns=['name'])       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == expected).all()
    tm.assert_index_equal(d1.columns, pd.Index(['name']))

    d2 = dd.from_array(expected, columns=['name'])     # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == expected).all()
    tm.assert_index_equal(d2.columns, pd.Index(['name']))

예제 #2

0

파일 보기

파일: test_with_dask.py 프로젝트: dmlc/xgboost

def test_dask_dataframe(client):
    """Run ``run_create_dmatrix`` on the cluster with random dask inputs.

    Features, labels and weights are built from random NumPy data with
    ``dd.from_array``; the partition size is passed positionally as the
    second argument.
    """
    n_rows, n_cols = 100, 10
    rows_per_partition = 25

    features = dd.from_array(np.random.random((n_rows, n_cols)),
                             rows_per_partition)
    labels = dd.from_array(np.random.random(n_rows), rows_per_partition)
    sample_weights = dd.from_array(np.random.random(n_rows),
                                   rows_per_partition)

    xgb.dask.run(client, run_create_dmatrix, features, labels, sample_weights)

예제 #3

0

파일 보기

파일: test_io.py 프로젝트: hgz2373294/dask

def test_from_array():
    """Check columns, divisions and values of ``from_array`` on a 10x3 array.

    Runs once with default column names and once with explicit ones, then
    verifies that a 3-D array is rejected with ``ValueError``.
    """
    x = np.arange(30).reshape(10, 3)

    scenarios = (
        ({}, ["0", "1", "2"]),
        ({"columns": list("abc")}, ["a", "b", "c"]),
    )
    for extra_kwargs, expected_columns in scenarios:
        d = dd.from_array(x, chunksize=4, **extra_kwargs)
        assert list(d.columns) == expected_columns
        assert d.divisions == (0, 4, 8, 9)
        assert (d.compute().values == x).all()

    cube = np.ones(shape=(10, 10, 10))
    pytest.raises(ValueError, dd.from_array, cube)

예제 #4

0

파일 보기

파일: test_io.py 프로젝트: GaelVaroquaux/dask

def test_from_array():
    """``from_array`` on a 2-D ndarray: column names, divisions and values."""
    source = np.arange(10 * 3).reshape(10, 3)
    expected_divisions = (0, 4, 8, 9)

    # Default column names.
    unnamed = dd.from_array(source, chunksize=4)
    assert list(unnamed.columns) == ['0', '1', '2']
    assert unnamed.divisions == expected_divisions
    assert (unnamed.compute().values == source).all()

    # Explicit column names.
    named = dd.from_array(source, chunksize=4, columns=list('abc'))
    assert list(named.columns) == ['a', 'b', 'c']
    assert named.divisions == expected_divisions
    assert (named.compute().values == source).all()

    # Arrays with more than two dimensions are rejected.
    three_dimensional = np.ones(shape=(10, 10, 10))
    pytest.raises(ValueError, dd.from_array, three_dimensional)

예제 #5

0

파일 보기

파일: test_io.py 프로젝트: floriango/dask

def test_from_array():
    """``from_array`` yields a DataFrame for 2-D input and rejects 3-D input."""
    data = np.arange(30).reshape(10, 3)

    def _check(frame, expected_columns):
        # Shared assertions for both the default and the named-column case.
        assert isinstance(frame, dd.DataFrame)
        tm.assert_index_equal(frame.columns, expected_columns)
        assert frame.divisions == (0, 4, 8, 9)
        assert (frame.compute().values == data).all()

    _check(dd.from_array(data, chunksize=4), pd.Index([0, 1, 2]))
    _check(dd.from_array(data, chunksize=4, columns=list('abc')),
           pd.Index(['a', 'b', 'c']))

    cube = np.ones(shape=(10, 10, 10))
    with pytest.raises(ValueError):
        dd.from_array(cube)

예제 #6

0

파일 보기

파일: test_io.py 프로젝트: hgz2373294/dask

def test_from_array_with_record_dtype():
    """A structured array's field names become the DataFrame columns."""
    rows = list(zip(range(10), range(0, 100, 10)))
    record_dtype = [("a", "i4"), ("b", "i4")]
    x = np.array(rows, dtype=record_dtype)

    d = dd.from_array(x, chunksize=4)

    assert list(d.columns) == ["a", "b"]
    assert d.divisions == (0, 4, 8, 9)

    # Converting back to records must reproduce the input exactly.
    round_tripped = d.compute().to_records(index=False)
    assert (round_tripped == x).all()

예제 #7

0

파일 보기

def test_from_array_1d_with_column_names():
    """A string ``columns`` on 1-D input gives a Series with that name."""
    dask_ones = da.ones(10, chunks=3)
    numpy_ones = np.ones(10)

    from_dask = dd.from_dask_array(dask_ones, columns="name")  # dask
    expected = pd.Series(numpy_ones, name="name")
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(dask_ones.compute(), columns="name")  # numpy
    assert_eq(from_numpy, from_dask)

예제 #8

0

파일 보기

파일: test_io.py 프로젝트: bowlofstew/dask

def test_from_array_with_record_dtype():
    """``from_array`` takes column names from a record dtype."""
    fields = [('a', 'i4'), ('b', 'i4')]
    records = [(i, 10 * i) for i in range(10)]
    x = np.array(records, dtype=fields)

    d = dd.from_array(x, chunksize=4)

    assert list(d.columns) == ['a', 'b']
    assert d.divisions == (0, 4, 8, 9)

    materialized = d.compute()
    assert (materialized.to_records(index=False) == x).all()

예제 #9

0

파일 보기

파일: cleandata.py 프로젝트: zhermin/data-science

    def create_diffdiff():
        """Build the spreads-vs-spreads table and save it as ``diff_diff.csv``.

        Reads ``dateless_df``, ``date_col`` and ``starttime`` from the
        enclosing scope. The result is written to
        ``<cwd>/cleaned_data/diff_diff.csv`` as a single file.
        """
        print("\ngenerating spreads vs spreads...")

        diff_diff = triu(dateless_df, True)

        # Merge the abbreviations together,
        # e.g. "XXXX (PBF) - XXXX (SSF)" becomes "XXXX - XXXX (PBF-SSF)".
        merged_headers = []
        for header in diff_diff.columns:
            pieces = header.split(" (")
            section = pieces[1].split(")")
            merged_headers.append(
                f"{pieces[0]}{section[1]} ({section[0]}-{pieces[2]}")
        diff_diff.columns = merged_headers

        # Second pass of pairwise differences, this time as a dask dataframe.
        diff_diff = triu(diff_diff, False)
        diff_diff = diff_diff.repartition(npartitions=200).reset_index(
            drop=True)

        # Give the date column the same partitioning before attaching it.
        dd_date_col = dd.from_array(date_col)
        dd_date_col = dd_date_col.repartition(npartitions=200).reset_index(
            drop=True)
        diff_diff = diff_diff.assign(date=dd_date_col)

        # Wide to long: one row per (date, product pair) with a price diff.
        melted = dd.melt(diff_diff,
                         id_vars="date",
                         var_name="product_diff",
                         value_name="price_diff")
        diff_diff = melted.dropna().reset_index(drop=True)

        diff_diff["product_diff"] = diff_diff["product_diff"].astype(
            "category")

        # Split "A - B" into its two sides.
        diff_diff["differential_A"] = (
            diff_diff["product_diff"].str.partition(" - ")[0])
        diff_diff["differential_B"] = (
            diff_diff["product_diff"].str.partition(" - ")[2])

        print(f"\nsaving file... ({round((time.time() - starttime), 2)}s)")
        output_path = os.path.join(os.getcwd(), "cleaned_data",
                                   "diff_diff.csv")
        dd.to_csv(df=diff_diff,
                  filename=output_path,
                  index=False,
                  single_file=True,
                  encoding="utf-8-sig",
                  chunksize=10000)
        print(
            f"[diff_diff.csv] saved successfully... ({round((time.time() - starttime), 2)}s)"
        )

예제 #10

0

파일 보기

파일: cleandata.py 프로젝트: zhermin/data-science

def triu(df, pandas_mode):
    """Return the pairwise column differences for every column pair i < j.

    Each output column is named ``"<col_i><sep><col_j>"`` and holds
    ``col_i - col_j``. With ``pandas_mode`` truthy the separator is " – "
    (en dash) and a pandas DataFrame is returned; otherwise the separator
    is " - " and the result is built with ``dd.from_array``.
    """
    values = df.values
    left, right = np.triu_indices(values.shape[1], 1)
    labels = df.columns

    if pandas_mode:
        separator = " – "
    else:
        separator = " - "

    pair_names = []
    for i, j in zip(left, right):
        pair_names.append(labels[i] + separator + labels[j])

    differences = values[:, left] - values[:, right]
    if pandas_mode:
        return pd.DataFrame(differences, columns=pair_names)
    return dd.from_array(differences, columns=pair_names)

예제 #11

0

파일 보기

def test_dask_predict_shape_infer() -> None:
    """The lazy prediction shape must match the shape after ``compute()``."""
    with LocalCluster(n_workers=kWorkers) as cluster, \
            Client(cluster) as client:
        features, labels = make_classification(n_samples=1000,
                                               n_informative=5,
                                               n_classes=3)
        X_ = dd.from_array(features, chunksize=100)
        y_ = dd.from_array(labels, chunksize=100)
        dtrain = xgb.dask.DaskDMatrix(client, data=X_, label=y_)

        training_params = {"objective": "multi:softprob", "num_class": 3}
        model = xgb.dask.train(client, training_params, dtrain=dtrain)

        preds = xgb.dask.predict(client, model, dtrain)
        # Compare each axis of the lazy shape with the computed result.
        assert preds.shape[0] == preds.compute().shape[0]
        assert preds.shape[1] == preds.compute().shape[1]

예제 #12

0

파일 보기

def test_from_array_with_record_dtype():
    """A structured ndarray becomes a DataFrame keyed by its field names."""
    structured = np.array(
        list(zip(range(10), range(0, 100, 10))),
        dtype=[("a", "i4"), ("b", "i4")],
    )

    frame = dd.from_array(structured, chunksize=4)

    assert isinstance(frame, dd.DataFrame)
    assert list(frame.columns) == ["a", "b"]
    assert frame.divisions == (0, 4, 8, 9)

    # Round trip back to records and compare element-wise.
    as_records = frame.compute().to_records(index=False)
    assert (as_records == structured).all()

예제 #13

0

파일 보기

파일: algorithms.py 프로젝트: paulhendricks/gbm-bench

 def fit(self, data, args):
     """Train on a local dask cluster and return the training time.

     A ``LocalCluster`` with one worker per available GPU is started, the
     training data is turned into dask dataframes, and ``self.train`` is
     run through ``xgb.dask.run``. The first worker's result supplies
     ``self.model`` and the returned training time.

     The client and the cluster are always shut down, even when data
     conversion or training raises, so no worker processes are left behind.

     Args:
         data: object exposing ``X_train`` and ``y_train``.
         args: options object; ``gpus`` and ``cpus`` are read here.

     Returns:
         The training time reported by ``self.train``.
     """
     params = self.configure(data, args)
     # A negative args.gpus means "use whatever is available" (capped at 32).
     devices = GPUtil.getAvailable(limit=32 if args.gpus < 0 else args.gpus)
     cluster = LocalCluster(n_workers=len(devices),
                            threads_per_worker=args.cpus // len(devices),
                            local_dir="/opt/gbm-datasets")
     try:
         client = Client(cluster)
         try:
             partition_size = 100000
             try:
                 X = dd.from_array(data.X_train, partition_size)
                 y = dd.from_array(data.y_train, partition_size)
             except ValueError:
                 # Fall back to the pandas constructor when from_array
                 # rejects the training data.
                 X = dd.from_pandas(data.X_train, partition_size)
                 y = dd.from_pandas(data.y_train, partition_size)
             result = xgb.dask.run(client, self.train, X, y, params,
                                   devices, args)
             self.model, train_time = next(iter(result.values()))
         finally:
             client.close()
     finally:
         cluster.close()
     return train_time

예제 #14

0

파일 보기

파일: test_io.py 프로젝트: floriango/dask

def test_from_array_with_record_dtype():
    """Record-dtype input: DataFrame type, column names, divisions, values."""
    record_dtype = [('a', 'i4'), ('b', 'i4')]
    x = np.array([(i, 10 * i) for i in range(10)], dtype=record_dtype)

    result = dd.from_array(x, chunksize=4)

    assert isinstance(result, dd.DataFrame)
    assert list(result.columns) == ['a', 'b']
    assert result.divisions == (0, 4, 8, 9)

    computed = result.compute()
    assert (computed.to_records(index=False) == x).all()

예제 #15

0

파일 보기

def test_from_array():
    """2-D input gives a DataFrame with known dtypes; 3-D input raises."""
    x = np.arange(30).reshape(10, 3)

    cases = (
        ({}, pd.Index([0, 1, 2])),
        ({'columns': list('abc')}, pd.Index(['a', 'b', 'c'])),
    )
    for extra_kwargs, expected_columns in cases:
        d = dd.from_array(x, chunksize=4, **extra_kwargs)
        assert isinstance(d, dd.DataFrame)
        assert d._known_dtype
        tm.assert_index_equal(d.columns, expected_columns)
        assert d.divisions == (0, 4, 8, 9)
        assert (d.compute().values == x).all()

    too_many_dims = np.ones(shape=(10, 10, 10))
    with pytest.raises(ValueError):
        dd.from_array(too_many_dims)

예제 #16

0

파일 보기

def test_from_array_with_column_names():
    """Explicit column names give equal frames from dask and NumPy input."""
    dask_ones = da.ones((10, 3), chunks=(3, 3))
    numpy_ones = np.ones((10, 3))

    from_dask = dd.from_dask_array(dask_ones, columns=list("abc"))  # dask
    expected = pd.DataFrame(numpy_ones, columns=list("abc"))
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(numpy_ones, columns=list("abc"))  # numpy
    assert_eq(from_dask, from_numpy)

예제 #17

0

파일 보기

파일: test_io.py 프로젝트: bjlittle/dask

def test_from_array():
    """Record-dtype input: column names, divisions and a record round trip."""
    fields = [('a', 'i4'), ('b', 'i4')]
    x = np.array(list(zip(range(10), range(0, 100, 10))), dtype=fields)

    d = dd.from_array(x, chunksize=4)

    assert list(d.columns) == ['a', 'b']
    # This test expects only the interior partition boundaries.
    assert d.divisions == (4, 8)

    round_tripped = d.compute().to_records(index=False)
    assert (round_tripped == x).all()

예제 #18

0

파일 보기

    def test_n_workers(self) -> None:
        """Data pinned to two different workers must report two workers.

        The training matrix is built on the first worker and the validation
        matrix is persisted on the second; ``_get_workers_from_data`` is
        expected to return both.
        """
        with LocalCluster(n_workers=2) as cluster, \
                Client(cluster) as client:
            workers = list(_get_client_workers(client).keys())
            first_worker, second_worker = workers[0], workers[1]

            from sklearn.datasets import load_breast_cancer
            X, y = load_breast_cancer(return_X_y=True)

            # Training data: dask arrays created on the first worker.
            train_X = client.submit(da.from_array, X,
                                    workers=[first_worker]).result()
            train_y = client.submit(da.from_array, y,
                                    workers=[first_worker]).result()
            train = xgb.dask.DaskDMatrix(client, train_X, train_y)

            # Validation data: dask dataframes persisted on the second worker.
            valid_X = dd.from_array(X)
            valid_X = client.persist(valid_X,
                                     workers={valid_X: second_worker})
            valid_y = dd.from_array(y)
            valid_y = client.persist(valid_y,
                                     workers={valid_y: second_worker})
            valid = xgb.dask.DaskDMatrix(client, valid_X, valid_y)

            merged = xgb.dask._get_workers_from_data(
                train, evals=[(valid, 'Valid')])
            assert len(merged) == 2

예제 #19

0

파일 보기

def test_from_dask_array_compat_numpy_array():
    """2-D input without column names: dask and NumPy paths must agree."""
    dask_ones = da.ones((10, 3), chunks=(3, 3))
    numpy_ones = np.ones((10, 3))

    from_dask = dd.from_dask_array(dask_ones)  # dask
    expected = pd.DataFrame(numpy_ones)
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(numpy_ones)  # numpy
    assert_eq(from_numpy, from_dask)

예제 #20

0

파일 보기

파일: RMS_Cats.py 프로젝트: jmatthewpeters/PyCatSims

def sim_rms_results(rmsdata, sims=100):
    """Simulate catastrophe losses from the RMS event data.

    Steps, as visible in this function:
      1. collect the distinct ``line`` and ``state`` values,
      2. pivot ``rate`` by ``eventid`` and sum it,
      3. build a lookup table of the per-event columns,
      4. simulate events via ``simulate_state_lines_losses`` and join them
         to the lookup table on ``line``/``state``/``eventid``,
      5. add a ``loss`` column via ``calculate_rms_loss`` and set the index.

    Args:
        rmsdata: table with the columns ``line``, ``state``, ``eventid``,
            ``rate``, ``meanvalue``, ``stddevi``, ``stddevc`` and
            ``exposure`` (presumably a pandas DataFrame -- it is passed to
            ``pd.pivot_table``).
        sims: number of simulations, forwarded to
            ``simulate_state_lines_losses``.

    Returns:
        The simulated events joined to the lookup table, with a ``loss``
        column, indexed by line/state/simulation/eventid/eventseq.
    """
    logger = logging.getLogger(__name__)
    logger.info('set lines')
    lines = dd.from_array(rmsdata["line"].unique(), columns=["line"])
    logger.info('set states')
    states = dd.from_array(rmsdata["state"].unique(), columns=["state"])
    logger.info('pivot rms data for events')
    rmsevents = pd.pivot_table(rmsdata, values='rate', index='eventid')
    freq_mean = rmsevents.sum()
    logger.info('create rmslookup table')
    # dd.from_array expects an array, not a dict of arrays, so assemble the
    # columns in pandas first. Using .values gives a fresh default index,
    # and the chunksize matches from_array's documented default.
    lookup_columns = ('line', 'state', 'eventid', 'meanvalue', 'stddevi',
                      'stddevc', 'exposure')
    lookup_frame = pd.DataFrame(
        {column: rmsdata[column].values for column in lookup_columns})
    rmslookup = dd.from_pandas(lookup_frame, chunksize=50000)

    simulated_events = simulate_state_lines_losses(rmsevents, freq_mean,
                                                   states, lines, sims)
    simulated_events = simulated_events.to_dataframe()
    simulated_cats = dd.merge(simulated_events,
                              rmslookup,
                              how='inner',
                              left_on=['line', 'state', 'eventid'],
                              right_on=['line', 'state', 'eventid'])
    simulated_cats['loss'] = calculate_rms_loss(simulated_cats, "Beta")
    logger.info("set index to simulated cats")
    # NOTE(review): dask's set_index may not accept a list of columns --
    # confirm this works for the type returned by dd.merge here.
    simulated_cats = simulated_cats.set_index(
        ['line', 'state', 'simulation', 'eventid', 'eventseq'])

    logger.info("Completed simulating cats")
    return simulated_cats

예제 #21

0

파일 보기

파일: test_dask.py 프로젝트: DanLiBCG/LightGBM

def _create_data(objective, n_samples=100, output='array', chunk_size=50):
    if objective.endswith('classification'):
        if objective == 'binary-classification':
            centers = [[-4, -4], [4, 4]]
        elif objective == 'multiclass-classification':
            centers = [[-4, -4], [4, 4], [-4, 4]]
        else:
            raise ValueError(f"Unknown classification task '{objective}'")
        X, y = make_blobs(n_samples=n_samples,
                          centers=centers,
                          random_state=42)
    elif objective == 'regression':
        X, y = make_regression(n_samples=n_samples, random_state=42)
    else:
        raise ValueError("Unknown objective '%s'" % objective)
    rnd = np.random.RandomState(42)
    weights = rnd.random(X.shape[0]) * 0.01

    if output == 'array':
        dX = da.from_array(X, (chunk_size, X.shape[1]))
        dy = da.from_array(y, chunk_size)
        dw = da.from_array(weights, chunk_size)
    elif output.startswith('dataframe'):
        X_df = pd.DataFrame(
            X, columns=['feature_%d' % i for i in range(X.shape[1])])
        if output == 'dataframe-with-categorical':
            num_cat_cols = 5
            for i in range(num_cat_cols):
                col_name = "cat_col" + str(i)
                cat_values = rnd.choice(['a', 'b'], X.shape[0])
                cat_series = pd.Series(cat_values, dtype='category')
                X_df[col_name] = cat_series
                X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))

            # for the small data sizes used in tests, it's hard to get LGBMRegressor to choose
            # categorical features for splits. So for regression tests with categorical features,
            # _create_data() returns a DataFrame with ONLY categorical features
            if objective == 'regression':
                cat_cols = [
                    col for col in X_df.columns if col.startswith('cat_col')
                ]
                X_df = X_df[cat_cols]
                X = X[:, -num_cat_cols:]
        y_df = pd.Series(y, name='target')
        dX = dd.from_pandas(X_df, chunksize=chunk_size)
        dy = dd.from_pandas(y_df, chunksize=chunk_size)
        dw = dd.from_array(weights, chunksize=chunk_size)
    elif output == 'scipy_csr_matrix':
        dX = da.from_array(X, chunks=(chunk_size,
                                      X.shape[1])).map_blocks(csr_matrix)
        dy = da.from_array(y, chunks=chunk_size)
        dw = da.from_array(weights, chunk_size)
    else:
        raise ValueError("Unknown output type '%s'" % output)

    return X, y, weights, dX, dy, dw

예제 #22

0

파일 보기

파일: xgboost_gpu.py 프로젝트: gmmoliveira/link-prediction

def predict_xgboost_gpu(xgb_model,
                        X,
                        data_chunksize=None,
                        n_gpus=None,
                        n_threads_per_gpu=1,
                        gpu_cluster=None,
                        client=None):
    '''
    Predicts the output for the input features X using the 'xgb_model' running on the GPU.

    :param xgb_model: a dask XGBoost model to use for predictions
    :param X: the input features to use for predictions, must be either a numpy ndarray or a pandas DataFrame
    :param data_chunksize: chunk size to be used on the dask dataframe; the default None divides the rows
        evenly over the cluster's visible CUDA devices (at least one row per chunk)
    :param n_gpus: number of GPUs to be used when a cluster is created here. Default value None is passed
        through to LocalCUDACluster;
    :param n_threads_per_gpu: number of threads per GPU;
    :param gpu_cluster: an existing dask cluster object to use. This param should be used if you call this
        method too many times in quick succession. An externally created cluster is never closed here.
    :param client: an existing dask client object to use. This param should be used if you call this
        method too many times in quick succession. An externally created client is never closed here.
    :return:
        If the input features X is a pandas DataFrame, returns a pandas DataFrame containing
        the predictions;

        Otherwise returns that DataFrame converted with ``to_numpy()``.

    A cluster or client created inside this function is closed even when
    prediction raises.
    '''
    owns_cluster = gpu_cluster is None
    owns_client = client is None

    if owns_cluster:
        local_gpus = LocalCUDACluster(n_workers=n_gpus,
                                      threads_per_worker=n_threads_per_gpu)
    else:
        local_gpus = gpu_cluster

    try:
        local_dask_client = Client(local_gpus) if owns_client else client
        try:
            if data_chunksize is None:
                # Integer division yields 0 when there are fewer rows than
                # devices; a chunk must hold at least one row.
                n_devices = len(local_gpus.cuda_visible_devices)
                data_chunksize = max(1, X.shape[0] // n_devices)

            if isinstance(X, pd.DataFrame):
                ndarray = False
                X = from_pandas(X, chunksize=data_chunksize)
            else:
                ndarray = True
                X = from_array(X, chunksize=data_chunksize)

            y_predicted = dask_xgboost_predict(local_dask_client, xgb_model,
                                               X)
            y_predicted = pd.DataFrame(y_predicted)
        finally:
            # Close only what this function created: client first, then
            # (in the outer finally) the cluster.
            if owns_client:
                local_dask_client.close()
    finally:
        if owns_cluster:
            local_gpus.close()

    if ndarray:
        return y_predicted.to_numpy()
    return y_predicted

예제 #23

0

파일 보기

파일: test_with_dask.py 프로젝트: victorustc/xgboost

def run_empty_dmatrix_cls(client: "Client", parameters: dict) -> None:
    """Train and predict a multi-class model on one- and two-row datasets.

    First a single-row matrix serves as both training and validation data;
    then a two-row matrix is trained while the single-row one is kept for
    evaluation and prediction. ``parameters`` is updated in place with the
    objective, metric and class count.
    """
    n_classes = 4
    n_features = 97

    def _check_outputs(out: xgb.dask.TrainReturnT,
                       predictions: np.ndarray) -> None:
        assert isinstance(out['booster'], xgb.dask.Booster)
        assert len(out['history']['validation']['merror']) == 2
        assert isinstance(predictions, np.ndarray)
        assert predictions.shape[1] == n_classes, predictions.shape

    def _random_dmatrix(n_rows):
        # Random features and integer labels wrapped in a DaskDMatrix.
        features = dd.from_array(np.random.randn(n_rows, n_features))
        labels = dd.from_array(
            np.random.randint(low=0, high=n_classes, size=n_rows))
        return xgb.dask.DaskDMatrix(client, features, labels)

    n_rows = 1
    dtrain = _random_dmatrix(n_rows)
    parameters.update({
        'objective': 'multi:softprob',
        'eval_metric': 'merror',
        'num_class': n_classes,
    })

    out = xgb.dask.train(client,
                         parameters,
                         dtrain=dtrain,
                         evals=[(dtrain, 'validation')],
                         num_boost_round=2)
    lazy_predictions = xgb.dask.predict(client=client, model=out, data=dtrain)
    assert lazy_predictions.shape[1] == n_classes
    _check_outputs(out, lazy_predictions.compute())

    # train has more rows than evals
    valid = dtrain
    n_rows += 1
    dtrain = _random_dmatrix(n_rows)

    out = xgb.dask.train(client,
                         parameters,
                         dtrain=dtrain,
                         evals=[(valid, 'validation')],
                         num_boost_round=2)
    predictions = xgb.dask.predict(client=client, model=out,
                                   data=valid).compute()
    _check_outputs(out, predictions)

예제 #24

0

파일 보기

파일: cne5_factors.py 프로젝트: ljhust/QuAI

 def rstr_dk(self, T=504, L=21, half_life=126):
     """Compute a rolling, half-life-weighted sum of excess log returns.

     For each end position ``t`` from ``T + L`` to ``self.length``, the
     window of ``T`` returns ending ``L`` steps before ``t`` is taken; the
     weighted sum of ``log(1 + r) - log(1 + rft)`` is stored at position
     ``t - 1``. Positions that are never written stay NaN.

     Args:
         T: window length.
         L: lag between the end of the window and the written position.
         half_life: forwarded to ``halflife`` to build the weights.

     Returns:
         The computed result of ``dd.from_array`` holding the values.
     """
     risk_free = 9.85e-5

     # Start from an all-NaN container of the full length.
     nan_values = np.tile(np.nan, self.length)
     rstr = dd.from_array(nan_values).compute()

     # Simple returns: close-to-close difference over the previous close.
     returns = (self.quota_ddf.close.diff()
                / self.quota_ddf.close.shift()).compute()

     for end in np.arange(T + L, self.length + 1):
         window = returns[(end - T - L):(end - L)]
         excess_log = np.log(1 + window.values) - np.log(1 + risk_free)
         rstr[end - 1] = sum(excess_log * halflife(half_life, length=T))
     return rstr

예제 #25

0

파일 보기

def test_from_array_1d_list_of_columns_gives_dataframe():
    """A one-element list for ``columns`` turns 1-D input into a DataFrame."""
    dask_ones = da.ones(10, chunks=3)
    numpy_ones = np.ones(10)

    # passing list via columns results in DataFrame
    from_dask = dd.from_dask_array(dask_ones, columns=["name"])  # dask
    expected = pd.DataFrame(numpy_ones, columns=["name"])
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(numpy_ones, columns=["name"])  # numpy
    assert_eq(from_numpy, from_dask)

예제 #26

0

파일 보기

def test_from_dask_array_compat_numpy_array_1d():
    """1-D input without columns: both constructors match a pandas Series."""
    dask_ones = da.ones(10, chunks=3)
    numpy_ones = np.ones(10)

    from_dask = dd.from_dask_array(dask_ones)  # dask
    expected = pd.Series(numpy_ones)
    assert_eq(from_dask, expected)

    from_numpy = dd.from_array(numpy_ones)  # numpy
    assert_eq(from_numpy, from_dask)

예제 #27

0

파일 보기

    def get_cat_data(self):
        """Return the categorical columns as a new dask dataframe.

        The columns named in ``self.cat_labels`` are selected from
        ``self.dask_df``, converted with ``np.array`` and wrapped again with
        ``dd.from_array`` using a chunk size of 200000 rows.

        Returns:
            A dask dataframe with the categorical columns.
        """
        selected = self.dask_df[self.cat_labels]
        cat_data_array = np.array(selected)
        cat_frame = dd.from_array(cat_data_array,
                                  chunksize=200000,
                                  columns=self.cat_labels)
        return cat_frame

예제 #28

0

파일 보기

def execute_between_time(op, data, lower, upper, **kwargs):
    """Return a boolean mask of rows whose time lies in ``[lower, upper]``.

    The time of day is compared as a string against both bounds
    (inclusive). The matching positions are set to True in an all-False
    dask array, which is returned through ``dd.from_array``.
    """
    # TODO - Can this be done better?
    not_before = data.dt.time.astype(str) >= lower
    not_after = data.dt.time.astype(str) <= upper
    indexer = (not_before & not_after).to_dask_array(True)

    mask = da.zeros(len(data), dtype=np.bool_)
    mask[indexer] = True
    return dd.from_array(mask)

예제 #29

0

파일 보기

    def get_client_id_data(self):
        """Return the 'client_id' column as a one-column dask dataframe.

        The column is taken from ``self.dask_df``, converted with
        ``np.array`` and wrapped with ``dd.from_array`` using a chunk size
        of 200000 rows.

        Returns:
            A dask dataframe with the 'client_id' column.
        """
        client_ids = self.dask_df['client_id']
        client_id_data_array = np.array(client_ids)
        client_id_frame = dd.from_array(client_id_data_array,
                                        chunksize=200000,
                                        columns=['client_id'])
        return client_id_frame

예제 #30

0

파일 보기

파일: selection.py 프로젝트: shubhamjainbb/ibis

def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    """Compute one projected column of ``parent`` from ``data``.

    Plain table columns are read straight from ``data``; when the parent is
    a join and the name is not present, the join suffix of the column's
    root table is appended. Any other expression is executed with a scope
    that maps each root table to ``data`` under its remapped column names.
    A scalar result is repeated to the length of ``data`` and given
    ``data``'s index.

    Raises:
        KeyError: a plain column is missing and the parent is not a join.
        AssertionError: a non-column expression has no name.
    """
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        column = op.name
        output_name = result_name or column

        if column in data:
            return data[column].rename(output_name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(column)

        # The column was renamed by the join: look it up under its suffix.
        (root_table, ) = op.root_tables()
        left_root, right_root = ops.distinct_roots(parent_table_op.left,
                                                   parent_table_op.right)
        suffix_by_root = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        suffixed = column + suffix_by_root[root_table]
        return data.loc[:, suffixed].rename(output_name)

    data_columns = frozenset(data.columns)

    def _scope_for(table):
        # Scope binding one root table to data under remapped column names.
        remapped = remap_overlapping_column_names(parent_table_op, table,
                                                  data_columns)
        return Scope({table: map_new_column_names_to_data(remapped, data)},
                     timecontext)

    scope = scope.merge_scopes(_scope_for(t) for t in op.root_tables())

    result = execute(expr, scope=scope, timecontext=timecontext, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    if not np.isscalar(result):
        return result.rename(result_name)

    # Broadcast a scalar result to a full column aligned with data.
    series = dd.from_array(np.repeat(result, len(data.index)))
    series.name = result_name
    series.index = data.index
    return series

예제 #31

0

파일 보기

파일: hdf5_reader.py 프로젝트: kingshuk00/Zumsehen

def _try_read_hdf(file, key, use_dask):
    """Read ``key`` from an HDF file, falling back to an empty frame.

    With ``use_dask`` truthy the file is read through dask and a
    ``ValueError`` yields an empty dask dataframe; otherwise pandas is used
    and a ``KeyError`` yields an empty pandas DataFrame.
    """
    if not use_dask:
        try:
            return pd.read_hdf(file, key=key)
        except KeyError:
            return pd.DataFrame([])

    try:
        return dd.read_hdf(file, key=key)
    except ValueError:
        return dd.from_array(np.array([[]]))

예제 #32

0

파일 보기

def test_DataFrame_from_dask_array():
    """``from_dask_array`` stays lazy and ``from_array`` re-routes to it."""
    x = da.ones((10, 3), chunks=(4, 2))
    expected = pd.DataFrame(np.ones((10, 3)), columns=list("abc"))

    df = dd.from_dask_array(x, list("abc"))
    last_layer = hlg_layer_topological(df.dask, -1)
    assert not last_layer.is_materialized()
    assert_eq(df, expected)

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=list("abc"))
    last_layer2 = hlg_layer_topological(df2.dask, -1)
    assert not last_layer2.is_materialized()
    assert_eq(df, df2)

예제 #33

0

파일 보기

def simple_example():
    """Fit a logistic regression on generated data and print its outputs.

    The features are wrapped with ``dd.from_dask_array`` and the labels with
    ``dd.from_array``; the model is then fitted and queried on their
    ``.values``.
    """
    features, labels = make_classification(n_samples=10000, n_features=2,
                                           chunks=50)

    X = dd.from_dask_array(features, columns=["a", "b"])
    y = dd.from_array(labels)

    lr = LogisticRegression()
    lr.fit(X.values, y.values)

    predictions = lr.predict(X.values).compute()
    print('Predictions =', predictions)
    probabilities = lr.predict_proba(X.values).compute()
    print('Probabilities =', probabilities)
    score = lr.score(X.values, y.values).compute()
    print('Scores =', score)

예제 #34

0

파일 보기

파일: test_io.py 프로젝트: hgz2373294/dask

def test_DataFrame_from_dask_array():
    """``from_dask_array`` builds the expected frame; ``from_array`` matches.

    Column labels are compared as lists so the assertion has one
    unambiguous truth value whether ``columns`` is a tuple or a pandas
    Index.
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = from_dask_array(x, ["a", "b", "c"])
    assert list(df.columns) == ["a", "b", "c"]
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=["a", "b", "c"])
    assert list(df2.columns) == list(df.columns)
    assert df2.divisions == df.divisions

예제 #35

0

파일 보기

def test_DataFrame_from_dask_array():
    """Check ``from_dask_array`` output and that ``from_array`` agrees.

    The column comparison goes through ``list`` because comparing two
    pandas Index objects with ``==`` gives an element-wise array, which
    cannot be used directly in ``assert``.
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = from_dask_array(x, ['a', 'b', 'c'])
    assert list(df.columns) == ['a', 'b', 'c']
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=['a', 'b', 'c'])
    assert list(df2.columns) == list(df.columns)
    assert df2.divisions == df.divisions

예제 #36

0

파일 보기

파일: test_temporal.py 프로젝트: randxie/ibis

def test_times_ops_with_tz(t, df, tz, rconstruct, column):
    """``between`` on times gives the same result via kwarg or via cast.

    The expected series is built from ``rconstruct(len(df), dtype=bool)``;
    both the ``timezone=`` keyword form and the cast-to-timestamp form must
    compile to a series equal to it.
    """
    expected = dd.from_array(rconstruct(len(df), dtype=bool))

    def _assert_matches_expected(expr):
        result = expr.compile()
        tm.assert_series_equal(result.compute(), expected.compute())

    time = t[column].time()
    _assert_matches_expected(time.between('01:00', '02:00', timezone=tz))

    # Test that casting behavior is the same as using the timezone kwarg
    ts = t[column].cast(dt.Timestamp(timezone=tz))
    _assert_matches_expected(ts.time().between('01:00', '02:00'))

예제 #37

0

파일 보기

def test_from_dask_array_compat_numpy_array():
    """2-D and 3-D input: ``from_dask_array`` and ``from_array`` must agree.

    Checks that 3-D input and a wrong number of column names raise
    ``ValueError`` on both paths, and that 2-D input gives a DataFrame with
    default or explicit column names. Each result built from the NumPy
    array is type-checked itself, not the earlier dask-array result.
    """
    x = da.ones((3, 3, 3), chunks=2)

    with pytest.raises(ValueError):
        dd.from_dask_array(x)       # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute())  # numpy

    x = da.ones((10, 3), chunks=(3, 3))
    d1 = dd.from_dask_array(x)       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index([0, 1, 2]))

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index([0, 1, 2]))

    # One name for three columns is rejected on both paths.
    with pytest.raises(ValueError):
        dd.from_dask_array(x, columns=['a'])       # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute(), columns=['a'])  # numpy

    d1 = dd.from_dask_array(x, columns=['a', 'b', 'c'])       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(['a', 'b', 'c']))

    d2 = dd.from_array(x.compute(), columns=['a', 'b', 'c'])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(['a', 'b', 'c']))

예제 #38

0

파일 보기

파일: test_io.py 프로젝트: floriango/dask

def test_from_dask_array_compat_numpy_array():
    """Compare ``from_dask_array`` with ``from_array`` on 2-D and 3-D input.

    Both constructors must reject 3-D arrays and mismatched column lists
    with ``ValueError`` and must produce equivalent DataFrames for 2-D
    input. The type assertions after each ``from_array`` call inspect that
    call's own result (``d2``).
    """
    x = da.ones((3, 3, 3), chunks=2)

    with pytest.raises(ValueError):
        dd.from_dask_array(x)       # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute())  # numpy

    x = da.ones((10, 3), chunks=(3, 3))
    d1 = dd.from_dask_array(x)       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index([0, 1, 2]))

    d2 = dd.from_array(x.compute())  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index([0, 1, 2]))

    # A column list of the wrong length must fail on both paths.
    with pytest.raises(ValueError):
        dd.from_dask_array(x, columns=['a'])       # dask

    with pytest.raises(ValueError):
        dd.from_array(x.compute(), columns=['a'])  # numpy

    d1 = dd.from_dask_array(x, columns=['a', 'b', 'c'])       # dask
    assert isinstance(d1, dd.DataFrame)
    assert (d1.compute().values == x.compute()).all()
    tm.assert_index_equal(d1.columns, pd.Index(['a', 'b', 'c']))

    d2 = dd.from_array(x.compute(), columns=['a', 'b', 'c'])  # numpy
    assert isinstance(d2, dd.DataFrame)
    assert (d2.compute().values == x.compute()).all()
    tm.assert_index_equal(d2.columns, pd.Index(['a', 'b', 'c']))

예제 #39

0

파일 보기

def log_group_agg(ddf):
    """Aggregate ``ddf`` per ``msno`` and return the result with flat column names.

    ``date`` is summarised with min/max/count; every other listed column with
    min/max/mean/std/sum.  The aggregated values are rebuilt into a dask
    dataframe via ``dd.from_array`` whose columns are named
    ``<statistic>_<column>``, and the group key is attached as an ``msno``
    column.
    """
    five_stats = ['min', 'max', 'mean', 'std', 'sum']
    measured_columns = [
        'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq',
        'total_secs',
        'skip_ratio', 'skip25_ratio', 'skip50_ratio', 'skip75_ratio',
        'skip985_ratio',
        'num100_ratio', 'unq_ratio', 'unq_secs_ratio', 'num100_secs_ratio',
        'skip_secs_ratio', 'skip25_secs_ratio', 'skip50_secs_ratio',
        'skip75_secs_ratio', 'skip985_secs_ratio',
        'daily_listening_ratio',
    ]

    # One ordered (column, statistics) list drives both the aggregation spec
    # and the flat output names, so the two always line up.
    plan = [('date', ['min', 'max', 'count'])]
    for column in measured_columns:
        plan.append((column, list(five_stats)))

    agg_spec = dict(plan)
    flat_names = [stat + '_' + column for column, stats in plan for stat in stats]

    per_user = ddf.groupby(by="msno").agg(agg_spec)
    flattened = dd.from_array(per_user.values, chunksize=25, columns=flat_names)
    flattened['msno'] = per_user.index
    return flattened

예제 #40

0

파일 보기

    def testFromDaskDfArray(self):
        """Build a matrix from a dask DataFrame (features) and a dask array (labels).

        Skipped when ``DASK_INSTALLED`` is false.
        """
        from xgboost_ray.data_sources.dask import DASK_INSTALLED
        if not DASK_INSTALLED:
            self.skipTest("Dask not installed.")
            return

        # Imported lazily so the module can be loaded without dask.
        import dask.array as da
        import dask.dataframe as dd

        features = dd.from_array(self.x)
        labels = da.from_array(self.y)

        self._testMatrixCreation(features, labels, distributed=False)

예제 #41

0

파일 보기

파일: test_io.py 프로젝트: floriango/dask

def test_DataFrame_from_dask_array():
    """Check DataFrame construction from a 2-D dask array.

    Verifies type, column labels, divisions and values for
    ``dd.from_dask_array``, then that ``dd.from_array`` given the same dask
    array produces a DataFrame with matching columns and divisions.
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = dd.from_dask_array(x, ['a', 'b', 'c'])
    assert isinstance(df, dd.DataFrame)
    tm.assert_index_equal(df.columns, pd.Index(['a', 'b', 'c']))
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(scheduler='sync').values == x.compute(scheduler='sync')).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=['a', 'b', 'c'])
    # Check df2 here: the original re-checked df, so the re-routed result's
    # type was never verified.
    assert isinstance(df2, dd.DataFrame)
    tm.assert_index_equal(df2.columns, df.columns)
    assert df2.divisions == df.divisions

예제 #42

0

파일 보기

파일: test_io.py 프로젝트: jrenner/dask

def test_DataFrame_from_dask_array():
    """Check DataFrame construction from a 2-D dask array.

    Verifies type, columns, divisions and values for ``from_dask_array``, then
    that ``dd.from_array`` given the same dask array produces a DataFrame with
    matching columns and divisions.
    """
    x = da.ones((10, 3), chunks=(4, 2))

    df = from_dask_array(x, ['a', 'b', 'c'])
    assert isinstance(df, dd.DataFrame)
    assert df.columns == ('a', 'b', 'c')
    assert list(df.divisions) == [0, 4, 8, 9]
    assert (df.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    # dd.from_array should re-route to from_dask_array
    df2 = dd.from_array(x, columns=['a', 'b', 'c'])
    # Check df2 here: the original re-checked df, so the re-routed result's
    # type was never verified.
    assert isinstance(df2, dd.DataFrame)
    assert df2.columns == df.columns
    assert df2.divisions == df.divisions

예제 #43

0

파일 보기

파일: test_io.py 프로젝트: hgz2373294/dask

def test_Series_from_dask_array():
    """Check Series construction from a 1-D dask array.

    A named Series must carry its name, the expected divisions and the source
    values; an unnamed one must have ``name is None``; ``dd.from_array`` on the
    same dask array must compare equal to the unnamed Series.
    """
    source = da.ones(10, chunks=4)

    named = from_dask_array(source, "a")
    assert named.name == "a"
    assert list(named.divisions) == [0, 4, 8, 9]
    series_values = named.compute(get=get_sync).values
    array_values = source.compute(get=get_sync)
    assert (series_values == array_values).all()

    unnamed = from_dask_array(source)
    assert unnamed.name is None

    # dd.from_array should re-route to from_dask_array
    rerouted = dd.from_array(source)
    assert eq(unnamed, rerouted)

예제 #44

0

파일 보기

파일: test_io.py 프로젝트: floriango/dask

def test_Series_from_dask_array():
    """Check Series construction from a 1-D dask array.

    A named Series must have the right type, name, divisions and values; an
    unnamed one must have ``name is None``; ``dd.from_array`` on the same dask
    array must yield a Series equal to the unnamed one.
    """
    source = da.ones(10, chunks=4)

    named = dd.from_dask_array(source, 'a')
    assert isinstance(named, dd.Series)
    assert named.name == 'a'
    assert list(named.divisions) == [0, 4, 8, 9]
    series_values = named.compute(scheduler='sync').values
    array_values = source.compute(scheduler='sync')
    assert (series_values == array_values).all()

    unnamed = dd.from_dask_array(source)
    assert isinstance(unnamed, dd.Series)
    assert unnamed.name is None

    # dd.from_array should re-route to from_dask_array
    rerouted = dd.from_array(source)
    assert isinstance(rerouted, dd.Series)
    assert_eq(unnamed, rerouted)

예제 #45

0

파일 보기

파일: test_io.py 프로젝트: floriango/dask

def test_meta_from_array():
    """Check ``_meta_from_array`` metadata and ``dd.from_array`` partitioning.

    The metadata must be a pandas DataFrame that keeps the array dtype per
    column and uses either default integer labels or the supplied names; a
    column list of the wrong length must raise ``ValueError``.  Finally, a
    201-row array split with ``chunksize=50`` must report six divisions.
    """
    int_array = np.array([[1, 2], [3, 4]], dtype=np.int64)
    meta = _meta_from_array(int_array)
    assert isinstance(meta, pd.DataFrame)
    for label in (0, 1):
        assert meta[label].dtype == np.int64
    tm.assert_index_equal(meta.columns, pd.Index([0, 1]))

    float_array = np.array([[1., 2.], [3., 4.]], dtype=np.float64)
    meta = _meta_from_array(float_array, columns=['a', 'b'])
    assert isinstance(meta, pd.DataFrame)
    for label in ('a', 'b'):
        assert meta[label].dtype == np.float64
    tm.assert_index_equal(meta.columns, pd.Index(['a', 'b']))

    with pytest.raises(ValueError):
        _meta_from_array(float_array, columns=['a', 'b', 'c'])

    np.random.seed(42)
    samples = np.random.rand(201, 2)
    frame = dd.from_array(samples, chunksize=50, columns=['a', 'b'])
    # Should be 5 partitions and the end
    assert len(frame.divisions) == 6

예제 #46

0

파일 보기

파일: zgroupfract.py 프로젝트: wolfiex/DSMACC-testing


# Script fragment (Python 2 syntax: note the print statement below).
# Walks every `setsgroup`-th HDF5 group in `fl` and builds dask dataframes for
# its 'spec' and 'flux' datasets.  `flist`, `fl`, `setsgroup` and `run` are
# defined outside this fragment.

# NOTE(review): `[[]]*n` repeats one and the same inner list object n times;
# appending to data[i] in place would show up in every slot.
data = [[]]*len(flist)
bar = progressbar.ProgressBar()

with h5py.File(fl,'r') as hf:
        # Keep only the (name, object) items whose object is exactly an h5py Group.
        groups = list(filter(lambda x: type(x[1])==h5py._hl.group.Group, hf.items()))
        print 'every '+str(setsgroup)+' groups'
        for gr in bar(groups[::setsgroup]):

            # gr is a (name, group) pair; only the group object is used.
            g=gr[1]
            # NOTE(review): shead/fhead are assigned only when `run` is truthy,
            # yet they are used unconditionally just below -- presumably `run`
            # is truthy on the first pass or the names exist already; confirm.
            if run:
                shead = g.attrs['spechead'].split(',')
                fhead = g.attrs['fluxhead'].split(',')

            # Row 0 of each dataset is dropped before wrapping it in dask.
            spec = dd.from_array(g.get('spec')[1:,:],chunksize=50000, columns = shead)
            flux = dd.from_array(g.get('flux')[1:,:],chunksize=50000,columns = fhead)



            if run:

                #spec = spec.set_index('TIME', sorted=True)
                # Scale every spec column by the mean of the M column.
                M =  spec.M.mean()
                spec = spec/M
                #flux = flux.set_index('TIME', sorted=True)

                shead = g.attrs['spechead'].split(',')
                # Unsplit flux header string, scanned by the regex below.
                fcol = g.attrs['fluxhead']

                # Capture the text after each '-->' and split it on '+'.
                # NOTE(review): the class [A-z] also matches '[', '\', ']', '^',
                # '_' and '`'; unclear whether that is intended -- confirm
                # before tightening it.
                products = [i.split('+') for i in re.findall(r'-->([A-z0-9+]*)',fcol)]

예제 #47

0

파일 보기

파일: zhdf.py 프로젝트: wolfiex/DSMACC-testing

    def __init__(self,h5file,groupid=False):
        '''
        Load one group of an HDF5 results file into dask dataframes.

        h5file  - path of the HDF5 file to read
        groupid - position of the group to load; any falsy value (the
                  default) selects the first group

        The spec, vdot, jacsp, rate and flux datasets (minus their first row)
        are exposed as self.spec, self.vdot, self.jacsp, self.rate and
        self.flux, each indexed by TIME.  self.spec is divided by the mean of
        its M column.  self.prodloss maps every spec column name to the
        indices of the reactions parsed from the flux header in which it
        appears as a reactant ('loss') or a product ('prod').
        '''

        self.origin = h5file
        with h5py.File(h5file,'r') as hf:
            # Keep only the (name, object) items whose object is exactly an h5py Group.
            groups = list(filter(lambda x: type(x[1])==h5py._hl.group.Group, hf.items()))
            self.groups = dict([[i[0],j] for j,i in enumerate(groups)])
            # Materialise the keys while the file is open; a lazy view would
            # be unusable once the file has been closed.
            self.groupkeys = list(groups[0][1].attrs.keys())
            self.flux=False

            # Take name and group from the SAME entry.  The original always
            # reported the first group's name, even when groupid selected
            # another group.
            selected = int(groupid) if groupid else 0
            self.groupname, g = groups[selected]
            self.wall= g.attrs['wall']

            shead = g.attrs['spechead'].split(',')
            rhead = g.attrs['ratehead'].split(',')
            fhead = g.attrs['fluxhead'].split(',')
            vhead = g.attrs['vdothead'].split(',')
            jhead = g.attrs['jacsphead'].split(',')

            # Row 0 of every dataset is dropped before wrapping it in dask.
            spec = dd.from_array(g.get('spec')[1:,:],chunksize=50000, columns = shead)
            vdot = dd.from_array(g.get('vdot')[1:,:],chunksize=50000, columns = vhead)
            jacsp = dd.from_array(g.get('jacsp')[1:,:],chunksize=50000, columns = jhead)

            # Duplicate column names are merged by summing the columns that
            # share a name; this goes through pandas.
            if len(rhead) != len(set(rhead)):
                print('Duplicates detected, please parse mecnahisms in future to prevent this')
                rate = pd.DataFrame(g.get('rate')[1:,:],columns=rhead)
                rate = rate.groupby(rate.columns, axis=1).sum()

                rhead = rate.columns
                rate = dd.from_pandas(rate,chunksize=50000)
            else:
                rate = dd.from_array(g.get('rate')[1:,:],chunksize=50000,columns = rhead)

            print(g.get('flux').shape)
            print(len(fhead))

            if len(fhead) != len(set(fhead)):
                flux = pd.DataFrame(g.get('flux')[1:,:],columns=fhead)
                flux = flux.groupby(flux.columns, axis=1).sum()

                fhead = flux.columns
                flux = dd.from_pandas(flux,chunksize=50000)
            else:
                flux = dd.from_array(g.get('flux')[1:,:],chunksize=50000,columns = fhead)

            # One shared datetime TIME column, computed once from spec, is
            # written into every frame so they can all be indexed alike.
            self.timesteps = spec['TIME'].astype('M8[s]').compute()
            spec['TIME'] = vdot['TIME'] = rate['TIME'] = flux['TIME'] = jacsp['TIME'] = self.timesteps
            self.ts= np.array(self.timesteps)

            spec = spec.set_index('TIME', sorted=True)
            self.M =  spec.M.mean()
            self.spec = spec/self.M
            self.vdot = vdot.set_index('TIME', sorted=True)
            self.jacsp=jacsp.set_index('TIME', sorted=True)
            self.rate = rate.set_index('TIME', sorted=True)
            self.flux = flux.set_index('TIME', sorted=True)

            # The text on either side of each '-->' in the joined flux header
            # is split on '+' to get reactant and product lists.
            fcol = ','.join(fhead)

            self.products = [i.split('+') for i in re.findall(r'-->([A-z0-9+]*)',fcol)]
            self.reactants = np.array([j.split('+') for j in re.findall(r'([A-z0-9+]{1,60})-->',fcol)])
            # NOTE: loading of an adjacency matrix (g['adj'], attrs 'adjspec'
            # and 'adjts') was disabled in the original and stays disabled.

            # No explicit hf.close(): the with-block closes the file on exit.

            self.prodloss = {k: {'loss':[],'prod':[]} for k in shead}
            ### reaction prodloss arrays
            for idx in range(len(self.reactants)):
                for i in self.reactants[idx]:
                    # Names that are not spec columns are skipped on purpose.
                    try:
                        self.prodloss[i]['loss'].append(idx)
                    except KeyError:
                        pass
                for i in self.products[idx]:
                    try:
                        self.prodloss[i]['prod'].append(idx)
                    except KeyError:
                        pass

예제 #48

0

파일 보기

파일: zhdf.py 프로젝트: wolfiex/DSMACC-testing

    def __init__(self, h5file, groupid=False,selection = 'spec,rate,flux,vdot,jacsp'.split(','),
        prodloss = True,ts = 600):

        '''
        Load one group of an HDF5 results file into dask dataframes.

        h5file     - filename
        groupid    - select a specific group entry: an int is a position, a
                     str is a group name, anything else selects the first group
        selections - which data sections to provide in the class ('spec' is
                     always loaded because it supplies the TIME index)
        prodloss   - create rxn/ropa dictionaries
        ts         - divisor applied to the maximum of the SPINUP column to
                     get the index of the spin-up timestep (presumably the
                     output interval in seconds -- TODO confirm)

        '''

        def _text(value):
            # String attributes may arrive as bytes or already as str; calling
            # .decode() on a str would raise AttributeError.
            return value.decode("utf-8") if isinstance(value, bytes) else value

        self.origin = h5file
        # Store a copy so that mutating self.selection can never alter the
        # shared default list in the signature.
        self.selection = list(selection) if isinstance(selection, list) else selection
        with h5py.File(h5file,'r') as hf:

            # Keep only the (name, object) items whose object is exactly an h5py Group.
            groups = list(filter(lambda x: type(x[1])==h5py._hl.group.Group, hf.items()))
            self.groups = dict([[i[0],j] for j,i in enumerate(groups)])
            # Materialise the keys while the file is open; a lazy view would
            # be unusable once the file has been closed.
            self.groupkeys = list(groups[0][1].attrs.keys())
            self.flux=False

            # bool is excluded so the default False still means "first group",
            # exactly as the original type(groupid) == int check behaved.
            if isinstance(groupid, int) and not isinstance(groupid, bool):
                entry = groups[groupid]
            elif isinstance(groupid, str):
                entry = groups[self.groups[groupid]]
            else:
                entry = groups[0]

            self.groupname, g = entry
            self.wall= g.attrs['wall']

            # spec MUST always be included: every other section reuses the
            # TIME column computed from it.
            shead = _text(g.attrs['spechead']).split(',')
            spec = dd.from_array(g.get('spec')[:,:],chunksize=50000, columns = shead)
            self.timesteps = spec['TIME'].compute().astype('M8[s]')

            self.ts= np.array(self.timesteps)
            spec['TIME'] = self.timesteps
            spec = spec.set_index('TIME', sorted=True)
            self.spinup= self.ts[int( (spec.SPINUP.max()/ts).compute() ) ]
            self.M =  spec.M.mean()
            self.spec = spec/self.M

            # Read unconditionally: the prodloss step needs the flux header
            # even when 'flux' itself is not selected.
            fhead = _text(g.attrs['fluxhead']).split(',')

            if 'rate' in selection:
                rhead = _text(g.attrs['ratehead']).split(',')
                # Duplicate column names are merged by summing the columns
                # that share a name; this goes through pandas.
                if len(rhead) != len(set(rhead)):
                    print ('Duplicates detected, please parse mecnahisms in future to prevent this')
                    rate = pd.DataFrame(g.get('rate')[:,:],columns=rhead)
                    rate = rate.groupby(rate.columns, axis=1).sum()

                    rhead = rate.columns
                    rate = dd.from_pandas(rate,chunksize=50000)
                else:
                    rate = dd.from_array(g.get('rate')[:,:],chunksize=50000,columns = rhead)

                rate['TIME'] = self.timesteps
                self.rate = rate.set_index('TIME', sorted=True)

            if 'flux' in selection:

                if len(fhead) != len(set(fhead)):
                    flux = pd.DataFrame(g.get('flux')[:,:],columns=fhead)
                    flux = flux.groupby(flux.columns, axis=1).sum()

                    fhead = flux.columns
                    flux = dd.from_pandas(flux,chunksize=50000)
                else:
                    flux = dd.from_array(g.get('flux')[:,:],chunksize=50000,columns = fhead)
                flux['TIME'] = self.timesteps
                self.flux = flux.set_index('TIME', sorted=True)

            if 'vdot' in selection:
                vhead = _text(g.attrs['vdothead']).split(',')
                vdot = dd.from_array(g.get('vdot')[:,:],chunksize=50000, columns = vhead)
                #vdot*=-1 # convert such that -ve values suggest flux leaving the species.
                vdot['TIME'] = self.timesteps
                self.vdot = vdot.set_index('TIME', sorted=True)

            if 'jacsp' in selection:
                jhead = _text(g.attrs['jacsphead']).split(',')
                jacsp = dd.from_array(g.get('jacsp')[:,:],chunksize=50000, columns = jhead)
                jacsp['TIME'] = self.timesteps
                self.jacsp=jacsp.set_index('TIME', sorted=True)

            # No explicit hf.close(): the with-block closes the file on exit.

            if prodloss:
                # The text on either side of each '-->' in the joined flux
                # header is split on '+' to get reactant and product lists.
                fcol = ','.join(fhead)
                self.products = [i.split('+') for i in re.findall(r'-->([A-z0-9+]*)',fcol)]
                self.reactants = np.array([j.split('+') for j in re.findall(r'([A-z0-9+]{1,60})-->',fcol)])

                self.prodloss = {k: {'loss':[],'prod':[]} for k in shead}
                ### reaction prodloss arrays
                for idx in range(len(self.reactants)):
                    for i in self.reactants[idx]:
                        # Names that are not spec columns are skipped on purpose.
                        try:
                            self.prodloss[i]['loss'].append(idx)
                        except KeyError:
                            pass
                    for i in self.products[idx]:
                        try:
                            self.prodloss[i]['prod'].append(idx)
                        except KeyError:
                            pass