Example #1
File: test_multi.py Project: maxhutch/dask
def test_concat(join):
    pdf1 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[1, 2, 3, 4, 6, 7])
    ddf1 = dd.from_pandas(pdf1, 2)
    pdf2 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf2 = dd.from_pandas(pdf2, 2)

    # different columns
    pdf3 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'z': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf3 = dd.from_pandas(pdf3, 2)

    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3)]:
        result = dd.concat([dd1, dd2], join=join)
        expected = pd.concat([pd1, pd2], join=join)
        assert eq(result, expected)

    # test outer only, inner has a problem on pandas side
    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z)]:
        result = dd.concat([dd1, dd2])
        expected = pd.concat([pd1, pd2])
        assert eq(result, expected)
Example #2
def test_concat3():
    pdf1 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCDE'), index=list('abcdef'))
    pdf2 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCFG'), index=list('ghijkl'))
    pdf3 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCHI'), index=list('mnopqr'))
    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    result = dd.concat([ddf1, ddf2])
    assert result.divisions == ddf1.divisions[:-1] + ddf2.divisions
    assert result.npartitions == ddf1.npartitions + ddf2.npartitions
    assert_eq(result, pd.concat([pdf1, pdf2]))

    assert_eq(dd.concat([ddf1, ddf2], interleave_partitions=True),
              pd.concat([pdf1, pdf2]))

    result = dd.concat([ddf1, ddf2, ddf3])
    assert result.divisions == (ddf1.divisions[:-1] + ddf2.divisions[:-1] +
                                ddf3.divisions)
    assert result.npartitions == (ddf1.npartitions + ddf2.npartitions +
                                  ddf3.npartitions)
    assert_eq(result, pd.concat([pdf1, pdf2, pdf3]))

    assert_eq(dd.concat([ddf1, ddf2, ddf3], interleave_partitions=True),
              pd.concat([pdf1, pdf2, pdf3]))
Example #3
File: test_multi.py Project: sinhrks/dask
def test_concat_unknown_divisions_errors():
    a = pd.Series([1, 2, 3, 4, 5, 6])
    b = pd.Series([4, 3, 2, 1])
    aa = dd.from_pandas(a, npartitions=2, sort=False)
    bb = dd.from_pandas(b, npartitions=2, sort=False)

    with pytest.raises(ValueError):
        dd.concat([aa, bb], axis=1).compute()
Example #4
def test_concat_one_series():
    a = pd.Series([1, 2, 3, 4])
    aa = dd.from_pandas(a, npartitions=2, sort=False)

    c = dd.concat([aa], axis=0)
    assert isinstance(c, dd.Series)

    c = dd.concat([aa], axis=1)
    assert isinstance(c, dd.DataFrame)
Example #5
def test_concat_unknown_divisions():
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([4, 3, 2, 1])
    aa = dd.from_pandas(a, npartitions=2, sort=False)
    bb = dd.from_pandas(b, npartitions=2, sort=False)

    assert not aa.known_divisions

    assert eq(pd.concat([a, b], axis=1), dd.concat([aa, bb], axis=1))

    cc = dd.from_pandas(b, npartitions=1, sort=False)
    with pytest.raises(ValueError):
        dd.concat([aa, cc], axis=1)
Example #6
def test_concat5():
    pdf1 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('ABCDE'), index=list('abcdefg'))
    pdf2 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list('FGHIJK'), index=list('abcdefg'))
    pdf3 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list('FGHIJK'), index=list('cdefghi'))
    pdf4 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('FGHAB'), index=list('cdefghi'))
    pdf5 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('FGHAB'), index=list('fklmnop'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)
    ddf4 = dd.from_pandas(pdf4, 2)
    ddf5 = dd.from_pandas(pdf5, 3)

    cases = [[ddf1, ddf2], [ddf1, ddf3], [ddf1, ddf4], [ddf1, ddf5],
             [ddf3, ddf4], [ddf3, ddf5], [ddf5, ddf1, ddf4], [ddf5, ddf3],
             [ddf1.A, ddf4.A], [ddf2.F, ddf3.F], [ddf4.A, ddf5.A],
             [ddf1.A, ddf4.F], [ddf2.F, ddf3.H], [ddf4.A, ddf5.B],
             [ddf1, ddf4.A], [ddf3.F, ddf2], [ddf5, ddf1.A, ddf2]]

    for case in cases:
        pdcase = [c.compute() for c in case]

        with pytest.warns(None):
            # some cases will raise warning directly from pandas
            assert_eq(dd.concat(case, interleave_partitions=True),
                      pd.concat(pdcase))

        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

        assert_eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))

        assert_eq(dd.concat(case, axis=1, join='inner'),
                  pd.concat(pdcase, axis=1, join='inner'))

    # Dask + pandas
    cases = [[ddf1, pdf2], [ddf1, pdf3], [pdf1, ddf4],
             [pdf1.A, ddf4.A], [ddf2.F, pdf3.F],
             [ddf1, pdf4.A], [ddf3.F, pdf2], [ddf2, pdf1, ddf3.F]]

    for case in cases:
        pdcase = [c.compute() if isinstance(c, _Frame) else c for c in case]

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))

        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

        assert_eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))

        assert_eq(dd.concat(case, axis=1, join='inner'),
                  pd.concat(pdcase, axis=1, join='inner'))
Example #7
def test_union_with_list_types(t, df, distinct):
    expr = t.union(t, distinct=distinct)
    result = expr.compile()
    expected = (
        df if distinct else dd.concat([df, df], axis=0, ignore_index=True)
    )
    tm.assert_frame_equal(result.compute(), expected.compute())
Example #8
 def predict_price(total_amount, trip_distance, passenger_count):
     # Create a dataframe out of the three columns
     # and pass it to dask-xgboost, to predict
     # distributed
     X = dd.concat([total_amount, trip_distance, passenger_count],
                   axis=1).astype("float64")
     return dask_xgboost.predict(client, bst, X)
Example #9
def _merge_xyz(xyz_data_frames: list):
    """Try to merge xyz data frames."""
    try:
        df = dd.concat(xyz_data_frames, axis=0)
        return df
    except Exception as e:
        raise (e)
Example #10
    def run(self):
        dsk = None
        if ParquetTarget(
            env_workaround().return_env("local_location")
            + "rates/"
            + self.instrument
            + "/"
        ).exists():
            input_target = next(iter(self.input()))
            dsk = input_target.read()

        df = self.fetch()

        if dsk is not None:
            dsk2 = dd.from_pandas(df, chunksize=10000)
            dsk = dd.concat([dsk, dsk2])
            dsk = dsk.drop_duplicates()

        else:
            dsk = dd.from_pandas(df, chunksize=10000)

        self.output().write(dsk)

        if self.storage == "s3":
            self.s3output().write(dsk)
Example #11
    def to_dc(
        cls,
        input_item: InputType,
        table_name: str,
        format: str = None,
        persist: bool = True,
        **kwargs,
    ) -> DataContainer:
        """
        Turn possible input descriptions or formats (e.g. dask dataframes, pandas dataframes,
        locations as string, hive tables) into the loaded data containers,
        maybe persist them to cluster memory before.
        """
        filled_get_dask_dataframe = lambda *args: cls._get_dask_dataframe(
            *args, table_name=table_name, format=format, **kwargs,
        )

        if isinstance(input_item, list):
            table = dd.concat([filled_get_dask_dataframe(item) for item in input_item])
        else:
            table = filled_get_dask_dataframe(input_item)

        if persist:
            table = table.persist()

        return DataContainer(table.copy(), ColumnContainer(table.columns))
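A minimal, self-contained sketch of the dispatch above (the `load_one` helper, its in-memory inputs, and the `persist` flag are stand-ins for `cls._get_dask_dataframe` and its file/table inputs; the DataContainer wrapping is omitted):

import pandas as pd
import dask.dataframe as dd

def load_one(item):
    # hypothetical loader standing in for cls._get_dask_dataframe
    return dd.from_pandas(pd.DataFrame(item), npartitions=1, sort=False)

def to_table(input_item, persist=True):
    # a list of inputs is loaded item by item and concatenated row-wise,
    # a single input is loaded directly
    if isinstance(input_item, list):
        table = dd.concat([load_one(item) for item in input_item])
    else:
        table = load_one(input_item)
    return table.persist() if persist else table

print(to_table([{"x": [1, 2]}, {"x": [3, 4]}]).compute())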
Example #12
 def _load_table(self, table):
     df = None
     for name, filepath in self._named_files.items():
         filepath, ext = filepath
         if '://' not in filepath:
             filepath = os.path.join(self.root, filepath)
         if name != table:
             continue
         load_fn, kwargs = self._load_fn(ext)
         paths = self._resolve_template_vars(filepath)
         if self.use_dask and ext in ('csv', 'json', 'parquet', 'parq'):
             df = load_fn(paths, **kwargs)
         else:
             dfs = [load_fn(path, **kwargs) for path in paths]
             if len(dfs) <= 1:
                 df = dfs[0] if dfs else None
             elif self.use_dask and hasattr(dfs[0], 'compute'):
                 import dask.dataframe as dd
                 df = dd.concat(dfs)
             else:
                 df = pd.concat(dfs)
         if hasattr(df, 'persist'):
             df = df.persist()
     if df is None:
         tables = list(self._named_files)
         raise ValueError(
             f"Table '{table}' not found. Available tables include: {tables}."
         )
     return df
Example #13
File: util.py Project: CGe0516/ibis
def safe_concat(dfs: List[Union[dd.Series, dd.DataFrame]]) -> dd.DataFrame:
    """
    Concat a list of `dd.Series` or `dd.DataFrame` objects into one DataFrame

    This will use `DataFrame.concat` if all pieces are the same length.
    Otherwise we will iteratively join.

    When axis=1 and divisions are unknown, Dask `DataFrame.concat` can only
    operate on objects with equal lengths, otherwise it will raise a
    ValueError in `concat_and_check`.

    See https://github.com/dask/dask/blob/2c2e837674895cafdb0612be81250ef2657d947e/dask/dataframe/multi.py#L907 # noqa

    Note - this is likely to be quite slow, but it should be hit rarely in
    real usage. A situation that triggers this slow path is aggregations
    that return different numbers of rows (see
    `test_aggregation_group_by` for a specific example).

    TODO - performance.
    """
    lengths = list(map(len, dfs))
    if len(set(lengths)) != 1:
        result = dfs[0].to_frame()

        for other in dfs[1:]:
            result = result.join(other.to_frame(), how="outer")
    else:
        result = dd.concat(dfs, axis=1)

    return result
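A small usage sketch of the fallback described in the docstring, assuming the `safe_concat` above is in scope (the series contents are made up): unequal lengths take the iterative outer-join path, equal lengths go through a single `dd.concat(..., axis=1)`.

import pandas as pd
import dask.dataframe as dd

a = dd.from_pandas(pd.Series([1, 2, 3], name="a"), npartitions=1)
b = dd.from_pandas(pd.Series([4.0, 5.0], name="b"), npartitions=1)
c = dd.from_pandas(pd.Series([7, 8, 9], name="c"), npartitions=1)

# unequal lengths -> iterative outer join; missing cells become NaN
print(safe_concat([a, b]).compute())

# equal lengths -> plain dd.concat along axis=1
print(safe_concat([a, c]).compute())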
Example #14
def from_excel(path: Union[str, List[str]], **params) -> dd.DataFrame:
    """Creates a `dask.dataframe.DataFrame` from one or several excel files.
    Includes a "path column".

    Parameters
    ----------
    path
        Path to files
    params
        Extra arguments passed on to `pandas.read_excel`

    Returns
    -------
    df
        A `dask.dataframe.DataFrame`
    """
    path_list = _get_file_paths(path)

    dds = []
    for path_name in path_list:
        parts = delayed(pd.read_excel)(path_name, **params)
        data_frame = dd.from_delayed(parts).fillna("")
        data_frame[PATH_COLUMN_NAME] = path_name
        dds.append(data_frame)

    return dd.concat(dds)
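The same "path column" pattern as a runnable sketch that stays in memory (the `sources` dict and the `path` column name are stand-ins for `pd.read_excel` results and `PATH_COLUMN_NAME`): each frame is tagged with its source before the final `dd.concat`.

import pandas as pd
import dask.dataframe as dd

sources = {"one.xlsx": pd.DataFrame({"x": [1, 2]}),
           "two.xlsx": pd.DataFrame({"x": [3, 4]})}

dds = []
for path_name, frame in sources.items():
    ddf = dd.from_pandas(frame, npartitions=1, sort=False)
    ddf["path"] = path_name  # tag every row with its source file
    dds.append(ddf)

print(dd.concat(dds).compute())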
Example #15
    def cv_cat_mapping(X_mat, cat_feat_cols=cat_feat_cols, orig_colnames=orig_colnames):
        """Map the OHE categorical variables to their counts within CV.
        """
        X_df = pd.DataFrame(X_mat) # Make matrix into pd.df
        X_df.columns = orig_colnames

        # Create indices for each OHE cat variable
        cat_feat_1 = cat_feat_cols[0:2]
        cat_feat_2 = cat_feat_cols[2:5]
        cat_feat_3 = cat_feat_cols[5:7]
        cat_feat_4 = cat_feat_cols[7:19]
        cat_feat_col_list = [cat_feat_1, cat_feat_2, cat_feat_3, cat_feat_4]

        cat_mapping_list = [] # Map each OHE variable to their counts
        for cat_idx in range(len(cat_feat_col_list)):
            # Apply to X_df
            ohe_var = X_df.loc[:,cat_feat_col_list[cat_idx]]
            mapping_fun = mapping_fun_list[cat_idx]
            cat_mapping_list.append(ohe_var.apply(lambda x: mapping_fun(x), axis=1))

        cat_counts_df = pd.DataFrame(cat_mapping_list).T # Put results into a df
        cat_counts_df.columns=['type_counts', 'time_counts', 'dose_counts', 'new_feature_counts']

        X_df = dd.concat([X_df, cat_counts_df], axis=1) # Combine with X_df
        X_df = X_df.compute() # Return Dask to Pandas

        return X_df
Example #16
def create_12_mon_features(joined_df, **kwargs):
    #ddf.to_parquet(joined_df, 'dask_create_12_mon_features.pq')
    testdfs = []
    n_months = 12
    for y in range(1, n_months + 1):
        tmpdf = joined_df[[
            'loan_id', 'timestamp_year', 'timestamp_month', 'delinquency_12',
            'upb_12'
        ]]
        tmpdf['josh_months'] = tmpdf['timestamp_year'] * 12 + tmpdf[
            'timestamp_month']
        tmpdf['josh_mody_n'] = np.floor(
            (tmpdf['josh_months'].astype('float64') - 24000 - y) / 12)
        grp = tmpdf.groupby(['loan_id', 'josh_mody_n']).agg({
            'delinquency_12': 'max',
            'upb_12': 'min'
        }).reset_index()
        tmpdf['delinquency_12'] = (tmpdf['delinquency_12'] > 3).astype('int32')
        tmpdf['delinquency_12'] += (tmpdf['upb_12'] == 0).astype('int32')
        tmpdf['timestamp_year'] = np.floor(
            ((tmpdf['josh_mody_n'] * n_months) + 24000 +
             (y - 1)) / 12).astype('int16')
        tmpdf['timestamp_month'] = np.int8(y)
        tmpdf = tmpdf.drop(['josh_mody_n'], axis=1)
        testdfs.append(tmpdf)
        del (tmpdf)
    del (joined_df)

    return ddf.concat(testdfs)
Example #17
def create_date_dataframe(dss, fields_shp, doa, class_col_name,
                          class_use_dict):

    #Put intermediate data frames in a list
    field_dataframes = []

    #Iterate geo-dataframe
    for idx, shape in fields_shp.iterrows():

        # Indicate column of field ID
        IDLote = shape['IDLote']
        # Indicate column of phenology class value
        class_value = shape[class_col_name]
        # Indicate the geometry column
        polygon = shape['geometry']

        if not np.isnan(class_value):
            df = get_field_dataset(dss, polygon, class_value, doa, IDLote)

            df['IDLote'] = IDLote

            df['time'] = doa

            try:
                df['tt'] = class_use_dict[IDLote]
            except KeyError:
                df['tt'] = 'nd'

            field_dataframes.append(df)

    #Concatenate dask dataframes for all fields
    data = dd.concat(field_dataframes, axis=0, interleave_partitions=True)

    return data
Example #18
def from_parquet(path: Union[str, List[str]], **params) -> dd.DataFrame:
    """Creates a `dd.DataFrame` from one or several parquet files.

    Includes a "path column".

    Parameters
    ----------
    path
        Path to files
    **params
        Extra arguments passed on to `pandas.read_parquet`

    Returns
    -------
    df
        A `dd.DataFrame`
    """
    path_list = _get_file_paths(path)

    dds = []
    for path_name in path_list:
        ddf = dd.read_parquet(path_name, engine="pyarrow", **params)
        ddf[PATH_COLUMN_NAME] = path_name
        dds.append(ddf)

    return dd.concat(dds)
Example #19
def concat(dfs: List[DataframeLike], engine: Engine):

    if engine == Engine.PANDAS:
        return pd.concat(dfs, ignore_index=True, sort=False)

    if engine == Engine.DASK:
        import dask.dataframe
        return dask.dataframe.concat(dfs).reset_index(drop=True)

    if engine == Engine.CUDF:
        import cudf
        try:
            return cudf.concat(dfs, ignore_index=True)
        except TypeError as e:
            logger.warning(
                'Failed to concat, likely due to a column type issue; try converting the offending columns to strings. Column dtypes follow:'
            )
            for df in dfs:
                logger.warning('df types :: %s', df.dtypes)
            raise e

    if engine == Engine.DASK_CUDF:
        import dask_cudf
        return dask_cudf.concat(dfs)

    raise NotImplementedError('Unknown engine')
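A self-contained sketch of the same dispatch pattern, covering only the pandas and dask branches (the `Engine` enum and `concat_frames` name here are stand-ins, not the project's actual types):

from enum import Enum

import pandas as pd
import dask.dataframe as dd

class Engine(Enum):
    # hypothetical stand-in for the project's Engine enum
    PANDAS = "pandas"
    DASK = "dask"

def concat_frames(dfs, engine):
    # drop the old index, mirroring the PANDAS and DASK branches above
    if engine is Engine.PANDAS:
        return pd.concat(dfs, ignore_index=True, sort=False)
    if engine is Engine.DASK:
        return dd.concat(dfs).reset_index(drop=True)
    raise NotImplementedError('Unknown engine')

pdfs = [pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [3, 4]})]
print(concat_frames(pdfs, Engine.PANDAS))
ddfs = [dd.from_pandas(df, npartitions=1, sort=False) for df in pdfs]
print(concat_frames(ddfs, Engine.DASK).compute())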
Example #20
def from_json(path: Union[str, List[str]],
              flatten: bool = False,
              **params) -> dd.DataFrame:
    """Creates a `dd.DataFrame` from one or several json files.

    Includes a "path column".

    Parameters
    ----------
    path
        Path to files
    flatten
        If true, flatten nested data (default false).
    **params
        Extra arguments passed on to `pandas.read_json`

    Returns
    -------
    dataframe
        A `dd.DataFrame`
    """
    def json_engine(*args, **kwargs) -> pd.DataFrame:
        data_frame = pd.read_json(*args, **kwargs)
        return flatten_dataframe(data_frame) if flatten else data_frame

    path_list = _get_file_paths(path)

    dds = []
    for path_name in path_list:
        ddf = dd.read_json(path_name, engine=json_engine, **params)
        ddf[PATH_COLUMN_NAME] = path_name
        dds.append(ddf)

    return dd.concat(dds)
Example #21
def test_set_index_sorts():
    # https://github.com/dask/dask/issues/2288
    vals = np.array([1348550149000000000, 1348550149000000000, 1348558142000000000,
                     1348558142000000000, 1348585928000000000, 1348585928000000000,
                     1348600739000000000, 1348601706000000000, 1348600739000000000,
                     1348601706000000000, 1348614789000000000, 1348614789000000000,
                     1348621037000000000, 1348621038000000000, 1348621040000000000,
                     1348621037000000000, 1348621038000000000, 1348621040000000000,
                     1348637628000000000, 1348638159000000000, 1348638160000000000,
                     1348638159000000000, 1348638160000000000, 1348637628000000000,
                     1348646354000000000, 1348646354000000000, 1348659107000000000,
                     1348657111000000000, 1348659107000000000, 1348657111000000000,
                     1348672876000000000, 1348672876000000000, 1348682787000000000,
                     1348681985000000000, 1348682787000000000, 1348681985000000000,
                     1348728167000000000, 1348728167000000000, 1348730745000000000,
                     1348730745000000000, 1348750198000000000, 1348750198000000000,
                     1348750198000000000, 1348753539000000000, 1348753539000000000,
                     1348753539000000000, 1348754449000000000, 1348754449000000000,
                     1348761333000000000, 1348761554000000000, 1348761610000000000,
                     1348761333000000000, 1348761554000000000, 1348761610000000000,
                     1348782624000000000, 1348782624000000000, 1348782624000000000,
                     1348782624000000000])
    vals = pd.to_datetime(vals, unit='ns')
    breaks = [10, 36, 58]
    dfs = []

    for i in range(len(breaks)):
        lo = sum(breaks[:i])
        hi = sum(breaks[i:i + 1])

        dfs.append(pd.DataFrame({"timestamp": vals[lo:hi]}, index=range(lo, hi)))

    ddf = dd.concat(dfs).clear_divisions()
    assert ddf.set_index("timestamp").index.compute().is_monotonic is True
Example #22
def test_set_index_empty_partition():
    test_vals = [1, 2, 3]

    converters = [
        int,
        float,
        str,
        lambda x: pd.to_datetime(x, unit='ns'),
    ]

    for conv in converters:
        df = pd.DataFrame([{
            'x': conv(i),
            'y': i
        } for i in test_vals],
                          columns=['x', 'y'])
        ddf = dd.concat([
            dd.from_pandas(df, npartitions=1),
            dd.from_pandas(df[df.y > df.y.max()], npartitions=1),
        ])

        assert any(
            ddf.get_partition(p).compute().empty
            for p in range(ddf.npartitions))
        assert assert_eq(ddf.set_index('x'), df.set_index('x'))
Example #24
 def run(self):
     # The main function. Gets the script, creates graph and saves the result
     dsk = dd.read_parquet(env_workaround().return_env("local_location") +
                           "trading_history/*.parquet")
     self.instruments = dsk["instrument"].drop_duplicates().compute()
     self.instruments = list(self.instruments.values)[1:]
     self.requires()
     a = self.extract(self.instruments.pop(), self.granularity)
     for i in self.instruments:
         b = self.extract(i, self.granularity)
         a = dd.concat([a, b], axis=1)
     fig, ax = plt.subplots(figsize=(12, 7))
     sns_plot = sns.heatmap(
         a.corr(),
         xticklabels=a.columns,
         yticklabels=a.columns,
         annot=True,
         linewidths=0.3,
     )
     fig = sns_plot.get_figure()
     plt.title(
         "Correlation of instruments in the portfolio with granularity {}".
         format(self.granularity))
     name = "correlation" + self.granularity + ".png"
     if not os.path.exists(env_workaround().return_env("local_location") +
                           "images/"):
         os.makedirs(env_workaround().return_env("local_location") +
                     "images/")
     fig.savefig(env_workaround().return_env("local_location") + "images/" +
                 name)
     self.fig = fig
Example #25
def simulate_state_lines_losses(eventlookup, freq_mean, states, lines, sims):
    '''assembles state line level events based on the year event
    '''
    logger = logging.getLogger(__name__)
    logger.info('start state lines losses')
    numberofevents = pd.DataFrame(np.random.poisson(freq_mean, sims),
                                  index=np.arange(1, sims + 1),
                                  columns=['events'])
    catevents = simulate_events(numberofevents, eventlookup, sims)

    simsevents = list(range(len(catevents)))
    #combinedResults = xr.DataArray(np.empty((len(states), len(lines), len(catevents), 4)),name="catevents", coords=[states['state'], lines['line'], simsevents, ["sim", "eventseq", "eventid", "rand"]], dims=['state', 'line', 'eventsim', 'data'] )

    logger.info(
        'start to build full array of losses, combining state lines with events'
    )
    sim_events = pd.DataFrame()
    firstloop = True
    for state in states['state']:
        print(f'start {state}')
        for line in lines['line']:
            #combinedResults.loc[state, line] = catevents.copy()
            print(f'start {line}')
            b = catevents.copy()
            b['state'] = state
            b['line'] = line
            if firstloop:
                sim_events = b
                firstloop = False
            else:
                sim_events = dd.concat([sim_events, b])

    #sim_events = pd.concat(a, ignore_index=True, axis=0, )
    logger.info('Completed combined state lines with events')
    return sim_events
Example #26
    def load_dataframe(self, file_resources, npartitions=None):
        """
        Args:
            file_resources:
            npartitions:
        """
        dfs = []
        for filename, content in file_resources.items():
            if ".gtf" in filename:
                df = read_gtf(content,
                              npartitions=npartitions,
                              compression="gzip")
                dfs.append(df)

        if npartitions:
            annotation_df = dd.concat(dfs)
        else:
            annotation_df = pd.concat(dfs)

        if self.remove_version_num:
            annotation_df["gene_id"] = annotation_df["gene_id"].str.replace(
                "[.].*", "", regex=True)
            annotation_df["transcript_id"] = annotation_df[
                "transcript_id"].str.replace("[.].*", "", regex=True)

        return annotation_df
Example #27
def fastasToDF(fastas , verbose=False, ecodDB = False):

	regex = re.compile('[^a-zA-Z0-9]')
	regexAA = re.compile('[^ARDNCEQGHILKMFPSTWYV]')
	DFdict={}
	count = 0
	total = []
	DDF =None

	for fasta in fastas:
		if verbose == True:
			print(fasta)

		fastaIter = SeqIO.parse(fasta, "fasta")
		for seq in fastaIter:
			seqstr = regexAA.sub('', str(seq.seq))
			desc =str(seq.description)
			fastastr = '>'+desc+'\n'+seqstr+'\n'
			if desc not in total:
				# check for duplicates within a folder
				total.append(desc)
				DFdict[desc] = { 'desc': desc.encode(), 'seq':seqstr, 'fasta': fastastr}
			if ecodDB == True:
				labels = ['ECOD uid', 'ECOD domain', 'ECOD hierarchy string', 'ECOD pdb_range']
				for i,ecodb in enumerate(seq.description.split('|')[1:]):
					DFdict[desc][labels[i]] = ecodb
			count +=1
			if count % 400 == 0:
				df = pd.DataFrame.from_dict(DFdict, orient = 'index' )
				if df is not None and len(df)>0:
					if DDF is None:
						DDF = dd.from_pandas(df , chunksize = 200)
					else:
						DDF = dd.concat([ DDF,  dd.from_pandas(df , chunksize = 200) ] , interleave_partitions=True )
				DFdict={}
		else:
			df = pd.DataFrame.from_dict(DFdict, orient = 'index')
			if df is not None and len(df)>0:
				if DDF is None:
					DDF = dd.from_pandas(df , chunksize = 200)
				else:
					DDF = dd.concat([ DDF,  dd.from_pandas(df , chunksize = 200) ] , interleave_partitions=True)
			DFdict={}

	if verbose == True:
		print(df)
	return DDF
Example #28
async def f():
    async with Scheduler(protocol=protocol, interface='ib0',
    dashboard_address=':8789') as s:
        async with Nanny(s.address, protocol=protocol, nthreads=1,
                memory_limit='32GB',
                env={'CUDA_VISIBLE_DEVICES': '2'},
                ) as w:
            async with Nanny(s.address, protocol=protocol,memory_limit='32gb',
                    env={'CUDA_VISIBLE_DEVICES': '3'},
                    nthreads=1) as w2:
                async with Client(s.address, asynchronous=True) as c:
                    with log_errors(pdb=True):
                        # Create a simple random array
                        #n_rows = 50000000
                        #n_keys = 5000000

                        # working!!!
                        n_rows = 5000000
                        n_keys = 500000

                        #n_rows = 5000000
                        #n_keys = 2500000


                        chunks = n_rows // 100
                        left = dd.concat([
                            da.random.random(n_rows, chunks=chunks).to_dask_dataframe(columns='x'),
                            da.random.randint(0, n_keys, size=n_rows,
                                chunks=chunks).to_dask_dataframe(columns='id'),], axis=1).persist()

                        right = dd.concat([
                            da.random.random(n_rows, chunks=chunks).to_dask_dataframe(columns='y'),
                            da.random.randint(0, n_keys, size=n_rows,
                                chunks=chunks).to_dask_dataframe(columns='id'),], axis=1).persist()


                        gright = right.map_partitions(cudf.from_pandas)
                        gleft = left.map_partitions(cudf.from_pandas)
                        #print(format_bytes(await c.compute(left.size) * 8 * 2))
                        #print(format_bytes(await c.compute(right.size) * 8 * 2))

                        res = gleft.merge(gright, on=['id'])
                        res = await res.persist()
                        print("COMPUTING HEAD()")
                        out = await c.compute(res.head(compute=False))
                        #breakpoint()
                        print(out)
Example #29
def h5_append_dummy_row(
        df: Union[pd.DataFrame, dd.DataFrame],
        freq=None,
        tim: Optional[Sequence[Any]] = None
) -> Union[pd.DataFrame, dd.DataFrame]:
    """
    Add a row of NaN values whose index falls between the end of this data and the start of the next data
    :param df: dataframe
    :param freq: frequency used to compute the new index value. If falsy, the spacing is derived from ``tim``
    :param tim: sequence whose last elements hold the times of the two last rows
    :return: appended dataframe
    """
    if tim is not None:
        try:
            dindex = pd.Timedelta(
                seconds=0.5 / freq) if freq else np.abs(tim[-1] - tim[-2]) / 2
        except IndexError:  # only one element => such cases are rare, so fall back to 1 s
            dindex = pd.Timedelta(seconds=1)
        ind_new = [tim[-1] + dindex]
    else:
        df_index, itm = multiindex_timeindex(df.index)
        try:
            dindex = pd.Timedelta(
                seconds=0.5 /
                freq) if freq else np.abs(df_index[-1] - df_index[-2]) / 2
        except (IndexError, NotImplementedError):
            # only one element (rare => fall back to 1 s) or not implemented in Dask
            dindex = pd.Timedelta(seconds=1)
        ind_new = multiindex_replace(df.index[-1:], df_index[-1:] + dindex,
                                     itm)

    dict_dummy = {}
    tip0 = None
    same_types = True  # try to avoid falling back to object dtype (which pandas.pytables handles poorly) if possible
    for name, field in df.dtypes.iteritems():
        typ = field.type
        dict_dummy[name] = typ(0) if np.issubdtype(
            typ,
            np.integer) else np.NaN if np.issubdtype(typ, np.floating) else ''

        if same_types:
            if typ != tip0:
                if tip0 is None:
                    tip0 = typ
                else:
                    same_types = False

    df_dummy = pd.DataFrame(
        dict_dummy,
        columns=df.columns.values,
        index=ind_new,
        dtype=tip0 if same_types else None).rename_axis('Time')

    if isinstance(df, dd.DataFrame):
        return dd.concat(
            [df, df_dummy], axis=0,
            interleave_partitions=True)  # workaround: dask cannot always append directly
    else:
        return df.append(df_dummy)
Example #30
    def load_dataframe(self, file_resources, npartitions=None):
        """
        Args:
            file_resources:
            npartitions:
        """
        go_terms = pd.read_table(
            file_resources["rnacentral_rfam_annotations.tsv"],
            low_memory=True,
            header=None,
            names=["RNAcentral id", "GO terms", "Rfams"])
        go_terms["RNAcentral id"] = go_terms["RNAcentral id"].str.split(
            "_", expand=True, n=2)[0]

        gene_ids = []
        for file in file_resources:
            if "database_mappings" in file:
                if npartitions:
                    id_mapping = dd.read_table(file_resources[file],
                                               header=None,
                                               names=[
                                                   "RNAcentral id", "database",
                                                   "external id", "species",
                                                   "RNA type", "gene symbol"
                                               ])
                else:
                    id_mapping = pd.read_table(file_resources[file],
                                               low_memory=True,
                                               header=None,
                                               names=[
                                                   "RNAcentral id", "database",
                                                   "external id", "species",
                                                   "RNA type", "gene symbol"
                                               ])

                gene_ids.append(id_mapping)

        if npartitions:
            gene_ids = dd.concat(gene_ids, join="inner")
        else:
            gene_ids = pd.concat(gene_ids, join="inner")

        gene_ids["species"] = gene_ids["species"].astype("O")
        if self.species is not None:
            gene_ids = gene_ids[gene_ids["species"] == self.species]

        lnc_go_terms = go_terms[go_terms["RNAcentral id"].isin(
            gene_ids["RNAcentral id"])].groupby("RNAcentral id")[
                "GO terms"].apply(lambda x: "|".join(x.unique()))
        lnc_rfams = go_terms[go_terms["RNAcentral id"].isin(
            gene_ids["RNAcentral id"])].groupby(
                "RNAcentral id")["Rfams"].apply(lambda x: "|".join(x.unique()))

        gene_ids["GO terms"] = gene_ids["RNAcentral id"].map(lnc_go_terms)
        gene_ids["Rfams"] = gene_ids["RNAcentral id"].map(lnc_rfams)
        gene_ids = gene_ids[gene_ids["GO terms"].notnull()
                            | gene_ids["Rfams"].notnull()]

        return gene_ids
Example #31
    def channels(self):
        def get_biometa(array):
            df = dd.from_dask_array(array.channels.data)
            df["Array"] = array.name
            return df

        datasets = self.bag.map(get_biometa)
        return datasets.fold(lambda x, y: dd.concat([x, y]))
Example #32
File: test_hdf.py Project: m-rossi/dask
def test_to_hdf_modes_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="a")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(dd.concat([df, df]), out)

    # overwriting a file with a single partition
    a = dd.from_pandas(df, 1)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="w")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="a")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(dd.concat([df, df]), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="w")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data1")
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="a", append=False)
        out = dd.read_hdf(fn, "/data*")
        assert_eq(dd.concat([df, df]), out)
Example #33
def par_mc_samples(df,
                   n,
                   reps,
                   replace=False,
                   random_state=8675309,
                   chunksize=100):
    import dask
    import dask.dataframe as dd
    from copy import copy
    import pandas as pd
    cols = copy(df.columns)
    cols = cols.insert(0, 'rep')
    mc_samples_to_return = dd.from_pandas(pd.DataFrame().reindex(columns=cols),
                                          chunksize=chunksize)
    if not isinstance(df, dd.DataFrame):
        dd_df = dd.from_pandas(df, chunksize=chunksize)
    else:
        dd_df = df
    for i in range(0, reps):
        frac = n / dd_df.shape[0].compute()
        selected_samples_for_rep = dd_df.sample(
            frac=frac, replace=replace, random_state=random_state).compute()
        rep_number = []
        for j in range(0, n):
            rep_number.append([copy(i)])
        rep_number = dd.from_pandas(pd.DataFrame(rep_number),
                                    chunksize=chunksize)
        rep_number.columns = ['rep_number']
        selected_samples_for_rep.reset_index(drop=True, inplace=True)
        selected_samples_for_rep = dd.from_pandas(selected_samples_for_rep,
                                                  chunksize=chunksize)
        selected_samples_for_rep = dd.concat(
            [copy(rep_number),
             copy(selected_samples_for_rep)],
            axis=1,
            ignore_index=True,
            sort=False)
        selected_samples_for_rep.columns = cols
        mc_samples_to_return = dd.concat(
            [copy(mc_samples_to_return),
             copy(selected_samples_for_rep)],
            axis=0,
            sort=False)
        random_state += 1
    mc_samples_to_return = mc_samples_to_return.groupby(by='rep')
    return mc_samples_to_return
Example #34
def make_data(n_keys, n_rows_l, n_rows_r):
    left = dd.concat([
        da.random.random(n_rows_l).to_dask_dataframe(columns='x'),
        da.random.randint(0, n_keys,
                          size=n_rows_l).to_dask_dataframe(columns='id'),
    ],
                     axis=1)

    right = dd.concat([
        da.random.random(n_rows_r).to_dask_dataframe(columns='y'),
        da.random.randint(0, n_keys,
                          size=n_rows_r).to_dask_dataframe(columns='id'),
    ],
                      axis=1)
    gleft = left.map_partitions(cudf.from_pandas)
    gright = right.map_partitions(cudf.from_pandas)
    return gleft, gright
Example #35
 def join_instruments(self, inputs):
     print('Is dask df', type(inputs[0].instrument_trades) is dd.DataFrame)
     print('Is pandas df',
           type(inputs[0].instrument_trades) is pd.DataFrame)
     self.results = dd.concat([input.instrument_trades
                               for input in inputs]).reset_index(drop=True)
     print('Len df', len(self.results))
     self.next(self.end)
Example #36
 def calc(self, df_input: dd.DataFrame, df_output: dd.DataFrame,
          feature_set_list: List) -> dd.DataFrame:
     for feature_set in feature_set_list:
         df_output = dd.concat(
             [df_output,
              self.calc_feature_set(df_input, feature_set)],
             axis=1)
     return df_output
Example #37
File: dask.py Project: zhiyuli/holoviews
 def concat(cls, datasets, dimensions, vdims):
     dataframes = []
     for key, ds in datasets:
         data = ds.data.copy()
         for d, k in zip(dimensions, key):
             data[d.name] = k
         dataframes.append(data)
     return dd.concat(dataframes)
Example #38
def test_concat2():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    a = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None])
    dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}),
           ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})}
    b = dd.DataFrame(dsk, 'y', ['a', 'b'], [None, None])

    c = dd.concat([a, b])

    assert c.npartitions == a.npartitions + b.npartitions

    assert eq(pd.concat([a.compute(), b.compute()]), c)

    assert dd.concat([a, b]).dask == dd.concat([a, b]).dask
Example #39
 def concat(cls, datasets, dimensions, vdims):
     import dask.dataframe as dd
     dataframes = []
     for key, ds in datasets:
         data = ds.data.copy()
         for d, k in zip(dimensions, key):
             data[d.name] = k
         dataframes.append(data)
     return dd.concat(dataframes)
Example #40
File: merge.py Project: semio/ddf_utils
def _merge_two(left: Dict[str, Union[pd.DataFrame, dd.DataFrame]],
               right: Dict[str, Union[pd.DataFrame, dd.DataFrame]],
               index_col: Union[List, str],
               dtype: str, deep=False) -> Dict[str, pd.DataFrame]:
    """merge 2 ingredient data."""
    if len(left) == 0:
        return right

    res_data = {}

    # for datapoints we use dask to help performance.
    if dtype == 'datapoints':
        res_data = dict([(k, v) for k, v in left.items()])
        if deep:
            for k, df in right.items():
                if k in left.keys():
                    columns = left[k].columns.values
                    # res_data[k] = left[k].append(df[columns], interleave_partitions=True)
                    res_data[k] = dd.concat([left[k], df[columns]], axis=0, interleave_partitions=True)
                    res_data[k] = res_data[k].drop_duplicates(subset=index_col, keep='last')
                    # res_data[k] = res_data[k].sort_values(by=index_col)
                else:
                    res_data[k] = df
        else:
            for k, df in right.items():
                res_data[k] = df

    # for concepts/entities, we don't need to use dask.
    elif dtype == 'concepts':

        left_df = pd.concat([x for x in left.values()], sort=False)
        right_df = pd.concat([x for x in right.values()], sort=False)

        if deep:
            merged = left_df.append(right_df, sort=False)
            res = merged.groupby(by=index_col).agg(__get_last_item)
            res_data = {'concept': res.reset_index()}
        else:
            res_data = {'concept':
                        right_df.drop_duplicates(subset='concept',
                                                 keep='last')}

    else:  # entities
        if deep:
            for k, df in right.items():
                if k in left.keys():
                    left[k] = left[k].append(df, ignore_index=True, sort=False)
                    left[k] = left[k].groupby(index_col).agg(__get_last_item).reset_index()
                else:
                    left[k] = df
        else:
            for k, df in right.items():
                left[k] = df
        res_data = left

    return res_data
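A small usage sketch of the deep 'datapoints' branch, assuming the `_merge_two` above is in scope (the ingredient dicts and the geo/year/pop columns are made up): overlapping keys are concatenated with `dd.concat` and deduplicated so the right-hand values win.

import pandas as pd
import dask.dataframe as dd

left = {'pop': dd.from_pandas(
    pd.DataFrame({'geo': ['usa', 'chn'], 'year': [2000, 2000], 'pop': [1.0, 2.0]}),
    npartitions=1)}
right = {'pop': dd.from_pandas(
    pd.DataFrame({'geo': ['usa'], 'year': [2000], 'pop': [1.5]}),
    npartitions=1)}

merged = _merge_two(left, right, index_col=['geo', 'year'],
                    dtype='datapoints', deep=True)
print(merged['pop'].compute())  # the right-hand value (1.5) should win for ('usa', 2000)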
Example #41
 def check_and_return(ddfs, dfs, join):
     sol = concat(dfs, join=join)
     res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
     assert_eq(res, sol)
     if known:
         parts = compute_as_if_collection(dd.DataFrame, res.dask,
                                          res.__dask_keys__())
         for p in [i.iloc[:0] for i in parts]:
             res._meta == p  # will error if schemas don't align
     assert not cat_index or has_known_categories(res.index) == known
     return res
Example #42
def test_concat2():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    meta = make_meta({'a': 'i8', 'b': 'i8'})
    a = dd.DataFrame(dsk, 'x', meta, [None, None])
    dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}),
           ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})}
    b = dd.DataFrame(dsk, 'y', meta, [None, None])

    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10]})}
    meta = make_meta({'b': 'i8', 'c': 'i8'})
    c = dd.DataFrame(dsk, 'y', meta, [None, None])

    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60],
                                   'd': [70, 80, 90]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10],
                                   'd': [90, 80, 70]},
                                  index=[3, 4, 5])}
    meta = make_meta({'b': 'i8', 'c': 'i8', 'd': 'i8'},
                     index=pd.Index([], 'i8'))
    d = dd.DataFrame(dsk, 'y', meta, [0, 3, 5])

    cases = [[a, b], [a, c], [a, d]]
    assert dd.concat([a]) is a
    for case in cases:
        result = dd.concat(case)
        pdcase = [_c.compute() for _c in case]

        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert_eq(pd.concat(pdcase), result)
        assert set(result.dask) == set(dd.concat(case).dask)

        result = dd.concat(case, join='inner')
        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert_eq(pd.concat(pdcase, join='inner'), result)
        assert set(result.dask) == set(dd.concat(case, join='inner').dask)
Example #43
def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list('ABCDE'), index=list('abcdefghij'))
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list('ABCDE'), index=list('fghijklmnopqr'))
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list('CDEXYZ'), index=list('fghijklmnopqr'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = ('All inputs have known divisions which cannot be '
           'concatenated in order. Specify '
           'interleave_partitions=True to ignore order')

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3], [ddf2, ddf1],
             [ddf2, ddf3], [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with pytest.raises(ValueError) as err:
            dd.concat(case)
        assert msg in str(err.value)

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

    msg = "'join' must be 'inner' or 'outer'"
    with pytest.raises(ValueError) as err:
        dd.concat([ddf1, ddf1], join='invalid', interleave_partitions=True)
    assert msg in str(err.value)
Example #44
def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5), columns=list("ABCDE"), index=list("abcdefghij"))
    pdf2 = pd.DataFrame(np.random.randn(13, 5), columns=list("ABCDE"), index=list("fghijklmnopqr"))
    pdf3 = pd.DataFrame(np.random.randn(13, 6), columns=list("CDEXYZ"), index=list("fghijklmnopqr"))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = (
        "All inputs have known divisions which cannnot be "
        "concatenated in order. Specify "
        "interleave_partitions=True to ignore order"
    )

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3], [ddf2, ddf1], [ddf2, ddf3], [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case)

        assert eq(dd.concat(case, interleave_partitions=True), pd.concat(pdcase))
        assert eq(dd.concat(case, join="inner", interleave_partitions=True), pd.concat(pdcase, join="inner"))

    msg = "'join' must be 'inner' or 'outer'"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.concat([ddf1, ddf1], join="invalid", interleave_partitions=True)
Example #45
def test_concat2():
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}),
    }
    a = dd.DataFrame(dsk, "x", ["a", "b"], [None, None])
    dsk = {
        ("y", 0): pd.DataFrame({"a": [10, 20, 30], "b": [40, 50, 60]}),
        ("y", 1): pd.DataFrame({"a": [40, 50, 60], "b": [30, 20, 10]}),
        ("y", 2): pd.DataFrame({"a": [70, 80, 90], "b": [0, 0, 0]}),
    }
    b = dd.DataFrame(dsk, "y", ["a", "b"], [None, None])

    dsk = {
        ("y", 0): pd.DataFrame({"b": [10, 20, 30], "c": [40, 50, 60]}),
        ("y", 1): pd.DataFrame({"b": [40, 50, 60], "c": [30, 20, 10]}),
    }
    c = dd.DataFrame(dsk, "y", ["b", "c"], [None, None])

    dsk = {
        ("y", 0): pd.DataFrame({"b": [10, 20, 30], "c": [40, 50, 60], "d": [70, 80, 90]}),
        ("y", 1): pd.DataFrame({"b": [40, 50, 60], "c": [30, 20, 10], "d": [90, 80, 70]}, index=[3, 4, 5]),
    }
    d = dd.DataFrame(dsk, "y", ["b", "c", "d"], [0, 3, 5])

    cases = [[a, b], [a, c], [a, d]]
    assert dd.concat([a]) is a
    for case in cases:
        result = dd.concat(case)
        pdcase = [c.compute() for c in case]

        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None,) * (result.npartitions + 1)
        assert eq(pd.concat(pdcase), result)
        assert result.dask == dd.concat(case).dask

        result = dd.concat(case, join="inner")
        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None,) * (result.npartitions + 1)
        assert eq(pd.concat(pdcase, join="inner"), result)
        assert result.dask == dd.concat(case, join="inner").dask

        msg = "Unable to concatenate DataFrame with unknown division " "specifying axis=1"
        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case, axis=1)
Example #46
def test_set_index_empty_partition():
    test_vals = [1, 2, 3]

    converters = [
        int,
        float,
        str,
        lambda x: pd.to_datetime(x, unit='ns'),
    ]

    for conv in converters:
        df = pd.DataFrame([{'x': conv(i), 'y': i} for i in test_vals], columns=['x', 'y'])
        ddf = dd.concat([
            dd.from_pandas(df, npartitions=1),
            dd.from_pandas(df[df.y > df.y.max()], npartitions=1),
        ])

        assert any(ddf.get_partition(p).compute().empty for p in range(ddf.npartitions))
        assert assert_eq(ddf.set_index('x'), df.set_index('x'))
Example #47
def test_concat_datetimeindex():
    # https://github.com/dask/dask/issues/2932
    b2 = pd.DataFrame({'x': ['a']},
                      index=pd.DatetimeIndex(['2015-03-24 00:00:16'],
                                             dtype='datetime64[ns]'))
    b3 = pd.DataFrame({'x': ['c']},
                      index=pd.DatetimeIndex(['2015-03-29 00:00:44'],
                                             dtype='datetime64[ns]'))

    b2['x'] = b2.x.astype('category').cat.set_categories(['a', 'c'])
    b3['x'] = b3.x.astype('category').cat.set_categories(['a', 'c'])

    db2 = dd.from_pandas(b2, 1)
    db3 = dd.from_pandas(b3, 1)

    result = concat([b2.iloc[:0], b3.iloc[:0]])
    assert result.index.dtype == '<M8[ns]'

    result = dd.concat([db2, db3])
    expected = pd.concat([b2, b3])
    assert_eq(result, expected)
Example #48
def test_orc_multiple(orc_files):
    d = read_orc(orc_files[0])
    d2 = read_orc(orc_files)
    assert_eq(d2[columns], dd.concat([d, d])[columns], check_index=False)
    d2 = read_orc(os.path.dirname(orc_files[0]) + '/*.orc')
    assert_eq(d2[columns], dd.concat([d, d])[columns], check_index=False)
Example #49
File: dask.py Project: mforbes/holoviews
 def concat(cls, columns_objs):
     cast_objs = cls.cast(columns_objs)
     return dd.concat([col.data for col in cast_objs])
Example #50
File: core.py Project: elaeon/ML
 def to_dd(self) -> dd.DataFrame:
     dfs = []
     for group in self.groups:
         df = dd.from_dask_array(self.conn[group], columns=[group])
         dfs.append(df)
     return dd.concat(dfs, axis=1)
Example #51
def test_concat5():
    pdf1 = pd.DataFrame(np.random.randn(7, 5), columns=list("ABCDE"), index=list("abcdefg"))
    pdf2 = pd.DataFrame(np.random.randn(7, 6), columns=list("FGHIJK"), index=list("abcdefg"))
    pdf3 = pd.DataFrame(np.random.randn(7, 6), columns=list("FGHIJK"), index=list("cdefghi"))
    pdf4 = pd.DataFrame(np.random.randn(7, 5), columns=list("FGHAB"), index=list("cdefghi"))
    pdf5 = pd.DataFrame(np.random.randn(7, 5), columns=list("FGHAB"), index=list("fklmnop"))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)
    ddf4 = dd.from_pandas(pdf4, 2)
    ddf5 = dd.from_pandas(pdf5, 3)

    cases = [
        [ddf1, ddf2],
        [ddf1, ddf3],
        [ddf1, ddf4],
        [ddf1, ddf5],
        [ddf3, ddf4],
        [ddf3, ddf5],
        [ddf5, ddf1, ddf4],
        [ddf5, ddf3],
        [ddf1.A, ddf4.A],
        [ddf2.F, ddf3.F],
        [ddf4.A, ddf5.A],
        [ddf1.A, ddf4.F],
        [ddf2.F, ddf3.H],
        [ddf4.A, ddf5.B],
        [ddf1, ddf4.A],
        [ddf3.F, ddf2],
        [ddf5, ddf1.A, ddf2],
    ]

    for case in cases:
        pdcase = [c.compute() for c in case]

        assert eq(dd.concat(case, interleave_partitions=True), pd.concat(pdcase))

        assert eq(dd.concat(case, join="inner", interleave_partitions=True), pd.concat(pdcase, join="inner"))

        assert eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))

        assert eq(dd.concat(case, axis=1, join="inner"), pd.concat(pdcase, axis=1, join="inner"))

    # Dask + pandas
    cases = [
        [ddf1, pdf2],
        [ddf1, pdf3],
        [pdf1, ddf4],
        [pdf1.A, ddf4.A],
        [ddf2.F, pdf3.F],
        [ddf1, pdf4.A],
        [ddf3.F, pdf2],
        [ddf2, pdf1, ddf3.F],
    ]

    for case in cases:
        pdcase = [c.compute() if isinstance(c, _Frame) else c for c in case]

        assert eq(dd.concat(case, interleave_partitions=True), pd.concat(pdcase))

        assert eq(dd.concat(case, join="inner", interleave_partitions=True), pd.concat(pdcase, join="inner"))

        assert eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))

        assert eq(dd.concat(case, axis=1, join="inner"), pd.concat(pdcase, axis=1, join="inner"))