Example #1
File: test_io.py Project: floriango/dask
def test_from_delayed():
    df = pd.DataFrame(data=np.random.normal(size=(10, 4)), columns=list('abcd'))
    parts = [df.iloc[:1], df.iloc[1:3], df.iloc[3:6], df.iloc[6:10]]
    dfs = [delayed(parts.__getitem__)(i) for i in range(4)]
    meta = dfs[0].compute()

    my_len = lambda x: pd.Series([len(x)])

    for divisions in [None, [0, 1, 3, 6, 10]]:
        ddf = dd.from_delayed(dfs, meta=meta, divisions=divisions)
        assert_eq(ddf, df)
        assert list(ddf.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

        s = dd.from_delayed([d.a for d in dfs], meta=meta.a,
                            divisions=divisions)
        assert_eq(s, df.a)
        assert list(s.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

    meta2 = [(c, 'f8') for c in df.columns]
    assert_eq(dd.from_delayed(dfs, meta=meta2), df)
    assert_eq(dd.from_delayed([d.a for d in dfs], meta=('a', 'f8')), df.a)

    with pytest.raises(ValueError):
        dd.from_delayed(dfs, meta=meta, divisions=[0, 1, 3, 6])

    with pytest.raises(ValueError) as e:
        dd.from_delayed(dfs, meta=meta.a).compute()
    assert str(e.value).startswith('Metadata mismatch found in `from_delayed`')
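
A minimal, self-contained sketch of the same from_delayed pattern (hypothetical data; only pandas, NumPy, and dask are assumed):

import numpy as np
import pandas as pd
from dask import delayed
import dask.dataframe as dd

pdf = pd.DataFrame({"a": np.arange(6), "b": np.arange(6) * 0.5})
chunks = [pdf.iloc[:3], pdf.iloc[3:]]

# each partition is a Delayed object that evaluates to a pandas DataFrame
parts = [delayed(chunk) for chunk in chunks]

# `meta` describes the output schema; an empty frame with the right dtypes works
ddf = dd.from_delayed(parts, meta=pdf.iloc[:0])

assert ddf.npartitions == 2
assert ddf.compute().equals(pdf)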
Example #2
    def to_dask(self, *args, **kwds):
        from dask import dataframe as dd

        self._load_metadata()
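        # note: _get_partition(i, ...) is called eagerly here and its result is then
        # wrapped in a Delayed; dask.delayed(self._get_partition)(i, ...) would defer
        # the actual partition load until compute time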
        return dd.from_delayed([
            dask.delayed(self._get_partition(i, *args, **kwds))
            for i in range(self.npartitions)
        ])
Example #3
    def to_dask(self):
        """Create lazy dask dataframe object"""
        import dask.dataframe as dd
        from dask import delayed
        self.discover()
        dpart = delayed(read_file_uavro)
        return dd.from_delayed([dpart(f, self._head) for f in self._files],
                               meta=self.dtype)
Example #4
File: test_io.py Project: floriango/dask
def test_from_delayed_sorted():
    a = pd.DataFrame({'x': [1, 2]}, index=[1, 10])
    b = pd.DataFrame({'x': [4, 1]}, index=[100, 200])

    A = dd.from_delayed([delayed(a), delayed(b)], divisions='sorted')
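    # divisions='sorted' asks from_delayed to derive the divisions from the
    # partitions' (already sorted) indexes instead of requiring them up front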
    assert A.known_divisions

    assert A.divisions == (1, 100, 200)
Example #5
    def aggregrate_files(self, urls, n_procs=1):
        import dask
        import dask.dataframe as dd

        dfs = [dask.delayed(self.read_csv)(f) for f in urls]
        dff = dd.from_delayed(dfs)
        df = dff.compute(num_workers=n_procs)
        return df
Example #6
def test_basic(loop, delayed):  # noqa: F811
    with dask_cuda.LocalCUDACluster(loop=loop) as cluster:
        with Client(cluster):
            pdf = dask.datasets.timeseries(dtypes={"x": int}).reset_index()
            gdf = pdf.map_partitions(cudf.DataFrame.from_pandas)
            if delayed:
                gdf = dd.from_delayed(gdf.to_delayed())
            dd.assert_eq(pdf.head(), gdf.head())
Example #7
    def add_data(self, dates, daily=False, sub_hourly=False, download=False, latlonbox=None):
        """Short summary.

        Parameters
        ----------
        dates : type
            Description of parameter `dates`.
        daily : type
            Description of parameter `daily` (the default is False).
        sub_hourly : type
            Description of parameter `sub_hourly` (the default is False).
        download : type
            Description of parameter `download` (the default is False).
        latlonbox : type
            Description of parameter `latlonbox` (the default is None).

        Returns
        -------
        type
            Description of returned object.

        """
        import dask
        import dask.dataframe as dd

        if self.monitor_df is None:
            self.get_monitor_df()
        if latlonbox is not None:  # get them all[latmin,lonmin,latmax,lonmax]
            mdf = self.monitor_df
            con = (
                (mdf.LATITUDE >= latlonbox[0])
                & (mdf.LATITUDE <= latlonbox[2])
                & (mdf.LONGITUDE >= latlonbox[1])
                & (mdf.LONGITUDE <= latlonbox[3])
            )
            monitors = mdf.loc[con].copy()
        else:
            monitors = self.monitor_df.copy()
        urls, fnames = self.build_urls(monitors, dates, daily=daily, sub_hourly=sub_hourly)
        if download:
            for url, fname in zip(urls, fnames):
                self.retrieve(url, fname)
            dfs = [dask.delayed(self.load_file)(i) for i in fnames]
        else:
            dfs = [dask.delayed(self.load_file)(i) for i in urls]
        dff = dd.from_delayed(dfs)
        self.df = dff.compute()
        self.df = pd.merge(self.df, monitors, how="left", on=["WBANNO", "LATITUDE", "LONGITUDE"])
        if ~self.df.columns.isin(["time"]).max():
            self.df["time"] = self.df.time_local + pd.to_timedelta(self.df.GMT_OFFSET, unit="H")
        id_vars = self.monitor_df.columns.append(pd.Index(["time", "time_local"]))
        keys = self.df.columns[self.df.columns.isin(id_vars)]
        self.df = pd.melt(
            self.df, id_vars=keys, var_name="variable", value_name="obs"
        )  # this stacks columns to be inline with MONET
        self.df.rename(columns={"WBANNO": "siteid"}, inplace=True)
        self.change_units()
        self.df.columns = [i.lower() for i in self.df.columns]
Example #8
def join_frames(left, right, on, how, lsuffix, rsuffix):
    """Join two frames on 1 or more columns.

    Parameters
    ----------
    left, right : dask_cudf.DataFrame
    on : tuple[str]
        key column(s)
    how : str
        Join method
    lsuffix, rsuffix : str
    """
    empty_frame = left._meta.merge(right._meta,
                                   on=on,
                                   how=how,
                                   suffixes=(lsuffix, rsuffix))

    def merge(left, right):
        return left.merge(right, on=on, how=how, suffixes=(lsuffix, rsuffix))

    left_val_names = [k for k in left.columns if k not in on]
    right_val_names = [k for k in right.columns if k not in on]
    same_names = set(left_val_names) & set(right_val_names)
    if same_names and not (lsuffix or rsuffix):
        raise ValueError("there are overlapping columns but "
                         "lsuffix and rsuffix are not defined")

    dtypes = {k: left[k].dtype for k in left.columns}
    dtypes.update({k: right[k].dtype for k in right.columns})

    left_parts = left.to_delayed()
    right_parts = right.to_delayed()

    # Add column w/ hash(v) % nparts
    nparts = max(len(left_parts), len(right_parts))

    left_hashed = group_frame(left_parts, nparts, on)
    right_hashed = group_frame(right_parts, nparts, on)

    # Fanout each partition into nparts subgroups
    left_subgroups = fanout_subgroups(left_hashed, nparts)
    right_subgroups = fanout_subgroups(right_hashed, nparts)

    assert len(left_subgroups) == len(right_subgroups)

    # Concat
    left_cats = [delayed(cudf.concat, pure=True)(it) for it in left_subgroups]
    right_cats = [
        delayed(cudf.concat, pure=True)(it) for it in right_subgroups
    ]

    # Combine
    merged = [
        delayed(merge, pure=True)(left_cats[i], right_cats[i])
        for i in range(nparts)
    ]

    return dd.from_delayed(merged, prefix="join_result", meta=empty_frame)
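
The group_frame and fanout_subgroups helpers are not shown above; the idea they implement is ordinary hash partitioning, roughly as in this pandas-only sketch (illustrative names, not the dask_cudf implementation):

import pandas as pd

def hash_partition(df, key, nparts):
    # route rows by hash(key) % nparts so equal keys always land in the same bucket
    buckets = pd.util.hash_pandas_object(df[key], index=False) % nparts
    return [df[buckets == i] for i in range(nparts)]

left = pd.DataFrame({"k": [1, 2, 3, 4], "x": list("abcd")})
right = pd.DataFrame({"k": [2, 3, 5], "y": [10, 20, 30]})

nparts = 2
merged = [
    lp.merge(rp, on="k", how="inner")
    for lp, rp in zip(hash_partition(left, "k", nparts),
                      hash_partition(right, "k", nparts))
]
print(pd.concat(merged, ignore_index=True))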
Example #9
    def add_data(self,
                 dates,
                 param=None,
                 daily=False,
                 network=None,
                 download=False,
                 local=False,
                 n_procs=1,
                 meta=False):
        """Short summary.

        Parameters
        ----------
        dates : list of datetime objects
            Description of parameter `dates`.
        param : list of strings
            Description of parameter `param` (the default is None).
        daily : boolean
            Description of parameter `daily` (the default is False).
        network : type
            Description of parameter `network` (the default is None).
        download : type
            Description of parameter `download` (the default is False).

        Returns
        -------
        pandas DataFrame
            Description of returned object.

        """
        import dask
        import dask.dataframe as dd
        if param is None:
            params = [
                'SPEC', 'PM10', 'PM2.5', 'PM2.5_FRM', 'CO', 'OZONE', 'SO2',
                'VOC', 'NONOXNOY', 'WIND', 'TEMP', 'RHDP'
            ]
        else:
            params = param
        urls, fnames = self.build_urls(params, dates, daily=daily)
        if download:
            for url, fname in zip(urls, fnames):
                self.retrieve(url, fname)
            dfs = [
                dask.delayed(self.load_aqs_file)(i, network) for i in fnames
            ]
        elif local:
            dfs = [
                dask.delayed(self.load_aqs_file)(i, network) for i in fnames
            ]
        else:
            dfs = [dask.delayed(self.load_aqs_file)(i, network) for i in urls]
        dff = dd.from_delayed(dfs)
        dfff = dff.compute(num_workers=n_procs)
        if meta:
            return (self.add_data2(dfff, daily, network))
        else:
            return dfff
Example #10
    def load_all_seasons(self, columns=None, npartitions=8):
        """ Loads all of the seasons in the statcast_data folder
        
        Parameters
        ----------
        columns (list, default=None) : columns to read. If None, all columns are read

        npartitions(int, default=8) : the number of pandas DataFrames to split the Dask DataFrame into

        Returns
        -------
        DataFrame : the data from the statcast_data folder as one DataFrame
        """
        datasets = [dask.delayed(feather.read_dataframe)(f"statcast_data/{year}", columns=columns) for year in range(2017, 2021)]
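        # meta (column names/dtypes) is derived from the first season only and
        # then reused for the full set of delayed frames below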
        meta = self.get_data_types(df=dd.from_delayed(datasets[0]))
        df = dd.from_delayed(datasets, meta=meta).repartition(npartitions=npartitions)
        df = df.set_index("index")
        return df
Example #11
File: ish.py Project: barronh/MONET
    def read_sites(self,
                   box=None,
                   country=None,
                   state=None,
                   site=None,
                   resample=True,
                   window='H'):
        import urllib.request, urllib.error, urllib.parse
        from numpy import NaN
        i = self.dates[0]
        year = i.strftime('%Y')
        url = 'https://www1.ncdc.noaa.gov/pub/data/noaa/' + year + '/'
        if self.history is None:
            self.read_ish_history()
        self.history[
            'fname'] = url + self.history.USAF + '-' + self.history.WBAN + '-' + year + '.gz'
        dfloc = self.history.copy()
        if box is not None:
            print('Retrieving Sites in: {}'.format(box))
            dfloc = self.subset_sites(latmin=box[0],
                                      lonmin=box[1],
                                      latmax=box[2],
                                      lonmax=box[3])
        elif country is not None:
            print('Retrieving Country: ' + country)
            dfloc = self.history.loc[self.history.CTRY == country, :]
        elif state is not None:
            print('Retrieving State: ' + state)
            dfloc = self.history.loc[self.history.STATE == state, :]
        elif site is not None:
            print('Retrieving Site: ' + site)
            dfloc = self.history.loc[self.history.station_id == site, :]
        print(dfloc.fname.unique())
        objs = self.get_url_file_objs(dfloc.fname.unique())
        # return objs,size,self.history.fname
        # dfs = []
        # for f in objs:
        #     try:
        #         dfs.append(self.read_data_frame(f))
        #     except:
        #         pass

        print('  Reading ISH into pandas DataFrame...')
        dfs = [dask.delayed(self.read_data_frame)(f) for f in objs]
        dff = dd.from_delayed(dfs)
        self.df = dff.compute()
        self.df.loc[self.df.vsb == 99999, 'vsb'] = NaN
        if resample:
            print('  Resampling to every ' + window)
            self.df.index = self.df.datetime
            self.df = self.df.groupby('station_id').resample(
                window).mean().reset_index()
        self.df = self.df.merge(self.history[[
            'station_id', 'latitude', 'longitude', 'STATION NAME'
        ]],
                                on=['station_id'],
                                how='left')
Example #12
def aggregate_files(dates=dates, *, download=False, n_procs=1, daily=False):
    """Short summary.

    Parameters
    ----------
    dates : array-like of datetime-like
        Passed to :func:`build_urls`.
    download : bool, optional
        Whether to first download the AirNow files to the local directory
        before loading.
    n_procs : int
        For Dask.

    Returns
    -------
    pandas.DataFrame
        Of the combined AirNow hourly files.
    """
    import dask
    import dask.dataframe as dd

    print("Aggregating AIRNOW files...")

    urls, fnames = build_urls(dates, daily=daily)
    if download:
        for url, fname in zip(urls, fnames):
            retrieve(url, fname)
        dfs = [dask.delayed(read_csv)(f) for f in fnames]
    else:
        dfs = [dask.delayed(read_csv)(f) for f in urls]
    dff = dd.from_delayed(dfs)
    df = dff.compute(num_workers=n_procs).reset_index()

    # Datetime conversion
    if daily:
        df["time"] = pd.to_datetime(df.date, format=r"%m/%d/%y", exact=True)
    else:
        df["time"] = pd.to_datetime(
            df.date + " " + df.time, format=r"%m/%d/%y %H:%M", exact=True
        )  # TODO: move to read_csv? (and some of this other stuff too?)
        df["time_local"] = df.time + pd.to_timedelta(df.utcoffset, unit="H")
    df.drop(["date"], axis=1, inplace=True)

    print("    Adding in Meta-data")
    df = get_station_locations(df)
    if daily:
        df = df[[
            col for col in savecols if col not in {"time_local", "utcoffset"}
        ]]
    else:
        df = df[savecols]
    df.drop_duplicates(inplace=True)

    df = filter_bad_values(df)

    return df.reset_index(drop=True)
Example #13
 def load(
     self, name, from_date=None, to_date=None, freq=None, time_travel=None, **kwargs
 ):
     # Find the last value _before_ time range to carry over
     last_before = from_date
     if from_date:
         _, last_before = self._range(
             name, to_date=from_date, time_travel=time_travel
         )
         last_before = last_before["time"]
     ddf = self._read(name, last_before, to_date, freq, time_travel, **kwargs)
     if not from_date:
         from_date = ddf.index.min().compute()  # First value in data
     if not to_date:
         to_date = ddf.index.max().compute()  # Last value in data
     if pd.Timestamp(to_date) < pd.Timestamp(from_date):
         to_date = from_date
     # Keep only last created_time for each index timestamp
     delayed_apply = dask.delayed(
         # Use pandas on each dask partition
         lambda x: x.reset_index()
         .set_index("created_time")
         .sort_index()
         .groupby("time")
         .last()
     )
     ddf = dd.from_delayed([delayed_apply(d) for d in ddf.to_delayed()])
     #  Repartition to remove empty chunks
     ddf = ddf.repartition(partition_size="25MB")
     # Apply resampling/date filtering
     if freq:
         # Index samples for final dataframe
         samples = dd.from_pandas(
             pd.DataFrame(index=pd.date_range(from_date, to_date, freq=freq)),
             chunksize=100000,
         )
         ddf = dd.merge(
             # Interpolate
             dd.merge(
                 ddf,
                 samples,
                 left_index=True,
                 right_index=True,
                 how="outer",
             ).ffill(),
             samples,
             left_index=True,
             right_index=True,
             how="right",
         )
     else:
         # Filter on date range
         ddf = ddf.loc[pd.Timestamp(from_date) : pd.Timestamp(to_date)]
     #  Repartition to remove empty chunks
     ddf = ddf.repartition(partition_size="25MB")
     return ddf
Example #14
    def _load_metadata(self):
        import dask.dataframe as dd
        import dask.delayed
        if self.dataframe is None:
            self.parts = [
                dask.delayed(get_partition)(
                    self.url, self.headers, self._source_id, self.container, i
                )
                for i in range(self.npartitions)
            ]
            self.dataframe = dd.from_delayed(self.parts)
        return self._schema
Example #15
    def to_dask(self):
        import dask.delayed
        import dask.dataframe as dd
        self._get_schema()
        dload = dask.delayed(load_stream)
        parts = [
            dload(self._stream_class, s, self._protocol, self._payload,
                  self._chunksize) for s in self._stream_sources
        ]
        return dd.from_delayed(parts)
Example #16
def to_dask_cudf(dask_arr, client=None):
    client = default_client() if client is None else client

    elms = [_to_cudf(dp) for dp in dask_arr.to_delayed().flatten()]
    dfs = client.compute(elms)
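    # client.compute returns distributed Futures; dd.from_delayed accepts these
    # directly in place of Delayed objects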

    meta = client.submit(_get_meta, dfs[0])
    meta_local = meta.result()

    return dd.from_delayed(dfs, meta=meta_local)
Example #17
def to_dask_cudf(futures):
    """
    Convert a list of futures containing cudf Dataframes into a Dask.Dataframe
    :param futures: list[cudf.Dataframe] list of futures containing dataframes
    :return: dask.Dataframe a dask.Dataframe
    """
    c = default_client()
    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)
Example #18
    def load_season(self, season, columns=None, npartitions=8):
        """ Loads a single season from the statcast_data folder
        
        Parameters
        ----------
        season (int, str) : the season to load

        columns (list, default=None) : columns to read. If None, all columns are read

        npartitions(int, default=8) : the number of pandas DataFrames to split the Dask DataFrame into

        Returns
        --------
        DataFrame : the data from the specified season
        """
        df = [dask.delayed(feather.read_dataframe)(f"statcast_data/{season}", columns=columns)]
        meta = self.get_data_types(df=dd.from_delayed(df))
        df = dd.from_delayed(df, meta=meta).repartition(npartitions=npartitions)
        df = df.set_index("index")
        return df
Example #19
def read_metafile(path: PathType) -> dd.DataFrame:
    """Read cbgen metafile containing partitioned variant info"""
    with bgen_metafile(path) as mf:
        divisions = [mf.partition_size * i
                     for i in range(mf.npartitions)] + [mf.nvariants - 1]
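        # from_delayed expects len(divisions) == npartitions + 1; the final entry
        # is the (inclusive) maximum index value of the last partition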
        dfs = [
            dask.delayed(_read_metafile_partition)(path, i)
            for i in range(mf.npartitions)
        ]
        meta = dd.utils.make_meta(METAFILE_DTYPE)
        return dd.from_delayed(dfs, meta=meta, divisions=divisions)
Example #20
    def calc_docstats_df(self, crps):

        # Get a list of records with doc stats for df
        docstat_lst = [self.make_record_df(doc) for doc in crps]

        # Load into dask df
        docstats_dd = dd.from_delayed(docstat_lst,
                                      # meta=dtype_sample_df
                                      )

        return docstats_dd
Example #21
File: sql.py Project: simonkamronn/dask
def read_sql_table(table, uri, npartitions=None, columns=None,
                   index_col=None, chunkrowsize=1000000, **kwargs):
    """
    Create dataframe from an SQL table.

    Parameters
    ----------
    table : string
        Table name
    uri : string
        Full sqlalchemy URI for the database connection
    npartitions : int or None
        Number of partitions. If None, uses chunkrowsize.
    chunkrowsize : int
        If npartitions is None, use this to decide the sizes of the
        partitions.
    columns : list of strings or None
        Which columns to select; if None, gets all
    index_col : string
        Column which becomes the index, and defines the partitioning. Should
        be an indexed column in the SQL server. Required; a ValueError is
        raised if it is not provided.
    kwargs : dict
        Additional parameters to pass to `pd.read_sql()`

    Returns
    -------
    dask.dataframe
    """
    if index_col is None:
        raise ValueError("Must specify index column to partition on")
    if npartitions is None:
        length = pd.read_sql('select count(1) from ' + table, uri).iloc[0, 0]
        npartitions = (length-1) // chunkrowsize + 1
    if columns and index_col not in columns:
        columns.append(index_col)
    columns = ", ".join(['"{}"'.format(c) for c in columns]) if columns else "*"
    head = pd.read_sql('SELECT {columns} FROM {table} LIMIT 5'.format(
        columns=columns, table=table
    ), uri, **kwargs)
    columns = ", ".join(['"{}"'.format(c) for c in head.columns]) if columns=="*" else columns
    parts = []
    kwargs['index_col'] = index_col
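    # NTILE(n) OVER (ORDER BY index) assigns each ordered row to one of n buckets of
    # roughly equal size; each partition's query below selects a single bucket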
    for i in range(npartitions):
        q = """
            SELECT {columns} FROM
            (SELECT {columns},
                NTILE({nparts}) OVER (ORDER BY "{index}") as partition
             FROM {table}) temp
            WHERE partition = {i};
            """.format(columns=columns, table=table, nparts=npartitions,
                       index=index_col, i=i+1)
        parts.append(delayed(pd.read_sql_query)(q, uri, **kwargs))
    return from_delayed(parts, head)
Example #22
File: core.py Project: sriramch/cudf
    def sort_values_binned(self, by):
        """Sort by the given column and ensure that the same key
        doesn't spread across multiple partitions.
        """
        # Get sorted partitions
        parts = self.sort_values(by=by).to_delayed()

        # Get unique keys in each partition
        @delayed
        def get_unique(p):
            return set(p[by].unique())

        uniques = list(compute(*map(get_unique, parts)))

        joiner = {}
        for i in range(len(uniques)):
            joiner[i] = to_join = {}
            for j in range(i + 1, len(uniques)):
                intersect = uniques[i] & uniques[j]
                # If the keys intersect
                if intersect:
                    # Remove keys
                    uniques[j] -= intersect
                    to_join[j] = frozenset(intersect)
                else:
                    break

        @delayed
        def join(df, other, keys):
            others = [
                other.query("{by}==@k".format(by=by)) for k in sorted(keys)
            ]
            return cudf.concat([df] + others)

        @delayed
        def drop(df, keep_keys):
            locvars = locals()
            for i, k in enumerate(keep_keys):
                locvars["k{}".format(i)] = k

            conds = [
                "{by}==@k{i}".format(by=by, i=i) for i in range(len(keep_keys))
            ]
            expr = " or ".join(conds)
            return df.query(expr)

        for i in range(len(parts)):
            if uniques[i]:
                parts[i] = drop(parts[i], uniques[i])
                for joinee, intersect in joiner[i].items():
                    parts[i] = join(parts[i], parts[joinee], intersect)

        results = [p for i, p in enumerate(parts) if uniques[i]]
        return from_delayed(results, meta=self._meta).reset_index()
Example #23
    def sort_values(self, by, ignore_index=False):
        """Sort by the given column

        Parameters
        ----------
        by : str
        """
        parts = self.to_delayed()
        sorted_parts = batcher_sortnet.sort_delayed_frame(parts, by)
        return from_delayed(
            sorted_parts, meta=self._meta).reset_index(force=not ignore_index)
Example #24
def test_from_delayed_optimize_fusion():
    # Test that DataFrame optimization fuses a `from_delayed`
    # layer with other Blockwise layers and input Delayed tasks.
    # See: https://github.com/dask/dask/pull/8852
    ddf = (dd.from_delayed(
        map(delayed(lambda x: pd.DataFrame({"x": [x] * 10})), range(10)),
        meta=pd.DataFrame({"x": [0] * 10}),
    ) + 1)
    # NOTE: Fusion requires `optimize_blockwise`` and `fuse_roots`
    assert isinstance(ddf.dask.layers[ddf._name], Blockwise)
    assert len(optimize(ddf.dask, ddf.__dask_keys__()).layers) == 1
Example #25
def dask_add_rowid(df, col_name):
    parts = df.to_delayed()

    parts = [
        dask.delayed(_add_rowid)(part, col_name, len(parts), idx)
        for idx, part in enumerate(parts)
    ]

    meta = df._meta.copy()
    meta[col_name] = pd.Series([], dtype=int)

    return dd.from_delayed(parts, meta=meta, divisions=df.divisions)
Example #26
def dask_offset_limit(df, offset, limit):
    """Perform a limit-offset operation against a dataframe."""
    parts = df.to_delayed()

    lens = [dask.delayed(len)(part) for part in parts]
    lens = dask.delayed(as_list)(*lens)
    parts = [
        dask.delayed(select_subset)(idx, part, lens, offset, limit, df._meta)
        for idx, part in enumerate(parts)
    ]

    return dd.from_delayed(parts, meta=df._meta)
Example #27
def test_check_meta_flag():
    from pandas import Series
    from dask.delayed import delayed
    from dask.dataframe import from_delayed

    a = Series(["a", "b", "a"], dtype="category")
    b = Series(["a", "c", "a"], dtype="category")
    da = delayed(lambda x: x)(a)
    db = delayed(lambda x: x)(b)

    c = from_delayed([da, db], verify_meta=False)
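    # verify_meta=False turns off the per-partition check that computed results
    # match `meta`; with the default (True) the differing categories above would
    # surface as a metadata mismatch at compute time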
    assert_eq(c, c)
Example #28
    def test_categorical_empty(self):
        # GH 1705

        def make_empty():
            return pd.DataFrame({"A": pd.Categorical([np.nan, np.nan])})

        def make_full():
            return pd.DataFrame({"A": pd.Categorical(["a", "a"])})

        a = dd.from_delayed([dask.delayed(make_empty)(), dask.delayed(make_full)()])
        # Used to raise an IndexError
        a.A.cat.categories
Example #29
def test_set_index_sorted_min_max_same():
    a = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 0, 0]})
    b = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 1, 1]})

    aa = delayed(a)
    bb = delayed(b)

    df = dd.from_delayed([aa, bb], meta=a)
    assert not df.known_divisions

    df2 = df.set_index('y', sorted=True)
    assert df2.divisions == (0, 1, 1)
Example #30
def test_from_delayed_preserves_hlgs():
    df = pd.DataFrame(data=np.random.normal(size=(10, 4)), columns=list("abcd"))
    parts = [df.iloc[:1], df.iloc[1:3], df.iloc[3:6], df.iloc[6:10]]
    dfs = [delayed(parts.__getitem__)(i) for i in range(4)]
    meta = dfs[0].compute()

    chained = [d.a for d in dfs]
    hlg = dd.from_delayed(chained, meta=meta).dask
    for d in chained:
        for layer_name, layer in d.dask.layers.items():
            assert hlg.layers[layer_name] == layer
            assert hlg.dependencies[layer_name] == d.dask.dependencies[layer_name]
Example #31
def to_dask_cudf(futures, client=None):
    """
    Convert a list of futures containing cudf Dataframes into a Dask.Dataframe
    :param futures: list[cudf.Dataframe] list of futures containing dataframes
    :param client: dask.distributed.Client Optional client to use
    :return: dask.Dataframe a dask.Dataframe
    """
    c = default_client() if client is None else client
    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)
Example #32
def test_from_delayed():
    df = pd.DataFrame(data=np.random.normal(size=(10, 4)),
                      columns=list('abcd'))
    parts = [df.iloc[:1], df.iloc[1:3], df.iloc[3:6], df.iloc[6:10]]
    dfs = [delayed(parts.__getitem__)(i) for i in range(4)]
    meta = dfs[0].compute()

    my_len = lambda x: pd.Series([len(x)])

    for divisions in [None, [0, 1, 3, 6, 10]]:
        ddf = dd.from_delayed(dfs, meta=meta, divisions=divisions)
        assert_eq(ddf, df)
        assert list(ddf.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

        s = dd.from_delayed([d.a for d in dfs],
                            meta=meta.a,
                            divisions=divisions)
        assert_eq(s, df.a)
        assert list(s.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

    with pytest.raises(ValueError):
        dd.from_delayed(dfs, meta=meta, divisions=[0, 1, 3, 6])

    with pytest.raises(ValueError) as e:
        dd.from_delayed(dfs, meta=meta.a).compute()
    assert str(e.value).startswith('Metadata mismatch found in `from_delayed`')
Example #33
def main():
    md = ihme.load_metadata()
    metric = md['metric'].copy()
    measure = md['measure'].copy()

    # datapoints
    datapoint_output_dir = osp.join(output_dir, 'deaths')
    os.makedirs(datapoint_output_dir, exist_ok=True)

    data_full = dd.from_delayed([dask.delayed(load_data)(f) for f in os.listdir(source_dir) if f.endswith('.zip')], meta=DTYPES)

    metric = metric.set_index('id')['name'].to_dict()
    measure = measure.set_index('id')['short_name'].to_dict()

    all_measures = list()
    measure_metric_combinations = product(MEASURES, METRICS)
    for g in measure_metric_combinations:
        name = measure[g[0]] + ' ' + metric[g[1]]
        print(f'creating datapoints for {name}')
        concept = to_concept_id(name)
        all_measures.append((concept, name))

        cols = ['location', 'sex', 'age', 'cause', 'year', 'val']
        df = data_full.loc[(data_full.measure == g[0]) & (data_full.metric == g[1]), cols].compute()
        serve_datapoint(df, concept)

    # entities
    serve_entities(md)

    # concepts
    cont_cdf = pd.DataFrame(all_measures, columns=['concept', 'name'])
    cont_cdf['concept_type'] = 'measure'
    cont_cdf.to_csv('../../ddf--concepts--continuous.csv', index=False)

    dis_cdf = pd.DataFrame([
        ['name', 'Name', 'string'],
        ['short_name', 'Short Name', 'string'],
        ['medium_name', 'Medium Name', 'string'],
        ['long_name', 'Long Name', 'string'],
        ['location', 'Location', 'entity_domain'],
        ['sex', 'Sex', 'entity_domain'],
        ['age', 'Age', 'entity_domain'],
        ['cause', 'Cause', 'entity_domain'],
        ['rei', 'Risk/Etiology/Impairment', 'entity_domain'],
        ['label', 'Label', 'string'],
        ['year', 'Year', 'time'],
        ['type', 'Type', 'string']
    ], columns=['concept', 'name', 'concept_type'])

    dis_cdf.sort_values(by='concept').to_csv('../../ddf--concepts--discrete.csv', index=False)

    print("Done.")
Example #34
    def test_categorical_empty(self):
        # GH 1705

        def make_empty():
            return pd.DataFrame({"A": pd.Categorical([np.nan, np.nan])})

        def make_full():
            return pd.DataFrame({"A": pd.Categorical(['a', 'a'])})

        a = dd.from_delayed([dask.delayed(make_empty)(),
                             dask.delayed(make_full)()])
        # Used to raise an IndexError
        a.A.cat.categories
Example #35
def to_dask(source):

    futures = [read_partition(source, i) for i in range(source.npartitions)]

    if source.container == 'ndarray':
        # array_parts = [da.from_delayed(f, shape=c.shape, dtype=c.dtype) for f, c in zip(futures, chunks)]
        # return da.concatenate(array_parts, axis=0)
        raise ValueError('FIXME: Support ndarray concatenation')
    elif source.container == 'dataframe':
        import dask.dataframe as dd
        return dd.from_delayed(futures)
    elif source.container == 'list':
        import dask.bag as db
        return db.from_delayed(futures)
    else:
        raise ValueError('Unknown container type: %s' % source.container)
Example #36
File: test_io.py Project: floriango/dask
def test_from_delayed_misordered_meta():
    df = pd.DataFrame(
        columns=['(1)', '(2)', 'date', 'ent', 'val'],
        data=[range(i * 5, i * 5 + 5) for i in range(3)],
        index=range(3)
    )

    # meta with different order for columns
    misordered_meta = pd.DataFrame(
        columns=['date', 'ent', 'val', '(1)', '(2)'],
        data=[range(5)]
    )

    ddf = dd.from_delayed([delayed(lambda: df)()], meta=misordered_meta)

    with pytest.raises(ValueError) as info:
        # produces a dataframe that does not match meta
        ddf.reset_index().compute()
    msg = "The columns in the computed data do not match the columns in the" \
          " provided metadata"
    assert msg in str(info.value)
Example #37
File: json.py Project: floriango/dask
def read_json(url_path, orient='records', lines=None, storage_options=None,
              blocksize=None, sample=2**20, encoding='utf-8', errors='strict',
              **kwargs):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.


    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant when using blocksize.
    encoding, errors:
        Text conversion, ``see bytes.decode()``

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    import dask.dataframe as dd
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    if blocksize and (orient != 'records' or not lines):
        raise ValueError("JSON file chunking only allowed for JSON-lines "
                         "input (orient='records', lines=True).")
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(url_path, b'\n', blocksize=blocksize,
                                   sample=sample, **storage_options)
        chunks = list(dask.core.flatten(chunks))
        first = read_json_chunk(first, encoding, errors, kwargs)
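        # first[:0] is an empty frame carrying the sampled columns; each chunk task
        # receives it so blocks without data still produce the right structure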
        parts = [dask.delayed(read_json_chunk)(
            chunk, encoding, errors, kwargs, meta=first[:0]
        ) for chunk in chunks]

    else:
        files = open_files(url_path, 'rt', encoding=encoding, errors=errors,
                           **storage_options)
        parts = [dask.delayed(read_json_file)(f, orient, lines, kwargs)
                 for f in files]
    return dd.from_delayed(parts)