def test_from_delayed():
    df = pd.DataFrame(data=np.random.normal(size=(10, 4)), columns=list('abcd'))
    parts = [df.iloc[:1], df.iloc[1:3], df.iloc[3:6], df.iloc[6:10]]
    dfs = [delayed(parts.__getitem__)(i) for i in range(4)]
    meta = dfs[0].compute()

    my_len = lambda x: pd.Series([len(x)])

    for divisions in [None, [0, 1, 3, 6, 10]]:
        ddf = dd.from_delayed(dfs, meta=meta, divisions=divisions)
        assert_eq(ddf, df)
        assert list(ddf.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

        s = dd.from_delayed([d.a for d in dfs], meta=meta.a, divisions=divisions)
        assert_eq(s, df.a)
        assert list(s.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

    meta2 = [(c, 'f8') for c in df.columns]
    assert_eq(dd.from_delayed(dfs, meta=meta2), df)
    assert_eq(dd.from_delayed([d.a for d in dfs], meta=('a', 'f8')), df.a)

    with pytest.raises(ValueError):
        dd.from_delayed(dfs, meta=meta, divisions=[0, 1, 3, 6])

    with pytest.raises(ValueError) as e:
        dd.from_delayed(dfs, meta=meta.a).compute()
    assert str(e.value).startswith('Metadata mismatch found in `from_delayed`')
def to_dask(self, *args, **kwds):
    from dask import dataframe as dd

    self._load_metadata()
    return dd.from_delayed([
        dask.delayed(self._get_partition(i, *args, **kwds))
        for i in range(self.npartitions)
    ])
def to_dask(self): """Create lazy dask dataframe object""" import dask.dataframe as dd from dask import delayed self.discover() dpart = delayed(read_file_uavro) return dd.from_delayed([dpart(f, self._head) for f in self._files], meta=self.dtype)
def test_from_delayed_sorted():
    a = pd.DataFrame({'x': [1, 2]}, index=[1, 10])
    b = pd.DataFrame({'x': [4, 1]}, index=[100, 200])

    A = dd.from_delayed([delayed(a), delayed(b)], divisions='sorted')
    assert A.known_divisions
    assert A.divisions == (1, 100, 200)
def aggregrate_files(self, urls, n_procs=1):
    import dask
    import dask.dataframe as dd

    dfs = [dask.delayed(self.read_csv)(f) for f in urls]
    dff = dd.from_delayed(dfs)
    df = dff.compute(num_workers=n_procs)
    return df
def test_basic(loop, delayed):  # noqa: F811
    with dask_cuda.LocalCUDACluster(loop=loop) as cluster:
        with Client(cluster):
            pdf = dask.datasets.timeseries(dtypes={"x": int}).reset_index()
            gdf = pdf.map_partitions(cudf.DataFrame.from_pandas)
            if delayed:
                gdf = dd.from_delayed(gdf.to_delayed())
            dd.assert_eq(pdf.head(), gdf.head())
def add_data(self, dates, daily=False, sub_hourly=False, download=False, latlonbox=None):
    """Load data for the given dates and store the result in ``self.df``.

    Parameters
    ----------
    dates : array-like of datetime-like
        Dates for which to build URLs and load files.
    daily : bool, optional
        Load the daily files (the default is False).
    sub_hourly : bool, optional
        Load the sub-hourly files (the default is False).
    download : bool, optional
        If True, download the files locally before loading (the default is False).
    latlonbox : list of float, optional
        ``[latmin, lonmin, latmax, lonmax]`` bounding box used to subset the
        monitors (the default is None, meaning all monitors).

    Returns
    -------
    None
        The combined data are stored in ``self.df``.
    """
    import dask
    import dask.dataframe as dd

    if self.monitor_df is None:
        self.get_monitor_df()

    if latlonbox is not None:  # subset monitors to [latmin, lonmin, latmax, lonmax]
        mdf = self.monitor_df
        con = (
            (mdf.LATITUDE >= latlonbox[0])
            & (mdf.LATITUDE <= latlonbox[2])
            & (mdf.LONGITUDE >= latlonbox[1])
            & (mdf.LONGITUDE <= latlonbox[3])
        )
        monitors = mdf.loc[con].copy()
    else:
        monitors = self.monitor_df.copy()

    urls, fnames = self.build_urls(monitors, dates, daily=daily, sub_hourly=sub_hourly)
    if download:
        for url, fname in zip(urls, fnames):
            self.retrieve(url, fname)
        dfs = [dask.delayed(self.load_file)(i) for i in fnames]
    else:
        dfs = [dask.delayed(self.load_file)(i) for i in urls]
    dff = dd.from_delayed(dfs)
    self.df = dff.compute()
    self.df = pd.merge(self.df, monitors, how="left", on=["WBANNO", "LATITUDE", "LONGITUDE"])
    if ~self.df.columns.isin(["time"]).max():
        self.df["time"] = self.df.time_local + pd.to_timedelta(self.df.GMT_OFFSET, unit="H")
    id_vars = self.monitor_df.columns.append(pd.Index(["time", "time_local"]))
    keys = self.df.columns[self.df.columns.isin(id_vars)]
    self.df = pd.melt(
        self.df, id_vars=keys, var_name="variable", value_name="obs"
    )  # this stacks columns to be inline with MONET
    self.df.rename(columns={"WBANNO": "siteid"}, inplace=True)
    self.change_units()
    self.df.columns = [i.lower() for i in self.df.columns]
def join_frames(left, right, on, how, lsuffix, rsuffix):
    """Join two frames on 1 or more columns.

    Parameters
    ----------
    left, right : dask_cudf.DataFrame
    on : tuple[str]
        key column(s)
    how : str
        Join method
    lsuffix, rsuffix : str
    """
    empty_frame = left._meta.merge(
        right._meta, on=on, how=how, suffixes=(lsuffix, rsuffix)
    )

    def merge(left, right):
        return left.merge(right, on=on, how=how, suffixes=(lsuffix, rsuffix))

    left_val_names = [k for k in left.columns if k not in on]
    right_val_names = [k for k in right.columns if k not in on]
    same_names = set(left_val_names) & set(right_val_names)
    if same_names and not (lsuffix or rsuffix):
        raise ValueError("there are overlapping columns but "
                         "lsuffix and rsuffix are not defined")

    dtypes = {k: left[k].dtype for k in left.columns}
    dtypes.update({k: right[k].dtype for k in right.columns})

    left_parts = left.to_delayed()
    right_parts = right.to_delayed()

    # Add column w/ hash(v) % nparts
    nparts = max(len(left_parts), len(right_parts))
    left_hashed = group_frame(left_parts, nparts, on)
    right_hashed = group_frame(right_parts, nparts, on)

    # Fanout each partition into nparts subgroups
    left_subgroups = fanout_subgroups(left_hashed, nparts)
    right_subgroups = fanout_subgroups(right_hashed, nparts)

    assert len(left_subgroups) == len(right_subgroups)

    # Concat
    left_cats = [delayed(cudf.concat, pure=True)(it) for it in left_subgroups]
    right_cats = [delayed(cudf.concat, pure=True)(it) for it in right_subgroups]

    # Combine
    merged = [
        delayed(merge, pure=True)(left_cats[i], right_cats[i])
        for i in range(nparts)
    ]

    return dd.from_delayed(merged, prefix="join_result", meta=empty_frame)
def add_data(self, dates, param=None, daily=False, network=None,
             download=False, local=False, n_procs=1, meta=False):
    """Add AQS data for the given dates.

    Parameters
    ----------
    dates : list of datetime objects
        Dates for which to build URLs and load files.
    param : list of strings, optional
        Parameter groups to load (the default is None, which loads all groups).
    daily : bool, optional
        Load the daily files (the default is False).
    network : type, optional
        Network passed through to ``load_aqs_file`` (the default is None).
    download : bool, optional
        If True, download the files locally before loading (the default is False).
    local : bool, optional
        If True, read already-downloaded local files (the default is False).
    n_procs : int, optional
        Number of Dask workers used for the compute (the default is 1).
    meta : bool, optional
        If True, post-process the result with ``add_data2`` (the default is False).

    Returns
    -------
    pandas.DataFrame
        The combined data.
    """
    import dask
    import dask.dataframe as dd

    if param is None:
        params = [
            'SPEC', 'PM10', 'PM2.5', 'PM2.5_FRM', 'CO', 'OZONE', 'SO2',
            'VOC', 'NONOXNOY', 'WIND', 'TEMP', 'RHDP'
        ]
    else:
        params = param
    urls, fnames = self.build_urls(params, dates, daily=daily)
    if download:
        for url, fname in zip(urls, fnames):
            self.retrieve(url, fname)
        dfs = [dask.delayed(self.load_aqs_file)(i, network) for i in fnames]
    elif local:
        dfs = [dask.delayed(self.load_aqs_file)(i, network) for i in fnames]
    else:
        dfs = [dask.delayed(self.load_aqs_file)(i, network) for i in urls]
    dff = dd.from_delayed(dfs)
    dfff = dff.compute(num_workers=n_procs)
    if meta:
        return self.add_data2(dfff, daily, network)
    else:
        return dfff
def load_all_seasons(self, columns=None, npartitions=8):
    """
    Loads all of the seasons in the statcast_data folder

    Parameters
    ----------
    columns (list, default=None) : columns to read. If None, all columns are read
    npartitions (int, default=8) : the number of pandas DataFrames to split the Dask DataFrame into

    Returns
    -------
    DataFrame : the data from the statcast_data folder as one DataFrame
    """
    datasets = [
        dask.delayed(feather.read_dataframe)(f"statcast_data/{year}", columns=columns)
        for year in range(2017, 2021)
    ]
    meta = self.get_data_types(df=dd.from_delayed(datasets[0]))
    df = dd.from_delayed(datasets, meta=meta).repartition(npartitions=npartitions)
    # set_index returns a new DataFrame; assign the result
    df = df.set_index("index")
    return df
def read_sites(self, box=None, country=None, state=None, site=None,
               resample=True, window='H'):
    import urllib.request, urllib.error, urllib.parse
    from numpy import NaN

    i = self.dates[0]
    year = i.strftime('%Y')
    url = 'https://www1.ncdc.noaa.gov/pub/data/noaa/' + year + '/'
    if self.history is None:
        self.read_ish_history()
    self.history['fname'] = (url + self.history.USAF + '-' +
                             self.history.WBAN + '-' + year + '.gz')
    dfloc = self.history.copy()
    if box is not None:
        print('Retrieving Sites in: ' + str(box))
        dfloc = self.subset_sites(latmin=box[0], lonmin=box[1],
                                  latmax=box[2], lonmax=box[3])
    elif country is not None:
        print('Retrieving Country: ' + country)
        dfloc = self.history.loc[self.history.CTRY == country, :]
    elif state is not None:
        print('Retrieving State: ' + state)
        dfloc = self.history.loc[self.history.STATE == state, :]
    elif site is not None:
        print('Retrieving Site: ' + site)
        dfloc = self.history.loc[self.history.station_id == site, :]
    print(dfloc.fname.unique())
    objs = self.get_url_file_objs(dfloc.fname.unique())
    # return objs,size,self.history.fname
    # dfs = []
    # for f in objs:
    #     try:
    #         dfs.append(self.read_data_frame(f))
    #     except:
    #         pass
    print(' Reading ISH into pandas DataFrame...')
    dfs = [dask.delayed(self.read_data_frame)(f) for f in objs]
    dff = dd.from_delayed(dfs)
    self.df = dff.compute()
    self.df.loc[self.df.vsb == 99999, 'vsb'] = NaN
    if resample:
        print(' Resampling to every ' + window)
        self.df.index = self.df.datetime
        self.df = self.df.groupby('station_id').resample(
            window).mean().reset_index()
    self.df = self.df.merge(
        self.history[['station_id', 'latitude', 'longitude', 'STATION NAME']],
        on=['station_id'], how='left')
def aggregate_files(dates=dates, *, download=False, n_procs=1, daily=False):
    """Aggregate the AirNow files for the given dates into one DataFrame.

    Parameters
    ----------
    dates : array-like of datetime-like
        Passed to :func:`build_urls`.
    download : bool, optional
        Whether to first download the AirNow files to the local directory
        before loading.
    n_procs : int
        For Dask.
    daily : bool, optional
        Whether to load the daily files instead of the hourly files.

    Returns
    -------
    pandas.DataFrame
        Of the combined AirNow files.
    """
    import dask
    import dask.dataframe as dd

    print("Aggregating AIRNOW files...")

    urls, fnames = build_urls(dates, daily=daily)
    if download:
        for url, fname in zip(urls, fnames):
            retrieve(url, fname)
        dfs = [dask.delayed(read_csv)(f) for f in fnames]
    else:
        dfs = [dask.delayed(read_csv)(f) for f in urls]
    dff = dd.from_delayed(dfs)
    df = dff.compute(num_workers=n_procs).reset_index()

    # Datetime conversion
    if daily:
        df["time"] = pd.to_datetime(df.date, format=r"%m/%d/%y", exact=True)
    else:
        df["time"] = pd.to_datetime(
            df.date + " " + df.time, format=r"%m/%d/%y %H:%M", exact=True
        )
        # TODO: move to read_csv? (and some of this other stuff too?)
        df["time_local"] = df.time + pd.to_timedelta(df.utcoffset, unit="H")
    df.drop(["date"], axis=1, inplace=True)

    print(" Adding in Meta-data")
    df = get_station_locations(df)
    if daily:
        df = df[[col for col in savecols if col not in {"time_local", "utcoffset"}]]
    else:
        df = df[savecols]
    df.drop_duplicates(inplace=True)

    df = filter_bad_values(df)

    return df.reset_index(drop=True)
def load(self, name, from_date=None, to_date=None, freq=None,
         time_travel=None, **kwargs):
    # Find the last value _before_ time range to carry over
    last_before = from_date
    if from_date:
        _, last_before = self._range(name, to_date=from_date, time_travel=time_travel)
        last_before = last_before["time"]

    ddf = self._read(name, last_before, to_date, freq, time_travel, **kwargs)

    if not from_date:
        from_date = ddf.index.min().compute()  # First value in data
    if not to_date:
        to_date = ddf.index.max().compute()  # Last value in data
    if pd.Timestamp(to_date) < pd.Timestamp(from_date):
        to_date = from_date

    # Keep only last created_time for each index timestamp
    delayed_apply = dask.delayed(
        # Use pandas on each dask partition
        lambda x: x.reset_index()
        .set_index("created_time")
        .sort_index()
        .groupby("time")
        .last()
    )
    ddf = dd.from_delayed([delayed_apply(d) for d in ddf.to_delayed()])

    # Repartition to remove empty chunks
    ddf = ddf.repartition(partition_size="25MB")

    # Apply resampling/date filtering
    if freq:
        # Index samples for final dataframe
        samples = dd.from_pandas(
            pd.DataFrame(index=pd.date_range(from_date, to_date, freq=freq)),
            chunksize=100000,
        )
        ddf = dd.merge(
            # Interpolate
            dd.merge(
                ddf,
                samples,
                left_index=True,
                right_index=True,
                how="outer",
            ).ffill(),
            samples,
            left_index=True,
            right_index=True,
            how="right",
        )
    else:
        # Filter on date range
        ddf = ddf.loc[pd.Timestamp(from_date):pd.Timestamp(to_date)]

    # Repartition to remove empty chunks
    ddf = ddf.repartition(partition_size="25MB")

    return ddf
def _load_metadata(self):
    import dask.dataframe as dd
    import dask.delayed

    if self.dataframe is None:
        self.parts = [
            dask.delayed(get_partition)(
                self.url, self.headers, self._source_id, self.container, i
            )
            for i in range(self.npartitions)
        ]
        self.dataframe = dd.from_delayed(self.parts)
    return self._schema
def to_dask(self):
    import dask.delayed
    import dask.dataframe as dd

    self._get_schema()
    dload = dask.delayed(load_stream)
    parts = [
        dload(self._stream_class, s, self._protocol, self._payload, self._chunksize)
        for s in self._stream_sources
    ]
    return dd.from_delayed(parts)
def to_dask_cudf(dask_arr, client=None):
    client = default_client() if client is None else client
    elms = [_to_cudf(dp) for dp in dask_arr.to_delayed().flatten()]
    dfs = client.compute(elms)
    meta = client.submit(_get_meta, dfs[0])
    meta_local = meta.result()
    return dd.from_delayed(dfs, meta=meta_local)
def to_dask_cudf(futures):
    """
    Convert a list of futures containing cudf Dataframes into a Dask.Dataframe

    :param futures: list[cudf.Dataframe] list of futures containing dataframes
    :return: dask.Dataframe a dask.Dataframe
    """
    c = default_client()
    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)
def load_season(self, season, columns=None, npartitions=8):
    """
    Loads a single season from the statcast_data folder

    Parameters
    ----------
    season (int, str) : the season to load
    columns (list, default=None) : columns to read. If None, all columns are read
    npartitions (int, default=8) : the number of pandas DataFrames to split the Dask DataFrame into

    Returns
    -------
    DataFrame : the data from the specified season
    """
    df = [dask.delayed(feather.read_dataframe)(f"statcast_data/{season}", columns=columns)]
    meta = self.get_data_types(df=dd.from_delayed(df))
    df = dd.from_delayed(df, meta=meta).repartition(npartitions=npartitions)
    # set_index returns a new DataFrame; assign the result
    df = df.set_index("index")
    return df
def read_metafile(path: PathType) -> dd.DataFrame:
    """Read cbgen metafile containing partitioned variant info"""
    with bgen_metafile(path) as mf:
        divisions = [mf.partition_size * i for i in range(mf.npartitions)] + [mf.nvariants - 1]
        dfs = [
            dask.delayed(_read_metafile_partition)(path, i)
            for i in range(mf.npartitions)
        ]
        meta = dd.utils.make_meta(METAFILE_DTYPE)
        return dd.from_delayed(dfs, meta=meta, divisions=divisions)
def calc_docstats_df(self, crps):
    # Get a list of records with doc stats for df
    docstat_lst = [self.make_record_df(doc) for doc in crps]
    # Load into dask df
    docstats_dd = dd.from_delayed(docstat_lst,
                                  # meta=dtype_sample_df
                                  )
    return docstats_dd
def read_sql_table(table, uri, npartitions=None, columns=None, index_col=None,
                   chunkrowsize=1000000, **kwargs):
    """
    Create dataframe from an SQL table.

    Parameters
    ----------
    table : string
        Table name
    uri : string
        Full sqlalchemy URI for the database connection
    npartitions : int or None
        Number of partitions. If None, uses chunkrowsize.
    chunkrowsize : int
        If npartitions is None, use this to decide the sizes of the partitions.
    columns : list of strings or None
        Which columns to select; if None, gets all
    index_col : string
        Column which becomes the index, and defines the partitioning. Should
        be an indexed column in the SQL server. Required.
    kwargs : dict
        Additional parameters to pass to `pd.read_sql()`

    Returns
    -------
    dask.dataframe
    """
    if index_col is None:
        raise ValueError("Must specify index column to partition on")
    if npartitions is None:
        length = pd.read_sql('select count(1) from ' + table, uri).iloc[0, 0]
        npartitions = (length - 1) // chunkrowsize + 1
    if columns and index_col not in columns:
        columns.append(index_col)
    columns = ", ".join(['"{}"'.format(c) for c in columns]) if columns else "*"
    head = pd.read_sql('SELECT {columns} FROM {table} LIMIT 5'.format(
        columns=columns, table=table, index_col=index_col), uri, **kwargs)
    columns = (", ".join(['"{}"'.format(c) for c in head.columns])
               if columns == "*" else columns)
    parts = []
    kwargs['index_col'] = index_col
    for i in range(npartitions):
        q = """
            SELECT {columns} FROM
            (SELECT {columns}, NTILE({nparts}) OVER (ORDER BY "{index}") as partition
             FROM {table}) temp
            WHERE partition = {i};
        """.format(columns=columns, table=table, nparts=npartitions,
                   index=index_col, i=i + 1)
        parts.append(delayed(pd.read_sql_query)(q, uri, **kwargs))
    return from_delayed(parts, head)
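# A minimal usage sketch for the read_sql_table function above. The table name,
# URI, and column names are hypothetical placeholders, not part of the original
# snippet; the call pattern simply follows the docstring and signature.
def example_read_sql_table_usage():
    ddf = read_sql_table(
        'measurements',                            # hypothetical table name
        'postgresql://user:pw@localhost:5432/db',  # full sqlalchemy URI
        index_col='id',                            # indexed column defining the partitioning
        columns=['site', 'value'],                 # index_col is appended automatically if missing
        chunkrowsize=500000,                       # target rows per partition when npartitions is None
    )
    return ddf.compute()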
def sort_values_binned(self, by):
    """Sort by the given column and ensure that the same key doesn't spread
    across multiple partitions.
    """
    # Get sorted partitions
    parts = self.sort_values(by=by).to_delayed()

    # Get unique keys in each partition
    @delayed
    def get_unique(p):
        return set(p[by].unique())

    uniques = list(compute(*map(get_unique, parts)))

    joiner = {}
    for i in range(len(uniques)):
        joiner[i] = to_join = {}
        for j in range(i + 1, len(uniques)):
            intersect = uniques[i] & uniques[j]
            # If the keys intersect
            if intersect:
                # Remove keys
                uniques[j] -= intersect
                to_join[j] = frozenset(intersect)
            else:
                break

    @delayed
    def join(df, other, keys):
        others = [other.query("{by}==@k".format(by=by)) for k in sorted(keys)]
        return cudf.concat([df] + others)

    @delayed
    def drop(df, keep_keys):
        locvars = locals()
        for i, k in enumerate(keep_keys):
            locvars["k{}".format(i)] = k
        conds = ["{by}==@k{i}".format(by=by, i=i) for i in range(len(keep_keys))]
        expr = " or ".join(conds)
        return df.query(expr)

    for i in range(len(parts)):
        if uniques[i]:
            parts[i] = drop(parts[i], uniques[i])
            for joinee, intersect in joiner[i].items():
                parts[i] = join(parts[i], parts[joinee], intersect)

    results = [p for i, p in enumerate(parts) if uniques[i]]
    return from_delayed(results, meta=self._meta).reset_index()
def sort_values(self, by, ignore_index=False):
    """Sort by the given column

    Parameters
    ----------
    by : str
    """
    parts = self.to_delayed()
    sorted_parts = batcher_sortnet.sort_delayed_frame(parts, by)
    return from_delayed(sorted_parts, meta=self._meta).reset_index(
        force=not ignore_index)
def test_from_delayed_optimize_fusion():
    # Test that DataFrame optimization fuses a `from_delayed`
    # layer with other Blockwise layers and input Delayed tasks.
    # See: https://github.com/dask/dask/pull/8852
    ddf = (
        dd.from_delayed(
            map(delayed(lambda x: pd.DataFrame({"x": [x] * 10})), range(10)),
            meta=pd.DataFrame({"x": [0] * 10}),
        )
        + 1
    )

    # NOTE: Fusion requires `optimize_blockwise` and `fuse_roots`
    assert isinstance(ddf.dask.layers[ddf._name], Blockwise)
    assert len(optimize(ddf.dask, ddf.__dask_keys__()).layers) == 1
def dask_add_rowid(df, col_name):
    parts = df.to_delayed()
    parts = [
        dask.delayed(_add_rowid)(part, col_name, len(parts), idx)
        for idx, part in enumerate(parts)
    ]
    meta = df._meta.copy()
    meta[col_name] = pd.Series([], dtype=int)
    return dd.from_delayed(parts, meta=meta, divisions=df.divisions)
def dask_offset_limit(df, offset, limit):
    """Perform a limit-offset operation against a dataframe."""
    parts = df.to_delayed()
    lens = [dask.delayed(len)(part) for part in parts]
    lens = dask.delayed(as_list)(*lens)
    parts = [
        dask.delayed(select_subset)(idx, part, lens, offset, limit, df._meta)
        for idx, part in enumerate(parts)
    ]
    return dd.from_delayed(parts, meta=df._meta)
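# The helpers `as_list` and `select_subset` used by dask_offset_limit above are
# not shown in this snippet. The two functions below are hypothetical sketches
# of how such helpers could work: per-partition lengths are gathered, and each
# partition slices out its share of the global [offset, offset + limit) window.
def as_list(*lens):
    # Collect the per-partition lengths into one list.
    return list(lens)


def select_subset(idx, part, lens, offset, limit, meta):
    # Number of rows in all partitions before this one; this determines where
    # the global offset/limit window falls within the local partition.
    start_global = sum(lens[:idx])
    lo = max(offset - start_global, 0)
    hi = min(offset + limit - start_global, lens[idx])
    if hi <= lo:
        # This partition contributes nothing; return an empty frame with the right schema.
        return meta
    return part.iloc[lo:hi]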
def test_check_meta_flag():
    from pandas import Series

    from dask.delayed import delayed
    from dask.dataframe import from_delayed

    a = Series(["a", "b", "a"], dtype="category")
    b = Series(["a", "c", "a"], dtype="category")
    da = delayed(lambda x: x)(a)
    db = delayed(lambda x: x)(b)
    c = from_delayed([da, db], verify_meta=False)
    assert_eq(c, c)
def test_categorical_empty(self):
    # GH 1705
    def make_empty():
        return pd.DataFrame({"A": pd.Categorical([np.nan, np.nan])})

    def make_full():
        return pd.DataFrame({"A": pd.Categorical(["a", "a"])})

    a = dd.from_delayed([dask.delayed(make_empty)(), dask.delayed(make_full)()])

    # Used to raise an IndexError
    a.A.cat.categories
def test_set_index_sorted_min_max_same():
    a = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 0, 0]})
    b = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 1, 1]})

    aa = delayed(a)
    bb = delayed(b)

    df = dd.from_delayed([aa, bb], meta=a)
    assert not df.known_divisions

    df2 = df.set_index('y', sorted=True)
    assert df2.divisions == (0, 1, 1)
def test_from_delayed_preserves_hlgs():
    df = pd.DataFrame(data=np.random.normal(size=(10, 4)), columns=list("abcd"))
    parts = [df.iloc[:1], df.iloc[1:3], df.iloc[3:6], df.iloc[6:10]]
    dfs = [delayed(parts.__getitem__)(i) for i in range(4)]
    meta = dfs[0].compute()

    chained = [d.a for d in dfs]
    hlg = dd.from_delayed(chained, meta=meta).dask
    for d in chained:
        for layer_name, layer in d.dask.layers.items():
            assert hlg.layers[layer_name] == layer
            assert hlg.dependencies[layer_name] == d.dask.dependencies[layer_name]
def to_dask_cudf(futures, client=None):
    """
    Convert a list of futures containing cudf Dataframes into a Dask.Dataframe

    :param futures: list[cudf.Dataframe] list of futures containing dataframes
    :param client: dask.distributed.Client Optional client to use
    :return: dask.Dataframe a dask.Dataframe
    """
    c = default_client() if client is None else client
    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)
def main():
    md = ihme.load_metadata()
    metric = md['metric'].copy()
    measure = md['measure'].copy()

    # datapoints
    datapoint_output_dir = osp.join(output_dir, 'deaths')
    os.makedirs(datapoint_output_dir, exist_ok=True)

    data_full = dd.from_delayed(
        [dask.delayed(load_data)(f) for f in os.listdir(source_dir) if f.endswith('.zip')],
        meta=DTYPES)

    metric = metric.set_index('id')['name'].to_dict()
    measure = measure.set_index('id')['short_name'].to_dict()

    all_measures = list()
    measure_metric_combinations = product(MEASURES, METRICS)
    for g in measure_metric_combinations:
        name = measure[g[0]] + ' ' + metric[g[1]]
        print(f'creating datapoints for {name}')
        concept = to_concept_id(name)
        all_measures.append((concept, name))
        cols = ['location', 'sex', 'age', 'cause', 'year', 'val']
        df = data_full.loc[(data_full.measure == g[0]) & (data_full.metric == g[1]), cols].compute()
        serve_datapoint(df, concept)

    # entities
    serve_entities(md)

    # concepts
    cont_cdf = pd.DataFrame(all_measures, columns=['concept', 'name'])
    cont_cdf['concept_type'] = 'measure'
    cont_cdf.to_csv('../../ddf--concepts--continuous.csv', index=False)

    dis_cdf = pd.DataFrame([
        ['name', 'Name', 'string'],
        ['short_name', 'Short Name', 'string'],
        ['medium_name', 'Medium Name', 'string'],
        ['long_name', 'Long Name', 'string'],
        ['location', 'Location', 'entity_domain'],
        ['sex', 'Sex', 'entity_domain'],
        ['age', 'Age', 'entity_domain'],
        ['cause', 'Cause', 'entity_domain'],
        ['rei', 'Risk/Etiology/Impairment', 'entity_domain'],
        ['label', 'Label', 'string'],
        ['year', 'Year', 'time'],
        ['type', 'Type', 'string']
    ], columns=['concept', 'name', 'concept_type'])
    dis_cdf.sort_values(by='concept').to_csv('../../ddf--concepts--discrete.csv', index=False)

    print("Done.")
def to_dask(source):
    futures = [read_partition(source, i) for i in range(source.npartitions)]
    if source.container == 'ndarray':
        # array_parts = [da.from_delayed(f, shape=c.shape, dtype=c.dtype)
        #                for f, c in zip(futures, chunks)]
        # return da.concatenate(array_parts, axis=0)
        raise ValueError('FIXME: Support ndarray concatenation')
    elif source.container == 'dataframe':
        import dask.dataframe as dd
        return dd.from_delayed(futures)
    elif source.container == 'list':
        import dask.bag as db
        return db.from_delayed(futures)
    else:
        raise ValueError('Unknown container type: %s' % source.container)
def test_from_delayed_misordered_meta():
    df = pd.DataFrame(
        columns=['(1)', '(2)', 'date', 'ent', 'val'],
        data=[range(i * 5, i * 5 + 5) for i in range(3)],
        index=range(3),
    )

    # meta with different order for columns
    misordered_meta = pd.DataFrame(
        columns=['date', 'ent', 'val', '(1)', '(2)'],
        data=[range(5)],
    )

    ddf = dd.from_delayed([delayed(lambda: df)()], meta=misordered_meta)

    with pytest.raises(ValueError) as info:
        # produces dataframe which does not match meta
        ddf.reset_index().compute()
    msg = ("The columns in the computed data do not match the columns in the"
           " provided metadata")
    assert msg in str(info.value)
def read_json(url_path, orient='records', lines=None, storage_options=None,
              blocksize=None, sample=2**20, encoding='utf-8', errors='strict',
              **kwargs):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the
        nearest newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant if using blocksize.
    encoding, errors:
        Text conversion, see ``bytes.decode()``

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx 256MB size

    >> dd.read_json('data/file*.json', blocksize=2**28)
    """
    import dask.dataframe as dd
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    if blocksize and (orient != 'records' or not lines):
        raise ValueError("JSON file chunking only allowed for JSON-lines "
                         "input (orient='records', lines=True).")
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(url_path, b'\n', blocksize=blocksize,
                                   sample=sample, **storage_options)
        chunks = list(dask.core.flatten(chunks))
        first = read_json_chunk(first, encoding, errors, kwargs)
        parts = [dask.delayed(read_json_chunk)(
            chunk, encoding, errors, kwargs, meta=first[:0]
        ) for chunk in chunks]
    else:
        files = open_files(url_path, 'rt', encoding=encoding, errors=errors,
                           **storage_options)
        parts = [dask.delayed(read_json_file)(f, orient, lines, kwargs)
                 for f in files]
    return dd.from_delayed(parts)