def test_add():
    """Check that the dask-level ``add`` agrees with the in-memory one."""
    dense = pd.DataFrame(np.identity(12))
    shifted = dense.copy()
    # Offset the second operand's index by one row.
    shifted.index += 1

    local_left = sp.SparseFrame(dense)
    local_right = sp.SparseFrame(shifted)
    expected = local_left.add(local_right).todense()

    lazy_left = dsp.from_pandas(dense, npartitions=4)
    lazy_right = dsp.from_pandas(shifted, npartitions=4)
    actual = lazy_left.add(lazy_right).compute().todense()

    pdt.assert_frame_equal(actual, expected)
def from_pandas(df, npartitions=None, chunksize=None, name=None):
    """Split an in-memory pandas object into a partitioned SparseFrame.

    The input is sorted by index if necessary, cut into consecutive row
    ranges, and each range is wrapped in an ``sp.SparseFrame``.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        The DataFrame/Series with which to construct the collection.
    npartitions : int, optional
        The number of partitions of the index to create. Note that depending
        on the size and index of the dataframe, the output may have fewer
        partitions than requested.
    chunksize : int, optional
        The size of the partitions of the index. When given, it takes
        precedence and ``npartitions`` is recomputed from it.
    name : string, optional
        An optional key name for the dataframe. Defaults to hashing the
        input, which takes a lot of time on a large ``df``, so define it
        explicitly for large inputs.

    Returns
    -------
    SparseFrame
        Collection whose graph maps ``(name, i)`` to the i-th row slice.

    Raises
    ------
    ValueError
        If neither ``npartitions`` nor ``chunksize`` is specified.
    """
    # Fail early with a clear message instead of a TypeError from the
    # division below when both sizing arguments are left at their defaults.
    if npartitions is None and chunksize is None:
        raise ValueError(
            'Either npartitions or chunksize must be specified.')

    nrows = df.shape[0]
    if chunksize is None:
        chunksize = int(ceil(nrows / npartitions))
    else:
        npartitions = int(ceil(nrows / chunksize))

    # Slicing below is positional, so the index must be sorted first.
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    divisions, locations = sorted_division_locations(df.index,
                                                     chunksize=chunksize)
    name = name or 'from_pandas-{}'.format(tokenize(df, npartitions))
    dsk = {
        (name, i): sp.SparseFrame(df.iloc[start:stop])
        for i, (start, stop) in enumerate(zip(locations[:-1], locations[1:]))
    }
    meta = _make_meta(df)
    return SparseFrame(dsk, name, meta, divisions)
def one_hot_encode(ddf, column, categories, index_col):
    """
    Sparse one hot encoding of a dask.DataFrame.

    Every partition of ``ddf`` is passed through ``sparse_one_hot`` so that
    a single column is one hot encoded into sparse form.

    Parameters
    ----------
    ddf: dask.DataFrame
        e.g. the clickstream
    column: str
        name of the column to one hot encode
    categories: iterable
        possible category values; they become the result's columns
    index_col: str, iterable
        which columns to use as index; if falsy the existing index of
        ``ddf`` is kept

    Returns
    -------
    sparse_one_hot: SparseFrame
        built from the graph, name and divisions of the mapped partitions
    """
    if index_col:
        # Empty index with the structure obtained from re-indexing the meta.
        reindexed = ddf._meta.reset_index().set_index(index_col)
        idx_meta = reindexed.index[:0]
    else:
        idx_meta = ddf._meta.index

    meta = sp.SparseFrame(np.array([]), columns=categories, index=idx_meta)

    encode_kwargs = dict(column=column,
                         categories=categories,
                         index_col=index_col)
    dsf = ddf.map_partitions(sparse_one_hot, meta=object, **encode_kwargs)
    return SparseFrame(dsf.dask, dsf._name, meta, dsf.divisions)
def _make_meta(inp):
    """Derive a meta object for ``inp``.

    An ``sp.SparseFrame`` yields its own zero-row slice. Anything else is
    delegated to ``dd_make_meta``; a pandas ``NDFrame`` result is wrapped in
    an ``sp.SparseFrame``, any other result is returned as is.
    """
    if isinstance(inp, sp.SparseFrame):
        return inp.iloc[:0]

    dd_meta = dd_make_meta(inp)
    if isinstance(dd_meta, pd.core.generic.NDFrame):
        return sp.SparseFrame(dd_meta)
    return dd_meta
def test_read_npz():
    """Write a frame as four npz partitions and read them back as one."""
    original = sp.SparseFrame(np.identity(100))
    # Row ranges for the files named '1' .. '4'.
    bounds = [(None, 25), (25, 50), (50, 75), (75, None)]

    with tmpdir() as tmp:
        for part_no, (lo, hi) in enumerate(bounds, start=1):
            original.iloc[lo:hi].to_npz(os.path.join(tmp, str(part_no)))

        dsf = dsp.read_npz(os.path.join(tmp, '*.npz'))
        loaded = dsf.compute()
        assert np.all(loaded.data.toarray() == np.identity(100))
def read_npz(path, read_divisions=False, storage_options=None):
    """
    Read SparseFrame from npz archives

    If a ``metadata.npz`` archive exists next to the files (looked up at
    the part of ``path`` before the first '*'), divisions and partition
    paths are taken from it. Otherwise ``path`` is globbed.

    Parameters
    ----------
    path: str
        path to load files from can contain '*' to
        reference multiple files
    read_divisions: bool
        if the files are sorted read the index for each file
        to obtain divisions. If files are not sorted this will
        raise an error.
    storage_options: optional
        forwarded unchanged to ``_open_npz_archive`` and to
        ``sp.SparseFrame.read_npz`` for every partition

    Returns
    -------
        dsf: dask.SparseFrame
    """
    dsk = {}
    name = 'read_npz-{}'.format(tokenize(path))
    loader = None
    divisions = None
    try:
        loader = _open_npz_archive(
            path.split('*')[0] + 'metadata.npz', storage_options)
        divisions = loader['divisions']
        _paths = loader['partitions']
    except FileNotFoundError:
        # No metadata archive: fall back to globbing the partition files.
        _paths = _sorted(list(glob(path)))
    finally:
        # Compare against None so an opened archive is always closed,
        # regardless of how the archive object evaluates in a boolean test.
        if loader is not None:
            loader.close()

    # Only index and columns of the first partition are needed for meta;
    # close the archive right after reading them so the handle is not leaked.
    archive = _open_npz_archive(_paths[0], storage_options)
    try:
        meta_idx, meta_cols = archive['frame_index'], archive['frame_columns']
    finally:
        archive.close()

    meta = sp.SparseFrame(np.empty(shape=(0, len(meta_cols))),
                          index=meta_idx[:0],
                          columns=meta_cols)

    for i, p in enumerate(_paths):
        dsk[name, i] = (sp.SparseFrame.read_npz, p, storage_options)

    if divisions is None and read_divisions:
        level = 0 if isinstance(meta_idx, pd.MultiIndex) else None
        divisions = _npz_read_divisions(_paths, level=level)
    elif divisions is None:
        # Unknown divisions: one more boundary than there are partitions.
        divisions = [None] * (len(_paths) + 1)

    return SparseFrame(dsk, name, meta, divisions=divisions)
def __init__(self, dsk, name, meta, divisions=None):
    """Store graph, name, meta and divisions on the instance.

    Parameters
    ----------
    dsk:
        task graph, stored as ``self.dask``
    name:
        key name, stored as ``self._name``
    meta:
        a ``SparseFrame`` (its ``_meta`` is used), an ``sp.SparseFrame``,
        or anything ``sp.SparseFrame`` can be constructed from
    divisions:
        converted with ``tuple()``, so an iterable must be passed even
        though the default is ``None``
    """
    # TODO: remove this case once we subclass from dask._Frame
    unwrapped = meta._meta if isinstance(meta, SparseFrame) else meta
    if not isinstance(unwrapped, sp.SparseFrame):
        unwrapped = sp.SparseFrame(unwrapped)

    self.dask = dsk
    self._name = name
    self._meta = make_meta(unwrapped)
    self.divisions = tuple(divisions)
    self.ndim = 2
    self.loc = _LocIndexer(self)
def test_distributed_join_shortcut(how):
    """Join a dask SparseFrame with an in-memory one and compare to pandas."""
    left = pd.DataFrame(np.identity(10),
                        index=np.arange(10),
                        columns=list('ABCDEFGHIJ'))
    right = pd.DataFrame(np.identity(10),
                         index=np.arange(5, 15),
                         columns=list('KLMNOPQRST'))
    expected = left.join(right, how=how).fillna(0)

    # The left side is partitioned, the right side stays a plain SparseFrame.
    lazy_left = dsp.from_pandas(left, npartitions=2)
    local_right = sp.SparseFrame(right)
    actual = lazy_left.join(local_right, how=how).compute().todense()

    pdt.assert_frame_equal(expected, actual)
def _construct_item_features(self, item_features, item_ids): """Create item features during predict.""" # align feature names if self.indicator_setting in ['both', 'items']: item_indicator = sp.SparseFrame(self._item_indicator, index=self.iid_map.index) item_indicator = item_indicator.reindex(item_ids).data else: item_indicator = None if self.item_feature_names is None: return item_indicator item_feat_csr = item_features\ .loc[:, self.item_feature_names]\ .reindex(item_ids, axis=0)\ .data if item_indicator is not None: item_feat_csr = sparse.hstack([item_feat_csr, item_indicator]) return item_feat_csr
def read_npz(path, sorted=False):
    """
    Read SparseFrame from npz archives

    Parameters
    ----------
    path: str
        path to load files from can contain '*' to
        reference multiple files
    sorted: bool
        if the files are sorted read the index for each file
        to obtain divisions

    Returns
    -------
        dsf: dask.SparseFrame
    """
    dsk = {}
    name = 'read_npz-{}'.format(tokenize(path))
    # The ``sorted`` parameter shadows the builtin, hence the ``_sorted`` alias.
    _paths = _sorted(list(glob(path)))

    # Only index and columns of the first file are needed to build meta;
    # the context manager closes the archive so its handle is not leaked.
    with np.load(_paths[0]) as archive:
        meta_idx, meta_cols = archive['frame_index'], archive['frame_columns']

    meta = sp.SparseFrame(np.empty(shape=(0, len(meta_cols))),
                          index=meta_idx[:0],
                          columns=meta_cols)

    for i, p in enumerate(_paths):
        dsk[name, i] = (sp.SparseFrame.read_npz, p)

    if sorted:
        level = 0 if isinstance(meta_idx, pd.MultiIndex) else None
        divisions = _npz_read_divisions(_paths, level=level)
    else:
        # Unknown divisions: one more boundary than there are partitions.
        divisions = [None] * (len(_paths) + 1)

    return SparseFrame(dsk, name, meta, divisions=divisions)
def foo(sf, x, y):
    """Return a new SparseFrame holding ``sf.data * x * y``.

    Index and columns are taken over from ``sf``.
    """
    scaled = sf.data * x * y
    return sp.SparseFrame(scaled, index=sf.index, columns=sf.columns)
def meta_nonempty_sparsity(x):
    """Build a SparseFrame with ``x``'s columns and a non-empty index.

    The index comes from ``_nonempty_index(x.index)``; the data is a CSR
    matrix of matching shape with no stored entries.
    """
    idx = _nonempty_index(x.index)
    shape = (len(idx), len(x.columns))
    blank = sparse.csr_matrix(shape)
    return sp.SparseFrame(blank, index=idx, columns=x.columns)
def one_hot_encode(ddf, column=None, categories=None, index_col=None,
                   order=None, prefixes=False, sep='_',
                   ignore_cat_order_mismatch=False):
    """
    Sparse one hot encoding of dask.DataFrame.

    Each partition of ``ddf`` is run through ``sparse_one_hot`` with the
    arguments given here, turning the dask.DataFrame into a series of
    SparseFrames with the specified columns one-hot encoded.

    Parameters
    ----------
    ddf: dask.DataFrame
        e.g. the clickstream
    categories: dict
        Maps ``column name`` to a specification of how to treat that column:

        - an iterable of possible category values;
        - ``None`` if the column is already of categorical dtype;
        - ``False`` if the column should not be one-hot encoded; it is
          then included in the result untouched.

        This argument decides which column(s) are processed. See also
        `order` and `ignore_cat_order_mismatch`. By default, all
        categorical columns are one-hot encoded and all other columns are
        included untouched.
    index_col: str | iterable
        which columns to use as index
    order: iterable
        Order in which the one-hot encoded columns are aligned. Must have
        the same elements as the keys of ``categories``. With
        `order = [col_name1, col_name2]` and
        `categories = {col_name1: ['A', 'B'], col_name2: ['C', 'D']}` the
        resulting SparseFrame has columns `['A', 'B', 'C', 'D']`.
        Without `order`, the column order follows iteration over the
        `categories` dictionary, so passing an OrderedDict as
        `categories` is an alternative to providing `order`.
    prefixes: bool
        If False, new columns are named after the categories alone:
        [cat11, cat12, cat21, cat22, ...].
        If True, the original column name followed by a separator is put
        in front of each category name:
        [col1_cat11, col1_cat12, col2_cat21, col2_cat22, ...].
        See ``sep``.
    sep: str
        Separator used when ``prefixes`` is True.
    column: DEPRECATED
        Kept only for backward compatibility.
    ignore_cat_order_mismatch: bool
        A column of categorical dtype already carries its categories, so
        they need not be passed in `categories`. If they are passed and
        differ from ``column.cat.categories``, a ValueError is raised.
        When only the order differs (the sets of elements are the same),
        ignore_cat_order_mismatch=True suppresses the error and the
        column's predefined categories are used.

    Returns
    -------
    sparse_one_hot: sparsity.dask.SparseFrame
    """
    # Both the meta computation and the per-partition call take exactly the
    # same encoding arguments, so collect them once.
    encode_kwargs = dict(
        column=column,
        categories=categories,
        index_col=index_col,
        order=order,
        prefixes=prefixes,
        sep=sep,
        ignore_cat_order_mismatch=ignore_cat_order_mismatch,
    )

    if index_col:
        # Empty index with the structure obtained from re-indexing the meta.
        reindexed = ddf._meta.reset_index().set_index(index_col)
        idx_meta = reindexed.index[:0]
    else:
        idx_meta = ddf._meta.index

    # Encode the empty meta frame to learn the resulting column labels.
    columns = sparse_one_hot(ddf._meta, **encode_kwargs).columns
    meta = sp.SparseFrame(np.empty(shape=(0, len(columns))),
                          columns=columns,
                          index=idx_meta)

    dsf = ddf.map_partitions(sparse_one_hot, meta=object, **encode_kwargs)
    return SparseFrame(dsf.dask, dsf._name, meta, dsf.divisions)