def as_unknown(self):
    """Ensure the categories in this series are unknown.

    Returns the wrapped series unchanged when its categories are already
    unknown; otherwise returns a shallow copy whose ``_meta`` has had its
    known categories cleared.
    """
    if self.known:
        unknown = self._series.copy()
        unknown._meta = clear_known_categories(unknown._meta)
        return unknown
    return self._series
def test_concat_categorical(known, cat_index, divisions):
    # Concatenating dask frames must preserve categorical "known-ness":
    # known categoricals stay known, unknown ones stay unknown, whether the
    # categorical lives in a column or in the index.
    frames = [pd.DataFrame({'w': list('xxxxx'), 'x': np.arange(5),
                            'y': list('abcbc'), 'z': np.arange(5, dtype='f8')}),
              pd.DataFrame({'w': list('yyyyy'), 'x': np.arange(5, 10),
                            'y': list('abbba'), 'z': np.arange(5, 10, dtype='f8')}),
              pd.DataFrame({'w': list('zzzzz'), 'x': np.arange(10, 15),
                            'y': list('bcbcc'), 'z': np.arange(10, 15, dtype='f8')})]
    for df in frames:
        df.w = df.w.astype('category')
        df.y = df.y.astype('category')
    if cat_index:
        # Exercise the categorical-index code path as well.
        frames = [df.set_index(df.y) for df in frames]
    dframes = [dd.from_pandas(p, npartitions=2, sort=divisions)
               for p in frames]
    if not known:
        # Strip the known categories from the first partition's meta so the
        # whole collection is treated as having unknown categories.
        dframes[0]._meta = clear_known_categories(dframes[0]._meta, ['y'],
                                                  index=True)

    def check_and_return(ddfs, dfs, join):
        # Concatenate both the dask and pandas inputs and compare.
        sol = concat(dfs, join=join)
        res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
        assert_eq(res, sol)
        if known:
            parts = compute_as_if_collection(dd.DataFrame, res.dask,
                                             res.__dask_keys__())
            for p in [i.iloc[:0] for i in parts]:
                res._meta == p  # will error if schemas don't align
        assert not cat_index or has_known_categories(res.index) == known
        return res

    for join in ['inner', 'outer']:
        # Frame
        res = check_and_return(dframes, frames, join)
        assert has_known_categories(res.w)
        assert has_known_categories(res.y) == known

        # Series
        res = check_and_return([i.y for i in dframes],
                               [i.y for i in frames], join)
        assert has_known_categories(res) == known

        # Non-cat series with cat index
        if cat_index:
            res = check_and_return([i.x for i in dframes],
                                   [i.x for i in frames], join)

        # Partition missing columns
        res = check_and_return([dframes[0][['x', 'y']]] + dframes[1:],
                               [frames[0][['x', 'y']]] + frames[1:], join)
        assert not hasattr(res, 'w') or has_known_categories(res.w)
        assert has_known_categories(res.y) == known
def test_categorize():
    # Full coverage of DataFrame.categorize: default columns, explicit
    # columns, split_every, index handling, no-op cases, and bad arguments.
    # rename y to y_ to avoid pandas future warning about ambiguous
    # levels
    meta = clear_known_categories(frames4[0]).rename(columns={'y': 'y_'})
    ddf = dd.DataFrame({('unknown', i): df for (i, df) in enumerate(frames3)},
                       'unknown', meta, [None] * 4).rename(columns={'y': 'y_'})
    # Make 'w' a known categorical while 'y_' and the index stay unknown.
    ddf = ddf.assign(w=ddf.w.cat.set_categories(['x', 'y', 'z']))
    assert ddf.w.cat.known
    assert not ddf.y_.cat.known
    assert not ddf.index.cat.known

    df = ddf.compute()

    for index in [None, True, False]:
        # index=None (default) and index=True both categorize the index;
        # only index=False leaves it unknown.
        known_index = index is not False
        # By default categorize object and unknown cat columns
        ddf2 = ddf.categorize(index=index)
        assert ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({'v': 'category'}), check_categorical=False)

        # Specifying split_every works
        ddf2 = ddf.categorize(index=index, split_every=2)
        assert ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({'v': 'category'}), check_categorical=False)

        # Specifying one column doesn't affect others
        ddf2 = ddf.categorize('v', index=index)
        assert not ddf2.y_.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({'v': 'category'}), check_categorical=False)

        ddf2 = ddf.categorize('y_', index=index)
        assert ddf2.y_.cat.known
        assert ddf2.v.dtype == 'object'
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df)

    # Categorizing no columns still allows making the index known.
    ddf_known_index = ddf.categorize(columns=[], index=True)
    assert ddf_known_index.index.cat.known
    assert_eq(ddf_known_index, df)

    # Specifying known categorical or no columns is a no-op:
    assert ddf.categorize(['w'], index=False) is ddf
    assert ddf.categorize([], index=False) is ddf
    assert ddf_known_index.categorize(['w']) is ddf_known_index
    assert ddf_known_index.categorize([]) is ddf_known_index

    # Bad split_every fails
    with pytest.raises(ValueError):
        ddf.categorize(split_every=1)

    with pytest.raises(ValueError):
        ddf.categorize(split_every='foo')
def test_categorize_basic():
    """Basic coverage of ``DataFrame.categorize``.

    NOTE(review): this function was previously also named ``test_categorize``,
    duplicating the extended test of the same name defined earlier in the
    file.  The later definition shadowed the earlier one, so pytest silently
    collected only this basic variant and the ``split_every`` coverage never
    ran.  Renamed so both tests are collected.
    """
    meta = clear_known_categories(frames4[0])
    ddf = dd.DataFrame({('unknown', i): df for (i, df) in enumerate(frames3)},
                       'unknown', meta, [None] * 4)
    # Make 'w' a known categorical while 'y' and the index stay unknown.
    ddf = ddf.assign(w=ddf.w.cat.set_categories(['x', 'y', 'z']))
    assert ddf.w.cat.known
    assert not ddf.y.cat.known
    assert not ddf.index.cat.known

    df = ddf.compute()

    for index in [None, True, False]:
        # index=None (default) and index=True both categorize the index;
        # only index=False leaves it unknown.
        known_index = index is not False
        # By default categorize object and unknown cat columns
        ddf2 = ddf.categorize(index=index)
        assert ddf2.y.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({'v': 'category'}), check_categorical=False)

        # Specifying one column doesn't affect others
        ddf2 = ddf.categorize('v', index=index)
        assert not ddf2.y.cat.known
        assert ddf2.v.cat.known
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df.astype({'v': 'category'}), check_categorical=False)

        ddf2 = ddf.categorize('y', index=index)
        assert ddf2.y.cat.known
        assert ddf2.v.dtype == 'object'
        assert ddf2.index.cat.known == known_index
        assert_eq(ddf2, df)

    # Categorizing no columns still allows making the index known.
    ddf_known_index = ddf.categorize(columns=[], index=True)
    assert ddf_known_index.index.cat.known
    assert_eq(ddf_known_index, df)

    # Specifying known categorical or no columns is a no-op:
    assert ddf.categorize(['w'], index=False) is ddf
    assert ddf.categorize([], index=False) is ddf
    assert ddf_known_index.categorize(['w']) is ddf_known_index
    assert ddf_known_index.categorize([]) is ddf_known_index
def test_append_categorical():
    """Appending dask frames/series/indexes preserves categorical known-ness."""
    raw = [pd.DataFrame({'x': np.arange(5, 10),
                         'y': list('abbba'),
                         'z': np.arange(5, 10, dtype='f8')}),
           pd.DataFrame({'x': np.arange(10, 15),
                         'y': list('bcbcc'),
                         'z': np.arange(10, 15, dtype='f8')})]
    expected = []
    for frame in raw:
        frame.y = frame.y.astype('category')
        # Expected results use the full 'abc' category set and a categorical
        # index; copy before mutating `frame` in place below.
        widened = frame.copy()
        widened.y = widened.y.cat.set_categories(list('abc'))
        frame.index = frame.y
        expected.append(widened.set_index(widened.y))
    exp1, exp2 = expected

    for known in [True, False]:
        parts = [dd.from_pandas(p, npartitions=2, sort=False) for p in raw]
        if not known:
            # Clear known categories on the first partition's meta so the
            # collection is treated as having unknown categories.
            parts[0]._meta = clear_known_categories(parts[0]._meta, ['y'],
                                                    index=True)
        left, right = parts

        # DataFrame append
        res = left.append(right)
        assert_eq(res, exp1.append(exp2))
        assert has_known_categories(res.index) == known
        assert has_known_categories(res.y) == known

        # Series append
        res = left.y.append(right.y)
        assert_eq(res, exp1.y.append(exp2.y))
        assert has_known_categories(res.index) == known
        assert has_known_categories(res) == known

        # Index append
        res = left.index.append(right.index)
        assert_eq(res, exp1.index.append(exp2.index))
        assert has_known_categories(res) == known
def text_blocks_to_pandas(
    reader,
    block_lists,
    header,
    head,
    kwargs,
    enforce=False,
    specified_dtypes=None,
    path=None,
    blocksize=None,
    urlpath=None,
):
    """Convert blocks of bytes to a dask.dataframe

    This accepts a list of lists of values of bytes where each list
    corresponds to one file, and the value of bytes concatenate to
    comprise the entire file, in order.

    Parameters
    ----------
    reader : callable
        ``pd.read_csv`` or ``pd.read_table``.
    block_lists : list of lists of delayed values of bytes
        The lists of bytestrings where each list corresponds to one
        logical file
    header : bytestring
        The header, found at the front of the first file, to be prepended
        to all blocks
    head : pd.DataFrame
        An example Pandas DataFrame to be used for metadata.
    kwargs : dict
        Keyword arguments to pass down to ``reader``
    enforce : bool, optional
        Forwarded to ``CSVFunctionWrapper``; also folded into the output
        collection's token.
    specified_dtypes : Mapping, optional
        User-provided dtypes; entries that are ``CategoricalDtype`` with
        explicit categories are treated as *known* categoricals below.
    path : tuple, optional
        A tuple containing column name for path and the path_converter if
        provided
    blocksize : optional
        Used only when tokenizing the output collection's name.
    urlpath : optional
        Used only when tokenizing the output collection's name.

    Returns
    -------
    A dask.dataframe
    """
    dtypes = head.dtypes.to_dict()

    # dtypes contains only instances of CategoricalDtype, which causes issues
    # in coerce_dtypes for non-uniform categories across partitions.
    # We will modify `dtype` (which is inferred) to
    # 1. contain instances of CategoricalDtypes for user-provided types
    # 2. contain 'category' for data inferred types
    categoricals = head.select_dtypes(include=["category"]).columns

    if isinstance(specified_dtypes, Mapping):
        # A categorical is "known" only when the user supplied an explicit
        # CategoricalDtype with non-None categories.
        known_categoricals = [
            k
            for k in categoricals
            if isinstance(specified_dtypes.get(k), CategoricalDtype)
            and specified_dtypes.get(k).categories is not None
        ]
        unknown_categoricals = categoricals.difference(known_categoricals)
    else:
        unknown_categoricals = categoricals

    # Fixup the dtypes: inferred categoricals are downgraded to the plain
    # 'category' string so partitions may carry differing category sets.
    for k in unknown_categoricals:
        dtypes[k] = "category"

    columns = list(head.columns)

    blocks = tuple(flatten(block_lists))
    # Create mask of first blocks from nested block_lists
    is_first = tuple(block_mask(block_lists))
    is_last = tuple(block_mask_last(block_lists))

    if path:
        # Add a path column to the meta, as a categorical of the
        # (optionally converted) file paths.
        colname, path_converter = path
        paths = [b[1].path for b in blocks]
        if path_converter:
            paths = [path_converter(p) for p in paths]
        head = head.assign(
            **{
                colname: pd.Categorical.from_codes(
                    np.zeros(len(head), dtype=int), set(paths)
                )
            })
        path = (colname, paths)

    if len(unknown_categoricals):
        # Keep the meta consistent with the 'category' downgrade above.
        head = clear_known_categories(head, cols=unknown_categoricals)

    # Define parts: one [block, path, is_first, is_last] record per block.
    parts = []
    colname, paths = path or (None, None)

    for i in range(len(blocks)):
        parts.append(
            [blocks[i], paths[i] if paths else None, is_first[i], is_last[i]])

    # Construct the output collection with from_map
    return from_map(
        CSVFunctionWrapper(
            columns,
            None,
            colname,
            head,
            header,
            reader,
            dtypes,
            enforce,
            kwargs,
        ),
        parts,
        meta=head,
        label="read-csv",
        token=tokenize(reader, urlpath, columns, enforce, head, blocksize),
        enforce_metadata=False,
        produces_tasks=True,
    )