def test_partitioning_index_categorical_on_values():
    """Reordering the categories of a categorical column must not change
    the hash-based partition assignment (hashing uses values, not codes).
    """
    frame = pd.DataFrame({"a": list(string.ascii_letters), "b": [1, 2, 3, 4] * 13})
    frame.a = frame.a.astype("category")
    reordered = frame.copy()
    reordered.a = reordered.a.cat.set_categories(
        list(reversed(reordered.a.cat.categories))
    )
    # Series input: both orderings hash identically.
    assert (partitioning_index(frame.a, 5) == partitioning_index(reordered.a, 5)).all()
    # DataFrame input: same invariant holds.
    assert (partitioning_index(frame, 5) == partitioning_index(reordered, 5)).all()
def test_partitioning_index_categorical_on_values():
    """Hash partitioning of a categorical must depend only on the values,
    so reversing the category order must give identical partition indices.
    """
    df = pd.DataFrame({'a': list(string.ascii_letters), 'b': [1, 2, 3, 4] * 13})
    df.a = df.a.astype('category')

    df2 = df.copy()
    reversed_cats = list(reversed(df2.a.cat.categories))
    df2.a = df2.a.cat.set_categories(reversed_cats)

    # Check both the single-column (Series) and whole-frame code paths.
    for left, right in ((df.a, df2.a), (df, df2)):
        res = partitioning_index(left, 5)
        res2 = partitioning_index(right, 5)
        assert (res == res2).all()
def test_partitioning_index():
    """partitioning_index maps rows to integers in ``[0, npartitions)``.

    The previous version asserted exact hash values (e.g. ``[1, 2, 0] * 3``),
    which are an implementation detail of the underlying hashing and break
    whenever the hash implementation changes.  Assert the contract instead:
    results are in range, non-constant, and deterministic.
    """
    res = partitioning_index(df2.i32, 3)
    assert ((res >= 0) & (res < 3)).all()
    assert len(np.unique(res)) > 1
    # Deterministic: hashing the same input twice gives the same assignment.
    np.testing.assert_equal(partitioning_index(df2.i32, 3), res)
    # A single-column DataFrame hashes the same as the underlying Series.
    np.testing.assert_equal(partitioning_index(df2[["i32"]], 3), res)

    res = partitioning_index(df2[["cat", "bool", "f32"]], 2)
    assert ((0 <= res) & (res < 2)).all()

    res = partitioning_index(df2.index, 4)
    assert ((res >= 0) & (res < 4)).all()
    assert len(np.unique(res)) > 1
def check_partitions(df, npartitions):
    """Return True iff every row in ``df`` hashes to the same partition."""
    hashes = partitioning_index(df, npartitions)
    if len(hashes) == 0:
        # An empty frame trivially satisfies the condition.
        return True
    return len(hashes.unique()) == 1
def partition_by_hash(df, columns, n_chunks, ignore_index=False):
    """Split a dataframe into hash-based partitions.

    The partition of each row is determined by the hash of its values
    in ``columns``.

    Parameters
    ----------
    df : DataFrame
    columns : label or list
        Column names on which to split the dataframe.
    n_chunks : int
        Number of partitions.
    ignore_index : bool, default False
        Set True to ignore the index of ``df``.

    Returns
    -------
    out : Dict[int, DataFrame]
        A dictionary mapping integers in ``{0..n_chunks}`` to dataframes.
        When ``df`` is None, a list of ``n_chunks`` Nones is returned
        instead.
    """
    if df is None:
        return [None] * n_chunks

    # Hash `columns` in `df` and assign the result to a temporary
    # "_partitions" column used as the shuffle key.
    df["_partitions"] = partitioning_index(df[columns], n_chunks)

    # Split `df` based on the hash values in the "_partitions" column.
    ret = shuffle_group(df, "_partitions", 0, n_chunks, n_chunks, ignore_index)

    # Remove the temporary column from the input frame and from every
    # partition (the partitions appear to be separate frames, so both
    # need the cleanup).  Use a distinct loop variable to avoid
    # shadowing the `df` parameter.
    del df["_partitions"]
    for part in ret.values():
        del part["_partitions"]
    return ret
def test_partitioning_index():
    """Partition indices fall in ``[0, k)``, are non-constant for varied
    input, and are deterministic across repeated calls.
    """
    part = partitioning_index(df2.i32, 3)
    assert ((part >= 0) & (part < 3)).all()
    assert len(np.unique(part)) > 1
    # Same input twice must yield an identical assignment.
    assert (partitioning_index(df2.i32, 3) == partitioning_index(df2.i32, 3)).all()

    part = partitioning_index(df2[['i32']], 3)
    assert ((part >= 0) & (part < 3)).all()
    assert len(np.unique(part)) > 1

    part = partitioning_index(df2[['cat', 'bool', 'f32']], 2)
    assert ((part >= 0) & (part < 2)).all()

    part = partitioning_index(df2.index, 4)
    assert ((part >= 0) & (part < 4)).all()
    assert len(np.unique(part)) > 1
def test_partitioning_index():
    """Check the partitioning_index contract on several input kinds."""

    def in_range_and_varied(idx, k):
        # All assignments lie in [0, k) and more than one bucket is used.
        assert ((idx < k) & (idx >= 0)).all()
        assert len(np.unique(idx)) > 1

    res = partitioning_index(df2.i32, 3)
    in_range_and_varied(res, 3)
    # Deterministic: identical calls agree element-wise.
    assert (partitioning_index(df2.i32, 3) == partitioning_index(df2.i32, 3)).all()

    in_range_and_varied(partitioning_index(df2[["i32"]], 3), 3)

    res = partitioning_index(df2[["cat", "bool", "f32"]], 2)
    assert ((0 <= res) & (res < 2)).all()

    in_range_and_varied(partitioning_index(df2.index, 4), 4)