def test_pivot_table(aggfunc):
    """Check that dask's ``pivot_table`` agrees with pandas for ``aggfunc``.

    Both spellings are exercised: the ``dd.pivot_table`` function and the
    ``pivot_table`` method on the dask frame.
    """
    frame = pd.DataFrame({
        'A': np.random.choice(list('XYZ'), size=100),
        'B': np.random.randn(100),
        'C': pd.Categorical(np.random.choice(list('abc'), size=100)),
    })
    dask_frame = dd.from_pandas(frame, 5)
    pivot_kwargs = dict(index='A', columns='C', values='B', aggfunc=aggfunc)

    def normalise(expected):
        # dask result cannot be int64 dtype depending on divisions because of NaN
        return expected.astype(np.float64) if aggfunc == 'count' else expected

    # function
    actual = dd.pivot_table(dask_frame, **pivot_kwargs)
    expected = normalise(pd.pivot_table(frame, **pivot_kwargs))
    assert_eq(actual, expected)

    # method
    actual = dask_frame.pivot_table(**pivot_kwargs)
    expected = normalise(frame.pivot_table(**pivot_kwargs))
    assert_eq(actual, expected)
def test_pivot_table(aggfunc):
    """Compare dask and pandas ``pivot_table`` results for ``aggfunc``.

    The comparison is run once through the module-level functions and once
    through the frame methods. When ``PANDAS_ge_0190`` is false the name and
    column-type checks are switched off.
    """
    pdf = pd.DataFrame({
        'A': np.random.choice(list('XYZ'), size=100),
        'B': np.random.randn(100),
        'C': pd.Categorical(np.random.choice(list('abc'), size=100)),
    })
    ddf = dd.from_pandas(pdf, 5)
    spec = {'index': 'A', 'columns': 'C', 'values': 'B', 'aggfunc': aggfunc}

    # because of a pandas 0.18.x bug, categorical dtype is not preserved
    if PANDAS_ge_0190:
        compare_opts = {}
    else:
        compare_opts = {'check_names': False, 'check_column_type': False}

    builders = (
        # function
        (lambda: dd.pivot_table(ddf, **spec), lambda: pd.pivot_table(pdf, **spec)),
        # method
        (lambda: ddf.pivot_table(**spec), lambda: pdf.pivot_table(**spec)),
    )
    for build_actual, build_expected in builders:
        actual = build_actual()
        expected = build_expected()
        if aggfunc == 'count':
            # dask result cannot be int64 dtype depending on divisions because of NaN
            expected = expected.astype(np.float64)
        assert_eq(actual, expected, **compare_opts)
def dask_stuff_backup():
    """Scratch pipeline: pivot per-file feature tables and left-join them onto reference positions.

    Reads every file matching ``*-E*`` in the current directory as a
    headerless tab-separated table, pivots ``data`` by source file name, and
    merges the result onto small reference frames. Nothing is returned; the
    last computed ``mat`` is discarded.

    NOTE(review): this function relies on ``ref_dfs`` and ``split_to_mat``,
    neither of which is defined inside it -- presumably module-level names;
    confirm they exist before calling.
    """
    feature_dir = Path('./')
    # Each input file contributes its rows plus a `path` column naming the file.
    df = dd.read_csv(os.path.join(feature_dir, '*-E*'), sep='\t', names=['variant', 'c1', 'c2', 'v1', 'v2', 'data'], include_path_column=True)
    # Reduce the full path to the bare file name so it can serve as a column label.
    df['path'] = df['path'].map(lambda x: Path(x).name)
    df = df.drop(['c1', 'c2', 'v1', 'v2'], axis=1)
    # NOTE(review): `path` has just been re-mapped; confirm it is still a
    # categorical with known categories, which dd.pivot_table appears to require.
    df = dd.pivot_table(df, index='variant', values='data', columns='path')
    # NOTE(review): `variant` was passed as `index=` above; confirm it is still
    # addressable as a column of the pivoted frame here.
    df['chr-pos'] = df['variant'].str.partition(';')[0]
    # df['chr'] = df['chrpos'].str.partition('-')[0]
    # df['pos'] = df['chrpos'].str.partition('-')[2]
    # df[['chr', 'pos']] = df[['chr', 'pos']].astype(int)
    # df = df.drop(['chrpos'], axis=1)
    # Hard-coded reference positions, keyed the same way as df via "chr-pos".
    ref = pd.DataFrame({'chr': [1, 1, 1], 'pos': [94986,706645,723891]})
    ref['chr-pos'] = ref['chr'].map(str) + '-' + ref['pos'].map(str)
    # This first merge result is overwritten inside the loop below before use.
    df_split = dd.merge(ref, df, on='chr-pos', how='left')
    features = ['DNase-E001','DNase-E120','H3K27ac-E100','DNase-E123']
    # dd.pivot_table(df, index='variant', values='data', columns='path')
    # The loop variable shadows the `ref` frame built above; `i` is unused.
    for i, ref in enumerate(ref_dfs):
        ref = ref[['chr', 'pos']]
        # NOTE(review): merges on 'chr'/'pos', but the lines that would add
        # those columns to df are commented out above -- confirm df has them.
        df_split = dd.merge(ref, df, on=['chr', 'pos'], how='left')
        # `mat` is reassigned every iteration and never read afterwards.
        mat = split_to_mat(df_split, features)
    return
def test_pivot_table_dtype():
    """Check that a ``count`` pivot reports float64 columns and matches pandas.

    The dtypes of the dask result are checked first, then the whole result is
    compared with the pandas pivot cast to float64. When ``PANDAS_ge_0190``
    is false the name and column-type checks are switched off.
    """
    pdf = pd.DataFrame({
        'A': list('AABB'),
        'B': pd.Categorical(list('ABAB')),
        'C': [1, 2, 3, 4],
    })
    ddf = dd.from_pandas(pdf, 2)
    spec = dict(index='A', columns='B', values='C', aggfunc='count')

    result = dd.pivot_table(ddf, **spec)

    # One float64 entry per category of 'B'.
    expected_dtypes = pd.Series(
        [np.float64] * 2,
        index=pd.CategoricalIndex(['A', 'B'], name='B'),
    )
    tm.assert_series_equal(result.dtypes, expected_dtypes)

    expected = pd.pivot_table(pdf, **spec).astype(np.float64)

    # because of a pandas 0.18.x bug, categorical dtype is not preserved
    if PANDAS_ge_0190:
        compare_opts = {}
    else:
        compare_opts = dict(check_names=False, check_column_type=False)
    assert_eq(result, expected, **compare_opts)
def test_pivot_table(values, aggfunc):
    """Compare dask and pandas ``pivot_table`` for the given ``values``/``aggfunc``.

    The dask frame is repartitioned onto explicit divisions before pivoting,
    and the comparison is made through both the function and the method.
    """
    pdf = pd.DataFrame(
        {
            "A": np.random.choice(list("XYZ"), size=100),
            "B": np.random.randn(100),
            "C": pd.Categorical(np.random.choice(list("abc"), size=100)),
            "D": np.random.randn(100),
        }
    )
    divisions = (0, 20, 40, 60, 80, 98, 99)
    ddf = dd.from_pandas(pdf, 5).repartition(divisions)
    spec = {"index": "A", "columns": "C", "values": values, "aggfunc": aggfunc}
    counting = aggfunc == "count"

    # function
    actual = dd.pivot_table(ddf, **spec)
    expected = pd.pivot_table(pdf, **spec)
    if counting:
        # dask result cannot be int64 dtype depending on divisions because of NaN
        expected = expected.astype(np.float64)
    assert_eq(actual, expected)

    # method
    actual = ddf.pivot_table(**spec)
    expected = pdf.pivot_table(**spec)
    if counting:
        # dask result cannot be int64 dtype depending on divisions because of NaN
        expected = expected.astype(np.float64)
    assert_eq(actual, expected)
def test_pivot_table_dtype():
    """Check that a ``count`` pivot has float64 columns and equals the pandas result."""
    data = {
        "A": list("AABB"),
        "B": pd.Categorical(list("ABAB")),
        "C": [1, 2, 3, 4],
    }
    pdf = pd.DataFrame(data)
    ddf = dd.from_pandas(pdf, 2)
    spec = {"index": "A", "columns": "B", "values": "C", "aggfunc": "count"}

    result = dd.pivot_table(ddf, **spec)

    # Expected dtypes: float64 for each of the two categories of "B".
    category_index = pd.CategoricalIndex(["A", "B"], name="B")
    expected_dtypes = pd.Series([np.float64] * 2, index=category_index)
    tm.assert_series_equal(result.dtypes, expected_dtypes)

    # pandas counts are cast to float64 before the full comparison.
    expected = pd.pivot_table(pdf, **spec).astype(np.float64)
    assert_eq(result, expected)
def test_pivot_table(aggfunc):
    """Assert that ``dd.pivot_table`` matches ``pd.pivot_table`` for ``aggfunc``.

    When ``PANDAS_ge_0190`` is false the name and column-type checks of
    ``eq`` are switched off.
    """
    columns = {
        'A': np.random.choice(list('XYZ'), size=100),
        'B': np.random.randn(100),
        'C': pd.Categorical(np.random.choice(list('abc'), size=100)),
    }
    pandas_frame = pd.DataFrame(columns)
    dask_frame = dd.from_pandas(pandas_frame, 5)
    pivot_args = dict(index='A', columns='C', values='B', aggfunc=aggfunc)

    actual = dd.pivot_table(dask_frame, **pivot_args)
    expected = pd.pivot_table(pandas_frame, **pivot_args)
    if aggfunc == 'count':
        # dask result cannot be int64 dtype depending on divisions because of NaN
        expected = expected.astype(np.float64)

    # because of a pandas 0.18.x bug, categorical dtype is not preserved
    loosened = {} if PANDAS_ge_0190 else dict(check_names=False, check_column_type=False)
    assert eq(actual, expected, **loosened)
def test_pivot_table_dtype():
    """Verify the column dtypes of a ``count`` pivot, then the pivot itself."""
    source = pd.DataFrame({
        'A': list('AABB'),
        'B': pd.Categorical(list('ABAB')),
        'C': [1, 2, 3, 4],
    })
    lazy = dd.from_pandas(source, 2)
    pivot_args = dict(index='A', columns='B', values='C', aggfunc='count')

    pivoted = dd.pivot_table(lazy, **pivot_args)

    # Every pivoted column is expected to be reported as float64.
    want_dtypes = pd.Series(
        [np.float64] * 2,
        index=pd.CategoricalIndex(['A', 'B'], name='B'),
    )
    tm.assert_series_equal(pivoted.dtypes, want_dtypes)

    want = pd.pivot_table(source, **pivot_args).astype(np.float64)
    assert_eq(pivoted, want)
def test_pivot_table(aggfunc):
    """Check dask's ``pivot_table`` against pandas, as a function and as a method."""
    source = pd.DataFrame({
        'A': np.random.choice(list('XYZ'), size=100),
        'B': np.random.randn(100),
        'C': pd.Categorical(np.random.choice(list('abc'), size=100)),
    })
    lazy = dd.from_pandas(source, 5)
    pivot_args = {'index': 'A', 'columns': 'C', 'values': 'B', 'aggfunc': aggfunc}
    is_count = aggfunc == 'count'

    # function
    got = dd.pivot_table(lazy, **pivot_args)
    want = pd.pivot_table(source, **pivot_args)
    if is_count:
        # dask result cannot be int64 dtype depending on divisions because of NaN
        want = want.astype(np.float64)
    assert_eq(got, want)

    # method
    got = lazy.pivot_table(**pivot_args)
    want = source.pivot_table(**pivot_args)
    if is_count:
        # dask result cannot be int64 dtype depending on divisions because of NaN
        want = want.astype(np.float64)
    assert_eq(got, want)
def dask_features_to_csv(feature_dir=TMP_DIR):
    """Pivot the per-file feature tables under ``feature_dir`` into one wide dask frame.

    Every file matching ``*-E*`` in ``feature_dir`` is read as a headerless,
    tab-separated table with the columns ``variant, c1, c2, v1, v2, data``.
    Only ``variant`` and ``data`` are kept; the source file name becomes the
    pivot column, so the result has one row per variant and one column per
    input file.

    The dask implementation can wait; there will be a different
    implementation for neighbors. Despite the name, nothing is written to
    CSV here.

    Args:
        feature_dir: Directory containing the feature files. Defaults to
            ``TMP_DIR``.

    Returns:
        The lazy result of ``dd.pivot_table`` (dask's default aggregation of
        ``data``), indexed by ``variant``. The original computed this and
        discarded it.
    """
    df = dd.read_csv(
        os.path.join(feature_dir, '*-E*'),
        sep='\t',
        names=['variant', 'c1', 'c2', 'v1', 'v2', 'data'],
        include_path_column=True,
    )
    # Keep only the file name so it can be used as the pivoted column label.
    df['path'] = df['path'].map(lambda x: Path(x).name)
    df = df.drop(['c1', 'c2', 'v1', 'v2'], axis=1)
    # dd.pivot_table rejects a `columns` field that is not categorical with
    # known categories; the map above does not guarantee that, so make it
    # explicit. This scans the data once to discover the categories.
    df = df.categorize(columns=['path'])
    return dd.pivot_table(df, index='variant', values='data', columns='path')
def test_pivot_table_index_dtype():
    """Check that pivoting on a datetime column yields a ``datetime64[ns]`` index."""
    days = pd.date_range(start="2019-08-01", periods=3, freq="1D")
    letters = pd.Categorical(list("abc"))
    pdf = pd.DataFrame({"A": days, "B": letters, "C": [1, 2, 3]})
    ddf = dd.from_pandas(pdf, 2)

    pivoted = dd.pivot_table(
        ddf, index="A", columns="B", values="C", aggfunc="count"
    )

    index_dtype = pivoted.index.dtype
    assert index_dtype == np.dtype("datetime64[ns]")
def test_pivot_table_dtype():
    """Check dtypes and values of a ``count`` pivot against pandas."""
    labels = list('AABB')
    categories = pd.Categorical(list('ABAB'))
    pdf = pd.DataFrame({'A': labels, 'B': categories, 'C': [1, 2, 3, 4]})
    ddf = dd.from_pandas(pdf, 2)
    opts = {'index': 'A', 'columns': 'B', 'values': 'C', 'aggfunc': 'count'}

    res = dd.pivot_table(ddf, **opts)

    # dtype check: both pivoted columns should be float64
    dtype_index = pd.CategoricalIndex(['A', 'B'], name='B')
    tm.assert_series_equal(res.dtypes,
                           pd.Series([np.float64] * 2, index=dtype_index))

    # value check against pandas, cast to float64 to match
    assert_eq(res, pd.pivot_table(pdf, **opts).astype(np.float64))
def test_pivot_table_dtype():
    """Check dtypes and values of a ``count`` pivot against pandas.

    When ``PANDAS_ge_0190`` is false the final comparison skips the name and
    column-type checks.
    """
    labels = list('AABB')
    categories = pd.Categorical(list('ABAB'))
    pdf = pd.DataFrame({'A': labels, 'B': categories, 'C': [1, 2, 3, 4]})
    ddf = dd.from_pandas(pdf, 2)
    opts = {'index': 'A', 'columns': 'B', 'values': 'C', 'aggfunc': 'count'}

    res = dd.pivot_table(ddf, **opts)

    # dtype check: both pivoted columns should be float64
    dtype_index = pd.CategoricalIndex(['A', 'B'], name='B')
    want_dtypes = pd.Series([np.float64] * 2, index=dtype_index)
    tm.assert_series_equal(res.dtypes, want_dtypes)

    want = pd.pivot_table(pdf, **opts).astype(np.float64)

    # because of a pandas 0.18.x bug, categorical dtype is not preserved
    extra = {} if PANDAS_ge_0190 else {'check_names': False, 'check_column_type': False}
    assert_eq(res, want, **extra)
def test_pivot_table_firstlast(values, aggfunc):
    """Compare dask and pandas ``pivot_table`` on a frame with a string column.

    ``values`` and ``aggfunc`` are supplied by the caller (presumably a
    parametrization over the first/last aggregations; not visible here).
    Both the function and the method spelling are checked.
    """
    source = pd.DataFrame(
        {
            "A": np.random.choice(list("XYZ"), size=100),
            "B": np.random.randn(100),
            "C": pd.Categorical(np.random.choice(list("abc"), size=100)),
            "D": np.random.choice(list("abc"), size=100),
        }
    )
    divisions = (0, 20, 40, 60, 80, 98, 99)
    lazy = dd.from_pandas(source, 5).repartition(divisions)
    opts = {"index": "A", "columns": "C", "values": values, "aggfunc": aggfunc}

    # function
    got = dd.pivot_table(lazy, **opts)
    want = pd.pivot_table(source, **opts)
    assert_eq(want, got)

    # method
    got = lazy.pivot_table(**opts)
    want = source.pivot_table(**opts)
    assert_eq(want, got)
def test_pivot_table_errors():
    """Check that ``dd.pivot_table`` raises ``ValueError`` for invalid arguments.

    Covers list-valued ``index``/``columns``/``values``, unsupported
    ``aggfunc`` values, and a non-categorical ``columns`` field.
    """
    def expect_value_error(frame, pattern, **pivot_kwargs):
        # The message is matched as a regular expression by the pandas helper.
        with tm.assertRaisesRegexp(ValueError, pattern):
            dd.pivot_table(frame, **pivot_kwargs)

    pdf = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': pd.Categorical(np.random.choice(list('abc'), size=10)),
    })
    ddf = dd.from_pandas(pdf, 2)

    expect_value_error(ddf, "'index' must be the name of an existing column",
                       index=['A'], columns='C', values='B')
    expect_value_error(ddf, "'columns' must be the name of an existing column",
                       index='A', columns=['C'], values='B')
    expect_value_error(ddf, "'values' must be the name of an existing column",
                       index='A', columns='C', values=['B'])

    aggfunc_msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    expect_value_error(ddf, aggfunc_msg,
                       index='A', columns='C', values='B', aggfunc=['sum'])
    expect_value_error(ddf, aggfunc_msg,
                       index='A', columns='C', values='B', aggfunc='xx')

    # Same layout, but 'C' is left as a plain (non-categorical) column.
    pdf = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': np.random.choice(list('abc'), size=10),
    })
    ddf = dd.from_pandas(pdf, 2)
    expect_value_error(ddf, "'columns' must be category dtype",
                       index='A', columns='C', values='B')
def test_pivot_table_errors():
    """Check the ``ValueError`` messages raised by ``dd.pivot_table`` for bad input."""
    categorical = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': pd.Categorical(np.random.choice(list('abc'), size=10)),
    })
    lazy = dd.from_pandas(categorical, 2)

    aggfunc_msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    # (expected message pattern, keyword arguments that should trigger it)
    bad_calls = [
        ("'index' must be the name of an existing column",
         dict(index=['A'], columns='C', values='B')),
        ("'columns' must be the name of an existing column",
         dict(index='A', columns=['C'], values='B')),
        ("'values' must be the name of an existing column",
         dict(index='A', columns='C', values=['B'])),
        (aggfunc_msg,
         dict(index='A', columns='C', values='B', aggfunc=['sum'])),
        (aggfunc_msg,
         dict(index='A', columns='C', values='B', aggfunc='xx')),
    ]
    for pattern, kwargs in bad_calls:
        with tm.assertRaisesRegexp(ValueError, pattern):
            dd.pivot_table(lazy, **kwargs)

    # 'C' as a plain (non-categorical) column must be rejected as well.
    plain = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': np.random.choice(list('abc'), size=10),
    })
    lazy = dd.from_pandas(plain, 2)
    with tm.assertRaisesRegexp(ValueError, "'columns' must be category dtype"):
        dd.pivot_table(lazy, index='A', columns='C', values='B')
def test_pivot_table_errors():
    """Check that ``dd.pivot_table`` raises ``ValueError`` with the expected text.

    Covers list-valued ``index``/``columns``, nested-list ``values``,
    unsupported ``aggfunc`` values, a categorical with unknown categories,
    and a non-categorical ``columns`` field.
    """
    def assert_rejected(frame, fragment, **pivot_kwargs):
        # The expected text only has to appear somewhere in the error message.
        with pytest.raises(ValueError) as excinfo:
            dd.pivot_table(frame, **pivot_kwargs)
        assert fragment in str(excinfo.value)

    pdf = pd.DataFrame(
        {
            "A": np.random.choice(list("abc"), size=10),
            "B": np.random.randn(10),
            "C": pd.Categorical(np.random.choice(list("abc"), size=10)),
        }
    )
    ddf = dd.from_pandas(pdf, 2)

    assert_rejected(
        ddf, "'index' must be the name of an existing column",
        index=["A"], columns="C", values="B",
    )
    assert_rejected(
        ddf, "'columns' must be the name of an existing column",
        index="A", columns=["C"], values="B",
    )
    assert_rejected(
        ddf, "'values' must refer to an existing column or columns",
        index="A", columns="C", values=[["B"]],
    )

    aggfunc_msg = "aggfunc must be either 'mean', 'sum', 'count', 'first', 'last'"
    assert_rejected(
        ddf, aggfunc_msg, index="A", columns="C", values="B", aggfunc=["sum"]
    )
    assert_rejected(
        ddf, aggfunc_msg, index="A", columns="C", values="B", aggfunc="xx"
    )

    # unknown categories
    ddf._meta = make_meta(
        {"A": object, "B": float, "C": "category"}, parent_meta=pd.DataFrame()
    )
    assert_rejected(
        ddf, "'columns' must have known categories",
        index="A", columns="C", values=["B"],
    )

    # "C" left as a plain (non-categorical) column
    pdf = pd.DataFrame(
        {
            "A": np.random.choice(list("abc"), size=10),
            "B": np.random.randn(10),
            "C": np.random.choice(list("abc"), size=10),
        }
    )
    ddf = dd.from_pandas(pdf, 2)
    assert_rejected(
        ddf, "'columns' must be category dtype",
        index="A", columns="C", values="B",
    )
def test_pivot_table_errors():
    """Check the ``ValueError`` text raised by ``dd.pivot_table`` for bad input."""
    categorical = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': pd.Categorical(np.random.choice(list('abc'), size=10)),
    })
    lazy = dd.from_pandas(categorical, 2)

    aggfunc_msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    # (text expected inside the error message, keyword arguments that trigger it)
    bad_calls = [
        ("'index' must be the name of an existing column",
         dict(index=['A'], columns='C', values='B')),
        ("'columns' must be the name of an existing column",
         dict(index='A', columns=['C'], values='B')),
        ("'values' must be the name of an existing column",
         dict(index='A', columns='C', values=['B'])),
        (aggfunc_msg,
         dict(index='A', columns='C', values='B', aggfunc=['sum'])),
        (aggfunc_msg,
         dict(index='A', columns='C', values='B', aggfunc='xx')),
    ]
    for fragment, kwargs in bad_calls:
        with pytest.raises(ValueError) as excinfo:
            dd.pivot_table(lazy, **kwargs)
        assert fragment in str(excinfo.value)

    # unknown categories
    lazy._meta = make_meta({'A': object, 'B': float, 'C': 'category'})
    with pytest.raises(ValueError) as excinfo:
        dd.pivot_table(lazy, index='A', columns='C', values=['B'])
    assert "'columns' must have known categories" in str(excinfo.value)

    # 'C' as a plain (non-categorical) column
    plain = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': np.random.choice(list('abc'), size=10),
    })
    lazy = dd.from_pandas(plain, 2)
    with pytest.raises(ValueError) as excinfo:
        dd.pivot_table(lazy, index='A', columns='C', values='B')
    assert "'columns' must be category dtype" in str(excinfo.value)
def test_pivot_table_errors():
    """Check that ``dd.pivot_table`` raises ``ValueError`` with the expected text.

    Covers list-valued ``index``/``columns``/``values``, unsupported
    ``aggfunc`` values, a categorical with unknown categories, and a
    non-categorical ``columns`` field.
    """
    def assert_rejected(frame, fragment, **pivot_kwargs):
        # The expected text only has to appear somewhere in the error message.
        with pytest.raises(ValueError) as excinfo:
            dd.pivot_table(frame, **pivot_kwargs)
        assert fragment in str(excinfo.value)

    pdf = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': pd.Categorical(np.random.choice(list('abc'), size=10)),
    })
    ddf = dd.from_pandas(pdf, 2)

    assert_rejected(ddf, "'index' must be the name of an existing column",
                    index=['A'], columns='C', values='B')
    assert_rejected(ddf, "'columns' must be the name of an existing column",
                    index='A', columns=['C'], values='B')
    assert_rejected(ddf, "'values' must be the name of an existing column",
                    index='A', columns='C', values=['B'])

    aggfunc_msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    assert_rejected(ddf, aggfunc_msg,
                    index='A', columns='C', values='B', aggfunc=['sum'])
    assert_rejected(ddf, aggfunc_msg,
                    index='A', columns='C', values='B', aggfunc='xx')

    # unknown categories
    ddf._meta = make_meta({'A': object, 'B': float, 'C': 'category'})
    assert_rejected(ddf, "'columns' must have known categories",
                    index='A', columns='C', values=['B'])

    # 'C' left as a plain (non-categorical) column
    pdf = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': np.random.choice(list('abc'), size=10),
    })
    ddf = dd.from_pandas(pdf, 2)
    assert_rejected(ddf, "'columns' must be category dtype",
                    index='A', columns='C', values='B')