예제 #1
0
def test_pivot_table(aggfunc):
    """Check dask's ``pivot_table`` against pandas for one aggregation.

    A random 100-row frame with a categorical ``C`` column is wrapped with
    ``dd.from_pandas``; the pivot is computed through both the
    ``dd.pivot_table`` function and the ``DataFrame.pivot_table`` method and
    each result is compared with the matching pandas call.

    Parameters
    ----------
    aggfunc
        Aggregation forwarded unchanged to dask and pandas.
    """
    n_rows = 100
    pdf = pd.DataFrame({
        'A': np.random.choice(list('XYZ'), size=n_rows),
        'B': np.random.randn(n_rows),
        'C': pd.Categorical(np.random.choice(list('abc'), size=n_rows)),
    })
    ddf = dd.from_pandas(pdf, 5)

    pivot_kwargs = dict(index='A', columns='C', values='B', aggfunc=aggfunc)
    # dask result cannot be int64 dtype depending on divisions because of
    # NaN, so the pandas expectation is cast to float64 for 'count'.
    cast_expected = aggfunc == 'count'

    # module-level function
    actual = dd.pivot_table(ddf, **pivot_kwargs)
    expected = pd.pivot_table(pdf, **pivot_kwargs)
    if cast_expected:
        expected = expected.astype(np.float64)
    assert_eq(actual, expected)

    # DataFrame method
    actual = ddf.pivot_table(**pivot_kwargs)
    expected = pdf.pivot_table(**pivot_kwargs)
    if cast_expected:
        expected = expected.astype(np.float64)
    assert_eq(actual, expected)
예제 #2
0
def test_pivot_table(aggfunc):
    """Compare ``pivot_table`` results from dask and pandas for ``aggfunc``.

    Covers the ``dd.pivot_table`` function and the ``DataFrame.pivot_table``
    method.  When ``PANDAS_ge_0190`` is false, the comparison is run with
    ``check_names=False`` and ``check_column_type=False``.

    Parameters
    ----------
    aggfunc
        Aggregation forwarded unchanged to dask and pandas.
    """
    size = 100
    a_col = np.random.choice(list('XYZ'), size=size)
    b_col = np.random.randn(size)
    c_col = pd.Categorical(np.random.choice(list('abc'), size=size))
    pdf = pd.DataFrame({'A': a_col, 'B': b_col, 'C': c_col})
    ddf = dd.from_pandas(pdf, 5)

    pivot_args = {'index': 'A', 'columns': 'C', 'values': 'B',
                  'aggfunc': aggfunc}
    if PANDAS_ge_0190:
        eq_options = {}
    else:
        # because of a pandas 0.18.x bug, categorical dtype is not preserved
        eq_options = {'check_names': False, 'check_column_type': False}

    def _as_expected(pandas_result):
        # dask result cannot be int64 dtype depending on divisions because
        # of NaN
        if aggfunc == 'count':
            return pandas_result.astype(np.float64)
        return pandas_result

    # function
    got = dd.pivot_table(ddf, **pivot_args)
    want = _as_expected(pd.pivot_table(pdf, **pivot_args))
    assert_eq(got, want, **eq_options)

    # method
    got = ddf.pivot_table(**pivot_args)
    want = _as_expected(pdf.pivot_table(**pivot_args))
    assert_eq(got, want, **eq_options)
예제 #3
0
def dask_stuff_backup():
    """Scratch routine: load feature files with dask, pivot them and merge
    the result against reference positions.

    The function takes no arguments and returns ``None``; every intermediate
    (``df_split``, ``mat``) is discarded.

    NOTE(review): the body references ``ref_dfs`` and ``split_to_mat``, which
    are not defined in this function -- presumably module-level names in the
    original file; confirm before running.
    """
    feature_dir = Path('./')
    # Read every file matching '*-E*' as tab-separated data with explicit
    # column names; the 'path' column used below comes from
    # include_path_column=True.
    df = dd.read_csv(os.path.join(feature_dir, '*-E*'), sep='\t',
                    names=['variant', 'c1', 'c2', 'v1', 'v2', 'data'],
                    include_path_column=True)
    # Reduce each source path to its bare file name.
    df['path'] = df['path'].map(lambda x: Path(x).name)
    df = df.drop(['c1', 'c2', 'v1', 'v2'], axis=1)

    # One row per variant, one column per source file name.
    df = dd.pivot_table(df, index='variant', values='data', columns='path') 
    # NOTE(review): 'variant' was just used as the pivot index, so it is
    # presumably no longer a regular column here -- confirm this lookup works.
    df['chr-pos'] = df['variant'].str.partition(';')[0]

    # Disabled: splitting the position key into integer 'chr' / 'pos' columns.
    # df['chr'] = df['chrpos'].str.partition('-')[0]
    # df['pos'] = df['chrpos'].str.partition('-')[2]
    # df[['chr', 'pos']] = df[['chr', 'pos']].astype(int)
    # df = df.drop(['chrpos'], axis=1)

    # Small hard-coded reference table keyed by a '<chr>-<pos>' string.
    ref = pd.DataFrame({'chr': [1, 1, 1], 'pos': [94986,706645,723891]})
    ref['chr-pos'] = ref['chr'].map(str) + '-' + ref['pos'].map(str)

    # This merge result is overwritten inside the loop below before being used.
    df_split = dd.merge(ref, df, on='chr-pos', how='left')

    features = ['DNase-E001','DNase-E120','H3K27ac-E100','DNase-E123']

    # Disabled: dd.pivot_table(df, index='variant', values='data', columns='path')

    # The loop rebinds ``ref`` (shadowing the table built above) and ``i`` is
    # unused; ``df_split`` and ``mat`` are overwritten on every iteration.
    # NOTE(review): this merge keys on 'chr' and 'pos', but the code that would
    # add those columns to ``df`` is commented out above -- confirm.
    for i, ref in enumerate(ref_dfs):
        ref = ref[['chr', 'pos']]
        df_split = dd.merge(ref, df, on=['chr', 'pos'], how='left')
        
        mat = split_to_mat(df_split, features)
    return
예제 #4
0
def test_pivot_table_dtype():
    """A 'count' pivot over an integer column must report float64 columns.

    The dtypes of the dask result are checked first, then the full result is
    compared with the pandas pivot cast to float64.
    """
    pdf = pd.DataFrame({
        'A': list('AABB'),
        'B': pd.Categorical(list('ABAB')),
        'C': [1, 2, 3, 4],
    })
    ddf = dd.from_pandas(pdf, 2)
    pivot_kwargs = dict(index='A', columns='B', values='C', aggfunc='count')
    actual = dd.pivot_table(ddf, **pivot_kwargs)

    # One float64 entry per category of 'B', indexed by those categories.
    expected_dtypes = pd.Series(
        [np.float64] * 2,
        index=pd.CategoricalIndex(['A', 'B'], name='B'),
    )
    tm.assert_series_equal(actual.dtypes, expected_dtypes)

    expected = pd.pivot_table(pdf, **pivot_kwargs).astype(np.float64)

    if PANDAS_ge_0190:
        compare_options = {}
    else:
        # because of a pandas 0.18.x bug, categorical dtype is not preserved
        compare_options = dict(check_names=False, check_column_type=False)
    assert_eq(actual, expected, **compare_options)
예제 #5
0
def test_pivot_table(values, aggfunc):
    """Compare dask and pandas ``pivot_table`` for ``values`` and ``aggfunc``.

    The dask frame is repartitioned to explicit divisions before pivoting.
    Both the ``dd.pivot_table`` function and the ``DataFrame.pivot_table``
    method are checked.

    Parameters
    ----------
    values
        Value column selection forwarded unchanged to both libraries.
    aggfunc
        Aggregation forwarded unchanged to both libraries.
    """
    n_rows = 100
    columns = {
        "A": np.random.choice(list("XYZ"), size=n_rows),
        "B": np.random.randn(n_rows),
        "C": pd.Categorical(np.random.choice(list("abc"), size=n_rows)),
        "D": np.random.randn(n_rows),
    }
    pdf = pd.DataFrame(columns)
    divisions = (0, 20, 40, 60, 80, 98, 99)
    ddf = dd.from_pandas(pdf, 5).repartition(divisions)

    pivot_kwargs = dict(index="A", columns="C", values=values, aggfunc=aggfunc)

    def _check(actual, expected):
        # dask result cannot be int64 dtype depending on divisions because
        # of NaN
        if aggfunc == "count":
            expected = expected.astype(np.float64)
        assert_eq(actual, expected)

    # function
    _check(dd.pivot_table(ddf, **pivot_kwargs),
           pd.pivot_table(pdf, **pivot_kwargs))

    # method
    _check(ddf.pivot_table(**pivot_kwargs), pdf.pivot_table(**pivot_kwargs))
예제 #6
0
def test_pivot_table_dtype():
    """Verify that a 'count' pivot of integer data yields float64 columns.

    Checks the ``dtypes`` of the dask result and then the whole result
    against the pandas pivot cast to float64.
    """
    source = pd.DataFrame({
        "A": list("AABB"),
        "B": pd.Categorical(list("ABAB")),
        "C": [1, 2, 3, 4],
    })
    ddf = dd.from_pandas(source, 2)
    pivot_args = {"index": "A", "columns": "B", "values": "C",
                  "aggfunc": "count"}
    result = dd.pivot_table(ddf, **pivot_args)

    # dtype check: one float64 per category of "B"
    dtype_index = pd.CategoricalIndex(["A", "B"], name="B")
    tm.assert_series_equal(result.dtypes,
                           pd.Series([np.float64] * 2, index=dtype_index))

    # value check against pandas
    pandas_pivot = pd.pivot_table(source, **pivot_args)
    assert_eq(result, pandas_pivot.astype(np.float64))
예제 #7
0
def test_pivot_table(aggfunc):
    """Check ``dd.pivot_table`` against ``pd.pivot_table`` for ``aggfunc``.

    The comparison uses the ``eq`` helper; when ``PANDAS_ge_0190`` is false
    it is called with ``check_names=False`` and ``check_column_type=False``.

    Parameters
    ----------
    aggfunc
        Aggregation forwarded unchanged to dask and pandas.
    """
    count = 100
    index_col = np.random.choice(list('XYZ'), size=count)
    value_col = np.random.randn(count)
    category_col = pd.Categorical(np.random.choice(list('abc'), size=count))
    pdf = pd.DataFrame({'A': index_col, 'B': value_col, 'C': category_col})
    ddf = dd.from_pandas(pdf, 5)

    pivot_kwargs = dict(index='A', columns='C', values='B', aggfunc=aggfunc)
    actual = dd.pivot_table(ddf, **pivot_kwargs)
    expected = pd.pivot_table(pdf, **pivot_kwargs)
    if aggfunc == 'count':
        # dask result cannot be int64 dtype depending on divisions because of NaN
        expected = expected.astype(np.float64)

    if PANDAS_ge_0190:
        eq_options = {}
    else:
        # because of a pandas 0.18.x bug, categorical dtype is not preserved
        eq_options = dict(check_names=False, check_column_type=False)
    assert eq(actual, expected, **eq_options)
def test_pivot_table_dtype():
    """Counting integer values through ``dd.pivot_table`` gives float64.

    First the per-column dtypes of the dask result are asserted, then the
    result is compared with the pandas pivot converted to float64.
    """
    letters = list('AABB')
    categories = pd.Categorical(list('ABAB'))
    pdf = pd.DataFrame({'A': letters, 'B': categories, 'C': [1, 2, 3, 4]})
    ddf = dd.from_pandas(pdf, 2)

    options = dict(index='A', columns='B', values='C', aggfunc='count')
    pivoted = dd.pivot_table(ddf, **options)

    # Expected dtypes: float64 for each category of 'B'.
    dtype_index = pd.CategoricalIndex(['A', 'B'], name='B')
    wanted_dtypes = pd.Series([np.float64] * 2, index=dtype_index)
    tm.assert_series_equal(pivoted.dtypes, wanted_dtypes)

    wanted = pd.pivot_table(pdf, **options)
    wanted = wanted.astype(np.float64)
    assert_eq(pivoted, wanted)
def test_pivot_table(aggfunc):
    """Compare dask and pandas pivots, via function and via method.

    Parameters
    ----------
    aggfunc
        Aggregation forwarded unchanged to dask and pandas.
    """
    rows = 100
    pdf = pd.DataFrame({
        'A': np.random.choice(list('XYZ'), size=rows),
        'B': np.random.randn(rows),
        'C': pd.Categorical(np.random.choice(list('abc'), size=rows)),
    })
    ddf = dd.from_pandas(pdf, 5)
    kw = dict(index='A', columns='C', values='B', aggfunc=aggfunc)

    # Each pair builds the dask result and the matching pandas expectation:
    # first through the module-level functions, then through the methods.
    variants = (
        (lambda: dd.pivot_table(ddf, **kw), lambda: pd.pivot_table(pdf, **kw)),
        (lambda: ddf.pivot_table(**kw), lambda: pdf.pivot_table(**kw)),
    )
    for build_actual, build_expected in variants:
        actual = build_actual()
        expected = build_expected()
        if aggfunc == 'count':
            # dask result cannot be int64 dtype depending on divisions
            # because of NaN
            expected = expected.astype(np.float64)
        assert_eq(actual, expected)
예제 #10
0
def dask_features_to_csv(feature_dir=TMP_DIR):
    """Unfinished dask-based feature loader.

    Reads every ``*-E*`` file under ``feature_dir`` as a tab-separated table,
    reduces the path column to the bare file name, drops the unused columns
    and builds a pivot of ``data`` by ``variant`` and ``path``.  The pivot is
    neither stored nor returned.  The original author noted that the dask
    implementation can wait, because neighbors will get a different
    implementation.

    Parameters
    ----------
    feature_dir
        Directory holding the feature files; defaults to ``TMP_DIR``.
    """
    leading_columns = ['variant', 'c1', 'c2', 'v1', 'v2']
    unused_columns = ['c1', 'c2', 'v1', 'v2']

    def read_label_feature(fn):
        # pandas reader for a single feature file, indexed by variant.
        # It is not called anywhere in this function.
        label = f'{fn}'
        frame = pd.read_csv(feature_dir / label,
                            sep='\t',
                            names=leading_columns + [label])
        frame = frame.drop(unused_columns, axis=1)
        frame['variant'] = frame['variant'].astype(str)
        frame[label] = frame[label].astype(np.float32)
        frame.set_index('variant', drop=True, inplace=True)
        return frame

    pattern = os.path.join(feature_dir, '*-E*')
    ddf = dd.read_csv(pattern,
                      sep='\t',
                      names=leading_columns + ['data'],
                      include_path_column=True)
    # Keep only the file name of each source path.
    ddf['path'] = ddf['path'].map(lambda full_path: Path(full_path).name)
    ddf = ddf.drop(unused_columns, axis=1)
    dd.pivot_table(ddf, index='variant', values='data', columns='path')
예제 #11
0
def test_pivot_table_index_dtype():
    """The index of a pivot built on a datetime column is datetime64[ns]."""
    dates = pd.date_range(start="2019-08-01", periods=3, freq="1D")
    pdf = pd.DataFrame({
        "A": dates,
        "B": pd.Categorical(list("abc")),
        "C": [1, 2, 3],
    })
    ddf = dd.from_pandas(pdf, 2)
    pivoted = dd.pivot_table(
        ddf, index="A", columns="B", values="C", aggfunc="count"
    )

    expected_dtype = np.dtype("datetime64[ns]")
    assert pivoted.index.dtype == expected_dtype
예제 #12
0
def test_pivot_table_dtype():
    """``aggfunc='count'`` on integer values must produce float64 columns.

    Asserts the dask result's ``dtypes`` and then compares the result with
    the pandas pivot cast to float64.
    """
    pdf = pd.DataFrame({
        'A': list('AABB'),
        'B': pd.Categorical(list('ABAB')),
        'C': [1, 2, 3, 4],
    })
    ddf = dd.from_pandas(pdf, 2)
    pivot_kwargs = {'index': 'A', 'columns': 'B', 'values': 'C',
                    'aggfunc': 'count'}
    actual = dd.pivot_table(ddf, **pivot_kwargs)

    # float64 for each of the two categories of 'B'
    category_index = pd.CategoricalIndex(['A', 'B'], name='B')
    expected_dtypes = pd.Series([np.float64] * 2, index=category_index)
    tm.assert_series_equal(actual.dtypes, expected_dtypes)

    expected = pd.pivot_table(pdf, **pivot_kwargs).astype(np.float64)
    assert_eq(actual, expected)
예제 #13
0
def test_pivot_table_dtype():
    """A 'count' pivot over integers reports float64 for every column.

    After the dtype check, the dask result is compared with the pandas pivot
    cast to float64; when ``PANDAS_ge_0190`` is false the comparison is run
    with ``check_names=False`` and ``check_column_type=False``.
    """
    frame = pd.DataFrame({
        'A': list('AABB'),
        'B': pd.Categorical(list('ABAB')),
        'C': [1, 2, 3, 4],
    })
    ddf = dd.from_pandas(frame, 2)
    args = dict(index='A', columns='B', values='C', aggfunc='count')
    got = dd.pivot_table(ddf, **args)

    want_dtypes = pd.Series(
        [np.float64] * 2,
        index=pd.CategoricalIndex(['A', 'B'], name='B'),
    )
    tm.assert_series_equal(got.dtypes, want_dtypes)

    want = pd.pivot_table(frame, **args).astype(np.float64)

    # because of a pandas 0.18.x bug, categorical dtype is not preserved,
    # so the name and column-type checks are disabled on older pandas
    relaxed = dict(check_names=False, check_column_type=False)
    assert_eq(got, want, **({} if PANDAS_ge_0190 else relaxed))
예제 #14
0
def test_pivot_table_firstlast(values, aggfunc):
    """Compare dask and pandas pivots for the given ``values``/``aggfunc``.

    The source frame mixes a numeric column ``B`` with a string column ``D``.
    The dask frame is repartitioned to explicit divisions, and both the
    function and the method form of ``pivot_table`` are checked.

    Parameters
    ----------
    values
        Value column selection forwarded unchanged to both libraries.
    aggfunc
        Aggregation forwarded unchanged to both libraries.
    """
    n_rows = 100
    data = {
        "A": np.random.choice(list("XYZ"), size=n_rows),
        "B": np.random.randn(n_rows),
        "C": pd.Categorical(np.random.choice(list("abc"), size=n_rows)),
        "D": np.random.choice(list("abc"), size=n_rows),
    }
    pdf = pd.DataFrame(data)
    ddf = dd.from_pandas(pdf, 5).repartition((0, 20, 40, 60, 80, 98, 99))

    pivot_kwargs = dict(index="A", columns="C", values=values, aggfunc=aggfunc)

    # function
    dask_result = dd.pivot_table(ddf, **pivot_kwargs)
    pandas_result = pd.pivot_table(pdf, **pivot_kwargs)
    assert_eq(pandas_result, dask_result)

    # method
    dask_result = ddf.pivot_table(**pivot_kwargs)
    pandas_result = pdf.pivot_table(**pivot_kwargs)
    assert_eq(pandas_result, dask_result)
예제 #15
0
def test_pivot_table_errors():
    """Check that ``dd.pivot_table`` rejects unsupported arguments.

    Every call must raise ``ValueError`` with a message containing the
    expected text.  Covered cases: list-valued ``index``, ``columns`` and
    ``values``; an unsupported ``aggfunc`` (a list, and an unknown name); and
    a ``columns`` column that is not of category dtype.
    """
    # Imported locally so this test does not depend on a module-level import.
    import pytest

    def _expect_value_error(frame, msg, **pivot_kwargs):
        # pytest.raises replaces pandas' ``tm.assertRaisesRegexp`` helper,
        # which was deprecated and later removed from pandas.  The expected
        # messages contain no regex metacharacters, so a substring check is
        # equivalent to the former regex search.
        with pytest.raises(ValueError) as err:
            dd.pivot_table(frame, **pivot_kwargs)
        assert msg in str(err.value)

    df = pd.DataFrame({'A': np.random.choice(list('abc'), size=10),
                       'B': np.random.randn(10),
                       'C': pd.Categorical(np.random.choice(list('abc'), size=10))})
    ddf = dd.from_pandas(df, 2)

    # index / columns / values must each be a single column name
    _expect_value_error(ddf, "'index' must be the name of an existing column",
                        index=['A'], columns='C', values='B')
    _expect_value_error(ddf, "'columns' must be the name of an existing column",
                        index='A', columns=['C'], values='B')
    _expect_value_error(ddf, "'values' must be the name of an existing column",
                        index='A', columns='C', values=['B'])

    # aggfunc must be one of the supported names
    msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    _expect_value_error(ddf, msg,
                        index='A', columns='C', values='B', aggfunc=['sum'])
    _expect_value_error(ddf, msg,
                        index='A', columns='C', values='B', aggfunc='xx')

    # same data, but 'C' is a plain (non-categorical) column
    df = pd.DataFrame({'A': np.random.choice(list('abc'), size=10),
                       'B': np.random.randn(10),
                       'C': np.random.choice(list('abc'), size=10)})
    ddf = dd.from_pandas(df, 2)
    _expect_value_error(ddf, "'columns' must be category dtype",
                        index='A', columns='C', values='B')
예제 #16
0
def test_pivot_table_errors():
    """Verify the ``ValueError`` messages raised by ``dd.pivot_table``.

    Covered cases: list-valued ``index``, ``columns`` and ``values``; an
    unsupported ``aggfunc`` (a list, and an unknown name); and a ``columns``
    column that is not of category dtype.
    """
    # Imported locally so this test does not depend on a module-level import.
    import pytest

    df = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': pd.Categorical(np.random.choice(list('abc'), size=10)),
    })
    ddf = dd.from_pandas(df, 2)

    # pytest.raises is used instead of pandas' ``tm.assertRaisesRegexp``,
    # which was deprecated and later removed from pandas.  The expected
    # messages contain no regex metacharacters, so the substring checks below
    # are equivalent to the former regex search.
    msg = "'index' must be the name of an existing column"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index=['A'], columns='C', values='B')
    assert msg in str(err.value)
    msg = "'columns' must be the name of an existing column"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns=['C'], values='B')
    assert msg in str(err.value)
    msg = "'values' must be the name of an existing column"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns='C', values=['B'])
    assert msg in str(err.value)

    # aggfunc must be one of the supported names
    msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf,
                       index='A',
                       columns='C',
                       values='B',
                       aggfunc=['sum'])
    assert msg in str(err.value)

    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc='xx')
    assert msg in str(err.value)

    # same data, but 'C' is a plain (non-categorical) column
    df = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': np.random.choice(list('abc'), size=10),
    })
    ddf = dd.from_pandas(df, 2)
    msg = "'columns' must be category dtype"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns='C', values='B')
    assert msg in str(err.value)
예제 #17
0
def test_pivot_table_errors():
    """Check the ``ValueError`` messages produced by ``dd.pivot_table``.

    Covered cases: list-valued ``index`` and ``columns``, a nested-list
    ``values``, an unsupported ``aggfunc``, a categorical ``columns`` column
    with unknown categories, and a non-categorical ``columns`` column.
    """

    def _assert_rejected(frame, expected_text, **pivot_kwargs):
        # The call must raise ValueError and mention ``expected_text``.
        with pytest.raises(ValueError) as err:
            dd.pivot_table(frame, **pivot_kwargs)
        assert expected_text in str(err.value)

    pdf = pd.DataFrame(
        {
            "A": np.random.choice(list("abc"), size=10),
            "B": np.random.randn(10),
            "C": pd.Categorical(np.random.choice(list("abc"), size=10)),
        }
    )
    ddf = dd.from_pandas(pdf, 2)

    _assert_rejected(
        ddf,
        "'index' must be the name of an existing column",
        index=["A"], columns="C", values="B",
    )
    _assert_rejected(
        ddf,
        "'columns' must be the name of an existing column",
        index="A", columns=["C"], values="B",
    )
    _assert_rejected(
        ddf,
        "'values' must refer to an existing column or columns",
        index="A", columns="C", values=[["B"]],
    )

    aggfunc_text = "aggfunc must be either 'mean', 'sum', 'count', 'first', 'last'"
    for bad_aggfunc in (["sum"], "xx"):
        _assert_rejected(
            ddf, aggfunc_text,
            index="A", columns="C", values="B", aggfunc=bad_aggfunc,
        )

    # unknown categories
    ddf._meta = make_meta(
        {"A": object, "B": float, "C": "category"}, parent_meta=pd.DataFrame()
    )
    _assert_rejected(
        ddf,
        "'columns' must have known categories",
        index="A", columns="C", values=["B"],
    )

    # non-categorical "C"
    pdf = pd.DataFrame(
        {
            "A": np.random.choice(list("abc"), size=10),
            "B": np.random.randn(10),
            "C": np.random.choice(list("abc"), size=10),
        }
    )
    ddf = dd.from_pandas(pdf, 2)
    _assert_rejected(
        ddf,
        "'columns' must be category dtype",
        index="A", columns="C", values="B",
    )
def test_pivot_table_errors():
    """Each invalid ``dd.pivot_table`` call must raise a ``ValueError``.

    The message of every error is checked for the expected text.  Covered
    cases: list-valued ``index``, ``columns`` and ``values``; an unsupported
    ``aggfunc``; categorical ``columns`` with unknown categories; and a
    non-categorical ``columns`` column.
    """

    def _fails_with(frame, text, pivot_kwargs):
        with pytest.raises(ValueError) as err:
            dd.pivot_table(frame, **pivot_kwargs)
        assert text in str(err.value)

    pdf = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': pd.Categorical(np.random.choice(list('abc'), size=10)),
    })
    ddf = dd.from_pandas(pdf, 2)

    aggfunc_text = "aggfunc must be either 'mean', 'sum' or 'count'"
    # (expected message fragment, keyword arguments), checked in order
    cases = [
        ("'index' must be the name of an existing column",
         dict(index=['A'], columns='C', values='B')),
        ("'columns' must be the name of an existing column",
         dict(index='A', columns=['C'], values='B')),
        ("'values' must be the name of an existing column",
         dict(index='A', columns='C', values=['B'])),
        (aggfunc_text,
         dict(index='A', columns='C', values='B', aggfunc=['sum'])),
        (aggfunc_text,
         dict(index='A', columns='C', values='B', aggfunc='xx')),
    ]
    for text, pivot_kwargs in cases:
        _fails_with(ddf, text, pivot_kwargs)

    # unknown categories
    ddf._meta = make_meta({'A': object, 'B': float, 'C': 'category'})
    _fails_with(ddf, "'columns' must have known categories",
                dict(index='A', columns='C', values=['B']))

    # non-categorical 'C'
    pdf = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': np.random.choice(list('abc'), size=10),
    })
    ddf = dd.from_pandas(pdf, 2)
    _fails_with(ddf, "'columns' must be category dtype",
                dict(index='A', columns='C', values='B'))
예제 #19
0
def test_pivot_table_errors():
    """Exercise the argument validation of ``dd.pivot_table``.

    Every call below is expected to raise ``ValueError`` with a message that
    contains the given text.  Covered cases: list-valued ``index``,
    ``columns`` and ``values``; an unsupported ``aggfunc``; categorical
    ``columns`` with unknown categories; and a non-categorical ``columns``
    column.
    """

    def _check_error(frame, fragment, **kwargs):
        # Run the pivot and confirm the ValueError message mentions fragment.
        with pytest.raises(ValueError) as caught:
            dd.pivot_table(frame, **kwargs)
        assert fragment in str(caught.value)

    size = 10
    categorical_frame = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=size),
        'B': np.random.randn(size),
        'C': pd.Categorical(np.random.choice(list('abc'), size=size)),
    })
    ddf = dd.from_pandas(categorical_frame, 2)

    _check_error(ddf, "'index' must be the name of an existing column",
                 index=['A'], columns='C', values='B')
    _check_error(ddf, "'columns' must be the name of an existing column",
                 index='A', columns=['C'], values='B')
    _check_error(ddf, "'values' must be the name of an existing column",
                 index='A', columns='C', values=['B'])

    bad_aggfunc = "aggfunc must be either 'mean', 'sum' or 'count'"
    _check_error(ddf, bad_aggfunc,
                 index='A', columns='C', values='B', aggfunc=['sum'])
    _check_error(ddf, bad_aggfunc,
                 index='A', columns='C', values='B', aggfunc='xx')

    # unknown categories
    ddf._meta = make_meta({'A': object, 'B': float, 'C': 'category'})
    _check_error(ddf, "'columns' must have known categories",
                 index='A', columns='C', values=['B'])

    # non-categorical 'C'
    plain_frame = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=size),
        'B': np.random.randn(size),
        'C': np.random.choice(list('abc'), size=size),
    })
    ddf = dd.from_pandas(plain_frame, 2)
    _check_error(ddf, "'columns' must be category dtype",
                 index='A', columns='C', values='B')