def test_data_as_first_argument():
    """Check that verbs accept the dataframe as their first argument.

    For most verbs the direct call ``verb(df, ...)`` is compared with the
    piped form ``df >> verb(...)``. The sampling verbs are compared by
    result length only, and ``head``/``tail`` are checked for the number
    of rows they return.
    """
    def equals(df1, df2):
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    # define and rename receive copies of the frame on both sides
    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')),
                  df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The comparison with 4 must happen outside len(). Writing
    # len(head(df, 4) == 4) measures a boolean frame and is always truthy,
    # so it would never detect a wrong number of rows.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
def summarize_fd_by_subject(df):
    """Summarize framewise displacement per group and return it in long form.

    The input is grouped by ``subject_id``, ``condition``, ``data_id`` and
    ``headcase``; five summaries of the ``FramewiseDisplacement`` column are
    computed per group; the summary columns are melted into ``measure`` /
    ``val`` pairs; the rows are arranged by ``subject_id`` and the index is
    reset (the old index is dropped).

    NOTE(review): ``filter_mean``, ``filter_median`` and
    ``perc_high_motion`` appear only inside expression strings, so they are
    resolved when those strings are evaluated -- presumably they are defined
    or imported in this module; confirm.
    """
    id_columns = ["subject_id", "data_id", "condition", "headcase"]
    measure_columns = [
        "fd_mean",
        "fd_median",
        "fd_mean_filter",
        "fd_median_filter",
        "perc_spikes",
    ]

    def to_long(frame):
        # One row per (identifier columns, measure) pair.
        return frame.melt(
            id_vars=id_columns,
            value_vars=measure_columns,
            var_name="measure",
            value_name="val",
        )

    grouped = df >> p.group_by(
        "subject_id", "condition", "data_id", "headcase"
    )
    summarized = grouped >> p.summarize(
        fd_mean="mean(FramewiseDisplacement)",
        fd_median="median(FramewiseDisplacement)",
        fd_mean_filter="filter_mean(FramewiseDisplacement)",
        fd_median_filter="filter_median(FramewiseDisplacement)",
        perc_spikes="perc_high_motion(FramewiseDisplacement)",
    )
    melted = summarized >> p.do(to_long)
    ordered = melted >> p.arrange("subject_id")
    return ordered >> p.call(".reset_index", drop=True)
def test_do():
    """Exercise the ``do`` verb on grouped and ungrouped dataframes.

    Covers: a single function returning a dataframe versus keyword
    functions returning scalars, preservation of the grouping column dtype
    (object and categorical), pass-through functions without groups, reuse
    of the group dataframe with unique and duplicated indices, string
    expressions, the error branches, and a function whose result carries a
    non-range index.
    """
    df = pd.DataFrame({
        'x': [1, 2, 2, 3],
        'y': [2, 3, 4, 3],
        'z': list('aabb'),
        'w': pd.Categorical(list('aabb')),
    })

    def least_squares(gdf):
        X = np.vstack([gdf.x, np.ones(len(gdf))]).T
        (m, c), _, _, _ = np.linalg.lstsq(X, gdf.y, None)
        return pd.DataFrame({'slope': [m], 'intercept': c})

    def slope(x, y):
        return np.diff(y)[0] / np.diff(x)[0]

    def intercept(x, y):
        return y.values[0] - slope(x, y) * x.values[0]

    df1 = df >> group_by('z') >> do(least_squares)
    df2 = df >> group_by('z') >> do(
        slope=lambda gdf: slope(gdf.x, gdf.y),
        intercept=lambda gdf: intercept(gdf.x, gdf.y))

    df3 = df >> group_by('w') >> do(least_squares)
    df4 = df >> group_by('w') >> do(
        slope=lambda gdf: slope(gdf.x, gdf.y),
        intercept=lambda gdf: intercept(gdf.x, gdf.y))

    assert df1.plydata_groups == ['z']
    assert df2.plydata_groups == ['z']

    assert df1['z'].dtype == object
    assert df2['z'].dtype == object
    assert df3['w'].dtype == 'category'
    assert df4['w'].dtype == 'category'

    npt.assert_array_equal(df1['z'], df2['z'])
    npt.assert_array_almost_equal(df1['intercept'], df2['intercept'])
    npt.assert_array_almost_equal(df1['slope'], df2['slope'])

    # No groups (Test with pass-through functions)
    df1 = df >> do(lambda gdf: gdf)
    df2 = df >> do(
        x=lambda gdf: gdf.x,
        y=lambda gdf: gdf.y,
        z=lambda gdf: gdf.z,
        w=lambda gdf: gdf.w)

    cols = list('xyzw')
    # all(<DataFrame>) iterates the column labels, which are non-empty
    # strings, so it is always True. Reduce over the element-wise
    # comparison values instead so a mismatch actually fails.
    assert (df[cols] == df1[cols]).values.all()
    assert (df[cols] == df2[cols]).values.all()

    # Reordered data so that the groups are not all
    # bunched together
    df = pd.DataFrame(
        {
            'x': [2, 1, 2, 3],
            'y': [4, 2, 3, 3],
            'z': list('baab'),
            'w': pd.Categorical(list('baab')),
        },
        index=[3, 1, 0, 2]  # good index
    )

    dfi = pd.DataFrame(
        {
            'x': [2, 1, 2, 3],
            'y': [4, 2, 3, 3],
            'z': list('baab'),
            'w': pd.Categorical(list('baab')),
        },
        index=[3, 1, 0, 0]  # bad index
    )

    # Reuse group dataframe
    def sum_x(gdf):
        gdf['sum_x'] = gdf['x'].sum()
        return gdf

    # When the group dataframe is reused and the
    # index is good (no duplicates) the rows
    # in the result should not be reordered
    res = df >> group_by('z') >> do(sum_x)
    assert df['x'].equals(res['x'])
    assert all(res['sum_x'] == [5, 3, 3, 5])

    # Can use string evaluation
    res = df >> group_by('z') >> do(n='len(x)')
    assert all(res['z'] == ['b', 'a'])
    assert all(res['n'] == [2, 2])

    # bad index is handled correctly
    res = dfi >> group_by('z') >> do(sum_x)
    assert dfi.index.equals(res.index)
    assert dfi['x'].equals(res['x'])
    assert all(res['sum_x'] == [5, 3, 3, 5])

    # Branches
    with pytest.raises(ValueError):
        # args and kwargs
        df >> group_by('w') >> do(
            least_squares,
            slope=lambda gdf: slope(gdf.x, gdf.y),
            intercept=lambda gdf: intercept(gdf.x, gdf.y))

    with pytest.raises(TypeError):
        df >> group_by('w') >> do('len(x)')

    # Potentially problematic index
    def non_range_index_func(gdf):
        return pd.Series([11, 12, 13], index=[21, 22, 23])

    result = df >> do(r=non_range_index_func)
    assert all(result['r'] == [11, 12, 13])