def test_Q():
    """Exercise column names that are not plain Python identifiers.

    The frame has a column with a dot in its name (``var.name``) and a
    column named after a keyword (``class``). The test pins which
    expressions raise and which ones evaluate when such names are used
    bare or wrapped in ``Q(...)``.
    """
    frame = pd.DataFrame({'var.name': [1, 2, 3], 'class': [1, 2, 3]})
    column_verbs = (define, create)

    # A bare dotted name is expected to fail with NameError.
    for verb in column_verbs:
        with pytest.raises(NameError):
            frame >> verb(y='var.name')

    # A keyword used inside a larger expression is expected to fail
    # with SyntaxError.
    for verb in column_verbs:
        with pytest.raises(SyntaxError):
            frame >> verb(y='class+1')

    with pytest.raises(SyntaxError):
        frame >> arrange('class+1')

    # Quoted names, and the keyword used alone, must not raise.
    for expression in ('Q("var.name")', 'Q("class")', 'class'):
        for verb in column_verbs:
            frame >> verb(y=expression)

    frame >> arrange('class')
    frame >> arrange('Q("class")+1')
def test_data_as_first_argument():
    """Check that verbs accept the dataframe as their first argument.

    For each verb, calling ``verb(df, ...)`` is compared against the
    piped form ``df >> verb(...)``. Random samplers are compared by
    length only, since the sampled rows are not compared here.
    """
    def equals(df1, df2):
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')),
                  df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The comparison must sit outside len(): `len(head(df, 4) == 4)` takes
    # the length of a boolean frame and passes for any non-empty result,
    # so it never checked the number of rows returned.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
def test_arrange():
    """Check the row order produced by ``arrange`` via the result index."""
    # Index  0, 1, 2, 3, 4, 5
    df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0],
                       'y': [1, 2, 3, 4, 5, 6]})

    by_x = df >> arrange('x')
    assert by_x.index.equals(pd.Index([5, 0, 2, 3, 4, 1]))

    by_x_then_desc_y = df >> arrange('x', '-y')
    assert by_x_then_desc_y.index.equals(pd.Index([5, 0, 3, 2, 4, 1]))

    by_sin_y = df >> arrange('np.sin(y)')
    assert by_sin_y.index.equals(pd.Index([4, 3, 5, 2, 0, 1]))

    # Branches
    # With no sort keys the very same object is expected back.
    untouched = df >> arrange()
    assert untouched is df

    # Sorting by x and then by y ends up in the original order, because
    # y is already sorted in `df`.
    round_trip = df >> arrange('x') >> arrange('y')
    assert round_trip.index.equals(df.index)

    # Bad index: labels are duplicated, so they cannot identify rows.
    df_bad = df.copy()
    df_bad.index = [0, 1, 0, 1, 0, 1]

    bad_by_x = df_bad >> arrange('x')
    assert bad_by_x.index.equals(pd.Index([1, 0, 0, 1, 0, 1]))

    bad_by_x_then_desc_y = df_bad >> arrange('x', '-y')
    assert bad_by_x_then_desc_y.index.equals(pd.Index([1, 0, 1, 0, 0, 1]))
def summarize_fd_by_subject(df):
    """Summarize framewise displacement per group and reshape to long form.

    Rows of ``df`` are grouped by subject_id, condition, data_id and
    headcase. Five summaries of the ``FramewiseDisplacement`` column are
    computed per group, then melted so that each summary becomes one row
    with its name in ``measure`` and its value in ``val``. The result is
    sorted by ``subject_id`` and its index is reset.

    The summary expressions call ``mean``, ``median``, ``filter_mean``,
    ``filter_median`` and ``perc_high_motion``; these are not defined in
    this function and are presumably resolved from the surrounding
    module when the expressions are evaluated.
    """
    id_columns = ["subject_id", "data_id", "condition", "headcase"]
    measure_columns = [
        "fd_mean",
        "fd_median",
        "fd_mean_filter",
        "fd_median_filter",
        "perc_spikes",
    ]

    def to_long(frame):
        # One row per (group, measure) pair.
        return frame.melt(
            id_vars=id_columns,
            value_vars=measure_columns,
            var_name="measure",
            value_name="val",
        )

    grouped = df >> p.group_by("subject_id", "condition", "data_id", "headcase")
    per_group = grouped >> p.summarize(
        fd_mean="mean(FramewiseDisplacement)",
        fd_median="median(FramewiseDisplacement)",
        fd_mean_filter="filter_mean(FramewiseDisplacement)",
        fd_median_filter="filter_median(FramewiseDisplacement)",
        perc_spikes="perc_high_motion(FramewiseDisplacement)",
    )
    long_form = per_group >> p.do(to_long)
    ordered = long_form >> p.arrange("subject_id")
    return ordered >> p.call(".reset_index", drop=True)
def test_arrange():
    """Check ``arrange`` by the values of the sorted columns.

    Also covers the no-argument branch, the ``reset_index=False`` option,
    a frame with duplicated index labels, and a frame whose index is not
    increasing.
    """
    # Index  0, 1, 2, 3, 4, 5
    df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0],
                       'y': [1, 2, 3, 4, 5, 6]})

    out = df >> arrange('x')
    assert (out['x'] == [0, 1, 2, 2, 4, 5]).all()
    assert (out['y'] == [6, 1, 3, 4, 5, 2]).all()

    out = df >> arrange('x', '-y')
    assert (out['x'] == [0, 1, 2, 2, 4, 5]).all()
    assert (out['y'] == [6, 1, 4, 3, 5, 2]).all()

    out = df >> arrange('np.sin(y)')
    assert (out['x'] == [4, 2, 0, 2, 1, 5]).all()
    assert (out['y'] == [5, 4, 6, 3, 1, 2]).all()

    # Branches
    # With no sort keys the very same object is expected back.
    out = df >> arrange()
    assert out is df

    # Sorting by x and then by y ends up in the original order, because
    # y is already sorted in `df`.
    out = df >> arrange('x') >> arrange('y')
    assert out.index.equals(df.index)

    # Do not reset index: the original labels travel with the rows.
    out = df >> arrange('x', reset_index=False)
    assert out.index.equals(pd.Index([5, 0, 2, 3, 4, 1]))

    # Bad index: labels are duplicated, so they cannot identify rows.
    df_bad = df.copy()
    df_bad.index = [0, 1, 0, 1, 0, 1]

    out = df_bad >> arrange('x')
    assert (out['x'] == [0, 1, 2, 2, 4, 5]).all()

    out = df_bad >> arrange('x', '-y')
    assert (out['x'] == [0, 1, 2, 2, 4, 5]).all()
    assert (out['y'] == [6, 1, 4, 3, 5, 2]).all()

    # A computation on a non-increasing index
    shuffled_index = [5, 0, 2, 3, 4, 1]
    df2 = pd.DataFrame(
        {'x': [0, 1, 2, 2, 4, 5], 'y': [6, 1, 3, 4, 5, 2]},
        index=shuffled_index,
    )
    out = df2 >> arrange('-y')
    assert (out['y'] == [6, 5, 4, 3, 2, 1]).all()
def summarize_mpars_by_subject(df):
    """Summarize the six motion parameters per group and reshape to long form.

    Rows of ``df`` are grouped by subject_id, condition, data_id and
    headcase. For each of the columns x, y, z, pitch, roll and yaw the
    mean, median and std are computed per group. The summaries are then
    melted so that each becomes one row with its name in ``measure`` and
    its value in ``val``. The result is sorted by ``subject_id`` and its
    index is reset.

    ``mean``, ``median`` and ``std`` are not defined in this function
    and are presumably resolved from the surrounding module when the
    expressions are evaluated.
    """
    id_columns = ["subject_id", "data_id", "condition", "headcase"]

    # Output column name -> expression evaluated per group.
    summary_spec = {
        "x_mean": "mean(x)",
        "x_median": "median(x)",
        "x_std": "std(x)",
        "y_mean": "mean(y)",
        "y_median": "median(y)",
        "y_std": "std(y)",
        "z_mean": "mean(z)",
        "z_median": "median(z)",
        "z_std": "std(z)",
        "pitch_mean": "mean(pitch)",
        "pitch_median": "median(pitch)",
        "pitch_std": "std(pitch)",
        "roll_mean": "mean(roll)",
        "roll_median": "median(roll)",
        "roll_std": "std(roll)",
        "yaw_mean": "mean(yaw)",
        "yaw_median": "median(yaw)",
        "yaw_std": "std(yaw)",
    }

    # Order in which the summaries are listed when melting.
    measure_columns = [
        "x_mean",
        "y_mean",
        "z_mean",
        "x_median",
        "y_median",
        "z_median",
        "x_std",
        "y_std",
        "z_std",
        "pitch_mean",
        "roll_mean",
        "yaw_mean",
        "pitch_median",
        "roll_median",
        "yaw_median",
        "pitch_std",
        "roll_std",
        "yaw_std",
    ]

    grouped = df >> p.group_by("subject_id", "condition", "data_id", "headcase")
    per_group = grouped >> p.summarize(**summary_spec)
    long_form = per_group >> p.call(
        ".melt",
        id_vars=id_columns,
        value_vars=measure_columns,
        var_name="measure",
        value_name="val",
    )
    ordered = long_form >> p.arrange("subject_id")
    return ordered >> p.call(".reset_index", drop=True)
def test_data_mutability():
    """Pin which verbs modify their input data, with and without the
    ``modify_input_data`` option switched on.
    """
    # These tests affirm that we know the consequences of the verbs.
    # A test in the Mutable section should not fail without a change
    # in implementation. That change should be triggered when Pandas
    # implements a consistent copy-on-write policy.
    #
    # When a test in the mutable section fails, it is bad news. There
    # should be no memory usage gains by reusing the original data,
    # except for the case of `rename`.
    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    # Default to not mutable
    df >> define(z='x**2')
    assert 'z' not in df

    df >> group_by(z='x**2')
    assert 'z' not in df

    arr = df >> pull('x')
    arr[0] = 99
    assert df.loc[0, 'x'] != 99

    df2 = df >> slice_rows(3)
    df2.loc[0, 'x'] = 999
    assert df.loc[0, 'x'] != 999

    # The option is global state. Restore it in a `finally` so that a
    # failing assertion below cannot leak `modify_input_data=True` into
    # tests that run afterwards.
    set_option('modify_input_data', True)
    try:
        # Mutable
        df2 = df.copy()
        df2 >> define(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        df2 >> group_by(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        arr = df2 >> pull('x')
        arr[0] = 99
        assert df2.loc[0, 'x'] == 99

        # Not mutable
        df2 = df.copy()
        df2 >> create(z='x**2')
        assert 'z' not in df2

        df2 >> sample_n(3) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> sample_frac(.5) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x', 'y') >> define(z='x**2')
        assert 'z' not in df2

        # dataframe.rename has copy-on-write (if copy=False) that affects
        # only the new frame. This creates possibility for "action at a
        # distance" effects on the new frame when the original is modified
        result = df2 >> rename(x='z')
        df2['y'] = 3
        result['x'] = 4
        assert 'z' not in df2
        assert df2.loc[0, 'y'] != 4
        assert result.loc[0, 'x'] != 3
        assert result is df2

        df2 >> arrange('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> query('x%2') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> group_indices(z='x%2')
        assert 'z' not in df2
    finally:
        set_option('modify_input_data', False)
def test_arrange(self):
    """Arranging by a newly defined column must yield a GroupedDataFrame.

    ``self.df`` is set up outside this method; presumably it is a
    grouped frame, which is why the grouped type is expected back.
    """
    with_z = self.df >> define(z='np.sin(x)')
    ordered = with_z >> arrange('z')
    assert isinstance(ordered, GroupedDataFrame)