def test_data_as_first_argument():
    """Check that verbs accept the dataframe as their first argument.

    For every verb, calling ``verb(df, ...)`` is compared against the
    piped form ``df >> verb(...)``. The sampling verbs are only compared
    by length, and ``head``/``tail`` are checked for the number of rows
    they return.
    """
    def equals(df1, df2):
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    # Sampling picks rows at random, so only the sizes are comparable
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')),
                  df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The comparison belongs outside len(); with `== 4` inside the call
    # the assertion only measured the length of a boolean frame and
    # could never fail for a non-empty result.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
def test_query():
    """query keeps the rows matching the expression, including `@` names."""
    frame = pd.DataFrame({
        'x': [0, 1, 2, 3, 4, 5],
        'y': [0, 0, 1, 1, 2, 3],
    })
    # Referenced from the query string as `@c`, so it has to live in
    # this frame under this exact name.
    c = 3  # noqa: F841

    evens = frame >> query('x % 2 == 0')
    assert all(evens['x'] == [0, 2, 4])

    above_c = frame >> query('x > @c')
    assert all(above_c['x'] == [4, 5])
def test_query():
    """query filters rows, resolves `@` names, and can keep the index."""
    frame = pd.DataFrame({
        'x': [0, 1, 2, 3, 4, 5],
        'y': [0, 0, 1, 1, 2, 3],
    })
    # Referenced from the query string as `@c`, so it has to live in
    # this frame under this exact name.
    c = 3  # noqa: F841

    evens = frame >> query('x % 2 == 0')
    assert all(evens['x'] == [0, 2, 4])

    above_c = frame >> query('x > @c')
    assert all(above_c['x'] == [4, 5])

    # With reset_index=False the surviving rows keep their original labels
    evens_kept_index = frame >> query('x % 2 == 0', reset_index=False)
    expected_labels = pd.Index([0, 2, 4])
    assert evens_kept_index.index.equals(expected_labels)
def test_define():
    """Exercise `define` with every supported kind of argument.

    Covers the no-argument call, tuple and keyword column
    specifications, array and scalar values, length-mismatch errors,
    grouped data, results carrying a non-range index, categorical
    columns, filtered-then-grouped data, the ban on redefining a group
    column, and Series values.
    """
    x = np.array([1, 2, 3])
    y = np.array([4, 5, 6])
    df = pd.DataFrame({'x': x})

    # No args
    df2 = df >> define()
    assert len(df2.columns) == 1

    # All types of args
    df2 = df >> define(
        ('x*2', 'x*2'),
        ('x*3', 'x*3'),
        x_sq='x**2',
        x_cumsum='np.cumsum(x)',
        y=y,
        w=9)

    assert len(df2.columns) == 7
    assert all(df2['x*2'] == x * 2)
    assert all(df2['x*3'] == x * 3)
    assert all(df2['x_sq'] == x**2)
    assert all(df2['x_cumsum'] == np.cumsum(x))
    assert all(df2['y'] == y)
    assert all(df2['w'] == 9)

    result = df >> define('x*4')
    assert len(result.columns) == 2

    # Branches
    with pytest.raises(ValueError):
        df >> define(z=[1, 2, 3, 4])

    # Works with group_by
    result = df >> group_by('x < 3') >> define(z='len(x)')
    assert all(result['z'] == [2, 2, 1])

    # Potentially problematic index
    def non_range_index_func(s):
        return pd.Series([11, 12, 13], index=[21, 22, 23])

    result = df >> define(z='non_range_index_func(x)')
    assert all(result['z'] == [11, 12, 13])

    # Can create categorical column
    result = df >> define(xcat='pd.Categorical(x)')
    assert all(result['xcat'] == result['x'])
    # is_categorical_dtype is deprecated since pandas 2.1; checking the
    # dtype instance is the documented replacement.
    assert isinstance(result['xcat'].dtype, pdtypes.CategoricalDtype)

    # Messing with indices
    result = (df
              >> query('x >= 2')
              >> group_by('x')
              >> define(y='x'))
    assert all(result['x'] == result['y'])

    # Do not modify group column
    with pytest.raises(ValueError):
        df >> group_by('x') >> define(x='2*x')

    # Series-like iterables
    # https://github.com/has2k1/plydata/issues/21
    result = df >> define(y=pd.Series(y))
    assert all(result['y'] == y)
def combine_sherlock_view_runs_and_replace(df):
    """
    Collapse the per-run 'view' rows of the sherlock data into one
    'view' row and return the recombined frame.

    The input is a summarized dataframe (one datapoint per subject per
    condition/motion-direction). Rows with data_id 'sherlock' and a
    condition other than 'recall' (i.e. the separate view runs) are
    averaged over `val` within each (subject_id, measure, data_id,
    headcase) group and relabelled with condition 'view'. They are then
    stacked under every input row whose condition is already 'view' or
    'recall', so the result no longer contains 'view1'/'view2' rows.

    Rationale: realignment is computed on a per run basis and sherlock
    subjects have 2 'view' runs, so their summary statistics (e.g. mean
    FD) are computed as: (mean of run1 + mean of run2) / 2

    Returns a dataframe with a fresh 0..n-1 index.
    """
    grouping_keys = ("subject_id", "measure", "data_id", "headcase")
    output_columns = (
        "subject_id", "data_id", "condition", "headcase", "measure", "val",
    )

    # Sherlock rows from the individual view runs
    view_runs = df >> p.query("data_id == 'sherlock' and condition != 'recall'")

    # One averaged value per subject/measure/headcase
    averaged = (
        view_runs
        >> p.group_by(*grouping_keys)
        >> p.summarize(val="mean(val)")
    )

    # Label the averaged rows as a single 'view' condition and put the
    # columns in the expected order
    relabelled = averaged >> p.call(".assign", condition="view")
    sherlock_combined = relabelled >> p.select(*output_columns)

    # Everything already labelled 'view' or 'recall' is kept untouched
    untouched = df.query("condition == 'view' or condition == 'recall'")

    stacked = pd.concat([untouched, sherlock_combined], axis=0)
    return stacked.reset_index(drop=True)
def test_data_mutability():
    """Pin down which verbs modify their input dataframe.

    The first section runs with the default options and checks that
    the input is left alone. The rest runs with the
    'modify_input_data' option switched on; the option is always
    switched back off afterwards, even when an assertion fails, so the
    setting cannot leak into other tests.
    """
    # These tests affirm that we know the consequences of the verbs.
    # A test in the Mutable section should not fail without a change
    # in implementation. That change should be triggered when Pandas
    # implements a consistent copy-on-write policy.
    #
    # When a test in the mutable section fails, it is bad news. There
    # should be no memory usage gains by reusing the original data,
    # except for the case of `rename`.
    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    # Default to not mutable
    df >> define(z='x**2')
    assert 'z' not in df

    df >> group_by(z='x**2')
    assert 'z' not in df

    arr = df >> pull('x')
    arr[0] = 99
    assert df.loc[0, 'x'] != 99

    df2 = df >> slice_rows(3)
    df2.loc[0, 'x'] = 999
    assert df.loc[0, 'x'] != 999

    set_option('modify_input_data', True)
    try:
        df2 = df.copy()
        df2 >> define(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        df2 >> group_by(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        arr = df2 >> pull('x')
        arr[0] = 99
        assert df2.loc[0, 'x'] == 99

        # Not mutable
        df2 = df.copy()
        df2 >> create(z='x**2')
        assert 'z' not in df2

        df2 >> sample_n(3) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> sample_frac(.5) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x', 'y') >> define(z='x**2')
        assert 'z' not in df2

        # dataframe.rename has copy-on-write (if copy=False) that affects
        # only the new frame. This creates possibility for "action at a
        # distance" effects on the new frame when the original is modified
        #
        # NOTE(review): the identity assertion below expects the frame
        # returned by rename to be df2 itself, while the comment above
        # speaks of a "new frame" -- confirm which one matches the
        # current rename implementation.
        result = df2 >> rename(x='z')
        df2['y'] = 3
        result['x'] = 4
        assert 'z' not in df2
        assert df2.loc[0, 'y'] != 4
        assert result.loc[0, 'x'] != 3
        assert result is df2

        df2 >> arrange('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> query('x%2') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> group_indices(z='x%2')
        assert 'z' not in df2
    finally:
        # Restore the option even if an assertion above failed
        set_option('modify_input_data', False)
def test_query(self):
    """query on self.df keeps column 'x' and yields a GroupedDataFrame."""
    keep_even_x = query('x % 2 == 0')
    filtered = self.df >> keep_even_x
    assert 'x' in filtered
    assert isinstance(filtered, GroupedDataFrame)