def test_data_as_first_argument():
    """Check that verbs accept the dataframe as their first argument.

    For each verb, calling ``verb(df, ...)`` is compared against the
    piped form ``df >> verb(...)``.
    """
    def equals(df1, df2):
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    # Copies are passed to define and rename so that the shared
    # dataframe is never the direct input of those two verbs.
    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))

    # For the sampling verbs only the number of rows is compared
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))

    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')),
                  df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The comparison with 4 must be outside len(); with it inside,
    # the assertion only checked that a comparison result was non-empty.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
def test_create():
    """Exercise the ``create`` verb with the kinds of arguments it accepts."""
    x = np.array([1, 2, 3])
    y = np.array([4, 5, 6])
    df = pd.DataFrame({'x': x})

    # No args
    empty = df >> create()
    assert len(empty.columns) == 0

    # All types of args
    full = df >> create(
        ('x*2', 'x*2'),
        ('x*3', 'x*3'),
        x_sq='x**2',
        x_cumsum='np.cumsum(x)',
        y=y,
        w=9)
    assert len(full.columns) == 6
    expected_columns = (
        ('x*2', x * 2),
        ('x*3', x * 3),
        ('x_sq', x**2),
        ('x_cumsum', np.cumsum(x)),
        ('y', y),
        ('w', 9),
    )
    for name, values in expected_columns:
        assert all(full[name] == values)

    # A bare expression names the column after itself
    single = df >> create('x*4')
    assert len(single.columns) == 1
    assert all(single['x*4'] == x * 4)

    # Branches
    with pytest.raises(ValueError):
        df >> create(z=[1, 2, 3, 4])

    # Works with group_by
    grouped = df >> group_by('x < 3') >> create(z='len(x)')
    assert all(grouped['z'] == [2, 2, 1])
def test_Q():
    """Check how expressions treat awkward column names, with and without Q().

    The frame has a column whose name contains a dot (``var.name``) and
    one named after a Python keyword (``class``).
    """
    df = pd.DataFrame({'var.name': [1, 2, 3],
                       'class': [1, 2, 3]})
    column_makers = (define, create)

    # A dotted name used in an expression is expected to raise NameError
    for verb in column_makers:
        with pytest.raises(NameError):
            df >> verb(y='var.name')

    # A keyword inside a larger expression is expected to raise SyntaxError
    for verb in column_makers:
        with pytest.raises(SyntaxError):
            df >> verb(y='class+1')

    with pytest.raises(SyntaxError):
        df >> arrange('class+1')

    # Q()-quoted names, and the bare keyword alone, must not raise
    for expression in ('Q("var.name")', 'Q("class")', 'class'):
        for verb in column_makers:
            df >> verb(y=expression)

    for expression in ('class', 'Q("class")+1'):
        df >> arrange(expression)
def test_data_mutability():
    """Affirm which verbs modify their input dataframe and which do not.

    The ``modify_input_data`` option is switched on for part of the test
    and is always switched back off, even when an assertion fails.
    """
    # These tests affirm that we know the consequences of the verbs.
    # A test in the Mutable section should not fail without a change
    # in implementation. That change should be triggered when Pandas
    # implements a consistent copy-on-write policy.
    #
    # When a test in the mutable section fails, it is bad news. There
    # should be no memory usage gains by reusing the original data,
    # except for the case of `rename`.
    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    # Default to not mutable
    df >> define(z='x**2')
    assert 'z' not in df

    df >> group_by(z='x**2')
    assert 'z' not in df

    arr = df >> pull('x')
    arr[0] = 99
    assert df.loc[0, 'x'] != 99

    df2 = df >> slice_rows(3)
    df2.loc[0, 'x'] = 999
    assert df.loc[0, 'x'] != 999

    set_option('modify_input_data', True)
    # The option is global state: restore it in ``finally`` so that a
    # failing assertion below cannot leak it into other tests.
    try:
        df2 = df.copy()
        df2 >> define(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        df2 >> group_by(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        arr = df2 >> pull('x')
        arr[0] = 99
        assert df2.loc[0, 'x'] == 99

        # Not mutable
        df2 = df.copy()
        df2 >> create(z='x**2')
        assert 'z' not in df2

        df2 >> sample_n(3) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> sample_frac(.5) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x', 'y') >> define(z='x**2')
        assert 'z' not in df2

        # dataframe.rename has copy-on-write (if copy=False) that affects
        # only the new frame. This creates possibility for "action at a
        # distance" effects on the new frame when the original is modified
        result = df2 >> rename(x='z')
        df2['y'] = 3
        result['x'] = 4
        assert 'z' not in df2
        assert df2.loc[0, 'y'] != 4
        assert result.loc[0, 'x'] != 3
        assert result is df2

        df2 >> arrange('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> query('x%2') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> group_indices(z='x%2')
        assert 'z' not in df2
    finally:
        set_option('modify_input_data', False)
def test_create(self):
    """Check ``create`` on a copy of the fixture frame ``self.df``.

    The result must contain both ``x`` and the new column ``z`` and be a
    ``GroupedDataFrame``.
    """
    # NOTE(review): presumably self.df is grouped on 'x', which would
    # explain why 'x' is expected in the output -- confirm against the
    # fixture in the enclosing class.
    data = self.df.copy()
    result = data >> create(z='2*x')

    for column in ('x', 'z'):
        assert column in result
    assert isinstance(result, GroupedDataFrame)