def test_Q():
    """
    Check how verbs treat column names that are not valid identifiers.

    The frame has one column with a dot in its name (``var.name``) and
    one named after a Python keyword (``class``). Using them in plain
    expressions is expected to raise; wrapping them in ``Q("...")`` or
    passing the bare keyword name is expected to succeed.
    """
    df = pd.DataFrame({
        'var.name': [1, 2, 3],
        'class': [1, 2, 3],
    })

    # (expected exception, verb, keyword arguments) for the failing cases.
    # The verb is instantiated inside the loop body so that construction
    # and application both happen under ``pytest.raises``.
    rejected = (
        (NameError, define, {'y': 'var.name'}),
        (NameError, create, {'y': 'var.name'}),
        (SyntaxError, define, {'y': 'class+1'}),
        (SyntaxError, create, {'y': 'class+1'}),
    )
    for error, verb, kwargs in rejected:
        with pytest.raises(error):
            df >> verb(**kwargs)

    with pytest.raises(SyntaxError):
        df >> arrange('class+1')

    # Expressions that must evaluate without raising
    for expr in ('Q("var.name")', 'Q("class")', 'class'):
        for verb in (define, create):
            df >> verb(y=expr)

    for expr in ('class', 'Q("class")+1'):
        df >> arrange(expr)
def test_data_as_first_argument():
    """
    Verbs called as ``verb(data, ...)`` must match ``data >> verb(...)``.

    Each verb is invoked both ways and the two results are compared,
    either with ``DataFrame.equals`` or, for the sampling verbs whose
    rows are random, by length only.
    """
    def equals(df1, df2):
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    # Copies are passed to define/rename so that neither call can see
    # changes made by the other.
    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')),
                  df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The closing parenthesis of len() used to wrap the comparison
    # (``len(head(df, 4) == 4)``), which only asserted that a boolean
    # frame was non-empty. Compare the row count itself.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
def test_options_context():
    """
    Exercise the ``options`` context manager with ``modify_input_data``.

    Covers: the option is switched on only inside the ``with`` block,
    ``define`` returns the input frame itself while it is on, the
    option is restored when the block exits through an exception, and
    asking for an unknown option raises ``ValueError``.
    """
    option = 'modify_input_data'

    # Straight test
    set_option(option, False)
    assert not get_option(option)
    with options(modify_input_data=True):
        assert get_option(option)
    assert not get_option(option)

    # With some data
    df = pd.DataFrame({'x': [0, 1, 2, 3]})

    outside_before = df >> define(y='2*x')
    assert not df.equals(outside_before)

    with options(modify_input_data=True):
        inside = df >> define(z='3*x')
        assert df.equals(inside)
        assert df is inside

    outside_after = df >> define(w='4*x')
    assert not df.equals(outside_after)

    # The options context manager should not muffle an exception.
    with pytest.raises(ValueError), options(modify_input_data=True):
        raise ValueError()

    # The above exception should not leave a modified option
    assert not get_option(option)

    with pytest.raises(ValueError):
        assert not get_option('time_travel')
def test_define():
    """
    Exercise ``define`` on a pandas DataFrame.

    Covers: no arguments, positional ``(name, expression)`` tuples,
    keyword expressions, array and scalar values, a single positional
    expression, a length mismatch (``ValueError``), grouped frames,
    results with a non-range index, categorical output, filtered and
    grouped input, redefining a grouping column (``ValueError``) and
    ``pd.Series`` values.
    """
    x = np.array([1, 2, 3])
    y = np.array([4, 5, 6])
    df = pd.DataFrame({'x': x})

    # No args
    df2 = df >> define()
    assert len(df2.columns) == 1

    # All types of args
    df2 = df >> define(('x*2', 'x*2'),
                       ('x*3', 'x*3'),
                       x_sq='x**2',
                       x_cumsum='np.cumsum(x)',
                       y=y,
                       w=9)

    assert len(df2.columns) == 7
    assert all(df2['x*2'] == x * 2)
    assert all(df2['x*3'] == x * 3)
    assert all(df2['x_sq'] == x**2)
    assert all(df2['x_cumsum'] == np.cumsum(x))
    assert all(df2['y'] == y)
    assert all(df2['w'] == 9)

    result = df >> define('x*4')
    assert len(result.columns) == 2

    # Branches
    with pytest.raises(ValueError):
        df >> define(z=[1, 2, 3, 4])

    # Works with group_by
    result = df >> group_by('x < 3') >> define(z='len(x)')
    assert all(result['z'] == [2, 2, 1])

    # Potentially problematic index.
    # The function is referenced by name from the expression string
    # below, so it must stay a local of this test under this name.
    def non_range_index_func(s):
        return pd.Series([11, 12, 13], index=[21, 22, 23])

    result = df >> define(z='non_range_index_func(x)')
    assert all(result['z'] == [11, 12, 13])

    # Can create categorical column
    result = df >> define(xcat='pd.Categorical(x)')
    assert all(result['xcat'] == result['x'])
    # is_categorical_dtype() is deprecated in recent pandas releases;
    # checking the dtype instance is the documented replacement.
    assert isinstance(result['xcat'].dtype, pd.CategoricalDtype)

    # Messing with indices
    result = (df
              >> query('x >= 2')
              >> group_by('x')
              >> define(y='x'))
    assert all(result['x'] == result['y'])

    # Do not modify group column
    with pytest.raises(ValueError):
        df >> group_by('x') >> define(x='2*x')

    # Series-like iterables
    # https://github.com/has2k1/plydata/issues/21
    result = df >> define(y=pd.Series(y))
    assert all(result['y'] == y)
def test_DataOperator():
    """
    Check two behaviours of a verb used as a data operator.

    Piping a ``set`` into ``define`` must raise ``TypeError``, and a
    verb created without data can be called on the data afterwards.
    """
    unsupported = {1, 2, 3}  # unrecognized datastore
    data = pd.DataFrame({'x': [1, 2, 3],
                         'y': [1, 2, 3]})

    with pytest.raises(TypeError):
        unsupported >> define(z='x')

    # Currying
    verb = define(z=[3, 2, 1])
    result = verb(data)
    for column in ('x', 'y', 'z'):
        assert column in result
def test_distinct():
    """
    Check which row labels survive ``distinct`` for each call form.

    Every case compares the index of the result with the expected
    labels; the final case passes one positional argument too many and
    expects an exception.
    """
    # Index                  0, 1, 2, 3, 4, 5, 6
    df = pd.DataFrame({'x': [1, 1, 2, 3, 4, 4, 5],
                       'y': [1, 2, 3, 4, 5, 5, 6]})

    def keeps_rows(frame, labels):
        """Return True if ``frame`` is indexed by exactly ``labels``."""
        return frame.index.equals(pd.Index(labels))

    assert keeps_rows(df >> distinct(), [0, 1, 2, 3, 4, 6])
    assert keeps_rows(df >> distinct(('x', 'y'), z='x+1'),
                      [0, 1, 2, 3, 4, 6])
    assert keeps_rows(df >> distinct('last'), [0, 1, 2, 3, 5, 6])
    assert keeps_rows(df >> distinct(False), [0, 1, 2, 3, 6])
    assert keeps_rows(df >> distinct(['x']), [0, 2, 3, 4, 6])
    assert keeps_rows(df >> distinct(['x'], 'last'), [1, 2, 3, 5, 6])
    assert keeps_rows(df >> distinct(z='x%2'), [0, 2])

    # Defining the column first and passing it as a keyword argument
    # must give the same frame.
    via_define = df >> define(z='x%2') >> distinct(['x', 'z'])
    via_keyword = df >> distinct(['x'], z='x%2')
    assert via_define.equals(via_keyword)

    with pytest.raises(Exception):
        df >> distinct(['x'], 'last', 'cause_exception')
def _apply_transforms(df, definitions):
    """
    Compute derived columns and return only those columns.

    df (Pandas.DataFrame): Dataframe containing raw data queried from
        Census API
    definitions (List[Tuple[str, str]]): List of (name, definition)
        pairs. Each definition is a string holding a valid Python
        expression, and may refer to other columns in df by name.

        Example:
        [
            (
                "Column Name",
                "(B02001_001E - B02001_002E) / B02001_001E"
            )
        ]

        The expression above uses the columns for census variables
        B02001_001E (population, all races) and B02001_002E
        (population, white) to calculate the proportion of a
        geography's population identifying as a race other than White.

        See https://plydata.readthedocs.io/en/latest/generated/plydata.one_table_verbs.define.html
        for more on how these expressions are evaluated

    returns (Pandas DataFrame): Dataframe containing transformed columns
    """
    # Remember the input columns before defining anything so that they
    # can be dropped from the result afterwards.
    raw_columns = df.columns.values
    transformed = plydata.define(df, *definitions)
    return transformed.drop(raw_columns, axis=1)
def create_readme_image():
    """
    Plot a sine curve coloured by sign and save it as readme-image.png.

    The data is 500 points of ``x`` between 0 and 2*pi. ``y`` is
    ``sin(x)`` and ``sign`` is "positive" where ``y>=0`` and "negative"
    elsewhere.
    """
    points = pd.DataFrame({'x': np.linspace(0, 2 * np.pi, 500)})

    # Build the data step by step, then pipe it into the plot
    with_sine = points >> define(y='np.sin(x)')
    labelled = with_sine >> define_where(
        'y>=0', sign=('"positive"', '"negative"'))
    layers = (ggplot(aes('x', 'y'))
              + geom_line(aes(color='sign'), size=1.5))
    plot = labelled >> layers

    plot.save('readme-image.png', width=6, height=4)
def test_define():
    """
    Exercise ``define`` on a ``custom_dict`` data store.

    Covers a call without arguments and a call mixing positional
    ``(name, expression)`` tuples, keyword expressions and an array.
    """
    x = np.array([1, 2, 3])
    y = np.array([4, 5, 6])
    d = custom_dict({'x': x})

    # No args
    d >> define()
    assert len(d) == 1

    # All types of args
    result = d >> define(
        ('x*2', 'x*2'),
        ('x*3', 'x*3'),
        x_sq='x**2',
        x_cumsum='np.cumsum(x)',
        y=y,
    )
    assert len(result) == 6

    # Expected values for every newly defined entry
    expected = {
        'x*2': x * 2,
        'x*3': x * 3,
        'x_sq': x**2,
        'x_cumsum': np.cumsum(x),
        'y': y,
    }
    for key, values in expected.items():
        assert all(result[key] == values)
def test_call():
    """
    Check ``call`` with a plain function and with dataframe methods.

    A method is named by a string with a leading dot and may receive
    extra keyword arguments.
    """
    def remove_column_a(df):
        trimmed = df.copy()
        del trimmed['a']
        return trimmed

    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [4, 5, np.nan]})

    # External function
    without_a = df >> call(remove_column_a)
    assert 'a' not in without_a
    assert 'b' in without_a

    # dataframe method
    complete_rows = df >> call('.dropna')
    assert len(complete_rows) == 2

    # dataframe method with arguments
    extended = df >> define(c='a*2')
    complete_columns = extended >> call('.dropna', axis=1)
    assert 'a' in complete_columns
    assert 'b' not in complete_columns
    assert 'c' in complete_columns
def test_data_mutability():
    """
    Pin down which verbs modify their input dataframe.

    The first part runs with the option as it is on entry and expects
    the input to be untouched. The rest runs with ``modify_input_data``
    set to True and records which verbs then change the input and which
    still do not. The option is set back to False on exit, even when an
    assertion fails.
    """
    # These tests affirm that we know the consequences of the verbs.
    # A test in the Mutable section should not fail without a change
    # in implementation. That change should be triggered when Pandas
    # implements a consistent copy-on-write policy.
    #
    # When a test in the mutable section fails, it is bad news. There
    # should be no memory usage gains by reusing the original data,
    # except for the case of `rename`.
    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
                       'y': [0, 0, 1, 1, 2, 3]})

    # Default to not mutable
    df >> define(z='x**2')
    assert 'z' not in df

    df >> group_by(z='x**2')
    assert 'z' not in df

    arr = df >> pull('x')
    arr[0] = 99
    assert df.loc[0, 'x'] != 99

    df2 = df >> slice_rows(3)
    df2.loc[0, 'x'] = 999
    assert df.loc[0, 'x'] != 999

    set_option('modify_input_data', True)
    try:
        # Mutable
        df2 = df.copy()
        df2 >> define(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        df2 >> group_by(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        arr = df2 >> pull('x')
        arr[0] = 99
        assert df2.loc[0, 'x'] == 99

        # Not mutable
        df2 = df.copy()
        df2 >> create(z='x**2')
        assert 'z' not in df2

        df2 >> sample_n(3) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> sample_frac(.5) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x', 'y') >> define(z='x**2')
        assert 'z' not in df2

        # dataframe.rename has copy-on-write (if copy=False) that affects
        # only the new frame. This creates possibility for "action at a
        # distance" effects on the new frame when the original is modified
        result = df2 >> rename(x='z')
        df2['y'] = 3
        result['x'] = 4
        assert 'z' not in df2
        assert df2.loc[0, 'y'] != 4
        assert result.loc[0, 'x'] != 3
        assert result is df2

        df2 >> arrange('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> query('x%2') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> group_indices(z='x%2')
        assert 'z' not in df2
    finally:
        # Always switch the global option back off. Without this, one
        # failing assertion above would leave it on for every test that
        # runs afterwards.
        set_option('modify_input_data', False)
def test_arrange(self):
    """The result of ``arrange`` on ``self.df`` is a GroupedDataFrame."""
    with_sine = self.df >> define(z='np.sin(x)')
    ordered = with_sine >> arrange('z')
    assert isinstance(ordered, GroupedDataFrame)
def test_define(self):
    """The result of ``define`` on a copy of ``self.df`` is a GroupedDataFrame."""
    frame = self.df.copy()
    defined = frame >> define(z='2*x')
    assert isinstance(defined, GroupedDataFrame)
def test_define(self):
    """Run the shared ``self._test`` check on a ``define`` verb."""
    self._test(define(y='x*2'))