예제 #1
0
def test_data_as_first_argument():
    """Verify that verbs accept the dataframe as their first argument.

    Each verb is invoked in the functional form ``verb(df, ...)`` and the
    outcome is compared with the piped form ``df >> verb(...)``.
    """
    def equals(df1, df2):
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # define and rename get fresh copies on both sides of the comparison
    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    # The sampling verbs are only compared by the number of rows returned
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')), df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The closing parenthesis must follow the verb call. Written as
    # ``len(head(df, 4) == 4)`` the length of a boolean frame is asserted,
    # which is always truthy and so verifies nothing.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
예제 #2
0
def test_query():
    """Check ``query`` row filtering, with and without a local variable.

    The second expression refers to the local ``c`` through the ``@``
    prefix, so that name has to stay defined in this function.
    """
    c = 3  # noqa: F841
    frame = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # Rows with an even x
    evens = frame >> query('x % 2 == 0')
    even_x = evens.loc[:, 'x']
    assert all(even_x == [0, 2, 4])

    # Rows with x above the local threshold
    above_c = frame >> query('x > @c')
    above_x = above_c.loc[:, 'x']
    assert all(above_x == [4, 5])
예제 #3
0
def test_query():
    """Check ``query`` row filtering, local references and ``reset_index``.

    The second expression refers to the local ``c`` through the ``@``
    prefix, so that name has to stay defined in this function.
    """
    c = 3  # noqa: F841
    frame = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # Rows with an even x
    evens = frame >> query('x % 2 == 0')
    even_x = evens.loc[:, 'x']
    assert all(even_x == [0, 2, 4])

    # Rows with x above the local threshold
    above_c = frame >> query('x > @c')
    above_x = above_c.loc[:, 'x']
    assert all(above_x == [4, 5])

    # With reset_index=False the surviving rows keep their original labels
    kept_labels = frame >> query('x % 2 == 0', reset_index=False)
    expected_index = pd.Index([0, 2, 4])
    assert kept_labels.index.equals(expected_index)
예제 #4
0
def test_define():
    """Exercise the ``define`` verb.

    Covers: no arguments, positional tuple and string arguments, keyword
    expressions, array/scalar/Series values, use after ``group_by``,
    values carrying a non-matching index, categorical results, and the
    cases that are expected to raise ``ValueError``.

    NOTE: several expression strings below refer to names by text
    (``x``, ``non_range_index_func``), so the local names in this
    function are part of what is being tested.
    """
    x = np.array([1, 2, 3])
    y = np.array([4, 5, 6])
    df = pd.DataFrame({'x': x})

    # No args: the frame still has its single column
    df2 = df >> define()
    assert len(df2.columns) == 1

    # All types of args: (name, expression) tuples, keyword expressions,
    # an array value and a scalar value
    df2 = df >> define(('x*2', 'x*2'), ('x*3', 'x*3'),
                       x_sq='x**2',
                       x_cumsum='np.cumsum(x)',
                       y=y,
                       w=9)

    # The original column plus the six defined ones
    assert len(df2.columns) == 7
    assert all(df2['x*2'] == x * 2)
    assert all(df2['x*3'] == x * 3)
    assert all(df2['x_sq'] == x**2)
    assert all(df2['x_cumsum'] == np.cumsum(x))
    assert all(df2['y'] == y)
    assert all(df2['w'] == 9)

    # A bare expression string adds one column
    result = df >> define('x*4')
    assert len(result.columns) == 2

    # Branches: four values for a three-row frame must raise
    with pytest.raises(ValueError):
        df >> define(z=[1, 2, 3, 4])

    # Works with group_by: the expression is evaluated per group, so the
    # two rows with x < 3 get 2 and the remaining row gets 1
    result = df >> group_by('x < 3') >> define(z='len(x)')
    assert all(result['z'] == [2, 2, 1])

    # Potentially problematic index: the returned Series is labelled
    # 21..23, which does not match the frame's own index
    def non_range_index_func(s):
        return pd.Series([11, 12, 13], index=[21, 22, 23])

    # The helper above is looked up by name from the expression string
    result = df >> define(z='non_range_index_func(x)')
    assert all(result['z'] == [11, 12, 13])

    # Can create categorical column
    result = df >> define(xcat='pd.Categorical(x)')
    assert all(result['xcat'] == result['x'])
    assert pdtypes.is_categorical_dtype(result['xcat'])

    # Messing with indices: filter rows first, then group and define
    result = (df >> query('x >= 2') >> group_by('x') >> define(y='x'))
    assert all(result['x'] == result['y'])

    # Do not modify group column
    with pytest.raises(ValueError):
        df >> group_by('x') >> define(x='2*x')

    # Series-like iterables
    # https://github.com/has2k1/plydata/issues/21
    result = df >> define(y=pd.Series(y))
    assert all(result['y'] == y)
예제 #5
0
def combine_sherlock_view_runs_and_replace(df):
    """
    Collapse the separate Sherlock viewing runs into one 'view' condition.

    `df` is a summarized dataframe (one datapoint per subject per
    condition/motion-direction). Realignment is computed per run and Sherlock
    subjects have two viewing runs ('view1' and 'view2'), whereas their summary
    statistics (e.g. mean FD) are defined as
    (mean of run1 + mean of run2) / 2.

    Steps:
      1. Take the rows with data_id 'sherlock' whose condition is not 'recall'.
      2. Average `val` within each subject_id/measure/data_id/headcase group.
      3. Label the averaged rows with condition 'view'.
      4. Append them to the rows of `df` whose condition is already 'view' or
         'recall', and renumber the index.

    Returns a frame that contains 'view' and 'recall' conditions only.
    """
    grouping_keys = ("subject_id", "measure", "data_id", "headcase")
    output_columns = ("subject_id", "data_id", "condition", "headcase", "measure", "val")

    # Sherlock rows for everything except recall (i.e. the viewing runs)
    sherlock_view_runs = df >> p.query("data_id == 'sherlock' and condition != 'recall'")

    # One averaged value per subject / measure / dataset / headcase
    averaged_runs = (
        sherlock_view_runs
        >> p.group_by(*grouping_keys)
        >> p.summarize(val="mean(val)")
    )

    # Tag the averages as the single 'view' condition and fix the column order
    sherlock_combined = (
        averaged_runs
        >> p.call(".assign", condition="view")
        >> p.select(*output_columns)
    )

    # Rows already labelled 'view' or 'recall'; 'view1'/'view2' rows drop out
    df_no_sherlock = df.query("condition == 'view' or condition == 'recall'")

    stacked = pd.concat([df_no_sherlock, sherlock_combined], axis=0)
    return stacked.reset_index(drop=True)
예제 #6
0
def test_data_mutability():
    """Pin down which verbs modify their input dataframe.

    The first part runs with the options as they are on entry and checks
    that the input is left untouched. The second part switches the
    ``modify_input_data`` option on and checks which verbs then write to
    the input ("Mutable") and which still do not ("Not mutable"). The
    option is switched back off in a ``finally`` clause so that a failing
    assertion cannot leave it on for tests that run afterwards.
    """
    # These tests affirm that we know the consequences of the verbs.
    # A test in the Mutable section should not fail without a change
    # in implementation. That change should be triggered when Pandas
    # implements a consistent copy-on-write policy.
    #
    # When a test in the mutable section fails, it is bad news. There
    # should be no memory usage gains by reusing the original data,
    # except for the case of `rename`.
    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # Default to not mutable
    df >> define(z='x**2')
    assert 'z' not in df

    df >> group_by(z='x**2')
    assert 'z' not in df

    arr = df >> pull('x')
    arr[0] = 99
    assert df.loc[0, 'x'] != 99

    df2 = df >> slice_rows(3)
    df2.loc[0, 'x'] = 999
    assert df.loc[0, 'x'] != 999

    set_option('modify_input_data', True)
    try:
        # Mutable
        df2 = df.copy()
        df2 >> define(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        df2 >> group_by(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        arr = df2 >> pull('x')
        arr[0] = 99
        assert df2.loc[0, 'x'] == 99

        # Not mutable
        df2 = df.copy()
        df2 >> create(z='x**2')
        assert 'z' not in df2

        df2 >> sample_n(3) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> sample_frac(.5) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x', 'y') >> define(z='x**2')
        assert 'z' not in df2

        # dataframe.rename has copy-on-write (if copy=False) that affects
        # only the new frame. This creates possibility for "action at a
        # distance" effects on the new frame when the original is modified
        result = df2 >> rename(x='z')
        df2['y'] = 3
        result['x'] = 4
        assert 'z' not in df2
        assert df2.loc[0, 'y'] != 4
        assert result.loc[0, 'x'] != 3
        assert result is df2

        df2 >> arrange('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> query('x%2') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> group_indices(z='x%2')
        assert 'z' not in df2
    finally:
        # Runs on success and on failure alike: the option is global state
        # and must not stay switched on after this test.
        set_option('modify_input_data', False)
예제 #7
0
 def test_query(self):
     """Querying ``self.df`` keeps column 'x' and yields a GroupedDataFrame."""
     # NOTE(review): self.df is presumably a grouped frame built by the
     # enclosing test class - confirm against its setup code.
     filtered = self.df >> query('x % 2 == 0')
     assert 'x' in filtered
     assert isinstance(filtered, GroupedDataFrame)