Exemplo n.º 1
0
def test_data_as_first_argument():
    """Verbs accept the data as their first argument.

    Each verb is called once with the dataframe as the first positional
    argument and once with the dataframe piped in through ``>>``; the
    two results are then compared.
    """
    def equals(df1, df2):
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    # Sampling is random, so only the sizes can be compared
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')), df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # Compare the number of returned rows with 4. The comparison must
    # sit outside len(); len(frame == 4) is the length of a boolean
    # frame and is truthy for any non-empty result.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
Exemplo n.º 2
0
def test_summarize_all():
    """summarize_all applies every function to every selected column."""
    df = pd.DataFrame({
        'alpha': list('aaabbb'),
        'beta': list('babruq'),
        'theta': list('cdecde'),
        'x': [1, 2, 3, 4, 5, 6],
        'y': [6, 5, 4, 3, 2, 1],
        'z': [7, 9, 11, 8, 10, 12]
    })

    result = df >> select('x', 'z') >> summarize_all(('mean', np.std))
    expected_cols = ['x_mean', 'z_mean', 'x_std', 'z_std']
    assert len(result.columns.intersection(expected_cols)) == 4
    # These must be assertions; plain assignments would overwrite the
    # computed values and verify nothing.
    assert result.loc[0, 'x_mean'] == 3.5
    assert result.loc[0, 'z_mean'] == 9.5
    # x and z have the same spread around their means, so their
    # standard deviations agree (up to floating point error)
    assert np.isclose(result.loc[0, 'x_std'], result.loc[0, 'z_std'])

    # Group column is not summarized
    result = (df >> select('x', 'y', 'z') >> group_by('x') >> summarize_all(
        ('mean')))
    assert result['x'].equals(df['x'])
Exemplo n.º 3
0
def test_mutate_all():
    """The grouping column is still present after select + mutate_all."""
    columns = {
        'alpha': list('aaabbb'),
        'beta': list('babruq'),
        'theta': list('cdecde'),
        'x': [1, 2, 3, 4, 5, 6],
        'y': [6, 5, 4, 3, 2, 1],
        'z': [7, 9, 11, 8, 10, 12]
    }
    df = pd.DataFrame(columns)

    # Run the pipeline one verb at a time
    grouped = df >> group_by('alpha')
    numeric = grouped >> select('x', 'y', 'z')
    mutated = numeric >> mutate_all((np.add, np.subtract), 10)

    # 'alpha' was not selected, but it is the column grouped on
    assert 'alpha' in mutated
Exemplo n.º 4
0
def test_arrange_all():
    """arrange_all(np.negative) reorders the rows of the selected columns."""
    df = pd.DataFrame({
        'alpha': list('aaabbb'),
        'beta': list('babruq'),
        'theta': list('cdecde'),
        'x': [1, 2, 3, 4, 5, 6],
        'y': [6, 5, 4, 3, 2, 1],
        'z': [7, 9, 11, 8, 10, 12]
    })

    numeric = df >> select('x', 'y', 'z')
    result = numeric >> arrange_all(np.negative)

    # Expected values of each column after arranging
    expected = {
        'x': [6, 5, 4, 3, 2, 1],
        'y': [1, 2, 3, 4, 5, 6],
        'z': [12, 10, 8, 11, 9, 7],
    }
    for column, values in expected.items():
        assert all(result[column] == values)
Exemplo n.º 5
0
def test_gather():
    """gather returns key/value columns; column specs are interchangeable."""
    grades = pd.DataFrame({
        'name': ['mary', 'oscar', 'martha', 'john'],
        'math': [92, 83, 85, 90],
        'art': [75, 95, 80, 72],
        'pe': [85, 75, 82, 84]
    })

    # No column specification
    gathered = grades >> gather('subject', 'grade')
    assert all(gathered.columns == ['subject', 'grade'])
    assert len(gathered) == 16

    # Four ways of specifying the same columns; the first is the
    # reference that the others are compared against
    reference = grades >> gather('subject', 'grade', select('-name'))
    others = [
        grades >> gather('subject', 'grade', slice('math', 'pe')),
        grades >> gather('subject', 'grade', ['math', 'art', 'pe']),
        grades >> gather('subject', 'grade', '-name'),
    ]
    for other in others:
        assert other.equals(reference)
Exemplo n.º 6
0
def combine_sherlock_view_runs_and_replace(df):
    """
    Collapse the non-recall sherlock rows into a single 'view' condition.

    Rows with ``data_id == 'sherlock'`` and a condition other than
    'recall' are grouped by subject_id, measure, data_id and headcase,
    and ``val`` is averaged within each group. The averaged rows are
    labelled ``condition = 'view'`` and appended to the rows of ``df``
    whose condition is already exactly 'view' or 'recall'. The index of
    the returned frame is reset.

    Rationale (carried over from the original note): realignment is
    computed per run and sherlock subjects have two viewing runs
    ('view1' and 'view2'), so their summary statistic (e.g. mean FD) is
    taken as (mean of run1 + mean of run2) / 2.
    """
    group_keys = ("subject_id", "measure", "data_id", "headcase")
    output_columns = (
        "subject_id", "data_id", "condition", "headcase", "measure", "val"
    )

    # Average the sherlock viewing runs
    sherlock_views = df >> p.query(
        "data_id == 'sherlock' and condition != 'recall'"
    )
    run_means = (
        sherlock_views
        >> p.group_by(*group_keys)
        >> p.summarize(val="mean(val)")
    )
    relabelled = run_means >> p.call(".assign", condition="view")
    sherlock_combined = relabelled >> p.select(*output_columns)

    # Rows that already carry one of the two final condition labels
    view_or_recall = df.query("condition == 'view' or condition == 'recall'")

    stacked = pd.concat([view_or_recall, sherlock_combined], axis=0)
    return stacked.reset_index(drop=True)
Exemplo n.º 7
0
def test_data_mutability():
    """Pin down which verbs modify their input dataframe."""
    # These tests affirm that we know the consequences of the verbs.
    # A test in the Mutable section should not fail without a change
    # in implementation. That change should be triggered when Pandas
    # implements a consistent copy-on-write policy.
    #
    # When a test in the mutable section fails, it is bad news. There
    # should be no memory usage gains by reusing the original data,
    # except for the case of `rename`.
    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # Default to not mutable
    df >> define(z='x**2')
    assert 'z' not in df

    df >> group_by(z='x**2')
    assert 'z' not in df

    arr = df >> pull('x')
    arr[0] = 99
    assert df.loc[0, 'x'] != 99

    df2 = df >> slice_rows(3)
    df2.loc[0, 'x'] = 999
    assert df.loc[0, 'x'] != 999

    set_option('modify_input_data', True)
    try:
        df2 = df.copy()
        df2 >> define(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        df2 >> group_by(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        arr = df2 >> pull('x')
        arr[0] = 99
        assert df2.loc[0, 'x'] == 99

        # Not mutable
        df2 = df.copy()
        df2 >> create(z='x**2')
        assert 'z' not in df2

        df2 >> sample_n(3) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> sample_frac(.5) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x', 'y') >> define(z='x**2')
        assert 'z' not in df2

        # dataframe.rename has copy-on-write (if copy=False) that affects
        # only the new frame. This creates possibility for "action at a
        # distance" effects on the new frame when the original is modified
        result = df2 >> rename(x='z')
        df2['y'] = 3
        result['x'] = 4
        assert 'z' not in df2
        assert df2.loc[0, 'y'] != 4
        assert result.loc[0, 'x'] != 3
        assert result is df2

        df2 >> arrange('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> query('x%2') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> group_indices(z='x%2')
        assert 'z' not in df2
    finally:
        # Switch the option back off even when an assertion above
        # fails, so that the setting does not leak into other tests
        set_option('modify_input_data', False)
Exemplo n.º 8
0
 def test_select(self):
     """Selecting 'y' keeps 'x' and returns a GroupedDataFrame.

     NOTE(review): presumably ``self.df`` is grouped on 'x' -- confirm
     against the fixture that creates it.
     """
     selected = self.df >> select('y')
     assert isinstance(selected, GroupedDataFrame) if 'x' in selected else False
Exemplo n.º 9
0
def test_select():
    """Exercise the column-selection options of the ``select`` verb."""
    values = list(range(20))
    names = ('lion', 'tiger', 'cheetah', 'leopard', 'jaguar', 'cougar',
             'caracal')

    def big_cats():
        # Fresh frame in which every column holds the same values
        return pd.DataFrame({name: values for name in names})

    def ncol(frame):
        return len(frame.columns)

    df = big_cats()

    # Columns given by name
    picked = df >> select('lion', 'caracal')
    assert ncol(picked) == 2
    assert 'lion' in picked.columns
    assert 'caracal' in picked.columns

    # Keyword matchers
    assert ncol(df >> select(startswith='c')) == 3

    picked = df >> select(
        'caracal', endswith='ar', contains='ee', matches=r'\w+opa')
    assert ncol(picked) == 5

    assert ncol(df >> select(contains=['ee', 'ion', '23'])) == 2
    assert ncol(df >> select(matches=(r'\w+opa', r'\w+r$'))) == 4

    # grouped on columns are never dropped
    picked = df >> group_by('cougar') >> select(startswith='c', drop=True)
    assert ncol(picked) == 5
    assert 'cougar' in picked

    # order depends on selection, and grouped columns are prepended
    # if missing from selection
    plain1 = df >> select('jaguar', 'lion', 'caracal')
    plain2 = df >> select('caracal', 'jaguar', 'lion')
    grouped = df >> group_by('tiger') >> select('caracal', 'jaguar', 'lion')
    assert list(plain1.columns) == ['jaguar', 'lion', 'caracal']
    assert list(plain2.columns) == ['caracal', 'jaguar', 'lion']
    assert list(grouped.columns) == ['tiger', 'caracal', 'jaguar', 'lion']

    # Numerical column names, and regex object
    df[123] = 1
    df[456] = 2
    df[789] = 3
    opa = re.compile(r'\w+opa')
    assert ncol(df >> select(startswith='t', matches=opa)) == 2
    assert ncol(df >> select(123, startswith='t', matches=opa)) == 3

    picked = df >> select(456, 789, drop=True)
    assert ncol(picked) == len(df.columns) - 2

    assert ncol(df >> select(contains=['ee', 'ion'])) == 2

    # No selection, should still have an index
    picked = df >> select()
    assert ncol(picked) == 0
    assert len(picked.index) == len(df.index)

    # Start again without the numerical columns
    df = big_cats()

    # Exclude with minus
    picked = df >> select('-jaguar', '-lion')
    assert 'jaguar' not in picked
    assert 'lion' not in picked

    picked = df >> select('-jaguar', '-lion', 'jaguar')
    assert picked.columns[-1] == 'jaguar'

    # Wrong way to exclude
    with pytest.raises(KeyError):
        df >> select('jaguar', '-lion')

    with pytest.raises(TypeError):
        select.from_columns({})
Exemplo n.º 10
0
def test_pivot_longer():
    """Exercise pivot_longer: name splitting, warnings and bad arguments."""
    weather = pd.DataFrame({
        'name': ['mary', 'mary', 'john', 'john'],
        'city': ['dakar', 'dakar', 'lome', 'lome'],
        'year': [1990, 1992, 1996, 1998],
        '1_sunny': [8, 6, 4, 7],
        '2_rainy': [9, 7, 7, 6]
    })

    def digit_columns():
        # New selector on every call, for columns that start with a digit
        return select(matches=r'^\d')

    # Column names go into a single variable
    longer = weather >> pivot_longer(
        cols=digit_columns(),
        names_to=['two_vars'],
        values_to='score',
    )
    two_vars = longer['two_vars']
    assert two_vars.iloc[0] == '1_sunny'
    assert two_vars.iloc[4] == '2_rainy'

    # Column names are split into two variables
    longer = weather >> pivot_longer(
        cols=digit_columns(),
        names_to=['take', 'season'],
        values_to='score',
        names_sep='_',
        convert=True
    )
    assert longer['take'].dtype == int
    assert len(longer.columns) == 6

    # select all; nothing is asserted about the returned frame
    scores_only = weather[['1_sunny', '2_rainy']]
    scores_only >> pivot_longer(
        cols=select_all(),
        names_to=['col'],
        values_to='score',
    )

    with pytest.warns(UserWarning) as records:
        empty = weather >> pivot_longer(
            select(startswith='x'),  # No selected columns
            names_to=['take', 'season'],
            values_to='score',
            names_sep='_',
        )

    assert len(empty) == 0
    messages = [str(record.message) for record in records]
    assert any('No columns' in message for message in messages)

    with pytest.raises(ValueError):
        weather >> pivot_longer(
            cols=digit_columns(),
            names_to=['take', 'season'],
            values_to='score',
            # missing names_sep or names_pattern
        )

    with pytest.raises(TypeError):
        weather >> pivot_longer(
            cols=digit_columns(),
            names_to=['take', 'season'],
            values_to='score',
            names_sep='_',
            names_prefix=4  # bad value
        )