Exemplo n.º 1
0
def test_data_as_first_argument():
    """Check that verbs accept the dataframe as their first argument.

    For each verb, the data-first call ``verb(df, ...)`` is compared with
    the piped form ``df >> verb(...)``.  Verbs that sample rows are only
    compared by the number of rows returned; ``head`` and ``tail`` are
    checked for the requested number of rows.
    """
    def equals(df1, df2):
        # Delegate to the ``equals`` method of the left-hand result.
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # ``define`` and ``rename`` are given copies of ``df``, presumably so
    # that any in-place change cannot leak into later checks -- confirm.
    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    # Sampling results are compared by length only.
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')), df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        # Per-group callback handed to ``do``: sum of the group's ``x``.
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The comparison with 4 must sit outside ``len(...)``; with it inside,
    # the assertion measured a boolean frame and could not fail on a wrong
    # row count.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
Exemplo n.º 2
0
def test_distinct():
    """Pin the row labels that ``distinct`` keeps for several call forms.

    Each call is applied to the same seven-row frame and the index of the
    result is compared with the expected surviving labels.  A call with an
    extra positional argument is expected to raise.
    """
    # Row label:                    0, 1, 2, 3, 4, 5, 6
    frame = pd.DataFrame({'x': [1, 1, 2, 3, 4, 4, 5], 'y': [1, 2, 3, 4, 5, 5, 6]})

    def keeps(result, labels):
        # True when the result's index equals an Index built from ``labels``.
        return result.index.equals(pd.Index(labels))

    # Rows 4 and 5 are the only pair identical in both columns.
    assert keeps(frame >> distinct(), [0, 1, 2, 3, 4, 6])
    assert keeps(frame >> distinct(('x', 'y'), z='x+1'), [0, 1, 2, 3, 4, 6])

    # Positional 'last' / False change which member of the pair survives.
    assert keeps(frame >> distinct('last'), [0, 1, 2, 3, 5, 6])
    assert keeps(frame >> distinct(False), [0, 1, 2, 3, 6])

    # Restricting the comparison to column ``x``.
    assert keeps(frame >> distinct(['x']), [0, 2, 3, 4, 6])
    assert keeps(frame >> distinct(['x'], 'last'), [1, 2, 3, 5, 6])

    # Only a computed column supplied as a keyword argument.
    assert keeps(frame >> distinct(z='x%2'), [0, 2])

    # Defining ``z`` first and then de-duplicating on both columns must
    # match passing ``z`` directly to ``distinct``.
    via_define = frame >> define(z='x%2') >> distinct(['x', 'z'])
    via_keyword = frame >> distinct(['x'], z='x%2')
    assert via_define.equals(via_keyword)

    with pytest.raises(Exception):
        frame >> distinct(['x'], 'last', 'cause_exception')
Exemplo n.º 3
0
 def test_distinct(self):
     """Piping ``self.df`` through ``distinct`` yields a GroupedDataFrame."""
     # NOTE(review): ``self.df`` is presumably a grouped frame created by
     # the enclosing test class -- confirm against its setup.
     assert isinstance(self.df >> distinct(), GroupedDataFrame)