Пример #1
0
def test_comp(setup):
    """Exercise the six rich-comparison operators on DataFrame operands.

    Covers frame-vs-frame comparison (the result carries the left operand's
    index), rejection of operands whose index or columns differ, and
    comparison of a frame built from a date range against a ``datetime``
    scalar.
    """
    comparisons = (operator.eq, operator.ne, operator.lt,
                   operator.gt, operator.le, operator.ge)

    left = DataFrame(pd.DataFrame(np.random.rand(4, 3)))
    right = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

    # equality of the underlying data objects is evaluated in build mode
    with enter_mode(build=True):
        assert not left.data == right.data
        assert left.data == left.data

    for compare in comparisons:
        compared = compare(left, right)
        pd.testing.assert_index_equal(
            compared.index_value.to_pandas(),
            left.index_value.to_pandas())

        # row labels differ from ``left``: the comparison must be rejected
        other_index = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), index=[1, 2, 3, 4]))
        with pytest.raises(ValueError):
            compare(left, other_index)

        # column labels differ from ``left``: the comparison must be rejected
        other_columns = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), columns=['a', 'b', 'c']))
        with pytest.raises(ValueError):
            compare(left, other_columns)

    # comparing against a datetime scalar keeps the frame's index
    dated = DataFrame(pd.DataFrame(pd.date_range('20130101', periods=6)))
    for compare in comparisons:
        compared = compare(dated, datetime(2013, 1, 2))
        pd.testing.assert_index_equal(
            compared.index_value.to_pandas(),
            dated.index_value.to_pandas())
Пример #2
0
    def testComp(self):
        """Exercise the six rich-comparison operators between DataFrames.

        The result must carry the left operand's index, and operands whose
        index or columns differ must raise ``ValueError``.
        """
        comparisons = (operator.eq, operator.ne, operator.lt,
                       operator.gt, operator.le, operator.ge)

        left = DataFrame(pd.DataFrame(np.random.rand(4, 3)))
        right = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

        # equality of the underlying data objects is evaluated in build mode
        with build_mode():
            self.assertFalse(left.data == right.data)
            self.assertTrue(left.data == left.data)

        for compare in comparisons:
            compared = compare(left, right)
            pd.testing.assert_index_equal(
                compared.index_value.to_pandas(),
                left.index_value.to_pandas())

            # row labels differ from ``left``: the comparison must be rejected
            other_index = DataFrame(
                pd.DataFrame(np.random.rand(4, 3), index=[1, 2, 3, 4]))
            with self.assertRaises(ValueError):
                compare(left, other_index)

            # column labels differ from ``left``: the comparison must be rejected
            other_columns = DataFrame(
                pd.DataFrame(np.random.rand(4, 3), columns=['a', 'b', 'c']))
            with self.assertRaises(ValueError):
                compare(left, other_columns)
Пример #3
0
    def testSortValues(self):
        """Check operand types and chunk layout after ``dataframe_sort_values``.

        Three chunkings of the same ten-row frame are tiled: the default one,
        ``chunk_size=6`` and ``chunk_size=3``.
        """
        columns = {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
        }
        raw = pd.DataFrame(columns)

        # default chunking: one chunk, still a plain sort operand
        result = dataframe_sort_values(DataFrame(raw), by='c')
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortValues)

        tiled = result.tiles()
        self.assertEqual(len(tiled.chunks), 1)
        self.assertIsInstance(tiled.chunks[0].op, DataFrameSortValues)

        # chunk_size=6: two chunks, the first one in reduce stage
        result = dataframe_sort_values(DataFrame(raw, chunk_size=6), by='c')
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortValues)

        tiled = result.tiles()
        self.assertEqual(len(tiled.chunks), 2)
        self.assertEqual(tiled.chunks[0].op.stage, OperandStage.reduce)

        # chunk_size=3 sorted by two columns: three reduce chunks whose
        # index values are consecutive ranges covering rows 0..9
        result = dataframe_sort_values(
            DataFrame(raw, chunk_size=3), by=['a', 'c'])
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortValues)

        tiled = result.tiles()
        self.assertEqual(len(tiled.chunks), 3)
        expected_indexes = (pd.RangeIndex(3),
                            pd.RangeIndex(3, 6),
                            pd.RangeIndex(6, 10))
        for chunk, expected_index in zip(tiled.chunks, expected_indexes):
            self.assertEqual(chunk.op.stage, OperandStage.reduce)
            pd.testing.assert_index_equal(chunk.index_value.to_pandas(),
                                          expected_index)
Пример #4
0
    def testSortIndex(self):
        """Check operand types and chunk layout after ``sort_index``.

        The same 10x10 frame is sorted along the default axis with three
        different chunkings, then along ``axis=1``.
        """
        raw = pd.DataFrame(np.random.rand(10, 10),
                           columns=np.random.rand(10),
                           index=np.random.rand(10))

        # default chunking: one chunk, still a plain sort-index operand
        result = sort_index(DataFrame(raw))
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortIndex)

        tiled = result.tiles()
        self.assertEqual(len(tiled.chunks), 1)
        self.assertIsInstance(tiled.chunks[0].op, DataFrameSortIndex)

        # chunk_size=6: two chunks, the first one in reduce stage
        result = sort_index(DataFrame(raw, chunk_size=6))
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortIndex)

        tiled = result.tiles()
        self.assertEqual(len(tiled.chunks), 2)
        self.assertEqual(tiled.chunks[0].op.stage, OperandStage.reduce)

        # chunk_size=3: three chunks, all in reduce stage
        result = sort_index(DataFrame(raw, chunk_size=3))
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortIndex)

        tiled = result.tiles()
        self.assertEqual(len(tiled.chunks), 3)
        for chunk in tiled.chunks:
            self.assertEqual(chunk.op.stage, OperandStage.reduce)

        # support on axis 1: every tiled chunk is a DataFrameIndex operand
        result = sort_index(DataFrame(raw, chunk_size=4), axis=1)
        self.assertEqual(result.shape, raw.shape)
        self.assertIsInstance(result.op, DataFrameSortIndex)

        tiled = result.tiles()
        self.assertTrue(
            all(isinstance(chunk.op, DataFrameIndex)
                for chunk in tiled.chunks))
Пример #5
0
def test_sort_index():
    """Check operand types and chunk layout after ``sort_index``.

    The same 10x10 frame is sorted along the default axis with three
    different chunkings, then along ``axis=1``.
    """
    raw = pd.DataFrame(np.random.rand(10, 10),
                       columns=np.random.rand(10),
                       index=np.random.rand(10))

    # default chunking: one chunk, still a plain sort-index operand
    result = sort_index(DataFrame(raw))
    assert result.shape == raw.shape
    assert isinstance(result.op, DataFrameSortIndex)

    tiled = tile(result)
    assert len(tiled.chunks) == 1
    assert isinstance(tiled.chunks[0].op, DataFrameSortIndex)

    # chunk_size=6: two chunks, the first one in reduce stage
    result = sort_index(DataFrame(raw, chunk_size=6))
    assert result.shape == raw.shape
    assert isinstance(result.op, DataFrameSortIndex)

    tiled = tile(result)
    assert len(tiled.chunks) == 2
    assert tiled.chunks[0].op.stage == OperandStage.reduce

    # chunk_size=3: three chunks, all in reduce stage
    result = sort_index(DataFrame(raw, chunk_size=3))
    assert result.shape == raw.shape
    assert isinstance(result.op, DataFrameSortIndex)

    tiled = tile(result)
    assert len(tiled.chunks) == 3
    for chunk in tiled.chunks:
        assert chunk.op.stage == OperandStage.reduce

    # support on axis 1: every tiled chunk is a DataFrameIndex operand
    result = sort_index(DataFrame(raw, chunk_size=4), axis=1)
    assert result.shape == raw.shape
    assert isinstance(result.op, DataFrameSortIndex)

    tiled = tile(result)
    assert all(isinstance(chunk.op, DataFrameIndex) for chunk in tiled.chunks)
Пример #6
0
def test_assign(setup):
    """Compare ``DataFrame.assign`` results with pandas for several value kinds.

    Values assigned: a computed column expression, a callable, a plain
    list, and a mix of all three in one call.
    """
    rng = np.random.RandomState(0)
    raw = pd.DataFrame({"A": rng.rand(10), "B": rng.rand(10)})
    df = DataFrame(raw, chunk_size=5)

    # column expression
    actual = df.assign(C=df.B / df.A).execute().fetch()
    pd.testing.assert_frame_equal(actual, raw.assign(C=raw.B / raw.A))

    # lambda syntax
    actual = df.assign(C=lambda x: x.B / x.A).execute().fetch()
    pd.testing.assert_frame_equal(
        actual, raw.assign(C=lambda x: x.B / x.A))

    # Non-Series array-like
    values = rng.rand(10).tolist()
    actual = df.assign(C=values).execute().fetch()
    pd.testing.assert_frame_equal(actual, raw.assign(C=values))

    # multiple assignments at once; column C is then cast to int64 on both
    # sides before comparing
    values = rng.rand(10).tolist()
    assigned = df.assign(C=values, D=df.A, E=lambda x: x.B)
    assigned['C'] = assigned['C'].astype('int64')
    reference = raw.assign(C=values, D=raw.A, E=lambda x: x.B)
    reference['C'] = reference['C'].astype('int64')
    pd.testing.assert_frame_equal(assigned.execute().fetch(), reference)
Пример #7
0
    def testFetchDataFrameCornerData(self):
        """Check that ``fetch_corner_data`` renders like the full pandas frame.

        For row counts around pandas' ``display.max_rows`` option, the string
        form of the fetched corner data must equal the string form of the
        original frame.
        """
        max_rows = pd.get_option('display.max_rows')
        try:
            min_rows = pd.get_option('display.min_rows')
        except KeyError:  # pragma: no cover
            # fall back to max_rows when the option lookup fails
            min_rows = max_rows
        session = new_session()

        # 5 rows, then every count from max_rows - 2 through max_rows + 3
        row_counts = (5,) + tuple(max_rows + delta for delta in range(-2, 4))
        for row in row_counts:
            pdf = pd.DataFrame(np.random.rand(row, 5))
            df = DataFrame(pdf, chunk_size=max_rows // 2)
            session.run(df, fetch=False)

            corner = fetch_corner_data(df, session=session)
            self.assertLessEqual(corner.shape[0], max_rows + 2)

            if row <= max_rows:
                corner_max_rows = max_rows
            else:
                corner_max_rows = corner.shape[0] - 1
            rendered_corner = corner.to_string(max_rows=corner_max_rows,
                                               min_rows=min_rows)
            rendered_full = pdf.to_string(max_rows=max_rows,
                                          min_rows=min_rows)
            self.assertEqual(rendered_corner, rendered_full,
                             'failed when row == {}'.format(row))
Пример #8
0
def test_sort_values():
    """Check operand types and chunk layout after ``dataframe_sort_values``.

    Three chunkings of the same ten-row frame are tiled: the default one,
    ``chunk_size=6`` and ``chunk_size=3``.
    """
    columns = {
        'a': np.random.rand(10),
        'b': np.random.randint(1000, size=10),
        'c': np.random.rand(10),
        'd': [np.random.bytes(10) for _ in range(10)],
        'e': [pd.Timestamp(f'201{i}') for i in range(10)],
        'f': [pd.Timedelta(f'{i} days') for i in range(10)]
    }
    raw = pd.DataFrame(columns)

    # default chunking: one chunk, still a plain sort operand
    result = dataframe_sort_values(DataFrame(raw), by='c')
    assert result.shape == raw.shape
    assert isinstance(result.op, DataFrameSortValues)

    tiled = tile(result)
    assert len(tiled.chunks) == 1
    assert isinstance(tiled.chunks[0].op, DataFrameSortValues)

    # chunk_size=6: two chunks, the first one in reduce stage
    result = dataframe_sort_values(DataFrame(raw, chunk_size=6), by='c')
    assert result.shape == raw.shape
    assert isinstance(result.op, DataFrameSortValues)

    tiled = tile(result)
    assert len(tiled.chunks) == 2
    assert tiled.chunks[0].op.stage == OperandStage.reduce

    # chunk_size=3 sorted by two columns: three reduce chunks, each keeping
    # the dtypes of the source frame
    result = dataframe_sort_values(DataFrame(raw, chunk_size=3),
                                   by=['a', 'c'])
    assert result.shape == raw.shape
    assert isinstance(result.op, DataFrameSortValues)

    tiled = tile(result)
    assert len(tiled.chunks) == 3
    for chunk in tiled.chunks:
        assert chunk.op.stage == OperandStage.reduce
        pd.testing.assert_series_equal(chunk.dtypes, raw.dtypes)
Пример #9
0
def test_build_concatenated_rows_frame(setup, columns):
    """Check chunk count, column labels and data of the concatenated frame.

    ``build_concatenated_rows_frame`` is applied to a tiled 16x8 frame with
    two different chunk sizes; the executed result must equal the source
    pandas frame in both cases.
    """
    raw = pd.DataFrame(np.random.rand(16, 8), columns=columns)

    # chunk_size=8: two chunks are expected after concatenation
    tiled = tile(DataFrame(raw, chunk_size=8))
    merged = build_concatenated_rows_frame(tiled)
    assert len(merged.chunks) == 2
    pd.testing.assert_frame_equal(merged.execute().fetch(), raw)

    # chunk_size=5: four chunks are expected, each spanning all columns
    tiled = tile(DataFrame(raw, chunk_size=5))
    merged = build_concatenated_rows_frame(tiled)
    assert len(merged.chunks) == 4
    for chunk in merged.chunks:
        pd.testing.assert_index_equal(chunk.columns_value.to_pandas(),
                                      raw.columns)
    pd.testing.assert_frame_equal(merged.execute().fetch(), raw)
Пример #10
0
def test_comp(setup):
    """Exercise the six rich-comparison operators on DataFrame/Series operands.

    Covers frame-vs-frame comparison (the result carries the left operand's
    index), rejection of operands whose index or columns differ, comparison
    against a ``datetime`` scalar, and comparison of period-typed series.
    """
    comparisons = (operator.eq, operator.ne, operator.lt,
                   operator.gt, operator.le, operator.ge)

    left = DataFrame(pd.DataFrame(np.random.rand(4, 3)))
    right = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

    # equality of the underlying data objects is evaluated in build mode
    with enter_mode(build=True):
        assert not left.data == right.data
        assert left.data == left.data

    for compare in comparisons:
        compared = compare(left, right)
        pd.testing.assert_index_equal(
            compared.index_value.to_pandas(),
            left.index_value.to_pandas())

        # row labels differ from ``left``: the comparison must be rejected
        other_index = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), index=[1, 2, 3, 4]))
        with pytest.raises(ValueError):
            compare(left, other_index)

        # column labels differ from ``left``: the comparison must be rejected
        other_columns = DataFrame(
            pd.DataFrame(np.random.rand(4, 3), columns=['a', 'b', 'c']))
        with pytest.raises(ValueError):
            compare(left, other_columns)

    # comparing against a datetime scalar keeps the frame's index
    dated = DataFrame(pd.DataFrame(pd.date_range('20130101', periods=6)))
    for compare in comparisons:
        compared = compare(dated, datetime(2013, 1, 2))
        pd.testing.assert_index_equal(
            compared.index_value.to_pandas(),
            dated.index_value.to_pandas())

    # period-typed data: result must match the equivalent pandas comparison
    periods = pd.period_range("2000-01-01", periods=10, freq="D")
    pandas_series = pd.Series(periods)
    chunked_series = Series(periods, chunk_size=5)
    outcome = chunked_series >= chunked_series[1]
    pd.testing.assert_series_equal(outcome.to_pandas(),
                                   pandas_series >= pandas_series[1])
Пример #11
0
    def testValidateAxis(self):
        """Check ``validate_axis`` on accepted and rejected axis values.

        ``0``/``'index'`` map to 0 and ``1``/``'columns'`` map to 1; the
        other values tried here must raise ``ValueError``.
        """
        df = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

        accepted = ((0, 0), ('index', 0), (1, 1), ('columns', 1))
        for axis, expected in accepted:
            self.assertEqual(validate_axis(axis, df), expected)

        rejected = ('unknown index', object(), -1, 2)
        for axis in rejected:
            with self.assertRaises(ValueError):
                validate_axis(axis, df)
Пример #12
0
def test_dataframe_params():
    """Check that chunk params go from unknown to known once data is supplied.

    A filtered frame is tiled; its first chunk initially reports NaN in its
    shape and index minimum, and reports concrete values after its params
    are rebuilt from the matching pandas data.
    """
    raw = pd.DataFrame({'a': [1, 2, 3]})
    frame = DataFrame(raw)
    tiled = tile(frame[frame['a'] < 2])
    chunk = tiled.chunks[0]

    # before real data is supplied, shape and index minimum contain NaN
    assert any(np.isnan(dim) for dim in chunk.params['shape'])
    assert np.isnan(chunk.params['index_value'].min_val)

    chunk.params = chunk.get_params_from_data(raw[raw['a'] < 2])
    # shape and index_value updated
    assert all(not np.isnan(dim) for dim in chunk.params['shape'])
    assert not np.isnan(chunk.params['index_value'].min_val)

    # reuse the chunk's params (minus 'index') for the tiled frame itself
    frame_params = chunk.params.copy()
    frame_params.pop('index', None)
    tiled.params = frame_params
    assert np.prod(tiled.shape) > 0
    tiled.refresh_params()
Пример #13
0
def test_fetch_dataframe_corner_data(setup):
    """Check that ``fetch_corner_data`` renders like the full pandas frame.

    For row counts around pandas' ``display.max_rows`` option, the string
    form of the fetched corner data must equal the string form of the
    original frame. Assertions carry the row count so a failure identifies
    which iteration broke.
    """
    max_rows = pd.get_option('display.max_rows')
    try:
        min_rows = pd.get_option('display.min_rows')
    except KeyError:  # pragma: no cover
        # fall back to max_rows when the option lookup fails
        min_rows = max_rows

    for row in (5, max_rows - 2, max_rows - 1, max_rows, max_rows + 1,
                max_rows + 2, max_rows + 3):
        pdf = pd.DataFrame(np.random.rand(row, 5))
        df = DataFrame(pdf, chunk_size=max_rows // 2)
        df.execute()

        corner = fetch_corner_data(df)
        failure_note = f'failed when row == {row}'
        assert corner.shape[0] <= max_rows + 2, failure_note

        corner_max_rows = max_rows if row <= max_rows else corner.shape[0] - 1
        rendered_corner = corner.to_string(max_rows=corner_max_rows,
                                           min_rows=min_rows)
        rendered_full = pdf.to_string(max_rows=max_rows, min_rows=min_rows)
        assert rendered_corner == rendered_full, failure_note
Пример #14
0
def test_validate_axis():
    """Check ``validate_axis`` on accepted and rejected axis values.

    ``0``/``'index'`` map to 0 and ``1``/``'columns'`` map to 1; the other
    values tried here must raise ``ValueError``. Axis 0 must also be
    accepted for a frame produced by boolean filtering.
    """
    df = DataFrame(pd.DataFrame(np.random.rand(4, 3)))

    accepted = ((0, 0), ('index', 0), (1, 1), ('columns', 1))
    for axis, expected in accepted:
        assert validate_axis(axis, df) == expected

    rejected = ('unknown index', object(), -1, 2)
    for axis in rejected:
        with pytest.raises(ValueError):
            validate_axis(axis, df)

    filtered = df[df[0] < 0.5]  # create unknown shape
    assert validate_axis(0, filtered) == 0
Пример #15
0
def test_groupby_params():
    """Round-trip params through tiled DataFrame and Series groupby objects.

    For each groupby, the first chunk's params are rebuilt from a wrapped
    pandas groupby, then copied (minus ``'index'``) onto the tiled groupby.
    """
    # DataFrame grouped by a column
    raw_frame = pd.DataFrame({'a': [1, 2, 3]})
    frame_groupby = tile(DataFrame(raw_frame).groupby('a'))
    chunk = frame_groupby.chunks[0]

    chunk.params = chunk.get_params_from_data(
        wrapped_groupby(raw_frame, by='a'))
    copied = chunk.params.copy()
    copied.pop('index', None)
    frame_groupby.params = copied

    # Series grouped by index level
    raw_series = pd.Series([1, 2, 3], name='a')
    series_groupby = tile(Series(raw_series).groupby(level=0))
    chunk = series_groupby.chunks[0]

    chunk.params = chunk.get_params_from_data(
        wrapped_groupby(raw_series, level=0))
    copied = chunk.params.copy()
    copied.pop('index', None)
    series_groupby.params = copied
    series_groupby.refresh_params()
Пример #16
0
def test_key_value(setup):
    """Compare ``.values`` and ``.keys()`` with pandas.

    Covers DataFrame and Series (values and keys) and Index (values only).
    """
    # DataFrame
    raw_frame = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
    frame = DataFrame(raw_frame)

    fetched = frame.values.execute().fetch()
    np.testing.assert_array_equal(fetched, raw_frame.values)

    fetched = frame.keys().execute().fetch()
    pd.testing.assert_index_equal(fetched, raw_frame.keys())

    # Series
    raw_series = pd.Series(np.random.rand(10))
    series = Series(raw_series)

    fetched = series.values.execute().fetch()
    np.testing.assert_array_equal(fetched, raw_series.values)

    fetched = series.keys().execute().fetch()
    pd.testing.assert_index_equal(fetched, raw_series.keys())

    # Index
    raw_index = pd.Index(np.random.rand(10))
    index = Index(raw_index)

    fetched = index.values.execute().fetch()
    np.testing.assert_array_equal(fetched, raw_index.values)
Пример #17
0
    def testKeyValue(self):
        """Compare ``.values`` and ``.keys()`` with pandas via the executor.

        Covers DataFrame and Series (values and keys) and Index (values only).
        """
        def run(tileable):
            # execute with concatenated output and take the first result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # DataFrame
        raw_frame = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
        frame = DataFrame(raw_frame)

        np.testing.assert_array_equal(run(frame.values), raw_frame.values)
        pd.testing.assert_index_equal(run(frame.keys()), raw_frame.keys())

        # Series
        raw_series = pd.Series(np.random.rand(10))
        series = Series(raw_series)

        np.testing.assert_array_equal(run(series.values), raw_series.values)
        pd.testing.assert_index_equal(run(series.keys()), raw_series.keys())

        # Index
        raw_index = pd.Index(np.random.rand(10))
        index = Index(raw_index)

        np.testing.assert_array_equal(run(index.values), raw_index.values)
Пример #18
0
def test_dataframe_dir():
    """Check that every column label of the frame is listed by ``dir()``."""
    raw = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
    frame = DataFrame(raw)
    listed_names = set(dir(frame))
    for column in frame.dtypes.index:
        assert column in listed_names
Пример #19
0
 def testDataFrameDir(self):
     """Check that every column label of the frame is listed by ``dir()``."""
     raw = pd.DataFrame(np.random.rand(4, 3), columns=list('ABC'))
     frame = DataFrame(raw)
     listed_names = set(dir(frame))
     for column in frame.dtypes.index:
         self.assertIn(column, listed_names)