Пример #1
0
    def testToCSVExecution(self):
        """Execute ``DataFrame.to_csv`` and verify the CSV output round-trips.

        Two targets are exercised inside a temporary directory: a single
        ``out.csv`` file, and an ``out-*.csv`` wildcard path whose results
        are read back from ``out-0.csv`` through ``out-3.csv``.
        """
        expected = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100,)),
                'col3': np.arange(100),
            },
            index=pd.RangeIndex(100, 0, -1, name='index'),
        )
        tiled = DataFrame(expected, chunk_size=33)

        def read_back(csv_path):
            # Parse using the source frame's dtypes; at this point 'index'
            # is still an ordinary column of the returned frame.
            return pd.read_csv(csv_path, dtype=expected.dtypes.to_dict())

        with tempfile.TemporaryDirectory() as tmp_dir:
            # --- single output file ---
            single_path = os.path.join(tmp_dir, 'out.csv')
            self.executor.execute_dataframe(tiled.to_csv(single_path))

            loaded = read_back(single_path).set_index('index')
            pd.testing.assert_frame_equal(loaded, expected)

            # --- wildcard path, several output files ---
            wildcard_path = os.path.join(tmp_dir, 'out-*.csv')
            self.executor.execute_dataframe(tiled.to_csv(wildcard_path))

            # Four part files are read back; presumably one per row chunk
            # (33 + 33 + 33 + 1 rows) -- confirm against the to_csv operand.
            part_paths = [os.path.join(tmp_dir, 'out-{}.csv'.format(i))
                          for i in range(4)]
            parts = [read_back(part_path) for part_path in part_paths]

            combined = pd.concat(parts, axis=0).set_index('index')
            pd.testing.assert_frame_equal(combined, expected)

            # The second part file must hold exactly rows 33..65.
            second_part = parts[1].set_index('index')
            pd.testing.assert_frame_equal(second_part, expected.iloc[33:66])
Пример #2
0
def test_to_csv():
    """Check the chunk graph that tiling produces for ``DataFrame.to_csv``.

    For a wildcard path every result chunk must be a ``DataFrameToCSV``
    operand fed directly by the matching input chunk.  For a single file
    every result chunk must have two inputs, the second of which comes from
    a ``DataFrameToCSVStat`` operand.
    """
    source = pd.DataFrame(np.random.rand(10, 5))
    mdf = DataFrame(source, chunk_size=4)

    # --- wildcard path ---
    tiled = tile(mdf.to_csv('*.csv'))

    assert tiled.chunk_shape[1] == 1
    upstream_chunks = tiled.inputs[0].chunks
    for position, chunk in enumerate(tiled.chunks):
        assert type(chunk.op).__name__ == 'DataFrameToCSV'
        assert chunk.inputs[0] is upstream_chunks[position].data

    # --- single output file ---
    tiled = tile(mdf.to_csv('out.csv'))

    assert tiled.chunk_shape[1] == 1
    upstream_chunks = tiled.inputs[0].chunks
    for position, chunk in enumerate(tiled.chunks):
        assert len(chunk.inputs) == 2
        # The first input is one step removed from the original data chunk.
        writer_input, stat_input = chunk.inputs[0], chunk.inputs[1]
        assert writer_input.inputs[0] is upstream_chunks[position].data
        assert type(stat_input.op).__name__ == 'DataFrameToCSVStat'
Пример #3
0
def test_to_csv_execution(setup):
    """Run ``to_csv`` end to end for DataFrame and Series inputs.

    Every scenario writes into a temporary directory, reads the output back
    with pandas and compares it with the source data:

    * DataFrame -> single file ``out.csv``
    * DataFrame -> wildcard ``out-*.csv`` (read back as ``out-0`` .. ``out-3``)
    * filtered DataFrame built with ``chunk_size=(50, 2)`` -> ``out2.csv``
    * Series -> the same single-file and wildcard targets

    Args:
        setup: pytest fixture; not referenced in the body (presumably it
            prepares the execution session -- confirm in conftest).
    """
    expected = pd.DataFrame(
        {
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100, )),
            'col3': np.arange(100),
        },
        index=pd.RangeIndex(100, 0, -1, name='index'),
    )
    mdf = DataFrame(expected, chunk_size=33)

    def read_back(csv_path):
        # Parse using the source frame's dtypes; 'index' is still an
        # ordinary column of the returned frame.
        return pd.read_csv(csv_path, dtype=expected.dtypes.to_dict())

    def read_parts(directory):
        # Load the four numbered files written for the wildcard path.
        return [
            read_back(os.path.join(directory, f'out-{i}.csv'))
            for i in range(4)
        ]

    with tempfile.TemporaryDirectory() as tmp_dir:
        single_path = os.path.join(tmp_dir, 'out.csv')
        wildcard_path = os.path.join(tmp_dir, 'out-*.csv')

        # ---------------- DataFrame ----------------
        # single file
        mdf.to_csv(single_path).execute()

        loaded = read_back(single_path).set_index('index')
        pd.testing.assert_frame_equal(loaded, expected)

        # multiple files
        mdf.to_csv(wildcard_path).execute()

        parts = read_parts(tmp_dir)
        combined = pd.concat(parts, axis=0).set_index('index')
        pd.testing.assert_frame_equal(combined, expected)
        # The second part file must hold exactly rows 33..65.
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      expected.iloc[33:66])

        # DataFrame with unknown shape: the boolean filter is what makes the
        # shape unknown up front.  The result is compared with the full
        # source frame, i.e. the `< 1` mask is expected to keep every row.
        filtered = DataFrame(expected, chunk_size=(50, 2))
        filtered = filtered[filtered['col1'] < 1]
        filtered_path = os.path.join(tmp_dir, 'out2.csv')
        filtered.to_csv(filtered_path).execute()

        loaded = read_back(filtered_path).set_index('index')
        pd.testing.assert_frame_equal(loaded, expected)

        # ---------------- Series ----------------
        series = md.Series(expected['col1'], chunk_size=33)
        expected_col1 = expected['col1'].to_frame()

        # single file (same out.csv as the DataFrame case)
        series.to_csv(single_path).execute()

        loaded = read_back(single_path).set_index('index')
        pd.testing.assert_frame_equal(loaded, expected_col1)

        # multiple files (same out-<i>.csv names as the DataFrame case)
        series.to_csv(wildcard_path).execute()

        parts = read_parts(tmp_dir)
        combined = pd.concat(parts, axis=0).set_index('index')
        pd.testing.assert_frame_equal(combined, expected_col1)
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      expected_col1.iloc[33:66])