def test_read_csv_head(prepare_data, setup):
    """Check what ``optimize`` records for ``read_csv`` followed by ``head``.

    The CSV is read with ``chunk_bytes`` set to half of the file size.
    Three scenarios are covered:

    * a single ``head(5)``, which is also executed and compared with the
      pandas result;
    * ``head(5)`` and ``head(10)`` sharing the same source;
    * ``head(5)`` that feeds a further ``+ 1`` operation.

    NOTE(review): another function named ``test_read_csv_head`` appears
    later in the visible source; if both live in the same module the later
    definition shadows this one -- confirm which one is intended to run.
    """
    def _build_and_optimize(*tileables):
        # Create a graph whose results are the given tileables, advance
        # the graph builder one step, then pass the graph to the optimizer.
        built = TileableGraph([t.data for t in tileables])
        next(TileableGraphBuilder(built).build())
        return built, optimize(built)

    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)
    half_size = os.path.getsize(csv_path) / 2

    # single head: only the head tileable is expected to carry an
    # optimization result, and the graph is expected to hold one node
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    top5 = source.head(5)
    graph, records = _build_and_optimize(top5)
    assert records.get_optimization_result(source.data) is None
    opt_top5 = records.get_optimization_result(top5.data)
    assert opt_top5.op.nrows == 5
    assert len(graph) == 1
    assert opt_top5 in graph.results

    # execute the head and compare against plain pandas
    executor_config = {'operand_executors': _iloc_operand_executors}
    fetched = top5.execute(extra_config=executor_config).fetch()
    pd.testing.assert_frame_equal(fetched, pdf.head(5))

    # test multiple head
    top10 = source.head(10)
    graph, records = _build_and_optimize(top5, top10)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    assert opt_source.op.nrows == 10
    for head_tileable in (top5, top10):
        opt_head = records.get_optimization_result(head_tileable.data)
        assert opt_head is not None
        # each optimized head must hang off the optimized source
        assert graph.predecessors(opt_head)[0] is opt_source
        assert opt_head.inputs[0] is opt_source

    # test head with successor
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    top5 = source.head(5)
    shifted = top5 + 1
    graph, records = _build_and_optimize(shifted)
    assert records.get_optimization_result(source.data) is None
    opt_top5 = records.get_optimization_result(top5.data)
    assert opt_top5.op.nrows == 5
    assert len(graph) == 2
def test_read_csv_head(prepare_data):
    """Check what ``optimize`` records for ``read_csv`` followed by ``head``.

    The CSV is read with ``chunk_bytes`` set to half of the file size, and
    only the optimized graph is inspected (nothing is executed). Scenarios:
    a single ``head(5)``; ``head(5)`` plus ``head(10)`` on one source; and
    ``head(5)`` followed by ``+ 1``.

    NOTE(review): another function named ``test_read_csv_head`` appears
    earlier in the visible source; if both live in the same module this
    definition shadows the earlier one -- confirm that is intended.
    """
    def _build_and_optimize(*tileables):
        # Create a graph whose results are the given tileables, advance
        # the graph builder one step, then pass the graph to the optimizer.
        built = TileableGraph([t.data for t in tileables])
        next(TileableGraphBuilder(built).build())
        return built, optimize(built)

    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)
    half_size = os.path.getsize(csv_path) / 2

    # single head: the source has no optimization result of its own and
    # the graph is expected to contain exactly one node
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    top5 = source.head(5)
    graph, records = _build_and_optimize(top5)
    assert records.get_optimization_result(source.data) is None
    opt_top5 = records.get_optimization_result(top5.data)
    assert opt_top5.op.nrows == 5
    assert len(graph) == 1
    assert opt_top5 in graph.results

    # test multiple head
    top10 = source.head(10)
    graph, records = _build_and_optimize(top5, top10)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    assert opt_source.op.nrows == 10
    for head_tileable in (top5, top10):
        opt_head = records.get_optimization_result(head_tileable.data)
        assert opt_head is not None
        # each optimized head must hang off the optimized source
        assert graph.predecessors(opt_head)[0] is opt_source
        assert opt_head.inputs[0] is opt_source

    # test head with successor
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    top5 = source.head(5)
    shifted = top5 + 1
    graph, records = _build_and_optimize(shifted)
    assert records.get_optimization_result(source.data) is None
    opt_top5 = records.get_optimization_result(top5.data)
    assert opt_top5.op.nrows == 5
    assert len(graph) == 2
def test_groupby_and_getitem(gen_data1):
    """Check ``optimize`` when a groupby-agg and a column selection share one ``read_csv``.

    The test expects the source, the aggregation and the selection to all
    receive optimization results, the optimized source to precede both
    optimized consumers in the graph, and the optimized source's
    ``usecols`` to equal ``['a', 'b', 'c']``. The user-visible tileables
    must still point at the original source data afterwards.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    selected = source[['b', 'a']]

    graph = TileableGraph([grouped.data, selected.data])
    builder = TileableGraphBuilder(graph)
    next(builder.build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    for consumer in (grouped, selected):
        opt_consumer = records.get_optimization_result(consumer.data)
        assert opt_consumer is not None
        assert opt_source in graph.predecessors(opt_consumer)
    assert opt_source.op.usecols == ['a', 'b', 'c']

    # original tileable should not be modified
    for consumer in (grouped, selected):
        assert consumer.inputs[0] is source.data
def test_groupby_read_csv(gen_data1):
    """Check ``optimize`` for groupby-agg operations built on ``read_csv``.

    Scenarios, in order:

    * groupby-agg followed by ``+ 1``: source and aggregation get
      optimization results, the successor does not, and the optimized
      source's ``usecols`` is expected to be ``['a', 'c']``;
    * a source created with explicit ``usecols``: the optimized source is
      expected to have ``usecols == ['b', 'c']``;
    * two aggregations on one source: expected ``usecols`` is
      ``['a', 'b', 'c']`` and the original tileables keep their inputs;
    * the source itself listed among the graph results: no tileable is
      expected to receive an optimization result.
    """
    def _build_and_optimize(*tileables):
        # Create a graph whose results are the given tileables, advance
        # the graph builder one step, then pass the graph to the optimizer.
        built = TileableGraph([t.data for t in tileables])
        next(TileableGraphBuilder(built).build())
        return built, optimize(built)

    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    # groupby-agg with a successor
    source = md.read_csv(csv_path)
    summed = source.groupby('c').agg({'a': 'sum'})
    shifted = summed + 1
    graph, records = _build_and_optimize(shifted)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_summed = records.get_optimization_result(summed.data)
    assert opt_summed is not None
    opt_shifted = records.get_optimization_result(shifted.data)
    assert opt_shifted is None
    assert opt_source in graph.predecessors(opt_summed)
    assert opt_source in opt_summed.inputs
    assert opt_source.op.usecols == ['a', 'c']
    assert opt_summed in graph.predecessors(shifted.data)
    assert opt_summed in shifted.inputs

    # source that already declares usecols
    narrowed_source = md.read_csv(csv_path, usecols=['a', 'b', 'c'])
    narrowed_sum = narrowed_source.groupby('c').agg({'b': 'sum'})
    _, records = _build_and_optimize(narrowed_sum)
    opt_narrowed_source = records.get_optimization_result(narrowed_source.data)
    assert opt_narrowed_source is not None
    opt_narrowed_sum = records.get_optimization_result(narrowed_sum.data)
    assert opt_narrowed_sum is not None
    assert opt_narrowed_source.op.usecols == ['b', 'c']

    # two aggregations sharing one source
    shared = md.read_csv(csv_path)
    by_c = shared.groupby('c').agg({'b': 'sum'})
    by_b = shared.groupby('b').agg({'a': 'sum'})
    _, records = _build_and_optimize(by_c, by_b)
    opt_shared = records.get_optimization_result(shared.data)
    assert opt_shared is not None
    for aggregated in (by_c, by_b):
        opt_aggregated = records.get_optimization_result(aggregated.data)
        assert opt_aggregated is not None
    assert opt_shared.op.usecols == ['a', 'b', 'c']
    # original tileable should not be modified
    for aggregated in (by_c, by_b):
        assert aggregated.inputs[0] is shared.data

    # test data source in result tileables
    _, records = _build_and_optimize(shared, by_c, by_b)
    for tileable in (shared, by_c, by_b):
        assert records.get_optimization_result(tileable.data) is None