def test_batch_read_dataframes_from_json_of_orients(df, orient):
    """Three identical JSON payloads of the given orient should merge into one
    frame whose slices each equal the source ``df``.

    ``df`` and ``orient`` are pytest fixtures/params supplied by the suite.
    """
    test_datas = [df.to_json(orient=orient).encode()] * 3
    test_types = ['application/json'] * 3
    # NOTE(review): the original called read_dataframes_from_json_n_csv twice
    # with identical arguments and discarded the first result — one call is
    # sufficient and halves the test's work.
    df_merged, slices = read_dataframes_from_json_n_csv(test_datas, test_types, orient)
    for s in slices:
        assert_df_equal(df_merged[s], df)
def test_benchmark_load_dataframes():
    """read_dataframes_from_json_n_csv should be substantially faster than the
    naive ``pd.read_json`` + ``pd.concat`` loop.

    The asserted ratio (5x) is deliberately conservative; see the comment at
    the assertion. (The original docstring claimed 30x, contradicting the
    actual ``> 5`` check — the docstring was stale.)
    """
    test_count = 50
    dfs = [pd.DataFrame(np.random.rand(10, 100)) for _ in range(test_count)]
    inputs = [df.to_json().encode() for df in dfs]

    # Baseline: parse each payload individually, then concat.
    time_st = time.time()
    dfs = [pd.read_json(i) for i in inputs]
    result1 = pd.concat(dfs)
    time1 = time.time() - time_st

    # Batched implementation under test.
    time_st = time.time()
    result2, _ = read_dataframes_from_json_n_csv(
        inputs, itertools.repeat('json'), 'columns'
    )
    time2 = time.time() - time_st

    assert_df_equal(result1, result2)

    # 5 is just an estimate on the smaller end, which should be true for most
    # development machines and Github actions CI environment, the actual ratio depends
    # on the hardware and available computing resource
    assert time1 / time2 > 5
def test_batch_read_dataframes_from_json_n_csv():
    """Round-trip several edge-case frames through a mixed batch of JSON and
    CSV payloads and check every slice of the merged result matches its
    source frame."""
    for df in (
        pd.DataFrame(np.random.rand(2, 3)),
        pd.DataFrame(["str1", "str2", "str3"]),  # single dim string array
        pd.DataFrame([np.nan]),  # special values
        pd.DataFrame([math.nan]),  # special values
        pd.DataFrame([" "]),  # special values
        # pd.DataFrame([""]),  # TODO: -> NaN
    ):
        # Renamed from the misleading `csv_str`: this payload is JSON.
        json_str = df.to_json()
        list_str = json.dumps(df.to_numpy().tolist())
        test_datas = (
            [json_str.encode()] * 20
            + [list_str.encode()] * 20
            + [df.to_csv().encode()] * 20
            + [df.to_csv(index=False).encode()] * 20
        )
        test_types = (
            ['application/json'] * 20
            + ['application/json'] * 20
            + ['text/csv'] * 20
            + ['text/csv'] * 20
        )
        df_merged, slices = read_dataframes_from_json_n_csv(test_datas, test_types)
        for s in slices:
            left = df_merged[s].values
            right = df.values
            # BUGFIX: `np.float` was removed in NumPy 1.24 (AttributeError on
            # modern NumPy). It was an alias for builtin float, which these
            # frames' dtype resolves to as np.float64.
            if right.dtype == np.float64:
                # Float data: tolerate rounding introduced by serialization.
                np.testing.assert_array_almost_equal(left, right)
            else:
                np.testing.assert_array_equal(left, right)
def test_batch_read_dataframes_from_json_with_wrong_orients(df, orient):
    """Payloads serialized with orient='table' but parsed under a different
    orient should produce an empty merged frame and all-falsy counts."""
    payloads = [df.to_json(orient='table').encode() for _ in range(3)]
    content_types = ['json' for _ in range(3)]
    df_merged, counts = read_dataframes_from_json_n_csv(payloads, content_types, orient)
    assert not df_merged
    # Every per-payload count must be falsy (nothing was parsed).
    for c in counts:
        assert not c
def test_batch_read_dataframes_from_json_of_orients(df, orient):
    """Three identical JSON payloads should merge into one frame; walking the
    per-payload row counts recovers each original ``df``."""
    payloads = [df.to_json(orient=orient).encode() for _ in range(3)]
    content_types = ['json'] * 3
    df_merged, counts = read_dataframes_from_json_n_csv(payloads, content_types, orient)
    offset = 0
    for count in counts:
        # Each count-sized window of the merged frame is one input payload.
        assert_df_equal(df_merged[offset : offset + count], df)
        offset += count
def test_batch_read_dataframes_from_json_in_mixed_order():
    """Parsing must be robust to key/row ordering differences inside JSON
    payloads (records with shuffled columns, columns with shuffled rows)."""
    # Records orient: second record lists its columns in a different order.
    records_payload = b'[{"A": 1, "B": 2, "C": 3}, {"C": 6, "A": 2, "B": 4}]'
    merged, spans = read_dataframes_from_json_n_csv([records_payload], ['application/json'])
    expected = pd.read_json(records_payload)
    for span in spans:
        assert_df_equal(merged[span], expected)

    # Columns orient: same data with columns and rows permuted across payloads.
    col_payloads = [
        b'{"A": {"1": 1, "2": 2}, "B": {"1": 2, "2": 4}, "C": {"1": 3, "2": 6}}',
        b'{"B": {"1": 2, "2": 4}, "A": {"1": 1, "2": 2}, "C": {"1": 3, "2": 6}}',
        b'{"A": {"1": 1, "2": 2}, "B": {"2": 4, "1": 2}, "C": {"1": 3, "2": 6}}',
    ]
    merged, spans = read_dataframes_from_json_n_csv(
        col_payloads, ['application/json'] * len(col_payloads)
    )
    expected = pd.read_json(col_payloads[0])[["A", "B", "C"]]
    for span in spans:
        # Normalize column order before comparing.
        assert_df_equal(merged[span][["A", "B", "C"]], expected)
def test_batch_read_dataframes_from_csv_other_CRLF(df):
    """CSV parsing should accept the newline convention opposite to whatever
    ``to_csv`` produced on this platform (CRLF vs LF)."""
    csv_text = df.to_csv(index=False)
    # Re-join the rows using the *other* line terminator.
    if '\r\n' in csv_text:
        payload = '\n'.join(_csv_split(csv_text, '\r\n')).encode()
    else:
        payload = '\r\n'.join(_csv_split(csv_text, '\n')).encode()
    merged, _ = read_dataframes_from_json_n_csv([payload], ['text/csv'])
    assert_df_equal(merged, df)
def test_batch_read_dataframes_from_csv_other_CRLF(df):
    """CSV parsing should accept the newline convention opposite to whatever
    ``to_csv`` produced on this platform (CRLF vs LF)."""
    csv_text = df.to_csv(index=False)
    # Pick the line terminator the platform did NOT use, then re-join.
    joiner = '\n' if '\r\n' in csv_text else '\r\n'
    csv_text = joiner.join(csv_splitlines(csv_text))
    # NOTE(review): unlike sibling tests, the payload stays a str (no
    # .encode()) — presumably the reader accepts both; confirm against the
    # implementation.
    merged, _ = read_dataframes_from_json_n_csv([csv_text], ['csv'])
    assert_df_equal(merged, df)
def test_batch_read_dataframes_from_json_in_mixed_order():
    """Parsing must be robust to key/row ordering differences inside JSON
    payloads; merged output is walked via per-payload row counts."""
    # Records orient: second record lists its columns in a different order.
    records_payload = b'[{"A": 1, "B": 2, "C": 3}, {"C": 6, "A": 2, "B": 4}]'
    merged, counts = read_dataframes_from_json_n_csv([records_payload], ['json'])
    expected = pd.read_json(records_payload)
    offset = 0
    for count in counts:
        assert_df_equal(merged[offset : offset + count], expected)
        offset += count

    # Columns orient: same data with columns and rows permuted across payloads.
    col_payloads = [
        b'{"A": {"1": 1, "2": 2}, "B": {"1": 2, "2": 4}, "C": {"1": 3, "2": 6}}',
        b'{"B": {"1": 2, "2": 4}, "A": {"1": 1, "2": 2}, "C": {"1": 3, "2": 6}}',
        b'{"A": {"1": 1, "2": 2}, "B": {"2": 4, "1": 2}, "C": {"1": 3, "2": 6}}',
    ]
    merged, counts = read_dataframes_from_json_n_csv(
        col_payloads, ['json'] * len(col_payloads)
    )
    expected = pd.read_json(col_payloads[0])[["A", "B", "C"]]
    offset = 0
    for count in counts:
        # Normalize column order before comparing.
        assert_df_equal(merged[offset : offset + count][["A", "B", "C"]], expected)
        offset += count
def test_batch_read_dataframes_from_mixed_json_n_csv(df):
    """Accumulate JSON payloads of every pandas-supported orient plus CSV
    payloads, merge them in one batch, and check every slice equals ``df``."""
    payloads = []
    mime_types = []

    # test content_type=application/json with various orients
    for orient in pytest.DF_ORIENTS:
        # skip cases not supported by official pandas
        try:
            assert_df_equal(df, pd.read_json(df.to_json(orient=orient)))
        except (AssertionError, ValueError):
            continue
        payloads.extend([df.to_json(orient=orient).encode()] * 3)
        mime_types.extend(['application/json'] * 3)

        # Smoke-check orient auto-detection on the batch accumulated so far
        # (the result is re-computed after the loop with CSV added).
        df_merged, slices = read_dataframes_from_json_n_csv(
            payloads, mime_types, orient=None)  # auto detect orient

    payloads.extend([df.to_csv(index=False).encode()] * 3)
    mime_types.extend(['text/csv'] * 3)

    df_merged, slices = read_dataframes_from_json_n_csv(payloads, mime_types)
    for s in slices:
        assert_df_equal(df_merged[s], df)
def test_benchmark_load_dataframes():
    """read_dataframes_from_json_n_csv should be substantially faster than the
    naive ``pd.read_json`` + ``pd.concat`` loop.

    The asserted ratio (20x) is a conservative lower bound; the actual speedup
    depends on the hardware and available computing resources. (The original
    docstring claimed 30x, contradicting the actual ``> 20`` check — the
    docstring was stale.)
    """
    test_count = 50
    dfs = [pd.DataFrame(np.random.rand(10, 100)) for _ in range(test_count)]
    inputs = [df.to_json().encode() for df in dfs]

    # Baseline: parse each payload individually, then concat.
    time_st = time.time()
    dfs = [pd.read_json(i) for i in inputs]
    result1 = pd.concat(dfs)
    time1 = time.time() - time_st

    # Batched implementation under test.
    time_st = time.time()
    result2, _ = read_dataframes_from_json_n_csv(
        inputs, itertools.repeat('application/json'), 'columns')
    time2 = time.time() - time_st

    assert_df_equal(result1, result2)
    assert time1 / time2 > 20
def test_batch_read_dataframes_from_mixed_json_n_csv(df):
    """Accumulate JSON payloads of every pandas-supported orient plus CSV
    payloads (as str), merge them, and walk the per-payload counts to check
    each window equals ``df``."""
    payloads = []
    content_types = []

    # test content_type=application/json with various orients
    for orient in pytest.DF_ORIENTS:
        # skip cases not supported by official pandas
        try:
            assert_df_equal(df, pd.read_json(df.to_json(orient=orient)))
        except (AssertionError, ValueError):
            continue
        payloads.extend([df.to_json(orient=orient)] * 3)
        content_types.extend(['json'] * 3)

    payloads.extend([df.to_csv(index=False)] * 3)
    content_types.extend(['csv'] * 3)

    df_merged, counts = read_dataframes_from_json_n_csv(payloads, content_types)
    offset = 0
    for count in counts:
        assert_df_equal(df_merged[offset : offset + count], df)
        offset += count
def test_batch_read_dataframes_from_json_with_wrong_orients(df, orient):
    """Parsing table-orient payloads under a mismatched orient must raise
    ``BadInput``."""
    payloads = [df.to_json(orient='table').encode() for _ in range(3)]
    mime_types = ['application/json'] * len(payloads)
    with pytest.raises(BadInput):
        read_dataframes_from_json_n_csv(payloads, mime_types, orient)