def test_read_custom_json_text():
    """Read a line-delimited JSON file via ``pd.read_custom_text`` with a
    user-supplied parser and check the result matches ``pd.read_json``.

    The custom parser selects a subset of columns and renames one of them,
    so the reference frame is post-processed the same way before comparing.
    """
    filename = get_unique_filename(extension="json")

    def _generate_json(file_name, nrows, ncols):
        # Write an nrows x ncols frame of random floats as JSON Lines.
        data = np.random.rand(nrows, ncols)
        df = pandas.DataFrame(data, columns=[f"col{x}" for x in range(ncols)])
        df.to_json(file_name, lines=True, orient="records")

    _generate_json(filename, 64, 8)

    # Custom parser allows us to add some specifics to reading files,
    # which is not available through the ready-made API.
    # For example, the parser allows us to reduce the amount of RAM
    # required for reading by selecting a subset of columns.
    def _custom_parser(io_input, **kwargs):
        result = {"col0": [], "col1": [], "col3": []}
        for line in io_input:
            # for example, simjson can be used here
            obj = json.loads(line)
            for key in result:
                result[key].append(obj[key])
        return pandas.DataFrame(result).rename(columns={"col0": "testID"})

    df1 = pd.read_custom_text(
        filename,
        columns=["testID", "col1", "col3"],
        custom_parser=_custom_parser,
        is_quoting=False,
    )
    # Reference: plain read_json restricted to the same columns and renamed
    # identically, so the two frames should be equal.
    df2 = pd.read_json(filename, lines=True)[["col0", "col1", "col3"]].rename(
        columns={"col0": "testID"}
    )
    df_equals(df1, df2)
def TestReadGlobCSVFixture():
    """Generator fixture: create a family of CSV files matching one glob
    pattern, yield to the test, then delete the files.

    Publishes ``pytest.glob_path`` (the glob pattern) and ``pytest.files``
    (the concrete file names) on the ``pytest`` module namespace so that
    xdist workers, each in its own process, see a consistent dataset.
    """
    filenames = []
    base_name = get_unique_filename(extension="")
    pytest.glob_path = "{}_*.csv".format(base_name)
    pytest.files = ["{}_{}.csv".format(base_name, i) for i in range(11)]
    for fname in pytest.files:
        # Glob does not guarantee ordering so we have to remove the
        # randomness in the generated csvs.
        _make_csv_file(filenames)(fname, row_size=11, remove_randomness=True)
    yield
    # Teardown: remove every file registered in `filenames` during setup.
    teardown_test_files(filenames)
def test_read_evaluated_dict():
    """Read a file of Python-dict literals (one per line) through
    ``pd.read_custom_text`` with a parser based on ``eval``.

    Checks both an explicit column list and a callable ``columns``
    argument that derives the column names from the first line.
    """
    filename = get_unique_filename(extension="json")

    def _generate_evaluated_dict(file_name, nrows, ncols):
        # Each line is the repr of a dict {col0: float, ..., colN: float}.
        result = {}
        keys = [f"col{x}" for x in range(ncols)]
        with open(file_name, mode="w") as _file:
            for i in range(nrows):
                data = np.random.rand(ncols)
                for idx, key in enumerate(keys):
                    result[key] = data[idx]
                _file.write(str(result))
                _file.write("\n")

    _generate_evaluated_dict(filename, 64, 8)

    # This parser allows us to read a format not supported by other
    # reading functions.
    def _custom_parser(io_input, **kwargs):
        cat_list = []
        asin_list = []
        for line in io_input:
            # NOTE: eval on file contents — acceptable only because the
            # input is generated by this test itself.
            obj = eval(line)
            cat_list.append(obj["col1"])
            asin_list.append(obj["col2"])
        return pandas.DataFrame({"col1": asin_list, "col2": cat_list})

    df1 = pd.read_custom_text(
        filename,
        columns=["col1", "col2"],
        custom_parser=_custom_parser,
    )
    assert df1.shape == (64, 2)

    def columns_callback(io_input, **kwargs):
        # Derive the column names from the first line only.
        columns = None
        for line in io_input:
            columns = list(eval(line).keys())[1:3]
            break
        return columns

    df2 = pd.read_custom_text(
        filename, columns=columns_callback, custom_parser=_custom_parser
    )
    df_equals(df1, df2)
def TestReadCSVFixture():
    """Generator fixture: create the CSV files shared by the read_csv
    tests, yield to the tests, then delete the files.

    Publishes ``pytest.csvs_names`` (test id -> file path) on the
    ``pytest`` module namespace; each xdist worker is spawned in a
    separate process with a separate namespace and dataset.
    """
    filenames = []
    files_ids = [
        "test_read_csv_regular",
        "test_read_csv_blank_lines",
        "test_read_csv_yes_no",
        "test_read_csv_nans",
        "test_read_csv_bad_lines",
    ]
    # each xdist worker spawned in separate process with separate
    # namespace and dataset
    pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids}
    # test_read_csv_col_handling, test_read_csv_parsing
    _make_csv_file(filenames)(
        filename=pytest.csvs_names["test_read_csv_regular"],
    )
    # test_read_csv_parsing
    _make_csv_file(filenames)(
        filename=pytest.csvs_names["test_read_csv_yes_no"],
        additional_col_values=["Yes", "true", "No", "false"],
    )
    # test_read_csv_col_handling
    _make_csv_file(filenames)(
        filename=pytest.csvs_names["test_read_csv_blank_lines"],
        add_blank_lines=True,
    )
    # test_read_csv_nans_handling
    _make_csv_file(filenames)(
        filename=pytest.csvs_names["test_read_csv_nans"],
        add_blank_lines=True,
        additional_col_values=["<NA>", "N/A", "NA", "NULL", "custom_nan", "73"],
    )
    # test_read_csv_error_handling
    _make_csv_file(filenames)(
        filename=pytest.csvs_names["test_read_csv_bad_lines"],
        add_bad_lines=True,
    )
    yield
    # Delete csv files that were created
    teardown_test_files(filenames)