示例#1
0
def test_date(in_type, pd_date_type, out_type):
    """Check date columns read via pyarrow map to the expected pandas dtype.

    Args:
        in_type (str): pyarrow date type name (e.g. "date32") for the schema.
        pd_date_type (str): pandas date representation requested from reader.
        out_type (str): expected pandas dtype string of the resulting column.
    """
    test_data_path = "tests/data/date_type.csv"

    # Expected values: raw strings from the csv, with NaN normalised to None.
    # (Previously this read was duplicated verbatim after the reader call.)
    test_str_dates = pd.read_csv(test_data_path, dtype=str)["my_date"]
    test_str_dates = [None if pd.isna(s) else s for s in test_str_dates]

    schema = pa.schema([("my_date", getattr(pa, in_type)())])

    # date32 -> pd_period is expected to warn; assert the warning is raised
    if in_type == "date32" and pd_date_type == "pd_period":
        with pytest.warns(UserWarning):
            df = pa_read_csv_to_pandas(
                test_data_path,
                schema,
                expect_full_schema=False,
                pd_date_type=pd_date_type,
            )
    else:
        df = pa_read_csv_to_pandas(
            test_data_path,
            schema,
            expect_full_schema=False,
            pd_date_type=pd_date_type,
        )

    assert str(df.my_date.dtype) == out_type
    if out_type == "object":
        # object-typed date columns should hold real datetime.date values
        assert isinstance(df.my_date[0], datetime.date)

    actual_str_dates = pd_datetime_series_to_list(
        df.my_date, out_type.split("[")[0], date=True
    )
    assert test_str_dates == actual_str_dates
def test_string(in_type, pd_old_type, pd_new_type):
    """Check string columns map to the expected old/new pandas dtypes.

    Args:
        in_type (str): pyarrow string type name for the schema column.
        pd_old_type (str): expected dtype with pd_string=False.
        pd_new_type (str): expected dtype with pd_string=True.
    """
    # Build the arrow type from the parametrised in_type (it was previously
    # ignored) and name the field after the column the assertions read
    # ("my_string"), matching the pattern used by test_int_csv.
    schema = pa.schema([("my_string", getattr(pa, in_type)())])

    df_old = pa_read_csv_to_pandas(
        "tests/data/string_type.csv", schema, False, pd_string=False
    )
    assert str(df_old.my_string.dtype) == pd_old_type

    df_new = pa_read_csv_to_pandas(
        "tests/data/string_type.csv", schema, False, pd_string=True
    )
    assert str(df_new.my_string.dtype) == pd_new_type
示例#3
0
def test_to_parquet(schema, boolean_args, date_args):
    """Round-trip a dataframe through parquet and assert it is unchanged.

    Args:
        schema: pyarrow schema fixture describing tests/data/all_types.csv.
        boolean_args (bool): value passed to pd_boolean/pd_integer/pd_string.
        date_args (str): value passed to pd_date_type/pd_timestamp_type.
    """
    import os  # local import: only needed for joining the temp path

    original = pa_read_csv_to_pandas(
        "tests/data/all_types.csv",
        schema,
        pd_boolean=boolean_args,
        pd_integer=boolean_args,
        pd_string=boolean_args,
        pd_date_type=date_args,
        pd_timestamp_type=date_args,
    )

    # Write and re-read inside a temporary directory so the parquet file is
    # always cleaned up. (The previous approach borrowed a name from a
    # closed NamedTemporaryFile and leaked the file written to that name.)
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_out_file = os.path.join(tmp_dir, "out.parquet")
        pd_to_parquet(original, tmp_out_file)

        reloaded = pa_read_parquet_to_pandas(
            tmp_out_file,
            schema,
            pd_boolean=boolean_args,
            pd_integer=boolean_args,
            pd_string=boolean_args,
            pd_date_type=date_args,
            pd_timestamp_type=date_args,
        )

    assert_frame_equal(original, reloaded)
示例#4
0
def test_datetime(in_type, pd_timestamp_type, out_type):
    """Check timestamp columns read via pyarrow map to the expected dtype.

    Args:
        in_type (str): key into the timestamp unit lookup (e.g. "timestamp[ms]").
        pd_timestamp_type (str): pandas timestamp representation requested.
        out_type (str): expected pandas dtype string of the resulting column.
    """
    test_data_path = "tests/data/datetime_type.csv"

    # Expected values: raw strings from the csv, with NaN normalised to None.
    # (Previously this read was duplicated verbatim after the reader call.)
    test_str_dates = pd.read_csv(test_data_path, dtype=str)["my_datetime"]
    test_str_dates = [None if pd.isna(s) else s for s in test_str_dates]

    # Map parametrised name -> pyarrow timestamp type with that unit
    type_dict = {
        "timestamp[s]": pa.timestamp("s"),
        "timestamp[ms]": pa.timestamp("ms"),
        "timestamp[us]": pa.timestamp("us"),
        "timestamp[ns]": pa.timestamp("ns"),
    }

    schema = pa.schema([("my_datetime", type_dict[in_type])])

    df = pa_read_csv_to_pandas(
        test_data_path,
        schema=schema,
        expect_full_schema=False,
        pd_timestamp_type=pd_timestamp_type,
    )

    assert str(df.my_datetime.dtype) == out_type
    if out_type == "object":
        # object-typed columns should hold real datetime.datetime values
        assert isinstance(df.my_datetime[0], datetime.datetime)

    actual_str_dates = pd_datetime_series_to_list(
        df.my_datetime, out_type.split("[")[0], date=False
    )
    assert test_str_dates == actual_str_dates
示例#5
0
def test_int_csv(in_type, pd_old_type, pd_new_type):
    """Testing csv mapping from pyarrow to Pandas data types.

    Args:
        in_type (str): pyarrow data type read in from the csv.
        pd_old_type (str): old pandas data type mapping.
        pd_new_type (str): new pandas data type mapping.
    """
    # Field must be named after the column the assertions read ("my_int");
    # the previous "int_col" name did not match the assertions below.
    schema = pa.schema([("my_int", getattr(pa, in_type)())])
    test_file = "tests/data/int_type.csv"

    df_old = pa_read_csv_to_pandas(test_file, schema, False, pd_integer=False)
    assert str(df_old.my_int.dtype) == pd_old_type

    df_new = pa_read_csv_to_pandas(test_file, schema, False, pd_integer=True)
    assert str(df_new.my_int.dtype) == pd_new_type
示例#6
0
def test_bool_csv_and_json():
    """Boolean columns read from csv and jsonl sources should be identical."""
    bool_schema = pa.schema(
        [
            ("i", pa.int8()),
            ("my_bool", pa.bool_()),
            ("my_nullable_bool", pa.bool_()),
        ]
    )

    from_csv = pa_read_csv_to_pandas(
        "tests/data/bool_type.csv", bool_schema, pd_boolean=True
    )
    from_jsonl = pa_read_json_to_pandas(
        "tests/data/bool_type.jsonl", bool_schema, pd_boolean=True
    )

    assert from_csv.equals(from_jsonl)
def test_file_reader_works_with_schema():
    """Readers should give the same frame with or without a full schema,
    and raise ValueError when the supplied schema is missing columns."""
    # CSV: schema-less read matches a read with the full schema supplied
    full_csv_schema = pa.schema(
        [("test", pa.string()), ("a_column", pa.string())]
    )
    assert_frame_equal(
        pa_read_csv_to_pandas("tests/data/example_data.csv"),
        pa_read_csv_to_pandas("tests/data/example_data.csv", full_csv_schema),
    )

    # JSONL: same check against its full schema
    full_json_schema = pa.schema(
        [
            ("a", pa.int64()),
            ("b", pa.float64()),
            ("c", pa.string()),
            ("d", pa.bool_()),
        ]
    )
    assert_frame_equal(
        pa_read_json_to_pandas("tests/data/example_data.jsonl"),
        pa_read_json_to_pandas(
            "tests/data/example_data.jsonl", full_json_schema
        ),
    )

    # A schema missing column "a" should make both readers raise
    incomplete_schema = pa.schema(
        [("b", pa.float64()), ("c", pa.string()), ("d", pa.bool_())]
    )
    with pytest.raises(ValueError):
        pa_read_json_to_pandas(
            "tests/data/example_data.jsonl", incomplete_schema
        )
    with pytest.raises(ValueError):
        pa_read_csv_to_pandas("tests/data/example_data.csv", incomplete_schema)
示例#8
0
def test_timestamps_as_strs():
    """Reading a datetime column with a string schema keeps the raw text."""
    csv_path = "tests/data/datetime_type.csv"
    jsonl_path = csv_path.replace(".csv", ".jsonl")

    # Raw column text as pandas reads it, for comparison
    expected = pd.read_csv(csv_path, dtype="string")["my_datetime"].to_list()

    str_schema = pa.schema([("my_datetime", pa.string())])

    from_csv = pa_read_csv_to_pandas(
        csv_path, str_schema, expect_full_schema=False
    )
    assert from_csv["my_datetime"].to_list() == expected

    from_jsonl = pa_read_json_to_pandas(
        jsonl_path, str_schema, expect_full_schema=False
    )
    assert from_jsonl["my_datetime"].to_list() == expected
示例#9
0
def test_pd_to_csv(boolean_args, date_args, schema):
    """Round-trip a dataframe through csv output and assert it is unchanged.

    Args:
        boolean_args (bool): value passed to pd_boolean/pd_integer/pd_string.
        date_args (str): value passed to pd_date_type/pd_timestamp_type.
        schema: pyarrow schema fixture describing tests/data/all_types.csv.
    """
    # Keep all reader type options identical for the write and the re-read
    reader_kwargs = dict(
        pd_boolean=boolean_args,
        pd_integer=boolean_args,
        pd_string=boolean_args,
        pd_date_type=date_args,
        pd_timestamp_type=date_args,
    )

    original = pa_read_csv_to_pandas(
        "tests/data/all_types.csv", schema, **reader_kwargs
    )

    # Write to StringIO then convert to BytesIO so Arrow can read it
    text_buffer = io.StringIO()
    pd_to_csv(original, text_buffer)
    byte_buffer = io.BytesIO(bytearray(text_buffer.getvalue(), "utf-8"))

    reloaded = pa_read_csv_to_pandas(byte_buffer, schema, **reader_kwargs)

    assert_frame_equal(original, reloaded)
def test_decimal_float(arrow_type, pd_type):
    """Float/decimal columns from csv and jsonl should map to the same
    pandas dtype and produce equal frames.

    Args:
        arrow_type (str): key selecting the pyarrow numeric type.
        pd_type (str): expected pandas dtype string for my_decimal.
    """
    # Lookup of parametrised name -> concrete pyarrow type
    arrow_types = {
        "float32": pa.float32(),
        "float64": pa.float64(),
        "decimal": pa.decimal128(5, 3),
    }

    numeric_schema = pa.schema(
        [("i", pa.int8()), ("my_decimal", arrow_types[arrow_type])]
    )

    from_csv = pa_read_csv_to_pandas(
        "tests/data/decimal_type.csv", numeric_schema
    )
    from_json = pa_read_json_to_pandas(
        "tests/data/decimal_type.jsonl", numeric_schema
    )

    for frame in (from_csv, from_json):
        assert str(frame.my_decimal.dtype) == pd_type

    assert_frame_equal(from_csv, from_json)
def test_csv_options(in_type, pd_old_type, pd_new_type):
    """Exercise pyarrow csv Read/Parse/ConvertOptions pass-through.

    Args:
        in_type, pd_old_type, pd_new_type: parametrised values kept for
            signature compatibility with the test parametrisation; the
            options below fully determine the expected output.
    """
    # Field renamed to "my_string" to match the column selected via
    # include_columns and read by the assertions (was "string_col").
    schema = pa.schema([("my_string", pa.string())])

    # Skip the header row (column names come from the schema / options)
    read_options = csv.ReadOptions(skip_rows=1)

    # Non-default quoting/escaping/delimiter; allow embedded newlines
    parse_options = csv.ParseOptions(quote_char="'",
                                     escape_char="\\",
                                     delimiter=";",
                                     newlines_in_values=True)

    # Request a column absent from the file to check it is null-filled
    convert_options = csv.ConvertOptions(
        include_columns=["i", "my_string", "nonexistent_column"],
        include_missing_columns=True,
        null_values=["NULL_STRING"],
        strings_can_be_null=True,
    )

    df = pa_read_csv_to_pandas(
        "tests/data/csv_options_test.csv",
        schema,
        False,
        pd_string=False,
        parse_options=parse_options,
        convert_options=convert_options,
        read_options=read_options,
    )

    expected = [
        "dsfasd;dsffadsf",
        "dsfasd;dsffadsf",
        None,
        "this text\nhas a line break",
        "this text, like so, has commas",
    ]
    assert df.columns.tolist() == ["i", "my_string", "nonexistent_column"]
    assert df["nonexistent_column"].isnull().all()
    assert_series_equal(df["my_string"], Series(expected, name="my_string"))
def test_file_reader_returns_df():
    """Both readers should return a pandas DataFrame for their file type."""
    readers = [
        (pa_read_csv_to_pandas, "tests/data/example_data.csv"),
        (pa_read_json_to_pandas, "tests/data/example_data.jsonl"),
    ]
    for reader, path in readers:
        assert isinstance(reader(path), pd.DataFrame)
示例#13
0
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns a dataframe.

    Args:
        filepath: local path or "s3://..." URI to a csv/jsonl/parquet file.
        table_params: per-table options read here: "expect-header",
            "row-limit", "headers-ignore-case", "only-test-cols-in-metadata".
        metadata: table metadata dict; reads "columns", "file_format" and,
            optionally, "partitions".

    Returns:
        A pandas dataframe built by the matching reader for the file format.

    Raises:
        ValueError: if metadata["file_format"] contains none of
            "csv"/"json"/"parquet".
    """

    # Metadata column names, excluding any partition columns
    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string based file types convert make arrow readers read them in as strings
    # validators will still treat these as dates but will run validation against strings
    # cols expecting values to match a timestamp format
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        # Rewrite all time*/date* columns to string in the metadata copy and
        # remember their names so the arrow schema can override them below
        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        # Sub-schema containing only the forced-to-string columns; used as
        # a column-type override for the csv/json readers
        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        # NOTE(review): region is hard-coded to eu-west-1 — confirm this is
        # intended for all deployments
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                # No header row in the file: take names from the metadata
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            # Empty schema is falsy, so no override when nothing was forced
            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            # Parquet is self-describing: no schema override needed
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    # NOTE(review): sample() picks random rows (not the head) and raises if
    # row-limit exceeds len(df) — confirm both are intended
    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    # Optionally normalise column names to lower case
    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    # Optionally drop any dataframe columns not declared in the metadata
    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df