import decimal
from typing import Any, Dict, Optional

import db_dtypes
import pandas


def cast_dataframe_for_parquet(
    dataframe: pandas.DataFrame,
    schema: Optional[Dict[str, Any]],
) -> pandas.DataFrame:
    """Cast columns to needed dtype when writing parquet files.

    See: https://github.com/googleapis/python-bigquery-pandas/issues/421
    """
    columns = schema.get("fields", [])

    # Protect against an explicit None in the dictionary.
    columns = columns if columns is not None else []

    for column in columns:
        # Schema can be a superset of the columns in the dataframe, so ignore
        # columns that aren't present.
        column_name = column.get("name")
        if column_name not in dataframe.columns:
            continue

        # Skip array columns for now. Potentially casting the elements of the
        # array would be possible, but not worth the effort until there is
        # demand for it.
        if column.get("mode", "NULLABLE").upper() == "REPEATED":
            continue

        column_type = column.get("type", "").upper()
        if (
            column_type == "DATE"
            # Use extension dtype first so that it uses the correct equality operator.
            and db_dtypes.DateDtype() != dataframe[column_name].dtype
        ):
            cast_column = dataframe[column_name].astype(
                dtype=db_dtypes.DateDtype(),
                # Return the original column if there was an error converting
                # to the dtype, such as if there is a date outside the
                # supported range.
                # https://github.com/googleapis/python-bigquery-pandas/issues/441
                errors="ignore",
            )
        elif column_type in {"NUMERIC", "DECIMAL", "BIGNUMERIC", "BIGDECIMAL"}:
            cast_column = dataframe[column_name].map(decimal.Decimal)
        else:
            cast_column = None

        if cast_column is not None:
            dataframe = dataframe.assign(**{column_name: cast_column})
    return dataframe
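A minimal sketch of exercising this helper directly, assuming the imports above; the column name and schema dict here are illustrative, mirroring the shape of a BigQuery table schema with a `fields` list.

# Hypothetical usage sketch (illustrative column name and schema).
df = pandas.DataFrame({"date_col": ["2021-04-17", "1999-12-31"]})
schema = {"fields": [{"name": "date_col", "type": "DATE"}]}
prepared = cast_dataframe_for_parquet(df, schema)
# The string column is cast to db_dtypes.DateDtype(); if a value were outside
# the supported range, errors="ignore" would leave the original column as-is.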
def test_cast_dataframe_for_parquet_w_string_date():
    dataframe = pandas.DataFrame(
        {
            "row_num": [0, 1, 2],
            "date_col": pandas.Series(
                ["2021-04-17", "1999-12-31", "2038-01-19"],
                dtype="object",
            ),
            "row_num_2": [0, 1, 2],
        },
        # Use multiple columns to ensure column order is maintained.
        columns=["row_num", "date_col", "row_num_2"],
    )
    schema = {
        "fields": [
            {"name": "date_col", "type": "DATE"},
            {"name": "not_in_df", "type": "IGNORED"},
        ]
    }
    result = load.cast_dataframe_for_parquet(dataframe, schema)
    expected = pandas.DataFrame(
        {
            "row_num": [0, 1, 2],
            "date_col": pandas.Series(
                ["2021-04-17", "1999-12-31", "2038-01-19"],
                dtype=db_dtypes.DateDtype(),
            ),
            "row_num_2": [0, 1, 2],
        },
        columns=["row_num", "date_col", "row_num_2"],
    )
    pandas.testing.assert_frame_equal(result, expected)
from typing import Any, Dict, Sequence

import pandas_gbq.timestamp


def _finalize_dtypes(
    df: "pandas.DataFrame",
    schema_fields: Sequence[Dict[str, Any]],
) -> "pandas.DataFrame":
    """
    Attempt to change the dtypes of those columns that don't map exactly.

    For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
    0001-01-01, but they can represent dates within a couple hundred years of
    1970. See:
    https://github.com/googleapis/python-bigquery-pandas/issues/365
    """
    import db_dtypes
    import pandas.api.types

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    dtype_map = {
        "DATE": db_dtypes.DateDtype(),
        "DATETIME": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
    }

    for field in schema_fields:
        # This method doesn't modify ARRAY/REPEATED columns.
        if field["mode"].upper() == "REPEATED":
            continue

        name = str(field["name"])
        dtype = dtype_map.get(field["type"].upper())

        # Avoid deprecated conversion to timezone-naive dtype by only casting
        # object dtypes.
        if dtype and pandas.api.types.is_object_dtype(df[name]):
            df[name] = df[name].astype(dtype, errors="ignore")

    # Ensure any TIMESTAMP columns are tz-aware.
    df = pandas_gbq.timestamp.localize_df(df, schema_fields)

    return df
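A short sketch of the call pattern, under the assumption that the downloaded frame still holds object-dtype strings and that each schema field dict carries `name`, `type`, and `mode` keys as consumed above; the column name and values are illustrative.

# Hypothetical sketch (illustrative data); import pandas and db_dtypes as above.
raw = pandas.DataFrame({"date_col": ["2021-04-17", None]})  # object dtype
fields = [{"name": "date_col", "type": "DATE", "mode": "NULLABLE"}]
finalized = _finalize_dtypes(raw, fields)
# "date_col" is cast to db_dtypes.DateDtype(); any TIMESTAMP columns (none in
# this sketch) would also be localized to UTC via pandas_gbq.timestamp.localize_df.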
        }),
        table_schema=[{"name": "date_col", "type": "DATE"}],
        # Skip CSV because the pandas CSV writer includes time when writing
        # datetime64 values.
        api_methods={"load_parquet"},
    ),
    DataFrameRoundTripTestCase(
        input_df=pandas.DataFrame({
            "row_num": [0, 1, 2],
            "date_col": pandas.Series(
                ["2021-04-17", "1999-12-31", "2038-01-19"],
                dtype=db_dtypes.DateDtype(),
            ),
        }),
        table_schema=[{"name": "date_col", "type": "DATE"}],
    ),
    # Loading a DATE column should work for string objects. See:
    # https://github.com/googleapis/python-bigquery-pandas/issues/421
    DataFrameRoundTripTestCase(
        input_df=pandas.DataFrame(
            {"row_num": [123], "date_col": ["2021-12-12"]},
def test_empty_dataframe(read_gbq, use_bqstorage_api):
    # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 and
    # https://github.com/googleapis/python-bigquery-pandas/issues/299
    query = """
SELECT
  bools.row_num AS row_num,
  bool_col,
  bytes_col,
  date_col,
  datetime_col,
  float_col,
  int64_col,
  numeric_col,
  string_col,
  time_col,
  timestamp_col
FROM
  UNNEST([STRUCT(1 AS row_num, TRUE AS bool_col)]) AS `bools`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, CAST('F1AC' AS BYTES FORMAT 'HEX') AS bytes_col)]) AS `bytes`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, DATE(2018, 4, 11) AS date_col)]) AS `dates`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col)]) AS `datetimes`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, -2.375 AS float_col)]) AS `floats`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, 1234 AS int64_col)]) AS `int64s`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col)]) AS `numerics`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, 'abcdefghijklmnopqrstuvwxyz' AS string_col)]) AS `strings`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col)]) AS `times`
INNER JOIN
  UNNEST([STRUCT(1 AS row_num, TIMESTAMP('1998-09-04 12:34:56.789101') AS timestamp_col)]) AS `timestamps`
WHERE
  `bools`.row_num = `dates`.row_num
  AND `bools`.row_num = `bytes`.row_num
  AND `bools`.row_num = `datetimes`.row_num
  AND `bools`.row_num = `floats`.row_num
  AND `bools`.row_num = `int64s`.row_num
  AND `bools`.row_num = `numerics`.row_num
  AND `bools`.row_num = `strings`.row_num
  AND `bools`.row_num = `times`.row_num
  AND `bools`.row_num = `timestamps`.row_num
  AND `bools`.row_num = -1
ORDER BY row_num ASC
"""
    expected = pandas.DataFrame(
        {
            "row_num": pandas.Series([], dtype="Int64"),
            "bool_col": pandas.Series(
                [],
                dtype="boolean" if FEATURES.pandas_has_boolean_dtype else "bool",
            ),
            "bytes_col": pandas.Series([], dtype="object"),
            "date_col": pandas.Series([], dtype=db_dtypes.DateDtype()),
            "datetime_col": pandas.Series([], dtype="datetime64[ns]"),
            "float_col": pandas.Series([], dtype="float64"),
            "int64_col": pandas.Series([], dtype="Int64"),
            "numeric_col": pandas.Series([], dtype="object"),
            "string_col": pandas.Series([], dtype="object"),
            "time_col": pandas.Series([], dtype=db_dtypes.TimeDtype()),
            "timestamp_col": pandas.Series(
                [], dtype="datetime64[ns]"
            ).dt.tz_localize(datetime.timezone.utc),
        }
    )
    result = read_gbq(query, use_bqstorage_api=use_bqstorage_api)
    pandas.testing.assert_frame_equal(result, expected, check_index_type=False)