Пример #1
0
def cast_dataframe_for_parquet(
    dataframe: pandas.DataFrame,
    schema: Optional[Dict[str, Any]],
) -> pandas.DataFrame:
    """Cast columns to needed dtype when writing parquet files.

    See: https://github.com/googleapis/python-bigquery-pandas/issues/421
    """

    columns = schema.get("fields", [])

    # Protect against an explicit None in the dictionary.
    columns = columns if columns is not None else []

    for column in columns:
        # Schema can be a superset of the columns in the dataframe, so ignore
        # columns that aren't present.
        column_name = column.get("name")
        if column_name not in dataframe.columns:
            continue

        # Skip array columns for now. Potentially casting the elements of the
        # array would be possible, but not worth the effort until there is
        # demand for it.
        if column.get("mode", "NULLABLE").upper() == "REPEATED":
            continue

        column_type = column.get("type", "").upper()
        if (column_type == "DATE"
                # Use extension dtype first so that it uses the correct equality operator.
                and db_dtypes.DateDtype() != dataframe[column_name].dtype):
            cast_column = dataframe[column_name].astype(
                dtype=db_dtypes.DateDtype(),
                # Return the original column if there was an error converting
                # to the dtype, such as is there is a date outside the
                # supported range.
                # https://github.com/googleapis/python-bigquery-pandas/issues/441
                errors="ignore",
            )
        elif column_type in {"NUMERIC", "DECIMAL", "BIGNUMERIC", "BIGDECIMAL"}:
            cast_column = dataframe[column_name].map(decimal.Decimal)
        else:
            cast_column = None

        if cast_column is not None:
            dataframe = dataframe.assign(**{column_name: cast_column})
    return dataframe
Пример #2
0
def test_cast_dataframe_for_parquet_w_string_date():
    dataframe = pandas.DataFrame(
        {
            "row_num": [0, 1, 2],
            "date_col":
            pandas.Series(
                ["2021-04-17", "1999-12-31", "2038-01-19"],
                dtype="object",
            ),
            "row_num_2": [0, 1, 2],
        },
        # Use multiple columns to ensure column order is maintained.
        columns=["row_num", "date_col", "row_num_2"],
    )
    schema = {
        "fields": [
            {
                "name": "date_col",
                "type": "DATE"
            },
            {
                "name": "not_in_df",
                "type": "IGNORED"
            },
        ]
    }
    result = load.cast_dataframe_for_parquet(dataframe, schema)
    expected = pandas.DataFrame(
        {
            "row_num": [0, 1, 2],
            "date_col":
            pandas.Series(
                ["2021-04-17", "1999-12-31", "2038-01-19"],
                dtype=db_dtypes.DateDtype(),
            ),
            "row_num_2": [0, 1, 2],
        },
        columns=["row_num", "date_col", "row_num_2"],
    )
    pandas.testing.assert_frame_equal(result, expected)
Пример #3
0
def _finalize_dtypes(
        df: "pandas.DataFrame",
        schema_fields: Sequence[Dict[str, Any]]) -> "pandas.DataFrame":
    """
    Attempt to change the dtypes of those columns that don't map exactly.

    For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
    0001-01-01, but they can represent dates within a couple hundred years of
    1970. See:
    https://github.com/googleapis/python-bigquery-pandas/issues/365
    """
    import db_dtypes
    import pandas.api.types

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    dtype_map = {
        "DATE": db_dtypes.DateDtype(),
        "DATETIME": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
    }

    for field in schema_fields:
        # This method doesn't modify ARRAY/REPEATED columns.
        if field["mode"].upper() == "REPEATED":
            continue

        name = str(field["name"])
        dtype = dtype_map.get(field["type"].upper())

        # Avoid deprecated conversion to timezone-naive dtype by only casting
        # object dtypes.
        if dtype and pandas.api.types.is_object_dtype(df[name]):
            df[name] = df[name].astype(dtype, errors="ignore")

    # Ensure any TIMESTAMP columns are tz-aware.
    df = pandas_gbq.timestamp.localize_df(df, schema_fields)

    return df
Пример #4
0
     }),
     table_schema=[{
         "name": "date_col",
         "type": "DATE"
     }],
     # Skip CSV because the pandas CSV writer includes time when writing
     # datetime64 values.
     api_methods={"load_parquet"},
 ),
 DataFrameRoundTripTestCase(
     input_df=pandas.DataFrame({
         "row_num": [0, 1, 2],
         "date_col":
         pandas.Series(
             ["2021-04-17", "1999-12-31", "2038-01-19"],
             dtype=db_dtypes.DateDtype(),
         ),
     }),
     table_schema=[{
         "name": "date_col",
         "type": "DATE"
     }],
 ),
 # Loading a DATE column should work for string objects. See:
 # https://github.com/googleapis/python-bigquery-pandas/issues/421
 DataFrameRoundTripTestCase(
     input_df=pandas.DataFrame(
         {
             "row_num": [123],
             "date_col": ["2021-12-12"]
         },
Пример #5
0
def test_empty_dataframe(read_gbq, use_bqstorage_api):
    # Bug fix for https://github.com/pandas-dev/pandas/issues/10273 and
    # https://github.com/googleapis/python-bigquery-pandas/issues/299
    query = """
SELECT
  bools.row_num AS row_num,
  bool_col,
  bytes_col,
  date_col,
  datetime_col,
  float_col,
  int64_col,
  numeric_col,
  string_col,
  time_col,
  timestamp_col
FROM
  UNNEST([
      STRUCT(1 AS row_num, TRUE AS bool_col) ]) AS `bools`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, CAST('F1AC' AS BYTES FORMAT 'HEX') AS bytes_col) ]) AS `bytes`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, DATE(2018, 4, 11) AS date_col) ]) AS `dates`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, DATETIME('2011-10-01 00:01:02.345678') AS datetime_col) ]) AS `datetimes`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, -2.375 AS float_col) ]) AS `floats`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, 1234 AS int64_col) ]) AS `int64s`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, CAST('123.456789' AS NUMERIC) AS numeric_col) ]) AS `numerics`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, 'abcdefghijklmnopqrstuvwxyz' AS string_col) ]) AS `strings`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, CAST('09:08:07.654321' AS TIME) AS time_col) ]) AS `times`
INNER JOIN
  UNNEST([
      STRUCT(1 AS row_num, TIMESTAMP('1998-09-04 12:34:56.789101') AS timestamp_col) ]) AS `timestamps`
WHERE
  `bools`.row_num = `dates`.row_num
  AND `bools`.row_num = `bytes`.row_num
  AND `bools`.row_num = `datetimes`.row_num
  AND `bools`.row_num = `floats`.row_num
  AND `bools`.row_num = `int64s`.row_num
  AND `bools`.row_num = `numerics`.row_num
  AND `bools`.row_num = `strings`.row_num
  AND `bools`.row_num = `times`.row_num
  AND `bools`.row_num = `timestamps`.row_num
  AND `bools`.row_num = -1
ORDER BY row_num ASC
    """
    expected = pandas.DataFrame({
        "row_num":
        pandas.Series([], dtype="Int64"),
        "bool_col":
        pandas.Series(
            [],
            dtype="boolean" if FEATURES.pandas_has_boolean_dtype else "bool",
        ),
        "bytes_col":
        pandas.Series([], dtype="object"),
        "date_col":
        pandas.Series(
            [],
            dtype=db_dtypes.DateDtype(),
        ),
        "datetime_col":
        pandas.Series(
            [],
            dtype="datetime64[ns]",
        ),
        "float_col":
        pandas.Series([], dtype="float64"),
        "int64_col":
        pandas.Series([], dtype="Int64"),
        "numeric_col":
        pandas.Series([], dtype="object"),
        "string_col":
        pandas.Series([], dtype="object"),
        "time_col":
        pandas.Series(
            [],
            dtype=db_dtypes.TimeDtype(),
        ),
        "timestamp_col":
        pandas.Series(
            [],
            dtype="datetime64[ns]",
        ).dt.tz_localize(datetime.timezone.utc),
    })
    result = read_gbq(query, use_bqstorage_api=use_bqstorage_api)
    pandas.testing.assert_frame_equal(result, expected, check_index_type=False)