Example #1
def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test):
    dataframe = pandas.DataFrame(
        data=[
            {"id": 10, "status": u"FOO", "created_at": datetime.date(2019, 5, 10)},
            {"id": 20, "status": u"BAR", "created_at": datetime.date(2018, 9, 12)},
        ]
    )

    with warnings.catch_warnings(record=True) as warned:
        detected_schema = module_under_test.dataframe_to_bq_schema(
            dataframe, bq_schema=[]
        )

    expected_schema = (
        schema.SchemaField("id", "INTEGER", mode="NULLABLE"),
        schema.SchemaField("status", "STRING", mode="NULLABLE"),
        schema.SchemaField("created_at", "DATE", mode="NULLABLE"),
    )
    by_name = operator.attrgetter("name")
    assert sorted(detected_schema, key=by_name) == sorted(expected_schema, key=by_name)

    # there should be no relevant warnings
    unwanted_warnings = [
        warning for warning in warned if "could not determine" in str(warning).lower()
    ]
    assert not unwanted_warnings
Example #2
def test_dataframe_to_parquet_dict_sequence_schema(module_under_test):
    dict_schema = [
        {"name": "field01", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field02", "type": "BOOL", "mode": "NULLABLE"},
    ]

    dataframe = pandas.DataFrame(
        {"field01": [u"hello", u"world"], "field02": [True, False]}
    )

    write_table_patch = mock.patch.object(
        module_under_test.pyarrow.parquet, "write_table", autospec=True
    )
    to_arrow_patch = mock.patch.object(
        module_under_test, "dataframe_to_arrow", autospec=True
    )

    with write_table_patch, to_arrow_patch as fake_to_arrow:
        module_under_test.dataframe_to_parquet(dataframe, dict_schema, None)

    expected_schema_arg = [
        schema.SchemaField("field01", "STRING", mode="REQUIRED"),
        schema.SchemaField("field02", "BOOL", mode="NULLABLE"),
    ]
    schema_arg = fake_to_arrow.call_args.args[1]
    assert schema_arg == expected_schema_arg
Example #3
def test_dataframe_to_arrow_w_unknown_type(module_under_test):
    bq_schema = (
        schema.SchemaField("field00", "UNKNOWN_TYPE"),
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
    )
    dataframe = pandas.DataFrame({
        "field00": ["whoami", "whatami"],
        "field01": ["hello", "world"],
        "field02": [b"abd", b"efg"],
        "field03": [1, 2],
    })

    with warnings.catch_warnings(record=True) as warned:
        arrow_table = module_under_test.dataframe_to_arrow(
            dataframe, bq_schema)
    arrow_schema = arrow_table.schema

    assert len(warned) == 1
    warning = warned[0]
    assert "field00" in str(warning)

    assert len(arrow_schema) == len(bq_schema)
    assert arrow_schema[0].name == "field00"
    assert arrow_schema[1].name == "field01"
    assert arrow_schema[2].name == "field02"
    assert arrow_schema[3].name == "field03"
Example #4
def test_dataframe_to_bq_schema_dict_sequence(module_under_test):
    df_data = collections.OrderedDict([
        ("str_column", [u"hello", u"world"]),
        ("int_column", [42, 8]),
        ("bool_column", [True, False]),
    ])
    dataframe = pandas.DataFrame(df_data)

    dict_schema = [
        {
            "name": "str_column",
            "type": "STRING",
            "mode": "NULLABLE"
        },
        {
            "name": "bool_column",
            "type": "BOOL",
            "mode": "REQUIRED"
        },
    ]

    returned_schema = module_under_test.dataframe_to_bq_schema(
        dataframe, dict_schema)

    expected_schema = (
        schema.SchemaField("str_column", "STRING", "NULLABLE"),
        schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
        schema.SchemaField("bool_column", "BOOL", "REQUIRED"),
    )
    assert returned_schema == expected_schema
Example #5
def test_augment_schema_type_detection_fails(module_under_test):
    dataframe = pandas.DataFrame(
        data=[
            {
                "status": u"FOO",
                "struct_field": {"one": 1},
                "struct_field_2": {"foo": u"123"},
            },
            {
                "status": u"BAR",
                "struct_field": {"two": u"111"},
                "struct_field_2": {"bar": 27},
            },
        ]
    )
    current_schema = [
        schema.SchemaField("status", field_type="STRING", mode="NULLABLE"),
        schema.SchemaField("struct_field", field_type=None, mode="NULLABLE"),
        schema.SchemaField("struct_field_2", field_type=None, mode="NULLABLE"),
    ]

    with warnings.catch_warnings(record=True) as warned:
        augmented_schema = module_under_test.augment_schema(dataframe, current_schema)

    assert augmented_schema is None

    expected_warnings = [
        warning for warning in warned if "could not determine" in str(warning)
    ]
    assert len(expected_warnings) == 1
    warning_msg = str(expected_warnings[0])
    assert "pyarrow" in warning_msg.lower()
    assert "struct_field" in warning_msg and "struct_field_2" in warning_msg
Example #6
def test_download_arrow_tabledata_list_known_field_type(module_under_test):
    fake_page = api_core.page_iterator.Page(
        parent=mock.Mock(),
        items=[{"page_data": "foo"}],
        item_to_value=api_core.page_iterator._item_to_value_identity,
    )
    fake_page._columns = [[1, 10, 100], ["2.2", "22.22", "222.222"]]
    pages = [fake_page]

    bq_schema = [
        schema.SchemaField("population_size", "INTEGER"),
        schema.SchemaField("non_alien_field", "STRING"),
    ]

    results_gen = module_under_test.download_arrow_tabledata_list(pages, bq_schema)
    with warnings.catch_warnings(record=True) as warned:
        result = next(results_gen)

    unwanted_warnings = [
        warning
        for warning in warned
        if "please pass schema= explicitly" in str(warning).lower()
    ]
    assert not unwanted_warnings

    assert len(result.columns) == 2
    col = result.columns[0]
    assert type(col) is pyarrow.lib.Int64Array
    assert list(col) == [1, 10, 100]
    col = result.columns[1]
    assert type(col) is pyarrow.lib.StringArray
    assert list(col) == ["2.2", "22.22", "222.222"]
Example #7
def test_bq_to_arrow_schema_w_unknown_type(module_under_test):
    fields = (
        schema.SchemaField("field1", "STRING"),
        schema.SchemaField("field2", "INTEGER"),
        # Don't know what to convert UNKNOWN_TYPE to; let type inference
        # work instead.
        schema.SchemaField("field3", "UNKNOWN_TYPE"),
    )
    actual = module_under_test.bq_to_arrow_schema(fields)
    assert actual is None
Example #8
def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
    fields = (
        schema.SchemaField("field1", "STRING"),
        schema.SchemaField("field2", "INTEGER"),
        # Don't know what to convert UNKNOWN_TYPE to; let type inference
        # work instead.
        schema.SchemaField("field3", "UNKNOWN_TYPE"),
    )
    field = schema.SchemaField("ignored_name",
                               "RECORD",
                               mode="NULLABLE",
                               fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    assert actual is None
Example #9
def test_bq_to_arrow_schema_w_unknown_type(module_under_test):
    fields = (
        schema.SchemaField("field1", "STRING"),
        schema.SchemaField("field2", "INTEGER"),
        # Don't know what to convert UNKNOWN_TYPE to; let type inference
        # work instead.
        schema.SchemaField("field3", "UNKNOWN_TYPE"),
    )
    with warnings.catch_warnings(record=True) as warned:
        actual = module_under_test.bq_to_arrow_schema(fields)
    assert actual is None

    assert len(warned) == 1
    warning = warned[0]
    assert "field3" in str(warning)
Example #10
    def test_to_api_repr_base(self):
        ec = external_config.ExternalConfig("")
        ec.source_uris = self.SOURCE_URIS
        ec.max_bad_records = 17
        ec.autodetect = True
        ec.ignore_unknown_values = False
        ec.compression = "compression"
        ec.schema = [schema.SchemaField("full_name", "STRING", mode="REQUIRED")]

        exp_schema = {
            "fields": [
                {
                    "name": "full_name",
                    "type": "STRING",
                    "mode": "REQUIRED",
                    "description": None,
                }
            ]
        }
        got_resource = ec.to_api_repr()
        exp_resource = {
            "sourceFormat": "",
            "sourceUris": self.SOURCE_URIS,
            "maxBadRecords": 17,
            "autodetect": True,
            "ignoreUnknownValues": False,
            "compression": "compression",
            "schema": exp_schema,
        }
        self.assertEqual(got_resource, exp_resource)
Example #11
    def test_from_api_repr_base(self):
        resource = copy.deepcopy(self.BASE_RESOURCE)
        ec = external_config.ExternalConfig.from_api_repr(resource)
        self._verify_base(ec)
        self.assertEqual(ec.schema, [])
        self.assertIsNone(ec.options)

        got_resource = ec.to_api_repr()
        self.assertEqual(got_resource, self.BASE_RESOURCE)

        resource = _copy_and_update(
            self.BASE_RESOURCE,
            {
                "schema": {
                    "fields": [
                        {
                            "name": "full_name",
                            "type": "STRING",
                            "mode": "REQUIRED",
                            "description": None,
                        }
                    ]
                }
            },
        )
        ec = external_config.ExternalConfig.from_api_repr(resource)
        self._verify_base(ec)
        exp_schema = [schema.SchemaField("full_name", "STRING", mode="REQUIRED")]
        self.assertEqual(ec.schema, exp_schema)
        self.assertIsNone(ec.options)

        got_resource = ec.to_api_repr()
        self.assertEqual(got_resource, resource)
Example #12
# bq_type and rows are supplied by pytest parametrization; the parametrize
# decorator is not included in this snippet.
def test_bq_to_arrow_array_w_pandas_timestamp(module_under_test, bq_type, rows):
    rows = [pandas.Timestamp(row) for row in rows]
    series = pandas.Series(rows)
    bq_field = schema.SchemaField("field_name", bq_type)
    arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
    roundtrip = arrow_array.to_pandas()
    assert series.equals(roundtrip)
Example #13
    def test_from_api_repr_base(self):
        resource = copy.deepcopy(self.BASE_RESOURCE)
        ec = external_config.ExternalConfig.from_api_repr(resource)
        self._verify_base(ec)
        self.assertEqual(ec.schema, [])
        self.assertIsNone(ec.options)

        got_resource = ec.to_api_repr()
        self.assertEqual(got_resource, self.BASE_RESOURCE)

        resource = _copy_and_update(self.BASE_RESOURCE, {
            'schema': {
                'fields': [
                    {
                        'name': 'full_name',
                        'type': 'STRING',
                        'mode': 'REQUIRED',
                        'description': None,
                    },
                ],
            },
        })
        ec = external_config.ExternalConfig.from_api_repr(resource)
        self._verify_base(ec)
        exp_schema = [
            schema.SchemaField('full_name', 'STRING', mode='REQUIRED')
        ]
        self.assertEqual(ec.schema, exp_schema)
        self.assertIsNone(ec.options)

        got_resource = ec.to_api_repr()
        self.assertEqual(got_resource, resource)
Example #14
# bq_type and rows are supplied by pytest parametrization; the parametrize
# decorator is not included in this snippet.
def test_bq_to_arrow_array_w_nullable_scalars(module_under_test, bq_type,
                                              rows):
    series = pandas.Series(rows, dtype="object")
    bq_field = schema.SchemaField("field_name", bq_type)
    arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
    roundtrip = arrow_array.to_pylist()
    assert rows == roundtrip
Example #15
def test_bq_to_arrow_array_w_arrays(module_under_test):
    rows = [[1, 2, 3], [], [4, 5, 6]]
    series = pandas.Series(rows, dtype="object")
    bq_field = schema.SchemaField("field_name", "INTEGER", mode="REPEATED")
    arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
    roundtrip = arrow_array.to_pylist()
    assert rows == roundtrip
Example #16
    def test_to_api_repr_base(self):
        ec = external_config.ExternalConfig('')
        ec.source_uris = self.SOURCE_URIS
        ec.max_bad_records = 17
        ec.autodetect = True
        ec.ignore_unknown_values = False
        ec.compression = 'compression'
        ec.schema = [
            schema.SchemaField('full_name', 'STRING', mode='REQUIRED')
        ]

        exp_schema = {
            'fields': [
                {
                    'name': 'full_name',
                    'type': 'STRING',
                    'mode': 'REQUIRED',
                    'description': None,
                },
            ]
        }
        got_resource = ec.to_api_repr()
        exp_resource = {
            'sourceFormat': '',
            'sourceUris': self.SOURCE_URIS,
            'maxBadRecords': 17,
            'autodetect': True,
            'ignoreUnknownValues': False,
            'compression': 'compression',
            'schema': exp_schema
        }
        self.assertEqual(got_resource, exp_resource)
Example #17
def test_dataframe_to_parquet_w_missing_columns(module_under_test,
                                                monkeypatch):
    with pytest.raises(ValueError) as exc_context:
        module_under_test.dataframe_to_parquet(
            pandas.DataFrame(), (schema.SchemaField("not_found", "STRING"), ),
            None)
    assert "columns in schema must match" in str(exc_context.value)
Example #18
def test_dataframe_to_parquet_w_extra_fields(module_under_test, monkeypatch):
    with pytest.raises(ValueError) as exc_context:
        module_under_test.dataframe_to_parquet(
            pandas.DataFrame(), (schema.SchemaField("not_in_df", "STRING"),), None
        )
    message = str(exc_context.value)
    assert "bq_schema contains fields not present in dataframe" in message
    assert "not_in_df" in message
Example #19
def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
    fields = (
        schema.SchemaField("field1", "STRING"),
        schema.SchemaField("field2", "INTEGER"),
        # Don't know what to convert UNKNOWN_TYPE to; let type inference
        # work instead.
        schema.SchemaField("field3", "UNKNOWN_TYPE"),
    )
    field = schema.SchemaField("ignored_name", "RECORD", mode="NULLABLE", fields=fields)

    with warnings.catch_warnings(record=True) as warned:
        actual = module_under_test.bq_to_arrow_data_type(field)

    assert actual is None
    assert len(warned) == 1
    warning = warned[0]
    assert "field3" in str(warning)
Example #20
# bq_type is supplied by pytest parametrization; the parametrize decorator
# is not included in this snippet.
def test_bq_to_arrow_array_w_structs(module_under_test, bq_type):
    rows = [
        {"int_col": 123, "string_col": "abc"},
        None,
        {"int_col": 456, "string_col": "def"},
    ]
    series = pandas.Series(rows, dtype="object")
    bq_field = schema.SchemaField(
        "field_name",
        bq_type,
        fields=(
            schema.SchemaField("int_col", "INTEGER"),
            schema.SchemaField("string_col", "STRING"),
        ),
    )
    arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
    roundtrip = arrow_array.to_pylist()
    assert rows == roundtrip
Example #21
def augment_schema(dataframe, current_bq_schema):
    """Try to deduce the unknown field types and return an improved schema.

    This function requires ``pyarrow`` to run. If any of the missing types
    still cannot be detected, ``None`` is returned. If all types are already
    known, a shallow copy of the given schema is returned.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame for which some of the field types are still unknown.
        current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
            A BigQuery schema for ``dataframe``. The types of some or all of
            the fields may be ``None``.
    Returns:
        Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]
    """
    # pytype: disable=attribute-error
    augmented_schema = []
    unknown_type_fields = []

    for field in current_bq_schema:
        if field.field_type is not None:
            augmented_schema.append(field)
            continue

        arrow_table = pyarrow.array(dataframe[field.name])

        if pyarrow.types.is_list(arrow_table.type):
            # `pyarrow.ListType`
            detected_mode = "REPEATED"
            detected_type = ARROW_SCALAR_IDS_TO_BQ.get(
                arrow_table.values.type.id)
        else:
            detected_mode = field.mode
            detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.type.id)

        if detected_type is None:
            unknown_type_fields.append(field)
            continue

        new_field = schema.SchemaField(
            name=field.name,
            field_type=detected_type,
            mode=detected_mode,
            description=field.description,
            fields=field.fields,
        )
        augmented_schema.append(new_field)

    if unknown_type_fields:
        warnings.warn(
            "Pyarrow could not determine the type of columns: {}.".format(
                ", ".join(field.name for field in unknown_type_fields)))
        return None

    return augmented_schema
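
A minimal usage sketch for augment_schema above. The import path
google.cloud.bigquery._pandas_helpers is an assumption (it is the module these
tests exercise in recent releases), not something stated in the examples:

import pandas
from google.cloud.bigquery import schema
from google.cloud.bigquery import _pandas_helpers  # assumed module path

df = pandas.DataFrame({"ints": [1, 2], "tags": [["a"], ["b", "c"]]})
partial_schema = [
    schema.SchemaField("ints", field_type=None, mode="NULLABLE"),
    schema.SchemaField("tags", field_type=None, mode="NULLABLE"),
]
augmented = _pandas_helpers.augment_schema(df, partial_schema)
# pyarrow sees an int64 column and a list<string> column, so "ints" resolves
# to an integer BigQuery type and "tags" comes back with mode="REPEATED".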
Example #22
def dataframe_to_bq_schema(dataframe, bq_schema):
    """Convert a pandas DataFrame schema to a BigQuery schema.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame for which the client determines the BigQuery schema.
        bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
            A BigQuery schema. Use this argument to override the autodetected
            type for some or all of the DataFrame columns.

    Returns:
        Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]:
            The automatically determined schema. Returns None if the type of
            any column cannot be determined.
    """
    if bq_schema:
        for field in bq_schema:
            if field.field_type in schema._STRUCT_TYPES:
                raise ValueError(
                    "Uploading dataframes with struct (record) column types "
                    "is not supported. See: "
                    "https://github.com/googleapis/google-cloud-python/issues/8191"
                )
        bq_schema_index = {field.name: field for field in bq_schema}
        bq_schema_unused = set(bq_schema_index.keys())
    else:
        bq_schema_index = {}
        bq_schema_unused = set()

    bq_schema_out = []
    for column, dtype in list_columns_and_indexes(dataframe):
        # Use provided type from schema, if present.
        bq_field = bq_schema_index.get(column)
        if bq_field:
            bq_schema_out.append(bq_field)
            bq_schema_unused.discard(bq_field.name)
            continue

        # Otherwise, try to automatically determine the type based on the
        # pandas dtype.
        bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
        if not bq_type:
            warnings.warn(u"Unable to determine type of column '{}'.".format(column))
            return None
        bq_field = schema.SchemaField(column, bq_type)
        bq_schema_out.append(bq_field)

    # Catch any schema mismatch. The developer explicitly asked to serialize a
    # column, but it was not found.
    if bq_schema_unused:
        raise ValueError(
            u"bq_schema contains fields not present in dataframe: {}".format(
                bq_schema_unused
            )
        )
    return tuple(bq_schema_out)
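
A sketch of how this version behaves, under the same assumed import path as
above (a current install runs the newer variant shown in Example #29, but both
raise the same ValueError for unused schema fields):

import pandas
from google.cloud.bigquery import schema
from google.cloud.bigquery import _pandas_helpers  # assumed module path

df = pandas.DataFrame({"count": [1, 2], "flag": [True, False]})
# int64 and bool dtypes are mapped directly by _PANDAS_DTYPE_TO_BQ.
print(_pandas_helpers.dataframe_to_bq_schema(df, bq_schema=[]))

try:
    _pandas_helpers.dataframe_to_bq_schema(
        df, bq_schema=[schema.SchemaField("missing", "STRING")]
    )
except ValueError as exc:
    print(exc)  # bq_schema contains fields not present in dataframe: {'missing'}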
Example #23
def test_upload_time_and_datetime_56(bigquery_client, dataset_id):
    df = pandas.DataFrame(
        dict(
            dt=[
                datetime.datetime(2020, 1, 8, 8, 0, 0),
                datetime.datetime(
                    2020,
                    1,
                    8,
                    8,
                    0,
                    0,
                    tzinfo=datetime.timezone(datetime.timedelta(hours=-7)),
                ),
            ],
            t=[datetime.time(0, 0, 10, 100001), None],
        )
    )
    table = f"{dataset_id}.test_upload_time_and_datetime"
    bigquery_client.load_table_from_dataframe(df, table).result()
    data = list(map(list, bigquery_client.list_rows(table)))
    assert data == [
        [
            datetime.datetime(2020, 1, 8, 8, 0, tzinfo=datetime.timezone.utc),
            datetime.time(0, 0, 10, 100001),
        ],
        [datetime.datetime(2020, 1, 8, 15, 0, tzinfo=datetime.timezone.utc), None],
    ]

    from google.cloud.bigquery import job, schema

    table = f"{dataset_id}.test_upload_time_and_datetime_dt"
    config = job.LoadJobConfig(
        schema=[schema.SchemaField("dt", "DATETIME"), schema.SchemaField("t", "TIME")]
    )

    bigquery_client.load_table_from_dataframe(df, table, job_config=config).result()
    data = list(map(list, bigquery_client.list_rows(table)))
    assert data == [
        [datetime.datetime(2020, 1, 8, 8, 0), datetime.time(0, 0, 10, 100001)],
        [datetime.datetime(2020, 1, 8, 15, 0), None],
    ]
Example #24
def test_bq_to_arrow_array_w_special_floats(module_under_test):
    bq_field = schema.SchemaField("field_name", "FLOAT64")
    rows = [float("-inf"), float("nan"), float("inf"), None]
    series = pandas.Series(rows, dtype="object")
    arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
    roundtrip = arrow_array.to_pylist()
    assert len(rows) == len(roundtrip)
    assert roundtrip[0] == float("-inf")
    assert roundtrip[1] != roundtrip[1]  # NaN doesn't equal itself.
    assert roundtrip[2] == float("inf")
    assert roundtrip[3] is None
Example #25
def test_bq_to_arrow_array_w_special_floats(module_under_test):
    bq_field = schema.SchemaField("field_name", "FLOAT64")
    rows = [float("-inf"), float("nan"), float("inf"), None]
    series = pandas.Series(rows, dtype="object")
    arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
    roundtrip = arrow_array.to_pylist()
    assert len(rows) == len(roundtrip)
    assert roundtrip[0] == float("-inf")
    # Since we are converting from pandas, NaN is treated as NULL in pyarrow
    # due to pandas conventions.
    # https://arrow.apache.org/docs/python/data.html#none-values-and-nan-handling
    assert roundtrip[1] is None
    assert roundtrip[2] == float("inf")
    assert roundtrip[3] is None
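
The NaN-to-NULL behavior described in the comment above can be reproduced with
pyarrow alone (a standalone sketch, independent of the module under test):

import pandas
import pyarrow

arr = pyarrow.Array.from_pandas(pandas.Series([1.0, float("nan"), None]))
print(arr.to_pylist())  # [1.0, None, None] -- NaN becomes null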
Example #26
def test_dataframe_to_parquet_compression_method(module_under_test):
    bq_schema = (schema.SchemaField("field00", "STRING"),)
    dataframe = pandas.DataFrame({"field00": ["foo", "bar"]})

    write_table_patch = mock.patch.object(
        module_under_test.pyarrow.parquet, "write_table", autospec=True
    )

    with write_table_patch as fake_write_table:
        module_under_test.dataframe_to_parquet(
            dataframe, bq_schema, None, parquet_compression="ZSTD"
        )

    call_args = fake_write_table.call_args
    assert call_args is not None
    assert call_args.kwargs.get("compression") == "ZSTD"
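
For reference, a direct (non-mocked) sketch of the same call, with the import
path assumed as in the earlier sketches; the compression string is forwarded
to pyarrow.parquet.write_table(..., compression=...), which is what the mock
above asserts:

import pandas
from google.cloud.bigquery import schema
from google.cloud.bigquery import _pandas_helpers  # assumed module path

bq_schema = (schema.SchemaField("field00", "STRING"),)
dataframe = pandas.DataFrame({"field00": ["foo", "bar"]})
_pandas_helpers.dataframe_to_parquet(
    dataframe, bq_schema, "/tmp/example.parquet", parquet_compression="ZSTD"
)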
Example #27
def bq_to_arrow_data_type(field):
    """Return the Arrow data type, corresponding to a given BigQuery column.

    Returns None if default Arrow type inspection should be used.
    """
    if field.mode is not None and field.mode.upper() == "REPEATED":
        inner_type = bq_to_arrow_data_type(
            schema.SchemaField(field.name, field.field_type))
        if inner_type:
            return pyarrow.list_(inner_type)
        return None

    if field.field_type.upper() in STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    data_type_constructor = BQ_TO_ARROW_SCALARS.get(field.field_type.upper())
    if data_type_constructor is None:
        return None
    return data_type_constructor()
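
A quick sketch of the three branches (import path assumed as above): REPEATED
fields wrap their scalar type in pyarrow.list_, known scalar types map through
BQ_TO_ARROW_SCALARS, and unknown types return None so Arrow's own inference
can take over:

import pyarrow
from google.cloud.bigquery import schema
from google.cloud.bigquery import _pandas_helpers  # assumed module path

repeated = schema.SchemaField("nums", "INTEGER", mode="REPEATED")
assert _pandas_helpers.bq_to_arrow_data_type(repeated) == pyarrow.list_(pyarrow.int64())

scalar = schema.SchemaField("name", "STRING")
assert _pandas_helpers.bq_to_arrow_data_type(scalar) == pyarrow.string()

unknown = schema.SchemaField("mystery", "UNKNOWN_TYPE")
assert _pandas_helpers.bq_to_arrow_data_type(unknown) is None  # newer versions also warn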
Example #28
def test_dataframe_to_arrow_w_required_fields(module_under_test):
    bq_schema = (
        schema.SchemaField("field01", "STRING", mode="REQUIRED"),
        schema.SchemaField("field02", "BYTES", mode="REQUIRED"),
        schema.SchemaField("field03", "INTEGER", mode="REQUIRED"),
        schema.SchemaField("field04", "INT64", mode="REQUIRED"),
        schema.SchemaField("field05", "FLOAT", mode="REQUIRED"),
        schema.SchemaField("field06", "FLOAT64", mode="REQUIRED"),
        schema.SchemaField("field07", "NUMERIC", mode="REQUIRED"),
        schema.SchemaField("field08", "BOOLEAN", mode="REQUIRED"),
        schema.SchemaField("field09", "BOOL", mode="REQUIRED"),
        schema.SchemaField("field10", "TIMESTAMP", mode="REQUIRED"),
        schema.SchemaField("field11", "DATE", mode="REQUIRED"),
        schema.SchemaField("field12", "TIME", mode="REQUIRED"),
        schema.SchemaField("field13", "DATETIME", mode="REQUIRED"),
        schema.SchemaField("field14", "GEOGRAPHY", mode="REQUIRED"),
    )
    dataframe = pandas.DataFrame({
        "field01": ["hello", "world"],
        "field02": [b"abd", b"efg"],
        "field03": [1, 2],
        "field04": [3, 4],
        "field05": [1.25, 9.75],
        "field06": [-1.75, -3.5],
        "field07": [decimal.Decimal("1.2345"),
                    decimal.Decimal("6.7891")],
        "field08": [True, False],
        "field09": [False, True],
        "field10": [
            datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
            datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=pytz.utc),
        ],
        "field11": [datetime.date(9999, 12, 31),
                    datetime.date(1970, 1, 1)],
        "field12":
        [datetime.time(23, 59, 59, 999999),
         datetime.time(12, 0, 0)],
        "field13": [
            datetime.datetime(1970, 1, 1, 0, 0, 0),
            datetime.datetime(2012, 12, 21, 9, 7, 42),
        ],
        "field14": [
            "POINT(30 10)",
            "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
        ],
    })

    arrow_table = module_under_test.dataframe_to_arrow(dataframe, bq_schema)
    arrow_schema = arrow_table.schema

    assert len(arrow_schema) == len(bq_schema)
    for arrow_field in arrow_schema:
        assert not arrow_field.nullable
Example #29
def dataframe_to_bq_schema(dataframe, bq_schema):
    """Convert a pandas DataFrame schema to a BigQuery schema.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame for which the client determines the BigQuery schema.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A BigQuery schema. Use this argument to override the autodetected
            type for some or all of the DataFrame columns.

    Returns:
        Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]:
            The automatically determined schema. Returns None if the type of
            any column cannot be determined.
    """
    if bq_schema:
        bq_schema = schema._to_schema_fields(bq_schema)
        bq_schema_index = {field.name: field for field in bq_schema}
        bq_schema_unused = set(bq_schema_index.keys())
    else:
        bq_schema_index = {}
        bq_schema_unused = set()

    bq_schema_out = []
    unknown_type_fields = []

    for column, dtype in list_columns_and_indexes(dataframe):
        # Use provided type from schema, if present.
        bq_field = bq_schema_index.get(column)
        if bq_field:
            bq_schema_out.append(bq_field)
            bq_schema_unused.discard(bq_field.name)
            continue

        # Otherwise, try to automatically determine the type based on the
        # pandas dtype.
        bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
        bq_field = schema.SchemaField(column, bq_type)
        bq_schema_out.append(bq_field)

        if bq_field.field_type is None:
            unknown_type_fields.append(bq_field)

    # Catch any schema mismatch. The developer explicitly asked to serialize a
    # column, but it was not found.
    if bq_schema_unused:
        raise ValueError(
            u"bq_schema contains fields not present in dataframe: {}".format(
                bq_schema_unused))

    # If schema detection was not successful for all columns, also try with
    # pyarrow, if available.
    if unknown_type_fields:
        if not pyarrow:
            msg = u"Could not determine the type of columns: {}".format(
                ", ".join(field.name for field in unknown_type_fields))
            warnings.warn(msg)
            return None  # We cannot detect the schema in full.

        # The augment_schema() helper itself will also issue unknown type
        # warnings if detection still fails for any of the fields.
        bq_schema_out = augment_schema(dataframe, bq_schema_out)

    return tuple(bq_schema_out) if bq_schema_out else None
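
Unlike the version in Example #22, this variant accepts mapping-style schema
entries (normalized via schema._to_schema_fields) and falls back to pyarrow
for columns whose dtype is not in _PANDAS_DTYPE_TO_BQ. A minimal sketch,
import path assumed as above:

import pandas
from google.cloud.bigquery import _pandas_helpers  # assumed module path

df = pandas.DataFrame({"name": ["a", "b"], "score": [1.0, 2.0]})
out = _pandas_helpers.dataframe_to_bq_schema(
    df, [{"name": "name", "type": "STRING", "mode": "REQUIRED"}]
)
# "name" keeps the explicit REQUIRED STRING field; "score" is autodetected
# from its float64 dtype.
print(out)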
Example #30
# bq_type is supplied by pytest parametrization; the parametrize decorator
# is not included in this snippet.
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name",
                               bq_type,
                               mode="REPEATED",
                               fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected_value_type = pyarrow.struct((
        pyarrow.field("field01", pyarrow.string()),
        pyarrow.field("field02", pyarrow.binary()),
        pyarrow.field("field03", pyarrow.int64()),
        pyarrow.field("field04", pyarrow.int64()),
        pyarrow.field("field05", pyarrow.float64()),
        pyarrow.field("field06", pyarrow.float64()),
        pyarrow.field("field07", module_under_test.pyarrow_numeric()),
        pyarrow.field("field08", pyarrow.bool_()),
        pyarrow.field("field09", pyarrow.bool_()),
        pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
        pyarrow.field("field11", pyarrow.date32()),
        pyarrow.field("field12", module_under_test.pyarrow_time()),
        pyarrow.field("field13", module_under_test.pyarrow_datetime()),
        pyarrow.field("field14", pyarrow.string()),
    ))
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    assert actual.value_type.num_children == len(fields)
    assert actual.value_type.equals(expected_value_type)