def test_read_rows_to_dataframe_with_wide_table(client, project_id):
    # Use a wide table to boost the chance of getting a large message size.
    # https://github.com/googleapis/python-bigquery-storage/issues/78
    read_session = types.ReadSession()
    read_session.table = "projects/{}/datasets/{}/tables/{}".format(
        "bigquery-public-data", "geo_census_tracts", "us_census_tracts_national"
    )
    read_session.data_format = types.DataFormat.ARROW

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    stream = session.streams[0].name
    read_rows_stream = client.read_rows(stream)

    # fetch the first page of rows
    pages_iter = iter(read_rows_stream.rows(session).pages)
    some_rows = next(pages_iter)

    assert all(len(row["tract_geom"].as_py()) > 0 for row in some_rows)

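# The test above only inspects raw rows despite its name; a minimal sketch of
# actually materializing a dataframe from the same stream is shown below. It
# assumes pandas and pyarrow are installed and reuses the `client`, `session`,
# and `stream` names from the test; `to_dataframe` on the stream is part of
# the reader API, as used in the tutorial sample further down.
def read_wide_table_to_dataframe_sketch(client, session, stream):
    reader = client.read_rows(stream)
    # Convert the whole stream at once; the session supplies the schema.
    return reader.to_dataframe(session)
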
def test_rows_no_schema_set_raises_type_error(
    mut, class_under_test, mock_gapic_client, monkeypatch
):
    reader = class_under_test([], mock_gapic_client, "", 0, {})
    read_session = types.ReadSession()

    with pytest.raises(TypeError):
        reader.rows(read_session)

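# Counterpart sketch for the test above: once the session carries a schema,
# rows() should no longer raise. The fixtures and positional reader arguments
# mirror the test; the empty Avro record schema is made up for illustration,
# and consuming the resulting iterable would additionally require fastavro.
def test_rows_w_avro_schema_does_not_raise_sketch(class_under_test, mock_gapic_client):
    read_session = types.ReadSession()
    read_session.avro_schema.schema = json.dumps(
        {"type": "record", "name": "__root__", "fields": []}
    )
    reader = class_under_test([], mock_gapic_client, "", 0, {})

    reader.rows(read_session)  # no TypeError expected here
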
def test_ingestion_time_partitioned_table(
    client, project_id, ingest_partition_table_ref, bq_client, data_format
):
    data = [{"shape": "cigar", "altitude": 1200}, {"shape": "disc", "altitude": 750}]
    destination = _to_bq_table_ref(
        ingest_partition_table_ref, partition_suffix="$20190809"
    )
    bq_client.load_table_from_json(data, destination).result()

    data = [
        {"shape": "sphere", "altitude": 3500},
        {"shape": "doughnut", "altitude": 100},
    ]
    destination = _to_bq_table_ref(
        ingest_partition_table_ref, partition_suffix="$20190810"
    )
    bq_client.load_table_from_json(data, destination).result()

    data = [
        {"shape": "elephant", "altitude": 1},
        {"shape": "rocket", "altitude": 12700},
    ]
    destination = _to_bq_table_ref(
        ingest_partition_table_ref, partition_suffix="$20190811"
    )
    bq_client.load_table_from_json(data, destination).result()

    read_session = types.ReadSession()
    read_session.table = ingest_partition_table_ref
    read_session.data_format = data_format
    read_session.read_options.row_restriction = "DATE(_PARTITIONTIME) = '2019-08-10'"

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    assert session.streams  # there should be some data to fetch

    stream = session.streams[0].name
    rows = list(client.read_rows(stream).rows(session))
    assert len(rows) == 2

    if data_format == types.DataFormat.AVRO:
        actual_items = {(row["shape"], row["altitude"]) for row in rows}
    else:
        assert data_format == types.DataFormat.ARROW
        actual_items = {
            (row["shape"].as_py(), row["altitude"].as_py()) for row in rows
        }

    expected_items = {("sphere", 3500), ("doughnut", 100)}
    assert actual_items == expected_items

def test_session_to_dataframe(capsys, clients):
    from google.cloud.bigquery_storage import types

    bqclient, bqstorageclient = clients
    your_project_id = bqclient.project

    # [START bigquerystorage_pandas_tutorial_all]
    # [START bigquerystorage_pandas_tutorial_read_session]
    project_id = "bigquery-public-data"
    dataset_id = "new_york_trees"
    table_id = "tree_species"
    table = f"projects/{project_id}/datasets/{dataset_id}/tables/{table_id}"

    # Select columns to read with read options. If no read options are
    # specified, the whole table is read.
    read_options = types.ReadSession.TableReadOptions(
        selected_fields=["species_common_name", "fall_color"]
    )

    parent = "projects/{}".format(your_project_id)

    requested_session = types.ReadSession(
        table=table,
        # This API can also deliver data serialized in Apache Avro format.
        # This example leverages Apache Arrow.
        data_format=types.DataFormat.ARROW,
        read_options=read_options,
    )
    read_session = bqstorageclient.create_read_session(
        parent=parent, read_session=requested_session, max_stream_count=1,
    )

    # This example reads from only a single stream. Read from multiple streams
    # to fetch data faster. Note that the session may not contain any streams
    # if there are no rows to read.
    stream = read_session.streams[0]
    reader = bqstorageclient.read_rows(stream.name)

    # Parse all Arrow blocks and create a dataframe. This call requires a
    # session, because the session contains the schema for the row blocks.
    dataframe = reader.to_dataframe(read_session)
    print(dataframe.head())
    # [END bigquerystorage_pandas_tutorial_read_session]
    # [END bigquerystorage_pandas_tutorial_all]

    out, _ = capsys.readouterr()
    assert "species_common_name" in out

def test_basic_nonfiltered_read(client, project_id, table_with_data_ref, data_format):
    read_session = types.ReadSession()
    read_session.table = table_with_data_ref
    read_session.data_format = data_format

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    stream = session.streams[0].name
    rows = list(client.read_rows(stream).rows(session))

    assert len(rows) == 5  # all table rows

def test_filtered_rows_read(client, project_id, table_with_data_ref):
    read_session = types.ReadSession()
    read_session.table = table_with_data_ref
    read_session.data_format = types.DataFormat.AVRO
    read_session.read_options.row_restriction = "age >= 50"

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    stream = session.streams[0].name
    rows = list(client.read_rows(stream).rows(session))

    assert len(rows) == 2

def test_read_rows_as_rows_full_table(
    client, project_id, small_table_reference, data_format, expected_schema_type
):
    read_session = types.ReadSession()
    read_session.table = small_table_reference
    read_session.data_format = data_format

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    stream = session.streams[0].name
    rows = list(client.read_rows(stream).rows(session))

    assert len(rows) > 0

def test_column_partitioned_table(
    client, project_id, col_partition_table_ref, bq_client
):
    data = [
        {"description": "Tracking established.", "occurred": "2017-02-15"},
        {"description": "Look, a solar eclipse!", "occurred": "2018-02-15"},
        {"description": "Fake solar eclipse reported.", "occurred": "2018-02-15"},
        {"description": "1 day after false eclipse report.", "occurred": "2018-02-16"},
        {"description": "1 year after false eclipse report.", "occurred": "2019-02-15"},
    ]

    destination = _to_bq_table_ref(col_partition_table_ref)
    bq_client.load_table_from_json(data, destination).result()

    # Read from the table with a partition filter specified, and verify that
    # only the expected data is returned.
    read_session = types.ReadSession()
    read_session.table = col_partition_table_ref
    read_session.data_format = types.DataFormat.AVRO
    read_session.read_options.row_restriction = "occurred = '2018-02-15'"

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    assert session.streams  # there should be some data to fetch

    stream = session.streams[0].name
    rows = list(client.read_rows(stream).rows(session))
    assert len(rows) == 2

    expected_descriptions = ("Look, a solar eclipse!", "Fake solar eclipse reported.")
    for row in rows:
        assert row["occurred"] == dt.date(2018, 2, 15)
        assert row["description"] in expected_descriptions

def test_create_read_session(mock_transport, client_under_test):
    assert client_under_test._transport is mock_transport  # sanity check

    table = "projects/{}/datasets/{}/tables/{}".format(
        "data-project-id", "dataset_id", "table_id"
    )

    read_session = types.ReadSession()
    read_session.table = table

    client_under_test.create_read_session(
        parent="projects/other-project", read_session=read_session
    )

    expected_session_arg = types.CreateReadSessionRequest(
        parent="projects/other-project", read_session=read_session
    )
    rpc_callable = mock_transport._wrapped_methods[mock_transport.create_read_session]
    rpc_callable.assert_called_once_with(
        expected_session_arg, metadata=mock.ANY, retry=mock.ANY, timeout=mock.ANY
    )

def test_column_selection_read(client, project_id, table_with_data_ref, data_format):
    read_session = types.ReadSession()
    read_session.table = table_with_data_ref
    read_session.data_format = data_format
    read_session.read_options.selected_fields.append("first_name")
    read_session.read_options.selected_fields.append("age")

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    stream = session.streams[0].name
    rows = list(client.read_rows(stream).rows(session))

    for row in rows:
        assert sorted(row.keys()) == ["age", "first_name"]

def test_resuming_read_from_offset(
    client, project_id, data_format, local_shakespeare_table_reference
):
    read_session = types.ReadSession()
    read_session.table = local_shakespeare_table_reference
    read_session.data_format = data_format

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    assert session.streams  # there should be data available

    stream = session.streams[0].name
    read_rows_stream = client.read_rows(stream)

    # fetch the first two batches of rows
    rows_iter = iter(read_rows_stream)
    some_rows = next(rows_iter)
    more_rows = next(rows_iter)

    # fetch the rest of the rows using the stream offset
    offset = some_rows.row_count + more_rows.row_count
    remaining_rows_count = sum(
        1 for _ in client.read_rows(stream, offset=offset).rows(session)
    )

    # verify that the counts match
    expected_len = 164656  # total rows in shakespeare table
    actual_len = remaining_rows_count + some_rows.row_count + more_rows.row_count
    assert actual_len == expected_len

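# A minimal sketch of how the offset mechanism exercised above could be used
# to resume a long read after a hard failure. It assumes `client`, `stream`,
# and `session` as in the test; `handle_page` is a hypothetical application
# callback, and a real implementation would catch only transient gRPC errors
# rather than bare Exception.
def read_with_manual_resume_sketch(client, stream, session, handle_page):
    offset = 0
    while True:
        try:
            for page in client.read_rows(stream, offset=offset).rows(session).pages:
                handle_page(page)  # hypothetical callback
                offset += page.num_items  # advance past fully processed rows
            return
        except Exception:  # too broad; kept short for the sketch
            continue
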
def test_snapshot(client, project_id, table_with_data_ref, bq_client):
    before_new_data = types.Timestamp()
    before_new_data.GetCurrentTime()

    # load additional data into the table
    new_data = [
        {u"first_name": u"NewGuyFoo", u"last_name": u"Smith", u"age": 46},
        {u"first_name": u"NewGuyBar", u"last_name": u"Jones", u"age": 30},
    ]

    destination = _to_bq_table_ref(table_with_data_ref)
    bq_client.load_table_from_json(new_data, destination).result()

    # read data using the timestamp before the additional data load
    read_session = types.ReadSession()
    read_session.table = table_with_data_ref
    read_session.table_modifiers.snapshot_time = before_new_data
    read_session.data_format = types.DataFormat.AVRO

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    stream = session.streams[0].name
    rows = list(client.read_rows(stream).rows(session))

    # verify that only the data before the timestamp was returned
    assert len(rows) == 5  # all initial records
    for row in rows:
        assert "NewGuy" not in row["first_name"]  # no new records

def _generate_arrow_read_session(arrow_schema):
    return types.ReadSession(
        arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}
    )

def _generate_avro_read_session(avro_schema_json):
    schema = json.dumps(avro_schema_json)
    return types.ReadSession(avro_schema={"schema": schema})

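# A short usage sketch for the two helpers above, assuming pyarrow is
# available. The single "name" field is made up for illustration; in the unit
# tests these sessions stand in for what the server would return, so that
# reader.rows(session) can locate a schema.
def _example_generated_sessions_sketch():
    import pyarrow

    arrow_session = _generate_arrow_read_session(
        pyarrow.schema([pyarrow.field("name", pyarrow.string())])
    )
    avro_session = _generate_avro_read_session(
        {
            "type": "record",
            "name": "__root__",
            "fields": [{"name": "name", "type": "string"}],
        }
    )
    return arrow_session, avro_session
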
from google.cloud.bigquery_storage import BigQueryReadClient
from google.cloud.bigquery_storage import types
import os

# Use a raw string so the backslashes in the Windows path are not treated
# as escape sequences.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"D:\medium\example-apis\key\key_bqsa.json"
project_id = "medium-sandbox"

# Setting the client
client = BigQueryReadClient()

# Selecting the table
table = "projects/{}/datasets/{}/tables/{}".format(
    "bigquery-public-data", "world_bank_global_population", "population_by_country"
)
requested_session = types.ReadSession()
requested_session.table = table

# This API delivers data serialized in Apache Arrow or Avro format.
requested_session.data_format = types.DataFormat.ARROW

# Selecting the columns and applying a filter
requested_session.read_options.selected_fields = [
    "country",
    "year_1960",
    "year_1970",
    "year_1980",
    "year_1990",
    "year_2000",
    "year_2010",
    "year_2018",
]
requested_session.read_options.row_restriction = 'country_code = "PER"'

parent = "projects/{}".format(project_id)
session = client.create_read_session(
    parent=parent,
    read_session=requested_session,
    max_stream_count=1,
)

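# The snippet above stops at session creation; a minimal continuation under
# the same names, assuming pandas and pyarrow are installed, reads the single
# stream into a dataframe, mirroring the tutorial sample elsewhere in this
# section.
reader = client.read_rows(session.streams[0].name)
dataframe = reader.to_dataframe(session)
print(dataframe.head())
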
def main(project_id="your-project-id", snapshot_millis=0):
    # [START bigquerystorage_quickstart]
    from google.cloud.bigquery_storage import BigQueryReadClient
    from google.cloud.bigquery_storage import types

    # TODO(developer): Set the project_id variable.
    # project_id = 'your-project-id'
    #
    # The read session is created in this project. This project can be
    # different from that which contains the table.
    client = BigQueryReadClient()

    # This example reads baby name data from the public datasets.
    table = "projects/{}/datasets/{}/tables/{}".format(
        "bigquery-public-data", "usa_names", "usa_1910_current"
    )

    requested_session = types.ReadSession()
    requested_session.table = table
    # This API can also deliver data serialized in Apache Arrow format.
    # This example leverages Apache Avro.
    requested_session.data_format = types.DataFormat.AVRO

    # We limit the output columns to a subset of those allowed in the table,
    # and set a simple filter to only report names from the state of
    # Washington (WA).
    requested_session.read_options.selected_fields = ["name", "number", "state"]
    requested_session.read_options.row_restriction = 'state = "WA"'

    # Set a snapshot time if it's been specified.
    if snapshot_millis > 0:
        snapshot_time = types.Timestamp()
        snapshot_time.FromMilliseconds(snapshot_millis)
        requested_session.table_modifiers.snapshot_time = snapshot_time

    parent = "projects/{}".format(project_id)
    session = client.create_read_session(
        parent=parent,
        read_session=requested_session,
        # We'll use only a single stream for reading data from the table. However,
        # if you wanted to fan out multiple readers you could do so by having a
        # reader process each individual stream.
        max_stream_count=1,
    )
    reader = client.read_rows(session.streams[0].name)

    # The read stream contains blocks of Avro-encoded bytes. The rows() method
    # uses the fastavro library to parse these blocks as an iterable of Python
    # dictionaries. Install fastavro with the following command:
    #
    # pip install google-cloud-bigquery-storage[fastavro]
    rows = reader.rows(session)

    # Do any local processing by iterating over the rows. The
    # google-cloud-bigquery-storage client reconnects to the API after any
    # transient network errors or timeouts.
    names = set()
    states = set()

    for row in rows:
        names.add(row["name"])
        states.add(row["state"])

    print("Got {} unique names in states: {}".format(len(names), ", ".join(states)))
    # [END bigquerystorage_quickstart]

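# A sketch of a command-line entry point for the quickstart above; the
# argument names and help strings here are illustrative assumptions, not
# part of the published sample.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="BigQuery Storage API quickstart.")
    parser.add_argument(
        "project_id", help="Google Cloud project ID for the read session."
    )
    parser.add_argument(
        "--snapshot_millis",
        default=0,
        type=int,
        help="Snapshot time in milliseconds since the epoch.",
    )
    args = parser.parse_args()
    main(args.project_id, args.snapshot_millis)
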
def test_decoding_data_types(
    client, project_id, all_types_table_ref, bq_client, data_format
):
    data = [
        {
            u"string_field": u"Price: € 9.95.",
            u"bytes_field": bigquery._helpers._bytes_to_json(b"byteees"),
            u"int64_field": -1085,
            u"float64_field": -42.195,
            u"numeric_field": "1.4142",
            u"bool_field": True,
            u"geography_field": '{"type": "Point", "coordinates": [-49.3028, 69.0622]}',
            u"person_struct_field": {u"name": u"John", u"age": 42},
            u"timestamp_field": 1565357902.017896,  # 2019-08-09T13:38:22.017896
            u"date_field": u"1995-03-17",
            u"time_field": u"16:24:51",
            u"datetime_field": u"2005-10-26T19:49:41",
            u"string_array_field": [u"foo", u"bar", u"baz"],
        }
    ]

    # Explicit schema is needed to recognize bytes_field as BYTES, and not STRING.
    # Since partial schemas are not supported in load_table_from_json(), a full
    # schema needs to be specified.
    schema = [
        bigquery.SchemaField("string_field", "STRING"),
        bigquery.SchemaField("bytes_field", "BYTES"),
        bigquery.SchemaField("int64_field", "INT64"),
        bigquery.SchemaField("float64_field", "FLOAT64"),
        bigquery.SchemaField("numeric_field", "NUMERIC"),
        bigquery.SchemaField("bool_field", "BOOL"),
        bigquery.SchemaField("geography_field", "GEOGRAPHY"),
        bigquery.SchemaField(
            "person_struct_field",
            "STRUCT",
            fields=(
                bigquery.SchemaField("name", "STRING"),
                bigquery.SchemaField("age", "INT64"),
            ),
        ),
        bigquery.SchemaField("timestamp_field", "TIMESTAMP"),
        bigquery.SchemaField("date_field", "DATE"),
        bigquery.SchemaField("time_field", "TIME"),
        bigquery.SchemaField("datetime_field", "DATETIME"),
        bigquery.SchemaField("string_array_field", "STRING", mode="REPEATED"),
    ]
    job_config = bigquery.LoadJobConfig(schema=schema)
    destination = _to_bq_table_ref(all_types_table_ref)
    bq_client.load_table_from_json(data, destination, job_config=job_config).result()

    read_session = types.ReadSession()
    read_session.table = all_types_table_ref
    read_session.data_format = data_format

    session = client.create_read_session(
        request={
            "parent": "projects/{}".format(project_id),
            "read_session": read_session,
            "max_stream_count": 1,
        }
    )

    assert session.streams  # there should be data available

    stream = session.streams[0].name

    if data_format == types.DataFormat.AVRO:
        rows = list(client.read_rows(stream).rows(session))
    else:
        assert data_format == types.DataFormat.ARROW
        rows = list(
            dict((key, value.as_py()) for key, value in row_dict.items())
            for row_dict in client.read_rows(stream).rows(session)
        )

    expected_result = {
        u"string_field": u"Price: € 9.95.",
        u"bytes_field": b"byteees",
        u"int64_field": -1085,
        u"float64_field": -42.195,
        u"numeric_field": decimal.Decimal("1.4142"),
        u"bool_field": True,
        u"geography_field": "POINT(-49.3028 69.0622)",
        u"person_struct_field": {u"name": u"John", u"age": 42},
        u"timestamp_field": dt.datetime(
            2019, 8, 9, 13, 38, 22, 17896, tzinfo=pytz.UTC
        ),
        u"date_field": dt.date(1995, 3, 17),
        u"time_field": dt.time(16, 24, 51),
        u"string_array_field": [u"foo", u"bar", u"baz"],
    }

    result_copy = copy.copy(rows[0])
    del result_copy["datetime_field"]
    assert result_copy == expected_result

    # Compare datetime separately, AVRO and PYARROW return different object types,
    # although they should both represent the same value.
    # TODO: when fixed, change assertion to assert a datetime instance!
    expected_pattern = re.compile(r"2005-10-26( |T)19:49:41")
    assert expected_pattern.match(str(rows[0]["datetime_field"]))