def test_rows_no_schema_set_raises_type_error(
    mut, class_under_test, mock_client, monkeypatch
):
    reader = class_under_test(
        [], mock_client, bigquery_storage_v1beta1.types.StreamPosition(), {}
    )
    read_session = bigquery_storage_v1beta1.types.ReadSession()

    with pytest.raises(TypeError):
        reader.rows(read_session)

def test_avro_rows_raises_import_error(
    mut, class_under_test, mock_client, monkeypatch
):
    monkeypatch.setattr(mut, "fastavro", None)
    reader = class_under_test(
        [], mock_client, bigquery_storage_v1beta1.types.StreamPosition(), {}
    )
    bq_columns = [{"name": "int_col", "type": "int64"}]
    avro_schema = _bq_to_avro_schema(bq_columns)
    read_session = _generate_avro_read_session(avro_schema)

    with pytest.raises(ImportError):
        reader.rows(read_session)

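# The Avro tests in this module lean on two schema helpers that are not shown
# in this excerpt. A minimal sketch of what they are assumed to look like
# follows; the BQ_TO_AVRO_TYPES mapping is an illustrative assumption covering
# only common scalar types, and SCALAR_COLUMNS may require more entries.
import json

BQ_TO_AVRO_TYPES = {
    "int64": "long",
    "float64": "double",
    "bool": "boolean",
    "string": "string",
    "bytes": "bytes",
}


def _bq_to_avro_schema(bq_columns):
    """Convert BigQuery column descriptions into an Avro record schema."""
    fields = [
        {"name": column["name"], "type": ["null", BQ_TO_AVRO_TYPES[column["type"]]]}
        for column in bq_columns
    ]
    return {"type": "record", "name": "__root__", "fields": fields}


def _generate_avro_read_session(avro_schema):
    """Wrap an Avro schema in a ReadSession, as the API would return one."""
    return bigquery_storage_v1beta1.types.ReadSession(
        avro_schema={"schema": json.dumps(avro_schema)}
    )
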
def test_rows_w_timeout(class_under_test, mock_client):
    bq_columns = [{"name": "int_col", "type": "int64"}]
    avro_schema = _bq_to_avro_schema(bq_columns)
    read_session = _generate_avro_read_session(avro_schema)
    bq_blocks_1 = [
        [{"int_col": 123}, {"int_col": 234}],
        [{"int_col": 345}, {"int_col": 456}],
    ]
    avro_blocks_1 = _avro_blocks_w_deadline(
        _bq_to_avro_blocks(bq_blocks_1, avro_schema)
    )
    bq_blocks_2 = [[{"int_col": 567}, {"int_col": 789}], [{"int_col": 890}]]
    avro_blocks_2 = _bq_to_avro_blocks(bq_blocks_2, avro_schema)
    mock_client.read_rows.return_value = avro_blocks_2
    stream_position = bigquery_storage_v1beta1.types.StreamPosition(
        stream={"name": "test"}
    )

    reader = class_under_test(
        avro_blocks_1,
        mock_client,
        stream_position,
        {"metadata": {"test-key": "test-value"}},
    )

    with pytest.raises(google.api_core.exceptions.DeadlineExceeded):
        list(reader.rows(read_session))

    # Don't reconnect on DeadlineExceeded. This allows user-specified timeouts
    # to be respected.
    mock_client.read_rows.assert_not_called()

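# A minimal sketch of the fault-injection helper used above. It is assumed to
# replay the given response blocks and then raise DeadlineExceeded, simulating
# a stream that times out after delivering some data. Because it is a
# generator, the error surfaces mid-iteration, just as it would on a live
# gRPC stream.
def _avro_blocks_w_deadline(avro_blocks):
    for block in avro_blocks:
        yield block
    raise google.api_core.exceptions.DeadlineExceeded("test: timeout, no reconnect")
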
def test_rows_w_reconnect_by_page(class_under_test, mock_client):
    bq_columns = [{"name": "int_col", "type": "int64"}]
    avro_schema = _bq_to_avro_schema(bq_columns)
    read_session = _generate_avro_read_session(avro_schema)
    bq_blocks_1 = [
        [{"int_col": 123}, {"int_col": 234}],
        [{"int_col": 345}, {"int_col": 456}],
    ]
    avro_blocks_1 = _bq_to_avro_blocks(bq_blocks_1, avro_schema)
    bq_blocks_2 = [[{"int_col": 567}, {"int_col": 789}], [{"int_col": 890}]]
    avro_blocks_2 = _bq_to_avro_blocks(bq_blocks_2, avro_schema)

    avro_blocks_1[0].status.estimated_row_count = 8
    avro_blocks_1[1].status.estimated_row_count = 6
    avro_blocks_2[0].status.estimated_row_count = 9
    avro_blocks_2[1].status.estimated_row_count = 7

    mock_client.read_rows.return_value = avro_blocks_2
    stream_position = bigquery_storage_v1beta1.types.StreamPosition(
        stream={"name": "test"}
    )

    reader = class_under_test(
        _pages_w_unavailable(avro_blocks_1),
        mock_client,
        stream_position,
        {"metadata": {"test-key": "test-value"}},
    )
    got = reader.rows(read_session)
    pages = iter(got.pages)

    assert got.total_rows is None

    page_1 = next(pages)
    assert got.total_rows == 8
    assert page_1.num_items == 2
    assert page_1.remaining == 2
    assert tuple(page_1) == tuple(bq_blocks_1[0])
    assert page_1.num_items == 2
    assert page_1.remaining == 0

    page_2 = next(pages)
    assert got.total_rows == 6
    assert next(page_2) == bq_blocks_1[1][0]
    assert page_2.num_items == 2
    assert page_2.remaining == 1
    assert next(page_2) == bq_blocks_1[1][1]

    page_3 = next(pages)
    assert tuple(page_3) == tuple(bq_blocks_2[0])
    assert page_3.num_items == 2
    assert page_3.remaining == 0
    assert got.total_rows == 9

    page_4 = next(pages)
    assert got.total_rows == 7
    assert tuple(page_4) == tuple(bq_blocks_2[1])
    assert page_4.num_items == 1
    assert page_4.remaining == 0

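# A minimal sketch of the companion fault injector assumed by the reconnect
# tests. ServiceUnavailable is the error the reader treats as retryable, so
# tests wrap their first batch of pages in this generator to force a
# reconnect through mock_client.read_rows.
def _pages_w_unavailable(pages):
    for page in pages:
        yield page
    raise google.api_core.exceptions.ServiceUnavailable("test: please reconnect")
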
def test_to_arrow_no_pyarrow_raises_import_error(
    mut, class_under_test, mock_client, monkeypatch
):
    monkeypatch.setattr(mut, "pyarrow", None)
    arrow_schema = _bq_to_arrow_schema(SCALAR_COLUMNS)
    read_session = _generate_arrow_read_session(arrow_schema)
    arrow_batches = _bq_to_arrow_batches(SCALAR_BLOCKS, arrow_schema)
    reader = class_under_test(
        arrow_batches, mock_client, bigquery_storage_v1beta1.types.StreamPosition(), {}
    )

    with pytest.raises(ImportError):
        reader.to_arrow(read_session)

    with pytest.raises(ImportError):
        reader.rows(read_session).to_arrow()

    with pytest.raises(ImportError):
        next(reader.rows(read_session).pages).to_arrow()

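# A minimal sketch of the Arrow-side schema helpers these tests assume. The
# BQ_TO_ARROW_TYPES mapping is an illustrative assumption covering common
# scalar types; SCALAR_COLUMNS may require more entries. Note the tests above
# monkeypatch pyarrow on the module under test (mut), not this import.
import pyarrow

BQ_TO_ARROW_TYPES = {
    "int64": pyarrow.int64(),
    "float64": pyarrow.float64(),
    "bool": pyarrow.bool_(),
    "string": pyarrow.string(),
    "bytes": pyarrow.binary(),
}


def _bq_to_arrow_schema(bq_columns):
    """Convert BigQuery column descriptions into a pyarrow schema."""
    return pyarrow.schema(
        pyarrow.field(column["name"], BQ_TO_ARROW_TYPES[column["type"]])
        for column in bq_columns
    )


def _generate_arrow_read_session(arrow_schema):
    """Embed a serialized Arrow schema in a ReadSession, as the API would."""
    return bigquery_storage_v1beta1.types.ReadSession(
        arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()}
    )
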
def test_rows_w_empty_stream_arrow(class_under_test, mock_client):
    bq_columns = [{"name": "int_col", "type": "int64"}]
    arrow_schema = _bq_to_arrow_schema(bq_columns)
    read_session = _generate_arrow_read_session(arrow_schema)
    reader = class_under_test(
        [], mock_client, bigquery_storage_v1beta1.types.StreamPosition(), {}
    )

    got = reader.rows(read_session)
    assert got.total_rows is None
    assert tuple(got) == ()

def test_to_dataframe_no_pandas_raises_import_error(
    mut, class_under_test, mock_client, monkeypatch
):
    monkeypatch.setattr(mut, "pandas", None)
    avro_schema = _bq_to_avro_schema(SCALAR_COLUMNS)
    read_session = _generate_avro_read_session(avro_schema)
    avro_blocks = _bq_to_avro_blocks(SCALAR_BLOCKS, avro_schema)
    reader = class_under_test(
        avro_blocks, mock_client, bigquery_storage_v1beta1.types.StreamPosition(), {}
    )

    with pytest.raises(ImportError):
        reader.to_dataframe(read_session)

    with pytest.raises(ImportError):
        reader.rows(read_session).to_dataframe()

    with pytest.raises(ImportError):
        next(reader.rows(read_session).pages).to_dataframe()

def test_rows_w_empty_stream(class_under_test, mock_client):
    bq_columns = [{"name": "int_col", "type": "int64"}]
    avro_schema = _bq_to_avro_schema(bq_columns)
    read_session = _generate_avro_read_session(avro_schema)
    reader = class_under_test(
        [], mock_client, bigquery_storage_v1beta1.types.StreamPosition(), {}
    )

    got = reader.rows(read_session)
    assert got.total_rows is None
    assert tuple(got) == ()

def test_rows_w_scalars_arrow(class_under_test, mock_client):
    arrow_schema = _bq_to_arrow_schema(SCALAR_COLUMNS)
    read_session = _generate_arrow_read_session(arrow_schema)
    arrow_batches = _bq_to_arrow_batches(SCALAR_BLOCKS, arrow_schema)
    reader = class_under_test(
        arrow_batches, mock_client, bigquery_storage_v1beta1.types.StreamPosition(), {}
    )
    got = tuple(reader.rows(read_session))

    expected = tuple(itertools.chain.from_iterable(SCALAR_BLOCKS))
    assert got == expected

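# A minimal sketch of how the Arrow tests are assumed to fake server
# responses: each block of rows becomes one ReadRowsResponse carrying a
# serialized RecordBatch, which is what the reader deserializes.
def _bq_to_arrow_batches(bq_blocks, arrow_schema):
    arrow_batches = []
    for block in bq_blocks:
        arrays = [
            pyarrow.array([row[field.name] for row in block], type=field.type)
            for field in arrow_schema
        ]
        response = bigquery_storage_v1beta1.types.ReadRowsResponse()
        response.arrow_record_batch.serialized_record_batch = (
            pyarrow.RecordBatch.from_arrays(arrays, schema=arrow_schema)
            .serialize()
            .to_pybytes()
        )
        arrow_batches.append(response)
    return arrow_batches
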
def test_rows_w_scalars(class_under_test, mock_client):
    avro_schema = _bq_to_avro_schema(SCALAR_COLUMNS)
    read_session = _generate_avro_read_session(avro_schema)
    avro_blocks = _bq_to_avro_blocks(SCALAR_BLOCKS, avro_schema)
    reader = class_under_test(
        avro_blocks, mock_client, bigquery_storage_v1beta1.types.StreamPosition(), {}
    )
    got = tuple(reader.rows(read_session))

    expected = tuple(itertools.chain.from_iterable(SCALAR_BLOCKS))
    assert got == expected

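# A minimal sketch of the Avro counterpart: rows are written with fastavro's
# schemaless writer into one ReadRowsResponse per block, mimicking the wire
# format the reader parses. The exact response field carrying the row count
# is an assumption (avro_rows.row_count here).
import io

import fastavro


def _bq_to_avro_blocks(bq_blocks, avro_schema_json):
    avro_schema = fastavro.parse_schema(avro_schema_json)
    avro_blocks = []
    for block in bq_blocks:
        blockio = io.BytesIO()
        for row in block:
            fastavro.schemaless_writer(blockio, avro_schema, row)
        response = bigquery_storage_v1beta1.types.ReadRowsResponse()
        response.avro_rows.row_count = len(block)
        response.avro_rows.serialized_binary_rows = blockio.getvalue()
        avro_blocks.append(response)
    return avro_blocks
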
def test_rows_w_reconnect(class_under_test, mock_client):
    bq_columns = [{"name": "int_col", "type": "int64"}]
    avro_schema = _bq_to_avro_schema(bq_columns)
    read_session = _generate_avro_read_session(avro_schema)
    bq_blocks_1 = [
        [{"int_col": 123}, {"int_col": 234}],
        [{"int_col": 345}, {"int_col": 456}],
    ]
    avro_blocks_1 = _pages_w_unavailable(_bq_to_avro_blocks(bq_blocks_1, avro_schema))
    bq_blocks_2 = [[{"int_col": 567}, {"int_col": 789}], [{"int_col": 890}]]
    avro_blocks_2 = _bq_to_avro_blocks(bq_blocks_2, avro_schema)

    for block in avro_blocks_2:
        block.status.estimated_row_count = 7

    mock_client.read_rows.return_value = avro_blocks_2
    stream_position = bigquery_storage_v1beta1.types.StreamPosition(
        stream={"name": "test"}
    )

    reader = class_under_test(
        avro_blocks_1,
        mock_client,
        stream_position,
        {"metadata": {"test-key": "test-value"}},
    )
    got = reader.rows(read_session)

    expected = tuple(
        itertools.chain(
            itertools.chain.from_iterable(bq_blocks_1),
            itertools.chain.from_iterable(bq_blocks_2),
        )
    )

    assert tuple(got) == expected
    assert got.total_rows == 7
    mock_client.read_rows.assert_called_once_with(
        bigquery_storage_v1beta1.types.StreamPosition(
            stream={"name": "test"}, offset=4
        ),
        metadata={"test-key": "test-value"},
    )

def test_rows_w_nonresumable_internal_error(class_under_test, mock_client):
    bq_columns = [{"name": "int_col", "type": "int64"}]
    avro_schema = _bq_to_avro_schema(bq_columns)
    read_session = _generate_avro_read_session(avro_schema)
    bq_blocks = [[{"int_col": 1024}, {"int_col": 512}], [{"int_col": 256}]]
    avro_blocks = _pages_w_nonresumable_internal_error(
        _bq_to_avro_blocks(bq_blocks, avro_schema)
    )
    stream_position = bigquery_storage_v1beta1.types.StreamPosition(
        stream={"name": "test"}
    )

    reader = class_under_test(avro_blocks, mock_client, stream_position, {})

    with pytest.raises(
        google.api_core.exceptions.InternalServerError, match="nonresumable error"
    ):
        list(reader.rows(read_session))

    mock_client.read_rows.assert_not_called()

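# A minimal sketch of the non-resumable fault injector used above: it raises
# an InternalServerError whose message deliberately does not look like a
# retryable condition, so no reconnect is attempted. The exact message is an
# assumption; this test only requires that it match "nonresumable error".
def _pages_w_nonresumable_internal_error(pages):
    for page in pages:
        yield page
    raise google.api_core.exceptions.InternalServerError(
        "INTERNAL: Got a nonresumable error."
    )
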
def test_to_dataframe_by_page_arrow(class_under_test, mock_client):
    bq_columns = [
        {"name": "int_col", "type": "int64"},
        {"name": "bool_col", "type": "bool"},
    ]
    arrow_schema = _bq_to_arrow_schema(bq_columns)
    read_session = _generate_arrow_read_session(arrow_schema)

    bq_block_1 = [
        {"int_col": 123, "bool_col": True},
        {"int_col": 234, "bool_col": False},
    ]
    bq_block_2 = [
        {"int_col": 345, "bool_col": True},
        {"int_col": 456, "bool_col": False},
    ]
    bq_block_3 = [
        {"int_col": 567, "bool_col": True},
        {"int_col": 789, "bool_col": False},
    ]
    bq_block_4 = [{"int_col": 890, "bool_col": True}]
    # Break blocks into two groups to test that iteration continues across
    # reconnection.
    bq_blocks_1 = [bq_block_1, bq_block_2]
    bq_blocks_2 = [bq_block_3, bq_block_4]
    batch_1 = _bq_to_arrow_batches(bq_blocks_1, arrow_schema)
    batch_2 = _bq_to_arrow_batches(bq_blocks_2, arrow_schema)
    mock_client.read_rows.return_value = batch_2

    reader = class_under_test(
        _pages_w_unavailable(batch_1),
        mock_client,
        bigquery_storage_v1beta1.types.StreamPosition(),
        {},
    )
    got = reader.rows(read_session)
    pages = iter(got.pages)

    page_1 = next(pages)
    pandas.testing.assert_frame_equal(
        page_1.to_dataframe(
            dtypes={"int_col": "int64", "bool_col": "bool"}
        ).reset_index(drop=True),
        pandas.DataFrame(bq_block_1, columns=["int_col", "bool_col"]).reset_index(
            drop=True
        ),
    )

    page_2 = next(pages)
    pandas.testing.assert_frame_equal(
        page_2.to_dataframe().reset_index(drop=True),
        pandas.DataFrame(bq_block_2, columns=["int_col", "bool_col"]).reset_index(
            drop=True
        ),
    )

    page_3 = next(pages)
    pandas.testing.assert_frame_equal(
        page_3.to_dataframe().reset_index(drop=True),
        pandas.DataFrame(bq_block_3, columns=["int_col", "bool_col"]).reset_index(
            drop=True
        ),
    )

    page_4 = next(pages)
    pandas.testing.assert_frame_equal(
        page_4.to_dataframe().reset_index(drop=True),
        pandas.DataFrame(bq_block_4, columns=["int_col", "bool_col"]).reset_index(
            drop=True
        ),
    )

def test_to_dataframe_by_page(class_under_test, mock_client):
    bq_columns = [
        {"name": "int_col", "type": "int64"},
        {"name": "bool_col", "type": "bool"},
    ]
    avro_schema = _bq_to_avro_schema(bq_columns)
    read_session = _generate_avro_read_session(avro_schema)
    block_1 = [{"int_col": 123, "bool_col": True}, {"int_col": 234, "bool_col": False}]
    block_2 = [{"int_col": 345, "bool_col": True}, {"int_col": 456, "bool_col": False}]
    block_3 = [{"int_col": 567, "bool_col": True}, {"int_col": 789, "bool_col": False}]
    block_4 = [{"int_col": 890, "bool_col": True}]
    # Break blocks into two groups to test that iteration continues across
    # reconnection.
    bq_blocks_1 = [block_1, block_2]
    bq_blocks_2 = [block_3, block_4]
    avro_blocks_1 = _bq_to_avro_blocks(bq_blocks_1, avro_schema)
    avro_blocks_2 = _bq_to_avro_blocks(bq_blocks_2, avro_schema)

    mock_client.read_rows.return_value = avro_blocks_2
    stream_position = bigquery_storage_v1beta1.types.StreamPosition(
        stream={"name": "test"}
    )

    reader = class_under_test(
        _pages_w_unavailable(avro_blocks_1),
        mock_client,
        stream_position,
        {"metadata": {"test-key": "test-value"}},
    )
    got = reader.rows(read_session)
    pages = iter(got.pages)

    page_1 = next(pages)
    pandas.testing.assert_frame_equal(
        page_1.to_dataframe().reset_index(drop=True),
        pandas.DataFrame(block_1, columns=["int_col", "bool_col"]).reset_index(
            drop=True
        ),
    )

    page_2 = next(pages)
    pandas.testing.assert_frame_equal(
        page_2.to_dataframe().reset_index(drop=True),
        pandas.DataFrame(block_2, columns=["int_col", "bool_col"]).reset_index(
            drop=True
        ),
    )

    page_3 = next(pages)
    pandas.testing.assert_frame_equal(
        page_3.to_dataframe().reset_index(drop=True),
        pandas.DataFrame(block_3, columns=["int_col", "bool_col"]).reset_index(
            drop=True
        ),
    )

    page_4 = next(pages)
    pandas.testing.assert_frame_equal(
        page_4.to_dataframe().reset_index(drop=True),
        pandas.DataFrame(block_4, columns=["int_col", "bool_col"]).reset_index(
            drop=True
        ),
    )