def test_create_tables_from_dict(self):
    # type: () -> None
    self.client.create_tables_from_dict({
        'empty_1': [
            SchemaField('col1', 'INTEGER'),
            SchemaField('col2', 'STRING'),
        ],
        'empty_2': [
            SchemaField('col1', 'FLOAT'),
            SchemaField('col2', 'INTEGER'),
        ]
    })
    self.assertEqual([('col1', 'INTEGER', 'NULLABLE'),
                      ('col2', 'STRING', 'NULLABLE')],
                     [(x.name, x.field_type, x.mode)
                      for x in self.client.get_schema(
                          self.default_test_dataset_id, 'empty_1')])
    self.assertEqual([('col1', 'FLOAT', 'NULLABLE'),
                      ('col2', 'INTEGER', 'NULLABLE')],
                     [(x.name, x.field_type, x.mode)
                      for x in self.client.get_schema(
                          self.default_test_dataset_id, 'empty_2')])

def test_simple_query(bqtk: BQTestKit):
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        schema = [SchemaField("f1", field_type="STRING")]
        with ds.table("table_bar", schema=schema).isolate() as t:
            result = bqtk.query_template(
                from_=f"select count(*) as nb from `{t.fqdn()}`").run()
            assert len(result.schema) == 1
            assert result.schema[0].name == "nb"
            assert str.upper(result.schema[0].field_type) in ["INTEGER", "INT64"]
            assert len(result.rows) == 1
            assert result.total_rows == 1
            assert result.rows[0]["nb"] == 0

def test_execute_w_query(self):
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery import dbapi

    connection = dbapi.connect(self._mock_client(
        rows=[('hello', 'world', 1), ('howdy', 'y\'all', 2)],
        schema=[
            SchemaField('a', 'STRING', mode='NULLABLE'),
            SchemaField('b', 'STRING', mode='REQUIRED'),
            SchemaField('c', 'INTEGER', mode='NULLABLE')]))
    cursor = connection.cursor()
    cursor.execute('SELECT a, b, c FROM hello_world WHERE d > 3;')

    # Verify the description.
    self.assertEqual(len(cursor.description), 3)
    a_name, a_type, _, _, _, _, a_null_ok = cursor.description[0]
    self.assertEqual(a_name, 'a')
    self.assertEqual(a_type, 'STRING')
    self.assertEqual(a_type, dbapi.STRING)
    self.assertTrue(a_null_ok)
    b_name, b_type, _, _, _, _, b_null_ok = cursor.description[1]
    self.assertEqual(b_name, 'b')
    self.assertEqual(b_type, 'STRING')
    self.assertEqual(b_type, dbapi.STRING)
    self.assertFalse(b_null_ok)
    c_name, c_type, _, _, _, _, c_null_ok = cursor.description[2]
    self.assertEqual(c_name, 'c')
    self.assertEqual(c_type, 'INTEGER')
    self.assertEqual(c_type, dbapi.NUMBER)
    self.assertTrue(c_null_ok)

    # Verify the results.
    self.assertEqual(cursor.rowcount, 2)
    row = cursor.fetchone()
    self.assertEqual(row, ('hello', 'world', 1))
    row = cursor.fetchone()
    self.assertEqual(row, ('howdy', 'y\'all', 2))
    row = cursor.fetchone()
    self.assertIsNone(row)

def test_to_arrow_w_tqdm_wo_query_plan():
    from google.cloud.bigquery import table
    from google.cloud.bigquery.job import QueryJob as target_class
    from google.cloud.bigquery.schema import SchemaField

    begun_resource = _make_job_resource(job_type="query")
    rows = [
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
    ]
    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    connection = _make_connection({})
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = table.RowIterator(client, api_request, path, schema)

    reload_patch = mock.patch(
        "google.cloud.bigquery.job._AsyncJob.reload", autospec=True
    )
    result_patch = mock.patch(
        "google.cloud.bigquery.job.QueryJob.result",
        side_effect=[concurrent.futures.TimeoutError, row_iterator],
    )

    with result_patch as result_patch_tqdm, reload_patch:
        tbl = job.to_arrow(progress_bar_type="tqdm", create_bqstorage_client=False)

    assert result_patch_tqdm.call_count == 2
    assert isinstance(tbl, pyarrow.Table)
    assert tbl.num_rows == 2
    result_patch_tqdm.assert_called()

def _get_schema_for_field(self, column: str) -> SchemaField:
    field_type = 'STRING'
    field_mode = 'REQUIRED'
    concrete_field_type = self._get_type_for_field(column)
    if date == concrete_field_type:
        field_type = 'DATE'
    elif int == concrete_field_type:
        field_type = 'INT64'
    elif float == concrete_field_type:
        field_type = 'FLOAT64'
    return SchemaField(column, field_type, field_mode)

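# A behaviour sketch for the mapping above (assuming _get_type_for_field returns a
# Python type such as date, int, or float; anything else keeps the STRING default):
#
#   self._get_schema_for_field('day')    # -> SchemaField('day', 'DATE', 'REQUIRED')
#   self._get_schema_for_field('count')  # -> SchemaField('count', 'INT64', 'REQUIRED')
#   self._get_schema_for_field('note')   # -> SchemaField('note', 'STRING', 'REQUIRED')
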
def test_w_description(self):
    from google.cloud.bigquery.schema import SchemaField

    DESCRIPTION = 'DESCRIPTION'
    full_name = SchemaField('full_name', 'STRING', mode='REQUIRED',
                            description=DESCRIPTION)
    age = SchemaField('age', 'INTEGER', mode='REQUIRED')
    resource = self._call_fut([full_name, age])
    self.assertEqual(len(resource), 2)
    self.assertEqual(resource[0],
                     {'name': 'full_name',
                      'type': 'STRING',
                      'mode': 'REQUIRED',
                      'description': DESCRIPTION})
    self.assertEqual(resource[1],
                     {'name': 'age',
                      'type': 'INTEGER',
                      'mode': 'REQUIRED'})

def test_w_description(self):
    from google.cloud.bigquery.schema import SchemaField

    DESCRIPTION = "DESCRIPTION"
    full_name = SchemaField(
        "full_name", "STRING", mode="REQUIRED", description=DESCRIPTION
    )
    age = SchemaField("age", "INTEGER", mode="REQUIRED")
    resource = self._call_fut([full_name, age])
    self.assertEqual(len(resource), 2)
    self.assertEqual(
        resource[0],
        {
            "name": "full_name",
            "type": "STRING",
            "mode": "REQUIRED",
            "description": DESCRIPTION,
        },
    )
    self.assertEqual(
        resource[1],
        {"name": "age", "type": "INTEGER", "mode": "REQUIRED", "description": None},
    )

def test_query_write_mode(bqtk: BQTestKit):
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        source_schema = [SchemaField("f1", field_type="STRING")]
        target_schema = [
            SchemaField("f1", field_type="STRING"),
            SchemaField("f2", field_type="INT64")
        ]
        with ds.table("table_bar", schema=source_schema).isolate() as t_source:
            with ds.table("table_foobar", schema=target_schema).isolate() as t_target:
                bqtk.query_template(
                    from_=f"select count(*) as f2, 'test' as f1 from `{t_source.fqdn()}`") \
                    .with_destination(t_target).run()
                result = bqtk.query_template(
                    from_=f"select * from `{t_target.fqdn()}` order by f2").run()
                assert result.rows == [{"f1": "test", "f2": 0}]

                bqtk.query_template(from_="select 'test2' as f1, 2 as f2") \
                    .with_destination(t_target).append().run()
                result = bqtk.query_template(
                    from_=f"select * from `{t_target.fqdn()}` order by f2").run()
                assert result.rows == [{"f1": "test", "f2": 0},
                                       {"f1": "test2", "f2": 2}]

                bqtk.query_template(from_="select 'test3' as f1, 3 as f2") \
                    .with_destination(t_target).overwrite().run()
                result = bqtk.query_template(
                    from_=f"select * from `{t_target.fqdn()}` order by f2").run()
                assert result.rows == [{"f1": "test3", "f2": 3}]

                with pytest.raises(Exception):
                    bqtk.query_template(from_="select 'test4' as f1, 4 as f2") \
                        .with_destination(t_target).error_if_exists().run()

def test_insert_rows(self):
    # type: () -> None
    dataset_ref = DatasetReference('my_project', 'my_dataset')
    dataset = Dataset(dataset_ref)
    table1_ref = TableReference(dataset_ref, 'table1')
    schema = [SchemaField(name="a", field_type='INT64'),
              SchemaField(name="b", field_type='FLOAT64'),
              ]
    table = Table(table1_ref, schema)
    self.bq_client.create_dataset(dataset)
    self.bq_client.create_table(table)

    # Insert two rows, check that they landed
    self.assertFalse(self.bq_client.insert_rows(table,
                                                [{'a': 1, 'b': 2.5},
                                                 # Intentionally omit 'b' here.
                                                 {'a': 3}]))
    self.assertRowsExpected(
        self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                             QueryJobConfig()),
        [[1, 2.5],
         [3, None]])
    self.assertRowsExpected(
        self.bq_client.query(
            'SELECT a FROM `my_project.my_dataset.table1` WHERE b is NULL',
            QueryJobConfig()),
        [[3]])

    # Insert two more rows, check that all four rows are now present.
    self.assertFalse(self.bq_client.insert_rows(table,
                                                [{'a': 5, 'b': 6.5},
                                                 {'a': 7, 'b': 8.25}]))
    self.assertRowsExpected(
        self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                             QueryJobConfig()),
        [[1, 2.5],
         [3, None],
         [5, 6.5],
         [7, 8.25]])

def create_mock_tables(cls):
    # type: () -> None
    """Create mock tables"""
    super(MockBQTest, cls).create_mock_tables()
    cls.dates_table_name = cls.client.path('dates', delimiter=BQ_PATH_DELIMITER)
    cls.client.populate_table(
        cls.dates_table_name,
        [SchemaField('foo', 'DATETIME'),
         SchemaField('bar', 'INTEGER'),
         SchemaField('baz', 'INTEGER')],
        [['1987-05-13 00:00:00', 2, 3], ['1950-01-01 00:00:00', 5, 6]],
    )
    cls.str_table_name = cls.client.path('strings', delimiter=BQ_PATH_DELIMITER)
    cls.client.populate_table(
        cls.str_table_name,
        [SchemaField('char1', 'STRING')],
        [['123'], ['456']],
    )
    cls.str_with_single_quotes_table_name = cls.client.path(
        'strings2', delimiter=BQ_PATH_DELIMITER)
    cls.client.populate_table(
        cls.str_with_single_quotes_table_name,
        [SchemaField('description', 'STRING'),
         SchemaField('is_good', 'BOOLEAN')],
        [['Description of something with \'single quotes\'', True]],
    )
    cls.bool_table_name = cls.client.path('booleans', delimiter=BQ_PATH_DELIMITER)
    cls.client.populate_table(
        cls.bool_table_name,
        [SchemaField('str_col', 'STRING'),
         SchemaField('bool_col', 'BOOLEAN')],
        [['yes', True], ['no', False], ['yes2', True]],
    )

def to_schema_field(self, name):
    # type: (str) -> SchemaField
    """Converts this type to a BigQuery SchemaField.

    Args:
        name: The name of the column. This class represents a type; SchemaField
            represents a column, so it includes the type and also the name of
            the column.

    Returns:
        A SchemaField object corresponding to a column containing this class'
        type.
    """
    if isinstance(self.type_, BQScalarType):
        return SchemaField(name=name, field_type=self.type_.value, mode='REPEATED')
    raise NotImplementedError("SchemaField for ARRAY of {} not implemented"
                              .format(self.type_))

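# A short usage sketch (mirroring the array round-trip tests later in this
# section): converting an ARRAY of a scalar type yields a REPEATED field.
#
#   BQArray(BQScalarType.INTEGER).to_schema_field('xs')
#   # -> SchemaField(name='xs', field_type='INTEGER', mode='REPEATED')
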
def test_change_partition_by_range(bqtk: BQTestKit):
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        table_bar_schema = [SchemaField("my_int", "int64")]
        with ds.table("table_bar"). \
                with_schema(from_=table_bar_schema). \
                partition_by(Range(on_field="my_int", start=0, end=10000, interval=10)). \
                isolate() as t:
            show_res = t.show()
            assert show_res is not None
            assert show_res.time_partitioning is None
            expected_range = RangePartitioning(
                field='my_int',
                range_=PartitionRange(end=10000, interval=10, start=0))
            assert show_res.range_partitioning == expected_range

def setUp(self):
    self.bq_client = Client('my_project')
    dataset_ref = DatasetReference('my_project', 'my_dataset')
    schema = [
        SchemaField(name="a", field_type='INT64'),
        SchemaField(name="b", field_type='FLOAT64'),
    ]
    self.source_table = Table(TableReference(dataset_ref, 'source_table'), schema)
    self.destination_table = Table(
        TableReference(dataset_ref, 'destination_table'), schema)
    self.bq_client.create_dataset(Dataset(dataset_ref))
    self.bq_client.create_table(self.source_table)
    # We don't create the destination table here; some tests do not want it created.

    # Stick two rows into source_table
    self.assertFalse(
        self.bq_client.insert_rows(self.source_table,
                                   [{'a': 1, 'b': 2.5},
                                    {'a': 3, 'b': 4.25}]))

def create_mock_tables(cls):
    # type: () -> None
    """Create mock tables"""
    cls.src_table_name = cls.client.path('tmp', delimiter=BQ_PATH_DELIMITER)
    cls.client.populate_table(
        cls.src_table_name,
        FOO_BAR_BAZ_INTEGERS_SCHEMA,
        [[1, 2, 3], [4, 5, 6]],
    )
    cls.long_table_name = cls.client.path('long_table', delimiter=BQ_PATH_DELIMITER)
    cls.client.populate_table(
        cls.long_table_name,
        [SchemaField('foo', 'INTEGER')],
        [[1]] * LONG_TABLE_LENGTH)

def _get_columns_helper(self, columns, cur_columns):
    """
    Recurse into record type and return all the nested field names.
    As contributed by @sumedhsakdeo on issue #17
    """
    results = []
    for col in columns:
        results += [SchemaField(name='.'.join(col.name for col in cur_columns + [col]),
                                field_type=col.field_type,
                                mode=col.mode,
                                description=col.description,
                                fields=col.fields)]
        if col.field_type == 'RECORD':
            cur_columns.append(col)
            results += self._get_columns_helper(col.fields, cur_columns)
            cur_columns.pop()
    return results

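# Illustration of the recursion above on a nested RECORD (a sketch; only the
# generated dotted names are shown):
#
#   addr = SchemaField('address', 'RECORD', fields=(
#       SchemaField('city', 'STRING'),
#       SchemaField('zip', 'STRING'),
#   ))
#   [f.name for f in self._get_columns_helper([addr], [])]
#   # -> ['address', 'address.city', 'address.zip']
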
def __to_bq_schema(schema_df: pd.DataFrame):
    type_mapping = {
        'i': 'INTEGER',
        'b': 'BOOLEAN',
        'f': 'FLOAT',
        'O': 'STRING',
        'S': 'STRING',
        'U': 'STRING',
        'M': 'TIMESTAMP'
    }
    fields = []
    # Series.iteritems() was removed in pandas 2.0; items() behaves identically.
    for column_name, dtype in schema_df.dtypes.items():
        fields.append(
            SchemaField(column_name, type_mapping.get(dtype.kind, 'STRING')))
    return fields

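# Quick illustration of the dtype-kind mapping above (a sketch; assumes pandas is
# imported as pd and the function above is in scope):
#
#   df = pd.DataFrame({'a': [1], 'b': [1.5], 'c': ['x'],
#                      'd': pd.to_datetime(['2020-01-01'])})
#   __to_bq_schema(df)
#   # -> [SchemaField('a', 'INTEGER'), SchemaField('b', 'FLOAT'),
#   #     SchemaField('c', 'STRING'), SchemaField('d', 'TIMESTAMP')]
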
def create_test_table(cls, table_name, schema_file, data_file=None, table_postfix=''):
    # type: (str, str, Optional[str], str) -> None
    """
    This method creates a table to be used in testing

    Args:
        table_name: A string with the name of the table
        schema_file: A string with the name of the schema file from which to
            build the test table
        data_file: A string describing the file with the data to use in testing.
            If None, create an empty table.
        table_postfix: A string to be added to the end of the table_name.
    """
    data = cls._load_csv_with_schema(schema_file, data_file) if data_file else []
    table_path = cls.client.path(table_name + table_postfix)

    with open(schema_file) as f:
        schema_json = json.load(f)
    schema_list = [SchemaField(row['name'], row['type']) for row in schema_json]

    cls.client.populate_table(table_path, schema_list, data)

def get_schema(self, dataset_id, table_name, project_id=None):
    # type: (str, str, Optional[str]) -> List[SchemaField]
    """Returns the schema of a table.

    Note that due to the imperfect mapping of SQLiteTypes to BQ types, these
    schemas won't be perfect. Anything relying heavily on correct schemas
    should use the real BigQuery client.

    Args:
        dataset_id: The dataset to query.
        table_name: The name of the table.
        project_id: The project ID of the table.
    Returns:
        A list of SchemaFields representing the schema.
    """
    # schema rows are in the format (order, name, type, ...)
    standardized_path = self.path(table_name, dataset_id, project_id,
                                  delimiter=MOCK_DELIMITER)
    # 'pragma' is SQLite's equivalent to DESCRIBE TABLE
    pragma_query = 'pragma table_info(\'' + standardized_path + '\')'
    single_row_query = 'SELECT * FROM ' + standardized_path + ' LIMIT 1'

    single_row = self.conn.execute(single_row_query).fetchall()
    schema = self.conn.execute(pragma_query).fetchall()

    returned_schema = []
    for i in range(len(schema)):
        row_name = schema[i][1]
        if len(single_row) > 0:
            row_type = self._db_type_to_bq_type(schema[i][2], sample=single_row[0][i])
        else:
            row_type = self._db_type_to_bq_type(schema[i][2])
        # Repeated fields are not supported in mock BigQuery so we always set
        # the mode to nullable.
        returned_schema.append(SchemaField(row_name, row_type, mode='NULLABLE'))
    return returned_schema

def _parse_schema_resource(info):
    """Parse a resource fragment into a schema field.

    :type info: mapping
    :param info: should contain a "fields" key to be parsed

    :rtype: list of :class:`SchemaField`, or ``tuple``
    :returns: a list of parsed fields, or an empty tuple if no "fields" key
              is present in ``info``.
    """
    if 'fields' not in info:
        return ()

    schema = []
    for r_field in info['fields']:
        name = r_field['name']
        field_type = r_field['type']
        mode = r_field.get('mode', 'NULLABLE')
        description = r_field.get('description')
        sub_fields = _parse_schema_resource(r_field)
        schema.append(SchemaField(name, field_type, mode, description, sub_fields))
    return schema

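# Example input/output (a sketch): a resource fragment with one nested level.
#
#   _parse_schema_resource({'fields': [
#       {'name': 'full_name', 'type': 'STRING', 'mode': 'REQUIRED'},
#       {'name': 'phones', 'type': 'RECORD', 'mode': 'REPEATED',
#        'fields': [{'name': 'number', 'type': 'STRING'}]},
#   ]})
#   # -> [SchemaField('full_name', 'STRING', 'REQUIRED', None, ()),
#   #     SchemaField('phones', 'RECORD', 'REPEATED', None,
#   #                 [SchemaField('number', 'STRING', 'NULLABLE', None, ())])]
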
def _get_schema_for_field(column: str, methods: Sequence[dict]):
    field_type = 'STRING'
    field_mode = 'REQUIRED'
    method = next((method['method'] for method in methods
                   if method['fieldName'] == column), None)

    if SistrixApiClient.ENDPOINT_DOMAIN_VISIBILITYINDEX == method:
        field_type = SqlTypeNames.FLOAT
        field_mode = 'NULLABLE'
    if SistrixApiClient.ENDPOINT_DOMAIN_PAGES == method or \
            SistrixApiClient.ENDPOINT_DOMAIN_KEYWORDCOUNT_SEO == method or \
            SistrixApiClient.ENDPOINT_DOMAIN_KEYWORDCOUNT_SEO_TOP10 == method:
        field_type = SqlTypeNames.INTEGER
        field_mode = 'NULLABLE'
    if 'date' == column:
        field_type = SqlTypeNames.DATE
    if 'daily' == column or 'mobile' == column:
        field_type = SqlTypeNames.BOOLEAN

    return SchemaField(column, field_type, field_mode)

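# Behaviour sketch for the rules above (the methods list shape is inferred from
# the code; the column name 'visibility' is illustrative):
#
#   methods = [{'fieldName': 'visibility',
#               'method': SistrixApiClient.ENDPOINT_DOMAIN_VISIBILITYINDEX}]
#   _get_schema_for_field('visibility', methods)
#   # -> SchemaField('visibility', SqlTypeNames.FLOAT, 'NULLABLE')
#   _get_schema_for_field('date', methods)
#   # -> SchemaField('date', SqlTypeNames.DATE, 'REQUIRED')
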
def test_table_exists_with_name(self):
    dataset_that_exists = 'dataset_{}'.format(self.make_n_digit_random_number(6))
    table_that_exists = 'table_{}'.format(self.make_n_digit_random_number(6))
    table_path_that_exists = self.client.path(table_that_exists, dataset_that_exists)

    self.client.create_dataset_by_name(dataset_that_exists)
    self.addCleanup(lambda: self.client.delete_dataset_by_name(
        dataset_that_exists, delete_all_tables=True))
    self.client.create_tables_from_dict(
        {table_that_exists: [SchemaField('col', 'INT64')]}, dataset_that_exists)

    self.assertTrue(self.client.table_exists_with_name(table_path_that_exists))
    self.assertFalse(self.client.table_exists_with_name(
        self.client.path(table_that_exists, 'dataset_that_does_not_exist')))
    self.assertFalse(self.client.table_exists_with_name(
        self.client.path('table_that_does_not_exist', dataset_that_exists)))

def _prepare_schema(self):
    return [SchemaField(**row) for row in SCHEMA]

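# Note: SchemaField(**row) requires SCHEMA rows keyed by SchemaField's keyword
# arguments, e.g. (an assumed illustration, not the original SCHEMA constant):
#
#   SCHEMA = [{'name': 'full_name', 'field_type': 'STRING', 'mode': 'REQUIRED'},
#             {'name': 'age', 'field_type': 'INTEGER'}]
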
def get_schema_field(item_definition: dict) -> SchemaField:
    schema_field = SchemaField(item_definition['dbName'], item_definition['type'])
    return schema_field

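# Usage sketch (the item_definition keys come from the function above; the
# values are illustrative):
#
#   get_schema_field({'dbName': 'user_id', 'type': 'INTEGER'})
#   # -> SchemaField('user_id', 'INTEGER')
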
def test_create_tables_from_dict_overwrite(self):
    # type: () -> None
    # Create the tables once.
    self.client.create_tables_from_dict(
        {
            'empty_1': [SchemaField('col1', 'INTEGER'),
                        SchemaField('col2', 'STRING')],
            'empty_2': [SchemaField('col1', 'FLOAT'),
                        SchemaField('col2', 'INTEGER')]
        },
        replace_existing_tables=True)

    # Create them again with a different schema. Make sure the changes take,
    # since it should have recreated the tables.
    self.client.create_tables_from_dict(
        {
            'empty_1': [SchemaField('col1_test1', 'INTEGER'),
                        SchemaField('col2_test2', 'STRING')],
            'empty_2': [SchemaField('col1_test1', 'FLOAT'),
                        SchemaField('col2_test2', 'INTEGER')]
        },
        replace_existing_tables=True)

    self.assertEqual([('col1_test1', 'INTEGER', 'NULLABLE'),
                      ('col2_test2', 'STRING', 'NULLABLE')],
                     [(x.name, x.field_type, x.mode)
                      for x in self.client.get_schema(
                          self.default_test_dataset_id, 'empty_1')])
    self.assertEqual([('col1_test1', 'FLOAT', 'NULLABLE'),
                      ('col2_test2', 'INTEGER', 'NULLABLE')],
                     [(x.name, x.field_type, x.mode)
                      for x in self.client.get_schema(
                          self.default_test_dataset_id, 'empty_2')])

    # Try to create one of the tables again; it should raise a RuntimeError.
    with self.assertRaises(RuntimeError):
        self.client.create_tables_from_dict(
            {
                'empty_1': [SchemaField('col1', 'INTEGER'),
                            SchemaField('col2', 'STRING')],
            },
            replace_existing_tables=False)

    # Try to create a table not in the dataset. It should work fine.
    self.client.create_tables_from_dict(
        {
            'empty_3': [SchemaField('col1', 'INTEGER'),
                        SchemaField('col2', 'STRING')],
        },
        replace_existing_tables=False)
    self.assertEqual([('col1', 'INTEGER', 'NULLABLE'),
                      ('col2', 'STRING', 'NULLABLE')],
                     [(x.name, x.field_type, x.mode)
                      for x in self.client.get_schema(
                          self.default_test_dataset_id, 'empty_3')])

@ddt  # required for the @data/@unpack decorators used below
class BqTypesTest(unittest.TestCase):

    @data(
        (BQScalarType.BOOLEAN, SchemaField(name='foo', field_type='BOOLEAN')),
        (BQScalarType.DATE, SchemaField(name='foo', field_type='DATE')),
        (BQScalarType.DATETIME, SchemaField(name='foo', field_type='DATETIME')),
        (BQScalarType.INTEGER, SchemaField(name='foo', field_type='INTEGER')),
        (BQScalarType.FLOAT, SchemaField(name='foo', field_type='FLOAT')),
        (BQScalarType.STRING, SchemaField(name='foo', field_type='STRING')),
        (BQScalarType.TIMESTAMP, SchemaField(name='foo', field_type='TIMESTAMP')),
    )
    @unpack
    def test_convert_between_schema_field_and_bq_type(self, bq_type, schema_field):
        # type: (BQScalarType, SchemaField) -> None
        # Test scalar
        self.assertEqual(BQType.from_schema_field(schema_field), bq_type)
        self.assertEqual(bq_type.to_schema_field('foo'), schema_field)

        # Test array
        schema_array_field = SchemaField(name=schema_field.name,
                                         field_type=schema_field.field_type,
                                         mode='REPEATED')
        bq_array_type = BQArray(bq_type)
        self.assertEqual(BQType.from_schema_field(schema_array_field), bq_array_type)
        self.assertEqual(bq_array_type.to_schema_field('foo'), schema_array_field)

    @data(
        (BQScalarType.BOOLEAN, SchemaField(name='foo', field_type='BOOL')),
        (BQScalarType.INTEGER, SchemaField(name='foo', field_type='INT64')),
        (BQScalarType.FLOAT, SchemaField(name='foo', field_type='FLOAT64')),
    )
    @unpack
    def test_convert_from_standard_schema_field_to_bq_type(self, bq_type, schema_field):
        # type: (BQScalarType, SchemaField) -> None
        self.assertEqual(BQType.from_schema_field(schema_field), bq_type)

    @data(
        (BQScalarType.BOOLEAN, 'BOOL'),
        (BQScalarType.INTEGER, 'int64'),
        (BQScalarType.FLOAT, 'FLOAT64'),
        (BQScalarType.BOOLEAN, 'boolean'),
        (BQScalarType.DATE, 'DATE'),
        (BQScalarType.DATETIME, 'datetime'),
        (BQScalarType.INTEGER, 'INTEGER'),
        (BQScalarType.FLOAT, 'FlOaT'),
        (BQScalarType.STRING, 'STRING'),
        (BQScalarType.TIMESTAMP, 'timeSTAMP'),
    )
    @unpack
    def test_convert_from_string_to_bq_scalar_type(self, bq_type, string):
        # type: (BQScalarType, str) -> None
        self.assertEqual(BQScalarType.from_string(string), bq_type)
        self.assertEqual(BQScalarType.from_string(string.lower()), bq_type)

    @data(
        (BQScalarType.BOOLEAN,),
        (BQScalarType.DATE,),
        (BQScalarType.DATETIME,),
        (BQScalarType.INTEGER,),
        (BQScalarType.FLOAT,),
        (BQScalarType.STRING,),
        (BQScalarType.TIMESTAMP,),
    )
    @unpack
    def test_two_arrays_of_same_type_are_same_object(self, bq_type):
        # type: (BQScalarType) -> None
        # Type objects are immutable, and we need to be able to compare them
        # (an array of ints is an array of ints, but it's not a string or an
        # array of floats). A way to achieve this is to ensure that all types,
        # including arrays, are singletons. So we test that for each scalar
        # type, creating two arrays of it yields the same object.
        a1 = BQArray(bq_type)
        a2 = BQArray(bq_type)
        self.assertIs(a1, a2)

    @data(
        (BQScalarType.BOOLEAN, np.bool_(True), True),
        (BQScalarType.DATE, np.datetime64('2019-01-07'),
         datetime.date(2019, 1, 7)),
        (BQScalarType.DATETIME, np.datetime64('2019-01-07T10:32:05.123456'),
         datetime.datetime(2019, 1, 7, 10, 32, 5, 123456)),
        (BQScalarType.INTEGER, np.float64(35.0), 35),
        (BQScalarType.FLOAT, np.float64(12.34), 12.34),
        (BQScalarType.STRING, np.string_('hello'), 'hello'),
        (BQScalarType.TIMESTAMP, np.datetime64('2019-01-07T10:32:05.123456'),
         datetime.datetime(2019, 1, 7, 10, 32, 5, 123456)),
    )
    @unpack
    def test_convert(self, bq_type, np_object, py_object):
        # type: (BQScalarType, NumPyType, PythonType) -> None
        # First, convert from a NumPy-typed object to a Pandas-typed object.
        # Types are mostly the same except that np.datetime64 becomes
        # pd.Timestamp. We do this by creating a Pandas Series containing the
        # single object, and then converting it to a sequence and extracting
        # its single element.
        pd_object, = pd.Series(np_object)
        self.assertEqual(bq_type.convert(pd_object), py_object)

        # Test that for any type, a NaN converts to None
        self.assertIsNone(bq_type.convert(np.nan))

        # Now test the same conversion for a list (array) of objects.
        # Construct a Series containing a single row which is a list of three objects.
        pd_array_object, = pd.Series([(pd_object,) * 3])
        self.assertEqual(BQArray(bq_type).convert(pd_array_object),
                         (py_object,) * 3)

        # Test that for any Array type, a NaN converts to None
        self.assertIsNone(BQArray(bq_type).convert(np.nan))

    @data(
        (BQScalarType.BOOLEAN, np.bool_),
        (BQScalarType.DATE, 'datetime64[ns]'),
        (BQScalarType.DATETIME, 'datetime64[ns]'),
        (BQScalarType.INTEGER, np.float64),
        (BQScalarType.FLOAT, np.float64),
        (BQScalarType.STRING, np.string_),
        (BQScalarType.TIMESTAMP, 'datetime64[ns]'),
    )
    @unpack
    def test_to_dtype(self, bq_type, np_type):
        # type: (BQScalarType, NumPyType) -> None
        self.assertEqual(bq_type.to_dtype(), np.dtype(np_type))
        # NumPy doesn't know from cell elements that are lists, so it just
        # leaves it as an uninterpreted Python object.
        self.assertEqual(BQArray(bq_type).to_dtype(), np.dtype('object'))

    def test_get_typed_series_as_list(self):
        typed_series = TypedSeries(
            pd.Series([(np.float64(1.5), np.float64(2.5), np.float64(3.0)),
                       (np.float64(2.5), np.float64(3.5), np.float64(4.0))]),
            BQArray(BQScalarType.FLOAT))
        self.assertEqual(typed_series.to_list(),
                         [(1.5, 2.5, 3.0),
                          (2.5, 3.5, 4.0)])

    def test_get_typed_dataframe_schema(self):
        typed_dataframe = TypedDataFrame(pd.DataFrame(columns=['a', 'b']),
                                         [BQScalarType.BOOLEAN,
                                          BQArray(BQScalarType.FLOAT)])
        self.assertEqual(typed_dataframe.to_bq_schema(),
                         [SchemaField(name='a', field_type='BOOLEAN'),
                          SchemaField(name='b', field_type='FLOAT', mode='REPEATED')])

    def test_get_typed_dataframe_as_list_of_lists(self):
        typed_dataframe = TypedDataFrame(
            pd.DataFrame(
                [[np.bool_(True), (np.float64(1.5), np.float64(2.5), np.float64(3.0))],
                 [np.bool_(False), (np.float64(2.5), np.float64(3.5), np.float64(4.0))]],
                columns=['a', 'b']),
            [BQScalarType.BOOLEAN, BQArray(BQScalarType.FLOAT)])
        self.assertEqual(typed_dataframe.to_list_of_lists(),
                         [[True, (1.5, 2.5, 3.0)],
                          [False, (2.5, 3.5, 4.0)]])

    @data(
        dict(names=[], expected_name=None),
        dict(names=['foo'], expected_name='foo'),
        dict(names=['foo', None, 'foo'], expected_name='foo'),
    )
    @unpack
    def test_coerce_names(self, names, expected_name):
        # type: (Sequence[str], Optional[str]) -> None
        self.assertEqual(expected_name, _coerce_names(names))

    @data(
        dict(names=['foo', 'bar']),
    )
    @unpack
    def test_coerce_names_error(self, names):
        # type: (Sequence[str]) -> None
        with self.assertRaisesRegexp(ValueError, 'field names .* do not match'):
            _coerce_names(names)

    @data(
        ([BQScalarType.INTEGER], BQScalarType.INTEGER),
        ([None, BQScalarType.INTEGER], BQScalarType.INTEGER),
        ([BQScalarType.FLOAT], BQScalarType.FLOAT),
        ([BQScalarType.FLOAT, None], BQScalarType.FLOAT),
        ([BQScalarType.FLOAT, BQScalarType.FLOAT], BQScalarType.FLOAT),
        ([BQScalarType.STRING, BQScalarType.STRING], BQScalarType.STRING),
        ([BQScalarType.STRING, None, BQScalarType.STRING], BQScalarType.STRING),
        ([BQScalarType.INTEGER, BQScalarType.FLOAT], BQScalarType.FLOAT),
        ([BQScalarType.STRING, BQScalarType.DATE], BQScalarType.DATE),
        ([BQScalarType.STRING, BQScalarType.TIMESTAMP], BQScalarType.TIMESTAMP),
    )
    @unpack
    def test_implicitly_coerce(self, input_types, expected_supertype):
        # type: (List[BQScalarType], BQScalarType) -> None
        supertype = implicitly_coerce(*input_types)
        self.assertEqual(supertype, expected_supertype)

    @data(
        ([], "No types provided to merge"),
        ([BQScalarType.STRING, BQScalarType.INTEGER],
         "Cannot implicitly coerce the given types:"),
        ([BQScalarType.STRING, BQScalarType.DATE, BQScalarType.TIMESTAMP],
         "Cannot implicitly coerce the given types:"),
    )
    @unpack
    def test_implicitly_coerce_error(self, input_types, error):
        # type: (List[BQScalarType], str) -> None
        with self.assertRaisesRegexp(ValueError, error):
            implicitly_coerce(*input_types)

def test_begin_w_alternate_client(self):
    from google.cloud.bigquery.job import CreateDisposition
    from google.cloud.bigquery.job import LoadJobConfig
    from google.cloud.bigquery.job import SchemaUpdateOption
    from google.cloud.bigquery.job import WriteDisposition
    from google.cloud.bigquery.schema import SchemaField

    PATH = "/projects/%s/jobs" % (self.PROJECT,)
    RESOURCE = self._make_resource(ended=True)
    LOAD_CONFIGURATION = {
        "sourceUris": [self.SOURCE1],
        "destinationTable": {
            "projectId": self.PROJECT,
            "datasetId": self.DS_ID,
            "tableId": self.TABLE_ID,
        },
        "allowJaggedRows": True,
        "allowQuotedNewlines": True,
        "createDisposition": CreateDisposition.CREATE_NEVER,
        "encoding": "ISO-8559-1",
        "fieldDelimiter": "|",
        "ignoreUnknownValues": True,
        "maxBadRecords": 100,
        "nullMarker": r"\N",
        "quote": "'",
        "skipLeadingRows": "1",
        "sourceFormat": "CSV",
        "useAvroLogicalTypes": True,
        "writeDisposition": WriteDisposition.WRITE_TRUNCATE,
        "schema": {
            "fields": [
                {
                    "name": "full_name",
                    "type": "STRING",
                    "mode": "REQUIRED",
                    "description": None,
                },
                {
                    "name": "age",
                    "type": "INTEGER",
                    "mode": "REQUIRED",
                    "description": None,
                },
            ]
        },
        "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
    }
    RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION
    conn1 = _make_connection()
    client1 = _make_client(project=self.PROJECT, connection=conn1)
    conn2 = _make_connection(RESOURCE)
    client2 = _make_client(project=self.PROJECT, connection=conn2)
    full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
    age = SchemaField("age", "INTEGER", mode="REQUIRED")
    config = LoadJobConfig()
    config.schema = [full_name, age]
    job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client1, config)
    config.allow_jagged_rows = True
    config.allow_quoted_newlines = True
    config.create_disposition = CreateDisposition.CREATE_NEVER
    config.encoding = "ISO-8559-1"
    config.field_delimiter = "|"
    config.ignore_unknown_values = True
    config.max_bad_records = 100
    config.null_marker = r"\N"
    config.quote_character = "'"
    config.skip_leading_rows = 1
    config.source_format = "CSV"
    config.use_avro_logical_types = True
    config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION]

    with mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    ) as final_attributes:
        job._begin(client=client2)

    final_attributes.assert_called_with({"path": PATH}, client2, job)

    conn1.api_request.assert_not_called()
    self.assertEqual(len(conn2.api_request.call_args_list), 1)
    req = conn2.api_request.call_args_list[0]
    self.assertEqual(req[1]["method"], "POST")
    self.assertEqual(req[1]["path"], PATH)
    SENT = {
        "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
        "configuration": {"load": LOAD_CONFIGURATION},
    }
    self.maxDiff = None
    self.assertEqual(req[1]["data"], SENT)
    self._verifyResourceProperties(job, RESOURCE)

def _make_field(field_type, mode="NULLABLE", name="testing", fields=()):
    from google.cloud.bigquery.schema import SchemaField

    return SchemaField(name=name, field_type=field_type, mode=mode, fields=fields)

def get_rank_table_schema(src_schema):
    dst_schema_field = SchemaField('rank', 'INTEGER', mode='REQUIRED')
    dst_schema = [dst_schema_field] + src_schema
    return dst_schema

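# Usage sketch: the helper prepends a REQUIRED 'rank' column to an existing schema.
#
#   get_rank_table_schema([SchemaField('name', 'STRING')])
#   # -> [SchemaField('rank', 'INTEGER', mode='REQUIRED'),
#   #     SchemaField('name', 'STRING')]
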
import unittest

from typing import Any, List, Tuple  # noqa: F401

import six
from ddt import data, ddt, unpack
from google.api_core.exceptions import BadRequest, NotFound
from google.cloud.bigquery import Dataset, DatasetReference, Table, TableReference
from google.cloud.bigquery.job import QueryJobConfig
from google.cloud.bigquery.schema import SchemaField

from purplequery.bq_types import PythonType  # noqa: F401
from purplequery.client import _FakeJob  # noqa: F401
from purplequery.client import Client
from six.moves import cStringIO

_TEST_SCHEMA = [
    SchemaField(name="num", field_type='INTEGER'),
    SchemaField(name="ID", field_type='STRING'),
    SchemaField(name="height", field_type='FLOAT'),
    SchemaField(name="likes_chocolate", field_type='BOOLEAN'),
    SchemaField(name="start_date", field_type='DATE'),
    SchemaField(name="mid_date", field_type='DATETIME'),
    SchemaField(name="end_time", field_type='TIMESTAMP'),
    SchemaField(name="xs", field_type='INTEGER', mode='REPEATED'),
]


class ClientTestBase(unittest.TestCase):

    def assertRowsExpected(self, query_job, expected_rows):
        # type: (_FakeJob, List[List[PythonType]]) -> None
        """Assert that query_job has finished and it contains the expected rows."""