def test_bigquery_write(mocker, uploader):
    bq_client = mocker.MagicMock()
    mocker.patch.object(uploader, "_get_bq_client")
    uploader._get_bq_client.return_value = bq_client

    table = mocker.MagicMock()
    bq_client.get_table.return_value = table

    now = datetime.datetime.now().timestamp()

    account_config = AccountConfig("account_id", False, "ga_account_id", "", "")
    destination = Destination(
        "dest1",
        DestinationType.GA_MEASUREMENT_PROTOCOL,
        ["web_property", "view", "c", "list", "d", "buyers_custom_dim"])
    source = Source("orig1", SourceType.BIG_QUERY, ["dt1", "buyers"])
    execution = Execution(account_config, source, destination)

    uploader._do_process(Batch(execution, [{"uuid": "uuid-1"}, {"uuid": "uuid-2"}]), now)

    bq_client.insert_rows.assert_called_once_with(
        table,
        [{"uuid": "uuid-1", "timestamp": now},
         {"uuid": "uuid-2", "timestamp": now}],
        (SchemaField("uuid", "string"), SchemaField("timestamp", "timestamp")))

def test_check_field_type_fail():
    local_schema = [SchemaField("a", "STRING", "REQUIRED")]
    remote_schema = [SchemaField("a", "INTEGER", "REQUIRED")]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "There is difference between SchemaField('a', 'STRING', 'REQUIRED', None, (), None) "
        "and SchemaField('a', 'INTEGER', 'REQUIRED', None, (), None)",
    ]

def column_type(name, schema_property):
    safe_name = safe_column_name(name, quotes=False)
    property_type = schema_property['type']
    property_format = schema_property.get('format', None)
    if 'array' in property_type:
        try:
            items_schema = schema_property['items']
            items_type = bigquery_type(items_schema['type'],
                                       items_schema.get('format', None))
        except KeyError:
            # Arrays without an item schema fall back to a plain string column.
            return SchemaField(safe_name, 'string', 'NULLABLE')
        return SchemaField(safe_name, items_type, 'REPEATED')
    elif 'object' in property_type:
        fields = [
            column_type(col, t)
            for col, t in schema_property.get('properties', {}).items()
        ]
        if fields:
            return SchemaField(safe_name, 'RECORD', 'NULLABLE', fields=fields)
        else:
            return SchemaField(safe_name, 'string', 'NULLABLE')
    else:
        result_type = bigquery_type(property_type, property_format)
        return SchemaField(safe_name, result_type, 'NULLABLE')

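# A hedged usage sketch for column_type above. It assumes the surrounding
# module provides the safe_column_name and bigquery_type helpers that
# column_type calls; the demo properties below are made up for illustration.
def _column_type_demo():
    array_property = {'type': ['null', 'array'], 'items': {'type': ['string']}}
    tags = column_type('tags', array_property)
    assert tags.mode == 'REPEATED'  # array properties become REPEATED fields

    object_property = {'type': ['null', 'object'],
                       'properties': {'city': {'type': ['string']}}}
    address = column_type('address', object_property)
    assert address.field_type == 'RECORD'  # objects become RECORD fields
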
def main(gcp_credentials_filepath: str):
    gcp_credentials, project_id = load_gcp_credentials_and_project_from_file(
        gcp_credentials_filepath)
    bq_client = create_bq_client(gcp_credentials, project_id)
    dataset = get_bq_dataset(bq_client, 'dataset_x_1')

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
    table_schema = [
        SchemaField('field_INT64', 'INT64', 'NULLABLE'),
        SchemaField('field_NUMERIC', 'NUMERIC', 'NULLABLE'),
        SchemaField('field_FLOAT64', 'FLOAT64', 'NULLABLE'),
        SchemaField('field_BOOL', 'BOOL', 'NULLABLE'),
        SchemaField('field_STRING', 'STRING', 'NULLABLE'),
        SchemaField('field_BYTES', 'BYTES', 'NULLABLE'),
        SchemaField('field_DATE', 'DATE', 'NULLABLE'),
        SchemaField('field_DATETIME', 'DATETIME', 'NULLABLE'),
        SchemaField('field_TIME', 'TIME', 'NULLABLE'),
        SchemaField('field_TIMESTAMP', 'TIMESTAMP', 'NULLABLE'),
    ]
    table = create_bq_table(
        bq_client,
        dataset,
        table_id='table_2',
        table_schema=table_schema,
        table_description='some table description')

    print("table fully-qualified ID:", table.full_table_id)
    print("table GRN:", table.path)
    print("table description:", table.description)
    print("table type:", table.table_type)
    print("table size [MiB]:", table.num_bytes / (1024 * 1024))
    print("table size [rows]:", table.num_rows)
    print("table schema fields:")
    for _field in table.schema:
        print("\t", _field)
    print("table API representation:", table.to_api_repr())

def test_to_schema_field(self):
    expected_schema_field1 = SchemaField(name='test',
                                         field_type='STRING',
                                         mode='NULLABLE',
                                         description='test_description')
    schema_field1_1 = BigQuerySchemaField('test', 'STRING', 'NULLABLE',
                                          'test_description')
    actual_schema_field1_1 = BigQuerySchemaField.to_schema_field(schema_field1_1)
    self.assertEqual(expected_schema_field1, actual_schema_field1_1)

    schema_field1_2 = BigQuerySchemaField('test', 'INTEGER', 'REQUIRED',
                                          'foo_bar')
    actual_schema_field1_2 = BigQuerySchemaField.to_schema_field(schema_field1_2)
    self.assertNotEqual(expected_schema_field1, actual_schema_field1_2)

    expected_schema_field2 = SchemaField(name='test',
                                         field_type='RECORD',
                                         mode='NULLABLE',
                                         description='test_description',
                                         fields=(expected_schema_field1,))
    schema_field2_1 = BigQuerySchemaField('test', 'RECORD', 'NULLABLE',
                                          'test_description', (schema_field1_1,))
    actual_schema_field2_1 = BigQuerySchemaField.to_schema_field(schema_field2_1)
    self.assertEqual(expected_schema_field2, actual_schema_field2_1)

    schema_field2_2 = BigQuerySchemaField('test', 'RECORD', 'NULLABLE',
                                          'test_description', (schema_field1_2,))
    actual_schema_field2_2 = BigQuerySchemaField.to_schema_field(schema_field2_2)
    self.assertNotEqual(expected_schema_field2, actual_schema_field2_2)

def build_schema(schema, key_properties=None, add_metadata=True, force_fields=None):
    SCHEMA = []
    force_fields = force_fields or {}  # avoid a mutable default argument
    required_fields = set(key_properties) if key_properties else set()
    if "required" in schema:
        required_fields.update(schema["required"])

    for key, props in schema["properties"].items():
        if key in force_fields:
            SCHEMA.append(
                SchemaField(key, force_fields[key]["type"],
                            force_fields[key].get("mode", "nullable"),
                            force_fields[key].get("description", None), ()))
        elif not props:
            # Skip columns that would end up as empty records.
            continue
        else:
            SCHEMA.append(
                define_schema(props, bigquery_transformed_key(key),
                              required_fields=required_fields))

    if add_metadata:
        for field in METADATA_FIELDS:
            SCHEMA.append(
                SchemaField(field, METADATA_FIELDS[field]["bq_type"],
                            "nullable", None, ()))

    return SCHEMA

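# A hedged sketch of build_schema's force_fields hook: pinning every column
# means the define_schema/bigquery_transformed_key helpers referenced above
# are never reached, so only SchemaField is needed. The schema and forced
# definitions below are made up for illustration.
def _force_fields_demo():
    json_schema = {
        "properties": {"id": {"type": ["string"]},
                       "payload": {"type": ["string"]}},
        "required": ["id"],
    }
    forced = {
        "id": {"type": "STRING", "mode": "REQUIRED"},
        "payload": {"type": "STRING", "description": "raw payload"},
    }
    # Expected: a REQUIRED 'id' field and a nullable, described 'payload' field.
    return build_schema(json_schema, key_properties=["id"],
                        add_metadata=False, force_fields=forced)
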
def build_schema(self) -> Union[List[SchemaField], None]:
    """
    Attempts to build the schema that will be used for table creation.
    Iterates through the dtypes items and generates the appropriate
    SchemaFields.

    Returns:
        The generated schema if successful, otherwise None
    """
    schema = None
    if self.dtypes is not None:
        schema = []
        dtypes = self.dtypes.apply(lambda x: x.name).to_dict()
        for key, value in dtypes.items():
            if value == "string":
                schema.append(SchemaField(key, "STRING"))
            elif value == "bool":
                schema.append(SchemaField(key, "BOOL"))
            elif value in ("Int64", "int64"):
                schema.append(SchemaField(key, "INTEGER"))
            elif value == "float64":
                schema.append(SchemaField(key, "FLOAT"))
            elif value == "datetime64[ns]":
                schema.append(SchemaField(key, "DATETIME"))
    return schema

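# A hedged sketch exercising the dtype-driven build_schema above. It assumes
# build_schema is reachable as a plain function here and that the owning
# object only needs a .dtypes attribute; the _DtypesOnly stand-in below is
# hypothetical and exists only for this demo.
import pandas as pd

class _DtypesOnly:
    def __init__(self, df):
        self.dtypes = df.dtypes

def _build_schema_demo():
    df = pd.DataFrame({'name': pd.Series(['a'], dtype='string'),
                       'count': pd.Series([1], dtype='Int64'),
                       'ratio': pd.Series([0.5], dtype='float64')})
    # Expected: STRING, INTEGER and FLOAT fields, in column order.
    return build_schema(_DtypesOnly(df))
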
def test_check_field_type_unspecified():
    local_schema = [SchemaField("a", "RANDOM_FIELD_TYPE", "REQUIRED")]
    remote_schema = [SchemaField("a", "INTEGER", "REQUIRED")]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "Unspecified field type in SchemaField('a', 'RANDOM_FIELD_TYPE', 'REQUIRED', None, ()) "
        "or SchemaField('a', 'INTEGER', 'REQUIRED', None, ())"
    ]

def _field_to_schema(field: Field) -> SchemaField:
    field_type = _BASIC_TYPES_TO_NAME.get(field.type)
    if field_type:
        return SchemaField(
            name=field.name,
            field_type=field_type,
            description=_parse_field_description(field),
            mode=BigQueryFieldModes.REQUIRED,
        )
    if is_dataclass(field.type):
        return SchemaField(
            name=field.name,
            field_type=BigQueryTypes.STRUCT,
            mode=BigQueryFieldModes.REQUIRED,
            description=_parse_field_description(field),
            fields=_parse_fields(field.type),
        )
    # typing.Optional is the same as typing.Union[SomeType, NoneType]
    if field.type.__origin__ is Union:
        return _parse_optional(field)
    if field.type.__origin__ is list:
        return _parse_list(field)
    raise TypeError(f"Unsupported type: {field.type}.")

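# A hedged sketch of driving _field_to_schema above from a dataclass via
# dataclasses.fields. It assumes the module-level helpers it calls
# (_BASIC_TYPES_TO_NAME, _parse_field_description, BigQueryFieldModes, ...)
# are available; only the Example dataclass below is new.
from dataclasses import dataclass, fields

@dataclass
class Example:
    name: str
    score: float

def _field_to_schema_demo():
    # Expected: two REQUIRED SchemaFields, typed via _BASIC_TYPES_TO_NAME.
    return [_field_to_schema(f) for f in fields(Example)]
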
def main():
    # Preamble - gets some creds and a valid Google BigQuery Client object.
    gbq_creds_path = Path("~/.gbq-key.json").expanduser()
    client = Client.from_service_account_json(gbq_creds_path)

    # Example of df_to_table().
    schema = [
        SchemaField('field1', 'float64', 'REQUIRED'),
        SchemaField('field2', 'float64', 'REQUIRED'),
        SchemaField('field3', 'float64', 'REQUIRED'),
        SchemaField('field4', 'float64', 'REQUIRED')
    ]
    table = client.dataset('my_dataset').table('example_table_1', schema)
    df1 = pd.DataFrame(np.random.rand(10, 4))
    gbq.df_to_table(df1, table)

    # Example of table_to_df().
    dataset = client.dataset('my_dataset')
    table = dataset.table('example_table_1')
    df2 = gbq.table_to_df(table)
    df2.info()

    # Example of query_to_df().
    query = 'select field1, field2 from my_dataset.example_table_1;'
    df3 = gbq.query_to_df(query, client)
    df3.info()

def create_table(self, dataset: str, table_id: str) -> bigquery.Table:
    dataset_ref = self.bq_client.dataset(dataset)
    table_ref = dataset_ref.table(table_id)
    schema = [
        SchemaField("image_id", SqlTypeNames.INTEGER, mode="REQUIRED"),
        SchemaField("filename", SqlTypeNames.STRING, mode="REQUIRED"),
        SchemaField("category_id", SqlTypeNames.INTEGER, mode="REQUIRED"),
        SchemaField("score", SqlTypeNames.FLOAT, mode="REQUIRED"),
        SchemaField(
            "segmentation",
            SqlTypeNames.RECORD,
            mode="REQUIRED",
            fields=[
                SchemaField("size", SqlTypeNames.INTEGER, mode="REPEATED"),
                SchemaField("counts", SqlTypeNames.STRING, mode="REQUIRED"),
            ],
        ),
        SchemaField("bbox", SqlTypeNames.FLOAT, mode="REPEATED"),
        SchemaField("mask_area_fraction", SqlTypeNames.FLOAT, mode="REQUIRED"),
        SchemaField("mask_mean_score", SqlTypeNames.FLOAT, mode="REQUIRED"),
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table = self.bq_client.create_table(table, exists_ok=True)
    return table

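# A hedged sketch of one row matching the schema built in create_table above:
# the RECORD field becomes a nested dict and REPEATED fields become lists.
# The bq_client/table handles are assumed to come from create_table; the
# values themselves are made up.
def _insert_detection_demo(bq_client, table):
    row = {
        "image_id": 1,
        "filename": "img_0001.png",
        "category_id": 3,
        "score": 0.92,
        "segmentation": {"size": [480, 640], "counts": "abc123"},
        "bbox": [10.0, 20.0, 30.0, 40.0],
        "mask_area_fraction": 0.12,
        "mask_mean_score": 0.88,
    }
    errors = bq_client.insert_rows_json(table, [row])
    assert not errors  # insert_rows_json returns a list of per-row errors
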
def test_check_schemas():
    local_schema = [
        SchemaField("a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER")])
    ]
    remote_schema = [
        SchemaField("a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER")])
    ]
    assert list(check_schemas(local_schema, remote_schema)) == []

def test_check_removed_local_nested_columns():
    local_schema = [SchemaField("a", "RECORD", "REQUIRED", fields=[])]
    remote_schema = [
        SchemaField("a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER")])
    ]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "Nested: Removed column SchemaField('b', 'INTEGER', 'NULLABLE', None, (), None)"
    ]

def test_find_new_columns_missing_column_in_record():
    local_schema = [
        SchemaField("a", "RECORD", fields=[SchemaField("b", "INTEGER")])
    ]
    remote_schema = [SchemaField("a", "RECORD", fields=[])]
    assert list(find_new_columns(local_schema, remote_schema)) == [
        SchemaField("b", "INTEGER", "NULLABLE", None, ())
    ]

def test_check_schemas_missing_column_in_record():
    # Renamed from test_find_new_columns_missing_column_in_record: this test
    # exercises check_schemas, and the original name collided with the
    # find_new_columns test above, which would shadow it.
    local_schema = [
        SchemaField("a", "RECORD", fields=[SchemaField("b", "INTEGER")])
    ]
    remote_schema = [SchemaField("a", "RECORD", fields=[])]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "Nested: New column SchemaField('b', 'INTEGER', 'NULLABLE', None, ())"
    ]

def test_data_type(sdc_builder, sdc_executor, gcp, data_type, data, expected_data):
    """
    Create data using the Google BigQuery client, then check that the Google
    BigQuery origin receives it using a wiretap, one data format per run.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name}'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()
    google_bigquery >> wiretap.destination
    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('data', data_type, mode='NULLABLE')
    ]

    dataset_ref = None
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using the Google BigQuery client, create the dataset, table and data inside the table.
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, [(None, data)])
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start the pipeline and verify that the correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == 1, \
            f'Expected 1 record, received {len(wiretap.output_records)}'
        received_data = wiretap.output_records[0].field['data']
        assert received_data == expected_data
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)

def handle_record_type(safe_name, schema_property, mode="NULLABLE"):
    fields = [
        column_schema(col, t)
        for col, t in schema_property.get('properties', {}).items()
    ]
    if fields:
        return SchemaField(safe_name, 'RECORD', mode, fields=fields)
    else:
        return SchemaField(safe_name, 'string', mode)

def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3,
    verifying that each full batch is of size 3 and that 8 records are
    captured in total. The small numbers are used because of the limitations
    described in SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(query=query_str,
                                   max_batch_size_in_records=MAX_BATCH_SIZE)

    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('number', 'STRING', mode='required'),
              SchemaField('value', 'STRING', mode='required')]
    data = [dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS)]
    dataset = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using the Google BigQuery client, create the dataset, table and data inside the table.
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        # Ceiling division is needed to capture all the complete batches along with the last partial one.
        snapshot = sdc_executor.capture_snapshot(
            pipeline, start_pipeline=True,
            batches=math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)).snapshot

        # Assert that the batch size is being respected in each batch (including
        # the last). For 8 records, we'd expect batch sizes of 3, 3, and 2.
        for i, batch in enumerate(snapshot.snapshot_batches, start=1):
            expected_size = (MAX_BATCH_SIZE if i * MAX_BATCH_SIZE <= TOTAL_RECORDS
                             else TOTAL_RECORDS % MAX_BATCH_SIZE)
            assert len(batch.stage_outputs[google_bigquery.instance_name].output) == expected_size
        all_records = [record.field
                       for batch in snapshot.snapshot_batches
                       for record in batch.stage_outputs[google_bigquery.instance_name].output]
        assert all_records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)

def create_bigquery_dataset(ds_name, table_name, data):
    """
    Creates a dataset for the twits and fills it up with twit data, if it
    does not exist yet.

    :param ds_name: dataset name in BigQuery
    :param table_name: table name for twit data within the dataset
    :param data: data to be uploaded
    :return:
    """
    # Define the BigQuery schema
    schema = [
        SchemaField('Sentiment', 'INT64', mode='required'),
        SchemaField('SentimentText', 'STRING', mode='required'),
        SchemaField('language', 'STRING', mode='required'),
        SchemaField('twit_length', 'INT64', mode='required'),
        SchemaField('Training', 'BOOLEAN', mode='required')
    ]
    client = bigquery.Client()

    # Get or create the dataset
    dataset_ref = client.dataset(ds_name)
    dataset = bigquery.Dataset(dataset_ref)
    try:
        dataset = client.get_dataset(dataset)
        print('Dataset present')
    except Exception:
        print('Dataset not found, creating...')
        dataset.location = 'EU'
        dataset = client.create_dataset(dataset)

    # Create the table if it does not exist
    tables = list(client.list_tables(dataset_ref))
    table_ref = dataset_ref.table(table_name)
    table = bigquery.Table(table_ref, schema=schema)
    if table_name in [t.table_id for t in tables]:
        print('Table of twits already created')
        table = client.get_table(table_ref)
    else:
        print('Table twits not found, creating...')
        table = client.create_table(table)

    # Fill in the table data
    print('Filling table twits...')
    rows = [tuple(r[1:]) for r in data.values]

    def _batch(iterable, n=1):
        l = len(iterable)
        for ndx in range(0, l, n):
            yield ndx, iterable[ndx:min(ndx + n, l)]

    # Iterate in batches so as not to exceed the payload size limit
    for batch_n, batch in _batch(rows, n=1000):
        print('Uploading batch {}'.format(batch_n))
        errors = client.insert_rows(table, batch)
        if errors:
            print('There were errors copying batch {}: {}'.format(batch_n, errors))

def test_google_bigquery_destination(sdc_builder, sdc_executor, gcp):
    """
    Send data to Google BigQuery from Dev Raw Data Source and confirm that
    the Google BigQuery destination successfully receives it, using the
    Google BigQuery client. This is achieved by using a deduplicator, which
    ensures that there is only one ingest into Google BigQuery.

    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> google_bigquery
                               record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> google_bigquery
    record_deduplicator >> trash

    pipeline = pipeline_builder.build(title='Google BigQuery Destination').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        logger.info('Starting BigQuery Destination pipeline and waiting for it to produce records ...')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)

        logger.info('Stopping BigQuery Destination pipeline and getting the count of records produced in total ...')
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using the Google BigQuery client.
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        logger.debug('read_data = %s', data_from_bigquery)
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)

def test_object_names_columns(sdc_builder, sdc_executor, gcp, column_name):
    """
    Create data using the Google BigQuery destination with different column
    names and then check that it has been created correctly, using the
    BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    data_to_insert = [(1, 'data')]
    rows_to_insert = [f'id,{column_name}'] + [','.join(str(element) for element in row)
                                              for row in data_to_insert]
    data_to_expect = [(1, 'data')]

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(rows_to_insert))

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    dev_raw_data_source >> google_bigquery
    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField(column_name, 'STRING', mode='NULLABLE')]

    dataset_ref = None
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using the Google BigQuery client, create the dataset and table.
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        # Start the pipeline and verify that the correct rows are written.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        assert data_to_expect == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)

def __create_test_table(self, table_name, dataset_id):
    table_schema = [
        SchemaField("int_data", "INT64"),
        SchemaField("str_data", "STRING")
    ]
    table_reference = TableReference(dataset_id, table_name)
    test_table = Table(table_reference, table_schema)
    test_table.time_partitioning = TimePartitioning("DAY")
    self.__delete_if_exists(test_table)
    self.GCP_BIGQUERY_CLIENT.create_table(test_table)
    return test_table

def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using the Google BigQuery client and then check that the
    Google BigQuery origin receives it, using a snapshot.

    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY full_name'
    google_bigquery.set_attributes(query=query_str)

    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('full_name', 'STRING', mode='required'),
        SchemaField('age', 'INTEGER', mode='required')
    ]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using the Google BigQuery client, create the dataset, table and data inside the table.
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert errors == []

        # Start the pipeline and verify that the correct rows are received using the snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)

        rows_from_snapshot = [(record.value['value'][0]['value'],
                               int(record.value['value'][1]['value']))
                              for record in snapshot[google_bigquery].output]
        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)

def test_check_mode_fail():
    local_schema = [
        SchemaField("a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER")])
    ]
    remote_schema = [
        SchemaField("a", "RECORD", "NULLABLE", fields=[SchemaField("b", "INTEGER")])
    ]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "There is difference between SchemaField('a', 'RECORD', 'REQUIRED', None, "
        "(SchemaField('b', 'INTEGER', 'NULLABLE', None, (), None),), None) and "
        "SchemaField('a', 'RECORD', 'NULLABLE', None, (SchemaField('b', 'INTEGER', "
        "'NULLABLE', None, (), None),), None)",
    ]

def test_sdk_representation_conversion_record() -> None:
    schema_out = [
        SchemaField('att_01', 'INT64', 'REQUIRED', 'Att 1', ()),
        SchemaField('att_02', 'RECORD', 'NULLABLE', 'Att 2',
                    (SchemaField('att_11', 'FLOAT64', 'REQUIRED', None, ()),
                     SchemaField('att_12', 'STRING', 'NULLABLE', None, ())))
    ]
    schema_convert = module.sdk_representation(schema_in_record)
    assert schema_convert == schema_out, "Conversion doesn't work"

def test_check_field_type_fail_nested():
    local_schema = [
        SchemaField("a", "RECORD", "REQUIRED",
                    fields=[SchemaField("b", "STRING", "REQUIRED")])
    ]
    remote_schema = [
        SchemaField("a", "RECORD", "REQUIRED",
                    fields=[SchemaField("b", "INTEGER", "REQUIRED")])
    ]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "Nested: There is difference between SchemaField('b', 'STRING', 'REQUIRED', "
        "None, (), None) and SchemaField('b', 'INTEGER', 'REQUIRED', None, (), None)"
    ]

def upload_to_bq(table_name, dataframe):
    table_id = GCLOUD_PROJECT_ID + '.' + DATASET_BIGQUERY + '.' + table_name
    table_ref = dataset_ref.table(table_name)
    schem = []
    data_df = dataframe.copy()
    data_df['events_loaded'] = False

    # Cast everything except the numeric/boolean columns to strings.
    for col in users_columns:
        if col not in ('ae_total_app_session_length', 'ae_total_app_sessions',
                       'events_loaded'):
            data_df[col] = data_df[col].astype(str)

    # Strip Mixpanel-style markers from the column names; both patterns are
    # literal strings, so disable regex matching.
    data_df.columns = data_df.columns.str.replace('$', '', regex=False)
    data_df.columns = data_df.columns.str.replace('properties.', '', regex=False)

    if if_tbl_exists(bigquery_client, table_ref):
        table = bigquery_client.get_table(table_ref)  # API request
        schem = table.schema
    else:
        for column in data_df:
            data_df[column] = data_df[column].apply(
                lambda x: 'DICT' if isinstance(x, dict) else x)
            data_df[column] = data_df[column].apply(
                lambda x: 'LIST' if isinstance(x, list) else x)
            # If it's a nested field, we drop it.
            if not data_df.loc[data_df[column] == 'DICT'].empty:
                data_df = data_df.drop([column], axis=1)
            elif not data_df.loc[data_df[column] == 'LIST'].empty:
                data_df = data_df.drop([column], axis=1)
            elif data_df[column].dtype == 'int64':
                schem.append(SchemaField(column, 'INTEGER'))
            elif data_df[column].dtype == 'float64':
                schem.append(SchemaField(column, 'FLOAT'))
            elif data_df[column].dtype == 'bool':
                schem.append(SchemaField(column, 'BOOLEAN'))
            elif data_df[column].dtype == 'datetime64[ns]':
                schem.append(SchemaField(column, 'INT64'))
            else:
                schem.append(SchemaField(column, 'STRING'))

    job_config = bigquery.LoadJobConfig(schema=schem)
    job = bigquery_client.load_table_from_dataframe(data_df, table_id,
                                                    job_config=job_config)

def upload_tweets():
    big_query_client = bigquery.Client.from_service_account_json('my-beam-project-b2834963a4ae.json')
    dataset_ref = big_query_client.dataset('Tweets')
    dataset = Dataset(dataset_ref)
    dataset.description = 'This represents tweets of trending topics'
    dataset = big_query_client.create_dataset(dataset)

    SCHEMA = [
        SchemaField('Tweets', 'STRING', mode='NULLABLE'),
    ]
    table_ref = big_query_client.dataset('Tweets').table('tabletweet')

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 0
    load_config.schema = SCHEMA
    load_config.allow_quoted_newlines = True
    load_config.ignore_unknown_values = False
    load_config.max_bad_records = 200

    with open('tweets.csv', 'rb') as readable:
        big_query_client.load_table_from_file(readable, table_ref, job_config=load_config)
    print('tweets file uploaded to big query')

def table_update(client, to_delete):
    """Update a table's metadata."""
    DATASET_NAME = 'table_update_dataset_%d' % (_millis(),)
    TABLE_NAME = 'table_update_table_%d' % (_millis(),)
    dataset = client.dataset(DATASET_NAME)
    dataset.description = ORIGINAL_DESCRIPTION
    dataset.create()
    to_delete.append(dataset)

    table = dataset.table(TABLE_NAME, SCHEMA)
    table.friendly_name = ORIGINAL_FRIENDLY_NAME
    table.description = ORIGINAL_DESCRIPTION
    table.create()
    to_delete.insert(0, table)

    # [START table_update]
    assert table.friendly_name == ORIGINAL_FRIENDLY_NAME
    assert table.description == ORIGINAL_DESCRIPTION

    NEW_SCHEMA = table.schema[:]
    NEW_SCHEMA.append(SchemaField('phone', 'string'))
    table.friendly_name = UPDATED_FRIENDLY_NAME
    table.description = UPDATED_DESCRIPTION
    table.schema = NEW_SCHEMA
    table.update()  # API request

    assert table.friendly_name == UPDATED_FRIENDLY_NAME
    assert table.description == UPDATED_DESCRIPTION
    assert table.schema == NEW_SCHEMA
    # [END table_update]

def setUp(self) -> None:
    self.mock_project_id = "fake-recidiviz-project"
    self.mock_dataset_id = "fake-dataset"
    self.mock_table_id = "test_table"
    self.mock_table_schema = [
        SchemaField("my_column", "STRING", "NULLABLE", None, ())
    ]
    self.mock_export_uri = "gs://fake-export-uri"

    self.mock_dataset = bigquery.dataset.DatasetReference(
        self.mock_project_id, self.mock_dataset_id)
    self.mock_table = self.mock_dataset.table(self.mock_table_id)

    self.project_id_patcher = mock.patch("recidiviz.utils.metadata.project_id")
    self.project_id_patcher.start().return_value = self.mock_project_id

    self.mock_load_job_patcher = mock.patch("google.cloud.bigquery.job.LoadJob")
    self.mock_load_job = self.mock_load_job_patcher.start()
    self.mock_load_job.destination.return_value = self.mock_table

    Table = collections.namedtuple("Table", ["name"])
    self.tables_to_export = [Table("first_table"), Table("second_table")]

    self.mock_bq_refresh_config = mock.Mock()
    self.mock_query_builder = mock.Mock()
    self.mock_bq_refresh_config.get_tables_to_export.return_value = (
        self.tables_to_export)
    self.mock_bq_refresh_config.get_stale_bq_rows_for_excluded_regions_query_builder.return_value = (
        self.mock_query_builder)

    self.mock_bq_client = create_autospec(BigQueryClient)