def process(self, element, job_name_prefix=None):
  """Triggers a BigQuery copy job for one (destination, job) pair.

  The source table is looked up in the same dataset as the destination and is
  named after ``job_reference.jobId``.

  Args:
    element: Indexable pair whose first item is the destination table and
      whose second item is a job reference exposing ``jobId``.
    job_name_prefix: Prefix placed at the start of the copy job name.

  Yields:
    ``(destination, job_reference)`` where ``job_reference`` is the value
    returned by ``self.bq_wrapper._insert_copy_job``.
  """
  destination = element[0]
  source_job = element[1]

  def _fill_in_project(reference):
    # Only consult the runtime 'project' value when the reference has none.
    if reference.projectId is None:
      reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')
    return reference

  def _hashed_name(reference):
    return _bq_uuid(
        '%s:%s.%s' %
        (reference.projectId, reference.datasetId, reference.tableId))

  target_table = _fill_in_project(
      bigquery_tools.parse_table_reference(destination))

  # Same project/dataset as the destination, but the table named by the job.
  source_table = bigquery_tools.parse_table_reference(destination)
  source_table.tableId = source_job.jobId
  _fill_in_project(source_table)

  copy_job_name = '%s_copy_%s_to_%s' % (
      job_name_prefix, _hashed_name(source_table), _hashed_name(target_table))

  _LOGGER.info(
      "Triggering copy job from %s to %s", source_table, target_table)
  copy_job_reference = self.bq_wrapper._insert_copy_job(
      target_table.projectId,
      copy_job_name,
      source_table,
      target_table,
      create_disposition=self.create_disposition,
      write_disposition=self.write_disposition)

  yield (destination, copy_job_reference)
def test_value_provider_transform(self):
  """Writes the same rows to two tables via value-provider arguments.

  The first write uses STREAMING_INSERTS with a value-provider schema; the
  second uses FILE_LOADS with schema autodetection and a callable for the
  additional BigQuery parameters. Both tables are checked for the expected
  table properties and contents.
  """
  first_table = '%s%s' % (self.output_table, 1)
  second_table = '%s%s' % (self.output_table, 2)

  schema = {
      'fields': [
          {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'},
      ]
  }
  additional_bq_parameters = {
      'timePartitioning': {'type': 'DAY'},
      'clustering': {'fields': ['language']},
  }

  table_refs = [
      bigquery_tools.parse_table_reference(first_table),
      bigquery_tools.parse_table_reference(second_table),
  ]

  # Table-property matchers come first, then the content matchers.
  pipeline_verifiers = [
      BigQueryTableMatcher(
          project=self.project,
          dataset=ref.datasetId,
          table=ref.tableId,
          expected_properties=additional_bq_parameters)
      for ref in table_refs
  ]
  pipeline_verifiers.extend(
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % table_name,
          data=[(d['name'], d['language'])
                for d in _ELEMENTS
                if 'language' in d])
      for table_name in (first_table, second_table))

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers),
      experiments='use_beam_bq_sink')

  with beam.Pipeline(argv=args) as p:
    rows = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

    streaming_write = beam.io.gcp.bigquery.WriteToBigQuery(
        table=value_provider.StaticValueProvider(
            str, '%s:%s' % (self.project, first_table)),
        schema=value_provider.StaticValueProvider(dict, schema),
        additional_bq_parameters=additional_bq_parameters,
        method='STREAMING_INSERTS')
    _ = rows | "WriteWithMultipleDests" >> streaming_write

    file_loads_write = beam.io.gcp.bigquery.WriteToBigQuery(
        table=value_provider.StaticValueProvider(
            str, '%s:%s' % (self.project, second_table)),
        schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
        additional_bq_parameters=lambda _: additional_bq_parameters,
        method='FILE_LOADS')
    _ = rows | "WriteWithMultipleDests2" >> file_loads_write
def process(self, element, job_name_prefix=None, unused_schema_mod_jobs=None):
  """Triggers a copy job from a load job's table into the destination table.

  The first copy job observed for a destination runs with the configured
  write disposition and is waited on; every later copy job to the same
  destination is forced to ``WRITE_APPEND`` and is not waited on.

  Args:
    element: Indexable pair whose first item is the destination table and
      whose second item is a job reference exposing ``jobId``.
    job_name_prefix: Prefix placed at the start of the copy job name.
    unused_schema_mod_jobs: Not read by this method.

  Yields:
    ``(destination, job_reference)`` where ``job_reference`` is the value
    returned by ``self.bq_wrapper._insert_copy_job``.
  """
  destination = element[0]
  job_reference = element[1]

  copy_to_reference = bigquery_tools.parse_table_reference(destination)
  if copy_to_reference.projectId is None:
    copy_to_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  # The source table shares the destination's project and dataset and is
  # named after the job id.
  copy_from_reference = bigquery_tools.parse_table_reference(destination)
  copy_from_reference.tableId = job_reference.jobId
  if copy_from_reference.projectId is None:
    copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  copy_job_name = '%s_%s' % (
      job_name_prefix,
      _bq_uuid(
          '%s:%s.%s' % (
              copy_from_reference.projectId,
              copy_from_reference.datasetId,
              copy_from_reference.tableId)))

  _LOGGER.info(
      "Triggering copy job from %s to %s",
      copy_from_reference,
      copy_to_reference)

  # Track destinations by their fully-qualified identity. Keying on the bare
  # tableId would treat same-named tables in different datasets or projects
  # as one destination, so the second of them would wrongly be appended to
  # instead of honoring the user's write disposition.
  destination_key = (
      copy_to_reference.projectId,
      copy_to_reference.datasetId,
      copy_to_reference.tableId)

  if destination_key not in self._observed_tables:
    # When the write_disposition for a job is WRITE_TRUNCATE,
    # multiple copy jobs to the same destination can stomp on
    # each other, truncate data, and write to the BQ table over and
    # over.
    # Thus, the first copy job runs with the user's write_disposition,
    # but afterwards, all jobs must always WRITE_APPEND to the table.
    # If they do not, subsequent copy jobs will clear out data appended
    # by previous jobs.
    write_disposition = self.write_disposition
    wait_for_job = True
    self._observed_tables.add(destination_key)
  else:
    wait_for_job = False
    write_disposition = 'WRITE_APPEND'

  if not self.bq_io_metadata:
    self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)

  job_reference = self.bq_wrapper._insert_copy_job(
      copy_to_reference.projectId,
      copy_job_name,
      copy_from_reference,
      copy_to_reference,
      create_disposition=self.create_disposition,
      write_disposition=write_disposition,
      job_labels=self.bq_io_metadata.add_additional_bq_job_labels())

  if wait_for_job:
    # Only the first job per destination is waited on before continuing.
    self.bq_wrapper.wait_for_bq_job(job_reference, sleep_duration_sec=10)

  yield (destination, job_reference)
def process(self, table_reference):
  """Deletes the BigQuery table identified by ``table_reference``.

  Args:
    table_reference: Any value accepted by
      ``bigquery_tools.parse_table_reference``.
  """
  _LOGGER.info("Deleting table %s", table_reference)

  parsed = bigquery_tools.parse_table_reference(table_reference)
  self.bq_wrapper._delete_table(
      parsed.projectId, parsed.datasetId, parsed.tableId)
def process(self,
            element: 'ReadFromBigQueryRequest') -> Iterable[BoundedSource]:
  """Exports the requested table or query result and yields file sources.

  For a query request the query is executed first and the table it produced
  is deleted after all sources have been yielded. For a table request the
  table reference is parsed from ``element.table``.

  Args:
    element: The read request; either ``query`` or ``table`` must be set.

  Yields:
    One source per exported file, built by ``self._create_source``.
  """
  temp_dataset_id = self._get_temp_dataset().datasetId
  bq = bigquery_tools.BigQueryWrapper(temp_dataset_id=temp_dataset_id)

  # TODO(BEAM-11359): Clean up temp dataset at pipeline completion.
  if element.query is None:
    assert element.table
    table_reference = bigquery_tools.parse_table_reference(
        element.table, project=self._get_project())
  else:
    self._setup_temporary_dataset(bq, element)
    table_reference = self._execute_query(bq, element)

  if not table_reference.projectId:
    table_reference.projectId = self._get_project()

  schema, metadata_list = self._export_files(bq, element, table_reference)

  for file_metadata in metadata_list:
    yield self._create_source(file_metadata.path, schema)

  # The table holding query results is only needed until the export is read.
  if element.query is not None:
    bq._delete_table(
        table_reference.projectId,
        table_reference.datasetId,
        table_reference.tableId)
def process(self, table_reference):
  """Deletes the BigQuery table identified by ``table_reference``.

  Args:
    table_reference: Any value accepted by
      ``bigquery_tools.parse_table_reference``.
  """
  logging.info("Deleting table %s", table_reference)

  parsed = bigquery_tools.parse_table_reference(table_reference)
  self.bq_wrapper._delete_table(
      parsed.projectId, parsed.datasetId, parsed.tableId)
def _write_files_with_auto_sharding(self, destination_data_kv_pc, file_prefix_pcv):
  """Batches rows per destination with auto-sharding and writes them to files.

  Args:
    destination_data_kv_pc: Input collection; each element is first mapped
      through ``bigquery_tools.to_hashable_table_ref``.
    file_prefix_pcv: Passed to the ``WriteGroupedRecordsToFile`` ParDo as an
      extra argument, ahead of ``self.schema_side_inputs``.

  Returns:
    The result of ``self._maybe_apply_user_trigger`` on the written files.
  """
  if self.test_client:
    clock = self.test_client.test_clock
  else:
    clock = time.time

  # GroupIntoBatches.WithShardedKey shards, groups and batches the rows in a
  # single step. It relies on keyed state, so the table-reference keys are
  # made hashable beforehand and turned back into table references afterwards.
  hashable_keyed_rows = (
      destination_data_kv_pc
      | 'ToHashableTableRef' >> beam.Map(bigquery_tools.to_hashable_table_ref))

  sharded_batches = (
      hashable_keyed_rows
      | 'WithAutoSharding' >> GroupIntoBatches.WithShardedKey(
          batch_size=_FILE_TRIGGERING_RECORD_COUNT,
          max_buffering_duration_secs=_FILE_TRIGGERING_BATCHING_DURATION_SECS,
          clock=clock))

  batches_by_table_ref = (
      sharded_batches
      | 'FromHashableTableRefAndDropShard' >> beam.Map(
          lambda keyed_batch: (
              bigquery_tools.parse_table_reference(keyed_batch[0].key),
              keyed_batch[1])))

  file_writer = WriteGroupedRecordsToFile(
      schema=self.schema, file_format=self._temp_file_format)
  destination_files_kv_pc = batches_by_table_ref | beam.ParDo(
      file_writer, file_prefix_pcv, *self.schema_side_inputs)

  return self._maybe_apply_user_trigger(destination_files_kv_pc)
def __init__(
    self,
    table,
    dataset=None,
    project=None,
    schema=None,
    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=BigQueryDisposition.WRITE_APPEND,
    batch_size=None,
    test_client=None):
  """Initialize a WriteToBigQuery transform.

  Args:
    table (str): The ID of the table. The ID must contain only letters
      ``a-z``, ``A-Z``, numbers ``0-9``, or underscores ``_``. When the
      dataset argument is :data:`None`, this argument must hold the entire
      table reference, written as ``'DATASET.TABLE'`` or
      ``'PROJECT:DATASET.TABLE'``.
    dataset (str): The ID of the dataset containing this table, or
      :data:`None` if the table argument already holds the full reference.
    project (str): The ID of the project containing this table, or
      :data:`None` if the table argument already holds the full reference.
    schema (str): The schema to use if the BigQuery table has to be created.
      Either a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.bigquery_v2_messages.TableSchema`
      object or a single string of the form
      ``'field1:type1,field2:type2,field3:type3'`` listing comma separated
      fields, where ``'type'`` is the BigQuery type of the field. String
      schemas do not support nested fields, repeated fields, or specifying a
      BigQuery mode for fields (mode will always be set to ``'NULLABLE'``).
    create_disposition (BigQueryDisposition): A string describing what
      happens if the table does not exist. Possible values are:

        * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create if does not
          exist.
        * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write if does
          not exist.
    write_disposition (BigQueryDisposition): A string describing what
      happens if the table has already some data. Possible values are:

        * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
        * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
        * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if table
          not empty.

      For streaming pipelines WriteTruncate can not be used.
    batch_size (int): Number of rows to be written to BQ per streaming API
      insert.
    test_client: Override the default bigquery client used for testing.
  """
  # Parsing and validation run first, in this order, so that a bad argument
  # surfaces before the plain attributes are stored.
  self.table_reference = bigquery_tools.parse_table_reference(
      table, dataset, project)
  self.create_disposition = BigQueryDisposition.validate_create(
      create_disposition)
  self.write_disposition = BigQueryDisposition.validate_write(
      write_disposition)

  self.schema, self.batch_size, self.test_client = (
      schema, batch_size, test_client)
def test_calling_with_partially_qualified_table_ref(self):
  """A 'DATASET.TABLE' string parses into a TableReference with both ids."""
  expected_dataset = 'test_dataset'
  expected_table = 'test_table'

  parsed_ref = parse_table_reference(
      '%s.%s' % (expected_dataset, expected_table))

  self.assertIsInstance(parsed_ref, bigquery.TableReference)
  self.assertEqual(parsed_ref.datasetId, expected_dataset)
  self.assertEqual(parsed_ref.tableId, expected_table)
def test_calling_with_table_reference(self):
  """Parsing a TableReference returns an equal but distinct object."""
  original_ref = bigquery.TableReference()
  for field_name, value in (('projectId', 'test_project'),
                            ('datasetId', 'test_dataset'),
                            ('tableId', 'test_table')):
    setattr(original_ref, field_name, value)

  parsed_ref = parse_table_reference(original_ref)

  self.assertEqual(original_ref, parsed_ref)
  self.assertIsNot(original_ref, parsed_ref)
def process(self, element, load_job_name_prefix, *schema_side_inputs):
  """Triggers a single BigQuery load job for one partition of files.

  Each load job is assumed to have files respecting these constraints:
    1. Total size of all files < 15 TB (max size for load jobs).
    2. Total number of files in a single load job < 10,000.
  Under that assumption there is always exactly one load job per element.

  Args:
    element: Indexable pair of ``(destination, files)``.
    load_job_name_prefix: Prefix placed at the start of the load job name.
    *schema_side_inputs: Extra arguments forwarded to ``self.schema`` when it
      is callable.

  Yields:
    When ``self.temporary_tables`` is set, first a
    ``TriggerLoadJobs.TEMP_TABLES`` tagged output carrying the table
    reference that was loaded into; then always
    ``(destination, job_reference)``.
  """
  destination = element[0]
  files = element[1]

  # The schema may be a callable of the destination, a value provider, or a
  # plain value.
  if callable(self.schema):
    schema = self.schema(destination, *schema_side_inputs)
  elif isinstance(self.schema, vp.ValueProvider):
    schema = self.schema.get()
  else:
    schema = self.schema

  # Same three forms are accepted for the additional load parameters.
  if callable(self.additional_bq_parameters):
    additional_parameters = self.additional_bq_parameters(destination)
  elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
    additional_parameters = self.additional_bq_parameters.get()
  else:
    additional_parameters = self.additional_bq_parameters

  table_reference = bigquery_tools.parse_table_reference(destination)
  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  # The job name combines a hash of the destination with a fresh unique id,
  # so separate load jobs for the same destination do not share a name.
  destination_hash = _bq_uuid(
      '%s:%s.%s' % (
          table_reference.projectId,
          table_reference.datasetId,
          table_reference.tableId))
  uid = _bq_uuid()
  job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid)
  logging.debug(
      'Load job has %s files. Job name is %s.', len(files), job_name)

  if self.temporary_tables:
    # For temporary tables, we create a new table named after the job id.
    table_reference.tableId = job_name
    yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

  # Note the trailing space after the first sentence: the two literals are
  # concatenated into a single message.
  logging.info(
      'Triggering job %s to load data to BigQuery table %s. '
      'Schema: %s. Additional parameters: %s',
      job_name,
      table_reference,
      schema,
      additional_parameters)

  job_reference = self.bq_wrapper.perform_load_job(
      table_reference,
      files,
      job_name,
      schema=schema,
      write_disposition=self.write_disposition,
      create_disposition=self.create_disposition,
      additional_load_parameters=additional_parameters)
  yield (destination, job_reference)
def process(self, element, load_job_name_prefix, *schema_side_inputs):
  """Triggers one BigQuery load job per batch of files for a destination.

  The files are consumed in batches of at most ``_MAXIMUM_SOURCE_URIS`` and a
  separate load job is started for every batch.

  Args:
    element: Indexable pair of ``(destination, files)``; ``files`` must be
      iterable.
    load_job_name_prefix: Prefix placed at the start of every load job name.
    *schema_side_inputs: Extra arguments forwarded to ``self.schema`` when it
      is callable.

  Yields:
    For each batch: when ``self.temporary_tables`` is set, first a
    ``TriggerLoadJobs.TEMP_TABLES`` tagged output carrying the table
    reference that was loaded into; then ``(destination, job_reference)``.
  """
  destination = element[0]
  files = iter(element[1])

  # The schema may be a callable of the destination, a value provider, or a
  # plain value.
  if callable(self.schema):
    schema = self.schema(destination, *schema_side_inputs)
  elif isinstance(self.schema, vp.ValueProvider):
    schema = self.schema.get()
  else:
    schema = self.schema

  # Same three forms are accepted for the additional load parameters.
  if callable(self.additional_bq_parameters):
    additional_parameters = self.additional_bq_parameters(destination)
  elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
    additional_parameters = self.additional_bq_parameters.get()
  else:
    additional_parameters = self.additional_bq_parameters

  batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
  while batch_of_files:
    # A fresh reference per batch: its tableId may be overwritten below and
    # the object is emitted downstream.
    table_reference = bigquery_tools.parse_table_reference(destination)
    if table_reference.projectId is None:
      table_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    destination_hash = _bq_uuid(
        '%s:%s.%s' % (
            table_reference.projectId,
            table_reference.datasetId,
            table_reference.tableId))
    # A second-resolution timestamp is not unique enough here: batches
    # triggered within the same second would share a job name (and, with
    # temporary tables, a table name). Use a fresh unique id per batch.
    uid = _bq_uuid()
    job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid)
    logging.debug(
        'Batch of files has %s files. Job name is %s.',
        len(batch_of_files),
        job_name)

    if self.temporary_tables:
      # For temporary tables, we create a new table named after the job id.
      table_reference.tableId = job_name
      yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

    # Note the trailing space after the first sentence: the two literals are
    # concatenated into a single message.
    logging.info(
        'Triggering job %s to load data to BigQuery table %s. '
        'Schema: %s. Additional parameters: %s',
        job_name,
        table_reference,
        schema,
        additional_parameters)

    job_reference = self.bq_wrapper.perform_load_job(
        table_reference,
        batch_of_files,
        job_name,
        schema=schema,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition,
        additional_load_parameters=additional_parameters)
    yield (destination, job_reference)

    # Prepare to trigger the next job.
    batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
def test_calling_with_all_arguments(self):
  """Table, dataset and project given separately all land in the result."""
  expected = {
      'projectId': 'test_project',
      'datasetId': 'test_dataset',
      'tableId': 'test_table',
  }

  parsed_ref = parse_table_reference(
      expected['tableId'],
      dataset=expected['datasetId'],
      project=expected['projectId'])

  self.assertIsInstance(parsed_ref, bigquery.TableReference)
  self.assertEqual(parsed_ref.projectId, expected['projectId'])
  self.assertEqual(parsed_ref.datasetId, expected['datasetId'])
  self.assertEqual(parsed_ref.tableId, expected['tableId'])
def test_calling_with_hyphened_table_ref(self):
  """A fully qualified reference whose table id contains a hyphen parses."""
  expected_project = 'test_project'
  expected_dataset = 'test_dataset'
  expected_table = 'test-table'

  parsed_ref = parse_table_reference(
      '%s:%s.%s' % (expected_project, expected_dataset, expected_table))

  self.assertIsInstance(parsed_ref, bigquery.TableReference)
  self.assertEqual(parsed_ref.projectId, expected_project)
  self.assertEqual(parsed_ref.datasetId, expected_dataset)
  self.assertEqual(parsed_ref.tableId, expected_table)
def process(self, element, job_name_prefix=None):
  """Triggers a copy job from a temporary table into its destination.

  Nothing is emitted when ``self.temporary_tables`` is falsy, because no
  copy is needed in that case.

  Args:
    element: Indexable pair whose first item is the destination table and
      whose second item is a job reference exposing ``jobId``.
    job_name_prefix: Prefix placed at the start of the copy job name.

  Yields:
    ``(destination, job_reference)`` where ``job_reference`` is the value
    returned by ``self.bq_wrapper._insert_copy_job``.
  """
  destination = element[0]
  source_job = element[1]

  if not self.temporary_tables:
    # Data was not staged in temporary tables, so there is nothing to copy.
    return

  def _fill_in_project(reference):
    # Only consult the runtime 'project' value when the reference has none.
    if reference.projectId is None:
      reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')
    return reference

  def _hashed_name(reference):
    return _bq_uuid(
        '%s:%s.%s' %
        (reference.projectId, reference.datasetId, reference.tableId))

  target_table = _fill_in_project(
      bigquery_tools.parse_table_reference(destination))

  # Same project/dataset as the destination, but the table named by the job.
  source_table = bigquery_tools.parse_table_reference(destination)
  source_table.tableId = source_job.jobId
  _fill_in_project(source_table)

  copy_job_name = '%s_copy_%s_to_%s' % (
      job_name_prefix, _hashed_name(source_table), _hashed_name(target_table))

  logging.info(
      "Triggering copy job from %s to %s", source_table, target_table)
  copy_job_reference = self.bq_wrapper._insert_copy_job(
      target_table.projectId,
      copy_job_name,
      source_table,
      target_table,
      create_disposition=self.create_disposition,
      write_disposition=self.write_disposition)

  yield (destination, copy_job_reference)
def test_perform_load_job_with_source_stream(self):
  """A source stream is passed to jobs.Insert as the ``upload`` keyword."""
  payload = b'some,data'
  client = mock.Mock()
  wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

  wrapper.perform_load_job(
      destination=parse_table_reference('project:dataset.table'),
      job_id='job_id',
      source_stream=io.BytesIO(payload))

  insert_call = client.jobs.Insert
  insert_call.assert_called_once()
  # call_args is an (args, kwargs) pair; the upload is a keyword argument.
  upload = insert_call.call_args[1]["upload"]
  self.assertEqual(payload, upload.stream.read())
def test_records_traverse_transform_with_mocks(self):
  """Runs WriteToBigQuery against a mocked client and checks its outputs.

  Verifies that every written file exists, that exactly one file was
  written, that the only destination is the parsed table reference, and that
  the emitted job is the mocked job reference.
  """
  destination = 'project1:dataset1.table1'

  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'

  inserted_job = bigquery_api.Job()
  inserted_job.jobReference = job_reference

  # jobs.Get reports a finished, error-free job.
  finished_job = mock.Mock()
  finished_job.status.state = 'DONE'
  finished_job.status.errorResult = None
  finished_job.jobReference = job_reference

  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = finished_job
  bq_client.jobs.Insert.return_value = inserted_job

  transform = bigquery.WriteToBigQuery(
      destination, gs_location=self._new_tempdir(), test_client=bq_client)

  # Need to test this with the DirectRunner to avoid serializing mocks
  with TestPipeline('DirectRunner') as p:
    outputs = p | beam.Create(_ELEMENTS) | transform

    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    jobs = dest_job | "GetJobs" >> beam.Map(lambda pair: pair[1])
    files = dest_files | "GetFiles" >> beam.Map(lambda pair: pair[1])
    destinations = (
        dest_files
        | "GetUniques" >> beam.combiners.Count.PerKey()
        | "GetDests" >> beam.Map(lambda pair: pair[0]))

    # All files exist
    _ = files | beam.Map(
        lambda path: hamcrest_assert(os.path.exists(path), is_(True)))

    # One file per destination
    assert_that(
        files | beam.combiners.Count.Globally(),
        equal_to([1]),
        label='CountFiles')

    expected_destinations = [bigquery_tools.parse_table_reference(destination)]
    assert_that(
        destinations, equal_to(expected_destinations), label='CheckDestinations')

    assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
def process(self, element, job_name_prefix=None):
  """Triggers a copy job from a temporary table into its destination.

  Nothing is emitted when ``self.temporary_tables`` is falsy, because no
  copy is needed in that case.

  Args:
    element: Indexable pair whose first item is the destination table and
      whose second item is a job reference exposing ``jobId``.
    job_name_prefix: Prefix placed at the start of the copy job name.

  Yields:
    ``(destination, job_reference)`` where ``job_reference`` is the value
    returned by ``self.bq_wrapper._insert_copy_job``.
  """
  destination = element[0]
  load_job = element[1]

  if not self.temporary_tables:
    # Data was not staged in temporary tables, so there is nothing to copy.
    return

  target_ref = bigquery_tools.parse_table_reference(destination)
  if target_ref.projectId is None:
    target_ref.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  # Same project/dataset as the destination, but the table named by the job.
  source_ref = bigquery_tools.parse_table_reference(destination)
  source_ref.tableId = load_job.jobId
  if source_ref.projectId is None:
    source_ref.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  source_hash = _bq_uuid(
      '%s:%s.%s' %
      (source_ref.projectId, source_ref.datasetId, source_ref.tableId))
  target_hash = _bq_uuid(
      '%s:%s.%s' %
      (target_ref.projectId, target_ref.datasetId, target_ref.tableId))
  copy_job_name = '%s_copy_%s_to_%s' % (
      job_name_prefix, source_hash, target_hash)

  logging.info("Triggering copy job from %s to %s", source_ref, target_ref)
  copy_job_reference = self.bq_wrapper._insert_copy_job(
      target_ref.projectId,
      copy_job_name,
      source_ref,
      target_ref,
      create_disposition=self.create_disposition,
      write_disposition=self.write_disposition)

  yield (destination, copy_job_reference)
def _export_files(
    self,
    bq: bigquery_tools.BigQueryWrapper,
    element: 'ReadFromBigQueryRequest',
    table_reference: TableReference):
  """Runs a BigQuery export job and waits for it to finish.

  The export format is JSON when ``self.use_json_exports`` is set and Avro
  (with logical types enabled) otherwise.

  Args:
    bq: Wrapper used to run the extract job and to fetch the table.
    element: The read request; its ``obj_id`` is part of the job name and of
      the export location, and its ``table`` is parsed when
      ``table_reference`` is a ValueProvider.
    table_reference: The table to export.

  Returns:
    bigquery.TableSchema instance, a list of FileMetadata instances
  """
  job_labels = self._get_bq_metadata().add_additional_bq_job_labels(
      self.bigquery_job_labels)
  export_job_name = bigquery_tools.generate_bq_job_name(
      self._job_name,
      self._source_uuid,
      bigquery_tools.BigQueryJobTypes.EXPORT,
      element.obj_id)
  temp_location = self.options.view_as(GoogleCloudOptions).temp_location
  gcs_location = bigquery_export_destination_uri(
      self.gcs_location,
      temp_location,
      '%s%s' % (self._source_uuid, element.obj_id))

  # Only the file format and the Avro-specific flag differ between the two
  # export flavors.
  if self.use_json_exports:
    export_format = bigquery_tools.FileFormat.JSON
    format_specific_kwargs = {}
  else:
    export_format = bigquery_tools.FileFormat.AVRO
    format_specific_kwargs = {'use_avro_logical_types': True}

  job_ref = bq.perform_extract_job(
      [gcs_location],
      export_job_name,
      table_reference,
      export_format,
      project=self._get_project(),
      job_labels=job_labels,
      include_header=False,
      **format_specific_kwargs)
  bq.wait_for_bq_job(job_ref)

  metadata_list = FileSystems.match([gcs_location])[0].metadata_list

  table_ref = table_reference
  if isinstance(table_reference, ValueProvider):
    table_ref = bigquery_tools.parse_table_reference(
        element.table, project=self._get_project())

  table = bq.get_table(
      table_ref.projectId, table_ref.datasetId, table_ref.tableId)
  return table.schema, metadata_list
def process(self, element, load_job_name_prefix):
  """Triggers one numbered BigQuery load job per batch of files.

  The files are consumed in batches of at most ``_MAXIMUM_SOURCE_URIS``;
  batch ``i`` (starting at 0) gets a job name ending in ``_i``.

  Args:
    element: Indexable pair of ``(destination, files)``; ``files`` must be
      iterable.
    load_job_name_prefix: Prefix placed at the start of every load job name.

  Yields:
    For each batch: when ``self.temporary_tables`` is set, first a
    ``TriggerLoadJobs.TEMP_TABLES`` tagged output carrying the table
    reference that was loaded into; then ``(destination, job_reference)``.
  """
  destination = element[0]
  remaining_files = iter(element[1])

  schema_source = self.schema
  if callable(schema_source):
    schema = schema_source(destination)
  elif isinstance(schema_source, vp.ValueProvider):
    schema = schema_source.get()
  else:
    schema = schema_source

  for job_count in itertools.count():
    batch_of_files = list(
        itertools.islice(remaining_files, _MAXIMUM_SOURCE_URIS))
    if not batch_of_files:
      break

    table_reference = bigquery_tools.parse_table_reference(destination)
    if table_reference.projectId is None:
      table_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    # Load jobs for a single destination are always triggered from the same
    # worker, so a running counter is enough to tell their names apart.
    destination_hash = _bq_uuid(
        '%s:%s.%s' % (
            table_reference.projectId,
            table_reference.datasetId,
            table_reference.tableId))
    job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, job_count)
    logging.debug(
        "Batch of files has %s files. Job name is %s",
        len(batch_of_files),
        job_name)

    if self.temporary_tables:
      # The temporary table is named after the job that loads into it.
      table_reference.tableId = job_name
      yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

    logging.info(
        "Triggering job %s to load data to BigQuery table %s.",
        job_name,
        table_reference)
    job_reference = self.bq_wrapper.perform_load_job(
        table_reference,
        batch_of_files,
        job_name,
        schema=schema,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition)
    yield (destination, job_reference)
def test_perform_load_job_source_mutual_exclusivity(self):
  """perform_load_job needs exactly one of source_uris / source_stream."""
  wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(mock.Mock())

  # Both source_uri and source_stream specified.
  both_sources = dict(
      destination=parse_table_reference('project:dataset.table'),
      job_id='job_id',
      source_uris=['gs://example.com/*'],
      source_stream=io.BytesIO())
  with self.assertRaises(ValueError):
    wrapper.perform_load_job(**both_sources)

  # Neither source_uri nor source_stream specified.
  with self.assertRaises(ValueError):
    wrapper.perform_load_job(destination='P:D.T', job_id='J')
def setUp(self):
  """Creates a fresh dataset for the test and records the output table.

  The dataset id is ``BIG_QUERY_DATASET_ID`` followed by the current epoch
  second and a random integer in ``[0, 10000]``.
  """
  self.test_pipeline = TestPipeline(is_integration_test=True)
  self.runner_name = type(self.test_pipeline.runner).__name__
  self.project = self.test_pipeline.get_option('project')

  epoch_seconds = str(int(time.time()))
  random_suffix = random.randint(0, 10000)
  self.dataset_id = '%s%s%s' % (
      self.BIG_QUERY_DATASET_ID, epoch_seconds, random_suffix)

  self.bigquery_client = bigquery_tools.BigQueryWrapper()
  self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)

  self.output_table = '%s.output_table' % self.dataset_id
  self.table_ref = bigquery_tools.parse_table_reference(self.output_table)

  _LOGGER.info(
      'Created dataset %s in project %s', self.dataset_id, self.project)
def process(self, element, unused_create_fn_output=None):
  """Buffers one row and flushes when a buffer limit is reached.

  Args:
    element: Indexable pair of ``(destination, row)``. When ``destination``
      is itself a tuple it is read as ``(table, schema)`` and the table is
      created if needed before the row is buffered.
    unused_create_fn_output: Not read by this method.

  Returns:
    The result of ``_flush_batch`` when this destination's buffer reached
    ``_max_batch_size``; otherwise the result of ``_flush_all_batches`` when
    the total buffered rows reached ``_max_buffered_rows``; otherwise None.
  """
  destination = element[0]
  if isinstance(destination, tuple):
    destination, schema = destination[0], destination[1]
    self._create_table_if_needed(
        schema, bigquery_tools.parse_table_reference(destination))

  row = element[1]
  destination_rows = self._rows_buffer[destination]
  destination_rows.append(row)
  self._total_buffered_rows += 1

  if len(destination_rows) >= self._max_batch_size:
    return self._flush_batch(destination)
  if self._total_buffered_rows >= self._max_buffered_rows:
    return self._flush_all_batches()
  return None
def _flush_batch(self, destination):
  """Inserts the buffered rows for ``destination`` into BigQuery.

  Rows reported as failed are re-sent, after a backoff sleep, for as long as
  the retry strategy says at least one of the errors should be retried. The
  destination's buffer is dropped afterwards.

  Args:
    destination: Key into ``self._rows_buffer``; also parsed as the table
      reference to insert into.

  Returns:
    A list with one ``BigQueryWriteFn.FAILED_ROWS`` tagged output per row
    that was still failing when retries stopped.
  """
  pending_rows = self._rows_buffer[destination]

  table_reference = bigquery_tools.parse_table_reference(destination)
  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  logging.debug(
      'Flushing data to %s. Total %s rows.', destination, len(pending_rows))

  while True:
    # TODO: Figure out an insertId to make calls idempotent.
    passed, errors = self.bigquery_wrapper.insert_rows(
        project_id=table_reference.projectId,
        dataset_id=table_reference.datasetId,
        table_id=table_reference.tableId,
        rows=pending_rows,
        skip_invalid_rows=True)
    logging.debug("Passed: %s. Errors are %s", passed, errors)

    # Error entries point back into the list that was just sent.
    failed_rows = [pending_rows[entry.index] for entry in errors]
    should_retry = any(
        bigquery_tools.RetryStrategy.should_retry(
            self._retry_strategy, entry.errors[0].reason)
        for entry in errors)

    # Only the failed rows are sent again on the next attempt.
    pending_rows = failed_rows
    if not should_retry:
      break

    retry_backoff = next(self._backoff_calculator)
    logging.info(
        'Sleeping %s seconds before retrying insertion.', retry_backoff)
    time.sleep(retry_backoff)

  # Account for the whole original buffer, not just the rows still failing.
  self._total_buffered_rows -= len(self._rows_buffer[destination])
  del self._rows_buffer[destination]

  failed_outputs = []
  for row in failed_rows:
    windowed = GlobalWindows.windowed_value((destination, row))
    failed_outputs.append(
        pvalue.TaggedOutput(BigQueryWriteFn.FAILED_ROWS, windowed))
  return failed_outputs
def process(self, element, load_job_name_prefix):
  """Triggers one numbered BigQuery load job per batch of files.

  The files are consumed in batches of at most ``_MAXIMUM_SOURCE_URIS``;
  batch ``i`` (starting at 0) gets a job name ending in ``_i``.

  Args:
    element: Indexable pair of ``(destination, files)``; ``files`` must be
      iterable.
    load_job_name_prefix: Prefix placed at the start of every load job name.

  Yields:
    For each batch: when ``self.temporary_tables`` is set, first a
    ``TriggerLoadJobs.TEMP_TABLES`` tagged output carrying the table
    reference that was loaded into; then ``(destination, job_reference)``.
  """
  destination = element[0]
  remaining_files = iter(element[1])

  for job_count in itertools.count():
    batch_of_files = list(
        itertools.islice(remaining_files, _MAXIMUM_SOURCE_URIS))
    if not batch_of_files:
      break

    table_reference = bigquery_tools.parse_table_reference(destination)
    if table_reference.projectId is None:
      table_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    # Load jobs for a single destination are always triggered from the same
    # worker, so a running counter is enough to tell their names apart.
    destination_hash = _bq_uuid(
        '%s:%s.%s' % (
            table_reference.projectId,
            table_reference.datasetId,
            table_reference.tableId))
    job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, job_count)
    logging.debug(
        "Batch of files has %s files. Job name is %s",
        len(batch_of_files),
        job_name)

    if self.temporary_tables:
      # The temporary table is named after the job that loads into it.
      table_reference.tableId = job_name
      yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

    logging.info(
        "Triggering job %s to load data to BigQuery table %s.",
        job_name,
        table_reference)
    job_reference = self.bq_wrapper.perform_load_job(
        table_reference,
        batch_of_files,
        job_name,
        schema=self.schema,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition)
    yield (destination, job_reference)
def _flush_batch(self, destination):
  """Inserts the buffered rows for ``destination`` into BigQuery.

  Rows reported as failed are re-sent, after a backoff sleep, for as long as
  the retry strategy says at least one of the errors should be retried. The
  destination's buffer is dropped afterwards.

  Args:
    destination: Key into ``self._rows_buffer``; also parsed as the table
      reference to insert into.

  Returns:
    A list with one ``BigQueryWriteFn.FAILED_ROWS`` tagged output per row
    that was still failing when retries stopped.
  """
  rows_to_send = self._rows_buffer[destination]

  table_reference = bigquery_tools.parse_table_reference(destination)
  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  logging.debug(
      'Flushing data to %s. Total %s rows.', destination, len(rows_to_send))

  retrying = True
  while retrying:
    # TODO: Figure out an insertId to make calls idempotent.
    passed, errors = self.bigquery_wrapper.insert_rows(
        project_id=table_reference.projectId,
        dataset_id=table_reference.datasetId,
        table_id=table_reference.tableId,
        rows=rows_to_send,
        skip_invalid_rows=True)
    logging.debug("Passed: %s. Errors are %s", passed, errors)

    # Error entries point back into the list that was just sent.
    failed_rows = [rows_to_send[entry.index] for entry in errors]
    retrying = any(
        bigquery_tools.RetryStrategy.should_retry(
            self._retry_strategy, entry.errors[0].reason)
        for entry in errors)

    # Only the failed rows are sent again on the next attempt.
    rows_to_send = failed_rows
    if retrying:
      retry_backoff = next(self._backoff_calculator)
      logging.info(
          'Sleeping %s seconds before retrying insertion.', retry_backoff)
      time.sleep(retry_backoff)

  # Account for the whole original buffer, not just the rows still failing.
  self._total_buffered_rows -= len(self._rows_buffer[destination])
  del self._rows_buffer[destination]

  return [
      pvalue.TaggedOutput(
          BigQueryWriteFn.FAILED_ROWS,
          GlobalWindows.windowed_value((destination, failed_row)))
      for failed_row in failed_rows
  ]
def __init__(
    self,
    get_destination_uri=None,
    table=None,
    dataset=None,
    project=None,
    query=None,
    validate=False,
    coder=None,
    use_standard_sql=False,
    flatten_results=True,
    kms_key=None):
  """Initializes the source from either a table or a query.

  Exactly one of ``table`` and ``query`` must be given.

  Args:
    get_destination_uri: Stored as-is on the instance.
    table: Table to read; parsed together with ``dataset`` and ``project``
      into ``self.table_reference``.
    dataset: Dataset of ``table``, forwarded to the table-reference parser.
    project: Project of ``table``. A string value is wrapped in a
      ``StaticValueProvider`` before being stored as ``self.project``.
    query: Query to read. A string value is wrapped in a
      ``StaticValueProvider`` before being stored as ``self.query``.
    validate: Stored as-is on the instance.
    coder: Coder to use; defaults to ``_JsonToDictCoder`` when falsy.
    use_standard_sql: For query reads, legacy SQL is used unless this is
      true. Table reads always set ``use_legacy_sql`` to True.
    flatten_results: Stored as-is on the instance.
    kms_key: Stored as-is on the instance.

  Raises:
    ValueError: If both or neither of ``table`` and ``query`` are given.
  """
  if table is not None and query is not None:
    raise ValueError(
        'Both a BigQuery table and a query were specified.'
        ' Please specify only one of these.')
  elif table is None and query is None:
    raise ValueError('A BigQuery table or a query must be specified')
  elif table is not None:
    self.table_reference = bigquery_tools.parse_table_reference(
        table, dataset, project)
    self.query = None
    self.use_legacy_sql = True
  else:
    if isinstance(query, (str, unicode)):
      query = StaticValueProvider(str, query)
    self.query = query
    # TODO(BEAM-1082): Change the internal flag to be standard_sql
    self.use_legacy_sql = not use_standard_sql
    self.table_reference = None

  self.get_destination_uri = get_destination_uri
  if isinstance(project, (str, unicode)):
    # Wrap the project itself; wrapping the query here would make
    # self.project resolve to the query text (or None for table reads).
    project = StaticValueProvider(str, project)
  self.project = project
  self.validate = validate
  self.flatten_results = flatten_results
  self.coder = coder or _JsonToDictCoder
  self.kms_key = kms_key
  self.split_result = None
def process(self, element, unused_create_fn_output=None):
  """Buffers one row and flushes when a buffer limit is reached.

  The destination table is created if needed, using the schema resolved
  from ``self.schema`` (a callable of the destination, a value provider, or
  a plain value).

  Args:
    element: Indexable pair of ``(destination, row)``.
    unused_create_fn_output: Not read by this method.

  Returns:
    The result of ``_flush_batch`` when this destination's buffer reached
    ``_max_batch_size``; otherwise the result of ``_flush_all_batches`` when
    the total buffered rows reached ``_max_buffered_rows``; otherwise None.
  """
  destination = element[0]

  schema_source = self.schema
  if callable(schema_source):
    schema = schema_source(destination)
  elif isinstance(schema_source, vp.ValueProvider):
    schema = schema_source.get()
  else:
    schema = schema_source

  table_reference = bigquery_tools.parse_table_reference(destination)
  self._create_table_if_needed(table_reference, schema)

  row = element[1]
  destination_rows = self._rows_buffer[destination]
  destination_rows.append(row)
  self._total_buffered_rows += 1

  if len(destination_rows) >= self._max_batch_size:
    return self._flush_batch(destination)
  if self._total_buffered_rows >= self._max_buffered_rows:
    return self._flush_all_batches()
  return None
def process(self, element, unused_create_fn_output=None):
  """Buffers one row and flushes when a buffer limit is reached.

  The destination table is created if needed, using the schema resolved
  from ``self.schema`` (a callable of the destination, a value provider, or
  a plain value). Rows are buffered under the hashable form of the
  destination.

  Args:
    element: Indexable pair of ``(destination, row)``.
    unused_create_fn_output: Not read by this method.

  Returns:
    The result of ``_flush_batch`` when this destination's buffer reached
    ``_max_batch_size``; otherwise the result of ``_flush_all_batches`` when
    the total buffered rows reached ``_max_buffered_rows``; otherwise None.
  """
  raw_destination = element[0]

  schema_source = self.schema
  if callable(schema_source):
    schema = schema_source(raw_destination)
  elif isinstance(schema_source, vp.ValueProvider):
    schema = schema_source.get()
  else:
    schema = schema_source

  table_reference = bigquery_tools.parse_table_reference(raw_destination)
  self._create_table_if_needed(table_reference, schema)

  # From here on the destination is only used as a buffer key.
  destination = bigquery_tools.get_hashable_destination(raw_destination)

  row = element[1]
  destination_rows = self._rows_buffer[destination]
  destination_rows.append(row)
  self._total_buffered_rows += 1

  if len(destination_rows) >= self._max_batch_size:
    return self._flush_batch(destination)
  if self._total_buffered_rows >= self._max_buffered_rows:
    return self._flush_all_batches()
  return None
def process(self, element, schema_mod_job_name_prefix):
  """Trigger a schema update on the destination table when one is needed.

  Args:
    element: Indexable pair; ``element[0]`` is the destination and
      ``element[1]`` is the job reference of the temp-table load job.
    schema_mod_job_name_prefix: Prefix used to build the name of the schema
      modification job.

  Yields:
    A single ``(destination, schema_update_job_reference)`` tuple when a
    schema modification job was started; nothing otherwise.
  """
  destination = element[0]
  load_job_ref = element[1]

  params_spec = self._additional_bq_parameters
  if callable(params_spec):
    extra_params = params_spec(destination)
  elif isinstance(params_spec, vp.ValueProvider):
    extra_params = params_spec.get()
  else:
    extra_params = params_spec

  # When writing to normal tables WRITE_TRUNCATE will overwrite the schema but
  # when writing to a partition, care needs to be taken to update the schema
  # even on WRITE_TRUNCATE.
  schema_update_requested = (
      self._write_disposition in ('WRITE_TRUNCATE', 'WRITE_APPEND') and
      extra_params and
      extra_params.get("schemaUpdateOptions"))
  if not schema_update_requested:
    # No need to modify schema of destination table
    return

  table_reference = bigquery_tools.parse_table_reference(destination)
  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  # Check if destination table exists
  try:
    destination_table = self._bq_wrapper.get_table(
        project_id=table_reference.projectId,
        dataset_id=table_reference.datasetId,
        table_id=table_reference.tableId)
  except HttpError as exn:
    if exn.status_code != 404:
      raise
    # Destination table does not exist, so no need to modify its schema
    # ahead of the copy jobs.
    return

  load_job = self._bq_wrapper.get_job(
      project=load_job_ref.projectId,
      job_id=load_job_ref.jobId,
      location=load_job_ref.location)
  temp_table_schema = load_job.configuration.load.schema

  schemas_match = bigquery_tools.check_schema_equal(
      temp_table_schema,
      destination_table.schema,
      ignore_descriptions=True,
      ignore_field_order=True)
  if schemas_match:
    # Destination table schema is already the same as the temp table schema,
    # so no need to run a job to update the destination table schema.
    return

  qualified_name = '%s:%s.%s' % (
      table_reference.projectId,
      table_reference.datasetId,
      table_reference.tableId)
  destination_hash = _bq_uuid(qualified_name)
  uid = _bq_uuid()
  job_name = '%s_%s_%s' % (schema_mod_job_name_prefix, destination_hash, uid)

  _LOGGER.debug(
      'Triggering schema modification job %s on %s',
      job_name,
      table_reference)
  # Trigger potential schema modification by loading zero rows into the
  # destination table with the temporary table schema.
  schema_update_job_reference = self._bq_wrapper.perform_load_job(
      destination=table_reference,
      source_stream=io.BytesIO(),  # file with zero rows
      job_id=job_name,
      schema=temp_table_schema,
      write_disposition='WRITE_APPEND',
      create_disposition='CREATE_NEVER',
      additional_load_parameters=extra_params,
      job_labels=self._bq_io_metadata.add_additional_bq_job_labels())
  yield (destination, schema_update_job_reference)
def __init__(
    self,
    table,
    dataset=None,
    project=None,
    schema=None,
    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=BigQueryDisposition.WRITE_APPEND,
    kms_key=None,
    batch_size=None,
    max_file_size=None,
    max_files_per_bundle=None,
    test_client=None,
    gs_location=None,
    method=None,
    insert_retry_strategy=None):
  """Initialize a WriteToBigQuery transform.

  Args:
    table (str, callable): Table ID, or a callable returning one. An ID may
      contain only letters ``a-z``, ``A-Z``, numbers ``0-9`` and underscores
      ``_``. When ``dataset`` is :data:`None`, this must be the complete
      table reference, written ``'DATASET.TABLE'`` or
      ``'PROJECT:DATASET.TABLE'``. A callable receives one argument, the
      element about to be written, and returns a TableReference or a table
      name string in the format above. Multiple destinations are currently
      supported on Batch pipelines only.
    dataset (str): ID of the dataset holding the table, or :data:`None` when
      ``table`` already carries the complete reference.
    project (str): ID of the project holding the table, or :data:`None` when
      ``table`` already carries the complete reference.
    schema (str): Schema to use if the table has to be created. Either a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object or one string of the form
      ``'field1:type1,field2:type2,field3:type3'``, a comma separated list
      of fields where ``'type'`` is the BigQuery type of the field. String
      schemas cannot express nested fields, repeated fields or a field
      mode (the mode is always ``'NULLABLE'``).
    create_disposition (BigQueryDisposition): What to do when the table
      does not exist. Possible values are:

      * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create the table.
      * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write.
    write_disposition (BigQueryDisposition): What to do when the table
      already holds data. Possible values are:

      * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
      * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
      * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if the table
        is not empty.

      WriteTruncate can not be used for streaming pipelines.
    kms_key (str): Experimental. Optional Cloud KMS key name for use when
      creating new tables.
    batch_size (int): Number of rows written to BigQuery per streaming API
      insert. The default is 500.
    max_file_size (int): Maximum size of a file that is written and then
      loaded into BigQuery. The default is 4TB, which is 80% of BigQuery's
      5TB limit for loading any file.
    max_files_per_bundle (int): Maximum number of files a worker writes
      concurrently. The default is 20. Larger values allow writing to
      multiple destinations without resharding, at the price of a higher
      memory burden on the workers.
    test_client: Overrides the default bigquery client; used for testing.
    gs_location (str): GCS location for the files used by file loads into
      BigQuery. Defaults to the pipeline's temp_location; pass a specific
      one when that location is not appropriate for BQ File Loads.
    method: How to write to BigQuery: STREAMING_INSERTS, FILE_LOADS or
      DEFAULT. DEFAULT uses STREAMING_INSERTS on Streaming pipelines and
      FILE_LOADS on Batch pipelines. For an introduction to loading data
      see https://cloud.google.com/bigquery/docs/loading-data.
    insert_retry_strategy: Strategy for retrying streaming inserts into
      BigQuery. The options are the attributes of
      bigquery_tools.RetryStrategy.
  """
  # Values that are parsed or validated, in the order they may raise.
  self.table_reference = bigquery_tools.parse_table_reference(
      table, dataset, project)
  self.create_disposition = BigQueryDisposition.validate_create(
      create_disposition)
  self.write_disposition = BigQueryDisposition.validate_write(
      write_disposition)
  self.schema = WriteToBigQuery.get_dict_table_schema(schema)

  # Values stored exactly as given.
  self.kms_key = kms_key
  self.batch_size = batch_size
  self.max_file_size = max_file_size
  self.max_files_per_bundle = max_files_per_bundle
  self.test_client = test_client
  self.gs_location = gs_location
  self.insert_retry_strategy = insert_retry_strategy

  self.method = method if method else WriteToBigQuery.Method.DEFAULT
def __init__(
    self,
    table,
    dataset=None,
    project=None,
    schema=None,
    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=BigQueryDisposition.WRITE_EMPTY,
    validate=False,
    coder=None,
    kms_key=None):
  """Initialize a BigQuerySink.

  Args:
    table (str): Table ID. It may contain only letters ``a-z``, ``A-Z``,
      numbers ``0-9`` and underscores ``_``. When **dataset** is
      :data:`None`, this must be the complete table reference, written
      ``'DATASET.TABLE'`` or ``'PROJECT:DATASET.TABLE'``.
    dataset (str): ID of the dataset holding the table, or :data:`None` when
      ``table`` already carries the complete reference.
    project (str): ID of the project holding the table, or :data:`None` when
      ``table`` already carries the complete reference.
    schema (str): Schema to use if the table has to be created. Either a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object or one string of the form
      ``'field1:type1,field2:type2,field3:type3'``, a comma separated list
      of fields where ``'type'`` is the BigQuery type of the field. String
      schemas cannot express nested fields, repeated fields or a field
      mode (the mode is always ``'NULLABLE'``).
    create_disposition (BigQueryDisposition): What to do when the table
      does not exist. Possible values are:

      * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create the table.
      * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write.
    write_disposition (BigQueryDisposition): What to do when the table
      already holds data. Possible values are:

      * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
      * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
      * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if the table
        is not empty.
    validate (bool): If :data:`True`, various checks are done when the sink
      is initialized (e.g., is the table present given the disposition
      arguments?). :data:`True` suits most scenarios because errors surface
      at pipeline construction rather than pipeline execution. Use
      :data:`False` when a previous step creates the table during pipeline
      execution.
    coder (~apache_beam.coders.coders.Coder): Coder for the table rows if
      serialized to disk. With :data:`None` the default is
      :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`, which
      treats every element written to the sink as a dictionary that is JSON
      serialized as a line in a file. Only needed in special cases where
      writing table rows as dictionaries is not desirable.
    kms_key (str): Experimental. Optional Cloud KMS key name for use when
      creating new tables.

  Raises:
    ~exceptions.TypeError: if the schema argument is not a :class:`str` or a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object.
    ~exceptions.ValueError: if the table reference as a string does not
      match the expected format.
  """
  # Import here to avoid adding the dependency for local running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apitools.base import py  # pylint: disable=unused-variable
  except ImportError:
    raise ImportError(
        'Google Cloud IO not available, '
        'please install apache_beam[gcp]')

  self.table_reference = bigquery_tools.parse_table_reference(
      table, dataset, project)

  # Transform the table schema into a bigquery.TableSchema instance.
  if isinstance(schema, (str, unicode)):
    # TODO(silviuc): Should add a regex-based validation of the format.
    parsed_schema = bigquery.TableSchema()
    for field_spec in schema.split(','):
      # Each entry is 'name:type'; surrounding spaces are not significant.
      name_part, type_part = field_spec.strip(' ').split(':')
      column = bigquery.TableFieldSchema()
      column.name = name_part
      column.type = type_part
      column.mode = 'NULLABLE'
      parsed_schema.fields.append(column)
    self.table_schema = parsed_schema
  elif schema is None:
    # TODO(silviuc): Should check that table exists if no schema specified.
    self.table_schema = schema
  elif isinstance(schema, bigquery.TableSchema):
    self.table_schema = schema
  else:
    raise TypeError('Unexpected schema argument: %s.' % schema)

  self.create_disposition = BigQueryDisposition.validate_create(
      create_disposition)
  self.write_disposition = BigQueryDisposition.validate_write(
      write_disposition)
  self.validate = validate
  self.coder = coder if coder else bigquery_tools.RowAsDictJsonCoder()
  self.kms_key = kms_key
def _parse_table_reference(table, dataset=None, project=None):
  """Forward to ``bigquery_tools.parse_table_reference`` and return its result.

  Args:
    table: Table specification handed on to the parser.
    dataset: Optional dataset handed on to the parser.
    project: Optional project handed on to the parser.

  Returns:
    Whatever ``bigquery_tools.parse_table_reference`` returns for the given
    arguments.
  """
  parsed_reference = bigquery_tools.parse_table_reference(
      table, dataset, project)
  return parsed_reference
def test_calling_with_callable(self):
  """A callable table spec must be returned by the parser as the same object."""
  table_fn = lambda: 'foo'
  result = parse_table_reference(table_fn)
  self.assertIs(table_fn, result)
def __init__(
    self,
    table,
    dataset=None,
    project=None,
    schema=None,
    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=BigQueryDisposition.WRITE_APPEND,
    kms_key=None,
    batch_size=None,
    max_file_size=None,
    max_files_per_bundle=None,
    test_client=None,
    gs_location=None,
    method=None):
  """Initialize a WriteToBigQuery transform.

  Args:
    table (str, callable): Table ID, or a callable returning one. An ID may
      contain only letters ``a-z``, ``A-Z``, numbers ``0-9`` and underscores
      ``_``. When ``dataset`` is :data:`None`, this must be the complete
      table reference, written ``'DATASET.TABLE'`` or
      ``'PROJECT:DATASET.TABLE'``. A callable receives one argument, the
      element about to be written, and returns a TableReference or a table
      name string in the format above. Multiple destinations are currently
      supported on Batch pipelines only.
    dataset (str): ID of the dataset holding the table, or :data:`None` when
      ``table`` already carries the complete reference.
    project (str): ID of the project holding the table, or :data:`None` when
      ``table`` already carries the complete reference.
    schema (str): Schema to use if the table has to be created. Either a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object or one string of the form
      ``'field1:type1,field2:type2,field3:type3'``, a comma separated list
      of fields where ``'type'`` is the BigQuery type of the field. String
      schemas cannot express nested fields, repeated fields or a field
      mode (the mode is always ``'NULLABLE'``).
    create_disposition (BigQueryDisposition): What to do when the table
      does not exist. Possible values are:

      * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create the table.
      * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write.
    write_disposition (BigQueryDisposition): What to do when the table
      already holds data. Possible values are:

      * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
      * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
      * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if the table
        is not empty.

      WriteTruncate can not be used for streaming pipelines.
    kms_key (str): Experimental. Optional Cloud KMS key name for use when
      creating new tables.
    batch_size (int): Number of rows written to BigQuery per streaming API
      insert. The default is 500.
    max_file_size (int): Maximum size of a file that is written and then
      loaded into BigQuery. The default is 4TB, which is 80% of BigQuery's
      5TB limit for loading any file.
    max_files_per_bundle (int): Maximum number of files a worker writes
      concurrently. The default is 20. Larger values allow writing to
      multiple destinations without resharding, at the price of a higher
      memory burden on the workers.
    test_client: Overrides the default bigquery client; used for testing.
    gs_location (str): GCS location for the files used by file loads into
      BigQuery. Defaults to the pipeline's temp_location; pass a specific
      one when that location is not appropriate for BQ File Loads.
    method: How to write to BigQuery: STREAMING_INSERTS, FILE_LOADS or
      DEFAULT. DEFAULT uses STREAMING_INSERTS on Streaming pipelines and
      FILE_LOADS on Batch pipelines. For an introduction to loading data
      see https://cloud.google.com/bigquery/docs/loading-data.
  """
  # Values that are parsed or validated, in the order they may raise.
  self.table_reference = bigquery_tools.parse_table_reference(
      table, dataset, project)
  self.create_disposition = BigQueryDisposition.validate_create(
      create_disposition)
  self.write_disposition = BigQueryDisposition.validate_write(
      write_disposition)

  # Values stored exactly as given.
  self.schema = schema
  self.kms_key = kms_key
  self.batch_size = batch_size
  self.max_file_size = max_file_size
  self.max_files_per_bundle = max_files_per_bundle
  self.test_client = test_client
  self.gs_location = gs_location

  self.method = method if method else WriteToBigQuery.Method.DEFAULT
def __init__(
    self,
    table,
    dataset=None,
    project=None,
    schema=None,
    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=BigQueryDisposition.WRITE_EMPTY,
    validate=False,
    coder=None,
    kms_key=None):
  """Initialize a BigQuerySink.

  Args:
    table (str): Table ID. It may contain only letters ``a-z``, ``A-Z``,
      numbers ``0-9`` and underscores ``_``. When **dataset** is
      :data:`None`, this must be the complete table reference, written
      ``'DATASET.TABLE'`` or ``'PROJECT:DATASET.TABLE'``.
    dataset (str): ID of the dataset holding the table, or :data:`None` when
      ``table`` already carries the complete reference.
    project (str): ID of the project holding the table, or :data:`None` when
      ``table`` already carries the complete reference.
    schema (str): Schema to use if the table has to be created. Either a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object or one string of the form
      ``'field1:type1,field2:type2,field3:type3'``, a comma separated list
      of fields where ``'type'`` is the BigQuery type of the field. String
      schemas cannot express nested fields, repeated fields or a field
      mode (the mode is always ``'NULLABLE'``).
    create_disposition (BigQueryDisposition): What to do when the table
      does not exist. Possible values are:

      * :attr:`BigQueryDisposition.CREATE_IF_NEEDED`: create the table.
      * :attr:`BigQueryDisposition.CREATE_NEVER`: fail the write.
    write_disposition (BigQueryDisposition): What to do when the table
      already holds data. Possible values are:

      * :attr:`BigQueryDisposition.WRITE_TRUNCATE`: delete existing rows.
      * :attr:`BigQueryDisposition.WRITE_APPEND`: add to existing rows.
      * :attr:`BigQueryDisposition.WRITE_EMPTY`: fail the write if the table
        is not empty.
    validate (bool): If :data:`True`, various checks are done when the sink
      is initialized (e.g., is the table present given the disposition
      arguments?). :data:`True` suits most scenarios because errors surface
      at pipeline construction rather than pipeline execution. Use
      :data:`False` when a previous step creates the table during pipeline
      execution.
    coder (~apache_beam.coders.coders.Coder): Coder for the table rows if
      serialized to disk. With :data:`None` the default is
      :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`, which
      treats every element written to the sink as a dictionary that is JSON
      serialized as a line in a file. Only needed in special cases where
      writing table rows as dictionaries is not desirable.
    kms_key (str): Experimental. Optional Cloud KMS key name for use when
      creating new tables.

  Raises:
    ~exceptions.TypeError: if the schema argument is not a :class:`str` or a
      :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
bigquery_v2_messages.TableSchema` object.
    ~exceptions.ValueError: if the table reference as a string does not
      match the expected format.
  """
  # Import here to avoid adding the dependency for local running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apitools.base import py  # pylint: disable=unused-variable
  except ImportError:
    raise ImportError(
        'Google Cloud IO not available, '
        'please install apache_beam[gcp]')

  self.table_reference = bigquery_tools.parse_table_reference(
      table, dataset, project)

  # Transform the table schema into a bigquery.TableSchema instance.
  if isinstance(schema, (str, unicode)):
    # TODO(silviuc): Should add a regex-based validation of the format.
    parsed_schema = bigquery.TableSchema()
    for field_spec in schema.split(','):
      # Each entry is 'name:type'; surrounding spaces are not significant.
      name_part, type_part = field_spec.strip(' ').split(':')
      column = bigquery.TableFieldSchema()
      column.name = name_part
      column.type = type_part
      column.mode = 'NULLABLE'
      parsed_schema.fields.append(column)
    self.table_schema = parsed_schema
  elif schema is None:
    # TODO(silviuc): Should check that table exists if no schema specified.
    self.table_schema = schema
  elif isinstance(schema, bigquery.TableSchema):
    self.table_schema = schema
  else:
    raise TypeError('Unexpected schema argument: %s.' % schema)

  self.create_disposition = BigQueryDisposition.validate_create(
      create_disposition)
  self.write_disposition = BigQueryDisposition.validate_write(
      write_disposition)
  self.validate = validate
  self.coder = coder if coder else bigquery_tools.RowAsDictJsonCoder()
  self.kms_key = kms_key
def __init__(
    self,
    table=None,
    dataset=None,
    project=None,
    query=None,
    validate=False,
    coder=None,
    use_standard_sql=False,
    flatten_results=True,
    kms_key=None):
  """Initialize a :class:`BigQuerySource`.

  Args:
    table (str): ID of a BigQuery table. When given, all data of the table
      is used as input of this source. The ID may contain only letters
      ``a-z``, ``A-Z``, numbers ``0-9`` and underscores ``_``. When the
      dataset and query arguments are :data:`None`, this must be the
      complete table reference, written ``'DATASET.TABLE'`` or
      ``'PROJECT:DATASET.TABLE'``.
    dataset (str): ID of the dataset holding the table, or :data:`None` when
      ``table`` carries the complete reference or a query is specified.
    project (str): ID of the project holding the table, or :data:`None` when
      ``table`` carries the complete reference or a query is specified.
    query (str): Query to use instead of the table, dataset and project
      arguments.
    validate (bool): If :data:`True`, various checks are done when the
      source is initialized (e.g., is the table present?). :data:`True`
      suits most scenarios because errors surface at pipeline construction
      rather than pipeline execution. Use :data:`False` when a previous
      step creates the table during pipeline execution.
    coder (~apache_beam.coders.coders.Coder): Coder for the table rows if
      serialized to disk. With :data:`None` the default is
      :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`, which
      reads every line in a file as a JSON serialized dictionary. Only
      needed in special cases where returning table rows as dictionaries
      is not desirable.
    use_standard_sql (bool): Whether to use BigQuery's standard SQL dialect
      for the query. Defaults to :data:`False`. With :data:`True` the query
      uses BigQuery's updated SQL dialect with improved standards
      compliance. Ignored for table inputs.
    flatten_results (bool): Flattens all nested and repeated fields in the
      query results. Defaults to :data:`True`.
    kms_key (str): Experimental. Optional Cloud KMS key name for use when
      creating new tables.

  Raises:
    ~exceptions.ValueError: if any of the following is true:

      1) the table reference as a string does not match the expected format
      2) neither a table nor a query is specified
      3) both a table and a query is specified.
  """
  # Import here to avoid adding the dependency for local running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apitools.base import py  # pylint: disable=unused-variable
  except ImportError:
    raise ImportError(
        'Google Cloud IO not available, '
        'please install apache_beam[gcp]')

  # Exactly one of table / query has to be supplied.
  table_given = table is not None
  query_given = query is not None
  if table_given and query_given:
    raise ValueError(
        'Both a BigQuery table and a query were specified.'
        ' Please specify only one of these.')
  if not (table_given or query_given):
    raise ValueError('A BigQuery table or a query must be specified')

  if table_given:
    self.table_reference = bigquery_tools.parse_table_reference(
        table, dataset, project)
    self.query = None
    self.use_legacy_sql = True
  else:
    self.query = query
    # TODO(BEAM-1082): Change the internal flag to be standard_sql
    self.use_legacy_sql = not use_standard_sql
    self.table_reference = None

  self.validate = validate
  self.flatten_results = flatten_results
  self.coder = coder if coder else bigquery_tools.RowAsDictJsonCoder()
  self.kms_key = kms_key
def test_calling_with_value_provider(self):
  """A ValueProvider table spec must come back from the parser unchanged."""
  table_provider = StaticValueProvider(str, 'test_dataset.test_table')
  result = parse_table_reference(table_provider)
  self.assertIs(table_provider, result)
def __init__(
    self,
    table=None,
    dataset=None,
    project=None,
    query=None,
    validate=False,
    coder=None,
    use_standard_sql=False,
    flatten_results=True,
    kms_key=None):
  """Initialize a :class:`BigQuerySource`.

  Args:
    table (str): ID of a BigQuery table. When given, all data of the table
      is used as input of this source. The ID may contain only letters
      ``a-z``, ``A-Z``, numbers ``0-9`` and underscores ``_``. When the
      dataset and query arguments are :data:`None`, this must be the
      complete table reference, written ``'DATASET.TABLE'`` or
      ``'PROJECT:DATASET.TABLE'``.
    dataset (str): ID of the dataset holding the table, or :data:`None` when
      ``table`` carries the complete reference or a query is specified.
    project (str): ID of the project holding the table, or :data:`None` when
      ``table`` carries the complete reference or a query is specified.
    query (str): Query to use instead of the table, dataset and project
      arguments.
    validate (bool): If :data:`True`, various checks are done when the
      source is initialized (e.g., is the table present?). :data:`True`
      suits most scenarios because errors surface at pipeline construction
      rather than pipeline execution. Use :data:`False` when a previous
      step creates the table during pipeline execution.
    coder (~apache_beam.coders.coders.Coder): Coder for the table rows if
      serialized to disk. With :data:`None` the default is
      :class:`~apache_beam.io.gcp.bigquery_tools.RowAsDictJsonCoder`, which
      reads every line in a file as a JSON serialized dictionary. Only
      needed in special cases where returning table rows as dictionaries
      is not desirable.
    use_standard_sql (bool): Whether to use BigQuery's standard SQL dialect
      for the query. Defaults to :data:`False`. With :data:`True` the query
      uses BigQuery's updated SQL dialect with improved standards
      compliance. Ignored for table inputs.
    flatten_results (bool): Flattens all nested and repeated fields in the
      query results. Defaults to :data:`True`.
    kms_key (str): Experimental. Optional Cloud KMS key name for use when
      creating new tables.

  Raises:
    ~exceptions.ValueError: if any of the following is true:

      1) the table reference as a string does not match the expected format
      2) neither a table nor a query is specified
      3) both a table and a query is specified.
  """
  # Import here to avoid adding the dependency for local running scenarios.
  try:
    # pylint: disable=wrong-import-order, wrong-import-position
    from apitools.base import py  # pylint: disable=unused-variable
  except ImportError:
    raise ImportError(
        'Google Cloud IO not available, '
        'please install apache_beam[gcp]')

  # Exactly one of table / query has to be supplied.
  table_given = table is not None
  query_given = query is not None
  if table_given and query_given:
    raise ValueError(
        'Both a BigQuery table and a query were specified.'
        ' Please specify only one of these.')
  if not (table_given or query_given):
    raise ValueError('A BigQuery table or a query must be specified')

  if table_given:
    self.table_reference = bigquery_tools.parse_table_reference(
        table, dataset, project)
    self.query = None
    self.use_legacy_sql = True
  else:
    self.query = query
    # TODO(BEAM-1082): Change the internal flag to be standard_sql
    self.use_legacy_sql = not use_standard_sql
    self.table_reference = None

  self.validate = validate
  self.flatten_results = flatten_results
  self.coder = coder if coder else bigquery_tools.RowAsDictJsonCoder()
  self.kms_key = kms_key