Example #1
 def _insert_load_job(self,
                      project_id,
                      job_id,
                      table_reference,
                      source_uris,
                      schema=None,
                      write_disposition=None,
                      create_disposition=None):
   reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
   request = bigquery.BigqueryJobsInsertRequest(
       projectId=project_id,
       job=bigquery.Job(
           configuration=bigquery.JobConfiguration(
               load=bigquery.JobConfigurationLoad(
                   sourceUris=source_uris,
                   destinationTable=table_reference,
                   schema=schema,
                   writeDisposition=write_disposition,
                   createDisposition=create_disposition,
                   sourceFormat='NEWLINE_DELIMITED_JSON',
                   autodetect=schema is None,
               )
           ),
           jobReference=reference,
       )
   )
   response = self.client.jobs.Insert(request)
   return response.jobReference
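
For context, a minimal usage sketch follows, assuming the apitools-generated messages used throughout these examples (apache_beam.io.gcp.internal.clients.bigquery); the wrapper object, project, table, and GCS URIs are hypothetical placeholders, not part of the original example.

from apache_beam.io.gcp.internal.clients import bigquery

def start_example_load(wrapper):
    # `wrapper` is a hypothetical object exposing _insert_load_job as above.
    table = bigquery.TableReference(
        projectId='my-project',   # hypothetical project/dataset/table
        datasetId='my_dataset',
        tableId='my_table')
    # Starts a load job over newline-delimited JSON files and returns its
    # JobReference; schema is omitted, so autodetect=True is sent.
    return wrapper._insert_load_job(
        project_id='my-project',
        job_id='load_job_0001',   # caller-chosen, must be unique per project
        table_reference=table,
        source_uris=['gs://my-bucket/data/*.json'],
        write_disposition='WRITE_APPEND',
        create_disposition='CREATE_IF_NEEDED')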
Example #2
  def perform_extract_job(self, destination, job_id, table_reference,
                          destination_format, include_header=True,
                          compression=ExportCompression.NONE):
    """Starts a job to export data from BigQuery.

    Returns:
      bigquery.JobReference with the information about the job that was started.
    """
    job_reference = bigquery.JobReference(jobId=job_id,
                                          projectId=table_reference.projectId)
    request = bigquery.BigqueryJobsInsertRequest(
        projectId=table_reference.projectId,
        job=bigquery.Job(
            configuration=bigquery.JobConfiguration(
                extract=bigquery.JobConfigurationExtract(
                    destinationUris=destination,
                    sourceTable=table_reference,
                    printHeader=include_header,
                    destinationFormat=destination_format,
                    compression=compression,
                )
            ),
            jobReference=job_reference,
        )
    )
    response = self.client.jobs.Insert(request)
    return response.jobReference
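
A hedged sketch of starting an export with the helper above; the wrapper object, bucket, and table are placeholders, and the destination format string follows the BigQuery API values ('CSV', 'NEWLINE_DELIMITED_JSON', 'AVRO').

from apache_beam.io.gcp.internal.clients import bigquery

def start_example_extract(wrapper):
    # `wrapper` is a hypothetical object exposing perform_extract_job as above.
    table = bigquery.TableReference(
        projectId='my-project', datasetId='my_dataset', tableId='my_table')
    # Exports the table to sharded CSV files on GCS and returns the
    # JobReference of the started extract job.
    return wrapper.perform_extract_job(
        destination=['gs://my-bucket/export/part-*.csv'],
        job_id='extract_job_0001',
        table_reference=table,
        destination_format='CSV',
        include_header=True)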
Example #3
    def test_records_traverse_transform_with_mocks(self):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = bigquery_api.Job()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job

        bq_client.jobs.Insert.return_value = result_job

        transform = bqfl.BigQueryBatchFileLoads(
            destination,
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            coder=CustomRowCoder())

        # Need to test this with the DirectRunner to avoid serializing mocks
        with TestPipeline('DirectRunner') as p:
            outputs = p | beam.Create(_ELEMENTS) | transform

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_job = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

            jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> beam.combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())

            # All files exist
            _ = (files | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # One file per destination
            assert_that(files | beam.combiners.Count.Globally(),
                        equal_to([1]),
                        label='CountFiles')

            assert_that(destinations,
                        equal_to([destination]),
                        label='CheckDestinations')

            assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
Example #4
    def _insert_copy_job(self,
                         project_id,
                         job_id,
                         from_table_reference,
                         to_table_reference,
                         create_disposition=None,
                         write_disposition=None):
        reference = bigquery.JobReference()
        reference.jobId = job_id
        reference.projectId = project_id
        request = bigquery.BigqueryJobsInsertRequest(
            projectId=project_id,
            job=bigquery.Job(
                configuration=bigquery.JobConfiguration(
                    copy=bigquery.JobConfigurationTableCopy(
                        destinationTable=to_table_reference,
                        sourceTable=from_table_reference,
                        createDisposition=create_disposition,
                        writeDisposition=write_disposition,
                    )),
                jobReference=reference,
            ))

        logging.info("Inserting job request: %s", request)
        response = self.client.jobs.Insert(request)
        logging.info("Response was %s", response)
        return response.jobReference
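
A short hypothetical call into the copy helper above; the wrapper object, source and destination tables, and dispositions are placeholders.

from apache_beam.io.gcp.internal.clients import bigquery

def copy_example_table(wrapper):
    # `wrapper` is a hypothetical object exposing _insert_copy_job as above.
    source = bigquery.TableReference(
        projectId='my-project', datasetId='staging', tableId='events_tmp')
    target = bigquery.TableReference(
        projectId='my-project', datasetId='warehouse', tableId='events')
    # Copies the staging table over the warehouse table and returns the
    # JobReference of the started copy job.
    return wrapper._insert_copy_job(
        project_id='my-project',
        job_id='copy_job_0001',
        from_table_reference=source,
        to_table_reference=target,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE')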
Example #5
 def test_read_from_table_and_multiple_pages(self):
     client = mock.Mock()
     client.jobs.Insert.return_value = bigquery.Job(
         jobReference=bigquery.JobReference(jobId='somejob'))
     table_rows, schema, expected_rows = self.get_test_rows()
     # Return a pageToken on first call to trigger the code path where
     # query needs to handle multiple pages of results.
     client.jobs.GetQueryResults.side_effect = [
         bigquery.GetQueryResultsResponse(jobComplete=True,
                                          rows=table_rows,
                                          schema=schema,
                                          pageToken='token'),
         bigquery.GetQueryResultsResponse(jobComplete=True,
                                          rows=table_rows,
                                          schema=schema)
     ]
     actual_rows = []
     with beam.io.BigQuerySource(
             'dataset.table',
             use_dataflow_native_source=True).reader(client) as reader:
         for row in reader:
             actual_rows.append(row)
     # We return expected rows for each of the two pages of results so we
     # adjust our expectation below accordingly.
     self.assertEqual(actual_rows, expected_rows * 2)
Example #6
    def test_load_job_id_used(self):
        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'loadJobProject'
        job_reference.jobId = 'job_name1'

        result_job = bigquery_api.Job()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job

        bq_client.jobs.Insert.return_value = result_job

        transform = bqfl.BigQueryBatchFileLoads(
            'project1:dataset1.table1',
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            load_job_project_id='loadJobProject')

        with TestPipeline('DirectRunner') as p:
            outputs = p | beam.Create(_ELEMENTS) | transform
            jobs = (
                outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
                | "GetJobs" >> beam.Map(lambda x: x[1]))

            assert_that(jobs,
                        equal_to([job_reference]),
                        label='CheckJobProjectIds')
Example #7
 def _insert_load_job(self,
                      project_id,
                      job_id,
                      table_reference,
                      source_uris,
                      schema=None,
                      write_disposition=None,
                      create_disposition=None,
                      additional_load_parameters=None):
     additional_load_parameters = additional_load_parameters or {}
     job_schema = None if schema == 'SCHEMA_AUTODETECT' else schema
     reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
     request = bigquery.BigqueryJobsInsertRequest(
         projectId=project_id,
         job=bigquery.Job(
             configuration=bigquery.JobConfiguration(
                 load=bigquery.JobConfigurationLoad(
                     sourceUris=source_uris,
                     destinationTable=table_reference,
                     schema=job_schema,
                     writeDisposition=write_disposition,
                     createDisposition=create_disposition,
                     sourceFormat='NEWLINE_DELIMITED_JSON',
                     autodetect=schema == 'SCHEMA_AUTODETECT',
                     **additional_load_parameters)),
             jobReference=reference,
         ))
     response = self.client.jobs.Insert(request)
     return response.jobReference
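
To make the schema handling above explicit: when the caller passes the sentinel string 'SCHEMA_AUTODETECT', no schema is attached and autodetect is enabled; otherwise the supplied schema is used as given. A small standalone illustration (the field names are hypothetical):

from apache_beam.io.gcp.internal.clients import bigquery

explicit_schema = bigquery.TableSchema(fields=[
    bigquery.TableFieldSchema(name='name', type='STRING', mode='REQUIRED'),
    bigquery.TableFieldSchema(name='score', type='INTEGER', mode='NULLABLE'),
])

for schema in ('SCHEMA_AUTODETECT', explicit_schema):
    # Mirrors the two assignments in _insert_load_job above.
    job_schema = None if schema == 'SCHEMA_AUTODETECT' else schema
    autodetect = schema == 'SCHEMA_AUTODETECT'
    print(job_schema, autodetect)
# Prints: None with autodetect=True, then the explicit TableSchema with False.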
Example #8
 def test_read_from_table(self):
     client = mock.Mock()
     client.jobs.Insert.return_value = bigquery.Job(
         jobReference=bigquery.JobReference(jobId='somejob'))
     table_rows, schema, expected_rows = self.get_test_rows()
     client.jobs.GetQueryResults.return_value = bigquery.GetQueryResultsResponse(
         jobComplete=True, rows=table_rows, schema=schema)
     actual_rows = []
     with beam.io.BigQuerySource('dataset.table').reader(client) as reader:
         for row in reader:
             actual_rows.append(row)
     self.assertEqual(actual_rows, expected_rows)
     self.assertEqual(schema, reader.schema)
Example #9
 def test_read_from_table_as_tablerows(self):
   client = mock.Mock()
   client.jobs.Insert.return_value = bigquery.Job(
       jobReference=bigquery.JobReference(jobId='somejob'))
   table_rows, schema, _ = self.get_test_rows()
   client.jobs.GetQueryResults.return_value = bigquery.GetQueryResultsResponse(
       jobComplete=True, rows=table_rows, schema=schema)
   actual_rows = []
   # We set the coder to TableRowJsonCoder, which is a signal that
   # the caller wants to see the rows as TableRows.
   with beam.io.BigQuerySource(
       'dataset.table', coder=TableRowJsonCoder).reader(client) as reader:
     for row in reader:
       actual_rows.append(row)
   self.assertEqual(actual_rows, table_rows)
   self.assertEqual(schema, reader.schema)
Example #10
 def test_read_from_query_unflatten_records(self):
     client = mock.Mock()
     client.jobs.Insert.return_value = bigquery.Job(
         jobReference=bigquery.JobReference(jobId='somejob'))
     table_rows, schema, expected_rows = self.get_test_rows()
     client.jobs.GetQueryResults.return_value = bigquery.GetQueryResultsResponse(
         jobComplete=True, rows=table_rows, schema=schema)
     actual_rows = []
     with beam.io.BigQuerySource(
             query='query', flatten_results=False).reader(client) as reader:
         for row in reader:
             actual_rows.append(row)
     self.assertEqual(actual_rows, expected_rows)
     self.assertEqual(schema, reader.schema)
     self.assertTrue(reader.use_legacy_sql)
     self.assertFalse(reader.flatten_results)
Example #11
 def test_read_from_table_and_job_complete_retry(self, patched_time_sleep):
   client = mock.Mock()
   client.jobs.Insert.return_value = bigquery.Job(
       jobReference=bigquery.JobReference(jobId='somejob'))
   table_rows, schema, expected_rows = self.get_test_rows()
   # Return jobComplete=False on first call to trigger the code path where
   # query needs to handle waiting a bit.
   client.jobs.GetQueryResults.side_effect = [
       bigquery.GetQueryResultsResponse(jobComplete=False),
       bigquery.GetQueryResultsResponse(
           jobComplete=True, rows=table_rows, schema=schema)
   ]
   actual_rows = []
   with beam.io.BigQuerySource('dataset.table').reader(client) as reader:
     for row in reader:
       actual_rows.append(row)
   self.assertEqual(actual_rows, expected_rows)
Example #12
    def load_table(self, job_id, project_id, table_ref, table_schema, gcs_urls,
                   create_disposition, write_disposition):

        job_ref = bq.JobReference(jobId=job_id, projectId=project_id)
        request = bq.BigqueryJobsInsertRequest(
            projectId=project_id,
            job=bq.Job(
                configuration=bq.JobConfiguration(load=bq.JobConfigurationLoad(
                    createDisposition=create_disposition,
                    destinationTable=table_ref,
                    schema=table_schema,
                    sourceFormat="NEWLINE_DELIMITED_JSON",
                    sourceUris=gcs_urls,
                    writeDisposition=write_disposition)),
                jobReference=job_ref))

        response = self.client.jobs.Insert(request)
        return response.jobReference.jobId
Example #13
  def _start_query_job(self, project_id, query, use_legacy_sql, flatten_results,
                       job_id, dry_run=False):
    reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
    request = bigquery.BigqueryJobsInsertRequest(
        projectId=project_id,
        job=bigquery.Job(
            configuration=bigquery.JobConfiguration(
                dryRun=dry_run,
                query=bigquery.JobConfigurationQuery(
                    query=query,
                    useLegacySql=use_legacy_sql,
                    allowLargeResults=True,
                    destinationTable=self._get_temp_table(project_id),
                    flattenResults=flatten_results)),
            jobReference=reference))

    response = self.client.jobs.Insert(request)
    return response.jobReference.jobId
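
A hedged sketch of starting a query job with the helper above; the wrapper object, project, and query text are placeholders, and the return value is the job id string.

def run_example_query(wrapper):
    # `wrapper` is a hypothetical object exposing _start_query_job as above.
    query = ('SELECT word, word_count '
             'FROM [bigquery-public-data:samples.shakespeare] LIMIT 10')
    return wrapper._start_query_job(
        project_id='my-project',
        query=query,
        use_legacy_sql=True,   # the bracketed table syntax is legacy SQL
        flatten_results=True,
        job_id='query_job_0001')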
Example #14
  def _insert_load_job(self, project_id, job_id, table_reference, source_uris,
                       schema=None):
    reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
    request = bigquery.BigqueryJobsInsertRequest(
        projectId=project_id,
        job=bigquery.Job(
            configuration=bigquery.JobConfiguration(
                load=bigquery.JobConfigurationLoad(
                    sourceUris=source_uris,
                    destinationTable=table_reference,
                    schema=schema,
                )
            ),
            jobReference=reference,
        )
    )

    response = self.client.jobs.Insert(request)
    return response.jobReference.jobId
Example #15
  def get_query_location(self, project_id, query, use_legacy_sql):
    """
    Get the location of tables referenced in a query.

    This method returns the location of the first referenced table in the query
    and depends on the BigQuery service to provide error handling for
    queries that reference tables in multiple locations.
    """
    reference = bigquery.JobReference(jobId=uuid.uuid4().hex,
                                      projectId=project_id)
    request = bigquery.BigqueryJobsInsertRequest(
        projectId=project_id,
        job=bigquery.Job(
            configuration=bigquery.JobConfiguration(
                dryRun=True,
                query=bigquery.JobConfigurationQuery(
                    query=query,
                    useLegacySql=use_legacy_sql,
                )),
            jobReference=reference))

    response = self.client.jobs.Insert(request)

    if response.statistics is None:
      # This behavior is only expected in tests
      logging.warning(
          "Unable to get location, missing response.statistics. Query: %s",
          query)
      return None

    referenced_tables = response.statistics.query.referencedTables
    if referenced_tables:  # Guards against both None and empty lists
      table = referenced_tables[0]
      location = self.get_table_location(
          table.projectId,
          table.datasetId,
          table.tableId)
      logging.info("Using location %r from table %r referenced by query %s",
                   location, table, query)
      return location

    logging.debug("Query %s does not reference any tables.", query)
    return None
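
A brief hypothetical call into get_query_location; the wrapper object, project id, and query are placeholders.

def find_query_location(wrapper):
    # `wrapper` is a hypothetical object exposing get_query_location as above.
    query = 'SELECT name, score FROM `my-project.my_dataset.my_table`'
    location = wrapper.get_query_location(
        project_id='my-project', query=query, use_legacy_sql=False)
    # Returns e.g. 'US' or 'EU', or None when the dry-run response carries
    # no statistics (as noted in the method above).
    return location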
Example #16
  def _start_query_job(self, project_id, query, use_legacy_sql, flatten_results,
                       job_id, dry_run=False, kms_key=None):
    reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
    request = bigquery.BigqueryJobsInsertRequest(
        projectId=project_id,
        job=bigquery.Job(
            configuration=bigquery.JobConfiguration(
                dryRun=dry_run,
                query=bigquery.JobConfigurationQuery(
                    query=query,
                    useLegacySql=use_legacy_sql,
                    allowLargeResults=not dry_run,
                    destinationTable=(
                        None if dry_run else self._get_temp_table(project_id)),
                    flattenResults=flatten_results,
                    destinationEncryptionConfiguration=(
                        bigquery.EncryptionConfiguration(kmsKeyName=kms_key)))),
            jobReference=reference))

    response = self.client.jobs.Insert(request)
    return response
Example #17
    def test_triggering_frequency(self, is_streaming, with_auto_sharding):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = bigquery_api.Job()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job
        bq_client.jobs.Insert.return_value = result_job

        # Insert a fake clock to work with auto-sharding which needs a processing
        # time timer.
        class _FakeClock(object):
            def __init__(self, now=time.time()):
                self._now = now

            def __call__(self):
                return self._now

        start_time = timestamp.Timestamp(0)
        bq_client.test_clock = _FakeClock(now=start_time)

        triggering_frequency = 20 if is_streaming else None
        transform = bqfl.BigQueryBatchFileLoads(
            destination,
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            temp_file_format=bigquery_tools.FileFormat.JSON,
            is_streaming_pipeline=is_streaming,
            triggering_frequency=triggering_frequency,
            with_auto_sharding=with_auto_sharding)

        # Need to test this with the DirectRunner to avoid serializing mocks
        with TestPipeline(
                runner='BundleBasedDirectRunner',
                options=StandardOptions(streaming=is_streaming)) as p:
            if is_streaming:
                _SIZE = len(_ELEMENTS)
                first_batch = [
                    TimestampedValue(value, start_time + i + 1)
                    for i, value in enumerate(_ELEMENTS[:_SIZE // 2])
                ]
                second_batch = [
                    TimestampedValue(value, start_time + _SIZE // 2 + i + 1)
                    for i, value in enumerate(_ELEMENTS[_SIZE // 2:])
                ]
                # Advance processing time between batches of input elements to fire the
                # user triggers. Intentionally advance the processing time twice for the
                # auto-sharding case since we need to first fire the timer and then
                # fire the trigger.
                test_stream = (
                    TestStream().advance_watermark_to(start_time).add_elements(
                        first_batch).advance_processing_time(30).
                    advance_processing_time(30).add_elements(second_batch).
                    advance_processing_time(30).advance_processing_time(
                        30).advance_watermark_to_infinity())
                input = p | test_stream
            else:
                input = p | beam.Create(_ELEMENTS)
            outputs = input | transform

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_job = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())
            jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

            # Check that all files exist.
            _ = (files
                 | beam.Map(
                     lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # Expect two load jobs to be generated in the streaming case due to
            # the triggering frequency. Grouping is per trigger, so we expect two
            # entries in the output as opposed to one.
            file_count = files | combiners.Count.Globally().without_defaults()
            expected_file_count = [1, 1] if is_streaming else [1]
            expected_destinations = [destination, destination
                                     ] if is_streaming else [destination]
            expected_jobs = [job_reference, job_reference
                             ] if is_streaming else [job_reference]
            assert_that(file_count,
                        equal_to(expected_file_count),
                        label='CountFiles')
            assert_that(destinations,
                        equal_to(expected_destinations),
                        label='CheckDestinations')
            assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')