Example #1
    def expand(self, pcoll):
        p = pcoll.pipeline

        if (isinstance(self.table_reference, bigquery.TableReference)
                and self.table_reference.projectId is None):
            self.table_reference.projectId = pcoll.pipeline.options.view_as(
                GoogleCloudOptions).project

        method_to_use = self._compute_method(p, p.options)

        if method_to_use == WriteToBigQuery.Method.STREAMING_INSERTS:
            # TODO: Support load jobs for streaming pipelines.
            bigquery_write_fn = BigQueryWriteFn(
                schema=self.schema,
                batch_size=self.batch_size,
                create_disposition=self.create_disposition,
                write_disposition=self.write_disposition,
                kms_key=self.kms_key,
                retry_strategy=self.insert_retry_strategy,
                test_client=self.test_client)

            outputs = (
                pcoll
                | 'AppendDestination' >> beam.ParDo(
                    bigquery_tools.AppendDestinationsFn(self.table_reference))
                | 'StreamInsertRows' >> ParDo(bigquery_write_fn).with_outputs(
                    BigQueryWriteFn.FAILED_ROWS, main='main'))

            return {
                BigQueryWriteFn.FAILED_ROWS:
                outputs[BigQueryWriteFn.FAILED_ROWS]
            }
        else:
            if p.options.view_as(StandardOptions).streaming:
                raise NotImplementedError(
                    'File Loads to BigQuery are only supported on Batch pipelines.'
                )

            from apache_beam.io.gcp import bigquery_file_loads
            return (pcoll
                    | bigquery_file_loads.BigQueryBatchFileLoads(
                        destination=self.table_reference,
                        schema=self.schema,
                        create_disposition=self.create_disposition,
                        write_disposition=self.write_disposition,
                        max_file_size=self.max_file_size,
                        max_files_per_bundle=self.max_files_per_bundle,
                        custom_gcs_temp_location=self.custom_gcs_temp_location,
                        test_client=self.test_client,
                        validate=self._validate))
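
For context, a minimal sketch (not part of the example) of how this expand is typically reached through the public WriteToBigQuery transform; the project, dataset, table and schema below are placeholder assumptions.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

with beam.Pipeline(options=PipelineOptions()) as p:
    _ = (
        p
        | beam.Create([{'name': 'a', 'count': 1}, {'name': 'b', 'count': 2}])
        | beam.io.WriteToBigQuery(
            'my-project:my_dataset.my_table',  # placeholder table spec
            schema='name:STRING,count:INTEGER',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            # STREAMING_INSERTS takes the BigQueryWriteFn branch above;
            # FILE_LOADS takes the BigQueryBatchFileLoads branch.
            method=beam.io.WriteToBigQuery.Method.STREAMING_INSERTS))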
Example #2
    def expand(self, pcoll):
        p = pcoll.pipeline

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location

        empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
        singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

        load_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            singleton_pc
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination),
                *self.table_side_inputs))

        all_destination_file_pairs_pc = self._write_files(
            destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        partitions = (
            grouped_files_pc
            | beam.ParDo(
                PartitionFiles(self.max_partition_size,
                               self.max_files_per_partition)).with_outputs(
                                   PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                                   PartitionFiles.SINGLE_PARTITION_TAG))

        multiple_partitions_per_destination_pc = partitions[
            PartitionFiles.MULTIPLE_PARTITIONS_TAG]
        single_partition_per_destination_pc = partitions[
            PartitionFiles.SINGLE_PARTITION_TAG]

        # When using dynamic destinations, both the single-partition and
        # multiple-partition outputs are loaded into BigQuery through temporary
        # tables to ensure atomicity.
        if self.dynamic_destinations:
            all_partitions = ((multiple_partitions_per_destination_pc,
                               single_partition_per_destination_pc)
                              | "FlattenPartitions" >> beam.Flatten())
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                                singleton_pc))
        else:
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(multiple_partitions_per_destination_pc,
                                single_partition_per_destination_pc,
                                load_job_name_pcv, singleton_pc))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
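
The dynamic-destinations branch above can be exercised from the caller's side by passing a callable as the table, so each element picks its own destination and both partition branches are flattened and loaded through temporary tables. A hedged sketch follows; the routing function, table names and GCS bucket are hypothetical.

import apache_beam as beam

def route_to_table(row):
    # Hypothetical per-element routing: each row chooses its destination table.
    return 'my-project:logs.events_{}'.format(row['kind'])

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([{'kind': 'click', 'value': 1},
                       {'kind': 'view', 'value': 2}])
        | beam.io.WriteToBigQuery(
            table=route_to_table,  # callable => dynamic destinations
            schema='kind:STRING,value:INTEGER',
            custom_gcs_temp_location='gs://my-bucket/tmp',  # placeholder
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS))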
Example #3
    def expand(self, pcoll):
        p = pcoll.pipeline
        try:
            step_name = self.label
        except AttributeError:
            step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
            BigQueryBatchFileLoads.COUNT += 1

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location
        job_name = (p.options.view_as(GoogleCloudOptions).job_name
                    or 'AUTOMATIC_JOB_NAME')

        empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
        singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

        load_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "LoadJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP')))

        schema_mod_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "SchemaModJobNamePrefix" >> beam.Map(
                lambda _: _generate_job_name(
                    job_name, bigquery_tools.BigQueryJobTypes.LOAD,
                    'SCHEMA_MOD_STEP')))

        copy_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "CopyJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP')))

        file_prefix_pcv = pvalue.AsSingleton(
            singleton_pc
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination),
                *self.table_side_inputs))

        if not self.with_auto_sharding:
            all_destination_file_pairs_pc = self._write_files(
                destination_data_kv_pc, file_prefix_pcv)
        else:
            all_destination_file_pairs_pc = self._write_files_with_auto_sharding(
                destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        partitions = (
            grouped_files_pc
            | beam.ParDo(
                PartitionFiles(self.max_partition_size,
                               self.max_files_per_partition)).with_outputs(
                                   PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                                   PartitionFiles.SINGLE_PARTITION_TAG))

        multiple_partitions_per_destination_pc = partitions[
            PartitionFiles.MULTIPLE_PARTITIONS_TAG]
        single_partition_per_destination_pc = partitions[
            PartitionFiles.SINGLE_PARTITION_TAG]

        # When using dynamic destinations, both the single-partition and
        # multiple-partition outputs are loaded into BigQuery through temporary
        # tables to ensure atomicity.
        if self.dynamic_destinations:
            all_partitions = ((multiple_partitions_per_destination_pc,
                               single_partition_per_destination_pc)
                              | "FlattenPartitions" >> beam.Flatten())
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                                schema_mod_job_name_pcv, copy_job_name_pcv, p,
                                step_name))
        else:
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(multiple_partitions_per_destination_pc,
                                single_partition_per_destination_pc,
                                load_job_name_pcv, schema_mod_job_name_pcv,
                                copy_job_name_pcv, p, step_name))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
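
The job_name and temp_location read at the top of this expand come from GoogleCloudOptions. Below is a minimal, illustrative sketch of setting them when building the pipeline; all option values are placeholders.

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

options = PipelineOptions()
gcloud_options = options.view_as(GoogleCloudOptions)
gcloud_options.project = 'my-project'                # placeholder project id
gcloud_options.job_name = 'nightly-bq-load'          # surfaces as job_name above
gcloud_options.temp_location = 'gs://my-bucket/tmp'  # surfaces as temp_location above

with beam.Pipeline(options=options) as p:
    _ = p | beam.Create([1, 2, 3])  # stand-in for the real source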
Example #4
    def expand(self, pcoll):
        p = pcoll.pipeline

        self._custom_gcs_temp_location = (
            self._custom_gcs_temp_location
            or p.options.view_as(GoogleCloudOptions).temp_location)

        load_job_name_pcv = pvalue.AsSingleton(
            p
            | "ImpulseJobName" >> beam.Create([None])
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            p
            | "CreateFilePrefixView" >> beam.Create(
                [self._custom_gcs_temp_location])
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate)))

        outputs = (
            pcoll
            | "ApplyGlobalWindow" >> beam.WindowInto(
                beam.window.GlobalWindows())
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination))
            | beam.ParDo(
                WriteRecordsToFile(
                    max_files_per_bundle=self.max_files_per_bundle,
                    max_file_size=self.max_file_size,
                    coder=self.coder),
                file_prefix=file_prefix_pcv).with_outputs(
                    WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                    WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists the files that
        # contain records, and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded,
        # grouped, and all records for each destination-shard are written to files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        # Load jobs are triggered to temporary tables, and those are later copied to
        # the appropriate destination table. This ensures atomicity in case only some
        # of the load jobs fail: if any of them fails, the copy jobs are not
        # triggered.
        trigger_loads_outputs = (grouped_files_pc | beam.ParDo(
            TriggerLoadJobs(schema=self.schema,
                            write_disposition=self.write_disposition,
                            create_disposition=self.create_disposition,
                            test_client=self.test_client,
                            temporary_tables=self.temp_tables),
            load_job_name_pcv).with_outputs(TriggerLoadJobs.TEMP_TABLES,
                                            main='main'))

        destination_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                temporary_tables=self.temp_tables,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            | "RemoveTempTables/AddUselessValue" >> beam.Map(
                lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
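
The RemoveTempTables steps above rely on a small deduplication idiom: key each element by itself, group, then keep only the keys. The following self-contained sketch runs the same idiom on plain strings with the DirectRunner.

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['temp_table_1', 'temp_table_2', 'temp_table_1'])
        | 'AddUselessValue' >> beam.Map(lambda x: (x, None))
        | 'DeduplicateTables' >> beam.GroupByKey()
        | 'GetTableNames' >> beam.Map(lambda kv: kv[0])
        | beam.Map(print))  # each distinct name appears exactly once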
Example #5
    def expand(self, pcoll):
        p = pcoll.pipeline

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location

        load_job_name_pcv = pvalue.AsSingleton(
            p
            | "ImpulseJobName" >> beam.Create([None])
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            p
            | "CreateFilePrefixView" >> beam.Create([''])
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination),
                *self.table_side_inputs))

        all_destination_file_pairs_pc = self._write_files(
            destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        # Load jobs are triggered to temporary tables, and those are later copied to
        # the appropriate destination table. This ensures atomicity in case only some
        # of the load jobs fail: if any of them fails, the copy jobs are not
        # triggered.
        trigger_loads_outputs = (grouped_files_pc | beam.ParDo(
            TriggerLoadJobs(
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                test_client=self.test_client,
                temporary_tables=self.temp_tables,
                additional_bq_parameters=self.additional_bq_parameters),
            load_job_name_pcv, *self.schema_side_inputs).with_outputs(
                TriggerLoadJobs.TEMP_TABLES, main='main'))

        destination_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                temporary_tables=self.temp_tables,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            | "RemoveTempTables/AddUselessValue" >> beam.Map(
                lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))
        return {
            self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
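
This version threads additional_bq_parameters through to TriggerLoadJobs; on the public WriteToBigQuery transform the same hook forwards extra load-job configuration such as time partitioning. A hedged sketch follows, with placeholder table, bucket and field names.

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([{'event': 'signup', 'ts': '2024-01-01 00:00:00 UTC'}])
        | beam.io.WriteToBigQuery(
            'my-project:my_dataset.events',  # placeholder table spec
            schema='event:STRING,ts:TIMESTAMP',
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            custom_gcs_temp_location='gs://my-bucket/tmp',  # placeholder
            additional_bq_parameters={
                'timePartitioning': {'type': 'DAY', 'field': 'ts'}
            }))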