def __init__(self, packages, options, environment_version, pipeline_url):
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  self.worker_options = options.view_as(WorkerOptions)
  self.debug_options = options.view_as(DebugOptions)
  self.pipeline_url = pipeline_url
  self.proto = dataflow.Environment()
  self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
  self.proto.dataset = '{}/cloud_dataflow'.format(
      GoogleCloudOptions.BIGQUERY_API_SERVICE)
  self.proto.tempStoragePrefix = (
      self.google_cloud_options.temp_location.replace(
          'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
  # User agent information.
  self.proto.userAgent = dataflow.Environment.UserAgentValue()
  self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

  if self.google_cloud_options.service_account_email:
    self.proto.serviceAccountEmail = (
        self.google_cloud_options.service_account_email)

  self.proto.userAgent.additionalProperties.extend([
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='name', value=to_json_value(self._get_python_sdk_name())),
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='version', value=to_json_value(beam_version.__version__))
  ])
  # Version information.
  self.proto.version = dataflow.Environment.VersionValue()
  _verify_interpreter_version_is_supported(options)
  if self.standard_options.streaming:
    job_type = 'FNAPI_STREAMING'
  else:
    if _use_fnapi(options):
      job_type = 'FNAPI_BATCH'
    else:
      job_type = 'PYTHON_BATCH'
  self.proto.version.additionalProperties.extend([
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='job_type', value=to_json_value(job_type)),
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='major', value=to_json_value(environment_version))
  ])
  # TODO: Use enumerated type instead of strings for job types.
  if job_type.startswith('FNAPI_'):
    runner_harness_override = (get_runner_harness_container_image())
    self.debug_options.experiments = self.debug_options.experiments or []
    if runner_harness_override:
      self.debug_options.experiments.append(
          'runner_harness_container_image=' + runner_harness_override)
    # Add the use_multiple_sdk_containers flag if it's not already present. Do
    # not add the flag if 'no_use_multiple_sdk_containers' is present.
    # TODO: Clean up use_multiple_sdk_containers once we deprecate Python SDK
    # versions up to 2.4.
    debug_options_experiments = self.debug_options.experiments
    if ('use_multiple_sdk_containers' not in debug_options_experiments and
        'no_use_multiple_sdk_containers' not in debug_options_experiments):
      self.debug_options.experiments.append('use_multiple_sdk_containers')
  # FlexRS
  if self.google_cloud_options.flexrs_goal == 'COST_OPTIMIZED':
    self.proto.flexResourceSchedulingGoal = (
        dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
        FLEXRS_COST_OPTIMIZED)
  elif self.google_cloud_options.flexrs_goal == 'SPEED_OPTIMIZED':
    self.proto.flexResourceSchedulingGoal = (
        dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
        FLEXRS_SPEED_OPTIMIZED)
  # Experiments
  if self.debug_options.experiments:
    for experiment in self.debug_options.experiments:
      self.proto.experiments.append(experiment)
  # Worker pool(s) information.
  package_descriptors = []
  for package in packages:
    package_descriptors.append(
        dataflow.Package(
            location='%s/%s' % (
                self.google_cloud_options.staging_location.replace(
                    'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                package),
            name=package))

  pool = dataflow.WorkerPool(
      kind='local' if self.local else 'harness',
      packages=package_descriptors,
      taskrunnerSettings=dataflow.TaskRunnerSettings(
          parallelWorkerSettings=dataflow.WorkerSettings(
              baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
              servicePath=self.google_cloud_options.dataflow_endpoint)))

  pool.autoscalingSettings = dataflow.AutoscalingSettings()
  # Set worker pool options received through command line.
  if self.worker_options.num_workers:
    pool.numWorkers = self.worker_options.num_workers
  if self.worker_options.max_num_workers:
    pool.autoscalingSettings.maxNumWorkers = (
        self.worker_options.max_num_workers)
  if self.worker_options.autoscaling_algorithm:
    values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
    pool.autoscalingSettings.algorithm = {
        'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
        'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
    }.get(self.worker_options.autoscaling_algorithm)
  if self.worker_options.machine_type:
    pool.machineType = self.worker_options.machine_type
  if self.worker_options.disk_size_gb:
    pool.diskSizeGb = self.worker_options.disk_size_gb
  if self.worker_options.disk_type:
    pool.diskType = self.worker_options.disk_type
  if self.worker_options.zone:
    pool.zone = self.worker_options.zone
  if self.worker_options.network:
    pool.network = self.worker_options.network
  if self.worker_options.subnetwork:
    pool.subnetwork = self.worker_options.subnetwork
  if self.worker_options.worker_harness_container_image:
    pool.workerHarnessContainerImage = (
        self.worker_options.worker_harness_container_image)
  else:
    pool.workerHarnessContainerImage = (
        get_default_container_image_for_current_sdk(job_type))
  if self.worker_options.use_public_ips is not None:
    if self.worker_options.use_public_ips:
      pool.ipConfiguration = (
          dataflow.WorkerPool.IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
    else:
      pool.ipConfiguration = (
          dataflow.WorkerPool.IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

  if self.standard_options.streaming:
    # Use separate data disk for streaming.
    disk = dataflow.Disk()
    if self.local:
      disk.diskType = 'local'
    # TODO(ccy): allow customization of disk.
    pool.dataDisks.append(disk)
  self.proto.workerPools.append(pool)

  sdk_pipeline_options = options.get_all_options()
  if sdk_pipeline_options:
    self.proto.sdkPipelineOptions = (
        dataflow.Environment.SdkPipelineOptionsValue())

    options_dict = {
        k: v
        for k, v in sdk_pipeline_options.items() if v is not None
    }
    options_dict["pipelineUrl"] = pipeline_url
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
            key='options', value=to_json_value(options_dict)))

    dd = DisplayData.create_from_options(options)
    items = [item.get_dict() for item in dd.items]
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
            key='display_data', value=to_json_value(items)))
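# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the source above) showing how this
# constructor might be exercised directly. The import paths follow the Beam
# SDK layout this excerpt appears to come from; the flag values, staged
# package name, environment_version and pipeline_url are illustrative
# assumptions, since in practice DataflowRunner supplies them when it
# submits a job.
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.dataflow.internal import apiclient

options = PipelineOptions([
    '--project=my-project',                       # hypothetical project id
    '--job_name=example-job',
    '--temp_location=gs://my-bucket/temp',        # hypothetical GCS paths
    '--staging_location=gs://my-bucket/staging',
])

env = apiclient.Environment(
    packages=['workflow.tar.gz'],                 # assumed already staged
    options=options,
    environment_version='6',                      # assumed; chosen by the runner
    pipeline_url='gs://my-bucket/staging/pipeline.pb')

# env.proto now holds the populated dataflow.Environment message that is
# attached to the job submission request.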
def __init__(self, packages, options, environment_version):
  self.standard_options = options.view_as(StandardOptions)
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  self.worker_options = options.view_as(WorkerOptions)
  self.debug_options = options.view_as(DebugOptions)
  self.proto = dataflow.Environment()
  self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
  self.proto.dataset = '{}/cloud_dataflow'.format(
      GoogleCloudOptions.BIGQUERY_API_SERVICE)
  self.proto.tempStoragePrefix = (
      self.google_cloud_options.temp_location.replace(
          'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
  # User agent information.
  self.proto.userAgent = dataflow.Environment.UserAgentValue()
  self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

  if self.google_cloud_options.service_account_email:
    self.proto.serviceAccountEmail = (
        self.google_cloud_options.service_account_email)

  sdk_name, version_string = get_sdk_name_and_version()

  self.proto.userAgent.additionalProperties.extend([
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='name', value=to_json_value(sdk_name)),
      dataflow.Environment.UserAgentValue.AdditionalProperty(
          key='version', value=to_json_value(version_string))
  ])
  # Version information.
  self.proto.version = dataflow.Environment.VersionValue()
  if self.standard_options.streaming:
    job_type = 'PYTHON_STREAMING'
  else:
    job_type = 'PYTHON_BATCH'
  self.proto.version.additionalProperties.extend([
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='job_type', value=to_json_value(job_type)),
      dataflow.Environment.VersionValue.AdditionalProperty(
          key='major', value=to_json_value(environment_version))
  ])
  # Experiments
  if self.debug_options.experiments:
    for experiment in self.debug_options.experiments:
      self.proto.experiments.append(experiment)
  # Worker pool(s) information.
  package_descriptors = []
  for package in packages:
    package_descriptors.append(
        dataflow.Package(
            location='%s/%s' % (
                self.google_cloud_options.staging_location.replace(
                    'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE),
                package),
            name=package))

  pool = dataflow.WorkerPool(
      kind='local' if self.local else 'harness',
      packages=package_descriptors,
      taskrunnerSettings=dataflow.TaskRunnerSettings(
          parallelWorkerSettings=dataflow.WorkerSettings(
              baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
              servicePath=self.google_cloud_options.dataflow_endpoint)))
  pool.autoscalingSettings = dataflow.AutoscalingSettings()
  # Set worker pool options received through command line.
  if self.worker_options.num_workers:
    pool.numWorkers = self.worker_options.num_workers
  if self.worker_options.max_num_workers:
    pool.autoscalingSettings.maxNumWorkers = (
        self.worker_options.max_num_workers)
  if self.worker_options.autoscaling_algorithm:
    values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
    pool.autoscalingSettings.algorithm = {
        'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
        'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
    }.get(self.worker_options.autoscaling_algorithm)
  if self.worker_options.machine_type:
    pool.machineType = self.worker_options.machine_type
  if self.worker_options.disk_size_gb:
    pool.diskSizeGb = self.worker_options.disk_size_gb
  if self.worker_options.disk_type:
    pool.diskType = self.worker_options.disk_type
  if self.worker_options.zone:
    pool.zone = self.worker_options.zone
  if self.worker_options.network:
    pool.network = self.worker_options.network
  if self.worker_options.worker_harness_container_image:
    pool.workerHarnessContainerImage = (
        self.worker_options.worker_harness_container_image)
  else:
    # Default to using the worker harness container image for the current SDK
    # version.
    pool.workerHarnessContainerImage = (
        'dataflow.gcr.io/v1beta3/python:%s' %
        get_required_container_version())
  if self.worker_options.use_public_ips is not None:
    if self.worker_options.use_public_ips:
      pool.ipConfiguration = (
          dataflow.WorkerPool.IpConfigurationValueValuesEnum.WORKER_IP_PUBLIC)
    else:
      pool.ipConfiguration = (
          dataflow.WorkerPool.IpConfigurationValueValuesEnum.WORKER_IP_PRIVATE)

  if self.standard_options.streaming:
    # Use separate data disk for streaming.
    disk = dataflow.Disk()
    if self.local:
      disk.diskType = 'local'
    # TODO(ccy): allow customization of disk.
    pool.dataDisks.append(disk)
  self.proto.workerPools.append(pool)

  sdk_pipeline_options = options.get_all_options()
  if sdk_pipeline_options:
    self.proto.sdkPipelineOptions = (
        dataflow.Environment.SdkPipelineOptionsValue())

    options_dict = {
        k: v
        for k, v in sdk_pipeline_options.iteritems() if v is not None
    }
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
            key='options', value=to_json_value(options_dict)))

    dd = DisplayData.create_from_options(options)
    items = [item.get_dict() for item in dd.items]
    self.proto.sdkPipelineOptions.additionalProperties.append(
        dataflow.Environment.SdkPipelineOptionsValue.AdditionalProperty(
            key='display_data', value=to_json_value(items)))
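# ---------------------------------------------------------------------------
# A standalone sketch (illustrative, not from the source) isolating the
# autoscaling mapping used in both versions above: the string passed via
# --autoscaling_algorithm is translated to the generated enum on the
# WorkerPool proto. The generated-client import path and the sample
# 'THROUGHPUT_BASED' value are assumptions for illustration.
from apache_beam.runners.dataflow.internal.clients import dataflow

pool = dataflow.WorkerPool(kind='harness')
pool.autoscalingSettings = dataflow.AutoscalingSettings()

values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
pool.autoscalingSettings.algorithm = {
    'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
    'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
}.get('THROUGHPUT_BASED')  # unknown strings fall back to None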