def create_batch(
    self,
    batch_id: str,
    batch_request: Dict[str, Any]
) -> Dict[str, Any]:
  """Common function for creating a batch workload.

  Args:
    batch_id: Dataproc batch id.
    batch_request: Dict of the Batch resource. For more details, see:
      https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.locations.batches#:-batch

  Returns:
    Dict of the long-running Operation resource. For more details, see:
      https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.operations#resource:-operation
  """
  # Create the Batch resource.
  create_batch_url = f'https://dataproc.googleapis.com/v1/projects/{self._project}/locations/{self._location}/batches/?batchId={batch_id}'
  lro = self._create_resource(create_batch_url, json.dumps(batch_request))
  lro_name = lro['name']

  # Write the Operation resource uri to the gcp_resources output file.
  job_resources = gcp_resources_pb2.GcpResources()
  job_resource = job_resources.resources.add()
  job_resource.resource_type = 'DataprocLro'
  job_resource.resource_uri = f'{_DATAPROC_URI_PREFIX}/{lro_name}'
  with open(self._gcp_resources, 'w') as f:
    f.write(json_format.MessageToJson(job_resources))

  return lro
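# Usage sketch (illustrative, not from the source): `DataprocBatchRemoteRunner`
# is an assumed name for the class that defines create_batch, and the bucket,
# file, and batch id below are hypothetical placeholders. The Batch resource
# uses the REST JSON field names from the Dataproc v1 API:
#
#   runner = DataprocBatchRemoteRunner(...)
#   batch_request = {
#       'pysparkBatch': {'mainPythonFileUri': 'gs://example-bucket/main.py'},
#   }
#   lro = runner.create_batch('example-batch-id', batch_request)
#   # The returned long-running Operation can be polled until its 'done'
#   # field is True; its 'name' was also written to the gcp_resources file.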
def create_python_job(
    python_module_path: str,
    project: str,
    gcp_resources: str,
    location: str,
    temp_location: str,
    requirements_file_path: str = '',
    args: Optional[str] = '[]'
):
  """Creates a Dataflow python job.

  Args:
    python_module_path: The GCS path to the python file or folder to run.
    project: Required. The project in which the resource will be launched.
    gcp_resources: A placeholder output for returning the gcp_resources proto.
    location: Required. The region in which the resource will be launched.
    temp_location: A GCS path for Dataflow to stage temporary job files
      created during the execution of the pipeline.
    requirements_file_path: Optional. The GCS or local path to the pip
      requirements file.
    args: The JsonArray list of args to pass to the python file.

  Returns:
    An instance of GcpResources proto with the Dataflow job id, which is
    stored in the gcp_resources path.

  Raises:
    RuntimeError: If the execution does not return a job id.
  """
  job_id = None
  if requirements_file_path:
    install_requirements(requirements_file_path)
  args_list = []
  if args:
    args_list = json.loads(args)
  python_file_path = stage_file(python_module_path)
  cmd = prepare_cmd(project, location, python_file_path, args_list,
                    temp_location)
  sub_process = Process(cmd)
  for line in sub_process.read_lines():
    logging.info('DataflowRunner output: %s', line)
    job_id, location = extract_job_id_and_location(line)
    if job_id:
      logging.info('Found job id %s and location %s.', job_id, location)
      # Write the job proto to output.
      job_resources = gcp_resources_pb2.GcpResources()
      job_resource = job_resources.resources.add()
      job_resource.resource_type = 'DataflowJob'
      job_resource.resource_uri = f'https://dataflow.googleapis.com/v1b3/projects/{project}/locations/{location}/jobs/{job_id}'
      with open(gcp_resources, 'w') as f:
        f.write(json_format.MessageToJson(job_resources))
      break

  if not job_id:
    raise RuntimeError(
        'No dataflow job was found when running the python file.')
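# A possible sketch of the extract_job_id_and_location helper referenced
# above (its actual implementation is not shown in this section): the
# DataflowRunner console output includes a link of the form
# https://console.cloud.google.com/dataflow/jobs/<location>/<job_id>, so
# both values can be recovered from a log line with a single regex. The
# pattern and character classes here are assumptions.
import re


_JOB_CONSOLE_LINK_PATTERN = re.compile(
    r'.*console\.cloud\.google\.com/dataflow/jobs/'
    r'(?P<location>[a-zA-Z0-9\-_]+)/(?P<job_id>[a-zA-Z0-9\-_]+).*')


def extract_job_id_and_location(line: str):
  """Returns (job_id, location) parsed from a log line, else (None, None)."""
  match = _JOB_CONSOLE_LINK_PATTERN.match(line)
  if match:
    return match.group('job_id'), match.group('location')
  return None, None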
def _validate_gcp_resources_succeeded(self):
  with open(self._gcp_resources) as f:
    serialized_gcp_resources = f.read()

    # Instantiate GcpResources proto.
    operations = json_format.Parse(serialized_gcp_resources,
                                   gcp_resources_pb2.GcpResources())

    self.assertLen(operations.resources, 1)
    self.assertEqual(
        operations.resources[0].resource_uri,
        f'https://dataproc.googleapis.com/v1/projects/{self._project}/regions/{self._location}/operations/{self._operation_id}'
    )
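# For reference, the serialized GcpResources JSON that this assertion parses
# looks roughly like the following (json_format.MessageToJson emits camelCase
# field names; the project, region, and operation id are placeholders):
#
# {
#   "resources": [
#     {
#       "resourceType": "DataprocLro",
#       "resourceUri": "https://dataproc.googleapis.com/v1/projects/my-project/regions/us-central1/operations/op-123"
#     }
#   ]
# }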
def check_if_operation_exists(self) -> Union[Dict[str, Any], None]:
  """Check if a Dataproc Batch operation already exists.

  Returns:
    Dict of the long-running Operation resource if it exists. For more
      details, see:
      https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.operations#resource:-operation
    None if the Operation resource does not exist.

  Raises:
    ValueError: Operation resource uri format is invalid.
  """
  if path.exists(self._gcp_resources) and os.stat(
      self._gcp_resources).st_size != 0:
    with open(self._gcp_resources) as f:
      serialized_gcp_resources = f.read()

      job_resources = json_format.Parse(serialized_gcp_resources,
                                        gcp_resources_pb2.GcpResources())
      # Resources should only contain one item.
      if len(job_resources.resources) != 1:
        raise ValueError(
            f'gcp_resources should contain one resource, found '
            f'{len(job_resources.resources)}')

      # Validate the format of the Operation resource uri.
      job_name_pattern = re.compile(_DATAPROC_OPERATION_URI_TEMPLATE)
      match = job_name_pattern.match(job_resources.resources[0].resource_uri)
      try:
        matched_project = match.group('project')
        matched_region = match.group('region')
        matched_operation_id = match.group('operation')
      except AttributeError as err:
        raise ValueError('Invalid Resource uri: {}. Expect: {}.'.format(
            job_resources.resources[0].resource_uri,
            'https://dataproc.googleapis.com/v1/projects/[projectId]/regions/[region]/operations/[operationId]'
        )) from err

      # Get the long-running Operation resource.
      lro = self._get_resource(job_resources.resources[0].resource_uri)
      return lro
  else:
    return None
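# A plausible definition for _DATAPROC_OPERATION_URI_TEMPLATE (assumed, as
# the constant is not shown in this section): a regex with named groups so
# that match.group('project') / ('region') / ('operation') above succeed
# only when the stored uri matches the expected Operation resource format.
_DATAPROC_OPERATION_URI_TEMPLATE = (
    r'(https://dataproc.googleapis.com/v1/'
    r'projects/(?P<project>.*)/'
    r'regions/(?P<region>.*)/'
    r'operations/(?P<operation>.*))')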