Example #1
0
    def create_batch(self, batch_id: str,
                     batch_request: Dict[str, Any]) -> Dict[str, Any]:
        """Creates a Dataproc batch workload.

    Args:
      batch_id: Dataproc batch id.
      batch_request: Dict of the Batch resource. For more details, see:
        https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.locations.batches#:-batch

    Returns:
       Dict of the long-running Operation resource. For more details, see:
         https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.operations#resource:-operation
    """
        # Issue the Batch creation request; the service responds with a
        # long-running Operation describing the workload.
        url = (f'https://dataproc.googleapis.com/v1/projects/{self._project}'
               f'/locations/{self._location}/batches/?batchId={batch_id}')
        operation = self._create_resource(url, json.dumps(batch_request))

        # Record the Operation resource uri in the gcp_resources output file
        # so downstream steps can track the workload.
        resources = gcp_resources_pb2.GcpResources()
        resource = resources.resources.add()
        resource.resource_type = 'DataprocLro'
        resource.resource_uri = f'{_DATAPROC_URI_PREFIX}/{operation["name"]}'
        with open(self._gcp_resources, 'w') as output_file:
            output_file.write(json_format.MessageToJson(resources))

        return operation
def create_python_job(python_module_path: str,
                      project: str,
                      gcp_resources: str,
                      location: str,
                      temp_location: str,
                      requirements_file_path: str = '',
                      args: Optional[str] = '[]'):
    """Creates a Dataflow python job.

  Args:
    python_module_path: The gcs path to the python file or folder to run.
    project: Required. The project of which the resource will be launched.
    gcp_resources: A placeholder output for returning the gcp_resources proto.
    location: Required. The region of which the resource will be launched.
    temp_location: A GCS path for Dataflow to stage temporary job files created
      during the execution of the pipeline.
    requirements_file_path: Optional, the gcs or local path to the pip
      requirements file.
    args: The JsonArray list of args to pass to the python file.

  Returns:
    An instance of GcpResources proto with the dataflow Job ID, which is
    stored in the gcp_resources path.
  Raises:
    RuntimeError: If the execution does not return a job ID.
  """
    if requirements_file_path:
        install_requirements(requirements_file_path)

    parsed_args = json.loads(args) if args else []

    # Stage the module locally and launch it through the Dataflow runner.
    staged_path = stage_file(python_module_path)
    cmd = prepare_cmd(project, location, staged_path, parsed_args,
                      temp_location)

    found_job_id = None
    runner = Process(cmd)
    for output_line in runner.read_lines():
        logging.info('DataflowRunner output: %s', output_line)
        found_job_id, location = extract_job_id_and_location(output_line)
        if found_job_id:
            logging.info('Found job id %s and location %s.', found_job_id,
                         location)
            # Persist the job resource so downstream steps can reference it.
            resources = gcp_resources_pb2.GcpResources()
            resource = resources.resources.add()
            resource.resource_type = 'DataflowJob'
            resource.resource_uri = f'https://dataflow.googleapis.com/v1b3/projects/{project}/locations/{location}/jobs/{found_job_id}'

            with open(gcp_resources, 'w') as output_file:
                output_file.write(json_format.MessageToJson(resources))
            break

    if not found_job_id:
        raise RuntimeError(
            'No dataflow job was found when running the python file.')
Example #3
0
 def _validate_gcp_resources_succeeded(self):
     """Asserts gcp_resources holds exactly one Dataproc operation URI."""
     with open(self._gcp_resources) as resources_file:
         serialized_gcp_resources = resources_file.read()
     # Instantiate GCPResources Proto from the serialized file contents.
     operations = json_format.Parse(serialized_gcp_resources,
                                    gcp_resources_pb2.GcpResources())
     self.assertLen(operations.resources, 1)
     expected_uri = (
         f'https://dataproc.googleapis.com/v1/projects/{self._project}'
         f'/regions/{self._location}/operations/{self._operation_id}')
     self.assertEqual(operations.resources[0].resource_uri, expected_uri)
Example #4
0
    def check_if_operation_exists(self) -> Union[Dict[str, Any], None]:
        """Check if a Dataproc Batch operation already exists.

    Returns:
      Dict of the long-running Operation resource if it exists. For more details, see:
         https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.operations#resource:-operation
      None if the Operation resource does not exist.

    Raises:
      ValueError: Operation resource uri format is invalid.
    """
        # Nothing was recorded from a previous run: no operation to resume.
        if not path.exists(self._gcp_resources) or os.stat(
                self._gcp_resources).st_size == 0:
            return None

        with open(self._gcp_resources) as resources_file:
            serialized_gcp_resources = resources_file.read()

        resources = json_format.Parse(serialized_gcp_resources,
                                      gcp_resources_pb2.GcpResources())
        # Exactly one recorded resource is expected.
        if len(resources.resources) != 1:
            raise ValueError(
                f'gcp_resources should contain one resource, found {len(resources.resources)}'
            )

        # Validate the Operation resource uri format; a non-matching uri
        # yields match == None and the group() calls raise AttributeError.
        resource_uri = resources.resources[0].resource_uri
        uri_pattern = re.compile(_DATAPROC_OPERATION_URI_TEMPLATE)
        match = uri_pattern.match(resource_uri)
        try:
            match.group('project')
            match.group('region')
            match.group('operation')
        except AttributeError as err:
            raise ValueError('Invalid Resource uri: {}. Expect: {}.'.format(
                resource_uri,
                'https://dataproc.googleapis.com/v1/projects/[projectId]/regions/[region]/operations/[operationId]'
            )) from err

        # Fetch and return the long-running Operation resource.
        return self._get_resource(resource_uri)