Example #1
    def _cc_pipeline(self, pipeline, pipeline_name):

        runtime_configuration = self._get_runtime_configuration(pipeline.runtime_config)

        cos_endpoint = runtime_configuration.metadata['cos_endpoint']
        cos_username = runtime_configuration.metadata['cos_username']
        cos_password = runtime_configuration.metadata['cos_password']
        cos_directory = pipeline_name
        cos_bucket = runtime_configuration.metadata['cos_bucket']

        # Create dictionary that maps component Id to its ContainerOp instance
        notebook_ops = {}

        # All previous operation outputs should be propagated throughout the pipeline.
        # To process this, the current operation's inputs are combined with its
        # parents' inputs (which are themselves derived from the outputs of their
        # own parents) and its parents' outputs.
        for pipeline_operation in pipeline.operations.values():
            parent_inputs_and_outputs = []
            for parent_operation_id in pipeline_operation.parent_operations:
                parent_operation = pipeline.operations[parent_operation_id]
                if parent_operation.inputs:
                    parent_inputs_and_outputs.extend(parent_operation.inputs)
                if parent_operation.outputs:
                    parent_inputs_and_outputs.extend(parent_operation.outputs)

                if parent_inputs_and_outputs:
                    pipeline_operation.inputs = parent_inputs_and_outputs

        for operation in pipeline.operations.values():
            operation_artifact_archive = self._get_dependency_archive_name(operation)

            self.log.debug("Creating pipeline component :\n "
                           "componentID : %s \n "
                           "name : %s \n "
                           "parent_operations : %s \n "
                           "dependencies : %s \n "
                           "dependencies include subdirectories : %s \n "
                           "filename : %s \n "
                           "archive : %s \n "
                           "inputs : %s \n "
                           "outputs : %s \n "
                           "runtime image : %s \n ",
                           operation.id,
                           operation.name,
                           operation.parent_operations,
                           operation.dependencies,
                           operation.include_subdirectories,
                           operation.filename,
                           operation_artifact_archive,
                           operation.inputs,
                           operation.outputs,
                           operation.runtime_image)

            # create pipeline operation
            notebook_op = NotebookOp(name=operation.name,
                                     notebook=operation.filename,
                                     cos_endpoint=cos_endpoint,
                                     cos_bucket=cos_bucket,
                                     cos_directory=cos_directory,
                                     cos_dependencies_archive=operation_artifact_archive,
                                     image=operation.runtime_image)

            if operation.inputs:
                notebook_op.add_pipeline_inputs(self._artifact_list_to_str(operation.inputs))
            if operation.outputs:
                notebook_op.add_pipeline_outputs(self._artifact_list_to_str(operation.outputs))

            notebook_op.add_environment_variable('AWS_ACCESS_KEY_ID', cos_username)
            notebook_op.add_environment_variable('AWS_SECRET_ACCESS_KEY', cos_password)

            # Set ENV variables
            if operation.env_vars:
                for env_var in operation.env_vars:
                    # Split on the first occurrence of '=' and strip surrounding
                    # whitespace and quote characters from both key and value
                    result = [x.strip(' \'\"') for x in env_var.split('=', 1)]
                    # Only set the variable when the key is non-empty and a value is present
                    if len(result) == 2 and result[0] != '':
                        notebook_op.add_environment_variable(result[0], result[1])

            notebook_ops[operation.id] = notebook_op

            self.log.info("NotebookOp Created for Component '%s' (%s)", operation.name, operation.id)

            # upload operation dependencies to object storage
            try:
                dependency_archive_path = self._generate_dependency_archive(operation)
                cos_client = CosClient(config=runtime_configuration)
                cos_client.upload_file_to_dir(dir=cos_directory,
                                              file_name=operation_artifact_archive,
                                              file_path=dependency_archive_path)
            except BaseException:
                self.log.error("Error uploading artifacts to object storage.", exc_info=True)
                raise

            self.log.info("Pipeline dependencies have been uploaded to object storage")

        # Process dependencies after all the operations have been created
        for pipeline_operation in pipeline.operations.values():
            op = notebook_ops[pipeline_operation.id]
            for parent_operation_id in pipeline_operation.parent_operations:
                parent_op = notebook_ops[parent_operation_id]  # Parent Operation
                op.after(parent_op)

        return notebook_ops
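
The KEY=value parsing used above can be exercised in isolation. Below is a minimal, self-contained sketch of that same logic; the parse_env_var helper is hypothetical and introduced only for illustration, it is not part of the Elyra code shown here.

def parse_env_var(env_var):
    """Split a 'KEY=value' entry on the first '=' and strip quotes/whitespace.

    Returns a (key, value) tuple, or None when no usable key/value pair exists.
    Hypothetical helper, mirroring the inline parsing in the loops above.
    """
    result = [x.strip(' \'\"') for x in env_var.split('=', 1)]
    if len(result) == 2 and result[0] != '':
        return result[0], result[1]
    return None


# Quoted values and stray whitespace are tolerated; entries without '=' or with
# an empty key are skipped, matching the guard in the examples.
assert parse_env_var("FOO='bar'") == ('FOO', 'bar')
assert parse_env_var('  PATH = "/opt/app" ') == ('PATH', '/opt/app')
assert parse_env_var('MALFORMED') is None
assert parse_env_var('=value') is None
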
Example #2
    def _cc_pipeline(self, pipeline, pipeline_name):

        runtime_configuration = self._get_runtime_configuration(
            pipeline.runtime_config)

        cos_endpoint = runtime_configuration.metadata['cos_endpoint']
        cos_username = runtime_configuration.metadata['cos_username']
        cos_password = runtime_configuration.metadata['cos_password']
        cos_directory = pipeline_name
        cos_bucket = runtime_configuration.metadata['cos_bucket']

        # Create dictionary that maps component Id to its ContainerOp instance
        notebook_ops = {}

        # All previous operation outputs should be propagated throughout the pipeline.
        # To process this, the current operation's inputs are combined with its
        # parents' inputs (which are themselves derived from the outputs of their
        # own parents) and its parents' outputs.
        for operation in pipeline.operations.values():
            parent_io = []  # gathers inputs & outputs from the parent operations
            for parent_operation_id in operation.parent_operations:
                parent_operation = pipeline.operations[parent_operation_id]
                if parent_operation.inputs:
                    parent_io.extend(parent_operation.inputs)
                if parent_operation.outputs:
                    parent_io.extend(parent_operation.outputs)

                if parent_io:
                    operation.inputs = parent_io

        for operation in pipeline.operations.values():
            operation_artifact_archive = self._get_dependency_archive_name(
                operation)

            self.log.debug(
                "Creating pipeline component :\n {op} archive : {archive}".
                format(op=operation, archive=operation_artifact_archive))

            # create pipeline operation
            notebook_op = NotebookOp(
                name=operation.name,
                notebook=operation.filename,
                cos_endpoint=cos_endpoint,
                cos_bucket=cos_bucket,
                cos_directory=cos_directory,
                cos_dependencies_archive=operation_artifact_archive,
                image=operation.runtime_image)

            if operation.inputs:
                notebook_op.add_pipeline_inputs(
                    self._artifact_list_to_str(operation.inputs))
            if operation.outputs:
                notebook_op.add_pipeline_outputs(
                    self._artifact_list_to_str(operation.outputs))

            notebook_op.add_environment_variable('AWS_ACCESS_KEY_ID',
                                                 cos_username)
            notebook_op.add_environment_variable('AWS_SECRET_ACCESS_KEY',
                                                 cos_password)

            # Set ENV variables
            if operation.env_vars:
                for env_var in operation.env_vars:
                    # Split on the first occurrence of '=' and strip surrounding
                    # whitespace and quote characters from both key and value
                    result = [x.strip(' \'\"') for x in env_var.split('=', 1)]
                    # Only set the variable when the key is non-empty and a value is present
                    if len(result) == 2 and result[0] != '':
                        notebook_op.add_environment_variable(
                            result[0], result[1])

            notebook_ops[operation.id] = notebook_op

            self.log.info("NotebookOp Created for Component '%s' (%s)",
                          operation.name, operation.id)

            # upload operation dependencies to object storage
            try:
                t0 = time.time()
                dependency_archive_path = self._generate_dependency_archive(
                    operation)
                t1 = time.time()
                self.log.debug(
                    "Generation of dependency archive for operation '{name}' took {duration:.3f} secs."
                    .format(name=operation.name, duration=(t1 - t0)))

                cos_client = CosClient(config=runtime_configuration)
                t0 = time.time()
                cos_client.upload_file_to_dir(
                    dir=cos_directory,
                    file_name=operation_artifact_archive,
                    file_path=dependency_archive_path)
                t1 = time.time()
                self.log.debug(
                    "Upload of dependency archive for operation '{name}' took {duration:.3f} secs."
                    .format(name=operation.name, duration=(t1 - t0)))

            except FileNotFoundError as ex:
                self.log.error(
                    "Dependencies were not found building archive for operation: {}"
                    .format(operation.name),
                    exc_info=True)
                raise FileNotFoundError(
                    "Node '{}' referenced dependencies that were not found: {}"
                    .format(operation.name, ex))

            except BaseException as ex:
                self.log.error(
                    "Error uploading artifacts to object storage for operation: {}"
                    .format(operation.name),
                    exc_info=True)
                raise ex

            self.log.info(
                "Pipeline dependencies have been uploaded to object storage")

        # Process dependencies after all the operations have been created
        for operation in pipeline.operations.values():
            op = notebook_ops[operation.id]
            for parent_operation_id in operation.parent_operations:
                parent_op = notebook_ops[
                    parent_operation_id]  # Parent Operation
                op.after(parent_op)

        return notebook_ops
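
Both variants return a dictionary of NotebookOp instances keyed by operation id; in Elyra this construction is ultimately compiled into a Kubeflow Pipelines package. Below is a minimal sketch of that calling pattern, assuming the KFP SDK v1 compiler; compile_pipeline and the processor argument are illustrative names, not the exact Elyra call site.

import kfp.dsl as dsl
from kfp.compiler import Compiler


def compile_pipeline(processor, pipeline, pipeline_name, package_path):
    # Wrap _cc_pipeline in a pipeline function so each NotebookOp (a ContainerOp
    # subclass) is registered while the KFP DSL compilation context is active.
    @dsl.pipeline(name=pipeline_name, description='Generated notebook pipeline')
    def pipeline_func():
        processor._cc_pipeline(pipeline, pipeline_name)

    # Emit the compiled workflow package (e.g. .yaml or .tar.gz) for upload.
    Compiler().compile(pipeline_func, package_path)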