def _cc_pipeline(self, pipeline, pipeline_name):

    runtime_configuration = self._get_runtime_configuration(pipeline.runtime_config)

    cos_endpoint = runtime_configuration.metadata['cos_endpoint']
    cos_username = runtime_configuration.metadata['cos_username']
    cos_password = runtime_configuration.metadata['cos_password']
    cos_directory = pipeline_name
    cos_bucket = runtime_configuration.metadata['cos_bucket']

    # Create dictionary that maps component Id to its ContainerOp instance
    notebook_ops = {}

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which, themselves, are derived from the outputs of their parents)
    # and its parent's outputs.
    for pipeline_operation in pipeline.operations.values():
        parent_inputs_and_outputs = []
        for parent_operation_id in pipeline_operation.parent_operations:
            parent_operation = pipeline.operations[parent_operation_id]
            if parent_operation.inputs:
                parent_inputs_and_outputs.extend(parent_operation.inputs)
            if parent_operation.outputs:
                parent_inputs_and_outputs.extend(parent_operation.outputs)

        if parent_inputs_and_outputs:
            pipeline_operation.inputs = parent_inputs_and_outputs

    for operation in pipeline.operations.values():
        operation_artifact_archive = self._get_dependency_archive_name(operation)

        self.log.debug("Creating pipeline component :\n "
                       "componentID : %s \n "
                       "name : %s \n "
                       "parent_operations : %s \n "
                       "dependencies : %s \n "
                       "dependencies include subdirectories : %s \n "
                       "filename : %s \n "
                       "archive : %s \n "
                       "inputs : %s \n "
                       "outputs : %s \n "
                       "runtime image : %s \n ",
                       operation.id,
                       operation.name,
                       operation.parent_operations,
                       operation.dependencies,
                       operation.include_subdirectories,
                       operation.filename,
                       operation_artifact_archive,
                       operation.inputs,
                       operation.outputs,
                       operation.runtime_image)

        # create pipeline operation
        notebook_op = NotebookOp(name=operation.name,
                                 notebook=operation.filename,
                                 cos_endpoint=cos_endpoint,
                                 cos_bucket=cos_bucket,
                                 cos_directory=cos_directory,
                                 cos_dependencies_archive=operation_artifact_archive,
                                 image=operation.runtime_image)

        if operation.inputs:
            notebook_op.add_pipeline_inputs(self._artifact_list_to_str(operation.inputs))
        if operation.outputs:
            notebook_op.add_pipeline_outputs(self._artifact_list_to_str(operation.outputs))

        notebook_op.add_environment_variable('AWS_ACCESS_KEY_ID', cos_username)
        notebook_op.add_environment_variable('AWS_SECRET_ACCESS_KEY', cos_password)

        # Set ENV variables
        if operation.env_vars:
            for env_var in operation.env_vars:
                # Split on the first occurrence of '=' and strip surrounding
                # whitespace and quote characters from both key and value
                result = [x.strip(' \'\"') for x in env_var.split('=', 1)]
                # Should be a non-empty key with a value
                if len(result) == 2 and result[0] != '':
                    notebook_op.add_environment_variable(result[0], result[1])

        notebook_ops[operation.id] = notebook_op

        self.log.info("NotebookOp Created for Component '%s' (%s)", operation.name, operation.id)

        # upload operation dependencies to object storage
        try:
            dependency_archive_path = self._generate_dependency_archive(operation)
            cos_client = CosClient(config=runtime_configuration)
            cos_client.upload_file_to_dir(dir=cos_directory,
                                          file_name=operation_artifact_archive,
                                          file_path=dependency_archive_path)
        except BaseException:
            self.log.error("Error uploading artifacts to object storage.", exc_info=True)
            raise

        self.log.info("Pipeline dependencies have been uploaded to object storage")

    # Process dependencies after all the operations have been created
    for pipeline_operation in pipeline.operations.values():
        op = notebook_ops[pipeline_operation.id]
        for parent_operation_id in pipeline_operation.parent_operations:
            parent_op = notebook_ops[parent_operation_id]  # Parent Operation
            op.after(parent_op)

    return notebook_ops
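# Illustration only: a minimal, self-contained sketch of the parent
# input/output propagation performed above. `FakeOperation` is a
# hypothetical stand-in for elyra's Operation model; its name and fields
# are assumptions for this sketch, not part of the original code.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class FakeOperation:
    id: str
    parent_operations: List[str] = field(default_factory=list)
    inputs: List[str] = field(default_factory=list)
    outputs: List[str] = field(default_factory=list)


# A two-node pipeline: 'train' depends on 'prep'
operations: Dict[str, FakeOperation] = {
    'prep': FakeOperation(id='prep', inputs=['raw.csv'], outputs=['clean.csv']),
    'train': FakeOperation(id='train', parent_operations=['prep'], outputs=['model.pkl']),
}

# Same rule as in _cc_pipeline: a child's inputs become the union of each
# parent's inputs and outputs, so artifacts flow down the pipeline.
for op in operations.values():
    parent_io = []
    for parent_id in op.parent_operations:
        parent = operations[parent_id]
        parent_io.extend(parent.inputs)
        parent_io.extend(parent.outputs)
    if parent_io:
        op.inputs = parent_io

assert operations['train'].inputs == ['raw.csv', 'clean.csv']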
def _cc_pipeline(self, pipeline, pipeline_name):

    runtime_configuration = self._get_runtime_configuration(pipeline.runtime_config)

    cos_endpoint = runtime_configuration.metadata['cos_endpoint']
    cos_username = runtime_configuration.metadata['cos_username']
    cos_password = runtime_configuration.metadata['cos_password']
    cos_directory = pipeline_name
    cos_bucket = runtime_configuration.metadata['cos_bucket']

    # Create dictionary that maps component Id to its ContainerOp instance
    notebook_ops = {}

    # All previous operation outputs should be propagated throughout the pipeline.
    # In order to process this recursively, the current operation's inputs should be combined
    # from its parent's inputs (which, themselves, are derived from the outputs of their parents)
    # and its parent's outputs.
    for operation in pipeline.operations.values():
        parent_io = []  # gathers inputs & outputs relative to parent
        for parent_operation_id in operation.parent_operations:
            parent_operation = pipeline.operations[parent_operation_id]
            if parent_operation.inputs:
                parent_io.extend(parent_operation.inputs)
            if parent_operation.outputs:
                parent_io.extend(parent_operation.outputs)

        if parent_io:
            operation.inputs = parent_io

    for operation in pipeline.operations.values():
        operation_artifact_archive = self._get_dependency_archive_name(operation)

        self.log.debug("Creating pipeline component :\n {op} archive : {archive}"
                       .format(op=operation, archive=operation_artifact_archive))

        # create pipeline operation
        notebook_op = NotebookOp(name=operation.name,
                                 notebook=operation.filename,
                                 cos_endpoint=cos_endpoint,
                                 cos_bucket=cos_bucket,
                                 cos_directory=cos_directory,
                                 cos_dependencies_archive=operation_artifact_archive,
                                 image=operation.runtime_image)

        if operation.inputs:
            notebook_op.add_pipeline_inputs(self._artifact_list_to_str(operation.inputs))
        if operation.outputs:
            notebook_op.add_pipeline_outputs(self._artifact_list_to_str(operation.outputs))

        notebook_op.add_environment_variable('AWS_ACCESS_KEY_ID', cos_username)
        notebook_op.add_environment_variable('AWS_SECRET_ACCESS_KEY', cos_password)

        # Set ENV variables
        if operation.env_vars:
            for env_var in operation.env_vars:
                # Split on the first occurrence of '=' and strip surrounding
                # whitespace and quote characters from both key and value
                result = [x.strip(' \'\"') for x in env_var.split('=', 1)]
                # Should be a non-empty key with a value
                if len(result) == 2 and result[0] != '':
                    notebook_op.add_environment_variable(result[0], result[1])

        notebook_ops[operation.id] = notebook_op

        self.log.info("NotebookOp Created for Component '%s' (%s)", operation.name, operation.id)

        # upload operation dependencies to object storage
        try:
            # timing requires a module-level `import time`
            t0 = time.time()
            dependency_archive_path = self._generate_dependency_archive(operation)
            t1 = time.time()
            self.log.debug("Generation of dependency archive for operation '{name}' took {duration:.3f} secs."
                           .format(name=operation.name, duration=(t1 - t0)))

            cos_client = CosClient(config=runtime_configuration)
            t0 = time.time()
            cos_client.upload_file_to_dir(dir=cos_directory,
                                          file_name=operation_artifact_archive,
                                          file_path=dependency_archive_path)
            t1 = time.time()
            self.log.debug("Upload of dependency archive for operation '{name}' took {duration:.3f} secs."
                           .format(name=operation.name, duration=(t1 - t0)))
        except FileNotFoundError as ex:
            self.log.error("Dependencies were not found while building the archive for operation: {}"
                           .format(operation.name), exc_info=True)
            raise FileNotFoundError("Node '{}' referenced dependencies that were not found: {}"
                                    .format(operation.name, ex)) from ex
        except BaseException:
            self.log.error("Error uploading artifacts to object storage for operation: {}"
                           .format(operation.name), exc_info=True)
            raise  # re-raise as-is to preserve the original traceback

        self.log.info("Pipeline dependencies have been uploaded to object storage")

    # Process dependencies after all the operations have been created
    for operation in pipeline.operations.values():
        op = notebook_ops[operation.id]
        for parent_operation_id in operation.parent_operations:
            parent_op = notebook_ops[parent_operation_id]  # Parent Operation
            op.after(parent_op)

    return notebook_ops
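# For context only: a minimal sketch of how the ops returned by
# _cc_pipeline are typically consumed. The call is wrapped in a pipeline
# function and handed to the KFP v1 compiler, which records the
# NotebookOps and their op.after() dependencies while tracing the
# function. `processor`, `pipeline`, and the output path below are
# hypothetical stand-ins, not taken from the original code;
# kfp.compiler.Compiler().compile() is standard KFP v1 SDK API.
import kfp.compiler


def pipeline_func():
    # Builds every NotebookOp and wires parent/child dependencies
    processor._cc_pipeline(pipeline, pipeline_name='my-pipeline')


kfp.compiler.Compiler().compile(pipeline_func, 'my-pipeline.tar.gz')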