def _get_input_combinator(step: Step, scatter: Optional[Any] = None) -> InputCombinator:
    scatter_inputs = _get_scatter_inputs(scatter)
    # If there are no scatter ports in this step, create a single DotProduct combinator
    if not scatter_inputs:
        input_combinator = DotProductInputCombinator(name=utils.random_name(), step=step)
        for port in step.input_ports.values():
            input_combinator.ports[port.name] = port
        return input_combinator
    # If there are scatter ports
    else:
        other_ports = dict(step.input_ports)
        cartesian_combinator = JupyterCartesianProductInputCombinator(name=utils.random_name(), step=step)
        # Separate scatter ports from the other ones
        scatter_ports = {}
        for port_name, port in step.input_ports.items():
            if port_name in scatter_inputs:
                scatter_ports[port_name] = port
                del other_ports[port_name]
        # Choose the right combinator for the scatter ports, based on the scatter method property
        scatter_combinator = _get_combinator_from_scatter(
            step=step,
            scatter_ports=scatter_ports,
            scatter=scatter)
        cartesian_combinator.ports[scatter_combinator.name] = scatter_combinator
        # Create a CartesianProduct combinator between the scatter ports and the DotProduct of the others
        if other_ports:
            dotproduct_name = utils.random_name()
            dotproduct_combinator = DotProductInputCombinator(name=dotproduct_name, step=step)
            dotproduct_combinator.ports = other_ports
            cartesian_combinator.ports[dotproduct_name] = dotproduct_combinator
        return cartesian_combinator
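# Rough illustration (assumption, port names are hypothetical): for a step with scatter
# inputs 'a' and 'b' plus an ordinary input 'c', _get_input_combinator above wires a
# nesting along these lines:
#
#   JupyterCartesianProductInputCombinator
#   ├── <combinator from _get_combinator_from_scatter>   # drives scatter ports 'a' and 'b'
#   └── DotProductInputCombinator                        # groups the remaining port 'c'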
async def run(self) -> None:
    jobs = []
    # If there are input ports, create jobs until a termination token is received
    if self.input_ports:
        if self.input_combinator is None:
            raise WorkflowExecutionException(
                "No InputCombinator specified for step {step}".format(step=self.name))
        while True:
            # Retrieve input tokens
            inputs = await self.input_combinator.get()
            # Check for termination
            if utils.check_termination(inputs):
                break
            # Set status to fireable
            self.status = Status.FIREABLE
            # Run job
            jobs.append(asyncio.create_task(self._run_job(inputs), name=utils.random_name()))
    # Otherwise, simply run the job
    else:
        jobs.append(asyncio.create_task(self._run_job([]), name=utils.random_name()))
    # Wait for jobs termination
    statuses = await asyncio.gather(*jobs)
    # Terminate step
    self.terminate(_get_step_status(statuses))
def __init__(self, context: StreamFlowContext, checkpoint_dir: Optional[Text] = None):
    super().__init__(context)
    self.checkpoint_dir = checkpoint_dir or os.path.join(
        tempfile.gettempdir(), 'streamflow', 'checkpoint', utils.random_name())
    self.copy_tasks: MutableSequence = []
async def _get_tmpdir(self, resource: Text):
    scratch_home = '/scratch/home/{username}'.format(username=self.username)
    temp_dir = posixpath.join(scratch_home, 'streamflow', utils.random_name())
    async with self._get_ssh_client(resource) as ssh_client:
        await ssh_client.run('mkdir -p {dir}'.format(dir=temp_dir))
    return temp_dir
async def _run_with_streamflow(self,
                               cell_name: Text,
                               compiler,
                               ast_nodes: List[Tuple[ast.AST, Text]],
                               cell_config: MutableMapping[Text, Any]):
    # Build the step target from metadata
    cell = JupyterCell(name=cell_name, code=ast_nodes, compiler=compiler, metadata=cell_config)
    translator = JupyterNotebookTranslator(context=self.context)
    step = await translator.translate_cell(cell=cell, autoawait=self.autoawait, metadata=cell_config)
    # Inject inputs
    input_injector = BaseJob(
        name=utils.random_name(),
        step=BaseStep(utils.random_name(), self.context),
        inputs=[])
    await self._inject_inputs(step=step, job=input_injector)
    # Execute the step
    await step.run()
    # Print output log
    output_retriever = utils.random_name()
    d = tempfile.mkdtemp()
    output = await _get_output(step=step, output_retriever=output_retriever, d=d)
    if output:
        print(output)
    # Retrieve output tokens
    if step.status == Status.COMPLETED:
        output_names = {}
        for port_name, port in step.output_ports.items():
            if port_name != executor.CELL_OUTPUT:
                token_processor = step.output_ports[port_name].token_processor
                token = await step.output_ports[port_name].get(output_retriever)
                token = await token_processor.collect_output(token, d)
                if isinstance(token.job, MutableSequence):
                    output_names[token.name] = utils.flatten_list([t.value for t in token.value])
                else:
                    output_names[token.name] = token.value
        # Update namespaces
        self.user_ns.update(output_names)
def _init_dir(self) -> Text:
    if self.step.target is not None:
        path_processor = posixpath
        workdir = self.step.workdir or path_processor.join('/tmp', 'streamflow')
    else:
        path_processor = os.path
        workdir = self.step.workdir or path_processor.join(tempfile.gettempdir(), 'streamflow')
    dir_path = path_processor.join(workdir, utils.random_name())
    return dir_path
async def collect_output(self, token: Token, output_dir: Text) -> Token:
    if isinstance(token.job, MutableSequence) or self.port_type not in ['File', 'Directory']:
        return await super().collect_output(token, output_dir)
    if token.value is not None and self.port_type in ['File', 'Directory']:
        context = self.get_context()
        output_collector = BaseJob(
            name=random_name(),
            step=BaseStep(name=random_name(), context=context),
            inputs=[],
            input_directory=output_dir)
        return token.update(await self._update_file_token(
            job=output_collector,
            src_job=context.scheduler.get_job(token.job),
            token_value=token.value,
            load_listing=LoadListing.deep_listing,
            writable=True))
    else:
        return token
def _get_combinator_from_scatter(step: Step,
                                 scatter_ports: MutableMapping[Text, InputPort],
                                 scatter: Optional[MutableMapping[Text, Any]] = None) -> InputCombinator:
    # Guard against a missing scatter specification before reading its properties
    scatter = scatter or {}
    scatter_method = scatter.get('method', 'cartesian')
    combinator_name = utils.random_name()
    if scatter_method == 'cartesian':
        combinator = JupyterCartesianProductInputCombinator(name=combinator_name, step=step)
    else:
        combinator = DotProductInputCombinator(name=combinator_name, step=step)
    for entry in scatter.get('items') or []:
        if isinstance(entry, str):
            combinator.ports[entry] = scatter_ports[entry]
        else:
            inner_combinator = _get_combinator_from_scatter(
                step=step,
                scatter_ports=scatter_ports,
                scatter=entry)
            combinator.ports[inner_combinator.name] = inner_combinator
    return combinator
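# Minimal sketch (assumed shape, inferred from the function above) of the scatter
# specification it consumes: 'items' entries are either input port names or nested
# scatter blocks; any 'method' other than 'cartesian' falls back to a dot product.
# All concrete names and values below are illustrative only.
_example_scatter_spec = {
    'method': 'cartesian',
    'items': [
        'alpha',                                               # plain port name
        {'method': 'dotproduct', 'items': ['beta', 'gamma']},  # nested block -> DotProductInputCombinator
    ],
}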
async def _run_batch_command(self,
                             helper_file: Text,
                             job_name: Text,
                             resource: Text,
                             workdir: Optional[Text] = None,
                             stdin: Optional[Union[int, Text]] = None,
                             stdout: Union[int, Text] = asyncio.subprocess.STDOUT,
                             stderr: Union[int, Text] = asyncio.subprocess.STDOUT) -> Text:
    batch_command = "{workdir} qsub {stdin} {stdout} {stderr} {helper_file}".format(
        workdir="cd {workdir} &&".format(workdir=workdir) if workdir is not None else "",
        stdin="-i \"{stdin}\"".format(stdin=stdin) if stdin is not None else "",
        stdout="-o \"{stdout}\"".format(stdout=stdout if stdout != STDOUT else utils.random_name()),
        stderr="-e \"{stderr}\"".format(stderr=stderr) if stderr != STDOUT and stderr != stdout else "",
        helper_file=helper_file)
    async with self._get_ssh_client(resource) as ssh_client:
        result = await ssh_client.run(batch_command)
    return result.stdout.strip()
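# For reference (hypothetical values, derived only from the format string above):
# with workdir='/scratch/wd', stdin=None, the default stdout (the STDOUT sentinel,
# replaced by a random file name) and the default stderr (merged, hence omitted),
# the composed command is roughly:
#
#   cd /scratch/wd && qsub  -o "<random-name>"  /scratch/wd/helper.sh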
async def run(self, output_dir: Optional[Text] = os.getcwd()):
    output_tokens = {}
    # Execute workflow
    for step in self.workflow.steps:
        execution = asyncio.create_task(self._execute(step), name=step)
        self.executions.append(execution)
    # If the workflow has output ports
    if self.workflow.output_ports:
        # Retrieve output tokens
        output_consumer = utils.random_name()
        for port_name, port in self.workflow.output_ports.items():
            self.output_tasks.append(asyncio.create_task(port.get(output_consumer), name=port_name))
        while not self.closed:
            output_tokens = await self._wait_outputs(output_consumer, output_dir, output_tokens)
    # Otherwise, simply wait for all tasks to finish
    else:
        await asyncio.gather(*self.executions)
    # Check if the workflow terminated properly
    for step in self.workflow.steps.values():
        if step.status == Status.FAILED:
            raise WorkflowExecutionException("Workflow execution failed")
    # Print output tokens
    print(json.dumps(output_tokens, sort_keys=True, indent=4))
async def execute(self, job: Job) -> CommandOutput:
    connector = self.step.get_connector()
    # Transfer the executor file to the remote resource
    executor_path = await self._transfer_file(job, os.path.join(executor.__file__))
    # Modify code, environment and namespaces according to inputs
    input_names = {}
    environment = {}
    for token in job.inputs:
        if token.value is not None:
            command_token = self.input_tokens[token.name]
            token_value = ([token.value]
                           if isinstance(self.step.input_ports[token.name], ScatterInputPort)
                           else token.value)
            if command_token.token_type == 'file':
                input_names[token.name] = token_value
            elif command_token.token_type == 'name':
                input_names[token.name] = token_value
            elif command_token.token_type == 'env':
                environment[token.name] = token_value
    # List output names to be retrieved from the remote context
    output_names = [name for name, p in self.step.output_ports.items()
                    if name != executor.CELL_OUTPUT]
    # Serialize AST nodes to the remote resource
    code_path = await self._serialize_to_remote_file(job, self.ast_nodes)
    # Configure output file path
    path_processor = get_path_processor(self.step)
    output_path = path_processor.join(job.output_directory, random_name())
    # Extract serializers from command tokens
    input_serializers = {
        k: v.serializer for k, v in self.input_tokens.items() if v.serializer is not None}
    output_serializers = {
        k: v.serializer for k, v in self.output_tokens.items() if v.serializer is not None}
    # Serialize namespaces to the remote resource
    user_ns_path = await self._serialize_namespace(
        input_serializers=input_serializers,
        job=job,
        namespace=input_names)
    # Create dictionaries of postload input serializers and predump output serializers
    postload_input_serializers = {
        k: {'postload': v['postload']} for k, v in input_serializers.items() if 'postload' in v}
    predump_output_serializers = {
        k: {'predump': v['predump']} for k, v in output_serializers.items() if 'predump' in v}
    # Parse command
    cmd = [self.interpreter, executor_path]
    if os.path.basename(self.interpreter) == 'ipython':
        cmd.append('--')
    if self.step.workdir:
        cmd.extend(["--workdir", self.step.workdir])
    if self.autoawait:
        cmd.append("--autoawait")
    cmd.extend(["--local-ns-file", user_ns_path])
    if postload_input_serializers:
        postload_serializers_path = await self._serialize_to_remote_file(job, postload_input_serializers)
        cmd.extend(["--postload-input-serializers", postload_serializers_path])
    if predump_output_serializers:
        predump_serializers_path = await self._serialize_to_remote_file(job, predump_output_serializers)
        cmd.extend(["--predump-output-serializers", predump_serializers_path])
    for name in output_names:
        cmd.extend(["--output-name", name])
    cmd.extend([code_path, output_path])
    # Execute command
    if connector is not None:
        resources = job.get_resources()
        logger.info('Executing job {job} on resource {resource} into directory {outdir}:\n{command}'.format(
            job=job.name,
            resource=resources[0] if resources else None,
            outdir=job.output_directory,
            command=' \\\n\t'.join(cmd)))
        # If the step is assigned to multiple resources, add the STREAMFLOW_HOSTS environment variable
        if len(resources) > 1:
            available_resources = await connector.get_available_resources(self.step.target.service)
            hosts = {k: v.hostname for k, v in available_resources.items() if k in resources}
            environment['STREAMFLOW_HOSTS'] = ','.join(hosts.values())
        # Configure standard streams
        stdin = self.stdin
        stdout = self.stdout if self.stdout is not None else STDOUT
        stderr = self.stderr if self.stderr is not None else stdout
        # Execute command
        result, exit_code = await connector.run(
            resources[0] if resources else None,
            cmd,
            environment=environment,
            workdir=job.output_directory,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr,
            capture_output=True,
            job_name=job.name)
    else:
        logger.info('Executing job {job} into directory {outdir}:\n{command}'.format(
            job=job.name,
            outdir=job.output_directory,
            command=' \\\n\t'.join(cmd)))
        # Configure standard streams
        stdin = open(self.stdin, "rb") if self.stdin is not None else None
        stdout = open(self.stdout, "wb") if self.stdout is not None else None
        stderr = open(self.stderr, "wb") if self.stderr is not None else None
        # Execute command
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            cwd=job.output_directory,
            env={**os.environ, **environment},
            stdin=stdin,
            stdout=stdout,
            stderr=stderr)
        result, error = await proc.communicate()
        exit_code = proc.returncode
        # Close streams
        if stdin is not None:
            stdin.close()
        if stdout is not None:
            stdout.close()
        if stderr is not None:
            stderr.close()
    # Retrieve outputs
    with TemporaryDirectory() as d:
        dest_path = os.path.join(d, path_processor.basename(output_path))
        await self.step.context.data_manager.transfer_data(
            src=output_path,
            src_job=job,
            dst=dest_path,
            dst_job=None)
        with open(dest_path, mode='r') as f:
            json_output = json.load(f)
    # Infer status
    status = Status[json_output[executor.CELL_STATUS]]
    if status == Status.COMPLETED:
        command_stdout = json_output[executor.CELL_OUTPUT]
        if isinstance(command_stdout, MutableSequence):  # TODO: understand why we obtain a list here
            command_stdout = command_stdout[0]
        user_ns = await self._deserialize_namespace(
            job=job,
            output_serializers=output_serializers,
            remote_path=json_output[executor.CELL_LOCAL_NS])
    else:
        command_stdout = json_output[executor.CELL_OUTPUT]
        user_ns = {}
    # Return the command output object
    return JupyterCommandOutput(value=command_stdout, status=status, user_ns=user_ns)
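# Assumed shape (not taken from the executor module) of the JSON payload that execute()
# above reads back from output_path; only these three keys are accessed, and all concrete
# values here are illustrative only:
_example_json_output = {
    executor.CELL_STATUS: 'COMPLETED',                # looked up as Status[...]
    executor.CELL_OUTPUT: '<captured cell output>',
    executor.CELL_LOCAL_NS: '/remote/path/to/ns',     # hypothetical path, deserialized afterwards
}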
async def run_workflow(self, notebook):
    result = ExecutionResult(None)

    def error_before_exec(val):
        result.error_before_exec = val
        self.last_execution_succeeded = False
        self.last_execution_result = result
        return result

    cells = [self.transform_cell(cell['code']) for cell in notebook['cells']]
    with self.builtin_trap, self.display_trap:
        try:
            # Extract cells code
            jupyter_cells = []
            for cell, metadata in zip(cells, [c.get('metadata', {'step': {}}) for c in notebook['cells']]):
                cell_name = self.compile.cache(cell, self.execution_count, raw_code=cell)
                code_ast = self.compile.ast_parse(cell, filename=cell_name)
                code_ast = self.transform_ast(code_ast)
                to_run = [(node, 'exec') for node in code_ast.body]
                jupyter_cells.append(JupyterCell(
                    name=cell_name,
                    code=to_run,
                    compiler=self.compile,
                    metadata=metadata))
            # Build workflow
            translator = JupyterNotebookTranslator(context=self.context)
            workflow = await translator.translate(JupyterNotebook(
                cells=jupyter_cells,
                autoawait=self.autoawait,
                metadata=notebook.get('metadata')))
            # Inject inputs
            input_injector = BaseJob(
                name=utils.random_name(),
                step=BaseStep(utils.random_name(), self.context),
                inputs=[])
            for step in workflow.steps.values():
                await self._inject_inputs(step=step, job=input_injector)
        except self.custom_exceptions as e:
            etype, value, tb = sys.exc_info()
            self.CustomTB(etype, value, tb)
            return error_before_exec(e)
        except (InputRejected, WorkflowDefinitionException) as e:
            self.showtraceback()
            return error_before_exec(e)
        except IndentationError as e:
            self.showindentationerror()
            return error_before_exec(e)
        except (OverflowError, SyntaxError, ValueError, TypeError, MemoryError) as e:
            self.showsyntaxerror()
            return error_before_exec(e)
        self.displayhook.exec_result = result
        # Execute workflow
        d = tempfile.mkdtemp()
        try:
            with open(os.devnull, 'w') as devnull:
                with redirect_stdout(devnull), redirect_stderr(devnull):
                    await StreamFlowExecutor(
                        context=self.context,
                        workflow=workflow).run(output_dir=d)
            # Print output logs
            output_retriever = utils.random_name()
            d = tempfile.mkdtemp()
            result.result = {}
            for step in workflow.steps.values():
                output = await _get_output(step=step, output_retriever=output_retriever, d=d)
                if output:
                    result.result[step.name] = output
        except:
            if result:
                result.error_before_exec = sys.exc_info()[1]
            self.showtraceback()
        return result
async def _build_token_value(self,
                             job: Job,
                             token_value: Any,
                             load_contents: Optional[bool] = None,
                             load_listing: Optional[LoadListing] = None) -> Any:
    if load_contents is None:
        load_contents = self.load_contents
    if token_value is None:
        return self.default_value
    elif isinstance(token_value, MutableSequence):
        value_tasks = []
        for t in token_value:
            value_tasks.append(asyncio.create_task(
                self._build_token_value(job, t, load_contents, load_listing)))
        return await asyncio.gather(*value_tasks)
    elif (isinstance(token_value, MutableMapping)
          and token_value.get('class', token_value.get('type')) in ['File', 'Directory']):
        step = job.step if job is not None else self.port.step
        # Get filepath
        filepath = get_path_from_token(token_value)
        if filepath is not None:
            # Process secondary files in token value
            sf_map = {}
            if 'secondaryFiles' in token_value:
                sf_tasks = []
                for sf in token_value.get('secondaryFiles', []):
                    sf_path = get_path_from_token(sf)
                    path_processor = get_path_processor(step)
                    if not path_processor.isabs(sf_path):
                        sf_path = path_processor.join(path_processor.dirname(filepath), sf_path)
                    sf_tasks.append(asyncio.create_task(_get_file_token(
                        step=step,
                        job=job,
                        token_class=sf['class'],
                        filepath=sf_path,
                        basename=sf.get('basename'),
                        load_contents=load_contents,
                        load_listing=load_listing or self.load_listing)))
                sf_map = {get_path_from_token(sf): sf for sf in await asyncio.gather(*sf_tasks)}
            # Compute the new token value
            token_value = await _get_file_token(
                step=step,
                job=job,
                token_class=token_value.get('class', token_value.get('type')),
                filepath=filepath,
                basename=token_value.get('basename'),
                load_contents=load_contents,
                load_listing=load_listing or self.load_listing)
            # Compute new secondary files from port specification
            if self.secondary_files:
                context = utils.build_context(job)
                context['self'] = token_value
                sf_tasks, sf_specs = [], []
                for secondary_file in self.secondary_files:
                    # If pattern is an expression, evaluate it and process result
                    if '$(' in secondary_file.pattern or '${' in secondary_file.pattern:
                        sf_value = utils.eval_expression(
                            expression=secondary_file.pattern,
                            context=context,
                            full_js=self.full_js,
                            expression_lib=self.expression_lib)
                        if isinstance(sf_value, MutableSequence):
                            for sf in sf_value:
                                sf_tasks.append(asyncio.create_task(self._process_secondary_file(
                                    job=job,
                                    secondary_file=sf,
                                    token_value=token_value,
                                    from_expression=True,
                                    existing_sf=sf_map,
                                    load_contents=load_contents,
                                    load_listing=load_listing or self.load_listing)))
                                sf_specs.append(secondary_file)
                        else:
                            sf_tasks.append(asyncio.create_task(self._process_secondary_file(
                                job=job,
                                secondary_file=sf_value,
                                token_value=token_value,
                                from_expression=True,
                                existing_sf=sf_map,
                                load_contents=load_contents,
                                load_listing=load_listing or self.load_listing)))
                            sf_specs.append(secondary_file)
                    # Otherwise, simply process the pattern string
                    else:
                        sf_tasks.append(asyncio.create_task(self._process_secondary_file(
                            job=job,
                            secondary_file=secondary_file.pattern,
                            token_value=token_value,
                            from_expression=False,
                            existing_sf=sf_map,
                            load_contents=load_contents,
                            load_listing=load_listing or self.load_listing)))
                        sf_specs.append(secondary_file)
                for sf_value, sf_spec in zip(await asyncio.gather(*sf_tasks), sf_specs):
                    if sf_value is not None:
                        sf_map[get_path_from_token(sf_value)] = sf_value
                    elif sf_spec.required:
                        raise WorkflowExecutionException(
                            "Required secondary file {sf} not found".format(sf=sf_spec.pattern))
            # Add all secondary files to the token
            if sf_map:
                token_value['secondaryFiles'] = list(sf_map.values())
        # If there is only a 'contents' field, create a file on the step's resource and build the token
        elif 'contents' in token_value:
            path_processor = get_path_processor(self.port.step)
            filepath = path_processor.join(
                job.output_directory, token_value.get('basename', random_name()))
            connector = job.step.get_connector()
            resources = (job.get_resources() or [None]) if job is not None else [None]
            await asyncio.gather(*[
                asyncio.create_task(remotepath.write(connector, res, filepath, token_value['contents']))
                for res in resources])
            token_value = await _get_file_token(
                step=step,
                job=job,
                token_class=token_value.get('class', token_value.get('type')),
                filepath=filepath,
                basename=token_value.get('basename'),
                load_contents=load_contents,
                load_listing=load_listing or self.load_listing)
    return token_value
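# Assumed shape (a CWL-style File object, inferred from the checks above) of a token
# value handled by _build_token_value; all concrete names and values are illustrative,
# and the path key is resolved through get_path_from_token rather than read directly:
_example_file_token_value = {
    'class': 'File',                     # the code also accepts a 'type' key
    'path': '/data/sample.txt',          # hypothetical location
    'basename': 'sample.txt',
    'secondaryFiles': [                  # optional; each entry is itself a File object
        {'class': 'File', 'path': '/data/sample.idx'},
    ],
}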