Example #1
    def copy(self, target: StorageVolume) -> List[str]:
        """Copy the file object to the target volume.

        Returns
        -------
        list of string
        """
        target.store(file=self._source, dst=self._target)
        return [self._target]
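The method simply hands the source file to the target volume's store() and reports the key it wrote. A minimal sketch of that contract, using toy in-memory stand-ins that are not part of flowserv:

class DictVolume:
    """Toy stand-in for a StorageVolume that keeps objects in a dict."""

    def __init__(self):
        self.objects = dict()

    def store(self, file, dst):
        self.objects[dst] = file


class FileHandle:
    """Mirrors the copy() contract shown above."""

    def __init__(self, source, target):
        self._source = source
        self._target = target

    def copy(self, target):
        target.store(file=self._source, dst=self._target)
        return [self._target]


volume = DictVolume()
assert FileHandle(b'hello', 'data/names.txt').copy(volume) == ['data/names.txt']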
Example #2
    def prepare(self, store: StorageVolume, inputs: List[str],
                outputs: List[str]):
        """Prepare the storage volume for a worker.

        Ensures that the input files that are needed by the worker are available
        in their latest version on the given storage volume.

        Raises a ValueError if a specified input file does not exist.

        Parameters
        ----------
        store: flowserv.volume.base.StorageVolume
            Storage volume that is being prepared.
        inputs: list of string
            Relative paths (keys) of required input files for a workflow step.
        outputs: list of string
            Relative paths (keys) of output files created by a workflow step.
        """
        # Generate a dictionary that maps each file matching the given query
        # list to the list of storage volumes that the file is available at.
        # This search is quadratic in the number of query files and files in
        # the workflow context, on the assumption that neither list (or at
        # least the query list) contains a very large number of elements.
        required_files = dict()
        for q in inputs:
            # The comparison depends on whether the specified file name ends
            # with a '/' (indicating that a directory is referenced) or not.
            is_match = prefix_match if q.endswith('/') else exact_match
            for f, fstores in self.files.items():
                if f not in required_files and is_match(f, q):
                    required_files[f] = fstores
        # Copy required files that are currently not available to the worker.
        for f, fstores in required_files.items():
            # Check if the file is available at the target store.
            if store.identifier in fstores:
                continue
            # If the file is not available at the target volume we need to
            # upload it.
            source = self.get(fstores[0])
            # Upload file from the source storage volume to the target
            # volume.
            for key in source.copy(src=f, store=store):
                self.files[key].append(store.identifier)
        # Create folders for output files.
        out_folders = set()
        for file in outputs:
            parent = file if file.endswith('/') else util.join(
                *file.split('/')[:-1])
            out_folders.add(parent)
        for dirname in out_folders:
            store.mkdir(path=dirname)
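The exact_match/prefix_match predicates are imported from elsewhere in flowserv; a minimal sketch of the matching behavior that prepare() relies on (the bodies below are assumptions, not the library's code):

def exact_match(key: str, query: str) -> bool:
    # A plain file query matches only the file with exactly that key.
    return key == query


def prefix_match(key: str, query: str) -> bool:
    # A query ending in '/' references a directory and matches all keys below it.
    return key.startswith(query)


assert exact_match('data/names.txt', 'data/names.txt')
assert prefix_match('data/names.txt', 'data/')
assert not prefix_match('database.txt', 'data/')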
Example #3
def write_results(runstore: StorageVolume, files: List[Tuple[Union[dict, list], str, str]]):
    """Create result files for a workflow run.

    Parameters
    ----------
    runstore: flowserv.volume.base.StorageVolume
        Storage volume for the run (result) files of a successful workflow run.
    files: list
        List of 3-tuples containing the file data, format, and relative path.
    """
    for data, format, rel_path in files:
        runstore.store(file=io_file(data=data, format=format), dst=rel_path)
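A hypothetical call, storing one JSON document and one text file for a run (runstore stands for any run storage volume, and the format tokens are assumptions about what io_file accepts):

write_results(
    runstore=runstore,  # hypothetical StorageVolume for the run
    files=[
        ({'avg_count': 3.4}, 'json', 'results/analytics.json'),
        (['alice', 'bob'], 'txt/plain', 'results/names.txt'),
    ],
)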
Example #4
def volume_manager(specs: List[Dict], runstore: StorageVolume,
                   runfiles: List[str]) -> VolumeManager:
    """Create an instance of the storage volume manager for a workflow run.

    Combines the volume store specifications in the workflow run configuration
    with the storage volume for the workflow run files.

    Parameters
    ----------
    specs: list of dict
        List of specifications (dictionary serializations) for storage volumes.
    runstore: flowserv.volume.base.StorageVolume
        Storage volume for run files.
    runfiles: list of string
        List of files that have been copied to the run store.

    Returns
    -------
    flowserv.volume.manager.VolumeManager
    """
    stores = [runstore.to_dict()]
    files = defaultdict(list)
    for f in runfiles:
        files[f].append(DEFAULT_STORE)
    for doc in specs:
        # Ignore stores that match the identifier of the runstore to avoid
        # overriding the run store information.
        if doc['id'] == runstore.identifier:
            continue
        stores.append(doc)
        for f in doc.get('files', []):
            files[f].append(doc['id'])
    return VolumeManager(stores=stores, files=files)
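The shape of a volume specification, inferred from the keys the function reads ('id' and the optional 'files' list; any further fields are volume-type specific and not shown in this snippet, and the values below are invented):

spec = {
    'id': 's3store',                # must differ from runstore.identifier
    'files': ['data/names.txt'],    # keys already present on that volume
}
manager = volume_manager(
    specs=[spec],
    runstore=runstore,              # hypothetical run store
    runfiles=['code/helper.py'],    # files copied to the run store
)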
Example #5
    def create_workflow(
        self, run: RunObject, template: WorkflowTemplate, arguments: Dict,
        staticfs: StorageVolume
    ) -> RemoteWorkflowHandle:
        """Create a new instance of a workflow from the given workflow
        template and user-provided arguments.

        Parameters
        ----------
        run: flowserv.model.base.RunObject
            Handle for the run that is being executed.
        template: flowserv.model.template.base.WorkflowTemplate
            Workflow template containing the parameterized specification and
            the parameter declarations.
        arguments: dict
            Dictionary of argument values for parameters in the template.
        staticfs: flowserv.volume.base.StorageVolume
            Storage volume that contains the static files from the workflow
            template.

        Returns
        -------
        flowserv.controller.remote.client.RemoteWorkflowHandle
        """
        # Create a serial workflow to have a workflow handle.
        return RemoteWorkflowHandle(
            run_id=run.run_id,
            workflow_id=run.run_id,
            state=self.state,
            output_files=[],
            runstore=staticfs.get_store_for_folder(util.join('runs', run.run_id)),
            client=self
        )
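The run store is anchored at a per-run folder below the static volume; assuming util.join is a plain '/'-join, the resulting layout is easy to see:

def join(*parts: str) -> str:
    # Minimal stand-in for flowserv's util.join (assumed to join with '/').
    return '/'.join(parts)


# Run files for run '42a1' would live under 'runs/42a1' of staticfs.
assert join('runs', '42a1') == 'runs/42a1'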
Example #6
def read_run_results(run: RunObject, schema: ResultSchema,
                     runstore: StorageVolume):
    """Read the run results from the result file that is specified in the workflow
    result schema. If the file is not found we currently do not raise an error.

    Parameters
    ----------
    run: flowserv.model.base.RunObject
        Handle for a workflow run.
    schema: flowserv.model.template.schema.ResultSchema
        Workflow result schema specification that contains the reference to the
        result file key.
    runstore: flowserv.volume.base.StorageVolume
        Storage volume containing the run (result) files for a successful
        workflow run.
    """
    with runstore.load(schema.result_file).open() as f:
        results = util.read_object(f)
    # Create a dictionary of result values.
    values = dict()
    for col in schema.columns:
        val = util.jquery(doc=results, path=col.jpath())
        col_id = col.column_id
        if val is None and col.required:
            msg = "missing value for '{}'".format(col_id)
            raise err.ConstraintViolationError(msg)
        elif val is not None:
            values[col_id] = col.cast(val)
    run.result = values
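util.jquery navigates the loaded results document along the column's JSON path; a minimal stand-in, assuming a list-of-keys path and None for missing entries:

from typing import List, Optional


def jquery(doc: dict, path: List[str]) -> Optional[object]:
    # Follow the path through nested dictionaries; None if any key is absent.
    for key in path:
        if not isinstance(doc, dict) or key not in doc:
            return None
        doc = doc[key]
    return doc


assert jquery({'metrics': {'f1': 0.87}}, ['metrics', 'f1']) == 0.87
assert jquery({'metrics': {}}, ['metrics', 'f1']) is None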
Example #7
def prepare_postproc_data(input_files: List[str], ranking: List[RunResult],
                          run_manager: RunManager, store: StorageVolume):
    """Create input files for post-processing steps for a given set of runs.

    Creates files for a post-processing run in a given base directory on a
    storage volume. The resulting directory contains files for each run in a
    given ranking. For each run a sub-folder with the run identifier as the
    directory name is created. Each folder contains copies of result files for
    the run for those files that are specified in the input files list. A file
    ``runs.json`` in the base directory lists the runs in the ranking together
    with their group name.

    Parameters
    ----------
    input_files: list(string)
        List of identifiers for benchmark run output files that are copied into
        the input directory for each submission.
    ranking: list(flowserv.model.ranking.RunResult)
        List of runs in the current result ranking.
    run_manager: flowserv.model.run.RunManager
        Manager for workflow runs.
    store: flowserv.volume.base.StorageVolume
        Target storage volume where the created post-processing files are
        stored.
    """
    # Collect information about runs and their result files.
    runs = list()
    for entry in ranking:
        run_id = entry.run_id
        group_name = entry.group_name
        # Create a sub-folder for the run in the output directory. Then copy
        # all given files into the created directory.
        rundir = run_id
        for key in input_files:
            # Copy run file to target file.
            file = run_manager.get_runfile(run_id=run_id, key=key)
            dst = util.join(rundir, key)
            store.store(file=file, dst=dst)
        runs.append({
            LABEL_ID: run_id,
            LABEL_NAME: group_name,
            LABEL_FILES: input_files
        })
    store.store(file=io_file(runs), dst=RUNS_FILE)
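For a ranking with two runs and input_files=['results/analytics.json'], the prepared volume would contain one sub-folder per run plus the run listing. The identifiers and label values below are invented; the actual LABEL_* constants are defined elsewhere in flowserv:

# Resulting keys on the target volume:
#   run-001/results/analytics.json
#   run-002/results/analytics.json
#   runs.json
#
# Plausible content of runs.json:
runs_json = [
    {'id': 'run-001', 'name': 'team-alpha', 'files': ['results/analytics.json']},
    {'id': 'run-002', 'name': 'team-beta', 'files': ['results/analytics.json']},
]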
Example #8
def store_run_files(run: RunObject, files: List[str], source: StorageVolume,
                    target: StorageVolume) -> List[RunFile]:
    """Create list of output files for a successful run. The list of files
    depends on whether files are specified in the workflow specification or not.
    If files are specified only those files are included in the returned lists.
    Otherwise, all result files that are listed in the run state are returned.

    Parameters
    ----------
    run: flowserv.model.base.RunObject
        Handle for a workflow run.
    files: list of string
        List of result files for a successful workflow run.
    source: flowserv.volume.base.StorageVolume
        Storage volume containing the run (result) files for a successful
        workflow run.
    target: flowserv.volume.base.StorageVolume
        Storage volume for persisting run result files.

    Returns
    -------
    list of RunFile
    """
    outputs = run.outputs()
    if outputs:
        # List only existing files for output specifications in the
        # workflow handle. Note that (i) the result of run.outputs() is
        # always a dictionary and (ii) the keys in the returned
        # dictionary are not necessarily equal to the file sources.
        files = [f.source for f in outputs.values()]
    # Copy files to the target volume.
    runfiles = list()
    for key in files:
        f = source.load(key)
        target.store(file=f, dst=key)
        mime_type, _ = mimetypes.guess_type(url=key)
        runfile = RunFile(key=key,
                          name=key,
                          mime_type=mime_type,
                          size=f.size())
        runfiles.append(runfile)
    return runfiles
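The MIME type recorded for each RunFile is guessed purely from the file key, so unknown suffixes yield None; the standard library behaves as follows:

import mimetypes

assert mimetypes.guess_type(url='results/analytics.json')[0] == 'application/json'
assert mimetypes.guess_type(url='results/data.customext')[0] is None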
Example #9
def run_postproc_workflow(workflow: WorkflowObject, ranking: List[RunResult],
                          keys: List[str], run_manager: RunManager,
                          tmpstore: StorageVolume, staticfs: StorageVolume,
                          backend: WorkflowController):
    """Run post-processing workflow for a workflow template.

    Parameters
    ----------
    workflow: flowserv.model.base.WorkflowObject
        Handle for the workflow that triggered the post-processing workflow run.
    ranking: list(flowserv.model.ranking.RunResult)
        List of runs in the current result ranking.
    keys: list of string
        Sorted list of run identifiers for runs in the ranking.
    run_manager: flowserv.model.run.RunManager
        Manager for workflow runs
    tmpstore: flowserv.volume.base.StorageVolume
        Temporary storage volume where the created post-processing files are
        stored. This volume will be erased after the workflow is started.
    staticfs: flowserv.volume.base.StorageVolume
        Storage volume that contains the static files from the workflow
        template.
    backend: flowserv.controller.base.WorkflowController
        Backend that is used to execute the post-processing workflow.
    """
    # Get workflow specification and the list of input files from the
    # post-processing statement.
    postproc_spec = workflow.postproc_spec
    workflow_spec = postproc_spec.get('workflow')
    pp_inputs = postproc_spec.get('inputs', {})
    pp_files = pp_inputs.get('files', [])
    # Prepare the temporary directory with result files for all runs in the
    # ranking. The created directory is the only run argument.
    strace = None
    try:
        prepare_postproc_data(input_files=pp_files,
                              ranking=ranking,
                              run_manager=run_manager,
                              store=tmpstore)
        dst = pp_inputs.get('runs', RUNS_DIR)
        run_args = {PARA_RUNS: InputDirectory(store=tmpstore, target=RUNS_DIR)}
        arg_list = [serialize_arg(PARA_RUNS, dst)]
    except Exception as ex:
        logging.error(ex, exc_info=True)
        strace = util.stacktrace(ex)
        run_args = dict()
        arg_list = []
    # Create a new run for the workflow. The identifier for the run group is
    # None.
    run = run_manager.create_run(workflow=workflow,
                                 arguments=arg_list,
                                 runs=keys)
    if strace is not None:
        # If there were data preparation errors set the created run into an
        # error state and return.
        run_manager.update_run(run_id=run.run_id,
                               state=run.state().error(messages=strace))
    else:
        # Execute the post-processing workflow asynchronously if
        # there were no data preparation errors.
        try:
            postproc_state, runstore = backend.exec_workflow(
                run=run,
                template=WorkflowTemplate(workflow_spec=workflow_spec,
                                          parameters=PARAMETERS),
                arguments=run_args,
                staticfs=staticfs,
                config=workflow.engine_config)
        except Exception as ex:
            # Make sure to catch exceptions and set the run into an error state.
            postproc_state = run.state().error(messages=util.stacktrace(ex))
            runstore = None
        # Update the post-processing workflow run state if it is
        # no longer pending for execution.
        if not postproc_state.is_pending():
            run_manager.update_run(run_id=run.run_id,
                                   state=postproc_state,
                                   runstore=runstore)
        # Erase the temporary storage volume.
        tmpstore.erase()
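serialize_arg turns the single run argument into a document that can be persisted with the run; a plausible stand-in (the exact serialization lives in flowserv and may differ):

def serialize_arg(name: str, value) -> dict:
    # Assumed name/value document for a persisted run argument.
    return {'name': name, 'value': value}


assert serialize_arg('runs', 'runs/') == {'name': 'runs', 'value': 'runs/'}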
Example #10
    def exec_workflow(
            self,
            run: RunObject,
            template: WorkflowTemplate,
            arguments: Dict,
            staticfs: StorageVolume,
            config: Optional[Dict] = None
    ) -> Tuple[WorkflowState, StorageVolume]:
        """Initiate the execution of a given workflow template for a set of
        argument values. This will start a new process that executes a serial
        workflow asynchronously.

        The serial workflow engine executes workflows on the local machine and
        therefore uses the file system to store temporary run files. The path
        to the run folder is returned as the second value in the result tuple.
        The first value in the result tuple is the state of the workflow after
        the process is started. If the workflow is executed asynchronously the
        state will be RUNNING. Otherwise, the run state should be an inactive
        state.

        The set of arguments is not further validated. It is assumed that the
        validation has been performed by the calling code (e.g., the run
        service manager).

        The optional configuration object can be used to override the worker
        configuration that was provided at object instantiation. Expects a
        dictionary with an element `workers` that contains a mapping of container
        identifier to a container worker configuration object.

        If the state of the run handle is not pending, an error is raised.

        Parameters
        ----------
        run: flowserv.model.base.RunObject
            Handle for the run that is being executed.
        template: flowserv.model.template.base.WorkflowTemplate
            Workflow template containing the parameterized specification and
            the parameter declarations.
        arguments: dict
            Dictionary of argument values for parameters in the template.
        staticfs: flowserv.volume.base.StorageVolume
            Storage volume that contains the static files from the workflow
            template.
        config: dict, default=None
            Optional object to overwrite the worker configuration settings.

        Returns
        -------
        flowserv.model.workflow.state.WorkflowState, flowserv.volume.base.StorageVolume
        """
        # Get the run state. Raise an error if the run is not in pending state.
        if not run.is_pending():
            raise RuntimeError("invalid run state '{}'".format(run.state))
        state = run.state()
        # Create configuration dictionary that merges the engine global
        # configuration with the workflow-specific one. Copy the global
        # configuration so that the update does not modify it in place.
        run_config = dict(self.config) if self.config is not None else dict()
        if config:
            run_config.update(config)
        # Get the list of workflow steps, run arguments, and the list of output
        # files that the workflow is expected to generate.
        steps, run_args, outputs = parser.parse_template(template=template,
                                                         arguments=arguments)
        # Create and prepare storage volume for run files.
        runstore = self.fs.get_store_for_folder(
            key=util.join(self.runsdir, run.run_id),
            identifier=DEFAULT_STORE
        )
        try:
            # Copy template files to the run folder.
            files = staticfs.copy(src=None, store=runstore)
            # Store any given file arguments and additional input files
            # that are required by actor parameters into the run folder.
            for key, para in template.parameters.items():
                if para.is_file() and key in arguments:
                    # Use a separate variable to avoid shadowing the parameter
                    # key while iterating over the copied file keys.
                    for dst in arguments[key].copy(target=runstore):
                        files.append(dst)
                elif para.is_actor() and key in arguments:
                    input_files = arguments[key].files
                    for f in input_files if input_files else []:
                        for dst in f.copy(target=runstore):
                            files.append(dst)
            # Create factory objects for storage volumes.
            volumes = volume_manager(specs=run_config.get('volumes', []),
                                     runstore=runstore,
                                     runfiles=files)
            # Create factory for workers. Include mapping of workflow steps to
            # the workers that are responsible for their execution.
            workers = WorkerPool(workers=run_config.get('workers', []),
                                 managers={
                                     doc['step']: doc['worker']
                                     for doc in run_config.get('workflow', [])
                                 })
            # Start a new process to run the workflow. Make sure to catch all
            # exceptions to set the run state properly.
            state = state.start()
            if self.is_async:
                # Run steps asynchronously in a separate process
                pool = Pool(processes=1)
                task_callback_function = partial(callback_function,
                                                 lock=self.lock,
                                                 tasks=self.tasks,
                                                 service=self.service)
                with self.lock:
                    self.tasks[run.run_id] = (pool, state)
                pool.apply_async(run_workflow,
                                 args=(run.run_id, state, outputs, steps,
                                       run_args, volumes, workers),
                                 callback=task_callback_function)
                return state, runstore
            else:
                # Run steps synchronously and block the controller until done
                _, _, state_dict = run_workflow(run_id=run.run_id,
                                                state=state,
                                                output_files=outputs,
                                                steps=steps,
                                                arguments=run_args,
                                                volumes=volumes,
                                                workers=workers)
                return serialize.deserialize_state(state_dict), runstore
        except Exception as ex:
            # Set the workflow run into an ERROR state
            logging.error(ex, exc_info=True)
            return state.error(messages=util.stacktrace(ex)), runstore
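The shape of the optional config argument, inferred from the keys read above ('volumes', 'workers', 'workflow'); the step and worker names are hypothetical:

config = {
    'workers': [],      # worker declarations consumed by WorkerPool
    'workflow': [       # assignment of workflow steps to workers
        {'step': 'preproc', 'worker': 'docker_worker'},
    ],
    'volumes': [],      # storage volume specifications (see volume_manager above)
}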