Exemplo n.º 1
0
    def get_task(self, tid=None, key=None):
        """
        Return a task by task ID (graph nid) or task name (key).

        :param tid:       nid of task to return
        :type tid:        :py:int
        :param key:       task name
        :type key:        :py:str

        :return:          task object
        :rtype:           :graphit:Graph

        :raises:          WorkflowError, if neither tid nor key is given, the
                          task is not found, or the node is not a task object
        """

        # Select by nid when given, otherwise fall back to a name query
        if tid:
            selection = self.workflow.getnodes(tid)
        elif key:
            selection = self.workflow.query_nodes(key=key)
        else:
            raise WorkflowError('Search on task ID (tid) or task name (key). None defined')

        # The selection must be a single, existing node of 'task' format
        if selection.empty():
            raise WorkflowError('Task with tid {0} not in workflow'.format(tid))
        if selection.get('format') != 'task':
            raise WorkflowError('Node with tid {0} is no task object'.format(tid))

        return selection
Exemplo n.º 2
0
    def connect_task(self, task1, task2, *args, **kwargs):
        """
        Connect tasks by task ID (graph nid)

        Creates the directed graph edge connecting two tasks (nodes) together.
        An edge also defines which parameters in the output of one task serve
        as input for another task and how they are named.

        Parameter selection is defined by all additional arguments and keyword
        arguments provided to `connect_task`. Keyword arguments also define the
        name translation of the argument.

        :param task1:         first task of two tasks to connect
        :type task1:          :py:int
        :param task2:         second task of two tasks to connect
        :type task2:          :py:int

        :return:              edge identifier
        :rtype:               :py:tuple

        :raises:              WorkflowError, if a task is missing, not of
                              'task' format or when connecting a task to itself
        """

        # Both endpoints must exist in the graph and be task nodes
        for nid in (task1, task2):
            if nid not in self.workflow.nodes:
                raise WorkflowError('Task {0} not in workflow'.format(nid))
            if self.workflow.nodes[nid].get('format') != 'task':
                raise WorkflowError(
                    'Node {0} not of format "task"'.format(nid))

        # Self connections would create a cycle
        if task1 == task2:
            raise WorkflowError('Connection to self not allowed')

        edge_attributes = {'key': u'task_link'}

        # Keyword arguments define output -> input name translation
        mapping = prepaire_data_dict(kwargs)
        if mapping:
            edge_attributes['data_mapping'] = mapping

        # Positional arguments define which output parameters are selected
        if args:
            selection = [to_unicode(argument) for argument in args]

            # Mapping keys must also be part of the selection list
            for mapped_key in mapping:
                if mapped_key not in selection:
                    selection.append(mapped_key)
                    logging.debug(
                        'Added {0} data key to data selection list'.format(
                            mapped_key))

            edge_attributes['data_select'] = selection

        return self.workflow.add_edge(task1, task2, **edge_attributes)
Exemplo n.º 3
0
def schema_uri_to_dict(uri, request=True):
    """
    Parse MDStudio WAMP JSON schema URI to dictionary

    The function accepts both the WAMP standard URI as well as the MDStudio
    resource URI. The latter one defines explicitly if the URI describes a
    'resource' or 'endpoint', uses the full request or response endpoint
    schema name as stored in the database (e.a. <endpoint name>_<request or
    response>) and defines the schema version.

    The WAMP URI style will always default to version 1 of the schema and
    uses the `request` argument to switch between retrieving the 'request' or
    'response' schema for the endpoint

    MDStudio resource URI syntax:
        <resource or endpoint>://<context>/<component>/<endpoint>/v<version ID>'

    WAMP URI syntax:
        <context>.<component>.<endpoint or resource>.<name>

    :param uri:     MDStudio WAMP JSON Schema URI
    :type uri:      :py:str
    :param request: return the request schema for a WAMP style URI else return
                    the response schema
    :type request:  :py:bool

    :return:        parsed JSON schema URI
    :rtype:         :py:dict

    :raises:        WorkflowError, if the URI does not split into the expected
                    number of parts for its style
    """

    parts = re.split(urisplitter, uri)

    # The '//' scheme separator marks an MDStudio resource style URI
    if '//' in uri:
        if len(parts) != 5:
            raise WorkflowError('Invalid MDStudio schema uri: {0}'.format(uri))
        parsed = dict(zip(mdstudio_urischema[:4], parts[:4]))
        # Last part carries the version as 'v<number>'
        parsed[u'version'] = int(parts[-1].strip(u'v'))

    # Otherwise treat as a dot-separated WAMP style URI
    else:
        if len(parts) != 4:
            raise WorkflowError('Invalid WAMP schema uri: {0}'.format(uri))
        parsed = dict(zip(wamp_urischema, parts))
        suffix = u'request' if request else u'response'
        parsed[u'name'] = u'{0}_{1}'.format(parsed[u'name'], suffix)
        parsed[u'version'] = 1

    return parsed
Exemplo n.º 4
0
    def set_output(self, output, **kwargs):
        """
        Set the output of the task.

        If the task is configured to store output to disk (store_output == True)
        the dictionary with output data is serialized to JSON and stored in the
        task directory. A JSON schema $ref directive is added to the project file
        to enable reloading of the output data.

        :param output:  task output data to set
        :type output:   :py:dict
        :param kwargs:  additional output data as keyword arguments
        :type kwargs:   :py:dict

        :raises:        WorkflowError, output should be of type 'dict',
                        task directory should exist if store_output
        """

        # Output should be a dictionary for now
        if not isinstance(output, dict):
            raise WorkflowError(
                'Task {0} ({1}). Output should be a dictionary, got {2}'.
                format(self.nid, self.key, type(output)))

        # Update with any keyword arguments
        output.update(kwargs)

        # Store to file or not
        if self.task_metadata.store_output():
            project_dir = self.origin.query_nodes(
                key='project_metadata').project_dir()
            task_dir = self.task_metadata.workdir.get()
            if task_dir and os.path.exists(task_dir):

                # Check for file paths, copy data to workdir
                output = collect_data(output, task_dir)

                # Use a context manager so the file handle is always closed;
                # the original passed a bare `open()` to json.dump and leaked
                # the handle (and buffered data on serialization failure).
                output_json = os.path.join(task_dir, 'output.json')
                with open(output_json, 'w') as json_file:
                    json.dump(output, json_file, indent=2)

                # Replace in-memory output by a JSON Schema $ref to the file,
                # relative to the project directory
                output = {'$ref': os.path.relpath(output_json, project_dir)}
            else:
                raise WorkflowError(
                    'Task directory does not exist: {0}'.format(task_dir))

        # Only set output once; never overwrite previously stored output
        outnode = self.task_metadata.output_data
        if outnode.get() is None:
            outnode.set(self.data.value_tag, output)
Exemplo n.º 5
0
    def __init__(self, workflow=None, **kwargs):
        """
        Init the workflow specification

        If no workflow provided init a default empty one.
        Additional keyword parameters are used to update the workflow project
        metadata.

        :param workflow: workflow specification
        :type workflow:  :graphit:GraphAxis
        :param kwargs:   additional keyword arguments used to update project
                         metadata
        :type kwargs:    :py:dict

        :raises:         WorkflowError, if 'workflow' not valid
        """

        self.workflow = workflow

        # No specification provided, build a default empty workflow
        if self.workflow is None:
            self.new()

        # Whatever we ended up with must be a GraphAxis instance
        if not isinstance(self.workflow, GraphAxis):
            raise WorkflowError(
                'Not a valid workflow {0}'.format(self.workflow))

        # Update project metadata
        if kwargs:
            metadata = self.workflow.query_nodes(
                key='project_metadata').descendants()
            if not metadata.empty():
                metadata.update(kwargs)
Exemplo n.º 6
0
    def __init__(self, session):
        """
        :param session: MDStudio WAMP session required to make WAMP calls.
        :type session:  :mdstudio:component:session:ComponentSession

        :raises:        WorkflowError, if 'vendor' is missing from the
                        component's static configuration
        """

        self.session = session
        self.schema_endpoint = u'mdstudio.schema.endpoint.get'

        # Vendor name is read from the component's static configuration
        self.vendor = self.session.component_config.static.get('vendor')
        if self.vendor is None:
            raise WorkflowError('MDStudio static.vendor not defined. "settings.yml" file may be missing')

        # Cache schema's to limit calls
        self._schema_cache = {}
Exemplo n.º 7
0
    def add_task(self, task_name, task_type='PythonTask', **kwargs):
        """
        Add a new task to the workflow from the set of supported workflow
        task types defined in the workflow ORM.

        The 'new' method of each task type is called at first creation to
        construct the task data object in the graph.
        Additional keyword arguments provided to the 'add_task' method will
        used to update the task data

        :param task_name: administrative name of the task
        :type task_name:  :py:str
        :param task_type: task type to add
        :type task_type:  :py:str
        :param kwargs:    additional keyword arguments passed to the task
                          init_task method.
        :type kwargs:     :py:dict

        :return:          Task object
        :rtype:           :graphit:GraphAxis

        :raises:          WorkflowError, unsupported 'task_type'
        """

        # Only task types registered in the ORM are supported
        if task_type not in task_types:
            raise WorkflowError(
                'Task type "{0}" not supported, requires: {1}'.format(
                    task_type, ', '.join(task_types)))

        # Register the task as a graph node; run_node_new triggers the task
        # type 'new' method for initial task initiation.
        new_nid = self.workflow.add_node(task_name,
                                         run_node_new=True,
                                         task_type=task_type,
                                         format='task')

        # Apply caller supplied metadata to the freshly created task
        new_task = self.workflow.getnodes(new_nid)
        new_task.descendants().update(kwargs)

        # The very first task added becomes the workflow root
        if len(self.workflow.query_nodes(format='task')) == 1:
            self.workflow.root = new_nid

        return new_task
Exemplo n.º 8
0
def load_referenced_output(output_dict, base_path=None):
    """
    Resolve referred output

    Referred output is defined in the output dictionary using the JSON Schema
    '$ref' keyword that points to a json file on disk. The file content is
    loaded and merged into the dictionary in place of the '$ref' entry.
    Nested dictionaries are resolved recursively.

    :param output_dict: output dict to be updated (modified in place)
    :type output_dict:  :py:dict
    :param base_path:   base project path used to resolve relative '$ref'
                        paths
    :type base_path:    :py:str

    :return:            updated output dict
    :rtype:             :py:dict

    :raises:            WorkflowError, if a referenced file does not exist
    """

    # Iterate over a snapshot of the keys; the dict is mutated while looping
    for key in list(output_dict.keys()):

        value = output_dict[key]
        if key == '$ref':

            # Resolve relative reference paths against the project base path
            if not os.path.isabs(value):
                value = os.path.join(base_path, value)

            if os.path.exists(value):
                # Context manager guarantees the file handle is closed;
                # the original used a bare `open()` that was never closed.
                with open(value) as reference_file:
                    json_parsed = json.load(reference_file)
                if isinstance(json_parsed, dict):
                    output_dict.update(json_parsed)
                del output_dict[key]
            else:
                raise WorkflowError(
                    'No such references output file: {0}'.format(value))

        elif isinstance(value, dict):
            output_dict[key] = load_referenced_output(value,
                                                      base_path=base_path)

    return output_dict
Exemplo n.º 9
0
    def load(self, workflow):
        """
        Load workflow specification

        Initiate a workflow from a workflow specification or instance thereof.

        :param workflow: File path to predefined workflow in .jgf format
        :type workflow:  :py:str

        :raises:         WorkflowError, if the loaded workflow has no root
                         node defined
        """

        # Construct a workflow GraphAxis object
        self.workflow = read_jgf(workflow)
        self.workflow.node_tools = NodeAxisTools
        self.workflow.orm = WORKFLOW_ORM

        # Bug fix: the original constructed the WorkflowError but never
        # raised it, silently accepting a workflow without a root node.
        if self.workflow.root is None:
            raise WorkflowError('Workflow does not have a root node defined')

        # Get metadata
        metadata = self.workflow.query_nodes(key='project_metadata')
        logging.info('Load workflow "{0}"'.format(metadata.title.get()))
        logging.info('Created: {0}, updated: {1}'.format(
            metadata.create_time.get(), metadata.update_time.get()))
        logging.info('Description: {0}'.format(metadata.description.get()))
Exemplo n.º 10
0
    def run(self, project_dir="./md_workflow", tid=None, validate=True):
        """
        Run a workflow specification

        Runs the workflow until finished, failed or a breakpoint is reached.
        A workflow is a rooted Directed Acyclic Graph (DAG) that is started
        from the root node. It can be started from any node relative to the
        root as long as its parent(s) are successfully completed.

        The workflow will be executed on a different thread allowing for
        interactivity with the workflow instance while the workflow is
        running.

        By default, the workflow specification will be validated using the
        `validate` method of the WorkflowSpec class.

        :param tid:         start the workflow from task ID
        :type tid:          :py:int
        :param validate:    Validate the workflow before running it
        :type validate:     :py:bool
        :param project_dir: directory to store task output
        :type project_dir:  :py:str

        :raises:            WorkflowError, if the start task does not exist,
                            workflow validation fails or the project
                            directory of a finished project already exists
        """

        # Empty workflow, return
        if self.workflow.empty() or not len(self.workflow.query_nodes(format='task')):
            logging.info('Workflow contains no tasks')
            return

        # Start from workflow root by default
        tid = tid or self.workflow.root

        # Check if tid exists
        if tid not in self.workflow.nodes:
            raise WorkflowError('Task with tid {0} not in workflow'.format(tid))

        # Validate workflow before running?
        if validate:
            if not validate_workflow(self.workflow):
                raise WorkflowError('Workflow validation failed')

        # If there are steps that store results locally (store_output == True)
        # Create a project directory.
        self.project_metadata = self.workflow.query_nodes(key='project_metadata')
        if any(self.workflow.query_nodes(key="store_output").values()):
            # Keep a previously configured project_dir if set, else use the
            # `project_dir` argument as default
            self.project_metadata.project_dir.set(self.workflow.data.value_tag,
                                                  self.project_metadata.project_dir.get(default=project_dir))
            # Refuse to overwrite the output of an already finished project
            if self.project_metadata.project_dir.exists and self.is_completed:
                raise WorkflowError('Directory for finished project exists: {0}'.format(
                    self.project_metadata.project_dir()))
            self.project_metadata.project_dir.makedirs()
        else:
            # No task stores output, explicitly unset the project directory
            self.project_metadata.project_dir.set(self.workflow.data.value_tag, None)

        logging.info('Running workflow: {0}, start task ID: {1}'.format(self.project_metadata.title(), tid))

        # Set is_running flag. Function as a thread-safe signal to indicate
        # that the workflow is running.
        if self.is_running:
            logging.warning('Workflow {0} is already running'.format(self.project_metadata.title()))
            return
        self.is_running = True

        # Set workflow start time if not defined. Don't rerun to allow
        # continuation of unfinished workflow.
        if not self.project_metadata.start_time():
            self.project_metadata.start_time.set()

        # Spawn a workflow thread. Daemon mode so a hanging workflow thread
        # does not prevent interpreter exit.
        self.workflow_thread = threading.Thread(target=self.run_task, args=[tid])
        self.workflow_thread.daemon = True
        self.workflow_thread.start()