Example #1
def exec_command(task_id, command, context, processor):
    """The function executes a given task using a package task processor.
    Returns a pair of task identifier and execution result.

    Parameters
    ----------
    task_id: string
        Unique task identifier
    command: vizier.viztrail.command.ModuleCommand
        Specification of the command that is to be executed
    context: vizier.engine.task.base.TaskContext
        Context for the executed task
    processor: vizier.engine.task.processor.TaskProcessor
        Task processor to execute the given command

    Returns
    -------
    (string, vizier.engine.task.processor.ExecResult)
    """
    try:
        result = processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=context
        )
    except Exception as ex:
        outputs = ModuleOutputs().error(ex)
        result = ExecResult(is_success=False, outputs=outputs)
    return task_id, result
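
The error-handling pattern here is worth noting: any exception raised by the processor is converted into a failed ExecResult instead of propagating to the caller. Below is a minimal self-contained sketch of the same pattern; the stub classes are simplified stand-ins for the vizier types, not the real API:

class StubOutputs:
    """Stand-in for ModuleOutputs: collects error text on stderr."""
    def __init__(self):
        self.stderr = []
    def error(self, ex):
        self.stderr.append(str(ex))
        return self

class StubResult:
    """Stand-in for ExecResult."""
    def __init__(self, is_success=True, outputs=None):
        self.is_success = is_success
        self.outputs = outputs

def exec_safely(task_id, compute):
    # Mirror exec_command: never let the processor's exception escape.
    try:
        result = compute()
    except Exception as ex:
        result = StubResult(is_success=False, outputs=StubOutputs().error(ex))
    return task_id, result

print(exec_safely('t1', lambda: 1 / 0)[1].outputs.stderr)
# ['division by zero']
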
Example #2
    def execute_script(self, args, context):
        """Execute a R script in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get R script from user arguments
        source = args.get_value(cmd.PARA_R_SOURCE)
        # Redirect standard output and standard error streams
        out = sys.stdout
        err = sys.stderr
        stream = list()
        sys.stdout = OutputStream(tag='out', stream=stream)
        sys.stderr = OutputStream(tag='err', stream=stream)
        outputs = ModuleOutputs()
        
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o]
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.identifier
        # Run the R code
        try:
            evalresp = mimir.evalR(mimir_table_names, source)
            ostd = evalresp['stdout']
            oerr = evalresp['stderr']
            if ostd != '':
                outputs.stdout.append(HtmlOutput(ostd))
            if oerr != '':
                outputs.stderr.append(TextOutput(oerr))
        except Exception as ex:
            outputs.error(ex)
        finally:
            # Make sure to reverse redirection of output streams
            sys.stdout = out
            sys.stderr = err
        # Set module outputs
        for tag, text in stream:
            text = ''.join(text).strip()
            if tag == 'out':
                outputs.stdout.append(HtmlOutput(text))
            else:
                outputs.stderr.append(TextOutput(text))
        provenance = ModuleProvenance()
        # Return execution result
        return ExecResult(
            is_success=(len(outputs.stderr) == 0),
            outputs=outputs,
            provenance=provenance
        )
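
OutputStream is not shown in this listing; from the way stream is later consumed as (tag, text) pairs, it is evidently a file-like object that records each write together with its tag. A minimal sketch of such a class, as an assumption about the real implementation:

import sys

class TaggedStream:
    """Hypothetical OutputStream: records every write as a (tag, text) pair."""
    def __init__(self, tag, stream):
        self.tag = tag
        self.stream = stream
    def write(self, text):
        self.stream.append((self.tag, text))
    def flush(self):
        pass

stream = []
out = sys.stdout
sys.stdout = TaggedStream('out', stream)
try:
    print('hello')
finally:
    sys.stdout = out  # always restore, as the examples above do
print(stream)  # [('out', 'hello'), ('out', '\n')]
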
Example #3
    def compute_drop_dataset(self, args, context):
        """Execute drop dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the dataset name and remove the associated entry from the
        # dictionary of datasets in the context. Will raise an exception if
        # the specified dataset does not exist.
        ds_name = args.get_value(pckg.PARA_DATASET).lower()
        ds = context.get_dataset(ds_name)
        datasets = dict(context.datasets)
        del datasets[ds_name]
        return ExecResult(outputs=ModuleOutputs(
            stdout=[TextOutput('Dataset \'' + ds_name + '\' deleted')]),
                          provenance=ModuleProvenance(read=dict(),
                                                      write=dict(),
                                                      delete=[ds_name]))
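
The provenance object returned here declares empty read and write sets and a single deleted name. A sketch of how a controller might fold such a triple into the database state; this is a simplified assumption about what ModuleProvenance.get_database_state does, not the real implementation:

def apply_provenance(state, read=None, write=None, delete=None):
    # Deleted names disappear from the state; written names are (re)bound.
    # The read set does not change the state, it only records dependencies.
    result = dict(state)
    for name in (delete or []):
        result.pop(name, None)
    result.update(write or {})
    return result

state = {'sales': 'ds-001', 'people': 'ds-002'}
print(apply_provenance(state, delete=['sales']))  # {'people': 'ds-002'}
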
Example #4
    def set_success(self,
                    task_id: str,
                    finished_at: datetime = get_current_time(),
                    result: ExecResult = ExecResult()):
        self.task_id = task_id
        self.outputs = result.outputs
        self.state = 'SUCCESS'
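
Note the default finished_at=get_current_time(): Python evaluates default arguments once, when the function is defined, so every call that omits the argument receives the same timestamp. If a per-call timestamp is intended, the usual pattern is a None default. A quick demonstration with datetime standing in for get_current_time:

from datetime import datetime
import time

def stamped_bad(ts: datetime = datetime.now()):
    return ts  # the default was bound once, at definition time

def stamped_good(ts: datetime = None):
    return ts if ts is not None else datetime.now()

a = stamped_bad()
time.sleep(0.01)
b = stamped_bad()
print(a == b)               # True: both calls share the definition-time default
print(stamped_good() == a)  # False: computed freshly at call time
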
Example #5
def execute(task_id, project_id, command_doc, context, resources):
    """Execute the givven command.

    Parameters:
    -----------
    task_id: string
        Unique task identifier
    project_id: string
        Unique project identifier
    command_doc : dict
        Dictionary serialization of the module command
    context: dict
        Dictionary of available resources in the database state. The key is
        the resource name. Values are resource identifiers.
    resources: dict
        Optional information about resources that were generated during a
        previous execution of the command
    """
    # Create a remote workflow controller for the given task
    controller = worker_env.get_controller(project_id)
    # Notify the workflow controller that the task started to run
    controller.set_running(task_id=task_id, started_at=get_current_time())
    # Get the processor and execute the command. In case of an unknown package
    # the result is set to error.
    command = ModuleCommand.from_dict(command_doc)
    if command.package_id in worker_env.processors:
        processor = worker_env.processors[command.package_id]
        _, exec_result = exec_command(
            task_id=task_id,
            command=command,
            context=TaskContext(
                project_id=project_id,
                datastore=worker_env.datastores.get_datastore(project_id),
                filestore=worker_env.filestores.get_filestore(project_id),
                datasets=context[labels.CONTEXT_DATASETS],
                resources=resources,
                dataobjects=context[labels.CONTEXT_DATAOBJECTS]
            ),
            processor=processor
        )
    else:
        message = 'unknown package \'' + str(command.package_id) + '\''
        exec_result = ExecResult(
            is_success=False,
            outputs=ModuleOutputs(stderr=[TextOutput(message)])
        )
    # Notify the workflow controller that the task has finished
    if exec_result.is_success:
        controller.set_success(
            task_id=task_id,
            outputs=exec_result.outputs,
            provenance=exec_result.provenance
        )
    else:
        controller.set_error(
            task_id=task_id,
            outputs=exec_result.outputs
        )
Example #6
    def compute_simple_chart(self, args, context):
        """Execute simple chart command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get dataset name and the associated dataset. This will raise an
        # exception if the dataset name is unknown.
        ds_name = args.get_value(pckg.PARA_DATASET)
        ds = context.get_dataset(ds_name)
        # Get user-provided name for the new chart and verify that it is a
        # valid name
        chart_name = args.get_value(pckg.PARA_NAME,
                                    default_value=ds_name + ' Plot')
        if chart_name == '' or chart_name is None:
            chart_name = ds_name + ' Plot'
        if not is_valid_name(chart_name):
            raise ValueError('invalid chart name \'' + str(chart_name) + '\'')
        chart_args = args.get_value(cmd.PARA_CHART)
        chart_type = chart_args.get_value(cmd.PARA_CHART_TYPE)
        grouped_chart = chart_args.get_value(cmd.PARA_CHART_GROUPED)
        # Create a new chart view handle and add the series definitions
        view = ChartViewHandle(dataset_name=ds_name,
                               chart_name=chart_name,
                               chart_type=chart_type,
                               grouped_chart=grouped_chart)
        # The data series index for x-axis values is optional
        if args.has(cmd.PARA_XAXIS):
            x_axis = args.get_value(cmd.PARA_XAXIS)
            # X-Axis column may be empty. In that case, we ignore the
            # x-axis spec
            add_data_series(args=x_axis,
                            view=view,
                            dataset=ds,
                            col_arg_id=cmd.PARA_XAXIS_COLUMN,
                            range_arg_id=cmd.PARA_XAXIS_RANGE)
            view.x_axis = 0
        # Definition of data series. Each series is a pair of column
        # identifier and a printable label.
        for data_series in args.get_value(cmd.PARA_SERIES):
            add_data_series(args=data_series, view=view, dataset=ds)
        # Execute the query and get the result
        rows = ChartQuery.exec_query(ds, view)
        # Add chart view handle as module output
        return ExecResult(
            outputs=ModuleOutputs(stdout=[ChartOutput(view=view, rows=rows)]),
            provenance=ModuleProvenance(read={ds_name: ds.identifier},
                                        write=dict(),
                                        charts=[view]))
Example #7
    def compute_empty_dataset(self, args, context):
        """Execute empty dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        default_columns = [("''", "unnamed_column")]
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        try:
            source = "SELECT {};".format(", ".join(
                default_val + " AS " + col_name
                for default_val, col_name in default_columns))
            view_name, dependencies = mimir.createView(dict(), source)

            columns = [
                MimirDatasetColumn(identifier=col_id,
                                   name_in_dataset=col_name)
                for col_id, (_, col_name) in enumerate(default_columns)
            ]

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=1)
            provenance = ModuleProvenance(
                write={
                    ds_name:
                    DatasetDescriptor(identifier=ds.identifier,
                                      columns=ds.columns,
                                      row_count=ds.row_count)
                },
                read=dict(
                )  # Need to explicitly declare a lack of dependencies.
            )
            outputs.stdout.append(
                TextOutput("Empty dataset '{}' created".format(ds_name)))
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
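
The view source assembled from default_columns is a single SELECT over literals. Printing the constructed string makes the shape of the generated SQL obvious:

default_columns = [("''", "unnamed_column")]
source = "SELECT {};".format(", ".join(
    default_val + " AS " + col_name
    for default_val, col_name in default_columns))
print(source)  # SELECT '' AS unnamed_column;
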
Example #8
    def execute_script(self, args, context):
        """Execute a Markdown script in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get Markdown script from user arguments
        source = args.get_value(cmd.PARA_MARKDOWN_SOURCE)
        # Redirect standard output and standard error streams
        out = sys.stdout
        err = sys.stderr
        stream = list()
        sys.stdout = OutputStream(tag='out', stream=stream)
        sys.stderr = OutputStream(tag='err', stream=stream)
        outputs = ModuleOutputs()
        # Run the Markdown code
        try:
            # We should validate the Markdown here.
            ostd = source
            oerr = ''
            if ostd != '':
                outputs.stdout.append(MarkdownOutput(ostd))
            if oerr != '':
                outputs.stderr.append(TextOutput(oerr))
        except Exception as ex:
            outputs.error(ex)
        finally:
            # Make sure to reverse redirection of output streams
            sys.stdout = out
            sys.stderr = err
        # Set module outputs
        for tag, text in stream:
            text = ''.join(text).strip()
            if tag == 'out':
                outputs.stdout.append(MarkdownOutput(text))
            else:
                outputs.stderr.append(TextOutput(text))
        provenance = ModuleProvenance()
        # Return execution result
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
Example #9
    def create_exec_result(self,
                           dataset_name,
                           input_dataset=None,
                           output_dataset=None,
                           database_state=None,
                           stdout=None,
                           resources=None):
        """Create execution result object for a successfully completed task.
        Assumes that a single datasets has been modified.

        Note that this method is not suitable to generate the result object for
        the drop dataset and rename dataset commands.

        Parameters
        ----------
        dataset_name: string
            Name of the manipulated dataset
        input_dataset: vizier.datastore.dataset.DatasetDescriptor
            Descriptor for the input dataset
        output_dataset: vizier.datastore.dataset.DatasetDescriptor, optional
            Descriptor for the resulting dataset
        database_state: dict, optional
            Identifier for datasets in the database state against which a task
            was executed (keyed by user-provided name)
        stdout: list(string), optional
            Lines in the command output
        resources: dict, optional
            Optional resources that were generated by the command

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        if output_dataset is not None:
            ds = DatasetDescriptor(identifier=output_dataset.identifier,
                                   columns=output_dataset.columns,
                                   row_count=output_dataset.row_count)
        else:
            ds = None
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput(line)
                                          for line in (stdout or [])]),
            provenance=ModuleProvenance(
                read={dataset_name: input_dataset.identifier}
                if input_dataset is not None else None,
                write={dataset_name: ds},
                resources=resources))
Example #10
    def compute_rename_dataset(self, args, context):
        """Execute rename dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get name of existing dataset and the new dataset name. Raise
        # exception if a dataset with the new name already exists or if the new
        # dataset name is not a valid name.
        ds_name = args.get_value(pckg.PARA_DATASET).lower()
        new_name = args.get_value(pckg.PARA_NAME).lower()
        if new_name in context.datasets:
            raise ValueError('dataset \'' + new_name + '\' exists')
        if not is_valid_name(new_name):
            raise ValueError('invalid dataset name \'' + new_name + '\'')
        # Get dataset. Raises exception if the dataset does not exist.
        ds = context.get_dataset(ds_name)
        # Adjust database state
        datasets = dict(context.datasets)
        del datasets[ds_name]
        datasets[new_name] = ds
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput('1 dataset renamed')]),
            provenance=ModuleProvenance(read=dict(),
                                        write={
                                            new_name:
                                            DatasetDescriptor(
                                                identifier=ds.identifier,
                                                columns=ds.columns,
                                                row_count=ds.row_count)
                                        },
                                        delete=[ds_name]))
Example #11
    def set_success(
        self,
        task_id: str,
        finished_at: datetime = get_current_time(),
        result: ExecResult = ExecResult()
    ) -> Optional[bool]:
        """Set status of the module that is associated with the given task
        identifier to success. The finished_at property of the timestamp
        is set to the given value or the current time (if None).

        In case of a successful module execution the database state and module
        provenance information are also adjusted together with the module
        output streams. If the workflow has pending modules the first pending
        module will be executed next.

        Returns True if the state of the workflow was changed and False
        otherwise. The result is None if the project or task did not exist.

        Parameters
        ----------
        task_id : string
            Unique task identifier
        finished_at: datetime.datetime, optional
            Timestamp when the module finished running
        result: vizier.engine.task.processor.ExecResult, optional
            Execution result containing the module output streams and the
            provenance information about datasets that were read and written
            by the module

        Returns
        -------
        bool
        """
        raise NotImplementedError
Example #12
    def execute_query(self, args, context):
        """Execute a SQL query in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the SQL source code for this cell and the optional name of the
        # output dataset
        source = args.get_value(cmd.PARA_SQL_SOURCE)
        if not source.endswith(';'):
            source = source + ';'
        ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
        # Get mapping of datasets in the context to their respective table
        # name in the Mimir backend
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o]
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.table_name
        # Module outputs
        outputs = ModuleOutputs()
        try:
            # Create the view from the SQL source
            view_name, dependencies = mimir.createView(mimir_table_names,
                                                       source)
            sql = 'SELECT * FROM ' + view_name
            mimirSchema = mimir.getSchema(sql)

            columns = list()

            for col in mimirSchema:
                col_id = len(columns)
                name_in_dataset = col['name']
                col = MimirDatasetColumn(identifier=col_id,
                                         name_in_dataset=name_in_dataset)
                columns.append(col)

            row_count = mimir.countRows(view_name)

            provenance = None
            if ds_name is None or ds_name == '':
                ds_name = "TEMPORARY_RESULT"

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=row_count)
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            ds_output['name'] = ds_name

            dependencies = dict((dep_name.lower(),
                                 context.datasets.get(dep_name.lower(), None))
                                for dep_name in dependencies)
            # print("---- SQL DATASETS ----\n{}\n{}".format(context.datasets, dependencies))

            outputs.stdout.append(DatasetOutput(ds_output))
            provenance = ModuleProvenance(
                write={
                    ds_name: DatasetDescriptor(identifier=ds.identifier,
                                               columns=ds.columns,
                                               row_count=ds.row_count)
                },
                read=dependencies)
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
        # Return execution result
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
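
The dependency mapping near the end is easy to miss: dependency names reported by Mimir are lower-cased and looked up in the context, and unknown names map to None rather than raising. A standalone rendering of that dict comprehension, with illustrative data:

context_datasets = {'sales': 'ds-001'}
dependencies = ['SALES', 'missing']
deps = dict((dep_name.lower(), context_datasets.get(dep_name.lower(), None))
            for dep_name in dependencies)
print(deps)  # {'sales': 'ds-001', 'missing': None}
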
Example #13
    def compute_load_dataset(self, args, context):
        """Execute load dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the new dataset name. Raise exception if a dataset with the
        # specified name already exists.
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        # Get components of the load source. Raise exception if the source
        # descriptor is invalid.
        source_desc = args.get_value(cmd.PARA_FILE)
        file_id = None
        url = None
        if pckg.FILE_ID in source_desc and source_desc[pckg.FILE_ID] is not None:
            file_id = source_desc[pckg.FILE_ID]
        elif pckg.FILE_URL in source_desc and source_desc[pckg.FILE_URL] is not None:
            url = source_desc[pckg.FILE_URL]
        else:
            raise ValueError('invalid source descriptor')
        username = (source_desc[pckg.FILE_USERNAME]
                    if pckg.FILE_USERNAME in source_desc else None)
        password = (source_desc[pckg.FILE_PASSWORD]
                    if pckg.FILE_PASSWORD in source_desc else None)
        reload = (source_desc[pckg.FILE_RELOAD]
                  if pckg.FILE_RELOAD in source_desc else False)
        load_format = args.get_value(cmd.PARA_LOAD_FORMAT)
        detect_headers = args.get_value(cmd.PARA_DETECT_HEADERS,
                                        raise_error=False,
                                        default_value=True)
        infer_types = args.get_value(cmd.PARA_INFER_TYPES,
                                     raise_error=False,
                                     default_value=True)
        options = args.get_value(cmd.PARA_LOAD_OPTIONS, raise_error=False)
        m_opts = []
        print((args.get_value(cmd.PARA_LOAD_DSE,
                              raise_error=False,
                              default_value=False)))
        if args.get_value(cmd.PARA_LOAD_DSE,
                          raise_error=False,
                          default_value=False):
            m_opts.append({'name': 'datasourceErrors', 'value': 'true'})
        if options is not None:
            for option in options:
                load_opt_key = option.get_value(cmd.PARA_LOAD_OPTION_KEY)
                load_opt_val = option.get_value(cmd.PARA_LOAD_OPTION_VALUE)
                m_opts.append({'name': load_opt_key, 'value': load_opt_val})
        # Execute load command.
        result = self.api.load_dataset(datastore=context.datastore,
                                       filestore=context.filestore,
                                       file_id=file_id,
                                       url=url,
                                       detect_headers=detect_headers,
                                       infer_types=infer_types,
                                       load_format=load_format,
                                       options=m_opts,
                                       username=username,
                                       password=password,
                                       resources=context.resources,
                                       reload=reload,
                                       human_readable_name=ds_name.upper())
        # Delete the uploaded file (if load was from file). A reference to the
        # created dataset is in the resources and will be used if the module is
        # re-executed.
        #if not file_id is None:
        #    context.filestore.delete_file(file_id)
        ds = DatasetDescriptor(identifier=result.dataset.identifier,
                               columns=result.dataset.columns,
                               row_count=result.dataset.row_count)
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        ds_output['name'] = ds_name
        return ExecResult(
            outputs=ModuleOutputs(stdout=[DatasetOutput(ds_output)]),
            provenance=ModuleProvenance(
                read=dict(
                ),  # need to explicitly declare a lack of dependencies
                write={ds_name: ds},
                resources=result.resources))
Example #14
    def execute_script(self, args, context):
        """Execute a Python script in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get Python script from user arguments
        source = args.get_value(cmd.PYTHON_SOURCE)
        # Initialize the scope variables that are available to the executed
        # Python script. At this point this includes only the client to access
        # and manipulate datasets in the underlying datastore
        client = VizierDBClient(
            datastore=context.datastore,
            datasets=context.datasets
        )
        variables = {VARS_DBCLIENT: client}
        # Redirect standard output and standard error streams
        out = sys.stdout
        err = sys.stderr
        stream = list()
        sys.stdout = OutputStream(tag='out', stream=stream)
        sys.stderr = OutputStream(tag='err', stream=stream)
        # Keep track of exception that is thrown by the code
        exception = None
        # Run the Python code
        try:
            python_cell_preload(variables)
            exec(source, variables, variables)
        except Exception as ex:
            exception = ex
        finally:
            # Make sure to reverse redirection of output streams
            sys.stdout = out
            sys.stderr = err
        # Set module outputs
        outputs = ModuleOutputs()
        is_success = (exception is None)
        for tag, text in stream:
            text = ''.join(text).strip()
            if tag == 'out':
                outputs.stdout.append(HtmlOutput(text))
            else:
                outputs.stderr.append(TextOutput(text))
                is_success = False
        if is_success:
            # Create provenance information. Ensure that all dictionaries
            # contain elements of expected types, i.e., ensure that the user
            # did not attempt anything tricky.
            read = dict()
            for name in client.read:
                if not isinstance(name, str):
                    raise RuntimeError('invalid key for mapping dictionary')
                if name in context.datasets:
                    read[name] = context.datasets[name]
                    if not isinstance(read[name], str):
                        raise RuntimeError('invalid element in mapping dictionary')
                else:
                    read[name] = None
            write = dict()
            for name in client.write:
                if not isinstance(name, str):
                    raise RuntimeError('invalid key for mapping dictionary')
                ds_id = client.datasets[name]
                if ds_id is not None:
                    if not isinstance(ds_id, str):
                        raise RuntimeError('invalid value in mapping dictionary')
                    elif ds_id in client.descriptors:
                        write[name] = client.descriptors[ds_id]
                    else:
                        write[name] = client.datastore.get_descriptor(ds_id)
                else:
                    write[name] = None
            provenance = ModuleProvenance(
                read=read,
                write=write,
                delete=client.delete
            )
        else:
            outputs.error(exception)
            provenance = ModuleProvenance()
        # Return execution result
        return ExecResult(
            is_success=is_success,
            outputs=outputs,
            provenance=provenance
        )
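
The call exec(source, variables, variables) passes the same dictionary as both globals and locals, so top-level assignments made by the user's script land in variables, and the injected client is visible to the script as a global. A minimal demonstration of that scoping choice, with a plain string playing the role of the client:

source = "y = len(greeting)\nprint(greeting)"
variables = {'greeting': 'hello'}   # stands in for {VARS_DBCLIENT: client}
exec(source, variables, variables)  # prints: hello
print(variables['y'])               # 5 -- the script's assignment is visible
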
Example #15
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the Mimir package using the set of
        user-provided arguments and the current database state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        # Get dataset. Raise exception if dataset is unknown.
        ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
        dataset = context.get_dataset(ds_name)
        mimir_table_name = dataset.identifier
        # Keep track of the name of the input dataset for the provenance
        # information.
        input_ds_name = ds_name
        if command_id == cmd.MIMIR_GEOCODE:
            geocoder = arguments.get_value(cmd.PARA_GEOCODER)
            # Add columns for LATITUDE and LONGITUDE
            column_counter = dataset.max_column_id() + 1
            cname_lat = dataset.get_unique_name('LATITUDE')
            cname_lon = dataset.get_unique_name('LONGITUDE')
            dataset.columns.append(
                MimirDatasetColumn(
                    identifier=column_counter,
                    name_in_dataset=cname_lat,
                    data_type=DATATYPE_REAL
                )
            )
            dataset.columns.append(
                MimirDatasetColumn(
                    identifier=column_counter + 1,
                    name_in_dataset=cname_lon,
                    data_type=DATATYPE_REAL
                )
            )
            house = arguments.get_value(cmd.PARA_HOUSE_NUMBER, raise_error=False, default_value=None)
            street = arguments.get_value(cmd.PARA_STREET, raise_error=False, default_value=None)
            city = arguments.get_value(cmd.PARA_CITY, raise_error=False, default_value=None)
            state = arguments.get_value(cmd.PARA_STATE, raise_error=False, default_value=None)

            params = {
                'houseColumn': dataset.column_by_id(house).name_in_rdb   if house  is not None and house  != '' else None,
                'streetColumn': dataset.column_by_id(street).name_in_rdb if street is not None and street != '' else None,
                'cityColumn': dataset.column_by_id(city).name_in_rdb     if city   is not None and city   != '' else None,
                'stateColumn': dataset.column_by_id(state).name_in_rdb   if state  is not None and state  != '' else None,
                'geocoder': geocoder#,
                #'latitudeColumn': Option[String],
                #'longitudeColumn': Option[String],
                #'cacheCode': Option[String]
            }
        elif command_id == cmd.MIMIR_KEY_REPAIR:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = { "key" : column.name_in_rdb }
        elif command_id == cmd.MIMIR_MISSING_KEY:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = column.name_in_rdb
            # Set MISSING ONLY to FALSE to ensure that all rows are returned
            #params += ['MISSING_ONLY(FALSE)']
            # Need to run this lens twice in order to generate row ids for
            # any potential new tuple
        elif command_id == cmd.MIMIR_MISSING_VALUE:
            params = list()
            for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
                f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
                param = f_col.name_in_rdb
                col_constraint = col.get_value(
                    cmd.PARA_COLUMNS_CONSTRAINT,
                    raise_error=False
                )
                if col_constraint == '':
                    col_constraint = None
                #if not col_constraint is None:
                #    param = param + ' ' + str(col_constraint).replace("'", "\'\'").replace("OR", ") OR (")
                #param = '\'(' + param + ')\''
                params.append(param)
        elif command_id == cmd.MIMIR_PICKER:
            # Compute the input columns
            inputs = []
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_col = col.get_value(cmd.PARA_PICKFROM)
                column = dataset.column_by_id(c_col)
                inputs.append(column.name_in_rdb)

            # Compute the output column
            output = arguments.get_value(cmd.PARA_PICKAS, default_value = inputs[0])
            if output == "":
                output = inputs[0]
            else:
                output = dataset.get_unique_name(output.strip().upper())

            # Compute the final parameter list
            params = {
                "inputs" : inputs,
                "output" : output
            }
        elif command_id == cmd.MIMIR_TYPE_INFERENCE:
            params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
        elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
            dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
            params = []
            if dseModel is not None:
                params = [str(dseModel)]
        elif command_id == cmd.MIMIR_COMMENT:
            commentsParams = []
            for idx, comment in enumerate(arguments.get_value(cmd.PARA_COMMENTS)):
                commentParam = {}
                
                # If target is defined, it is the column that we're trying to annotate
                # If unset (or empty), it means we're annotating the row.
                column_id = comment.get_value(cmd.PARA_EXPRESSION, None)

                if column_id is not None:
                    column = dataset.column_by_id(column_id)
                    commentParam['target'] = column.name_in_rdb

                # The comment
                commentParam['comment'] = comment.get_value(cmd.PARA_COMMENT)

                # If rowid is defined, it is the row that we're trying to annotate.  
                # If unset (or empty), it means that we're annotating all rows
                rowid = comment.get_value(cmd.PARA_ROWID, None) 
                if (rowid is not None) and (rowid != ""):
                    # If rowid begins with '=', it's a formula
                    if rowid[0] == '=':
                        commentParam['condition'] = rowid[1:]
                    else:
                        commentParam['rows'] = [ int(rowid) ]
                
                #TODO: handle result columns
                commentsParams.append(commentParam)
            params = {'comments' : commentsParams}
        elif command_id == cmd.MIMIR_PIVOT:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = {
                "target" : column.name_in_rdb,
                "keys" : [],
                "values" : []
            }
            for col_arg in arguments.get_value(cmd.PARA_VALUES):
                col = dataset.column_by_id(col_arg.get_value(cmd.PARA_VALUE))
                params["values"].append(col.name_in_rdb)
            for col_arg in arguments.get_value(cmd.PARA_KEYS, default_value=[]):
                col = dataset.column_by_id(col_arg.get_value(cmd.PARA_KEY))
                params["keys"].append(col.name_in_rdb)
            if len(params["values"]) < 1:
                raise ValueError("Need at least one value column")
            # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
        elif command_id == cmd.MIMIR_SHRED:
            params = { 
                "keepOriginalColumns" : arguments.get_value(cmd.PARA_KEEP_ORIGINAL)
            }
            shreds = []
            global_input_col = dataset.column_by_id(arguments.get_value(cmd.PARA_COLUMN_NAME))
            for (idx, shred) in enumerate(arguments.get_value(cmd.PARA_COLUMNS)):
                output_col = shred.get_value(cmd.PARA_OUTPUT_COLUMN)
                if output_col is None:
                    output_col = "{}_{}".format(global_input_col,idx)
                config = {}
                shred_type = shred.get_value(cmd.PARA_TYPE)
                expression = shred.get_value(cmd.PARA_EXPRESSION)
                group = shred.get_value(cmd.PARA_INDEX)
                if shred_type == "pattern":
                    config["regexp"] = expression
                    config["group"] = int(group)
                elif shred_type == "field":
                    config["separator"] = expression
                    config["field"] = int(group)
                elif shred_type == "explode":
                    config["separator"] = expression
                elif shred_type == "pass":
                    pass
                elif shred_type == "substring":
                    range_parts = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
                    # print(range_parts)

                    # Mimir expects ranges to be given from start (inclusive) to end (exclusive)
                    # in a zero-based numbering scheme.

                    # Vizier expects input ranges to be given in a one-based numbering scheme.

                    # Convert to this format

                    if range_parts is None:
                        raise ValueError("Substring requires a range of the form '10', '10-11', or '10+1', but got '{}'".format(expression))
                    config["start"] = int(range_parts.group(1))-1 # Convert 1-based numbering to 0-based
                    if range_parts.group(2) is None:
                        config["end"] = config["start"] + 1 # if only one character, split one character
                    elif range_parts.group(3) == "+":
                        config["end"] = config["start"] + int(range_parts.group(4)) # start + length
                    elif range_parts.group(3) == "-":
                        config["end"] = int(range_parts.group(4)) # Explicit end, 1-based -> 0-based and exclusive cancel out
                    else:
                        raise ValueError("Invalid expression '{}' in substring shredder".format(expression))
                    # print("Shredding {} <- {} -- {}".format(output_col,config["start"],config["end"]))
                else:
                    raise ValueError("Invalid Shredding Type '{}'".format(shred_type))

                shreds.append({
                    **config,
                    "op" : shred_type,
                    "input" : global_input_col.name_in_rdb,
                    "output" : output_col,
                })
            params["shreds"] = shreds
            # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
        else:
            raise ValueError("Unknown Mimir lens '{}'".format(command_id))
        # Create Mimir lens
        mimir_lens_response = mimir.createLens(
            mimir_table_name,
            params,
            command_id,
            arguments.get_value(cmd.PARA_MATERIALIZE_INPUT, default_value=True),
            human_readable_name = ds_name.upper()
        )
        lens_name = mimir_lens_response['name']
        lens_schema = mimir_lens_response['schema']
        lens_properties = mimir_lens_response['properties']

        ds = MimirDatasetHandle.from_mimir_result(lens_name, lens_schema, lens_properties, ds_name)

        if command_id in LENSES_THAT_SHOULD_NOT_DISPLAY_TABLES:
            print_dataset_schema(outputs, ds_name, ds.columns)
        else:
            from vizier.api.webservice import server
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10
            )
            outputs.stdout.append(DatasetOutput(ds_output))
        
        # Return task result
        return ExecResult(
            outputs=outputs,
            provenance=ModuleProvenance(
                read={input_ds_name: dataset.identifier},
                write={ds_name: DatasetDescriptor(
                    identifier=ds.identifier,
                    name=ds_name,
                    columns=ds.columns
                )}
            )
        )
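
The substring branch converts Vizier's one-based, inclusive ranges into the zero-based, end-exclusive ranges Mimir expects. The conversion is compact enough to be easy to get wrong, so here is the same logic isolated with a few worked inputs:

import re

def parse_substring_range(expression):
    # '10'    -> characters [9, 10)   (a single character)
    # '10+2'  -> characters [9, 11)   (start plus length)
    # '10-11' -> characters [9, 11)   (explicit 1-based inclusive end)
    m = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
    if m is None:
        raise ValueError("bad range: '{}'".format(expression))
    start = int(m.group(1)) - 1            # 1-based -> 0-based
    if m.group(2) is None:
        end = start + 1
    elif m.group(3) == "+":
        end = start + int(m.group(4))
    else:
        end = int(m.group(4))              # inclusive 1-based == exclusive 0-based
    return start, end

for expr in ("10", "10+2", "10-11"):
    print(expr, parse_substring_range(expr))
# 10 (9, 10) / 10+2 (9, 11) / 10-11 (9, 11)
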
Example #16
    def compute_unload_dataset(self, args, context):
        """Execute unload dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the dataset name. Raise an exception if the name is not a valid
        # dataset name.
        ds_name = args.get_value(pckg.PARA_DATASET).lower()

        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        # Get the unload format and the optional list of unload options
        # provided by the user.
        unload_format = args.get_value(cmd.PARA_UNLOAD_FORMAT)
        options = args.get_value(cmd.PARA_UNLOAD_OPTIONS, raise_error=False)
        m_opts = []

        if options is not None:
            for option in options:
                unload_opt_key = option.get_value(cmd.PARA_UNLOAD_OPTION_KEY)
                unload_opt_val = option.get_value(cmd.PARA_UNLOAD_OPTION_VALUE)
                m_opts.append({
                    'name': unload_opt_key,
                    'value': unload_opt_val
                })
        # Execute unload command.
        dataset = context.get_dataset(ds_name)
        result = self.api.unload_dataset(dataset=dataset,
                                         datastore=context.datastore,
                                         filestore=context.filestore,
                                         unload_format=unload_format,
                                         options=m_opts,
                                         resources=context.resources)
        # Delete the uploaded file (if load was from file). A reference to the
        # created dataset is in the resources and will be used if the module is
        # re-executed.
        #file_id = result.resources[apibase.RESOURCE_FILEID]
        #if not file_id is None:
        #    context.filestore.delete_file(file_id)
        # Create result object
        outputhtml = HtmlOutput(''.join([
            "<div><a href=\"" + config.webservice.app_path + "/projects/" +
            str(context.project_id) + "/files/" + out_file.identifier +
            "\" download=\"" + out_file.name + "\">Download " + out_file.name +
            "</a></div>"
            for out_file in result.resources[apibase.RESOURCE_FILEID]
        ]))
        return ExecResult(
            outputs=ModuleOutputs(stdout=[outputhtml]),
            provenance=ModuleProvenance(
                read={ds_name: context.datasets.get(ds_name.lower(), None)},
                write=dict()))
Example #17
    def execute_script(self, args: ModuleArguments,
                       context: TaskContext) -> ExecResult:
        """Execute a Python script in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get Python script from user arguments.  It is the source for VizierDBClient
        cell_src = args.get_value(cmd.PYTHON_SOURCE)

        # prepend python objects exported in previous cells to the source
        exported_methods = [
            context.datastore.get_object(descriptor.identifier).decode()
            for name, descriptor in context.dataobjects.items()
            if descriptor.artifact_type == ARTIFACT_TYPE_PYTHON
        ]
        overrides = [
            "def show(x):", "  global vizierdb", "  vizierdb.show(x)",
            "def export(x):", "  global vizierdb",
            "  vizierdb.export_module(x)", "def return_type(dt):",
            "  def wrap(x):", "    return x", "  return wrap", "pass"
        ]

        injected_source = "\n".join(exported_methods + overrides)
        injected_lines = injected_source.count('\n') + 1

        source = injected_source + '\n' + cell_src

        # Initialize the scope variables that are available to the executed
        # Python script. At this point this includes only the client to access
        # and manipulate datasets in the underlying datastore
        #
        # Use "any" type, since there's a (probably unnecessary) hack down
        # below that creates something that pretends to be a client.
        client: Any = VizierDBClient(datastore=context.datastore,
                                     datasets=context.datasets,
                                     source=cell_src,
                                     dataobjects=context.dataobjects,
                                     project_id=context.project_id,
                                     output_format=args.get_value(
                                         cmd.OUTPUT_FORMAT,
                                         default_value=OUTPUT_TEXT))
        variables = {VARS_DBCLIENT: client, VARS_OPEN: client.pycell_open}
        # Redirect standard output and standard error streams
        out = sys.stdout
        err = sys.stderr
        stream: List[Tuple[str, str]] = list()
        sys.stdout = cast(TextIO, OutputStream(tag='out', stream=stream))
        sys.stderr = cast(TextIO, OutputStream(tag='err', stream=stream))
        # Keep track of exception that is thrown by the code
        exception = None
        resdata: Dict[str, Any] = dict()
        # Run the Python code
        try:
            python_cell_preload(variables, client=client)
            if SANDBOX_PYTHON_EXECUTION == "True":
                json_data = {
                    'source': source,
                    'datasets': context.datasets,
                    'dataobjects': context.dataobjects,
                    'datastore': context.datastore.__class__.__name__,
                    'basepath': context.datastore.base_path,
                    'project_id': context.project_id,
                    'output_format': client.output_format
                }
                res = requests.post(SANDBOX_PYTHON_URL, json=json_data)
                resdata = res.json()
                client = DotDict()
                for key, value in resdata['provenance'].items():
                    client.setattr(key, value)
                client.setattr('descriptors', {})
                client.setattr('datastore', context.datastore)
                client.setattr('datasets', resdata['datasets'])
                client.setattr('dataobjects', resdata['dataobjects'])
                client.setattr('output_format', resdata['output_format'])
                client.setattr('stdout', [
                    OutputObject(type=item['type'], value=item['value'])
                    for item in resdata.get('explicit_stdout', [])
                ])

            else:
                exec(source, variables, variables)

        except Exception as ex:
            exception = ex
        finally:
            # Make sure to reverse redirection of output streams
            sys.stdout = out
            sys.stderr = err
        # Set module outputs
        outputs = ModuleOutputs()
        is_success = (exception is None)
        if SANDBOX_PYTHON_EXECUTION == "True":
            for text in resdata['stdout']:
                outputs.stdout.append(
                    OutputObject(value=text, type=client.output_format))
            for text in resdata['stderr']:
                outputs.stderr.append(TextOutput(text))
                is_success = False
        else:
            for tag, text in stream:
                text = ''.join(text).strip()
                if tag == 'out':
                    outputs.stdout.append(
                        OutputObject(value=text, type=client.output_format))
                else:
                    outputs.stderr.append(TextOutput(text))
                    is_success = False
        for output in client.stdout:
            outputs.stdout.append(output)

        if is_success:
            # Create provenance information. Ensure that all dictionaries
            # contain elements of expected types, i.e., ensure that the user
            # did not attempt anything tricky.
            read = dict()
            for name in client.read:
                if not isinstance(name, str):
                    raise RuntimeError('invalid key for mapping dictionary')
                if name in context.datasets:
                    read[name] = context.datasets[name].identifier
                    if not isinstance(read[name], str):
                        raise RuntimeError(
                            'invalid element in read mapping dictionary: {} (expecting str)'
                            .format(read[name]))
                elif name in context.dataobjects:
                    read[name] = context.dataobjects[name].identifier
                    if not isinstance(read[name], str):
                        raise RuntimeError(
                            'invalid element in read mapping dictionary: {} (expecting str)'
                            .format(read[name]))
                else:
                    raise RuntimeError('Unknown read artifact {}'.format(name))
            write = dict()
            for name in client.write:
                if not isinstance(name, str):
                    raise RuntimeError('invalid key for mapping dictionary')

                if name in client.datasets:
                    write_descriptor = client.datasets[name]
                    if not isinstance(write_descriptor, ArtifactDescriptor):
                        raise RuntimeError(
                            'invalid element in write mapping dictionary: {} (expecting str)'
                            .format(name))
                    else:
                        write[name] = write_descriptor
                elif name in client.dataobjects:
                    #wr_id = client.dataobjects[name]
                    write_descriptor = client.dataobjects[name]
                    #write_descriptor = client.datastore.get_object(identifier=wr_id)
                    if not isinstance(write_descriptor, ArtifactDescriptor):
                        raise RuntimeError(
                            'invalid element in write mapping dictionary: {} (expecting str)'
                            .format(name))
                    else:
                        write[name] = write_descriptor
                else:
                    raise RuntimeError(
                        'Unknown write artifact {}'.format(name))
            print("Pycell Execution Finished")
            print("     read: {}".format(read))
            print("     write: {}".format(write))
            provenance = ModuleProvenance(read=read,
                                          write=write,
                                          delete=client.delete)
        else:
            print("ERROR: {}".format(exception))
            assert (exception is not None)
            outputs.error(exception, offset_lines=-injected_lines)
            provenance = ModuleProvenance()
        # Return execution result
        return ExecResult(is_success=is_success,
                          outputs=outputs,
                          provenance=provenance)
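
The offset_lines=-injected_lines passed to outputs.error compensates for the preamble that was prepended to the cell: a traceback line number measured against the combined source must be shifted back so it points into the user's code. The arithmetic, isolated with hypothetical inputs:

injected_source = "\n".join(["def show(x):", "    pass"])  # 2-line preamble
cell_src = "show(1)\nboom()"
injected_lines = injected_source.count('\n') + 1  # = 2, as computed above
full_source = injected_source + '\n' + cell_src
reported_line = 4  # hypothetical traceback line number within full_source
print(full_source.splitlines()[reported_line - 1])  # boom()
print(reported_line - injected_lines)               # 2: line 2 of the cell
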
Example #18
    def set_success(
        self,
        task_id: str,
        finished_at: datetime = get_current_time(),
        result: ExecResult = ExecResult()
    ) -> Optional[bool]:
        """Set status of the module that is associated with the given task
        identifier to success. The finished_at property of the timestamp
        is set to the given value or the current time (if None).

        In case of a successful module execution the database state and module
        provenance information are also adjusted together with the module
        output streams. If the workflow has pending modules the first pending
        module will be executed next.

        Returns True if the state of the workflow was changed and False
        otherwise. The result is None if the project or task did not exist.
        """
        with self.backend.lock:
            # Get task handle and remove it from the internal index. The result
            # is None if the task does not exist.
            task = pop_task(tasks=self.tasks, task_id=task_id)
            if task is None:
                return None
            # Get the handle for the head workflow of the specified branch and
            # the index for the module matching the identifier in the task.
            workflow, module_index = self.get_task_module(task)
            if workflow is None or module_index == -1:
                return None
            # Notify the backend that the task is finished
            self.backend.task_finished(task_id)
            module = workflow.modules[module_index]
            if not module.is_running:
                # The result is false if the state of the module did not change
                return False
            # print("UPDATED ARGUMENTS: {}".format(result.updated_arguments))
            module.set_success(finished_at=finished_at,
                               outputs=result.outputs,
                               provenance=result.provenance,
                               updated_arguments=result.updated_arguments)
            context = compute_context(workflow.modules[0:module_index])
            context = result.provenance.get_database_state(context)
            import sys
            sys.stderr.write(
                "Module {} finished at {} / Context: {} / Reads: [{}] / Writes: [{}]"
                .format(
                    module.external_form,
                    finished_at,
                    context,
                    ",".join(result.provenance.read)
                    if result.provenance.read is not None else "",
                    ",".join(result.provenance.write)
                    if result.provenance.write is not None else "",
                ))

            for next_module in workflow.modules[module_index + 1:]:
                if not next_module.is_pending:
                    # This case can only happen if we allow parallel execution
                    # of modules in the future. At this point it should not
                    # occur.
                    raise RuntimeError('invalid workflow state')
                elif not next_module.provenance.requires_exec(context):
                    # print("Module {} does not need re-execution, skipping".format(next_module))
                    context = next_module.provenance.get_database_state(
                        context)
                    next_module.set_success(
                        finished_at=finished_at,
                        outputs=next_module.outputs,
                        provenance=next_module.provenance,
                    )
                else:
                    # print("Scheduling {} for execution".format(next_module))
                    command = next_module.command
                    package_id = command.package_id
                    command_id = command.command_id
                    external_form = command.to_external_form(
                        command=self.packages[package_id].get(command_id),
                        datasets=dict(
                            (name, cast(DatasetDescriptor, context[name]))
                            for name in context if context[name].is_dataset))
                    # If the backend is going to run the task immediately we
                    # need to adjust the module state
                    state = self.backend.next_task_state()
                    if state == mstate.MODULE_RUNNING:
                        next_module.set_running(external_form=external_form,
                                                started_at=get_current_time())
                    else:
                        next_module.update_property(
                            external_form=external_form)
                    self.execute_module(project_id=task.project_id,
                                        branch_id=workflow.branch_id,
                                        module=next_module,
                                        artifacts=context)
                    break
            return True
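The loop at the end of set_success is what makes workflow execution incremental: a pending module is re-run only when provenance.requires_exec(context) reports that one of its inputs changed. A hedged sketch of that predicate, under the assumption that provenance records a read mapping from artifact name to identifier (the real class may track more state than this):

def requires_exec(read, context):
    # Re-run iff any previously read artifact is missing from the current
    # context or now resolves to a different identifier.
    if read is None:
        return True
    for name, identifier in read.items():
        artifact = context.get(name)
        if artifact is None or artifact.identifier != identifier:
            return True
    return False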
Example #19
    def execute_query(self, args: ModuleArguments,
                      context: TaskContext) -> ExecResult:
        """Execute a SQL query in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the SQL source code for this cell and make sure it is
        # terminated with a semicolon
        source = args.get_value(cmd.PARA_SQL_SOURCE)
        if not source.endswith(';'):
            source += ';'
        ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
        # Get mapping of datasets in the context to their respective table
        # name in the Mimir backend
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o].identifier
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.identifier
        # Module outputs
        outputs = ModuleOutputs()
        is_success = True
        functions = {
            name: context.dataobjects[name].identifier
            for name in context.dataobjects
            if context.dataobjects[name].obj_type == ARTIFACT_TYPE_PYTHON
        }
        try:
            # Create the view from the SQL source
            view_name, dependencies, mimirSchema, properties, functionDeps = mimir.createView(
                datasets=mimir_table_names,
                query=source,
                functions=dict(functions))
            ds = MimirDatasetHandle.from_mimir_result(view_name, mimirSchema,
                                                      properties, ds_name)

            if ds_name is None or ds_name == '':
                ds_name = "TEMPORARY_RESULT"

            from vizier.api.webservice import server

            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            if ds_output is None:
                outputs.stderr.append(
                    TextOutput("Error displaying dataset {}".format(ds_name)))
            else:
                ds_output['name'] = ds_name
                outputs.stdout.append(DatasetOutput(ds_output))

            dependenciesDict: Dict[str, str] = {
                dep_name.lower(): get_artifact_id(dep)
                for dep_name, dep in [(
                    dep_name, context.datasets.get(dep_name.lower(), None))
                                      for dep_name in dependencies]
                if dep is not None
            }
            functionDepDict: Dict[str, str] = {
                dep_name.lower(): get_artifact_id(dep)
                for dep_name, dep in [(
                    dep_name, context.dataobjects.get(dep_name.lower(), None))
                                      for dep_name in functionDeps]
                if dep is not None
            }
            # print("---- SQL DATASETS ----\n{}\n{}".format(context.datasets, dependencies))

            provenance = ModuleProvenance(
                write={
                    ds_name: DatasetDescriptor(identifier=ds.identifier,
                                               name=ds_name,
                                               columns=ds.columns)
                },
                read={**dependenciesDict, **functionDepDict})
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
            is_success = False
        # Return execution result
        return ExecResult(is_success=is_success,
                          outputs=outputs,
                          provenance=provenance)
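Both dependency dictionaries in execute_query lower-case names before probing the context, which is how SQL's case-insensitive identifiers are matched against Vizier's artifact names. A minimal sketch of that resolution step, assuming artifacts expose an identifier attribute as elsewhere in these examples (get_artifact_id is reduced to that attribute here):

def resolve_dependencies(names, artifacts):
    # Map each name reported by Mimir to the identifier of the matching
    # context artifact; names with no match are silently dropped.
    resolved = {}
    for dep_name in names:
        artifact = artifacts.get(dep_name.lower())
        if artifact is not None:
            resolved[dep_name.lower()] = artifact.identifier
    return resolved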
Example #20
    def compute(self, command_id: str, arguments: "ModuleArguments",
                context: TaskContext) -> ExecResult:
        """Compute results for commands in the sampling package using 
        the set of user-provided arguments and the current database 
        state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """

        input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
        input_dataset: DatasetDescriptor = context.get_dataset(input_ds_name)
        if input_dataset is None:
            raise ValueError('unknown dataset \'' + input_ds_name + '\'')

        output_ds_name = arguments.get_value(cmd.PARA_OUTPUT_DATASET,
                                             raise_error=False)
        if output_ds_name is None or output_ds_name == "":
            output_ds_name = input_ds_name + "_SAMPLE"
        output_ds_name = output_ds_name.lower()

        # Load the sampling configuration
        sample_mode = None

        if command_id == cmd.BASIC_SAMPLE:
            sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
            if not 0.0 <= sampling_rate <= 1.0:
                raise ValueError("Sampling rate must be between 0.0 and 1.0")
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
                "probability": sampling_rate
            }
        elif command_id == cmd.MANUAL_STRATIFIED_SAMPLE or command_id == cmd.AUTOMATIC_STRATIFIED_SAMPLE:
            column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
            column_defn = input_dataset.columns[column]
            if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
                strata = [{
                    "value":
                    stratum.get_value(cmd.PARA_STRATUM_VALUE),
                    "probability":
                    stratum.get_value(cmd.PARA_SAMPLING_RATE)
                } for stratum in arguments.get_value(cmd.PARA_STRATA)]
            else:
                probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
                strata = self.get_automatic_strata(input_dataset, column_defn,
                                                   probability)
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
                "column": column_defn.name,
                "type": column_defn.data_type,
                "strata": strata
            }
        else:
            raise Exception("Unknown sampling command: {}".format(command_id))

        table_name, schema = mimir.createSample(input_dataset.identifier,
                                                sample_mode,
                                                result_name="SAMPLE_" +
                                                get_unique_identifier())
        ds = MimirDatasetHandle.from_mimir_result(table_name,
                                                  schema,
                                                  properties={},
                                                  name=output_ds_name)

        # And start rendering some output
        outputs = ModuleOutputs()
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        if ds_output is not None:
            ds_output['name'] = output_ds_name
            outputs.stdout.append(DatasetOutput(ds_output))
        else:
            outputs.stderr.append(TextOutput("Error displaying dataset"))

        # Record Reads and writes
        provenance = ModuleProvenance(
            read={input_ds_name: input_dataset.identifier},
            write={
                output_ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  name=output_ds_name,
                                  columns=ds.columns)
            })

        # Return task result
        return ExecResult(outputs=outputs, provenance=provenance)
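The sample_mode dictionary is the entire contract between this processor and mimir.createSample, so its keys ("mode", "probability", "column", "type", "strata") are worth isolating. A hedged sketch of a constructor for the uniform case, reusing the validation above; cmd is assumed to be the same constants module the example imports, and the helper name is hypothetical:

def uniform_sample_mode(rate):
    # Build the configuration for a uniform random sample, rejecting
    # rates outside [0, 1] exactly as the command handler does.
    if not 0.0 <= rate <= 1.0:
        raise ValueError("Sampling rate must be between 0.0 and 1.0")
    return {"mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
            "probability": rate}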
Example #21
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the Mimir package using the set of
        user-provided arguments and the current database state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        store_as_dataset = None
        update_rows = False
        lens_annotations = []
        # Get dataset. Raise exception if dataset is unknown.
        ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
        dataset = context.get_dataset(ds_name)
        mimir_table_name = dataset.table_name
        # Keep track of the name of the input dataset for the provenance
        # information.
        input_ds_name = ds_name
        if command_id == cmd.MIMIR_DOMAIN:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
        elif command_id == cmd.MIMIR_GEOCODE:
            geocoder = arguments.get_value(cmd.PARA_GEOCODER)
            params = ['GEOCODER(' + geocoder + ')']
            add_column_parameter(params, 'HOUSE_NUMBER', dataset, arguments,
                                 cmd.PARA_HOUSE_NUMBER)
            add_column_parameter(params, 'STREET', dataset, arguments,
                                 cmd.PARA_STREET)
            add_column_parameter(params, 'CITY', dataset, arguments,
                                 cmd.PARA_CITY)
            add_column_parameter(params, 'STATE', dataset, arguments,
                                 cmd.PARA_STATE)
            # Add columns for LATITUDE and LONGITUDE
            column_counter = dataset.max_column_id() + 1
            cname_lat = dataset.get_unique_name('LATITUDE')
            cname_lon = dataset.get_unique_name('LONGITUDE')
            dataset.columns.append(
                MimirDatasetColumn(identifier=column_counter,
                                   name_in_dataset=cname_lat,
                                   data_type=DATATYPE_REAL))
            dataset.columns.append(
                MimirDatasetColumn(identifier=column_counter + 1,
                                   name_in_dataset=cname_lon,
                                   data_type=DATATYPE_REAL))
            params.append('RESULT_COLUMNS(' + cname_lat + ',' + cname_lon +
                          ')')
        elif command_id == cmd.MIMIR_KEY_REPAIR:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
            update_rows = True
        elif command_id == cmd.MIMIR_MISSING_KEY:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
            # Set MISSING ONLY to FALSE to ensure that all rows are returned
            params += ['MISSING_ONLY(FALSE)']
            # Need to run this lens twice in order to generate row ids for
            # any potential new tuple
            mimir_lens_response = mimir.createLens(
                dataset.table_name, params, command_id,
                arguments.get_value(cmd.PARA_MATERIALIZE_INPUT,
                                    default_value=True))
            (mimir_table_name,
             lens_annotations) = (mimir_lens_response.lensName(),
                                  mimir_lens_response.annotations())
            params = [ROW_ID, 'MISSING_ONLY(FALSE)']
            update_rows = True
        elif command_id == cmd.MIMIR_MISSING_VALUE:
            params = list()
            for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
                f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
                param = f_col.name_in_rdb
                col_constraint = col.get_value(cmd.PARA_COLUMNS_CONSTRAINT,
                                               raise_error=False)
                if col_constraint == '':
                    col_constraint = None
                if col_constraint is not None:
                    param = param + ' ' + str(col_constraint).replace(
                        "'", "''").replace("OR", ") OR (")
                param = '\'(' + param + ')\''
                params.append(param)
        elif command_id == cmd.MIMIR_PICKER:
            pick_from = list()
            column_names = list()
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_col = col.get_value(cmd.PARA_PICKFROM)
                column = dataset.column_by_id(c_col)
                pick_from.append(column.name_in_rdb)
                column_names.append(column.name.upper().replace(' ', '_'))
            # Add result column to dataset schema
            pick_as = arguments.get_value(cmd.PARA_PICKAS,
                                          default_value='PICK_ONE_' +
                                          '_'.join(column_names))
            pick_as = dataset.get_unique_name(pick_as.strip().upper())
            dataset.columns.append(
                MimirDatasetColumn(identifier=dataset.max_column_id() + 1,
                                   name_in_dataset=pick_as))
            params = ['PICK_FROM(' + ','.join(pick_from) + ')']
            params.append('PICK_AS(' + pick_as + ')')
        elif command_id == cmd.MIMIR_SCHEMA_MATCHING:
            store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
            if store_as_dataset in context.datasets:
                raise ValueError('dataset \'' + store_as_dataset + '\' exists')
            if not is_valid_name(store_as_dataset):
                raise ValueError('invalid dataset name \'' + store_as_dataset +
                                 '\'')
            column_names = list()
            params = ['\'' + ROW_ID + ' int\'']
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_name = col.get_value(pckg.PARA_COLUMN)
                c_type = col.get_value(cmd.PARA_TYPE)
                params.append('\'' + c_name + ' ' + c_type + '\'')
                column_names.append(c_name)
        elif command_id == cmd.MIMIR_TYPE_INFERENCE:
            params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
        elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
            dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
            params = []
            if dseModel is not None:
                params = [str(dseModel)]
        elif command_id == cmd.MIMIR_COMMENT:
            params = []
            for comment in arguments.get_value(cmd.PARA_COMMENTS):
                c_expr = comment.get_value(cmd.PARA_EXPRESSION)
                c_cmnt = comment.get_value(cmd.PARA_COMMENT)
                c_rowid = comment.get_value(cmd.PARA_ROWID)
                if c_rowid is None:
                    params.append('COMMENT(' + c_expr + ', \'' + c_cmnt +
                                  '\') ')
                else:
                    params.append('COMMENT(' + c_expr + ', \'' + c_cmnt +
                                  '\', \'' + c_rowid + '\') ')
            result_cols = []
            for col in arguments.get_value(cmd.PARA_RESULT_COLUMNS):
                c_name = col.get_value(pckg.PARA_COLUMN)
                result_cols.append(c_name)
            if len(result_cols) > 0:
                params.append('RESULT_COLUMNS(' + ','.join(result_cols) + ')')
        else:
            raise ValueError('unknown Mimir lens \'' + str(command_id) + '\'')
        # Create Mimir lens
        if command_id in [
                cmd.MIMIR_SCHEMA_MATCHING, cmd.MIMIR_TYPE_INFERENCE,
                cmd.MIMIR_SHAPE_DETECTOR
        ]:
            lens_name = mimir.createAdaptiveSchema(mimir_table_name, params,
                                                   command_id.upper())
        else:
            mimir_lens_response = mimir.createLens(
                mimir_table_name,
                params,
                command_id.upper(),
                arguments.get_value(cmd.PARA_MATERIALIZE_INPUT,
                                    default_value=True),
                human_readable_name=ds_name.upper())
            (lens_name,
             lens_annotations) = (mimir_lens_response['lensName'],
                                  mimir_lens_response['annotations'])
        # Create a view including missing row ids for the result of a
        # MISSING KEY lens
        if command_id == cmd.MIMIR_MISSING_KEY:
            lens_name, row_counter = create_missing_key_view(
                dataset, lens_name, column)
            dataset.row_counter = row_counter
        # Create datastore entry for lens.
        if store_as_dataset is not None:
            columns = list()
            for c_name in column_names:
                col_id = len(columns)
                columns.append(
                    MimirDatasetColumn(identifier=col_id,
                                       name_in_dataset=c_name))
            ds = context.datastore.register_dataset(
                table_name=lens_name,
                columns=columns,
                annotations=dataset.annotations)
            ds_name = store_as_dataset
        else:
            ds = context.datastore.register_dataset(
                table_name=lens_name,
                columns=dataset.columns,
                annotations=dataset.annotations)
        # Add dataset schema and returned annotations to output
        if command_id in [
                cmd.MIMIR_SCHEMA_MATCHING, cmd.MIMIR_TYPE_INFERENCE,
                cmd.MIMIR_SHAPE_DETECTOR
        ]:
            print_dataset_schema(outputs, ds_name, ds.columns)
        else:
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            outputs.stdout.append(DatasetOutput(ds_output))

        print_lens_annotations(outputs, lens_annotations)
        dsd = DatasetDescriptor(identifier=ds.identifier,
                                columns=ds.columns,
                                row_count=ds.row_count)
        result_resources = dict()
        result_resources[base.RESOURCE_DATASET] = ds.identifier

        # Return task result
        return ExecResult(outputs=outputs,
                          provenance=ModuleProvenance(
                              read={input_ds_name: dataset.identifier},
                              write={ds_name: dsd},
                              resources=result_resources))
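Several branches above call add_column_parameter to turn an optional column argument into a NAME(column) lens parameter, but its definition is not part of this example. A plausible sketch based purely on the call sites here; treat the exact Mimir parameter syntax and the raise_error keyword as assumptions, not the library's definition:

def add_column_parameter(params, name, dataset, arguments, key):
    # Append 'NAME(column_name_in_rdb)' to params when the user supplied
    # a value for `key`; otherwise leave params untouched.
    column_id = arguments.get_value(key, raise_error=False)
    if column_id is None:
        return
    column = dataset.column_by_id(column_id)
    params.append(name + '(' + column.name_in_rdb + ')')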