Example 1
    def filter_columns(self, identifier: str, columns: List[int],
                       names: List[str],
                       datastore: Datastore) -> VizualApiResult:
        """Dataset projection operator. Returns a copy of the dataset with the
        given identifier that contains only those columns listed in columns.
        The list of names contains optional new names for the filtered columns.
        A value of None in names indicates that the name of the corresponding
        column is not changed.

        Raises ValueError if no dataset with given identifier exists or if any
        of the filter columns are unknown.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifiers for the columns in the result.
        names: list(string)
            Optional new names for filtered columns.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # The schema of the new dataset only contains the columns in the given
        # list. A column might need to be renamed.
        schema = list()
        column_mapping = list()
        for i in range(len(columns)):
            col_idx = get_index_for_column(dataset, columns[i])
            col = dataset.columns[col_idx]
            if names[i] is not None:
                if not is_valid_name(names[i]):
                    raise ValueError('invalid column name \'' + str(names[i]) +
                                     '\'')
                schema.append(
                    MimirDatasetColumn(identifier=col.identifier,
                                       name_in_dataset=names[i],
                                       name_in_rdb=names[i]))
            else:
                schema.append(col)
            column_mapping.append({
                "columns_column": col_idx,
                "columns_name": schema[-1].name
            })
        command = {"id": "projection", "columns": column_mapping}
        response = mimir.vizualScript(dataset.identifier, command)
        return VizualApiResult.from_mimir(response)
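For reference, a minimal standalone sketch of how the projection command above is assembled. The Column namedtuple and the sample schema are hypothetical stand-ins for MimirDatasetColumn and a real dataset, and the identifier-to-index resolution done by get_index_for_column is skipped:

    from collections import namedtuple

    Column = namedtuple("Column", ["identifier", "name"])
    dataset_columns = [Column(0, "NAME"), Column(1, "AGE"), Column(2, "CITY")]

    def build_projection(col_indices, names):
        # col_indices: positions of the columns to keep, in output order;
        # names: optional new names (None keeps the existing name)
        mapping = []
        for col_idx, new_name in zip(col_indices, names):
            col = dataset_columns[col_idx]
            mapping.append({
                "columns_column": col_idx,
                "columns_name": new_name if new_name is not None else col.name
            })
        return {"id": "projection", "columns": mapping}

    print(build_projection([2, 0], ["TOWN", None]))
    # {'id': 'projection', 'columns': [{'columns_column': 2, 'columns_name': 'TOWN'},
    #                                  {'columns_column': 0, 'columns_name': 'NAME'}]}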
Example 2
    def insert_column(self, identifier, position, name, datastore):
        """Insert column with given name at given position in dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column position is outside of the current schema bounds, or if
        the column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the column will be inserted
        name: string, optional
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Raise ValueError if given column name is invalid
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + str(name) + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Make sure that position is a valid column index in the new dataset
        if position < 0 or position > len(dataset.columns):
            raise ValueError('invalid column index \'' + str(position) + '\'')
        # Get identifier for new column
        col_id = dataset.max_column_id() + 1
        # Insert new column into schema
        schema = list(dataset.columns)
        new_column = MimirDatasetColumn(col_id, name, name)
        schema.insert(position, new_column)
        # Create a view for the modified schema
        col_list = []
        for col in schema:
            if col.identifier == new_column.identifier:
                # Note: As of now (April 2018) this requires Mimir to run with
                # the XNULL option. Otherwise, in some scenarios setting all
                # values in the new column to NULL may cause an exception.
                col_list.append(" CAST('' AS int) AS " + col.name_in_rdb)
            else:
                col_list.append(col.name_in_rdb)
        sql = 'SELECT ' + ','.join(
            col_list) + ' FROM ' + dataset.table_name + ';'
        view_name, dependencies = mimir.createView(dataset.table_name, sql)
        # Store updated dataset information with new identifier
        ds = datastore.register_dataset(table_name=view_name,
                                        columns=schema,
                                        row_counter=dataset.row_counter,
                                        annotations=dataset.annotations)
        return VizualApiResult(ds)
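The generated view statement is easy to check in isolation. A hedged sketch with a hypothetical table name and plain string column names in place of MimirDatasetColumn objects:

    schema = ["ID", "NAME", "AGE"]
    new_col, position = "SALARY", 2
    schema.insert(position, new_col)
    # The new column is filled with NULLs via an empty-string cast
    col_list = ["CAST('' AS int) AS " + c if c == new_col else c for c in schema]
    sql = 'SELECT ' + ','.join(col_list) + ' FROM MY_TABLE;'
    print(sql)  # SELECT ID,NAME,CAST('' AS int) AS SALARY,AGE FROM MY_TABLE;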
Example 3
    def compute_empty_dataset(self, args, context):
        """Execute empty dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        default_columns = [("''", "unnamed_column")]
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        try:
            source = "SELECT {};".format(", ".join(
                default_val + " AS " + col_name
                for default_val, col_name in default_columns))
            view_name, dependencies = mimir.createView(dict(), source)

            columns = [
                MimirDatasetColumn(identifier=col_id,
                                   name_in_dataset=col_defn[1])
                for col_defn, col_id in zip(default_columns,
                                            range(len(default_columns)))
            ]

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=1)
            provenance = ModuleProvenance(
                write={
                    ds_name:
                    DatasetDescriptor(identifier=ds.identifier,
                                      columns=ds.columns,
                                      row_count=ds.row_count)
                },
                read={}  # Need to explicitly declare a lack of dependencies.
            )
            outputs.stdout.append(
                TextOutput("Empty dataset '{}' created".format(ds_name)))
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
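The SELECT statement built in the try block can be verified standalone; the second column here is a hypothetical addition to show the general shape:

    default_columns = [("''", "unnamed_column"), ("0", "count")]
    source = "SELECT {};".format(", ".join(
        default_val + " AS " + col_name
        for default_val, col_name in default_columns))
    print(source)  # SELECT '' AS unnamed_column, 0 AS count;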
Example 4
    def filter_columns(self, identifier, columns, names, datastore):
        """Dataset projection operator. Returns a copy of the dataset with the
        given identifier that contains only those columns listed in columns.
        The list of names contains optional new names for the filtered columns.
        A value of None in names indicates that the name of the corresponding
        column is not changed.

        Raises ValueError if no dataset with given identifier exists or if any
        of the filter columns are unknown.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifiers for the columns in the result.
        names: list(string)
            Optional new names for filtered columns.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # The schema of the new dataset only contains the columns in the given
        # list. A column might need to be renamed.
        schema = list()
        col_list = []
        for i in range(len(columns)):
            col_idx = get_index_for_column(dataset, columns[i])
            col = dataset.columns[col_idx]
            if names[i] is not None:
                schema.append(
                    MimirDatasetColumn(identifier=col.identifier,
                                       name_in_dataset=names[i],
                                       name_in_rdb=col.name_in_rdb))
            else:
                schema.append(col)
            col_list.append(col.name_in_rdb)
        sql = 'SELECT ' + ','.join(
            col_list) + ' FROM ' + dataset.table_name + ';'
        view_name, dependencies = mimir.createView(dataset.table_name, sql)
        # Store updated dataset information with new identifier
        ds = datastore.register_dataset(table_name=view_name,
                                        columns=schema,
                                        row_counter=dataset.row_counter,
                                        annotations=dataset.annotations.filter(
                                            columns=columns,
                                            rows=dataset.row_ids))
        return VizualApiResult(ds)
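A hedged usage sketch. All identifiers are hypothetical: api stands for an instance of the class defining this method, datastore for a connected datastore, and it is assumed that VizualApiResult exposes the new dataset as a dataset attribute:

    result = api.filter_columns(
        identifier='DS_a1b2c3',   # existing dataset identifier
        columns=[4, 0],           # identifiers of the columns to keep, in order
        names=['TOWN', None],     # rename the first kept column; keep the other
        datastore=datastore)
    print(result.dataset.identifier)  # newly registered dataset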
Example 5
    def create_dataset(
            self,
            columns: List[DatasetColumn],
            rows: List[DatasetRow],
            properties: Optional[Dict[str, Any]] = None,
            human_readable_name: str = "Untitled Dataset",
            backend_options: Optional[List[Tuple[str, str]]] = None,
            dependencies: Optional[List[str]] = None) -> MimirDatasetHandle:
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        properties: dict(string, any), optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Get unique identifier for new dataset
        properties = {} if properties is None else properties
        backend_options = [] if backend_options is None else backend_options
        dependencies = [] if dependencies is None else dependencies
        identifier = 'DS_' + get_unique_identifier()
        columns = [
            col if isinstance(col, MimirDatasetColumn) else MimirDatasetColumn(
                identifier=col.identifier,
                name_in_dataset=col.name,
                data_type=col.data_type) for col in columns
        ]

        table_name, schema = mimir.loadDataInline(
            schema=[{
                "name": base.sanitize_column_name(col.name),
                "type": col.data_type
            } for col in columns],
            rows=[row.values for row in rows],
            result_name=identifier,
            human_readable_name=human_readable_name,
            dependencies=dependencies,
            properties=properties)

        # Insert the new dataset metadata information into the datastore
        return MimirDatasetHandle.from_mimir_result(table_name=table_name,
                                                    schema=schema,
                                                    properties=properties,
                                                    name=human_readable_name)
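A hedged usage sketch based on the docstring's types. The import path follows the docstring, and the DatasetColumn/DatasetRow constructor keywords are assumptions that mirror how the attributes are read elsewhere in this section:

    from vizier.datastore.dataset import DatasetColumn, DatasetRow

    ds = datastore.create_dataset(  # datastore: a MimirDatastore instance
        columns=[DatasetColumn(identifier=0, name='NAME', data_type='varchar'),
                 DatasetColumn(identifier=1, name='AGE', data_type='int')],
        rows=[DatasetRow(identifier='0', values=['Alice', '23']),
              DatasetRow(identifier='1', values=['Bob', '31'])],
        human_readable_name='People')
    print(ds.table_name)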
Example 6
    def empty_dataset(
        self,
        datastore: Datastore,
        filestore: Filestore,
        initial_columns: List[Tuple[str, str]] = [("''", "unnamed_column")]
    ) -> VizualApiResult:
        """Create (or load) a new dataset from a given file or Uri. It is
        guaranteed that either the file identifier or the url are not None but
        one of them will be None. The user name and password may only be given
        if an url is given.

        The resources refer to any resoures (e.g., file identifier) that have
        been generated by a previous execution of the respective task. This
        allows to associate an identifier with a downloaded file to avoid future
        downloads (unless the reload flag is True).

        Parameters
        ----------
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets
        filestore: vizier.filestore.Filestore
            Filestore to retrieve uploaded datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        assert (isinstance(datastore, MimirDatastore))
        ds = datastore.create_dataset(
            columns=[
                MimirDatasetColumn(identifier=id,
                                   name_in_dataset=col,
                                   data_type="varchar")
                for id, (default, col) in enumerate(initial_columns)
            ],
            rows=[
                DatasetRow(
                    identifier=str(id),
                    values=[default for default, col in initial_columns])
                for id in range(1, 2)
            ],
            human_readable_name="Empty Table",
        )

        return VizualApiResult(dataset=ds)
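A hedged call sketch with a hypothetical column list; each entry pairs a default value expression with a column name, and it is assumed the result exposes the dataset as a dataset attribute:

    result = api.empty_dataset(
        datastore, filestore,
        initial_columns=[("''", 'name'), ('0', 'age')])
    print([col.name for col in result.dataset.columns])  # ['name', 'age']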
Example 7
    def insert_column(self, identifier: str, position: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Insert column with given name at given position in dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column position is outside of the current schema bounds, or if
        the column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the column will be inserted
        name: string, optional
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Raise ValueError if given column name is invalid
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + str(name) + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Make sure that position is a valid column index in the new dataset
        if position < 0 or position > len(dataset.columns):
            raise ValueError('invalid column index \'' + str(position) + '\'')
        # The column insertion itself is performed by Mimir via the vizual
        # script command below; the local schema does not need to be modified.
        command = {"id": "insertColumn", "name": name, "position": position}
        response = mimir.vizualScript(dataset.identifier, command)
        return VizualApiResult.from_mimir(response)
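The bounds check deserves a note: position may equal len(columns), which appends after the last column. A standalone sketch of just that rule:

    def check_insert_position(position, num_columns):
        # valid positions run from 0 (prepend) through num_columns (append)
        if position < 0 or position > num_columns:
            raise ValueError("invalid column index '{}'".format(position))

    check_insert_position(3, 3)  # fine: append after the last column
    try:
        check_insert_position(4, 3)
    except ValueError as ex:
        print(ex)  # invalid column index '4'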
Example 8
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the Mimir package using the set of
        user-provided arguments and the current database state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        # Get dataset. Raise exception if dataset is unknown.
        ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
        dataset = context.get_dataset(ds_name)
        mimir_table_name = dataset.identifier
        # Keep track of the name of the input dataset for the provenance
        # information.
        input_ds_name = ds_name
        if command_id == cmd.MIMIR_GEOCODE:
            geocoder = arguments.get_value(cmd.PARA_GEOCODER)
            # Add columns for LATITUDE and LONGITUDE
            column_counter = dataset.max_column_id() + 1
            cname_lat = dataset.get_unique_name('LATITUDE')
            cname_lon = dataset.get_unique_name('LONGITUDE')
            dataset.columns.append(
                MimirDatasetColumn(
                    identifier=column_counter,
                    name_in_dataset=cname_lat,
                    data_type=DATATYPE_REAL
                )
            )
            dataset.columns.append(
                MimirDatasetColumn(
                    identifier=column_counter + 1,
                    name_in_dataset=cname_lon,
                    data_type=DATATYPE_REAL
                )
            )
            house = arguments.get_value(cmd.PARA_HOUSE_NUMBER, raise_error=False, default_value=None)
            street = arguments.get_value(cmd.PARA_STREET, raise_error=False, default_value=None)
            city = arguments.get_value(cmd.PARA_CITY, raise_error=False, default_value=None)
            state = arguments.get_value(cmd.PARA_STATE, raise_error=False, default_value=None)

            params = {
                'houseColumn': dataset.column_by_id(house).name_in_rdb   if house  is not None and house  != '' else None,
                'streetColumn': dataset.column_by_id(street).name_in_rdb if street is not None and street != '' else None,
                'cityColumn': dataset.column_by_id(city).name_in_rdb     if city   is not None and city   != '' else None,
                'stateColumn': dataset.column_by_id(state).name_in_rdb   if state  is not None and state  != '' else None,
                'geocoder': geocoder#,
                #'latitudeColumn': Option[String],
                #'longitudeColumn': Option[String],
                #'cacheCode': Option[String]
            }
        elif command_id == cmd.MIMIR_KEY_REPAIR:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = { "key" : column.name_in_rdb }
        elif command_id == cmd.MIMIR_MISSING_KEY:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = column.name_in_rdb
            # Set MISSING ONLY to FALSE to ensure that all rows are returned
            #params += ['MISSING_ONLY(FALSE)']
            # Need to run this lens twice in order to generate row ids for
            # any potential new tuple
        elif command_id == cmd.MIMIR_MISSING_VALUE:
            params = list()
            for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
                f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
                param = f_col.name_in_rdb
                col_constraint = col.get_value(
                    cmd.PARA_COLUMNS_CONSTRAINT,
                    raise_error=False
                )
                if col_constraint == '':
                    col_constraint = None
                #if not col_constraint is None:
                #    param = param + ' ' + str(col_constraint).replace("'", "\'\'").replace("OR", ") OR (")
                #param = '\'(' + param + ')\''
                params.append(param)
        elif command_id == cmd.MIMIR_PICKER:
            # Compute the input columns
            inputs = []
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_col = col.get_value(cmd.PARA_PICKFROM)
                column = dataset.column_by_id(c_col)
                inputs.append(column.name_in_rdb)

            # Compute the output column
            output = arguments.get_value(cmd.PARA_PICKAS, default_value = inputs[0])
            if output == "":
                output = inputs[0]
            else:
                output = dataset.get_unique_name(output.strip().upper())

            # Compute the final parameter list
            params = {
                "inputs" : inputs,
                "output" : output
            }
        elif command_id == cmd.MIMIR_TYPE_INFERENCE:
            params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
        elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
            dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
            params = []
            if dseModel is not None:
                params = [str(dseModel)]
        elif command_id == cmd.MIMIR_COMMENT:
            commentsParams = []
            for idx, comment in enumerate(arguments.get_value(cmd.PARA_COMMENTS)):
                commentParam = {}
                
                # If target is defined, it is the column that we're trying to annotate
                # If unset (or empty), it means we're annotating the row.
                column_id = comment.get_value(cmd.PARA_EXPRESSION, None)

                if column_id is not None:
                    column = dataset.column_by_id(column_id)
                    commentParam['target'] = column.name_in_rdb

                # The comment
                commentParam['comment'] = comment.get_value(cmd.PARA_COMMENT)

                # If rowid is defined, it is the row that we're trying to annotate.  
                # If unset (or empty), it means that we're annotating all rows
                rowid = comment.get_value(cmd.PARA_ROWID, None) 
                if (rowid is not None) and (rowid != ""):
                    # If rowid begins with '=', it's a formula
                    if rowid[0] == '=':
                        commentParam['condition'] = rowid[1:]
                    else:
                        commentParam['rows'] = [ int(rowid) ]
                
                #TODO: handle result columns
                commentsParams.append(commentParam)
            params = {'comments' : commentsParams}
        elif command_id == cmd.MIMIR_PIVOT:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = {
                "target" : column.name_in_rdb,
                "keys" : [],
                "values" : []
            }
            for col_arg in arguments.get_value(cmd.PARA_VALUES):
                col = dataset.column_by_id(col_arg.get_value(cmd.PARA_VALUE))
                params["values"].append(col.name_in_rdb)
            for col_arg in arguments.get_value(cmd.PARA_KEYS, default_value=[]):
                col = dataset.column_by_id(col_arg.get_value(cmd.PARA_KEY))
                params["keys"].append(col.name_in_rdb)
            if len(params["values"]) < 1:
                raise ValueError("Need at least one value column")
            # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
        elif command_id == cmd.MIMIR_SHRED:
            params = { 
                "keepOriginalColumns" : arguments.get_value(cmd.PARA_KEEP_ORIGINAL)
            }
            shreds = []
            global_input_col = dataset.column_by_id(arguments.get_value(cmd.PARA_COLUMN_NAME))
            for (idx, shred) in enumerate(arguments.get_value(cmd.PARA_COLUMNS)):
                output_col = shred.get_value(cmd.PARA_OUTPUT_COLUMN)
                if output_col is None:
                    output_col = "{}_{}".format(global_input_col,idx)
                config = {}
                shred_type = shred.get_value(cmd.PARA_TYPE)
                expression = shred.get_value(cmd.PARA_EXPRESSION)
                group = shred.get_value(cmd.PARA_INDEX)
                if shred_type == "pattern":
                    config["regexp"] = expression
                    config["group"] = int(group)
                elif shred_type == "field":
                    config["separator"] = expression
                    config["field"] = int(group)
                elif shred_type == "explode":
                    config["separator"] = expression
                elif shred_type == "pass":
                    pass
                elif shred_type == "substring":
                    range_parts = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
                    # print(range_parts)

                    # Mimir expects ranges to be given from start (inclusive) to end (exclusive)
                    # in a zero-based numbering scheme.

                    # Vizier expects input ranges to be given in a one-based numbering scheme.

                    # Convert to this format

                    if range_parts is None:
                        raise ValueError("Substring requires a range of the form '10', '10-11', or '10+1', but got '{}'".format(expression))
                    config["start"] = int(range_parts.group(1))-1 # Convert 1-based numbering to 0-based
                    if range_parts.group(2) is None:
                        config["end"] = config["start"] + 1 # if only one character, split one character
                    elif range_parts.group(3) == "+":
                        config["end"] = config["start"] + int(range_parts.group(4)) # start + length
                    elif range_parts.group(3) == "-":
                        config["end"] = int(range_parts.group(4)) # Explicit end, 1-based -> 0-based and exclusive cancel out
                    else:
                        raise ValueError("Invalid expression '{}' in substring shredder".format(expression))
                    # print("Shredding {} <- {} -- {}".format(output_col,config["start"],config["end"]))
                else:
                    raise ValueError("Invalid Shredding Type '{}'".format(shred_type))

                shreds.append({
                    **config,
                    "op" : shred_type,
                    "input" : global_input_col.name_in_rdb,
                    "output" : output_col,
                })
            params["shreds"] = shreds
            # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
        else:
            raise ValueError("Unknown Mimir lens '{}'".format(command_id))
        # Create Mimir lens
       
        mimir_lens_response = mimir.createLens(
            mimir_table_name,
            params,
            command_id,
            arguments.get_value(cmd.PARA_MATERIALIZE_INPUT, default_value=True),
            human_readable_name = ds_name.upper()
        )
        lens_name = mimir_lens_response['name']
        lens_schema = mimir_lens_response['schema']
        lens_properties = mimir_lens_response['properties']

        ds = MimirDatasetHandle.from_mimir_result(lens_name, lens_schema, lens_properties, ds_name)

        if command_id in LENSES_THAT_SHOULD_NOT_DISPLAY_TABLES:
            print_dataset_schema(outputs, ds_name, ds.columns)
        else:
            from vizier.api.webservice import server
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10
            )
            outputs.stdout.append(DatasetOutput(ds_output))
        
        # Return task result
        return ExecResult(
            outputs=outputs,
            provenance=ModuleProvenance(
                read={input_ds_name: dataset.identifier},
                write={ds_name: DatasetDescriptor(
                    identifier=ds.identifier,
                    name=ds_name,
                    columns=ds.columns
                )}
            )
        )
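The substring shredder's range handling is self-contained enough to test in isolation. This sketch extracts the conversion from Vizier's 1-based inclusive ranges to Mimir's 0-based, end-exclusive ones:

    import re

    def substring_range(expression):
        # Accepts '10' (one character), '10-11' (explicit end), '10+1' (length)
        m = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
        if m is None:
            raise ValueError("bad range '{}'".format(expression))
        start = int(m.group(1)) - 1        # 1-based -> 0-based
        if m.group(2) is None:
            end = start + 1                # single character
        elif m.group(3) == "+":
            end = start + int(m.group(4))  # start + length
        else:
            end = int(m.group(4))          # 1-based inclusive == 0-based exclusive
        return start, end

    print(substring_range("10"))     # (9, 10)
    print(substring_range("10-11"))  # (9, 11)
    print(substring_range("10+1"))   # (9, 10)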
Example 9
    def rename_column(self, identifier, column_id, name, datastore):
        """Rename column in a given dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column is unknown, or if the given column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Raise ValueError if given column name is invalid
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + str(name) + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Get the specified column that is to be renamed and set the column name
        # to the new name
        columns = list()
        schema = list(dataset.columns)
        colIndex = get_index_for_column(dataset, column_id)
        col = schema[colIndex]
        # No need to do anything if the name hasn't changed
        if col.name.lower() != name.lower():

            sql = 'SELECT * FROM ' + dataset.table_name
            mimirSchema = mimir.getSchema(sql)
            # Create list of dataset columns
            colSql = ''
            idx = 0
            for col in mimirSchema:
                col_id = len(columns)
                name_in_dataset = sanitize_column_name(col['name'].upper())
                name_in_rdb = sanitize_column_name(col['name'].upper())
                col = MimirDatasetColumn(identifier=col_id,
                                         name_in_dataset=name_in_dataset,
                                         name_in_rdb=name_in_rdb)
                # Rename the target column; keep all others. This also handles
                # the case where the renamed column is the first one (idx == 0).
                if idx == colIndex:
                    select_expr = name_in_dataset + ' AS ' + name
                    col.name = name
                    col.name_in_rdb = name
                else:
                    select_expr = name_in_dataset + ' AS ' + name_in_rdb
                if colSql == '':
                    colSql = select_expr
                else:
                    colSql = colSql + ', ' + select_expr
                columns.append(col)
                idx = idx + 1
            # Create view for loaded dataset
            sql = 'SELECT ' + colSql + ' FROM {{input}};'
            view_name, dependencies = mimir.createView(dataset.table_name, sql)
            # There are no changes to the underlying database. We only need to
            # change the column information in the dataset schema.
            # Store updated dataset to get new identifier
            ds = datastore.register_dataset(table_name=view_name,
                                            columns=columns,
                                            row_counter=dataset.row_counter,
                                            annotations=dataset.annotations)
            return VizualApiResult(ds)
        else:
            return VizualApiResult(dataset)
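The view statement reduces to a simple aliased projection. A standalone sketch with hypothetical names, covering the case where the renamed column comes first:

    columns = ["ID", "NAME", "CITY"]
    col_index, new_name = 0, "PERSON_ID"  # rename the first column
    colSql = ', '.join(
        c + ' AS ' + (new_name if i == col_index else c)
        for i, c in enumerate(columns))
    sql = 'SELECT ' + colSql + ' FROM {{input}};'
    print(sql)  # SELECT ID AS PERSON_ID, NAME AS NAME, CITY AS CITY FROM {{input}};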
Example 10
    def execute_query(self, args, context):
        """Execute a SQL query in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the SQL source code for this cell
        source = args.get_value(cmd.PARA_SQL_SOURCE)
        if not source.endswith(';'):
            source = source + ';'
        ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
        # Get mapping of datasets in the context to their respective table
        # name in the Mimir backend
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o]
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.table_name
        # Module outputs
        outputs = ModuleOutputs()
        try:
            # Create the view from the SQL source
            view_name, dependencies = mimir.createView(mimir_table_names,
                                                       source)
            sql = 'SELECT * FROM ' + view_name
            mimirSchema = mimir.getSchema(sql)

            columns = list()

            for col in mimirSchema:
                col_id = len(columns)
                name_in_dataset = col['name']
                col = MimirDatasetColumn(identifier=col_id,
                                         name_in_dataset=name_in_dataset)
                columns.append(col)

            row_count = mimir.countRows(view_name)

            provenance = None
            if ds_name is None or ds_name == '':
                ds_name = "TEMPORARY_RESULT"

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=row_count)
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            ds_output['name'] = ds_name

            dependencies = dict((dep_name.lower(),
                                 context.datasets.get(dep_name.lower(), None))
                                for dep_name in dependencies)
            # print("---- SQL DATASETS ----\n{}\n{}".format(context.datasets, dependencies))

            outputs.stdout.append(DatasetOutput(ds_output))
            provenance = ModuleProvenance(write={
                ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  columns=ds.columns,
                                  row_count=ds.row_count)
            },
                                          read=dependencies)
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
        # Return execution result
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
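The dependency resolution near the end maps each table name reported by mimir.createView back to a dataset identifier known to the context; unknown names resolve to None. A standalone sketch with hypothetical names:

    context_datasets = {'people': 'DS_1', 'orders': 'DS_2'}  # name -> identifier
    dependencies = ['PEOPLE', 'unknown_table']
    resolved = dict((dep_name.lower(),
                     context_datasets.get(dep_name.lower(), None))
                    for dep_name in dependencies)
    print(resolved)  # {'people': 'DS_1', 'unknown_table': None}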
Example 11
    def create_dataset(self,
                       columns,
                       rows,
                       human_readable_name=None,
                       annotations=None,
                       backend_options=[],
                       dependencies=[]):
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Get unique identifier for new dataset
        identifier = 'DS_' + get_unique_identifier()
        # Write rows to temporary file in CSV format
        tmp_file = os.path.abspath(self.base_path + identifier)
        # Create a list of columns that contains the user-visible column name
        # and the sanitized name used in the database
        db_columns = list()
        colSql = ''
        for col in columns:
            db_name = base.sanitize_column_name(col.name)
            db_columns.append(
                MimirDatasetColumn(identifier=col.identifier,
                                   name_in_dataset=col.name,
                                   name_in_rdb=db_name))
            if colSql == '':
                colSql = db_name + ' AS ' + db_name
            else:
                colSql = colSql + ', ' + db_name + ' AS ' + db_name
        # Create CSV file for load
        with open(tmp_file, 'w') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_MINIMAL)
            writer.writerow([col.name_in_rdb for col in db_columns])
            for row in rows:
                record = helper.encode_values(row.values)
                writer.writerow(record)
        # Load the CSV file using Mimir's loadDataSource method.
        table_name = mimir.loadDataSource(
            tmp_file,
            True,
            True,
            human_readable_name=human_readable_name,
            backend_options=backend_options,
            dependencies=dependencies)
        os.remove(tmp_file)
        sql = 'SELECT ' + colSql + ' FROM {{input}};'
        view_name, dependencies = mimir.createView(table_name, sql)
        # Get number of rows in the view that was created in the backend
        row_count = mimir.countRows(view_name)

        # Insert the new dataset metadata information into the datastore
        return self.register_dataset(table_name=view_name,
                                     columns=db_columns,
                                     row_counter=row_count,
                                     annotations=annotations)
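The CSV staging step can be sketched standalone. An in-memory buffer stands in for the temporary file, and helper.encode_values is replaced by a pass-through since its behavior is not shown here:

    import csv
    import io

    db_column_names = ['NAME', 'AGE']
    rows = [['Alice', 23], ['Bob', None]]  # None is written as an empty field
    buf = io.StringIO()
    writer = csv.writer(buf, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(db_column_names)
    for row in rows:
        writer.writerow(row)  # the real code first runs helper.encode_values(row.values)
    print(buf.getvalue())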
Example 12
    def load_dataset(self,
                     f_handle=None,
                     url=None,
                     detect_headers=True,
                     infer_types=True,
                     load_format='csv',
                     options=[],
                     human_readable_name=None):
        """Create a new dataset from a given file or url. Expects that either
        the file handle or the url are not None. Raises ValueError if both are
        None or not None.


        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle, optional
            handle for an uploaded file on the associated file server.
        url: string, optional
            Url for the file source
        detect_headers: bool, optional
            Detect column names in loaded file if True
        infer_types: bool, optional
            Infer column types for loaded dataset if True
        load_format: string, optional
            Format identifier
        options: list, optional
            Additional options for Mimir's load command
        human_readable_name: string, optional
            Optional human readable name for the resulting table

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        if f_handle is None and url is None:
            raise ValueError('no load source given')
        elif f_handle is not None and url is not None:
            raise ValueError('too many load sources given')
        elif url is None:
            # os.path.abspath((r'%s' % os.getcwd().replace('\\','/') ) + '/' + f_handle.filepath)
            abspath = f_handle.filepath
        else:
            abspath = url
        # Load dataset into Mimir
        init_load_name = mimir.loadDataSource(abspath, infer_types,
                                              detect_headers, load_format,
                                              human_readable_name, options)
        # Retrieve schema information for the created dataset
        sql = 'SELECT * FROM ' + init_load_name
        mimirSchema = mimir.getSchema(sql)
        # Create list of dataset columns
        columns = list()
        colSql = ''
        for col in mimirSchema:
            col_id = len(columns)
            name_in_dataset = base.sanitize_column_name(col['name'].upper())
            name_in_rdb = base.sanitize_column_name(col['name'].upper())
            col = MimirDatasetColumn(identifier=col_id,
                                     name_in_dataset=name_in_dataset,
                                     name_in_rdb=name_in_rdb)
            if colSql == '':
                colSql = name_in_dataset + ' AS ' + name_in_rdb
            else:
                colSql = colSql + ', ' + name_in_dataset + ' AS ' + name_in_rdb
            columns.append(col)
        # Create view for loaded dataset
        sql = 'SELECT ' + colSql + ' FROM {{input}};'
        view_name, dependencies = mimir.createView(init_load_name, sql)
        # TODO: this is a hack to speed up this step a bit.
        #  we get the first row id and the count and take a range;
        #  this is fragile and should be made better
        #
        # NOTE: This does not work because ROW_ID appears to be a string.
        # Thus, sorting not necessarily returns the smallest integer value
        # first.
        #
        row_count = mimir.countRows(view_name)

        return self.register_dataset(table_name=view_name,
                                     columns=columns,
                                     row_counter=row_count)
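A hedged usage sketch; exactly one of f_handle and url may be given, all names here are hypothetical, and it is assumed the returned handle exposes table_name and columns:

    ds = datastore.load_dataset(
        url='http://example.com/people.csv',
        detect_headers=True,
        infer_types=True,
        load_format='csv')
    print(ds.table_name, [col.name for col in ds.columns])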
Example 13
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the Mimir package using the set of
        user-provided arguments and the current database state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        store_as_dataset = None
        update_rows = False
        lens_annotations = []
        # Get dataset. Raise exception if dataset is unknown.
        ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
        dataset = context.get_dataset(ds_name)
        mimir_table_name = dataset.table_name
        # Keep track of the name of the input dataset for the provenance
        # information.
        input_ds_name = ds_name
        if command_id == cmd.MIMIR_DOMAIN:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
        elif command_id == cmd.MIMIR_GEOCODE:
            geocoder = arguments.get_value(cmd.PARA_GEOCODER)
            params = ['GEOCODER(' + geocoder + ')']
            add_column_parameter(params, 'HOUSE_NUMBER', dataset, arguments,
                                 cmd.PARA_HOUSE_NUMBER)
            add_column_parameter(params, 'STREET', dataset, arguments,
                                 cmd.PARA_STREET)
            add_column_parameter(params, 'CITY', dataset, arguments,
                                 cmd.PARA_CITY)
            add_column_parameter(params, 'STATE', dataset, arguments,
                                 cmd.PARA_STATE)
            # Add columns for LATITUDE and LONGITUDE
            column_counter = dataset.max_column_id() + 1
            cname_lat = dataset.get_unique_name('LATITUDE')
            cname_lon = dataset.get_unique_name('LONGITUDE')
            dataset.columns.append(
                MimirDatasetColumn(identifier=column_counter,
                                   name_in_dataset=cname_lat,
                                   data_type=DATATYPE_REAL))
            dataset.columns.append(
                MimirDatasetColumn(identifier=column_counter + 1,
                                   name_in_dataset=cname_lon,
                                   data_type=DATATYPE_REAL))
            params.append('RESULT_COLUMNS(' + cname_lat + ',' + cname_lon +
                          ')')
        elif command_id == cmd.MIMIR_KEY_REPAIR:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
            update_rows = True
        elif command_id == cmd.MIMIR_MISSING_KEY:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
            # Set MISSING ONLY to FALSE to ensure that all rows are returned
            params += ['MISSING_ONLY(FALSE)']
            # Need to run this lens twice in order to generate row ids for
            # any potential new tuple
            mimir_lens_response = mimir.createLens(
                dataset.table_name, params, command_id,
                arguments.get_value(cmd.PARA_MATERIALIZE_INPUT,
                                    default_value=True))
            (mimir_table_name,
             lens_annotations) = (mimir_lens_response.lensName(),
                                  mimir_lens_response.annotations())
            params = [ROW_ID, 'MISSING_ONLY(FALSE)']
            update_rows = True
        elif command_id == cmd.MIMIR_MISSING_VALUE:
            params = list()
            for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
                f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
                param = f_col.name_in_rdb
                col_constraint = col.get_value(cmd.PARA_COLUMNS_CONSTRAINT,
                                               raise_error=False)
                if col_constraint == '':
                    col_constraint = None
                if col_constraint is not None:
                    param = param + ' ' + str(col_constraint).replace(
                        "'", "\'\'").replace("OR", ") OR (")
                param = '\'(' + param + ')\''
                params.append(param)
        elif command_id == cmd.MIMIR_PICKER:
            pick_from = list()
            column_names = list()
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_col = col.get_value(cmd.PARA_PICKFROM)
                column = dataset.column_by_id(c_col)
                pick_from.append(column.name_in_rdb)
                column_names.append(column.name.upper().replace(' ', '_'))
            # Add result column to dataset schema
            pick_as = arguments.get_value(cmd.PARA_PICKAS,
                                          default_value='PICK_ONE_' +
                                          '_'.join(column_names))
            pick_as = dataset.get_unique_name(pick_as.strip().upper())
            dataset.columns.append(
                MimirDatasetColumn(identifier=dataset.max_column_id() + 1,
                                   name_in_dataset=pick_as))
            params = ['PICK_FROM(' + ','.join(pick_from) + ')']
            params.append('PICK_AS(' + pick_as + ')')
        elif command_id == cmd.MIMIR_SCHEMA_MATCHING:
            store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
            if store_as_dataset in context.datasets:
                raise ValueError('dataset \'' + store_as_dataset + '\' exists')
            if not is_valid_name(store_as_dataset):
                raise ValueError('invalid dataset name \'' + store_as_dataset +
                                 '\'')
            column_names = list()
            params = ['\'' + ROW_ID + ' int\'']
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_name = col.get_value(pckg.PARA_COLUMN)
                c_type = col.get_value(cmd.PARA_TYPE)
                params.append('\'' + c_name + ' ' + c_type + '\'')
                column_names.append(c_name)
        elif command_id == cmd.MIMIR_TYPE_INFERENCE:
            params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
        elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
            dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
            params = []
            if dseModel is not None:
                params = [str(dseModel)]
        elif command_id == cmd.MIMIR_COMMENT:
            params = []
            for comment in arguments.get_value(cmd.PARA_COMMENTS):
                c_expr = comment.get_value(cmd.PARA_EXPRESSION)
                c_cmnt = comment.get_value(cmd.PARA_COMMENT)
                c_rowid = comment.get_value(cmd.PARA_ROWID)
                if c_rowid is None:
                    params.append('COMMENT(' + c_expr + ', \'' + c_cmnt +
                                  '\') ')
                else:
                    params.append('COMMENT(' + c_expr + ', \'' + c_cmnt +
                                  '\', \'' + c_rowid + '\') ')
            result_cols = []
            for col in arguments.get_value(cmd.PARA_RESULT_COLUMNS):
                c_name = col.get_value(pckg.PARA_COLUMN)
                result_cols.append(c_name)
            if len(result_cols) > 0:
                params.append('RESULT_COLUMNS(' + ','.join(result_cols) + ')')
        else:
            raise ValueError('unknown Mimir lens \'' + str(command_id) + '\'')
        # Create Mimir lens
        if command_id in [
                cmd.MIMIR_SCHEMA_MATCHING, cmd.MIMIR_TYPE_INFERENCE,
                cmd.MIMIR_SHAPE_DETECTOR
        ]:
            lens_name = mimir.createAdaptiveSchema(mimir_table_name, params,
                                                   command_id.upper())
        else:
            mimir_lens_response = mimir.createLens(
                mimir_table_name,
                params,
                command_id.upper(),
                arguments.get_value(cmd.PARA_MATERIALIZE_INPUT,
                                    default_value=True),
                human_readable_name=ds_name.upper())
            (lens_name,
             lens_annotations) = (mimir_lens_response['lensName'],
                                  mimir_lens_response['annotations'])
        # Create a view including missing row ids for the result of a
        # MISSING KEY lens
        if command_id == cmd.MIMIR_MISSING_KEY:
            lens_name, row_counter = create_missing_key_view(
                dataset, lens_name, column)
            dataset.row_counter = row_counter
        # Create datastore entry for lens.
        if store_as_dataset is not None:
            columns = list()
            for c_name in column_names:
                col_id = len(columns)
                columns.append(
                    MimirDatasetColumn(identifier=col_id,
                                       name_in_dataset=c_name))
            ds = context.datastore.register_dataset(
                table_name=lens_name,
                columns=columns,
                annotations=dataset.annotations)
            ds_name = store_as_dataset
        else:
            ds = context.datastore.register_dataset(
                table_name=lens_name,
                columns=dataset.columns,
                annotations=dataset.annotations)
        # Add dataset schema and returned annotations to output
        if command_id in [
                cmd.MIMIR_SCHEMA_MATCHING, cmd.MIMIR_TYPE_INFERENCE,
                cmd.MIMIR_SHAPE_DETECTOR
        ]:
            print_dataset_schema(outputs, ds_name, ds.columns)
        else:
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            outputs.stdout.append(DatasetOutput(ds_output))

        print_lens_annotations(outputs, lens_annotations)
        dsd = DatasetDescriptor(identifier=ds.identifier,
                                columns=ds.columns,
                                row_count=ds.row_count)
        result_resources = dict()
        result_resources[base.RESOURCE_DATASET] = ds.identifier

        # Return task result
        return ExecResult(outputs=outputs,
                          provenance=ModuleProvenance(
                              read={input_ds_name: dataset.identifier},
                              write={ds_name: dsd},
                              resources=result_resources))
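The MISSING_VALUE parameter quoting above is easy to check in isolation; the column name and constraint below are hypothetical:

    col_name = 'AGE'
    col_constraint = '> 0 OR < 120'
    param = col_name
    if col_constraint is not None:
        param = param + ' ' + str(col_constraint).replace(
            "'", "''").replace("OR", ") OR (")
    param = "'(" + param + ")'"
    print(param)  # '(AGE > 0 ) OR ( < 120)'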