示例#1
0
    def move_column(self, identifier: str, column_id: int, position: int,
                    datastore: Datastore) -> VizualApiResult:
        """Relocate a column to a new index position within a dataset.

        If the column already sits at the requested position the existing
        dataset is returned untouched. Otherwise the schema and the values of
        every row are rearranged and the result is stored as a new dataset.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        position: int
            Target position for the column. Accepted values range from 0 to
            the number of columns in the dataset (inclusive).
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the dataset is unknown, the target position is out of range,
            or the dataset has no column with the given identifier.
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # The position is validated before the column is looked up, so an
        # out-of-range position is reported even for an unknown column.
        if not 0 <= position <= len(dataset.columns):
            raise ValueError("invalid target position '" + str(position) + "'")
        current = dataset.get_index(column_id)
        if current is None:
            raise ValueError(
                "unknown column identifier '" + str(column_id) + "'"
            )
        # Nothing to move: hand back the dataset as it is.
        if current == position:
            return VizualApiResult(dataset)
        # Rearrange a copy of the schema ...
        schema = list(dataset.columns)
        moved_column = schema.pop(current)
        schema.insert(position, moved_column)
        # ... and apply the same pop/insert to the values of each row.
        rows = dataset.fetch_rows()
        for row in rows:
            cell = row.values.pop(current)
            row.values.insert(position, cell)
        # Persist the rearranged data; this yields a new dataset identifier.
        result = datastore.create_dataset(columns=schema,
                                          rows=rows,
                                          properties={})
        return VizualApiResult(result)
示例#2
0
    def rename_column(self, identifier: str, column_id: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Give a column of a dataset a new name.

        The new name is compared to the current one case-insensitively. When
        they match, the existing dataset is returned without creating a new
        one.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the column name is invalid, the dataset is unknown, or the
            dataset has no column with the given identifier.
        """
        # The name is validated first, before the datastore is touched.
        if not is_valid_name(name):
            raise ValueError("invalid column name '" + name + "'")
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        index = dataset.get_index(column_id)
        if index is None:
            raise ValueError(
                "unknown column identifier '" + str(column_id) + "'"
            )
        # A name that differs only in letter case counts as unchanged.
        if dataset.columns[index].name.lower() == name.lower():
            return VizualApiResult(dataset)
        # Swap in a renamed copy of the column; identifier and data type are
        # carried over from the existing column.
        schema = list(dataset.columns)
        previous = schema[index]
        schema[index] = DatasetColumn(identifier=previous.identifier,
                                      name=name,
                                      data_type=previous.data_type)
        # Persist the new schema; this yields a new dataset identifier.
        result = datastore.create_dataset(columns=schema,
                                          rows=dataset.fetch_rows(),
                                          properties={})
        return VizualApiResult(result)
示例#3
0
    def delete_row(self, identifier: str, row_index: str,
                   datastore: Datastore) -> VizualApiResult:
        """Remove the row at a given index position from a dataset.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        row_index: string or int
            Index position of the row to delete. The value is converted with
            int() before use.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the dataset is unknown or the row index lies outside of
            [0, row_count).
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # Convert once; the error message still shows the value as given.
        index = int(row_index)
        if not 0 <= index < dataset.row_count:
            raise ValueError("invalid row index '" + str(row_index) + "'")
        remaining = dataset.fetch_rows()
        del remaining[index]
        # Persist the shortened row list; this yields a new dataset
        # identifier. The schema is passed on unchanged.
        result = datastore.create_dataset(columns=dataset.columns,
                                          rows=remaining,
                                          properties={})
        return VizualApiResult(result)
示例#4
0
    def filter_columns(self, identifier: str, columns: List[int],
                       names: List[str],
                       datastore: Datastore) -> VizualApiResult:
        """Project a dataset onto a list of columns, optionally renaming them.

        The result contains exactly the columns listed in ``columns``, in that
        order. ``names`` is read at the same index as ``columns``; an entry of
        None keeps the existing column, any other entry becomes the new column
        name.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifier for columns in the result.
        names: list(string)
            Optional new names for filtered columns.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the dataset is unknown or any of the listed columns is unknown.
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # Build the projected schema and remember, for each output column,
        # the index of its values in the source rows.
        schema = []
        positions = []
        for i, column_id in enumerate(columns):
            position = dataset.get_index(column_id)
            if position is None:
                raise ValueError(
                    "unknown column identifier '" + str(column_id) + "'"
                )
            source = dataset.columns[position]
            new_name = names[i]
            if new_name is None:
                schema.append(source)
            else:
                schema.append(
                    DatasetColumn(identifier=source.identifier,
                                  name=new_name,
                                  data_type=source.data_type))
            positions.append(position)
        # Project every row onto the selected positions; row identifiers are
        # carried over.
        rows = [
            DatasetRow(identifier=row.identifier,
                       values=[row.values[p] for p in positions])
            for row in dataset.fetch_rows()
        ]
        # Persist the projection; this yields a new dataset identifier.
        result = datastore.create_dataset(columns=schema,
                                          rows=rows,
                                          properties={})
        return VizualApiResult(result)
示例#5
0
    def update_cell(self, identifier, column_id, row_id, value, datastore):
        """Update a single cell of a dataset by creating a Mimir view.

        The view selects every column of the dataset's table as is, except
        for the targeted column, which is wrapped in a CASE expression that
        substitutes the new value for the row with the given identifier.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier
        column_id: int
            Unique column identifier for updated cell
        row_id: int
            Unique row identifier
        value: string
            New cell value
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If no dataset with the given identifier exists.
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # Index of the column that holds the cell to update.
        target = get_index_for_column(dataset, column_id)
        # NOTE(review): the row identifier is not validated here.
        select_terms = []
        for position, column in enumerate(dataset.columns):
            if position != target:
                # Untouched columns are selected by their database name.
                select_terms.append(column.name_in_rdb)
                continue
            try:
                then_else = (column.to_sql_value(value) + ' ELSE ' +
                             column.name_in_rdb + ' END ')
            except ValueError:
                # The column rejected the value: fall back to a quoted string
                # and cast the existing values to varchar.
                # NOTE(review): value is embedded without escaping - confirm
                # how quotes inside value should be handled.
                then_else = ("'" + str(value) +
                             "' ELSE CAST({{input}}." + column.name_in_rdb +
                             ' AS varchar) END ')
            row_literal = MIMIR_ROWID_COL.to_sql_value(row_id)
            select_terms.append('CASE WHEN ' + ROW_ID + ' = ' + row_literal +
                                ' THEN ' + then_else + 'AS ' +
                                column.name_in_rdb)
        sql = ('SELECT ' + ','.join(select_terms) + ' FROM ' +
               dataset.table_name + ';')
        # createView returns a pair; only the view name is used here.
        view_name, _ = mimir.createView(dataset.table_name, sql)
        # Register the view as a new dataset; schema, row counter and
        # annotations are taken over from the source dataset.
        result = datastore.register_dataset(table_name=view_name,
                                            columns=dataset.columns,
                                            row_counter=dataset.row_counter,
                                            annotations=dataset.annotations)
        return VizualApiResult(result)
示例#6
0
    def move_column(self, identifier, column_id, position, datastore):
        """Relocate a column to a new index position within a dataset.

        Only the dataset schema is reordered; the underlying table is reused
        as is. If the column already sits at the requested position the
        existing dataset is returned untouched.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        position: int
            Target position for the column. Accepted values range from 0 to
            the number of columns in the dataset (inclusive).
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the dataset is unknown or the target position is out of range.
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        if not 0 <= position <= len(dataset.columns):
            raise ValueError("invalid target position '" + str(position) + "'")
        # Current index position of the column that is being moved.
        current = get_index_for_column(dataset, column_id)
        # Nothing to move: hand back the dataset as it is.
        if current == position:
            return VizualApiResult(dataset)
        # The table itself is not changed, only the order of the columns in
        # a copy of the schema.
        reordered = list(dataset.columns)
        moved_column = reordered.pop(current)
        reordered.insert(position, moved_column)
        # Register the same table under the new schema to get a new dataset
        # identifier.
        result = datastore.register_dataset(table_name=dataset.table_name,
                                            columns=reordered,
                                            row_counter=dataset.row_counter,
                                            annotations=dataset.annotations)
        return VizualApiResult(result)
示例#7
0
    def sort_dataset(self, identifier: str, columns: List[int],
                     reversed: List[bool],
                     datastore: Datastore) -> VizualApiResult:
        """Sort the rows of a dataset on one or more columns.

        The order-by specification is a pair of parallel lists: ``columns``
        holds the identifiers of the sort columns (most significant first)
        and ``reversed`` holds one flag per column indicating whether the
        sort order for that column is reversed.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifier for sort columns.
        reversed: list(bool)
            Flags indicating whether the sort order of the corresponding
            column is reversed.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the dataset is unknown or any of the sort columns is unknown.
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        rows = dataset.fetch_rows()
        # One sort pass per sort column, walking the order-by list from its
        # last entry to its first. This relies on each pass keeping the
        # relative order of rows that compare equal (as list.sort does).
        for pos in range(len(columns) - 1, -1, -1):
            col_id = columns[pos]
            col_idx = dataset.get_index(col_id)
            descending = reversed[pos]
            if col_idx is None:
                raise ValueError(
                    "unknown column identifier '" + str(col_id) + "'"
                )
            # The cast only informs the type checker that col_idx is an int
            # inside the lambda.
            rows.sort(key=lambda row: row.values[cast(int, col_idx)],
                      reverse=descending)
        # Persist the sorted rows; this yields a new dataset identifier.
        result = datastore.create_dataset(columns=dataset.columns,
                                          rows=rows,
                                          properties={})
        return VizualApiResult(result)
示例#8
0
    def insert_column(self, identifier, position, name, datastore):
        """Add a new column to a dataset at a given index position.

        The new column gets the next free column identifier. A Mimir view is
        created over the dataset's table that selects the existing columns
        and adds the new column as ``CAST('' AS int)``.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the column will be inserted. Accepted
            values range from 0 to the number of columns (inclusive).
        name: string, optional
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the column name is invalid, the dataset is unknown, or the
            position is out of range.
        """
        # The name is validated first, before the datastore is touched.
        if not is_valid_name(name):
            raise ValueError("invalid column name '" + str(name) + "'")
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        if not 0 <= position <= len(dataset.columns):
            raise ValueError("invalid column index '" + str(position) + "'")
        # The new column takes the next identifier after the current maximum;
        # the given name is passed for both name arguments of the column.
        new_column = MimirDatasetColumn(dataset.max_column_id() + 1, name,
                                        name)
        schema = list(dataset.columns)
        schema.insert(position, new_column)
        # Select list for the view. As of April 2018 this requires Mimir to
        # run with the XNULL option; otherwise setting all values in the new
        # column to NULL may cause an exception in some scenarios.
        select_terms = [
            " CAST('' AS int) AS " + col.name_in_rdb
            if col.identifier == new_column.identifier else col.name_in_rdb
            for col in schema
        ]
        sql = ('SELECT ' + ','.join(select_terms) + ' FROM ' +
               dataset.table_name + ';')
        # createView returns a pair; only the view name is used here.
        view_name, _ = mimir.createView(dataset.table_name, sql)
        # Register the view under the extended schema to get a new dataset
        # identifier.
        result = datastore.register_dataset(table_name=view_name,
                                            columns=schema,
                                            row_counter=dataset.row_counter,
                                            annotations=dataset.annotations)
        return VizualApiResult(result)
示例#9
0
    def move_row(self, identifier: str, row_id: str, position: int,
                 datastore: Datastore):
        """Move a row within a given dataset.

        If the row already sits at the target position the existing dataset
        is returned untouched. Otherwise the row is moved and the result is
        stored as a new dataset.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        row_id: string or int
            Index position of the row that is being moved. The value is
            converted with int() before use.
        position: int
            Target position for the row. Accepted values range from 0 to the
            number of rows in the dataset (inclusive).
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the dataset is unknown, or if the source row or the target
            position is not within the range of the dataset.
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Convert the source row once. It may arrive as a string, so all
        # comparisons below use the integer value.
        source_idx = int(row_id)
        # Make sure that row is within dataset bounds
        if source_idx < 0 or source_idx >= dataset.row_count:
            raise ValueError('invalid source row \'' + str(row_id) + '\'')
        # Make sure that position is a valid row index in the new dataset
        if position < 0 or position > dataset.row_count:
            raise ValueError('invalid target position \'' + str(position) +
                             '\'')
        # No need to do anything if source position equals target position.
        # Comparing the raw row_id would never match for a string value and
        # would always create a redundant copy of the dataset.
        if source_idx == position:
            return VizualApiResult(dataset)
        rows = dataset.fetch_rows()
        rows.insert(position, rows.pop(source_idx))
        # Store updated dataset to get new identifier
        ds = datastore.create_dataset(columns=dataset.columns,
                                      rows=rows,
                                      properties={})
        return VizualApiResult(ds)
示例#10
0
    def filter_columns(self, identifier, columns, names, datastore):
        """Project a dataset onto a list of columns, optionally renaming them.

        The result contains exactly the columns listed in ``columns``, in that
        order. ``names`` is read at the same index as ``columns``; an entry of
        None keeps the existing column, any other entry becomes the new name
        of the column in the dataset (its name in the database is kept). The
        projection is realized as a Mimir view over the dataset's table.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifier for columns in the result.
        names: list(string)
            Optional new names for filtered columns.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If no dataset with the given identifier exists.
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # Build the projected schema together with the select list of the
        # view. Renaming only affects the schema entry.
        schema = []
        select_terms = []
        for i, column_id in enumerate(columns):
            source = dataset.columns[get_index_for_column(dataset, column_id)]
            new_name = names[i]
            if new_name is None:
                schema.append(source)
            else:
                schema.append(
                    MimirDatasetColumn(identifier=source.identifier,
                                       name_in_dataset=new_name,
                                       name_in_rdb=source.name_in_rdb))
            select_terms.append(source.name_in_rdb)
        sql = ('SELECT ' + ','.join(select_terms) + ' FROM ' +
               dataset.table_name + ';')
        # createView returns a pair; only the view name is used here.
        view_name, _ = mimir.createView(dataset.table_name, sql)
        # Annotations are filtered to the selected columns and the dataset's
        # row ids before the view is registered as a new dataset.
        kept_annotations = dataset.annotations.filter(columns=columns,
                                                      rows=dataset.row_ids)
        result = datastore.register_dataset(table_name=view_name,
                                            columns=schema,
                                            row_counter=dataset.row_counter,
                                            annotations=kept_annotations)
        return VizualApiResult(result)
示例#11
0
    def move_row(self, identifier, row_index, position, datastore):
        """Validate a row move request and return the dataset unchanged.

        This implementation only validates its arguments. No rows are
        reordered and no new dataset is registered; for valid arguments the
        existing dataset is returned as is.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        row_index: int
            Index position of the row that is to be moved
        position: int
            Target position for the row. Accepted values range from 0 to the
            number of rows in the dataset (inclusive).
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the dataset is unknown, or if the source row or the target
            position is not within the range of the dataset.
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Make sure that row is within dataset bounds
        if row_index < 0 or row_index >= dataset.row_count:
            raise ValueError('invalid source row \'' + str(row_index) + '\'')
        # Make sure that position is a valid row index in the new dataset
        if position < 0 or position > dataset.row_count:
            raise ValueError('invalid target position \'' + str(position) +
                             '\'')
        # NOTE(review): a register_dataset call used to follow the raise
        # above and could never execute. It has been removed so that the code
        # reflects what actually happens. Confirm whether moving rows is
        # meant to be supported by this backend.
        return VizualApiResult(dataset)
示例#12
0
 def materialize_dataset(self, identifier: str,
                         datastore: Datastore) -> VizualApiResult:
     """Return the dataset with the given identifier as is.

     Materializing a snapshot for faster execution is a no-op for the FS
     backend: the dataset is only looked up and wrapped in a result.

     Raises ValueError if no dataset with the given identifier exists.
     """
     dataset = datastore.get_dataset(identifier)
     if dataset is not None:
         return VizualApiResult(dataset)
     raise ValueError("unknown dataset '" + identifier + "'")
示例#13
0
    def sort_dataset(self, identifier: str, columns: List[int],
                     reversed: List[bool],
                     datastore: Datastore) -> VizualApiResult:
        """Sort the dataset with the given identifier according to the order
        by statement. The order by statement is a pair of lists. The first
        list contains the identifier of columns to sort on. The second list
        contains boolean flags, one for each entry in columns, indicating
        whether sort order is reversed for the corresponding column or not.

        The sort is realized as a Mimir view with an ORDER BY clause over the
        dataset.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifier for sort columns.
        reversed: list(bool)
            Flags indicating whether the sort order of the corresponding
            column is reversed.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If no dataset with the given identifier exists.
        """
        # Get dataset. The None check has to come before the isinstance
        # assertion; otherwise an unknown dataset surfaces as an
        # AssertionError instead of the documented ValueError.
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        assert isinstance(dataset, MimirDatasetHandle)
        # Create order by clause based on columns and reversed flags
        order_by_clause = []
        for i, col_id in enumerate(columns):
            stmt = cast(MimirDatasetColumn,
                        dataset.column_by_id(col_id)).name_in_rdb
            if reversed[i]:
                stmt += ' DESC'
            order_by_clause.append(stmt)
        sql = 'SELECT * FROM ' + dataset.identifier + ' ORDER BY '
        sql += ','.join(order_by_clause)
        # createView returns five values; dependencies and function
        # dependencies are not needed here.
        view_name, _dependencies, schema, properties, _function_deps = \
            mimir.createView(
                datasets={dataset.identifier: dataset.identifier},
                query=sql)
        ds = MimirDatasetHandle.from_mimir_result(view_name, schema,
                                                  properties)
        return VizualApiResult(ds)
示例#14
0
    def update_cell(self, identifier: str, column_id: int, row_id: str,
                    value: str, datastore: Datastore) -> VizualApiResult:
        """Set the value of a single cell in a dataset.

        The row is located by comparing the integer value of each row's
        identifier with the integer value of ``row_id``; the first match is
        updated. The result is stored as a new dataset.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier
        column_id: int
            Unique column identifier for updated cell
        row_id: string or int
            Unique row identifier
        value: string
            New cell value
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If the dataset is unknown, the dataset has no column with the
            given identifier, or no row has the given identifier.
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # Resolve the column first so that an unknown column is reported
        # before any rows are fetched.
        col_idx = dataset.get_index(column_id)
        if col_idx is None:
            raise ValueError(
                "unknown column identifier '" + str(column_id) + "'"
            )
        rows = dataset.fetch_rows()
        # Find the first row whose identifier matches; the else branch runs
        # only if the loop finishes without a match.
        for row_index, row in enumerate(rows):
            if int(row.identifier) == int(row_id):
                break
        else:
            raise ValueError("invalid row identifier '" + str(row_id) + "'")
        # Replace the row with a copy that carries the new value; the
        # existing row object is left unmodified.
        values = list(row.values)
        values[col_idx] = value
        rows[row_index] = DatasetRow(identifier=row.identifier, values=values)
        # Persist the updated rows; this yields a new dataset identifier.
        result = datastore.create_dataset(columns=dataset.columns,
                                          rows=rows,
                                          properties={})
        return VizualApiResult(result)
示例#15
0
 def import_dataset(self, datastore: Datastore, project_id: str,
                    dataset_id: str) -> VizualApiResult:
     """Look up an existing dataset and return it as an API result.

     The project is looked up as well, although only to verify that it
     exists. Raises Exception if either the project or the dataset is
     unknown.
     """
     # Imported inside the method rather than at module level (presumably
     # to avoid a circular import - confirm).
     from vizier.api.webservice.server import api
     # Mimir doesn't actually need to use the project ID (yet); it is only
     # checked for safety.
     if api.projects.projects.get_project(project_id) is None:
         raise Exception("No Such Project: {}".format(project_id))
     # Get the actual dataset
     dataset = datastore.get_dataset(dataset_id)
     if dataset is None:
         raise Exception("No Such Dataset: {}".format(dataset_id))
     return VizualApiResult(dataset, {})
示例#16
0
    def sort_dataset(self, identifier, columns, reversed, datastore):
        """Sort a dataset on one or more columns using a Mimir view.

        The order-by specification is a pair of parallel lists: ``columns``
        holds the identifiers of the sort columns and ``reversed`` holds one
        flag per column indicating whether the sort order for that column is
        reversed (rendered as DESC in the ORDER BY clause).

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifier for sort columns.
        reversed: list(bool)
            Flags indicating whether the sort order of the corresponding
            column is reversed.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If no dataset with the given identifier exists.
        """
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # One ORDER BY term per sort column, in the order given.
        order_terms = []
        for i, col_id in enumerate(columns):
            term = dataset.column_by_id(col_id).name_in_rdb
            order_terms.append(term + ' DESC' if reversed[i] else term)
        sql = 'SELECT * FROM {{input}} ORDER BY ' + ','.join(order_terms) + ';'
        # createView returns a pair; only the view name is used here.
        view_name, _ = mimir.createView(dataset.table_name, sql)
        # Register the view as a new dataset with the unchanged schema and
        # annotations. No row_counter is passed to register_dataset here.
        result = datastore.register_dataset(table_name=view_name,
                                            columns=dataset.columns,
                                            annotations=dataset.annotations)
        return VizualApiResult(result)
示例#17
0
    def insert_column(self, identifier: str, position: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Add a new, empty column to a dataset at the given position.

        The new column gets the identifier max_column_id() + 1 and every row
        receives a None value for it. The result is stored as a new dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        position is outside of the current schema bounds, or if a column name
        is given that is not valid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the column will be inserted
        name: string
            New column name. None is accepted and stored as the empty string.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # A missing name is allowed; any other name has to be valid
        if name is not None and not is_valid_name(name):
            raise ValueError("invalid column name '" + name + "'")
        # Look up the dataset and fail early if it does not exist
        source = datastore.get_dataset(identifier)
        if source is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # The position may equal the column count (append at the end)
        if position < 0 or position > len(source.columns):
            raise ValueError("invalid column index '" + str(position) + "'")
        # Work on a copy of the schema and on the fetched rows
        schema = list(source.columns)
        rows = source.fetch_rows()
        new_column = DatasetColumn(identifier=source.max_column_id() + 1,
                                   name='' if name is None else name)
        schema.insert(position, new_column)
        # Keep every row aligned with the extended schema
        for row in rows:
            row.values.insert(position, None)
        # Store updated dataset to get new identifier
        ds = datastore.create_dataset(columns=schema,
                                      rows=rows,
                                      properties={})
        return VizualApiResult(ds)
示例#18
0
 def materialize_dataset(self, identifier: str,
                         datastore: Datastore) -> VizualApiResult:
     """Create a materialized snapshot of the dataset for faster execution.

     Raises ValueError if no dataset with given identifier exists.

     Parameters
     ----------
     identifier: string
         Unique dataset identifier
     datastore : Datastore
         Datastore to retrieve the dataset from

     Returns
     -------
     vizier.engine.packages.vizual.api.VizualApiResult
     """
     source = datastore.get_dataset(identifier)
     if source is None:
         raise ValueError("unknown dataset '" + identifier + "'")
     # NOTE(review): the result of cast() is discarded, so this statement
     # has no effect at runtime and does not narrow the type of `source`.
     cast(MimirDatasetHandle, source)
     response = mimir.materialize(source.identifier)
     # The snapshot reuses the columns, properties and name of the source.
     # A missing name falls back to a fixed placeholder.
     snapshot = MimirDatasetHandle(
         identifier=response["name"],
         columns=cast(List[MimirDatasetColumn], source.columns),
         properties=source.get_properties(),
         name="untitled dataset" if source.name is None else source.name)
     return VizualApiResult(snapshot)
示例#19
0
    def insert_row(self, identifier, position, datastore):
        """Add a row of NULL values to a dataset.

        The new row is added through a UNION ALL view over the dataset table.
        The given position is validated against the dataset bounds but is not
        otherwise used when the view is built.

        Raises ValueError if no dataset with given identifier exists or if the
        specified row position is outside the dataset bounds.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the row will be inserted
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Look up the dataset and fail early if it does not exist
        source = datastore.get_dataset(identifier)
        if source is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # The position may equal the row count (append at the end)
        if position < 0 or position > len(source.row_ids):
            raise ValueError("invalid row index '" + str(position) + "'")
        # Advance the row counter on the dataset handle for the new row
        source.row_counter += 1

        # Select all dataset columns from the underlying table
        projection = ','.join(col.name_in_rdb for col in source.columns)
        sql = 'SELECT ' + projection + ' FROM ' + source.table_name
        # The first entry of the returned schema is skipped; every other
        # column contributes one typed NULL to the inserted row.
        null_values = ','.join(
            'CAST(NULL AS ' + col['baseType'] + ') AS ' + col['name']
            for col in mimir.getSchema(sql)[1:])
        sql = '(' + sql + ') UNION ALL (SELECT ' + null_values + ');'
        view_name, _ = mimir.createView(source.table_name, sql)
        # Store updated dataset information with new identifier
        ds = datastore.register_dataset(table_name=view_name,
                                        columns=source.columns,
                                        row_counter=source.row_counter,
                                        annotations=source.annotations)
        return VizualApiResult(ds)
示例#20
0
    def unload_dataset(self,
                       dataset: DatasetDescriptor,
                       datastore: Datastore,
                       filestore: Filestore,
                       unload_format: str = 'csv',
                       options: List[Dict[str, Any]] = [],
                       resources: Dict[str, Any] = None):
        """Export (or unload) a dataset to a given file format.

        The export is delegated to the datastore, which writes into a file
        directory obtained from the filestore for a newly generated unique
        identifier. Whatever the datastore returns is reported under the
        file identifier resource key. If dataset is None nothing is exported
        and that resource is None.

        Parameters
        ----------
        dataset: DatasetDescriptor
            Dataset to export. May be None.
        datastore : Datastore
            Datastore that performs the export. Has to be a MimirDatastore.
        filestore: Filestore
            Filestore providing the target directory. Has to be a
            FileSystemFilestore.
        unload_format: string, optional
            Format identifier
        options: list, optional
            Additional options that are passed on to the datastore unchanged
        resources: dict, optional
            Dictionary of additional resources (i.e., key,value pairs) that were
            generated during a previous execution of the associated module.
            Not read by this implementation.

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        assert (isinstance(datastore, MimirDatastore))
        assert (isinstance(filestore, FileSystemFilestore))

        exported = None
        if dataset is not None:
            # Every export goes into its own, newly named file directory
            target_dir = filestore.get_file_dir(get_unique_identifier())
            exported = datastore.unload_dataset(filepath=target_dir,
                                                dataset_name=dataset.identifier,
                                                format=unload_format,
                                                options=options)
        return VizualApiResult(dataset=dataset,
                               resources={base.RESOURCE_FILEID: exported})
示例#21
0
    def empty_dataset(
        self,
        datastore: Datastore,
        filestore: Filestore,
        initial_columns: List[Tuple[str, str]] = [("''", "unnamed_column")]
    ) -> VizualApiResult:
        """Create a new dataset with a single row of default values.

        Each entry in initial_columns is a (default value, column name) pair.
        One varchar column is created per entry, with the entry's list index
        as column identifier. The dataset contains exactly one row with
        identifier '1' that holds the default values. It is created with the
        human readable name 'Empty Table'.

        Parameters
        ----------
        datastore : Datastore
            Datastore in which the dataset is created. Has to be a
            MimirDatastore.
        filestore: Filestore
            Not read by this implementation
        initial_columns: list(tuple(string, string)), optional
            Pairs of default value and column name for the initial schema

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        assert (isinstance(datastore, MimirDatastore))
        # One varchar column per entry, identified by its list position
        schema = [
            MimirDatasetColumn(identifier=idx,
                               name_in_dataset=col_name,
                               data_type="varchar")
            for idx, (_, col_name) in enumerate(initial_columns)
        ]
        # The only row carries the default value of every column
        first_row = DatasetRow(
            identifier="1",
            values=[default for default, _ in initial_columns])
        ds = datastore.create_dataset(
            columns=schema,
            rows=[first_row],
            human_readable_name="Empty Table",
        )

        return VizualApiResult(dataset=ds)
示例#22
0
    def delete_column(self, identifier, column_id, datastore):
        """Delete a column in a given dataset.

        Removes the column from the schema and the matching value from every
        row, then stores the result as a new dataset. Annotations are
        filtered down to the remaining columns.

        Raises ValueError if no dataset with given identifier exists or if the
        specified column is unknown.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Get the index of the specified column that is to be deleted.
        col_index = dataset.get_index(column_id)
        if col_index is None:
            raise ValueError('unknown column identifier \'' + str(column_id) +
                             '\'')
        # Delete column from a copy of the schema
        columns = list(dataset.columns)
        del columns[col_index]
        # Delete all value for the deleted column
        rows = dataset.fetch_rows()
        for row in rows:
            del row.values[col_index]
        # Store updated dataset to get new identifier. Only annotations for
        # the remaining columns are kept.
        ds = datastore.create_dataset(
            columns=columns,
            rows=rows,
            annotations=dataset.annotations.filter(
                columns=[c.identifier for c in columns]))
        return VizualApiResult(ds)
示例#23
0
    def move_column(self, identifier: str, column_id: int, position: int,
                    datastore: Datastore) -> VizualApiResult:
        """Move a column of a dataset to a new position.

        The move is carried out by sending a moveColumn command for the
        dataset to mimir. If the column already is at the target position
        the dataset is returned as is.

        Raises ValueError if no dataset with given identifier exists or if
        the target position is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        position: int
            Target position for the column
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Look up the dataset and fail early if it does not exist
        source = datastore.get_dataset(identifier)
        if source is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # The target position may equal the column count
        if position < 0 or position > len(source.columns):
            raise ValueError("invalid target position '" + str(position) +
                             "'")
        # Current index position of the column that is being moved
        current_idx = get_index_for_column(source, column_id)
        # Nothing to do if the column is already at the target position
        if current_idx == position:
            return VizualApiResult(source)
        # Keep the mimir-side schema aligned with the vizier-side schema
        move_command = {
            "id": "moveColumn",
            "column": current_idx,
            "position": position
        }
        response = mimir.vizualScript(source.identifier, move_command)
        return VizualApiResult.from_mimir(response)
示例#24
0
    def delete_column(self, identifier, column_id, datastore):
        """Remove a column from a dataset by creating a view without it.

        The view selects every remaining column from the dataset table and is
        registered as a new dataset that keeps the row counter and the
        annotations of the source dataset.

        Raises ValueError if no dataset with given identifier exists.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Look up the dataset and fail early if it does not exist
        source = datastore.get_dataset(identifier)
        if source is None:
            raise ValueError("unknown dataset '" + identifier + "'")
        # Drop the column from a copy of the schema
        drop_idx = get_index_for_column(source, column_id)
        remaining = list(source.columns)
        del remaining[drop_idx]
        # Create a view that selects only the remaining columns
        projection = ','.join(col.name_in_rdb for col in remaining)
        sql = 'SELECT ' + projection + ' FROM ' + source.table_name + ';'
        view_name, _ = mimir.createView(source.table_name, sql)
        # Store updated dataset information with new identifier
        ds = datastore.register_dataset(table_name=view_name,
                                        columns=remaining,
                                        row_counter=source.row_counter,
                                        annotations=source.annotations)
        return VizualApiResult(ds)
示例#25
0
    def insert_row(self, identifier: str, position: int,
                   datastore: Datastore) -> VizualApiResult:
        """Insert row at given position in a dataset.

        The new row gets the identifier max_row_id() + 1 (as a string) and a
        None value for every column. The result is stored as a new dataset.

        Raises ValueError if no dataset with given identifier exists or if the
        specified row position is outside the dataset bounds.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the row will be inserted
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        assert (isinstance(dataset, FileSystemDatasetHandle))
        # Make sure that position is a valid row index in the new dataset.
        # The position may equal the row count (append at the end).
        if position < 0 or position > dataset.row_count:
            raise ValueError('invalid row index \'' + str(position) + '\'')
        # Insert a row that has an empty value for every column
        rows = dataset.fetch_rows()
        rows.insert(
            position,
            DatasetRow(identifier=str(dataset.max_row_id() + 1),
                       values=[None] * len(dataset.columns)))
        # Store updated dataset to get new identifier
        ds = datastore.create_dataset(columns=dataset.columns,
                                      rows=rows,
                                      properties={})
        return VizualApiResult(ds)
示例#26
0
    def delete_row(self, identifier, rowid, datastore):
        """Delete a row in a given dataset.

        The row is removed by creating a view over the dataset table that
        filters out the row with the given row identifier. The view is
        registered as a new dataset with a row counter reduced by one.

        Raises ValueError if no dataset with given identifier exists.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        rowid:
            Identifier of the row that is deleted. The value is converted
            into a SQL literal via MIMIR_ROWID_COL.to_sql_value().
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')

        # Create a view for the modified dataset that selects all columns
        # of every row except the one that is being deleted
        col_list = ','.join(col.name_in_rdb for col in dataset.columns)
        sql = 'SELECT ' + col_list + ' FROM ' + dataset.table_name
        sql += ' WHERE ' + ROW_ID + ' <> ' + MIMIR_ROWID_COL.to_sql_value(
            rowid) + ';'
        view_name, dependencies = mimir.createView(dataset.table_name, sql)
        # Store updated dataset information with new identifier
        ds = datastore.register_dataset(table_name=view_name,
                                        columns=dataset.columns,
                                        row_counter=dataset.row_counter - 1,
                                        annotations=dataset.annotations)
        return VizualApiResult(ds)
示例#27
0
    def load_dataset(
            self,
            datastore: Datastore,
            filestore: Filestore,
            file_id: Optional[str] = None,
            url: Optional[str] = None,
            detect_headers: bool = True,
            infer_types: bool = True,
            load_format: str = 'csv',
            options: List[Dict[str, str]] = [],
            username: str = None,
            password: str = None,
            resources: Optional[Dict[str, Any]] = None,
            reload: bool = False,
            human_readable_name: Optional[str] = None,
            proposed_schema: List[Tuple[str, str]] = []) -> VizualApiResult:
        """Create (or load) a new dataset from an uploaded file or a Uri.

        If url is given the dataset is downloaded from it, otherwise file_id
        has to be given and the dataset is loaded from the filestore. The user
        name and password are only used when downloading from an url.

        The resources refer to any resources (e.g., file identifier) that have
        been generated by a previous execution of the respective task. If they
        name the same url or file identifier together with a dataset
        identifier, that dataset is looked up in the datastore and reused
        when found. For urls this shortcut is skipped if reload is True.

        The parameters detect_headers, infer_types, load_format, options and
        human_readable_name are accepted but not read by this implementation.

        Raises ValueError if no dataset could be obtained.

        Parameters
        ----------
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets
        filestore: vizier.filestore.Filestore
            Filestore to retrieve uploaded datasets
        file_id: string, optional
            Identifier for a file in an associated filestore
        url: string, optional
            Identifier for a web resource
        detect_headers: bool, optional
            Detect column names in loaded file if True
        infer_types: bool, optional
            Infer column types for loaded dataset if True
        load_format: string, optional
            Format identifier
        options: list, optional
            Additional options for Mimirs load command
        username: string, optional
            User name for authentication when accessing restricted resources
        password: string, optional
            Password for authentication when accessing restricted resources
        resources: dict, optional
            Dictionary of additional resources (i.e., key,value pairs) that were
            generated during a previous execution of the associated module
        reload: bool, optional
            Flag to force download of a remote resource even if it was
            downloaded previously
        human_readable_name: string, optional
            Not read by this implementation
        proposed_schema: list(tuple(string, string)), optional
            Passed on to the datastore when loading from an uploaded file

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        loaded = None
        result_resources = {}
        if url is None:
            # either url or file_id must not be None
            assert (file_id is not None)

            # Reuse the dataset of a previous execution if it was created
            # from the same file.
            if (resources is not None
                    and base.RESOURCE_FILEID in resources
                    and base.RESOURCE_DATASET in resources
                    and resources[base.RESOURCE_FILEID] == file_id):
                loaded = datastore.get_dataset(
                    resources[base.RESOURCE_DATASET])
            # Otherwise create a new dataset from the specified file
            if loaded is None:
                loaded = datastore.load_dataset(
                    f_handle=filestore.get_file(file_id),
                    proposed_schema=proposed_schema)
            result_resources[base.RESOURCE_FILEID] = file_id
        else:
            # Reuse the dataset of a previous execution if it was created
            # from the same Uri, unless a reload is requested.
            if (not reload
                    and resources is not None
                    and base.RESOURCE_URL in resources
                    and base.RESOURCE_DATASET in resources
                    and resources[base.RESOURCE_URL] == url):
                loaded = datastore.get_dataset(
                    resources[base.RESOURCE_DATASET])
            # Otherwise create a new dataset by downloading the given Uri
            if loaded is None:
                assert (isinstance(datastore, FileSystemDatastore))
                loaded = datastore.download_dataset(
                    url=url,
                    username=username,
                    password=password,
                )
            result_resources[base.RESOURCE_URL] = url
        # Ensure that the dataset is not None at this point
        if loaded is None:
            raise ValueError('unknown file or resource')
        result_resources[base.RESOURCE_DATASET] = loaded.identifier
        return VizualApiResult(dataset=loaded, resources=result_resources)
示例#28
0
    def load_dataset(self,
                     datastore,
                     filestore,
                     file_id=None,
                     url=None,
                     detect_headers=True,
                     infer_types=True,
                     load_format='csv',
                     options=[],
                     username=None,
                     password=None,
                     resources=None,
                     reload=False,
                     human_readable_name=None):
        """Create (or load) a new dataset from a given file or Uri.

        If url is given it takes precedence over file_id. The username and
        password arguments are accepted but not read by this implementation.

        The resources refer to any resources (e.g., file identifier) that have
        been generated by a previous execution of the respective task. If they
        name the same url or file identifier together with a dataset
        identifier, that dataset is looked up in the datastore and reused
        when found. For urls this shortcut is skipped if reload is True.

        Raises ValueError if neither url nor file_id is given.

        Parameters
        ----------
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets
        filestore: vizier.filestore.Filestore
            Filestore to retrieve uploaded datasets
        file_id: string, optional
            Identifier for a file in an associated filestore
        url: string, optional
            Identifier for a web resource
        detect_headers: bool, optional
            Detect column names in loaded file if True
        infer_types: bool, optional
            Infer column types for loaded dataset if True
        load_format: string, optional
            Format identifier
        options: list, optional
            Additional options for Mimirs load command
        username: string, optional
            User name for authentication when accessing restricted resources
        password: string, optional
            Password for authentication when accessing restricted resources
        resources: dict, optional
            Dictionary of additional resources (i.e., key,value pairs) that were
            generated during a previous execution of the associated module
        reload: bool, optional
            Flag to force download of a remote resource even if it was
            downloaded previously
        human_readable_name: string, optional
            Passed on to the datastore when a new dataset is loaded

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        dataset = None
        f_handle = None
        result_resources = dict()
        if url is not None:
            # If the same url has been previously used to generate a dataset
            # we do not need to download the file and re-create the dataset.
            if (not reload and resources is not None
                    and base.RESOURCE_URL in resources
                    and base.RESOURCE_DATASET in resources):
                # Check if the previous download matches the given Uri
                if resources[base.RESOURCE_URL] == url:
                    ds_id = resources[base.RESOURCE_DATASET]
                    dataset = datastore.get_dataset(ds_id)
            result_resources[base.RESOURCE_URL] = url
        elif file_id is not None:
            # If the same file has been previously used to generate a dataset
            # we do not need to re-create it.
            if (resources is not None
                    and base.RESOURCE_FILEID in resources
                    and base.RESOURCE_DATASET in resources):
                if resources[base.RESOURCE_FILEID] == file_id:
                    ds_id = resources[base.RESOURCE_DATASET]
                    dataset = datastore.get_dataset(ds_id)
            # If the dataset is None we will load the dataset from an uploaded
            # file. Need to get the file handle for the file here.
            if dataset is None:
                f_handle = filestore.get_file(file_id)
            result_resources[base.RESOURCE_FILEID] = file_id
        else:
            raise ValueError('no source identifier given for load')
        # If the dataset is still None at this point we need to call the
        # load_dataset method of the datastore to load it. For an url the
        # file handle is None and the datastore receives the url instead.
        if dataset is None:
            dataset = datastore.load_dataset(
                f_handle=f_handle,
                url=url,
                detect_headers=detect_headers,
                infer_types=infer_types,
                load_format=load_format,
                human_readable_name=human_readable_name,
                options=options)
        result_resources[base.RESOURCE_DATASET] = dataset.identifier
        return VizualApiResult(dataset=dataset, resources=result_resources)
示例#29
0
    def rename_column(self, identifier, column_id, name, datastore):
        """Rename column in a given dataset.

        If the new name differs from the current column name (ignoring case),
        a view is created that re-selects every column of the dataset table
        under an alias and the view is registered as a new dataset. Otherwise
        the dataset is returned unchanged.

        Raises ValueError if no dataset with given identifier exists or if the
        given column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Raise ValueError if given column name is invalid
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + str(name) + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Get the specified column that is to be renamed. The list `columns`
        # collects the schema of the renamed dataset and is filled below.
        columns = list()
        schema = list(dataset.columns)
        colIndex = get_index_for_column(dataset, column_id)
        col = schema[colIndex]
        # No need to do anything if the name hasn't changed (the comparison
        # ignores case)
        if col.name.lower() != name.lower():

            sql = 'SELECT * FROM ' + dataset.table_name
            mimirSchema = mimir.getSchema(sql)
            # Create list of dataset columns from the schema that mimir
            # reports for the table. Every column receives a new identifier
            # that equals its position in the rebuilt list.
            colSql = ''
            idx = 0
            # NOTE: the loop variable rebinds `col`, which held the column
            # that is being renamed up to this point.
            for col in mimirSchema:
                col_id = len(columns)
                name_in_dataset = sanitize_column_name(col['name'].upper())
                name_in_rdb = sanitize_column_name(col['name'].upper())
                col = MimirDatasetColumn(identifier=col_id,
                                         name_in_dataset=name_in_dataset,
                                         name_in_rdb=name_in_rdb)
                # NOTE(review): the idx == 0 test comes first, so if colIndex
                # is 0 the rename branch below is never taken and the new
                # name is not applied. Confirm whether the first entry of the
                # mimir schema can be the column that is renamed.
                if idx == 0:
                    colSql = name_in_dataset + ' AS ' + name_in_rdb
                elif idx == colIndex:
                    # The renamed column is selected under the new name
                    colSql = colSql + ', ' + name_in_dataset + ' AS ' + name
                    col.name = name
                    col.name_in_rdb = name
                else:
                    colSql = colSql + ', ' + name_in_dataset + ' AS ' + name_in_rdb
                columns.append(col)
                idx = idx + 1
            # Create view that exposes the columns under their aliases
            sql = 'SELECT ' + colSql + ' FROM {{input}};'
            view_name, dependencies = mimir.createView(dataset.table_name, sql)
            # There are no changes to the underlying database. We only need to
            # change the column information in the dataset schema.
            # Store updated dataset to get new identifier
            ds = datastore.register_dataset(table_name=view_name,
                                            columns=columns,
                                            row_counter=dataset.row_counter,
                                            annotations=dataset.annotations)
            return VizualApiResult(ds)
        else:
            return VizualApiResult(dataset)
示例#30
0
    def load_dataset(
            self,
            datastore: Datastore,
            filestore: Filestore,
            file_id: Optional[str] = None,
            url: Optional[str] = None,
            detect_headers: bool = True,
            infer_types: bool = True,
            load_format: str = 'csv',
            options: Optional[List[Dict[str, str]]] = None,
            username: Optional[str] = None,
            password: Optional[str] = None,
            resources: Optional[Dict[str, Any]] = None,
            reload: bool = False,
            human_readable_name: Optional[str] = None,
            proposed_schema: Optional[List[Tuple[str, str]]] = None
    ) -> VizualApiResult:
        """Create (or load) a new dataset from a given file or Uri.

        Exactly one of the file identifier or the url is expected to be given.
        If both are given the url takes precedence. The user name and password
        may only be given if an url is given.

        The resources refer to any resources (e.g., file identifier) that have
        been generated by a previous execution of the respective task. This
        allows to associate an identifier with a downloaded file to avoid
        future downloads (unless the reload flag is True).

        ^--- Vistrails will automatically skip re-execution, so the only reason
        that we'd re-execute the cell is if the user manually asked us to.  If
        that's the case, we should actually reload the file (e.g., because we
        may be reloading with different parameters).

        Raises ValueError if neither a url nor a file identifier is given, or
        if the referenced uploaded file no longer exists in the filestore.

        Parameters
        ----------
        datastore : Datastore to retrieve and update datasets
        filestore: Filestore to retrieve uploaded datasets
        file_id: Identifier for a file in an associated filestore
        url: Identifier for a web resource
        detect_headers: Detect column names in loaded file if True
        infer_types: Infer column types for loaded dataset if True
        load_format: Format identifier
        options: Additional options for Mimirs load command (defaults to an
            empty list)
        username: User name for authentication when accessing restricted
            resources. NOTE: currently not forwarded to the datastore by this
            method.
        password: Password for authentication when accessing restricted
            resources. NOTE: currently not forwarded to the datastore by this
            method.
        resources: Dictionary of additional resources (i.e., key,value pairs)
            that were generated during a previous execution of the associated
            module
        reload: If set to false, avoid reloading the data if possible.
        human_readable_name: A user-facing name for this table
        proposed_schema: A list of name/type pairs that will override
            the inferred column names/types if present (defaults to an empty
            list)

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Use fresh lists per call instead of shared mutable default arguments.
        options = [] if options is None else options
        proposed_schema = [] if proposed_schema is None else proposed_schema
        debug = debug_is_on()

        # Determine the kind of source. The url takes precedence over the file
        # identifier if both are given.
        if url is not None:
            source_key, source_id, source_label = base.RESOURCE_URL, url, 'URL'
        elif file_id is not None:
            source_key, source_id, source_label = (
                base.RESOURCE_FILEID, file_id, 'FILE'
            )
        else:
            raise ValueError('no source identifier given for load')
        if debug:
            print("LOAD {}: {}".format(source_label, source_id))

        # If the same source has been previously used to generate a dataset
        # we do not need to fetch it again and re-create the dataset (unless
        # a reload was explicitly requested).
        dataset = None
        if (not reload
                and resources is not None
                and source_key in resources
                and base.RESOURCE_DATASET in resources
                and resources[source_key] == source_id):
            ds_id = resources[base.RESOURCE_DATASET]
            if debug:
                print("   ... re-using existing dataset {}".format(ds_id))
            # May return None, in which case we fall through and load again.
            dataset = datastore.get_dataset(ds_id)
            if debug:
                print("DATASET: {}".format(dataset))

        # If the dataset is still None and the source is an uploaded file we
        # need to get the file handle for the file here.
        f_handle = None
        if dataset is None and url is None:
            if debug:
                print("getting file")
            f_handle = filestore.get_file(file_id)
            if f_handle is None:
                raise ValueError(
                    "The uploaded file got deleted, try re-uploading.")

        # If the dataset is still None at this point we need to call the
        # load_dataset method of the datastore to load it.
        if dataset is None:
            # Loading is only supported for the Mimir datastore. The assertion
            # also narrows the datastore type for static type checkers.
            assert isinstance(datastore, MimirDatastore)
            if debug:
                print("   ... loading dataset {} / {}".format(url, f_handle))
            dataset = datastore.load_dataset(
                f_handle=f_handle,
                url=url,
                detect_headers=detect_headers,
                infer_types=infer_types,
                load_format=load_format,
                human_readable_name=human_readable_name,
                options=options,
                proposed_schema=proposed_schema)

        # Record the source and the resulting dataset so that a later
        # execution can re-use the dataset instead of loading it again.
        result_resources = {
            source_key: source_id,
            base.RESOURCE_DATASET: dataset.identifier
        }
        return VizualApiResult(dataset=dataset, resources=result_resources)