Пример #1
0
 def get_row_count(self) -> int:
     """Return the number of rows in the dataset, caching the result.

     The value is resolved lazily: first from the dataset's 'count'
     property, and, failing that, by asking the Mimir backend to count
     the rows of the underlying table.
     """
     if self._row_count is None:
         # Prefer the cheap metadata lookup before issuing a backend query.
         self._row_count = self.get_properties().get("count")
         if self._row_count is None:
             self._row_count = mimir.countRows(self.identifier)
     return self._row_count
Пример #2
0
    def execute_query(self, args, context):
        """Execute a SQL query in the given context.

        Creates a Mimir view from the user-provided SQL source (with context
        dataset names mapped to backend table names), registers the resulting
        view as a new dataset, and appends a preview of up to 10 rows to the
        module's stdout. On any failure the exception is recorded in stderr
        and empty provenance is returned.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get SQL source code that is in this cell.
        source = args.get_value(cmd.PARA_SQL_SOURCE)
        # Mimir expects a terminating semicolon; append one if missing.
        if not source.endswith(';'):
            source = source + ';'
        # The output dataset name is optional; may be None or ''.
        ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
        # Get mapping of datasets in the context to their respective table
        # name in the Mimir backend.
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o]
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.table_name
        # Module outputs
        outputs = ModuleOutputs()
        try:
            # Create the view from the SQL source
            view_name, dependencies = mimir.createView(mimir_table_names,
                                                       source)
            sql = 'SELECT * FROM ' + view_name
            mimirSchema = mimir.getSchema(sql)

            # Build column descriptors from the schema of the created view;
            # identifiers are assigned sequentially from 0.
            columns = list()

            for col in mimirSchema:
                col_id = len(columns)
                name_in_dataset = col['name']
                col = MimirDatasetColumn(identifier=col_id,
                                         name_in_dataset=name_in_dataset)
                columns.append(col)

            row_count = mimir.countRows(view_name)

            provenance = None
            # Fall back to a default name when no output dataset was given.
            if ds_name is None or ds_name == '':
                ds_name = "TEMPORARY_RESULT"

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=row_count)
            # Fetch the first 10 rows of the new dataset for display.
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            ds_output['name'] = ds_name

            # Map each dependency reported by Mimir (lower-cased) to its
            # dataset identifier in the current context; dependencies that are
            # not context datasets map to None.
            dependencies = dict((dep_name.lower(),
                                 context.datasets.get(dep_name.lower(), None))
                                for dep_name in dependencies)

            outputs.stdout.append(DatasetOutput(ds_output))
            provenance = ModuleProvenance(write={
                ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  columns=ds.columns,
                                  row_count=ds.row_count)
            },
                                          read=dependencies)
        except Exception as ex:
            # Boundary handler: record the error and return empty provenance.
            provenance = ModuleProvenance()
            outputs.error(ex)
        # Return execution result; success means nothing was written to stderr.
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
Пример #3
0
    def create_dataset(self,
                       columns,
                       rows,
                       human_readable_name=None,
                       annotations=None,
                       backend_options=None,
                       dependencies=None):
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        human_readable_name: string, optional
            Optional human readable name for the resulting table
        annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
            Annotations for dataset components
        backend_options: list, optional
            Additional options passed to the backend load command
        dependencies: list, optional
            Dependencies passed to the backend load command

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Avoid mutable default arguments; preserve the original behavior of
        # handing empty lists to the backend when no value is given.
        backend_options = [] if backend_options is None else backend_options
        dependencies = [] if dependencies is None else dependencies
        # Get unique identifier for new dataset
        identifier = 'DS_' + get_unique_identifier()
        # Rows are staged in a temporary file in CSV format
        tmp_file = os.path.abspath(self.base_path + identifier)
        # Create a list of columns that contain the user-visible column name
        # and the name in the database.
        db_columns = [
            MimirDatasetColumn(identifier=col.identifier,
                               name_in_dataset=col.name,
                               name_in_rdb=col.name)
            for col in map(base.sanitize_column_name, columns)
        ]
        # Build the projection list once instead of concatenating strings in
        # a loop.
        col_sql = ', '.join(col.name_in_rdb + ' AS ' + col.name_in_rdb
                            for col in db_columns)
        # Create the CSV file and load it via Mimir. Ensure the temporary
        # file is removed even when writing or loading fails (the previous
        # version leaked the file on error).
        try:
            with open(tmp_file, 'w') as f_out:
                writer = csv.writer(f_out, quoting=csv.QUOTE_MINIMAL)
                writer.writerow([col.name_in_rdb for col in db_columns])
                for row in rows:
                    writer.writerow(helper.encode_values(row.values))
            # Load CSV file using Mimir's loadDataSource method.
            table_name = mimir.loadDataSource(
                tmp_file,
                True,
                True,
                human_readable_name=human_readable_name,
                backend_options=backend_options,
                dependencies=dependencies)
        finally:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
        sql = 'SELECT ' + col_sql + ' FROM {{input}};'
        view_name, dependencies = mimir.createView(table_name, sql)
        # Get number of rows in the view that was created in the backend
        row_count = mimir.countRows(view_name)
        # Insert the new dataset metadata information into the datastore
        return self.register_dataset(table_name=view_name,
                                     columns=db_columns,
                                     row_counter=row_count,
                                     annotations=annotations)
Пример #4
0
    def register_dataset(self,
                         table_name,
                         columns,
                         row_counter=None,
                         annotations=None):
        """Create a new record for a database table or view.

        Note that this method does not actually create the table or view in
        the database but adds the dataset's metadata to the data store. The
        table or view will have been created by a load command or be the
        result from executing a lens or a VizUAL command.

        Parameters
        ----------
        table_name: string
            Name of relational database table or view containing the dataset.
        columns: list(vizier.datastore.mimir.MimirDatasetColumn)
            List of column names in the dataset schema and their corresponding
            names in the relational database table or view.
        row_counter: int, optional
            Counter for unique row ids; queried from the backend when None.
        annotations: vizier.datastore.metadata.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Query the backend schema for the table/view to learn column types.
        query = base.get_select_query(table_name, columns=columns) + ';'
        schema = mimir.getSchema(query)
        # Map each sanitized, upper-cased database column name to its type
        # and propagate the types onto the given column descriptors.
        type_by_name = {
            base.sanitize_column_name(entry['name'].upper()): entry['baseType']
            for entry in schema
        }
        for descriptor in columns:
            descriptor.data_type = type_by_name[descriptor.name_in_rdb]
        # Ask the backend for the row count when none was supplied.
        if row_counter is None:
            row_counter = mimir.countRows(table_name)
        handle = MimirDatasetHandle(
            identifier=get_unique_identifier(),
            columns=[base.sanitize_column_name(c) for c in columns],
            table_name=table_name,
            row_counter=row_counter,
            annotations=annotations)
        # Create a new directory for the dataset if it doesn't exist.
        dataset_dir = self.get_dataset_dir(handle.identifier)
        if not os.path.isdir(dataset_dir):
            os.makedirs(dataset_dir)
        # Write dataset and annotation file to disk.
        handle.to_file(self.get_dataset_file(handle.identifier))
        handle.annotations.to_file(
            self.get_metadata_filename(handle.identifier))
        return handle
Пример #5
0
    def load_dataset(self,
                     f_handle=None,
                     url=None,
                     detect_headers=True,
                     infer_types=True,
                     load_format='csv',
                     options=None,
                     human_readable_name=None):
        """Create a new dataset from a given file or url. Expects that either
        the file handle or the url are not None. Raises ValueError if both are
        None or not None.

        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle, optional
            handle for an uploaded file on the associated file server.
        url: string, optional
            Url for the file source
        detect_headers: bool, optional
            Detect column names in loaded file if True
        infer_types: bool, optional
            Infer column types for loaded dataset if True
        load_format: string, optional
            Format identifier
        options: list, optional
            Additional options for Mimirs load command
        human_readable_name: string, optional
            Optional human readable name for the resulting table

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Avoid a mutable default argument; preserve the original behavior of
        # passing an empty options list to the backend.
        if options is None:
            options = []
        # Exactly one of f_handle / url must be given.
        if f_handle is None and url is None:
            raise ValueError('no load source given')
        if f_handle is not None and url is not None:
            raise ValueError('too many load sources given')
        abspath = f_handle.filepath if url is None else url
        # Load dataset into Mimir
        init_load_name = mimir.loadDataSource(abspath, infer_types,
                                              detect_headers, load_format,
                                              human_readable_name, options)
        # Retrieve schema information for the created dataset
        sql = 'SELECT * FROM ' + init_load_name
        mimir_schema = mimir.getSchema(sql)
        # Create the list of dataset columns. The sanitized upper-case name is
        # used both as the user-visible name and as the name in the database,
        # so sanitize only once per column.
        columns = []
        projection = []
        for col in mimir_schema:
            name = base.sanitize_column_name(col['name'].upper())
            columns.append(MimirDatasetColumn(identifier=len(columns),
                                              name_in_dataset=name,
                                              name_in_rdb=name))
            projection.append(name + ' AS ' + name)
        # Create a view over the loaded table that renames the columns to
        # their sanitized names.
        sql = 'SELECT ' + ', '.join(projection) + ' FROM {{input}};'
        view_name, dependencies = mimir.createView(init_load_name, sql)
        # Get number of rows in the view that was created in the backend.
        row_count = mimir.countRows(view_name)
        # Insert the new dataset metadata information into the datastore
        return self.register_dataset(table_name=view_name,
                                     columns=columns,
                                     row_counter=row_count)
Пример #6
0
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the sampling package using
        the set of user-provided arguments and the current database
        state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the input dataset is unknown, the sampling rate is out of
            range, or the command identifier is not a sampling command.
        """
        input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
        input_dataset = context.get_dataset(input_ds_name)
        if input_dataset is None:
            raise ValueError('unknown dataset \'' + input_ds_name + '\'')

        # Default the output name to '<input>_SAMPLE' when not provided.
        output_ds_name = arguments.get_value(cmd.PARA_OUTPUT_DATASET,
                                             raise_error=False)
        if output_ds_name is None or output_ds_name == "":
            output_ds_name = input_ds_name + "_SAMPLE"
        output_ds_name = output_ds_name.lower()

        # Build the Mimir sampling configuration for the requested command.
        if command_id == cmd.BASIC_SAMPLE:
            sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
            if sampling_rate > 1.0 or sampling_rate < 0.0:
                # ValueError (a subclass of Exception) is the appropriate type
                # for invalid user input and stays catchable by callers.
                raise ValueError("Sampling rate must be between 0.0 and 1.0")
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
                "probability": sampling_rate
            }
        elif command_id in (cmd.MANUAL_STRATIFIED_SAMPLE,
                            cmd.AUTOMATIC_STRATIFIED_SAMPLE):
            column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
            column_defn = input_dataset.columns[column]
            if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
                # The user supplies an explicit probability per stratum value.
                strata = [{
                    "value": stratum.get_value(cmd.PARA_STRATUM_VALUE),
                    "probability": stratum.get_value(cmd.PARA_SAMPLING_RATE)
                } for stratum in arguments.get_value(cmd.PARA_STRATA)]
            else:
                # Derive strata automatically from a single global rate.
                probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
                strata = self.get_automatic_strata(input_dataset, column_defn,
                                                   probability)
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
                "column": column_defn.name,
                "type": column_defn.data_type,
                "strata": strata
            }
        else:
            raise ValueError(
                "Unknown sampling command: {}".format(command_id))

        # Create the sample view in Mimir and register it with Vizier.
        sample_view_id = mimir.createSample(input_dataset.table_name,
                                            sample_mode)
        row_count = mimir.countRows(sample_view_id)
        ds = context.datastore.register_dataset(table_name=sample_view_id,
                                                columns=input_dataset.columns,
                                                row_counter=row_count)

        # Render a preview (first 10 rows) of the sampled dataset.
        outputs = ModuleOutputs()
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        ds_output['name'] = output_ds_name
        outputs.stdout.append(DatasetOutput(ds_output))

        # Record reads and writes for provenance tracking.
        provenance = ModuleProvenance(
            read={input_ds_name: input_dataset.identifier},
            write={
                output_ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  columns=ds.columns,
                                  row_count=ds.row_count)
            })

        # Return task result
        return ExecResult(outputs=outputs, provenance=provenance)