def get_row_count(self) -> int:
    if self._row_count is None:
        # Try the dataset's count property first to avoid a backend query.
        self._row_count = self.get_properties().get("count", None)
        if self._row_count is None:
            # Fall back to counting rows in the Mimir backend.
            self._row_count = mimir.countRows(self.identifier)
    return self._row_count
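
# Standalone sketch of the lazy row-count pattern above: the count is
# resolved once (properties first, backend count as a fallback) and then
# cached on the handle. `FakeBackend` is a stand-in for the mimir module;
# all names here are illustrative, not part of the Vizier API.
class FakeBackend:
    def countRows(self, identifier):
        print('counting rows for', identifier)
        return 42

class Handle:
    def __init__(self, identifier, properties, backend):
        self.identifier = identifier
        self._properties = properties
        self._backend = backend
        self._row_count = None

    def get_properties(self):
        return self._properties

    def get_row_count(self) -> int:
        if self._row_count is None:
            self._row_count = self.get_properties().get("count", None)
            if self._row_count is None:
                self._row_count = self._backend.countRows(self.identifier)
        return self._row_count

h = Handle('DS_1', {}, FakeBackend())
assert h.get_row_count() == 42  # hits the backend once
assert h.get_row_count() == 42  # served from the cache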
def execute_query(self, args, context):
    """Execute a SQL query in the given context.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    # Get the SQL source code in this cell and ensure that it is
    # terminated with a semicolon.
    source = args.get_value(cmd.PARA_SQL_SOURCE)
    if not source.endswith(';'):
        source = source + ';'
    ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
    # Get mapping of datasets in the context to their respective table
    # name in the Mimir backend.
    mimir_table_names = dict()
    for ds_name_o in context.datasets:
        dataset_id = context.datasets[ds_name_o]
        dataset = context.datastore.get_dataset(dataset_id)
        if dataset is None:
            raise ValueError('unknown dataset \'' + ds_name_o + '\'')
        mimir_table_names[ds_name_o] = dataset.table_name
    # Module outputs
    outputs = ModuleOutputs()
    try:
        # Create the view from the SQL source.
        view_name, dependencies = mimir.createView(mimir_table_names, source)
        sql = 'SELECT * FROM ' + view_name
        mimir_schema = mimir.getSchema(sql)
        columns = list()
        for col_id, col in enumerate(mimir_schema):
            columns.append(
                MimirDatasetColumn(
                    identifier=col_id,
                    name_in_dataset=col['name']
                )
            )
        row_count = mimir.countRows(view_name)
        if ds_name is None or ds_name == '':
            ds_name = "TEMPORARY_RESULT"
        ds = context.datastore.register_dataset(
            table_name=view_name,
            columns=columns,
            row_counter=row_count
        )
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10
        )
        ds_output['name'] = ds_name
        # Map the view's dependencies back to dataset identifiers in the
        # context (dataset names are matched case-insensitively).
        dependencies = {
            dep_name.lower(): context.datasets.get(dep_name.lower(), None)
            for dep_name in dependencies
        }
        outputs.stdout.append(DatasetOutput(ds_output))
        provenance = ModuleProvenance(
            write={
                ds_name: DatasetDescriptor(
                    identifier=ds.identifier,
                    columns=ds.columns,
                    row_count=ds.row_count
                )
            },
            read=dependencies
        )
    except Exception as ex:
        provenance = ModuleProvenance()
        outputs.error(ex)
    # Return execution result
    return ExecResult(
        is_success=(len(outputs.stderr) == 0),
        outputs=outputs,
        provenance=provenance
    )
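
# Standalone sketch of the dependency bookkeeping in execute_query above:
# mimir.createView reports the table names the view reads from, and the
# processor maps them (case-insensitively) back to dataset identifiers in
# the task context. All values here are illustrative.
context_datasets = {'orders': 'DS_A1', 'customers': 'DS_B2'}
reported_deps = ['ORDERS', 'Customers', 'lineitem']

read_provenance = {
    dep.lower(): context_datasets.get(dep.lower(), None)
    for dep in reported_deps
}
print(read_provenance)
# {'orders': 'DS_A1', 'customers': 'DS_B2', 'lineitem': None}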
def create_dataset(self, columns, rows, human_readable_name=None,
                   annotations=None, backend_options=[], dependencies=[]):
    """Create a new dataset in the datastore. Expects at least the list
    of columns and the rows for the dataset.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    # Get unique identifier for the new dataset.
    identifier = 'DS_' + get_unique_identifier()
    # Path of the temporary file that the rows are written to in CSV
    # format before loading.
    tmp_file = os.path.abspath(self.base_path + identifier)
    # Create a list of columns that contain the user-visible column name
    # and the name in the database, and build the projection clause.
    db_columns = list()
    col_sql = ''
    for col in map(base.sanitize_column_name, columns):
        db_columns.append(
            MimirDatasetColumn(
                identifier=col.identifier,
                name_in_dataset=col.name,
                name_in_rdb=col.name
            )
        )
        if col_sql == '':
            col_sql = col.name + ' AS ' + col.name
        else:
            col_sql = col_sql + ', ' + col.name + ' AS ' + col.name
    # Create the CSV file for the load.
    with open(tmp_file, 'w') as f_out:
        writer = csv.writer(f_out, quoting=csv.QUOTE_MINIMAL)
        writer.writerow([col.name_in_rdb for col in db_columns])
        for row in rows:
            record = helper.encode_values(row.values)
            writer.writerow(record)
    # Load the CSV file using Mimir's loadDataSource method.
    table_name = mimir.loadDataSource(
        tmp_file,
        True,
        True,
        human_readable_name=human_readable_name,
        backend_options=backend_options,
        dependencies=dependencies
    )
    os.remove(tmp_file)
    # Create a view that exposes the dataset columns.
    sql = 'SELECT ' + col_sql + ' FROM {{input}};'
    view_name, dependencies = mimir.createView(table_name, sql)
    # Get the number of rows in the view that was created in the backend.
    row_count = mimir.countRows(view_name)
    # Insert the new dataset metadata information into the datastore.
    return self.register_dataset(
        table_name=view_name,
        columns=db_columns,
        row_counter=row_count,
        annotations=annotations
    )
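
# Runnable sketch of how create_dataset assembles its projection query:
# each sanitized column is projected as `name AS name`, and the result is
# spliced into Mimir's `{{input}}` view template. The column names are
# illustrative.
column_names = ['NAME', 'AGE', 'ZIP_CODE']  # assume already sanitized
col_sql = ', '.join(name + ' AS ' + name for name in column_names)
sql = 'SELECT ' + col_sql + ' FROM {{input}};'
print(sql)
# SELECT NAME AS NAME, AGE AS AGE, ZIP_CODE AS ZIP_CODE FROM {{input}};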
def register_dataset(self, table_name, columns, row_counter=None,
                     annotations=None):
    """Create a new record for a database table or view. Note that this
    method does not actually create the table or view in the database but
    adds the dataset's metadata to the data store. The table or view will
    have been created by a load command or be the result from executing a
    lens or a VizUAL command.

    Parameters
    ----------
    table_name: string
        Name of relational database table or view containing the dataset.
    columns: list(vizier.datastore.mimir.MimirDatasetColumn)
        List of column names in the dataset schema and their corresponding
        names in the relational database table or view.
    row_counter: int, optional
        Counter for unique row ids
    annotations: vizier.datastore.metadata.DatasetMetadata, optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    # Query the backend for the schema of the table or view.
    sql = base.get_select_query(table_name, columns=columns) + ';'
    mimir_schema = mimir.getSchema(sql)
    # Create a mapping of column name (in database) to column type. This
    # mapping is then used to update the data type information for all
    # column descriptors.
    col_types = dict()
    for col in mimir_schema:
        col_types[base.sanitize_column_name(col['name'].upper())] = col['baseType']
    for col in columns:
        col.data_type = col_types[col.name_in_rdb]
    # Query the backend for the row count if none was given.
    if row_counter is None:
        row_counter = mimir.countRows(table_name)
    dataset = MimirDatasetHandle(
        identifier=get_unique_identifier(),
        columns=list(map(base.sanitize_column_name, columns)),
        table_name=table_name,
        row_counter=row_counter,
        annotations=annotations
    )
    # Create a new directory for the dataset if it doesn't exist.
    dataset_dir = self.get_dataset_dir(dataset.identifier)
    if not os.path.isdir(dataset_dir):
        os.makedirs(dataset_dir)
    # Write dataset and annotation files to disk.
    dataset.to_file(self.get_dataset_file(dataset.identifier))
    dataset.annotations.to_file(self.get_metadata_filename(dataset.identifier))
    return dataset
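
# Sketch of the type-resolution step in register_dataset above, with a
# hand-written schema standing in for mimir.getSchema and a simplified
# stand-in for base.sanitize_column_name: column descriptors are matched
# to their backend types by sanitized upper-case name.
def sanitize(name):  # stand-in for base.sanitize_column_name
    return name.upper().replace(' ', '_')

mimir_schema = [
    {'name': 'name', 'baseType': 'varchar'},
    {'name': 'age', 'baseType': 'int'},
]
col_types = {sanitize(col['name'].upper()): col['baseType'] for col in mimir_schema}
print(col_types)  # {'NAME': 'varchar', 'AGE': 'int'}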
def load_dataset(self, f_handle=None, url=None, detect_headers=True,
                 infer_types=True, load_format='csv', options=[],
                 human_readable_name=None):
    """Create a new dataset from a given file or url. Expects that exactly
    one of the file handle and the url is given. Raises ValueError if both
    are None or both are not None.

    Parameters
    ----------
    f_handle: vizier.filestore.base.FileHandle, optional
        Handle for an uploaded file on the associated file server.
    url: string, optional
        Url for the file source
    detect_headers: bool, optional
        Detect column names in loaded file if True
    infer_types: bool, optional
        Infer column types for loaded dataset if True
    load_format: string, optional
        Format identifier
    options: list, optional
        Additional options for Mimir's load command
    human_readable_name: string, optional
        Optional human readable name for the resulting table

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    if f_handle is None and url is None:
        raise ValueError('no load source given')
    elif f_handle is not None and url is not None:
        raise ValueError('too many load sources given')
    elif url is None:
        abspath = f_handle.filepath
    else:
        abspath = url
    # Load dataset into Mimir.
    init_load_name = mimir.loadDataSource(
        abspath,
        infer_types,
        detect_headers,
        load_format,
        human_readable_name,
        options
    )
    # Retrieve schema information for the created dataset.
    sql = 'SELECT * FROM ' + init_load_name
    mimir_schema = mimir.getSchema(sql)
    # Create the list of dataset columns and the projection clause.
    columns = list()
    col_sql = ''
    for col in mimir_schema:
        col_id = len(columns)
        name_in_dataset = base.sanitize_column_name(col['name'].upper())
        name_in_rdb = base.sanitize_column_name(col['name'].upper())
        columns.append(
            MimirDatasetColumn(
                identifier=col_id,
                name_in_dataset=name_in_dataset,
                name_in_rdb=name_in_rdb
            )
        )
        if col_sql == '':
            col_sql = name_in_dataset + ' AS ' + name_in_rdb
        else:
            col_sql = col_sql + ', ' + name_in_dataset + ' AS ' + name_in_rdb
    # Create view for the loaded dataset.
    sql = 'SELECT ' + col_sql + ' FROM {{input}};'
    view_name, dependencies = mimir.createView(init_load_name, sql)
    # Get the number of rows in the materialized view.
    row_count = mimir.countRows(view_name)
    return self.register_dataset(
        table_name=view_name,
        columns=columns,
        row_counter=row_count
    )
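
# Hypothetical usage sketch for load_dataset: load a CSV from a URL into
# the datastore. `datastore` is assumed to be a configured MimirDatastore
# instance and the URL is illustrative; neither is defined here.
ds = datastore.load_dataset(
    url='https://example.com/people.csv',
    detect_headers=True,
    infer_types=True,
    load_format='csv'
)
print(ds.identifier, [col.name for col in ds.columns])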
def compute(self, command_id, arguments, context):
    """Compute results for commands in the sampling package using the set
    of user-provided arguments and the current database state.

    Parameters
    ----------
    command_id: string
        Unique identifier for a command in a package declaration
    arguments: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
    input_dataset = context.get_dataset(input_ds_name)
    if input_dataset is None:
        raise ValueError('unknown dataset \'' + input_ds_name + '\'')
    output_ds_name = arguments.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
    if output_ds_name is None or output_ds_name == "":
        output_ds_name = input_ds_name + "_SAMPLE"
    output_ds_name = output_ds_name.lower()
    # Load the sampling configuration.
    sample_mode = None
    if command_id == cmd.BASIC_SAMPLE:
        sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
        if sampling_rate > 1.0 or sampling_rate < 0.0:
            raise Exception("Sampling rate must be between 0.0 and 1.0")
        sample_mode = {
            "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
            "probability": sampling_rate
        }
    elif command_id in (cmd.MANUAL_STRATIFIED_SAMPLE, cmd.AUTOMATIC_STRATIFIED_SAMPLE):
        column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
        column_defn = input_dataset.columns[column]
        if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
            strata = [
                {
                    "value": stratum.get_value(cmd.PARA_STRATUM_VALUE),
                    "probability": stratum.get_value(cmd.PARA_SAMPLING_RATE)
                }
                for stratum in arguments.get_value(cmd.PARA_STRATA)
            ]
        else:
            probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
            strata = self.get_automatic_strata(input_dataset, column_defn, probability)
        sample_mode = {
            "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
            "column": column_defn.name,
            "type": column_defn.data_type,
            "strata": strata
        }
    else:
        raise Exception("Unknown sampling command: {}".format(command_id))
    sample_view_id = mimir.createSample(input_dataset.table_name, sample_mode)
    row_count = mimir.countRows(sample_view_id)
    # Register the view with Vizier.
    ds = context.datastore.register_dataset(
        table_name=sample_view_id,
        columns=input_dataset.columns,
        row_counter=row_count
    )
    # Render the first rows of the sampled dataset as output.
    outputs = ModuleOutputs()
    ds_output = server.api.datasets.get_dataset(
        project_id=context.project_id,
        dataset_id=ds.identifier,
        offset=0,
        limit=10
    )
    ds_output['name'] = output_ds_name
    outputs.stdout.append(DatasetOutput(ds_output))
    # Record reads and writes.
    provenance = ModuleProvenance(
        read={input_ds_name: input_dataset.identifier},
        write={
            output_ds_name: DatasetDescriptor(
                identifier=ds.identifier,
                columns=ds.columns,
                row_count=ds.row_count
            )
        }
    )
    # Return task result.
    return ExecResult(outputs=outputs, provenance=provenance)
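
# Sketch of the two sample_mode configurations that compute builds above
# and hands to mimir.createSample. The literal mode strings stand in for
# the cmd.SAMPLING_MODE_* constants and are assumptions; all values shown
# are illustrative.
uniform_mode = {
    "mode": "uniform_probability",  # cmd.SAMPLING_MODE_UNIFORM_PROBABILITY
    "probability": 0.1              # keep roughly 10% of the rows
}
stratified_mode = {
    "mode": "stratified_on",        # cmd.SAMPLING_MODE_STRATIFIED_ON
    "column": "STATE",
    "type": "varchar",
    "strata": [
        {"value": "NY", "probability": 0.5},
        {"value": "CA", "probability": 0.1},
    ]
}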