def upload_file(self, filename):
    """Create a new entry from a given local file. Will make a copy of the
    given file.

    Raises ValueError if the given file does not exist.

    Parameters
    ----------
    filename: string
        Path to file on disk

    Returns
    -------
    vizier.filestore.base.FileHandle
    """
    # Ensure that the given file exists
    if not os.path.isfile(filename):
        raise ValueError('invalid file path \'' + str(filename) + '\'')
    name = os.path.basename(filename)
    # Create a new unique identifier for the file
    identifier = get_unique_identifier()
    file_dir = self.get_file_dir(identifier, create=True)
    output_file = os.path.join(file_dir, DATA_FILENAME)
    # Copy the uploaded file
    shutil.copyfile(filename, output_file)
    # Add file to file index
    f_handle = FileHandle(
        identifier,
        filepath=output_file,
        file_name=name
    )
    # Write metadata file
    write_metadata_file(file_dir, f_handle)
    return f_handle
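# Usage sketch for upload_file (hedged: the `FileSystemFilestore` class name
# and base directory below are illustrative assumptions, not taken from the
# code above):
#
#     fs = FileSystemFilestore('/tmp/vizier/fs')
#     f_handle = fs.upload_file('/data/people.csv')
#     print(f_handle.identifier, f_handle.file_name)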
def create_viztrail(self, env_id, properties):
    """Create a new viztrail.

    Raises ValueError if the given execution environment is unknown.

    Parameters
    ----------
    env_id: string
        Identifier for the workflow execution environment that is used for
        the new viztrail
    properties: dict
        Set of properties for the new viztrail

    Returns
    -------
    vizier.workflow.base.ViztrailHandle
    """
    if env_id not in self.envs:
        raise ValueError('unknown execution environment \'' + env_id + '\'')
    # Get unique viztrail identifier
    identifier = get_unique_identifier()
    # Create viztrail directory
    fs_dir = os.path.join(self.base_dir, identifier)
    os.makedirs(fs_dir)
    # Create new viztrail and add to cache
    viztrail = FileSystemViztrailHandle.create_viztrail(
        fs_dir,
        identifier,
        self.envs[env_id],
        properties=properties
    )
    self.cache[viztrail.identifier] = viztrail
    return viztrail
def upload_file():
    """Upload CSV file (POST) - Upload a CSV or TSV file containing a full
    dataset.
    """
    # The upload request may contain a file object or a Url from where to
    # download the data.
    if request.files and 'file' in request.files:
        file = request.files['file']
        # A browser may submit an empty part without a filename
        if file.filename == '':
            raise InvalidRequest('empty file name')
        # Save uploaded file to temp directory
        identifier = get_unique_identifier()
        upload_file = api.fileserver.get_filepath(identifier)
        file.save(upload_file)
        prov = {'filename': file.filename}
    elif request.json and 'url' in request.json:
        obj = validate_json_request(request, required=['url'])
        url = obj['url']
        # Pass the Url on as the upload source
        upload_file = url
        prov = {'url': url}
    else:
        raise InvalidRequest('no file or url specified in request')
    try:
        return jsonify(api.upload_file(upload_file, provenance=prov)), 201
    except ValueError as ex:
        raise InvalidRequest(str(ex))
def upload_stream(self, file, file_name):
    """Create a new entry from a given file stream. Will copy the given
    file to a file in the base directory.

    Parameters
    ----------
    file: werkzeug.datastructures.FileStorage
        File object (e.g., uploaded via HTTP request)
    file_name: string
        Name of the file

    Returns
    -------
    vizier.filestore.base.FileHandle
    """
    # Create a new unique identifier for the file
    identifier = get_unique_identifier()
    file_dir = self.get_file_dir(identifier, create=True)
    output_file = os.path.join(file_dir, DATA_FILENAME)
    # Save the file object to the new file path
    file.save(output_file)
    f_handle = FileHandle(
        identifier,
        filepath=output_file,
        file_name=file_name
    )
    # Write metadata file
    write_metadata_file(file_dir, f_handle)
    return f_handle
def create_object(self, value, obj_type="text/plain"):
    """Create a new data object in the datastore as a Mimir blob and
    return the unique identifier of the created object.

    Parameters
    ----------
    value: bytes
        The value of the object
    obj_type: string, optional
        The type of the object

    Returns
    -------
    string
        Unique object identifier
    """
    return mimir.createBlob(
        identifier="{}".format(get_unique_identifier()),
        blob_type=obj_type,
        data=value
    )
def from_file(f_handle):
    """Read dataset from file. Expects a verified CSV/TSV file where the
    first row contains the column names.

    Parameters
    ----------
    f_handle : vizier.filestore.base.FileHandle
        Handle for an uploaded file on a file server

    Returns
    -------
    vizier.datastore.base.Dataset
    """
    if not f_handle.is_verified_csv:
        raise ValueError(
            'failed to create dataset from file \'' + f_handle.name + '\'')
    # Read all information and return an InMemDatasetHandle
    columns = []
    rows = []
    with f_handle.open() as csvfile:
        reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
        # The first row contains the column names
        for col_name in next(reader):
            columns.append(DatasetColumn(len(columns), col_name.strip()))
        for row in reader:
            values = [cast(v.strip()) for v in row]
            rows.append(DatasetRow(len(rows), values))
    # Return InMemDatasetHandle
    return InMemDatasetHandle(
        identifier=get_unique_identifier(),
        columns=columns,
        rows=rows,
        column_counter=len(columns),
        row_counter=len(rows)
    )
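# Usage sketch for from_file (assumes `f_handle` is a verified CSV file
# handle, e.g. the result of an upload_file call as shown above, and that
# the returned handle exposes its constructor arguments as attributes):
#
#     dataset = from_file(f_handle)
#     print([col.name for col in dataset.columns], dataset.row_counter)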
def download_file(self, url, username=None, password=None):
    """Create a local copy of the identified web resource.

    Parameters
    ----------
    url : string
        Unique resource identifier for external resource that is accessed
    username: string, optional
        Optional user name for authentication
    password: string, optional
        Optional password for authentication

    Returns
    -------
    vizier.filestore.base.FileHandle
    """
    # Get unique identifier and output file
    identifier = get_unique_identifier()
    file_dir = self.get_file_dir(identifier, create=True)
    output_file = os.path.join(file_dir, DATA_FILENAME)
    # Write web resource to output file. The response body is bytes, so
    # the output file is always opened in binary mode.
    response = urllib.request.urlopen(url)
    filename = get_download_filename(url, response.info())
    with open(output_file, 'wb') as f:
        f.write(response.read())
    # Add file to file index
    f_handle = FileHandle(
        identifier,
        filepath=output_file,
        file_name=filename
    )
    # Write metadata file
    write_metadata_file(file_dir, f_handle)
    return f_handle
def create_object(self, value, obj_type="text/plain"):
    """Create a new data object in the datastore and return its unique
    identifier. Generates identifiers until one is found that does not
    collide with an existing object file.

    Parameters
    ----------
    value: bytes
        The value of the object
    obj_type: string, optional
        The type of the object

    Returns
    -------
    string
        Unique object identifier
    """
    data_object_filename = None
    identifier = None
    # Generate identifiers until the target file does not already exist
    while data_object_filename is None:
        identifier = "OBJ_" + get_unique_identifier()
        data_object_filename = self.get_data_object_file(identifier)
        if os.path.exists(data_object_filename):
            data_object_filename = None
    # Write the object value and its mime type to disk
    with open(data_object_filename, "wb") as f:
        f.write(value)
    with open(data_object_filename + ".mime", "w") as f:
        f.write(obj_type)
    return identifier
def create_dataset(
        self, identifier=None, columns=None, rows=None, column_counter=None,
        row_counter=None, annotations=None):
    """Create a new dataset in the data store for the given data.

    Raises ValueError if (1) any of the column or row identifier have a
    negative value, or (2) if the given column or row counter have value
    lower or equal to any of the column or row identifier.

    Parameters
    ----------
    identifier: string, optional
        Unique dataset identifier
    columns: list(vizier.datastore.base.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.base.DatasetRow)
        List of dataset rows.
    column_counter: int, optional
        Counter to generate unique column identifier
    row_counter: int, optional
        Counter to generate unique row identifier
    annotations: vizier.datastore.metadata.DatasetMetadata, optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.mem.InMemDatasetHandle
    """
    # Set columns and rows if not given
    if columns is None:
        columns = list()
    if rows is None:
        rows = list()
    else:
        # Validate the given dataset schema. Will raise ValueError in case
        # of schema violations
        validate_schema(columns, rows)
    if identifier is None:
        identifier = get_unique_identifier()
    # Set counters to max. identifier + 1 if not given
    if column_counter is None:
        column_counter = max_column_id(columns) + 1
    if row_counter is None:
        row_counter = max_row_id(rows) + 1
    # Make sure annotations is not None
    if annotations is None:
        annotations = DatasetMetadata()
    self.datasets[identifier] = InMemDatasetHandle(
        identifier=identifier,
        columns=list(columns),
        rows=list(rows),
        column_counter=column_counter,
        row_counter=row_counter,
        annotations=annotations.copy_metadata()
    )
    return self.datasets[identifier]
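# Worked example of the counter defaults (hedged: `store` is a hypothetical
# instance of the in-memory datastore; DatasetColumn/DatasetRow as used
# above):
#
#     columns = [DatasetColumn(0, 'Name'), DatasetColumn(1, 'Age')]
#     rows = [DatasetRow(0, ['Alice', 23]), DatasetRow(1, ['Bob', 32])]
#     ds = store.create_dataset(columns=columns, rows=rows)
#     # column_counter defaults to max column id + 1 = 2 and row_counter to
#     # max row id + 1 = 2, satisfying the documented invariant that both
#     # counters exceed every existing identifier.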
def create_dataset(self, columns, rows, annotations=None):
    """Create a new dataset in the datastore. Expects at least the list of
    columns and the rows for the dataset.

    Raises ValueError if (1) the column identifier are not unique, (2) the
    row identifier are not unique, (3) the number of columns and values in
    a row do not match, (4) any of the column or row identifier have a
    negative value, or (5) if the given column or row counter have value
    lower or equal to any of the column or row identifier.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    # Validate (i) that each column has a unique identifier, (ii) each row
    # has a unique identifier, and (iii) that every row has exactly one
    # value per column.
    _, max_row_id = validate_dataset(columns=columns, rows=rows)
    # Get new identifier and create directory for new dataset
    identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    data_file = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(data_file).write(rows)
    # Filter annotations for non-existing resources
    if annotations is not None:
        annotations = annotations.filter(
            columns=[c.identifier for c in columns],
            rows=[r.identifier for r in rows]
        )
    # Create dataset and write dataset file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        data_file=data_file,
        row_count=len(rows),
        max_row_id=max_row_id,
        annotations=annotations
    )
    dataset.to_file(
        descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
    # Write metadata file if annotations are given
    if annotations is not None:
        dataset.annotations.to_file(self.get_metadata_filename(identifier))
    # Return handle for new dataset
    return DatasetDescriptor(
        identifier=dataset.identifier,
        columns=dataset.columns,
        row_count=dataset.row_count
    )
def load_dataset(
        self, f_handle: FileHandle,
        proposed_schema: List[Tuple[str, str]] = []
) -> FileSystemDatasetHandle:
    """Create a new dataset from a given file.

    Raises ValueError if the given file could not be loaded as a dataset.

    Parameters
    ----------
    f_handle : vizier.filestore.base.FileHandle
        Handle for an uploaded file

    Returns
    -------
    vizier.datastore.fs.dataset.FileSystemDatasetHandle
    """
    # The file handle might be None in which case an exception is raised
    if f_handle is None:
        raise ValueError('unknown file')
    # Expects a file in a supported tabular data format.
    if not f_handle.is_tabular:
        raise ValueError(
            'cannot create dataset from file \'' + f_handle.name + '\'')
    # Open the file as a csv file. Expects that the first row contains the
    # column names. Read dataset schema and dataset rows into two separate
    # lists.
    columns: List[DatasetColumn] = []
    rows: List[DatasetRow] = []
    with f_handle.open() as csvfile:
        reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
        for col_name in next(reader):
            columns.append(
                DatasetColumn(
                    identifier=len(columns),
                    name=col_name.strip()
                )
            )
        for row in reader:
            values = [cast(v.strip()) for v in row]
            rows.append(
                DatasetRow(identifier=str(len(rows)), values=values))
    # Get unique identifier and create subfolder for the new dataset
    identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    data_file = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(data_file).write(rows)
    # Create dataset and write descriptor to file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        data_file=data_file,
        row_count=len(rows),
        max_row_id=len(rows) - 1
    )
    dataset.to_file(
        descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
    return dataset
def get_tempfile():
    """Return the path to a temporary CSV file. Try to get a unique name
    to avoid problems with existing datasets.

    Returns
    -------
    string
    """
    tmp_prefix = 'DS_' + get_unique_identifier()
    return tempfile.mkstemp(suffix='.csv', prefix=tmp_prefix)[1]
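# Usage sketch for get_tempfile (the concrete path below is illustrative
# only; mkstemp chooses the actual directory and random suffix):
#
#     path = get_tempfile()
#     # e.g. '/tmp/DS_1b9acc...3fwq81xk.csv'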
def create_dataset(
        self, columns: List[DatasetColumn], rows: List[DatasetRow],
        properties: Optional[Dict[str, Any]] = None,
        human_readable_name: str = "Untitled Dataset",
        backend_options: Optional[List[Tuple[str, str]]] = None,
        dependencies: Optional[List[str]] = None) -> MimirDatasetHandle:
    """Create a new dataset in the datastore. Expects at least the list of
    columns and the rows for the dataset.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    properties: dict(string, any), optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    # Replace None defaults and get a unique identifier for the new dataset
    properties = {} if properties is None else properties
    backend_options = [] if backend_options is None else backend_options
    dependencies = [] if dependencies is None else dependencies
    identifier = 'DS_' + get_unique_identifier()
    # Ensure that all columns are Mimir dataset columns
    columns = [
        col if isinstance(col, MimirDatasetColumn) else MimirDatasetColumn(
            identifier=col.identifier,
            name_in_dataset=col.name,
            data_type=col.data_type
        ) for col in columns
    ]
    table_name, schema = mimir.loadDataInline(
        schema=[{
            "name": base.sanitize_column_name(col.name),
            "type": col.data_type
        } for col in columns],
        rows=[row.values for row in rows],
        result_name=identifier,
        human_readable_name=human_readable_name,
        dependencies=dependencies,
        properties=properties
    )
    # Insert the new dataset metadata information into the datastore
    return MimirDatasetHandle.from_mimir_result(
        table_name=table_name,
        schema=schema,
        properties=properties,
        name=human_readable_name
    )
def unload_dataset(self, dataset_name, format='csv', options=[], filename=""):
    """Export a dataset with a given name.

    Raises ValueError if the given dataset could not be exported.

    Parameters
    ----------
    dataset_name: string
        Name of the dataset to unload
    format: string
        Format for output (csv, json, etc.)
    options: list
        Options for data unload
    filename: string
        The output filename - may be empty if outputting to a database

    Returns
    -------
    vizier.filestore.base.FileHandle
    """
    name = os.path.basename(filename).lower()
    # Create a new unique identifier for the file
    identifier = get_unique_identifier()
    abspath = ""
    if not filename == "":
        abspath = os.path.abspath(
            (r'%s' % os.getcwd().replace('\\', '/')) + '/' + identifier)
    mimir._mimir.unloadDataSource(
        dataset_name, abspath, format,
        mimir._jvmhelper.to_scala_seq(options))
    created_at = get_current_time()
    output_file = abspath
    # Add file to file index
    f_handle = FileHandle(
        identifier, name, output_file, created_at, properties=dict())
    return f_handle
def unload_dataset(
        self, dataset: DatasetDescriptor, datastore: Datastore,
        filestore: Filestore, unload_format: str = 'csv',
        options: List[Dict[str, Any]] = [],
        resources: Optional[Dict[str, Any]] = None):
    """Export (or unload) a dataset to a given file format. The resources
    refer to any resources (e.g., file identifier) that have been
    generated by a previous execution of the respective task. This makes
    it possible to associate an identifier with a downloaded file to avoid
    future downloads (unless the reload flag is True).

    Parameters
    ----------
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets
    filestore: vizier.filestore.Filestore
        Filestore to retrieve uploaded datasets
    unload_format: string, optional
        Format identifier
    options: list, optional
        Additional options for Mimir's load command
    resources: dict, optional
        Dictionary of additional resources (i.e., key,value pairs) that
        were generated during a previous execution of the associated
        module

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    f_handles = None
    result_resources = dict()
    assert isinstance(datastore, MimirDatastore)
    assert isinstance(filestore, FileSystemFilestore)
    if dataset is not None:
        f_handles = datastore.unload_dataset(
            filepath=filestore.get_file_dir(get_unique_identifier()),
            dataset_name=dataset.identifier,
            format=unload_format,
            options=options
        )
    result_resources[base.RESOURCE_FILEID] = f_handles
    return VizualApiResult(dataset=dataset, resources=result_resources)
def unload_dataset(
        self, filepath, dataset_name, format='csv', options=[], filename=""):
    """Export a dataset with a given name.

    Raises ValueError if the given dataset could not be exported.

    Parameters
    ----------
    dataset_name: string
        Name of the dataset to unload
    format: string
        Format for output (csv, json, etc.)
    options: list
        Options for data unload
    filename: string
        The output filename - may be empty if outputting to a database

    Returns
    -------
    list(vizier.filestore.base.FileHandle)
    """
    name = os.path.basename(filepath).lower()
    basepath = filepath.replace(name, "")
    # Get the absolute path for the unload target
    abspath = os.path.abspath((r'%s' % filepath))
    exported_files = mimir.unloadDataSource(
        dataset_name, abspath, format, options)
    file_handles = []
    # Move each exported file into its own subdirectory, named by a new
    # unique identifier, and create a file handle for it
    for output_file in exported_files:
        name = os.path.basename(output_file).lower()
        identifier = get_unique_identifier()
        file_dir = os.path.join(basepath, identifier)
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        fs_output_file = os.path.join(file_dir, DATA_FILENAME)
        shutil.move(os.path.join(filepath, output_file), fs_output_file)
        f_handle = FileHandle(identifier, output_file, name)
        file_handles.append(f_handle)
        write_metadata_file(file_dir, f_handle)
    return file_handles
def __init__(self, project_id, branch_id, module_id, controller):
    """Initialize the components of the extended task handle. Generates a
    unique identifier for the task.

    Parameters
    ----------
    project_id: string
        Unique project identifier
    branch_id: string
        Unique branch identifier
    module_id: string
        Unique module identifier
    controller: vizier.engine.base.VizierEngine
        Reference to the vizier engine
    """
    super(ExtendedTaskHandle, self).__init__(
        task_id=get_unique_identifier(),
        project_id=project_id,
        controller=controller
    )
    self.branch_id = branch_id
    self.module_id = module_id
def __init__(
        self, dataset_name, identifier=None, chart_name=None, data=None,
        x_axis=None, chart_type=None, grouped_chart=True):
    """Initialize the view handle.

    Parameters
    ----------
    identifier: string
        Unique view identifier
    dataset_name: string
        Name used to reference a dataset in the curation workflow within
        which this view is defined
    chart_name: string
        Unique chart name for reference
    data: list(vizier.plot.view.DataSeriesHandle), optional
        List of data series handles defining the data series in the chart
    x_axis: int, optional
        Optional index of the data series that is used for x-axis labels
    chart_type: string, optional
        Type of chart that is being displayed
    grouped_chart: bool, optional
        Flag indicating whether data series are grouped into single chart
    """
    self.dataset_name = dataset_name
    self.identifier = identifier if identifier is not None else get_unique_identifier()
    self.chart_name = chart_name if chart_name is not None else 'Chart'
    self.data = data if data is not None else list()
    self.x_axis = x_axis
    self.chart_type = chart_type if chart_type is not None else 'Bar Chart'
    self.grouped_chart = grouped_chart
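# Usage sketch of the constructor defaults (hedged: `ChartViewHandle` is an
# assumed name for the class this initializer belongs to):
#
#     view = ChartViewHandle(dataset_name='people')
#     # view.chart_name == 'Chart', view.chart_type == 'Bar Chart',
#     # view.grouped_chart is True, and view.identifier is a fresh
#     # unique identifier.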
def create_dataset(
        self, identifier=None, columns=list(), rows=list(),
        column_counter=0, row_counter=0, annotations=None):
    """Create a new dataset in the data store for the given data.

    Raises ValueError if the number of values in each row of the dataset
    doesn't match the number of columns in the dataset schema.

    Parameters
    ----------
    columns: list(vizier.datastore.base.DatasetColumn)
        List of columns in the dataset schema
    rows: list(vizier.datastore.base.DatasetRow)
        List of dataset rows

    Returns
    -------
    vizier.datastore.mimir.MimirDatasetHandle
    """
    # Get unique identifier for new dataset
    identifier = 'DS_' + get_unique_identifier()
    # Write rows to temporary file in CSV format
    tmp_file = os.path.abspath(
        self.base_dir + '/../../filestore/files/' + identifier)
    # Create a list of columns that contain the user-visible column name
    # and the name in the database
    db_columns = list()
    colSql = 'ROWID() AS ' + ROW_ID
    for col in map(lambda cn: self.bad_col_names.get(cn, cn), columns):
        db_columns.append(
            MimirDatasetColumn(
                identifier=col.identifier,
                name_in_dataset=col.name,
                name_in_rdb=col.name  # COL_PREFIX + str(len(db_columns))
            )
        )
        colSql = colSql + ', ' + col.name + ' AS ' + col.name
    # Create CSV file for load
    with open(tmp_file, 'w') as f_out:
        writer = csv.writer(f_out, quoting=csv.QUOTE_MINIMAL)
        writer.writerow([col.name_in_rdb for col in db_columns])
        for row in rows:
            record = encode_values(row.values)
            writer.writerow(record)
    # Load CSV file using Mimir's loadCSV method
    table_name = mimir._mimir.loadCSV(tmp_file, ',', True, True)
    sql = 'SELECT ' + colSql + ' FROM {{input}}'
    view_name = mimir._mimir.createView(table_name, sql)
    # sql = 'SELECT ' + ROW_ID + ' FROM ' + view_name
    # rs = json.loads(mimir._mimir.vistrailsQueryMimirJson(sql, False, False))
    # List of row ids in the new dataset
    # row_ids = rs['prov']  # range(len(rs['prov']))
    sql = 'SELECT COUNT(*) AS RECCNT FROM ' + view_name
    rs_count = json.loads(
        mimir._mimir.vistrailsQueryMimirJson(sql, False, False))
    row_count = int(rs_count['data'][0][0])
    sql = 'SELECT * FROM ' + view_name + ' LIMIT ' + str(
        config.DEFAULT_MAX_ROW_LIMIT)
    rs = json.loads(
        mimir._mimir.vistrailsQueryMimirJson(sql, False, False))
    row_ids = rs['prov']
    # Insert the new dataset metadata information into the datastore
    return self.register_dataset(
        table_name=view_name,
        columns=db_columns,
        row_ids=row_ids,
        row_counter=row_count,
        annotations=annotations
    )
def load_dataset(filename):
    """Create a table in Mimir from the given file.

    Parameters
    ----------
    filename: string
        Path to the file
    """
    # Create a copy of the original file under a unique name.
    tmp_file = get_tempfile()
    shutil.copyfile(filename, tmp_file)
    # Load dataset and retrieve the result to get the dataset schema
    init_load_name = mimir._mimir.loadCSV(tmp_file)
    sql = 'SELECT * FROM ' + init_load_name
    rs = mimir._mimir.vistrailsQueryMimir(sql, True, True)
    mimir_schema = rs.schema()
    reader = csv.reader(
        StringIO(rs.csvStr()), delimiter=',', skipinitialspace=True)
    # Write retrieved result to temp file. Add unique column names and row
    # identifier
    os.remove(tmp_file)
    tmp_file = get_tempfile()
    # List of Mimir dataset column descriptors for the dataset schema
    columns = list()
    with open(tmp_file, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_NONE)
        # Get dataset schema information from retrieved result
        out_schema = [ROW_ID.upper()]
        for name_in_dataset in next(reader):
            name_in_dataset = name_in_dataset.strip()
            col_id = len(columns)
            name_in_rdb = COL_PREFIX + str(col_id)
            out_schema.append(name_in_rdb)
            columns.append(
                MimirDatasetColumn(
                    col_id,
                    name_in_dataset,
                    name_in_rdb,
                    data_type=str(mimir_schema.get(name_in_dataset))[5:-1]
                )
            )
        writer.writerow(out_schema)
        # Remaining rows are dataset rows
        row_ids = list()
        for row in reader:
            row_id = len(row_ids)
            row_ids.append(row_id)
            out_row = [str(row_id)]
            for val in row:
                val = val.strip()
                if val.startswith('\'') and val.endswith('\''):
                    val = val[1:-1]
                elif val == 'NULL':
                    val = ''
                out_row.append(val)
            writer.writerow(out_row)
    table_name = mimir._mimir.loadCSV(tmp_file)
    os.remove(tmp_file)
    sql = 'SELECT * FROM ' + table_name
    rs = mimir._mimir.vistrailsQueryMimir(sql, True, True)
    reasons = rs.celReasons()
    uncertainty = rs.colsDet()
    return MimirDatasetHandle(
        get_unique_identifier(),
        columns,
        table_name,
        row_ids,
        len(columns),
        len(row_ids),
        annotations=get_annotations(columns, row_ids, reasons, uncertainty)
    )
def load_dataset(
        self,
        f_handle: Optional[FileHandle] = None,
        proposed_schema: List[Tuple[str, str]] = [],
        url: Optional[str] = None,
        detect_headers: bool = True,
        infer_types: bool = True,
        properties: Dict[str, Any] = {},
        load_format: str = 'csv',
        options: List[Dict[str, str]] = [],
        human_readable_name: Optional[str] = None,
):
    """Create a new dataset from a given file or url. Expects that exactly
    one of the file handle and the url is not None. Raises ValueError if
    both are None or both are not None.

    Parameters
    ----------
    f_handle : vizier.filestore.base.FileHandle, optional
        Handle for an uploaded file on the associated file server
    url: string, optional
        Url for the file source
    detect_headers: bool, optional
        Detect column names in loaded file if True
    infer_types: bool, optional
        Infer column types for loaded dataset if True
    load_format: string, optional
        Format identifier
    options: list, optional
        Additional options for Mimir's load command
    human_readable_name: string, optional
        Optional human readable name for the resulting table

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    assert url is not None or f_handle is not None
    if f_handle is None and url is None:
        raise ValueError('no load source given')
    elif f_handle is not None and url is not None:
        raise ValueError('too many load sources given')
    elif url is None and f_handle is not None:
        # os.path.abspath((r'%s' % os.getcwd().replace('\\','/')) + '/' + f_handle.filepath)
        abspath = f_handle.filepath
    elif url is not None:
        abspath = url
    # For ease of debugging, associate each table with a prefix identifying
    # its nature
    prefix = load_format if load_format in SAFE_FORMAT_IDENTIFIER_PREFIXES else "LOADED_"
    # Load dataset into Mimir
    table_name, mimirSchema = mimir.loadDataSource(
        abspath,
        infer_types,
        detect_headers,
        load_format,
        human_readable_name,
        options,
        properties=properties,
        result_name=prefix + get_unique_identifier(),
        proposed_schema=proposed_schema
    )
    return MimirDatasetHandle.from_mimir_result(
        table_name, mimirSchema, properties, human_readable_name)
def compute(self, command_id: str, arguments: "ModuleArguments",
            context: TaskContext) -> ExecResult:
    """Compute results for commands in the sampling package using the set
    of user-provided arguments and the current database state.

    Parameters
    ----------
    command_id: string
        Unique identifier for a command in a package declaration
    arguments: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
    input_dataset: DatasetDescriptor = context.get_dataset(input_ds_name)
    if input_dataset is None:
        raise ValueError('unknown dataset \'' + input_ds_name + '\'')
    output_ds_name = arguments.get_value(
        cmd.PARA_OUTPUT_DATASET, raise_error=False)
    if output_ds_name is None or output_ds_name == "":
        output_ds_name = input_ds_name + "_SAMPLE"
    output_ds_name = output_ds_name.lower()
    # Load the sampling configuration
    sample_mode = None
    if command_id == cmd.BASIC_SAMPLE:
        sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
        if sampling_rate > 1.0 or sampling_rate < 0.0:
            raise Exception("Sampling rate must be between 0.0 and 1.0")
        sample_mode = {
            "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
            "probability": sampling_rate
        }
    elif command_id in (cmd.MANUAL_STRATIFIED_SAMPLE,
                        cmd.AUTOMATIC_STRATIFIED_SAMPLE):
        column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
        column_defn = input_dataset.columns[column]
        if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
            strata = [{
                "value": stratum.get_value(cmd.PARA_STRATUM_VALUE),
                "probability": stratum.get_value(cmd.PARA_SAMPLING_RATE)
            } for stratum in arguments.get_value(cmd.PARA_STRATA)]
        else:
            probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
            strata = self.get_automatic_strata(
                input_dataset, column_defn, probability)
        sample_mode = {
            "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
            "column": column_defn.name,
            "type": column_defn.data_type,
            "strata": strata
        }
    else:
        raise Exception("Unknown sampling command: {}".format(command_id))
    table_name, schema = mimir.createSample(
        input_dataset.identifier,
        sample_mode,
        result_name="SAMPLE_" + get_unique_identifier()
    )
    ds = MimirDatasetHandle.from_mimir_result(
        table_name, schema, properties={}, name=output_ds_name)
    # And start rendering some output
    outputs = ModuleOutputs()
    ds_output = server.api.datasets.get_dataset(
        project_id=context.project_id,
        dataset_id=ds.identifier,
        offset=0,
        limit=10
    )
    if ds_output is not None:
        ds_output['name'] = output_ds_name
        outputs.stdout.append(DatasetOutput(ds_output))
    else:
        outputs.stderr.append(TextOutput("Error displaying dataset"))
    # Record reads and writes
    provenance = ModuleProvenance(
        read={input_ds_name: input_dataset.identifier},
        write={
            output_ds_name: DatasetDescriptor(
                identifier=ds.identifier,
                name=output_ds_name,
                columns=ds.columns
            )
        }
    )
    # Return task result
    return ExecResult(outputs=outputs, provenance=provenance)
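# For reference, the sampling configuration handed to mimir.createSample by
# the BASIC_SAMPLE branch above has this shape (a 10% uniform sample; the
# literal mode string depends on the cmd package constants):
#
#     sample_mode = {
#         "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
#         "probability": 0.1
#     }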
def register_dataset(
        self, table_name, columns, row_counter=None, annotations=None):
    """Create a new record for a database table or view. Note that this
    method does not actually create the table or view in the database but
    adds the dataset's metadata to the data store. The table or view will
    have been created by a load command or be the result from executing a
    lens or a VizUAL command.

    Parameters
    ----------
    table_name: string
        Name of relational database table or view containing the dataset.
    columns: list(vizier.datastore.mimir.MimirDatasetColumn)
        List of column names in the dataset schema and their corresponding
        names in the relational database table or view.
    row_counter: int
        Counter for unique row ids
    annotations: vizier.datastore.metadata.DatasetMetadata
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    # Query the database for the schema of the select query over the given
    # table. mimir_schema will contain the returned Mimir schema
    # information.
    sql = base.get_select_query(table_name, columns=columns) + ';'
    mimir_schema = mimir.getSchema(sql)
    # Create a mapping of column name (in database) to column type. This
    # mapping is then used to update the data type information for all
    # column descriptors.
    col_types = dict()
    for col in mimir_schema:
        col_types[base.sanitize_column_name(col['name'].upper())] = col['baseType']
    for col in columns:
        col.data_type = col_types[col.name_in_rdb]
    # Set row counter to max. row id + 1 if None
    if row_counter is None:
        row_counter = mimir.countRows(table_name)
    dataset = MimirDatasetHandle(
        identifier=get_unique_identifier(),
        columns=list(map(base.sanitize_column_name, columns)),
        table_name=table_name,
        row_counter=row_counter,
        annotations=annotations
    )
    # Create a new directory for the dataset if it doesn't exist.
    dataset_dir = self.get_dataset_dir(dataset.identifier)
    if not os.path.isdir(dataset_dir):
        os.makedirs(dataset_dir)
    # Write dataset and annotation file to disk
    dataset.to_file(self.get_dataset_file(dataset.identifier))
    dataset.annotations.to_file(
        self.get_metadata_filename(dataset.identifier))
    return dataset
def create_dataset(
        self, columns, rows, human_readable_name=None, annotations=None,
        backend_options=[], dependencies=[]):
    """Create a new dataset in the datastore. Expects at least the list of
    columns and the rows for the dataset.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    # Get unique identifier for new dataset
    identifier = 'DS_' + get_unique_identifier()
    # Write rows to temporary file in CSV format
    tmp_file = os.path.abspath(self.base_path + identifier)
    # Create a list of columns that contain the user-visible column name
    # and the name in the database
    db_columns = list()
    colSql = ''
    for col in map(base.sanitize_column_name, columns):
        db_columns.append(
            MimirDatasetColumn(
                identifier=col.identifier,
                name_in_dataset=col.name,
                name_in_rdb=col.name
            )
        )
        if colSql == '':
            colSql = col.name + ' AS ' + col.name
        else:
            colSql = colSql + ', ' + col.name + ' AS ' + col.name
    # Create CSV file for load
    with open(tmp_file, 'w') as f_out:
        writer = csv.writer(f_out, quoting=csv.QUOTE_MINIMAL)
        writer.writerow([col.name_in_rdb for col in db_columns])
        for row in rows:
            record = helper.encode_values(row.values)
            writer.writerow(record)
    # Load the CSV file using Mimir's loadDataSource method
    table_name = mimir.loadDataSource(
        tmp_file,
        True,
        True,
        human_readable_name=human_readable_name,
        backend_options=backend_options,
        dependencies=dependencies
    )
    os.remove(tmp_file)
    sql = 'SELECT ' + colSql + ' FROM {{input}};'
    view_name, dependencies = mimir.createView(table_name, sql)
    # Get number of rows in the view that was created in the backend
    row_count = mimir.countRows(view_name)
    # Insert the new dataset metadata information into the datastore
    return self.register_dataset(
        table_name=view_name,
        columns=db_columns,
        row_counter=row_count,
        annotations=annotations
    )
def append_workflow_module(self, project_id, branch_id, command):
    """Append a module to the workflow at the head of the given viztrail
    branch. The modified workflow will be executed. The result is the new
    head of the branch.

    Returns the handle for the new module in the modified workflow. The
    result is None if the specified project or branch do not exist.

    Parameters
    ----------
    project_id: string
        Unique project identifier
    branch_id : string
        Unique branch identifier
    command : vizier.viztrail.command.ModuleCommand
        Specification of the command that is to be executed by the
        appended workflow module

    Returns
    -------
    vizier.viztrail.module.base.ModuleHandle
    """
    with self.backend.lock:
        # Get the handle for the specified branch
        branch = self.projects.get_branch(
            project_id=project_id, branch_id=branch_id)
        if branch is None:
            return None
        # Get the current database state from the last module in the
        # current branch head. At the same time we retrieve the list of
        # modules for the current head of the branch.
        head = branch.get_head()
        if head is not None and len(head.modules) > 0:
            datasets = head.modules[-1].datasets
            modules = head.modules
            is_active = head.is_active
            is_error = head.modules[-1].is_error or head.modules[-1].is_canceled
        else:
            datasets = dict()
            modules = list()
            is_active = False
            is_error = False
        # Get the external representation for the command
        external_form = command.to_external_form(
            command=self.packages[command.package_id].get(command.command_id),
            datasets=datasets
        )
        # If the workflow is not active and the command can be executed
        # synchronously we run the command immediately and return the
        # completed workflow. Otherwise, a pending workflow is created.
        if not is_active and self.backend.can_execute(command):
            ts_start = get_current_time()
            result = self.backend.execute(
                task=TaskHandle(
                    task_id=get_unique_identifier(),
                    project_id=project_id,
                    controller=self
                ),
                command=command,
                context=task_context(datasets)
            )
            ts = ModuleTimestamp(
                created_at=ts_start,
                started_at=ts_start,
                finished_at=get_current_time()
            )
            # Depending on the execution outcome create a handle for the
            # executed module
            if result.is_success:
                module = ModuleHandle(
                    state=mstate.MODULE_SUCCESS,
                    command=command,
                    external_form=external_form,
                    timestamp=ts,
                    datasets=result.provenance.get_database_state(
                        modules[-1].datasets if len(modules) > 0 else dict()),
                    outputs=result.outputs,
                    provenance=result.provenance
                )
            else:
                module = ModuleHandle(
                    state=mstate.MODULE_ERROR,
                    command=command,
                    external_form=external_form,
                    timestamp=ts,
                    outputs=result.outputs
                )
            workflow = branch.append_workflow(
                modules=modules,
                action=wf.ACTION_APPEND,
                command=command,
                pending_modules=[module]
            )
        else:
            # Create a new workflow by appending one module to the current
            # head of the branch. The module state is pending if the
            # workflow is active, otherwise it depends on the associated
            # backend.
            if is_active:
                state = mstate.MODULE_PENDING
            elif is_error:
                state = mstate.MODULE_CANCELED
            else:
                state = self.backend.next_task_state()
            workflow = branch.append_workflow(
                modules=modules,
                action=wf.ACTION_APPEND,
                command=command,
                pending_modules=[
                    ModuleHandle(
                        state=state,
                        command=command,
                        external_form=external_form
                    )
                ]
            )
            if not is_active and state != mstate.MODULE_CANCELED:
                self.execute_module(
                    project_id=project_id,
                    branch_id=branch_id,
                    module=workflow.modules[-1],
                    datasets=datasets
                )
        return workflow.modules[-1]
def create_dataset(
        self, identifier=None, columns=None, rows=None, column_counter=None,
        row_counter=None, annotations=None):
    """Create a new dataset in the data store for the given data.

    Raises ValueError if (1) any of the column or row identifier have a
    negative value, or (2) if the given column or row counter have value
    lower or equal to any of the column or row identifier.

    Parameters
    ----------
    identifier: string, optional
        Unique dataset identifier
    columns: list(vizier.datastore.base.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.base.DatasetRow)
        List of dataset rows.
    column_counter: int, optional
        Counter to generate unique column identifier
    row_counter: int, optional
        Counter to generate unique row identifier
    annotations: vizier.datastore.metadata.DatasetMetadata, optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.fs.FileSystemDatasetHandle
    """
    # Set columns and rows if not given
    if columns is None:
        columns = list()
    if rows is None:
        rows = list()
    else:
        # Validate the number of values in the given rows
        validate_schema(columns, rows)
    # Validate that all column identifier are smaller than the given
    # column counter
    if column_counter is not None:
        for col in columns:
            if col.identifier >= column_counter:
                raise ValueError('invalid column counter')
    else:
        # Set column counter to max. column identifier + 1
        column_counter = -1
        for col in columns:
            if col.identifier > column_counter:
                column_counter = col.identifier
        column_counter += 1
    # Validate that all row ids are non-negative, unique, and lower than
    # the given row_counter
    max_rowid = -1
    row_ids = set()
    for row in rows:
        if row.identifier < 0:
            raise ValueError(
                'invalid row identifier \'' + str(row.identifier) + '\'')
        elif row_counter is not None and row.identifier >= row_counter:
            raise ValueError('invalid row counter')
        elif row.identifier in row_ids:
            raise ValueError(
                'duplicate row identifier \'' + str(row.identifier) + '\'')
        row_ids.add(row.identifier)
        if row_counter is None and row.identifier > max_rowid:
            max_rowid = row.identifier
    if row_counter is None:
        row_counter = max_rowid + 1
    # Get new identifier (if not given) and create directory for new
    # dataset
    if identifier is None:
        identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    datafile = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(datafile).write(rows)
    # Create dataset and write dataset file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        row_count=len(rows),
        datafile=datafile,
        column_counter=column_counter,
        row_counter=row_counter,
        annotations=annotations
    )
    dataset.to_file(os.path.join(dataset_dir, HANDLE_FILE))
    # Write metadata file
    dataset.annotations.to_file(os.path.join(dataset_dir, METADATA_FILE))
    # Return handle for new dataset
    return dataset
def register_dataset(
        self, table_name, columns, row_ids, column_counter=None,
        row_counter=None, annotations=None, update_rows=False):
    """Create a new record for a database table or view. Note that this
    method does not actually create the table or view in the database but
    adds the dataset's metadata to the data store. The table or view will
    have been created by a load command or be the result from executing a
    lens or a VizUAL command.

    Parameters
    ----------
    table_name: string
        Name of relational database table or view containing the dataset.
    columns: list(vizier.datastore.mimir.MimirDatasetColumn)
        List of column names in the dataset schema and their corresponding
        names in the relational database table or view.
    row_ids: list(int)
        List of row ids. Determines the order of rows in the dataset
    column_counter: int
        Counter for unique column ids
    row_counter: int
        Counter for unique row ids
    annotations: vizier.datastore.metadata.DatasetMetadata
        Annotations for dataset components
    update_rows: bool, optional
        Flag indicating that the number of rows may have changed and the
        list of row identifier therefore needs to be checked.

    Returns
    -------
    vizier.datastore.mimir.MimirDatasetHandle
    """
    # Depending on whether we need to update row ids we either query the
    # database or just get the schema. In either case mimir_schema will
    # contain the returned Mimir schema information.
    sql = get_select_query(table_name, columns=columns)
    mimir_schema = json.loads(mimir._mimir.getSchema(sql))
    if update_rows:
        sql = get_select_query(table_name)
        rs = json.loads(
            mimir._mimir.vistrailsQueryMimirJson(sql, False, False))
        # Get list of row identifier in current dataset. Row ID's are
        # expected to be the only values in the returned result set.
        dataset_row_ids = set()
        for row in rs['data']:
            dataset_row_ids.add(int(row[0]))
        modified_row_ids = list()
        # Remove row id's that are no longer in the data.
        for row_id in row_ids:
            if row_id in dataset_row_ids:
                modified_row_ids.append(row_id)
        # Add new row ids
        for row_id in dataset_row_ids:
            if row_id not in modified_row_ids:
                modified_row_ids.append(row_id)
        # Replace row ids with modified list
        row_ids = modified_row_ids
    # Create a mapping of column name (in database) to column type. This
    # mapping is then used to update the data type information for all
    # column descriptors.
    col_types = dict()
    for col in mimir_schema:
        col_types[col['name']] = col['base_type']
    for col in columns:
        col.data_type = col_types[col.name_in_rdb]
    # Create column for row identifier
    rowid_column = MimirDatasetColumn(
        name_in_dataset=ROW_ID, data_type=col_types[ROW_ID])
    # Set column counter to max column id + 1 if None
    if column_counter is None:
        column_counter = max_column_id(columns) + 1
    # Set row counter to max. row id + 1 if None
    if row_counter is None:
        sql = 'SELECT COUNT(*) AS RECCNT FROM ' + table_name
        rs = json.loads(
            mimir._mimir.vistrailsQueryMimirJson(sql, False, False))
        row_counter = int(rs['data'][0][0])
    dataset = MimirDatasetHandle(
        identifier=get_unique_identifier(),
        columns=map(lambda cn: self.bad_col_names.get(cn, cn), columns),
        rowid_column=rowid_column,
        table_name=table_name,
        row_ids=row_ids,
        column_counter=column_counter,
        row_counter=row_counter,
        annotations=annotations
    )
    # Create a new directory for the dataset if it doesn't exist.
    dataset_dir = self.get_dataset_dir(dataset.identifier)
    if not os.path.isdir(dataset_dir):
        os.makedirs(dataset_dir)
    # Write dataset and annotation file to disk
    dataset.to_file(self.get_dataset_file(dataset.identifier))
    dataset.annotations.to_file(
        self.get_metadata_filename(dataset.identifier))
    return dataset
def create_dataset(
        self, columns: List[DatasetColumn], rows: List[DatasetRow],
        properties: Optional[Dict[str, Any]] = None,
        human_readable_name: str = "Untitled Dataset",
        backend_options: Optional[List[Tuple[str, str]]] = None,
        dependencies: Optional[List[str]] = None) -> DatasetDescriptor:
    """Create a new dataset in the datastore. Expects at least the list of
    columns and the rows for the dataset.

    Raises ValueError if (1) the column identifier are not unique, (2) the
    row identifier are not unique, (3) the number of columns and values in
    a row do not match, (4) any of the column or row identifier have a
    negative value, or (5) if the given column or row counter have value
    lower or equal to any of the column or row identifier.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    properties: dict(string, ANY), optional
        Properties for dataset components

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    properties = {} if properties is None else properties
    dependencies = [] if dependencies is None else dependencies
    # Assign fresh identifiers to rows that do not have a valid one yet.
    # New identifiers start at the largest existing row identifier.
    identifiers = set(
        int(row.identifier) for row in rows
        if row.identifier is not None and int(row.identifier) >= 0
    )
    identifiers.add(0)
    max_row_id = max(identifiers)
    rows = [
        DatasetRow(
            identifier=row.identifier
            if row.identifier is not None and int(row.identifier) >= 0
            else str(idx + max_row_id),
            values=row.values,
            caveats=row.caveats
        ) for idx, row in enumerate(rows)
    ]
    # Validate (i) that each column has a unique identifier, (ii) each row
    # has a unique identifier, and (iii) that every row has exactly one
    # value per column.
    _, max_row_id = validate_dataset(columns=columns, rows=rows)
    # Get new identifier and create directory for new dataset
    identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    data_file = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(data_file).write(rows)
    # Create dataset and write dataset file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        data_file=data_file,
        row_count=len(rows),
        max_row_id=max_row_id,
        properties=properties
    )
    dataset.to_file(
        descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
    # Write properties to metadata file
    dataset.write_properties_to_file(
        self.get_properties_filename(identifier))
    # Return handle for new dataset
    return DatasetDescriptor(
        identifier=dataset.identifier,
        name=human_readable_name,
        columns=dataset.columns
    )
def create_branch(
        self, viztrail_id, source_branch=DEFAULT_BRANCH, workflow_version=-1,
        properties=None, module_id=-1):
    """Create a new workflow branch in a given viztrail. The new branch is
    created from the specified workflow in the source branch starting at
    module module_id. If module_id is negative the new branch starts after
    the last module of the source branch head workflow.

    Returns the handle for the new branch or None if the given viztrail
    does not exist. Raises ValueError if (1) the source branch does not
    exist, (2) no module with the specified identifier exists, or (3) an
    attempt is made to branch from an empty workflow.

    Parameters
    ----------
    viztrail_id : string
        Unique viztrail identifier
    source_branch : string, optional
        Unique branch identifier for existing branch
    workflow_version: int, optional
        Version number of the workflow that is being modified. If negative
        the branch head is being used.
    properties: dict, optional
        Set of properties for the new branch
    module_id: int, optional
        Start branch from module with given identifier in source_branch.
        The new branch starts at the end of the source branch if module_id
        has a negative value.

    Returns
    -------
    vizier.workflow.base.ViztrailBranch
    """
    # Get viztrail. Return None if the viztrail does not exist
    if viztrail_id not in self.cache:
        return None
    viztrail = self.cache[viztrail_id]
    # Raise exception if source branch does not exist
    if source_branch not in viztrail.branches:
        raise ValueError('unknown branch \'' + source_branch + '\'')
    # Get the referenced workflow. Raise exception if the workflow does
    # not exist or is empty
    workflow = viztrail.get_workflow(source_branch, workflow_version)
    if workflow is None:
        raise ValueError('unknown workflow')
    if len(workflow.modules) == 0:
        raise ValueError('attempt to branch from empty workflow')
    # Copy list of workflow modules depending on value of module_id
    if module_id < 0:
        modules = workflow.modules
    else:
        modules = []
        found = False
        for m in workflow.modules:
            modules.append(m)
            if m.identifier == module_id:
                found = True
                break
        if not found:
            raise ValueError('unknown module \'' + str(module_id) + '\'')
    # Make a copy of the source workflow for the branch
    result = viztrail.engine.copy_workflow(
        viztrail.version_counter.inc(), modules)
    # Create file for new workflow
    created_at = viztrail.write_workflow(result)
    # Create new branch handle
    target_branch = get_unique_identifier()
    # Store provenance information for new branch in file
    prov_file = branch_prov_file(viztrail.fs_dir, target_branch)
    FileSystemBranchProvenance.to_file(
        prov_file,
        source_branch,
        workflow.version,
        result.modules[-1].identifier
    )
    branch = ViztrailBranch(
        target_branch,
        FilePropertiesHandler(
            branch_file(viztrail.fs_dir, target_branch), properties),
        FileSystemBranchProvenance(prov_file),
        workflows=[
            WorkflowVersionDescriptor(
                result.version,
                action=ACTION_CREATE,
                package_id=PACKAGE_SYS,
                command_id=SYS_CREATE_BRANCH,
                created_at=created_at
            )
        ]
    )
    # Update the viztrail on disk
    viztrail.branches[target_branch] = branch
    viztrail.to_file()
    return branch