Example 1
    def upload_file(self, filename):
        """Create a new entry from a given local file. Will make a copy of the
        given file.

        Raises ValueError if the given file does not exist.

        Parameters
        ----------
        filename: string
            Path to file on disk

        Returns
        -------
        vizier.filestore.base.FileHandle
        """
        # Ensure that the given file exists
        if not os.path.isfile(filename):
            raise ValueError('invalid file path \'' + str(filename) + '\'')
        name = os.path.basename(filename)
        # Create a new unique identifier for the file.
        identifier = get_unique_identifier()
        file_dir = self.get_file_dir(identifier, create=True)
        output_file = os.path.join(file_dir, DATA_FILENAME)
        # Copy the uploaded file
        shutil.copyfile(filename, output_file)
        # Add file to file index
        f_handle = FileHandle(
            identifier,
            filepath=output_file,
            file_name=name
        )
        # Write metadata file
        write_metadata_file(file_dir, f_handle)
        return f_handle
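For orientation, a minimal self-contained sketch of the same copy-and-register pattern. It assumes the unique identifier is a UUID hex string and that the metadata file is plain JSON; both are assumptions, and the real get_unique_identifier and write_metadata_file may work differently.

import json
import os
import shutil
import uuid


def upload_copy(filename, base_dir):
    """Sketch: copy a local file into a per-identifier directory and record
    minimal metadata next to it."""
    if not os.path.isfile(filename):
        raise ValueError('invalid file path \'' + str(filename) + '\'')
    identifier = uuid.uuid4().hex  # stand-in for get_unique_identifier()
    file_dir = os.path.join(base_dir, identifier)
    os.makedirs(file_dir, exist_ok=True)
    output_file = os.path.join(file_dir, 'data')
    shutil.copyfile(filename, output_file)
    # Hypothetical JSON metadata layout; write_metadata_file may differ
    with open(os.path.join(file_dir, 'metadata.json'), 'w') as f:
        json.dump({'identifier': identifier,
                   'fileName': os.path.basename(filename)}, f)
    return identifier, output_file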
Example 2
    def create_viztrail(self, env_id, properties):
        """Create a new viztrail.

        Raises ValueError if the given execution environment is unknown.

        Parameters
        ----------
        env_id: string
            Identifier for workflow execution environment that is used for the
            new viztrail
        properties: dict
            Set of properties for the new viztrail

        Returns
        -------
        vizier.workflow.base.ViztrailHandle
        """
        if env_id not in self.envs:
            raise ValueError('unknown execution environment \'' + env_id +
                             '\'')
        # Get unique viztrail identifier
        identifier = get_unique_identifier()
        # Create viztrail directory
        fs_dir = os.path.join(self.base_dir, identifier)
        os.makedirs(fs_dir)
        # Create new viztrail and add to cache
        viztrail = FileSystemViztrailHandle.create_viztrail(
            fs_dir, identifier, self.envs[env_id], properties=properties)
        self.cache[viztrail.identifier] = viztrail
        return viztrail
Example 3
def upload_file():
    """Upload CSV file (POST) - Upload a CSV or TSV file containing a full
    dataset.
    """
    # The upload request may contain a file object or a URL from which to
    # download the data.
    if request.files and 'file' in request.files:
        file = request.files['file']
        # A browser may submit an empty part without a filename
        if file.filename == '':
            raise InvalidRequest('empty file name')
        # Save uploaded file to temp directory
        identifier = get_unique_identifier()
        upload_file = api.fileserver.get_filepath(identifier)
        file.save(upload_file)
        prov = {'filename': file.filename}
    elif request.json and 'url' in request.json:
        obj = validate_json_request(request, required=['url'])
        url = obj['url']
        # Save uploaded file to temp directory
        upload_file = url
        prov = {'url': url}
    else:
        raise InvalidRequest('no file or url specified in request')
    try:
        result = jsonify(api.upload_file(upload_file, provenance=prov)), 201
        return result
    except ValueError as ex:
        raise InvalidRequest(str(ex))
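For completeness, a hypothetical client-side call against such an upload endpoint using the requests package. The host and the /files/upload route are assumptions and not taken from the example.

import requests

# Upload a local CSV file as the multipart form field 'file'
with open('dataset.csv', 'rb') as fh:
    resp = requests.post('http://localhost:5000/files/upload',
                         files={'file': fh})
resp.raise_for_status()

# Alternatively, ask the server to fetch the data from a URL
resp = requests.post('http://localhost:5000/files/upload',
                     json={'url': 'http://example.com/data.csv'})
print(resp.status_code, resp.json())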
Example 4
    def upload_stream(self, file, file_name):
        """Create a new entry from a given file stream. Will copy the given
        file to a file in the base directory.

        Parameters
        ----------
        file: werkzeug.datastructures.FileStorage
            File object (e.g., uploaded via HTTP request)
        file_name: string
            Name of the file

        Returns
        -------
        vizier.filestore.base.FileHandle
        """
        # Create a new unique identifier for the file.
        identifier = get_unique_identifier()
        file_dir = self.get_file_dir(identifier, create=True)
        output_file = os.path.join(file_dir, DATA_FILENAME)
        # Save the file object to the new file path
        file.save(output_file)
        f_handle = FileHandle(
            identifier,
            filepath=output_file,
            file_name=file_name
        )
        # Write metadata file
        write_metadata_file(file_dir, f_handle)
        return f_handle
Example 5
    def create_object(self, value, obj_type="text/plain"):
        """Update the annotations for a component of the datasets with the given
        identifier. Returns the updated annotations or None if the dataset
        does not exist.

        The distinction between old value and new value is necessary since
        annotations have no unique identifier. We use the key,value pair to
        identify an existing annotation for update. When creating a new
        annotation th old value is None.

        Parameters
        ----------
        key: string, optional
            object key
        old_value: string, optional
            Previous value when updating an existing annotation.
        new_value: string, optional
            Updated value
        Returns
        -------
        identifier
        """
        return mimir.createBlob(identifier="{}".format(
            get_unique_identifier()),
                                blob_type=obj_type,
                                data=value)
Example 6
    def from_file(f_handle):
        """Read dataset from file. Expects the file to be in Json format which
        is the default serialization format used by to_file().

        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle
            Handle for an uploaded file on a file server

        Returns
        -------
        vizier.datastore.base.Dataset
        """
        # Expects a CSV/TSV file. The first row contains the column names.
        if not f_handle.is_verified_csv:
            raise ValueError(
                'failed to create dataset from file \'' + f_handle.name + '\''
            )
        # Read all information and return an InMemDatasetHandle
        columns = []
        rows = []
        with f_handle.open() as csvfile:
            reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
            for col_name in next(reader):
                columns.append(DatasetColumn(len(columns), col_name.strip()))
            for row in reader:
                values = [cast(v.strip()) for v in row]
                rows.append(DatasetRow(len(rows), values))
        # Return InMemDatasetHandle
        return InMemDatasetHandle(
            identifier=get_unique_identifier(),
            columns=columns,
            rows=rows,
            column_counter=len(columns),
            row_counter=len(rows)
        )
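The header-then-rows reading pattern in isolation, as a sketch that uses plain tuples instead of DatasetColumn and DatasetRow (the real classes carry identifiers and additional behaviour):

import csv


def read_tabular(path, delimiter=','):
    """Sketch: first row is the header, remaining rows are data."""
    columns = []
    rows = []
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        for col_name in next(reader):
            columns.append((len(columns), col_name.strip()))
        for row in reader:
            rows.append((len(rows), [v.strip() for v in row]))
    return columns, rows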
Example 7
    def download_file(self, url, username=None, password=None):
        """Create a local copy of the identified web resource.

        Parameters
        ----------
        url : string
            Unique resource identifier for external resource that is accessed
        username: string, optional
            Optional user name for authentication
        password: string, optional
            Optional password for authentication

        Returns
        -------
        vizier.filestore.base.FileHandle
        """
        # Get unique identifier and output file
        identifier = get_unique_identifier()
        file_dir = self.get_file_dir(identifier, create=True)
        output_file = os.path.join(file_dir, DATA_FILENAME)
        # Write web resource to output file.
        response = urllib.request.urlopen(url)
        filename = get_download_filename(url, response.info())
        # response.read() returns bytes, so always write in binary mode
        with open(output_file, 'wb') as f:
            f.write(response.read())
        # Add file to file index
        f_handle = FileHandle(identifier,
                              filepath=output_file,
                              file_name=filename)
        # Write metadata file
        write_metadata_file(file_dir, f_handle)
        return f_handle
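A stripped-down sketch of the download step. It always writes bytes and falls back to the URL path for the file name; the real get_download_filename presumably also inspects the response headers.

import os
import urllib.request
from urllib.parse import urlparse


def download_to(url, output_file):
    """Sketch: save a web resource to a local file and return a file name."""
    response = urllib.request.urlopen(url)
    # urlopen().read() returns bytes, so the target is opened in binary mode
    with open(output_file, 'wb') as f:
        f.write(response.read())
    return os.path.basename(urlparse(url).path) or 'download'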
Example 8
    def create_object(self, value, obj_type="text/plain"):
        """Update the annotations for a component of the datasets with the given
        identifier. Returns the updated annotations or None if the dataset
        does not exist.

        The distinction between old value and new value is necessary since
        annotations have no unique identifier. We use the key,value pair to
        identify an existing annotation for update. When creating a new
        annotation th old value is None.

        Parameters
        ----------
        value: bytes
            The value of the object
        obj_type: string, optional
            The type of the object
        Returns
        -------
        identifier
        """
        data_object_filename = None
        identifier = None
        while data_object_filename is None:
            identifier = "OBJ_" + get_unique_identifier()
            data_object_filename = self.get_data_object_file(identifier)
            if os.path.exists(data_object_filename):
                data_object_filename = None

        with open(data_object_filename, "wb") as f:
            f.write(value)
        with open(data_object_filename + ".mime", "w") as f:
            f.write(obj_type)

        return identifier
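A matching read-back sketch for the data file plus .mime sidecar layout written above; the path is whatever get_data_object_file returned when the object was created.

import os


def read_object(data_object_filename):
    """Sketch: return (value, mime_type) for an object written by create_object."""
    with open(data_object_filename, 'rb') as f:
        value = f.read()
    obj_type = 'text/plain'
    mime_path = data_object_filename + '.mime'
    if os.path.exists(mime_path):
        with open(mime_path, 'r') as f:
            obj_type = f.read()
    return value, obj_type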
Example 9
    def create_dataset(
        self, identifier=None, columns=None, rows=None, column_counter=None,
        row_counter=None, annotations=None
    ):
        """Create a new dataset in the data store for the given data.

        Raises ValueError if (1) any of the column or row identifiers have a
        negative value, or (2) if the given column or row counter have a value
        lower or equal to any of the column or row identifiers.

        Parameters
        ----------
        identifier: string, optional
            Unique dataset identifier
        columns: list(vizier.datastore.base.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.base.DatasetRow)
            List of dataset rows.
        column_counter: int, optional
            Counter to generate unique column identifier
        row_counter: int, optional
            Counter to generate unique row identifier
        annotations: vizier.datastore.metadata.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.mem.InMemDatasetHandle
        """
        # Set columns and rows if not given
        if columns is None:
            columns = list()
        if rows is None:
            rows = list()
        else:
            # Validate the number of values in the given rows. Raises
            # ValueError in case of schema violations
            validate_schema(columns, rows)
        if identifier is None:
            identifier = get_unique_identifier()
        if column_counter is None:
            column_counter = max_column_id(columns) + 1
        if row_counter is None:
            # Use largest existing row id + 1, mirroring the column counter
            row_counter = max_row_id(rows) + 1
        # Make sure annotations is not None
        if annotations is None:
            annotations = DatasetMetadata()
        self.datasets[identifier] = InMemDatasetHandle(
            identifier=identifier,
            columns=list(columns),
            rows=list(rows),
            column_counter=column_counter,
            row_counter=row_counter,
            annotations=annotations.copy_metadata()
        )
        return self.datasets[identifier]
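The max_column_id and max_row_id helpers used above are not shown in this listing. A plausible sketch (an assumption; the real helpers may differ) that returns -1 for empty input so that the max + 1 counters start at zero:

def max_column_id(columns):
    # Largest column identifier in the list, or -1 if the list is empty
    return max((col.identifier for col in columns), default=-1)


def max_row_id(rows):
    # Largest row identifier in the list, or -1 if the list is empty
    return max((row.identifier for row in rows), default=-1)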
Example 10
    def create_dataset(self, columns, rows, annotations=None):
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Raises ValueError if (1) the column identifiers are not unique, (2) the
        row identifiers are not unique, (3) the number of columns and values in
        a row do not match, (4) any of the column or row identifiers have a
        negative value, or (5) if the given column or row counter have a value
        lower or equal to any of the column or row identifiers.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Validate (i) that each column has a unique identifier, (ii) each row
        # has a unique identifier, and (iii) that every row has exactly one
        # value per column.
        _, max_row_id = validate_dataset(columns=columns, rows=rows)
        # Get new identifier and create directory for new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Filter annotations for non-existing resources
        if annotations is not None:
            annotations = annotations.filter(
                columns=[c.identifier for c in columns],
                rows=[r.identifier for r in rows])
        # Create the dataset and write the dataset file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=max_row_id,
                                          annotations=annotations)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        # Write metadata file if annotations are given
        if annotations is not None:
            dataset.annotations.to_file(self.get_metadata_filename(identifier))
        # Return handle for new dataset
        return DatasetDescriptor(identifier=dataset.identifier,
                                 columns=dataset.columns,
                                 row_count=dataset.row_count)
Example 11
    def load_dataset(
        self,
        f_handle: FileHandle,
        proposed_schema: List[Tuple[str,
                                    str]] = []) -> FileSystemDatasetHandle:
        """Create a new dataset from a given file.

        Raises ValueError if the given file could not be loaded as a dataset.

        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle
            Handle for an uploaded file
        proposed_schema: list(tuple(string, string)), optional
            Proposed (column name, data type) pairs for the loaded dataset

        Returns
        -------
        vizier.datastore.fs.dataset.FileSystemDatasetHandle
        """
        # The file handle might be None in which case an exception is raised
        if f_handle is None:
            raise ValueError('unknown file')
        # Expects a file in a supported tabular data format.
        if not f_handle.is_tabular:
            raise ValueError('cannot create dataset from file \'' +
                             f_handle.name + '\'')
        # Open the file as a csv file. Expects that the first row contains the
        # column names. Read dataset schema and dataset rows into two separate
        # lists.
        columns: List[DatasetColumn] = []
        rows: List[DatasetRow] = []
        with f_handle.open() as csvfile:
            reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
            for col_name in next(reader):
                columns.append(
                    DatasetColumn(identifier=len(columns),
                                  name=col_name.strip()))
            for row in reader:
                values = [cast(v.strip()) for v in row]
                rows.append(
                    DatasetRow(identifier=str(len(rows)), values=values))
        # Get unique identifier and create subfolder for the new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Create the dataset and write the descriptor to file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=len(rows) - 1)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        return dataset
Example 12
def get_tempfile():
    """Return the path to a temporary CSV file. Try to get a unique name to
    avoid problems with existing datasets.

    Returns
    -------
    string
    """
    tmp_prefix = 'DS_' + get_unique_identifier()
    # mkstemp returns an open file descriptor and a path; close the descriptor
    # so that only the path is handed back to the caller
    fd, path = tempfile.mkstemp(suffix='.csv', prefix=tmp_prefix)
    os.close(fd)
    return path
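Typical use of the helper, making sure the temporary file is removed afterwards (a sketch building on the function above):

import csv
import os

tmp_file = get_tempfile()
try:
    with open(tmp_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['A', 'B'])
        writer.writerow([1, 2])
    # ... hand tmp_file to the loader here ...
finally:
    os.remove(tmp_file)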
Example 13
    def create_dataset(
            self,
            columns: List[DatasetColumn],
            rows: List[DatasetRow],
            properties: Dict[str, Any] = None,
            human_readable_name: str = "Untitled Dataset",
            backend_options: Optional[List[Tuple[str, str]]] = None,
            dependencies: Optional[List[str]] = None) -> MimirDatasetHandle:
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        properties: dict(string, any), optional
            Annotations for dataset components
        human_readable_name: string, optional
            Optional human readable name for the resulting table

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Get unique identifier for new dataset
        properties = {} if properties is None else properties
        backend_options = [] if backend_options is None else backend_options
        dependencies = [] if dependencies is None else dependencies
        identifier = 'DS_' + get_unique_identifier()
        columns = [
            col if isinstance(col, MimirDatasetColumn) else MimirDatasetColumn(
                identifier=col.identifier,
                name_in_dataset=col.name,
                data_type=col.data_type) for col in columns
        ]

        table_name, schema = mimir.loadDataInline(
            schema=[{
                "name": base.sanitize_column_name(col.name),
                "type": col.data_type
            } for col in columns],
            rows=[row.values for row in rows],
            result_name=identifier,
            human_readable_name=human_readable_name,
            dependencies=dependencies,
            properties=properties)

        # Insert the new dataset metadata information into the datastore
        return MimirDatasetHandle.from_mimir_result(table_name=table_name,
                                                    schema=schema,
                                                    properties=properties,
                                                    name=human_readable_name)
Example 14
    def unload_dataset(self,
                       dataset_name,
                       format='csv',
                       options=[],
                       filename=""):
        """Export a dataset from a given name.

        Raises ValueError if the given dataset could not be exported.

        Parameters
        ----------
        dataset_name: string
            Name of the dataset to unload
        format: string
            Format for output (csv, json, etc.)
        options: list
            Options for data unload
        filename: string
            The output filename - may be empty if outputting to a database

        Returns
        -------
        vizier.filestore.base.FileHandle
        """
        name = os.path.basename(filename).lower()
        # Create a new unique identifier for the file.
        identifier = get_unique_identifier()

        abspath = ""
        if not filename == "":
            abspath = os.path.abspath((r'%s' %
                                       os.getcwd().replace('\\', '/')) + '/' +
                                      identifier)
        mimir._mimir.unloadDataSource(dataset_name, abspath, format,
                                      mimir._jvmhelper.to_scala_seq(options))

        created_at = get_current_time()
        output_file = abspath
        # Add file to file index
        f_handle = FileHandle(identifier,
                              name,
                              output_file,
                              created_at,
                              properties=dict())
        return f_handle
Example 15
    def unload_dataset(self,
                       dataset: DatasetDescriptor,
                       datastore: Datastore,
                       filestore: Filestore,
                       unload_format: str = 'csv',
                       options: List[Dict[str, Any]] = [],
                       resources: Dict[str, Any] = None):
        """Export (or unload) a dataset to a given file format. 

        The resources refer to any resources (e.g., file identifiers) that have
        been generated by a previous execution of the respective task. This
        allows associating an identifier with a downloaded file to avoid future
        downloads (unless the reload flag is True).

        Parameters
        ----------
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets
        filestore: vizier.filestore.Filestore
            Filestore to retrieve uploaded datasets
        unload_format: string, optional
            Format identifier
        options: list, optional
            Additional options for Mimir's load command
        resources: dict, optional
            Dictionary of additional resources (i.e., key,value pairs) that were
            generated during a previous execution of the associated module
        
        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        f_handles = None
        result_resources = dict()

        assert (isinstance(datastore, MimirDatastore))
        assert (isinstance(filestore, FileSystemFilestore))

        if dataset is not None:
            f_handles = datastore.unload_dataset(
                filepath=filestore.get_file_dir(get_unique_identifier()),
                dataset_name=dataset.identifier,
                format=unload_format,
                options=options)
        result_resources[base.RESOURCE_FILEID] = f_handles
        return VizualApiResult(dataset=dataset, resources=result_resources)
Example 16
    def unload_dataset(self,
                       filepath,
                       dataset_name,
                       format='csv',
                       options=[],
                       filename=""):
        """Export a dataset from a given name.
        Raises ValueError if the given dataset could not be exported.
        Parameters
        ----------
        dataset_name: string
            Name of the dataset to unload
            
        format: string
            Format for output (csv, json, ect.)
            
        options: dict
            Options for data unload
            
        filename: string
            The output filename - may be empty if outputting to a database
        Returns
        -------
        vizier.filestore.base.FileHandle
        """
        name = os.path.basename(filepath).lower()
        basepath = filepath.replace(name, "")

        # Create a new unique identifier for the file.

        abspath = os.path.abspath((r'%s' % filepath))
        exported_files = mimir.unloadDataSource(dataset_name, abspath, format,
                                                options)
        file_handles = []
        for output_file in exported_files:
            name = os.path.basename(output_file).lower()
            identifier = get_unique_identifier()
            file_dir = os.path.join(basepath, identifier)
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
            fs_output_file = os.path.join(file_dir, DATA_FILENAME)
            shutil.move(os.path.join(filepath, output_file), fs_output_file)
            f_handle = FileHandle(identifier, output_file, name)
            file_handles.append(f_handle)
            write_metadata_file(file_dir, f_handle)
        return file_handles
Example 17
    def __init__(self, project_id, branch_id, module_id, controller):
        """Initialize the components of the extended task handle. Generates a
        unique identifier for the task.

        Parameters
        ----------
        project_id: string
            Unique project identifier
        branch_id: string
            Unique branch identifier
        module_id: string
            Unique module identifier
        controller: vizier.engine.base.VizierEngine
            Reference to the vizier engine
        """
        super(ExtendedTaskHandle,
              self).__init__(task_id=get_unique_identifier(),
                             project_id=project_id,
                             controller=controller)
        self.branch_id = branch_id
        self.module_id = module_id
Example 18
    def __init__(self,
                 dataset_name,
                 identifier=None,
                 chart_name=None,
                 data=None,
                 x_axis=None,
                 chart_type=None,
                 grouped_chart=True):
        """Initialize the view handle.

        Parameters
        ----------
        identifier: string
            Unique view identifier
        dataset_name: string
            Name used to reference a dataset in the curation workflow within
            which this view is defined
        chart_name: string
            Unique chart name for reference
        data: list(vizier.plot.view.DataSeriesHandle), optional
            List of data series handles defining the data series in the chart
        x_axis: int, optional
            Optional index of the data series that is used for x-axis labels
        chart_type: string, optional
            Type of chart that is being displayed
        grouped_chart: bool, optional
            Flag indicating whether data series are grouped into single chart
        """
        self.dataset_name = dataset_name
        self.identifier = (identifier if identifier is not None
                           else get_unique_identifier())
        self.chart_name = chart_name if chart_name is not None else 'Chart'
        self.data = data if data is not None else list()
        self.x_axis = x_axis
        self.chart_type = chart_type if chart_type is not None else 'Bar Chart'
        self.grouped_chart = grouped_chart
Example 19
    def create_dataset(self,
                       identifier=None,
                       columns=list(),
                       rows=list(),
                       column_counter=0,
                       row_counter=0,
                       annotations=None):
        """Create a new dataset in the data store for the given data.

        Raises ValueError if the number of values in each row of the dataset
        doesn't match the number of columns in the dataset schema.

        Parameters
        ----------
        columns: list(vizier.datastore.base.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.base.DatasetRow)
            List of dataset rows.
        annotations: vizier.datastore.metadata.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.mimir.MimirDatasetHandle
        """
        # Get unique identifier for new dataset
        identifier = 'DS_' + get_unique_identifier()
        # Write rows to temporary file in CSV format
        tmp_file = os.path.abspath(self.base_dir + '/../../filestore/files/' +
                                   identifier)
        # Create a list of columns that contain the user-visible column name and
        # the name in the database
        db_columns = list()
        colSql = 'ROWID() AS ' + ROW_ID
        for col in map(lambda cn: self.bad_col_names.get(cn, cn), columns):
            db_columns.append(
                MimirDatasetColumn(
                    identifier=col.identifier,
                    name_in_dataset=col.name,
                    name_in_rdb=col.name  #COL_PREFIX + str(len(db_columns))
                ))
            colSql = colSql + ', ' + col.name + ' AS ' + col.name
        # Create CSV file for load
        with open(tmp_file, 'w') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_MINIMAL)
            writer.writerow([col.name_in_rdb for col in db_columns])
            for row in rows:
                record = encode_values(row.values)
                writer.writerow(record)
        # Load CSV file using Mimir's loadCSV method.
        table_name = mimir._mimir.loadCSV(tmp_file, ',', True, True)

        sql = 'SELECT ' + colSql + ' FROM {{input}}'
        view_name = mimir._mimir.createView(table_name, sql)
        #sql = 'SELECT '+ROW_ID+' FROM ' + view_name
        #rs = json.loads(mimir._mimir.vistrailsQueryMimirJson(sql, False, False))
        # List of row ids in the new dataset
        #row_ids = rs['prov'] #range(len(rs['prov']))

        sql = 'SELECT COUNT(*) AS RECCNT FROM ' + view_name
        rs_count = json.loads(
            mimir._mimir.vistrailsQueryMimirJson(sql, False, False))

        row_count = int(rs_count['data'][0][0])

        sql = 'SELECT * FROM ' + view_name + ' LIMIT ' + str(
            config.DEFAULT_MAX_ROW_LIMIT)
        rs = json.loads(mimir._mimir.vistrailsQueryMimirJson(
            sql, False, False))

        row_ids = rs['prov']

        # Insert the new dataset metadata information into the datastore
        return self.register_dataset(table_name=view_name,
                                     columns=db_columns,
                                     row_ids=row_ids,
                                     row_counter=row_count,
                                     annotations=annotations)
Example 20
def load_dataset(filename):
    """Create a table in Mimir from the given file.

    Parameters
    ----------
    filename: string
        Path to the file
    """
    # Create a copy of the original file under a unique name.
    tmp_file = get_tempfile()
    shutil.copyfile(filename, tmp_file)
    # Load dataset and retrieve the result to get the dataset schema
    init_load_name = mimir._mimir.loadCSV(tmp_file)
    sql = 'SELECT * FROM ' + init_load_name
    rs = mimir._mimir.vistrailsQueryMimir(sql, True, True)
    mimir_schema = rs.schema()
    reader = csv.reader(StringIO(rs.csvStr()),
                        delimiter=',',
                        skipinitialspace=True)
    # Write retrieved result to temp file. Add unique column names and row
    # identifiers
    os.remove(tmp_file)
    tmp_file = get_tempfile()
    # List of Mimir dataset column descriptors for the dataset schema
    columns = list()
    with open(tmp_file, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_NONE)
        # Get dataset schema information from retrieved result
        out_schema = [ROW_ID.upper()]
        for name_in_dataset in next(reader):
            name_in_dataset = name_in_dataset.strip()
            col_id = len(columns)
            name_in_rdb = COL_PREFIX + str(col_id)
            out_schema.append(name_in_rdb)
            columns.append(
                MimirDatasetColumn(
                    col_id,
                    name_in_dataset,
                    name_in_rdb,
                    data_type=str(mimir_schema.get(name_in_dataset))[5:-1]))
        writer.writerow(out_schema)
        # Remaining rows are dataset rows
        row_ids = list()
        for row in reader:
            row_id = len(row_ids)
            row_ids.append(row_id)
            out_row = [str(row_id)]
            for val in row:
                val = val.strip()
                if val.startswith('\'') and val.endswith('\''):
                    val = val[1:-1]
                elif val == 'NULL':
                    val = ''
                out_row.append(val)
            writer.writerow(out_row)
    table_name = mimir._mimir.loadCSV(tmp_file)
    os.remove(tmp_file)
    sql = 'SELECT * FROM ' + table_name
    rs = mimir._mimir.vistrailsQueryMimir(sql, True, True)
    reasons = rs.celReasons()
    uncertainty = rs.colsDet()
    return MimirDatasetHandle(get_unique_identifier(),
                              columns,
                              table_name,
                              row_ids,
                              len(columns),
                              len(row_ids),
                              annotations=get_annotations(
                                  columns, row_ids, reasons, uncertainty))
Example 21
    def load_dataset(
        self,
        f_handle: Optional[FileHandle] = None,
        proposed_schema: List[Tuple[str, str]] = [],
        url: Optional[str] = None,
        detect_headers: bool = True,
        infer_types: bool = True,
        properties: Dict[str, Any] = {},
        load_format: str = 'csv',
        options: List[Dict[str, str]] = [],
        human_readable_name: Optional[str] = None,
    ):
        """Create a new dataset from a given file or url. Expects that either
        the file handle or the url are not None. Raises ValueError if both are
        None or not None.


        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle, optional
            handle for an uploaded file on the associated file server.
        url: string, optional
            Url for the file source
        detect_headers: bool, optional
            Detect column names in loaded file if True
        infer_types: bool, optional
            Infer column types for loaded dataset if True
        load_format: string, optional
            Format identifier
        options: list, optional
            Additional options for Mimir's load command
        human_readable_name: string, optional
            Optional human readable name for the resulting table

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        assert (url is not None or f_handle is not None)
        if f_handle is None and url is None:
            raise ValueError('no load source given')
        elif f_handle is not None and url is not None:
            raise ValueError('too many load sources given')
        elif url is None and f_handle is not None:
            # os.path.abspath((r'%s' % os.getcwd().replace('\\','/') ) + '/' + f_handle.filepath)
            abspath = f_handle.filepath
        elif url is not None:
            abspath = url

        # for ease of debugging, associate each table with a prefix identifying its nature
        prefix = load_format if load_format in SAFE_FORMAT_IDENTIFIER_PREFIXES else "LOADED_"

        # Load dataset into Mimir
        table_name, mimirSchema = mimir.loadDataSource(
            abspath,
            infer_types,
            detect_headers,
            load_format,
            human_readable_name,
            options,
            properties=properties,
            result_name=prefix + get_unique_identifier(),
            proposed_schema=proposed_schema)
        return MimirDatasetHandle.from_mimir_result(table_name, mimirSchema,
                                                    properties,
                                                    human_readable_name)
Example 22
    def compute(self, command_id: str, arguments: "ModuleArguments",
                context: TaskContext) -> ExecResult:
        """Compute results for commands in the sampling package using 
        the set of user-provided arguments and the current database 
        state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """

        input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
        input_dataset: DatasetDescriptor = context.get_dataset(input_ds_name)
        if input_dataset is None:
            raise ValueError('unknown dataset \'' + input_ds_name + '\'')

        output_ds_name = arguments.get_value(cmd.PARA_OUTPUT_DATASET,
                                             raise_error=False)
        if output_ds_name is None or output_ds_name == "":
            output_ds_name = input_ds_name + "_SAMPLE"
        output_ds_name = output_ds_name.lower()

        # Load the sampling configuration
        sample_mode = None

        if command_id == cmd.BASIC_SAMPLE:
            sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
            if sampling_rate > 1.0 or sampling_rate < 0.0:
                raise Exception("Sampling rate must be between 0.0 and 1.0")
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
                "probability": sampling_rate
            }
        elif command_id == cmd.MANUAL_STRATIFIED_SAMPLE or command_id == cmd.AUTOMATIC_STRATIFIED_SAMPLE:
            column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
            column_defn = input_dataset.columns[column]
            if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
                strata = [{
                    "value":
                    stratum.get_value(cmd.PARA_STRATUM_VALUE),
                    "probability":
                    stratum.get_value(cmd.PARA_SAMPLING_RATE)
                } for stratum in arguments.get_value(cmd.PARA_STRATA)]
            else:
                probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
                strata = self.get_automatic_strata(input_dataset, column_defn,
                                                   probability)
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
                "column": column_defn.name,
                "type": column_defn.data_type,
                "strata": strata
            }
        else:
            raise Exception("Unknown sampling command: {}".format(command_id))

        table_name, schema = mimir.createSample(input_dataset.identifier,
                                                sample_mode,
                                                result_name="SAMPLE_" +
                                                get_unique_identifier())
        ds = MimirDatasetHandle.from_mimir_result(table_name,
                                                  schema,
                                                  properties={},
                                                  name=output_ds_name)

        # And start rendering some output
        outputs = ModuleOutputs()
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        if ds_output is not None:
            ds_output['name'] = output_ds_name
            outputs.stdout.append(DatasetOutput(ds_output))
        else:
            outputs.stderr.append(TextOutput("Error displaying dataset"))

        # Record Reads and writes
        provenance = ModuleProvenance(
            read={input_ds_name: input_dataset.identifier},
            write={
                output_ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  name=output_ds_name,
                                  columns=ds.columns)
            })

        # Return task result
        return ExecResult(outputs=outputs, provenance=provenance)
Example 23
    def register_dataset(self,
                         table_name,
                         columns,
                         row_counter=None,
                         annotations=None):
        """Create a new record for a database table or view. Note that this
        method does not actually create the table or view in the database but
        adds the datasets metadata to the data store. The table or view will
        have been created by a load command or be the result from executing
        a lens or a VizUAL command.

        Parameters
        ----------
        table_name: string
            Name of relational database table or view containing the dataset.
        columns: list(vizier.datastore.mimir.MimirDatasetColumn)
            List of column names in the dataset schema and their corresponding
            names in the relational database table or view.
        row_counter: int
            Counter for unique row ids
        annotations: vizier.datastore.metadata.DatasetMetadata
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Query the backend for the schema of the table or view. mimir_schema
        # will contain the returned Mimir schema information.
        sql = base.get_select_query(table_name, columns=columns) + ';'
        mimir_schema = mimir.getSchema(sql)

        # Create a mapping of column name (in database) to column type. This
        # mapping is then used to update the data type information for all
        # column descriptors.
        col_types = dict()
        for col in mimir_schema:
            col_types[base.sanitize_column_name(
                col['name'].upper())] = col['baseType']
        for col in columns:
            col.data_type = col_types[col.name_in_rdb]
        # Set row counter to max. row id + 1 if None
        if row_counter is None:
            row_counter = mimir.countRows(table_name)
        dataset = MimirDatasetHandle(identifier=get_unique_identifier(),
                                     columns=list(
                                         map(base.sanitize_column_name,
                                             columns)),
                                     table_name=table_name,
                                     row_counter=row_counter,
                                     annotations=annotations)
        # Create a new directory for the dataset if it doesn't exist.
        dataset_dir = self.get_dataset_dir(dataset.identifier)
        if not os.path.isdir(dataset_dir):
            os.makedirs(dataset_dir)
        # Write dataset and annotation file to disk
        dataset.to_file(self.get_dataset_file(dataset.identifier))
        dataset.annotations.to_file(
            self.get_metadata_filename(dataset.identifier))
        return dataset
Example 24
    def create_dataset(self,
                       columns,
                       rows,
                       human_readable_name=None,
                       annotations=None,
                       backend_options=[],
                       dependencies=[]):
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Get unique identifier for new dataset
        identifier = 'DS_' + get_unique_identifier()
        # Write rows to temporary file in CSV format
        tmp_file = os.path.abspath(self.base_path + identifier)
        # Create a list of columns that contain the user-visible column name and
        # the name in the database
        db_columns = list()
        colSql = ''
        for col in map(base.sanitize_column_name, columns):
            db_columns.append(
                MimirDatasetColumn(identifier=col.identifier,
                                   name_in_dataset=col.name,
                                   name_in_rdb=col.name))
            if colSql == '':
                colSql = col.name + ' AS ' + col.name
            else:
                colSql = colSql + ', ' + col.name + ' AS ' + col.name
        # Create CSV file for load
        with open(tmp_file, 'w') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_MINIMAL)
            writer.writerow([col.name_in_rdb for col in db_columns])
            for row in rows:
                record = helper.encode_values(row.values)
                writer.writerow(record)
        # Load the CSV file using Mimir's loadDataSource method.
        table_name = mimir.loadDataSource(
            tmp_file,
            True,
            True,
            human_readable_name=human_readable_name,
            backend_options=backend_options,
            dependencies=dependencies)
        os.remove(tmp_file)
        sql = 'SELECT ' + colSql + ' FROM {{input}};'
        view_name, dependencies = mimir.createView(table_name, sql)
        # Get number of rows in the view that was created in the backend
        row_count = mimir.countRows(view_name)

        # Insert the new dataset metadata information into the datastore
        return self.register_dataset(table_name=view_name,
                                     columns=db_columns,
                                     row_counter=row_count,
                                     annotations=annotations)
Example 25
    def append_workflow_module(self, project_id, branch_id, command):
        """Append module to the workflow at the head of the given viztrail
        branch. The modified workflow will be executed. The result is the new
        head of the branch.

        Returns the handle for the new module in the modified workflow. The
        result is None if the specified project or branch do not exist.

        Parameters
        ----------
        project_id: string
            Unique project identifier
        branch_id : string
            Unique branch identifier
        command : vizier.viztrail.command.ModuleCommand
            Specification of the command that is to be executed by the appended
            workflow module

        Returns
        -------
        vizier.viztrail.module.base.ModuleHandle
        """
        with self.backend.lock:
            # Get the handle for the specified branch
            branch = self.projects.get_branch(project_id=project_id,
                                              branch_id=branch_id)
            if branch is None:
                return None
            # Get the current database state from the last module in the current
            # branch head. At the same time we retrieve the list of modules for
            # the current head of the branch.
            head = branch.get_head()
            if head is not None and len(head.modules) > 0:
                datasets = head.modules[-1].datasets
                modules = head.modules
                is_active = head.is_active
                is_error = head.modules[-1].is_error or head.modules[
                    -1].is_canceled
            else:
                datasets = dict()
                modules = list()
                is_active = False
                is_error = False
            # Get the external representation for the command
            external_form = command.to_external_form(
                command=self.packages[command.package_id].get(
                    command.command_id),
                datasets=datasets)
            # If the workflow is not active and the command can be executed
            # synchronously we run the command immediately and return the
            # completed workflow. Otherwise, a pending workflow is created.
            if not is_active and self.backend.can_execute(command):
                ts_start = get_current_time()
                result = self.backend.execute(task=TaskHandle(
                    task_id=get_unique_identifier(),
                    project_id=project_id,
                    controller=self),
                                              command=command,
                                              context=task_context(datasets))
                ts = ModuleTimestamp(created_at=ts_start,
                                     started_at=ts_start,
                                     finished_at=get_current_time())
                # Depending on the execution outcome create a handle for the
                # executed module
                if result.is_success:
                    module = ModuleHandle(
                        state=mstate.MODULE_SUCCESS,
                        command=command,
                        external_form=external_form,
                        timestamp=ts,
                        datasets=result.provenance.get_database_state(
                            modules[-1].datasets if len(modules) > 0 else dict(
                            )),
                        outputs=result.outputs,
                        provenance=result.provenance)
                else:
                    module = ModuleHandle(state=mstate.MODULE_ERROR,
                                          command=command,
                                          external_form=external_form,
                                          timestamp=ts,
                                          outputs=result.outputs)
                workflow = branch.append_workflow(modules=modules,
                                                  action=wf.ACTION_APPEND,
                                                  command=command,
                                                  pending_modules=[module])
            else:
                # Create new workflow by appending one module to the current
                # head of the branch. The module state is pending if the
                # workflow is active otherwise it depends on the associated
                # backend.
                if is_active:
                    state = mstate.MODULE_PENDING
                elif is_error:
                    state = mstate.MODULE_CANCELED
                else:
                    state = self.backend.next_task_state()
                workflow = branch.append_workflow(
                    modules=modules,
                    action=wf.ACTION_APPEND,
                    command=command,
                    pending_modules=[
                        ModuleHandle(state=state,
                                     command=command,
                                     external_form=external_form)
                    ])
                if not is_active and state != mstate.MODULE_CANCELED:
                    self.execute_module(project_id=project_id,
                                        branch_id=branch_id,
                                        module=workflow.modules[-1],
                                        datasets=datasets)
        return workflow.modules[-1]
Example 26
    def create_dataset(self,
                       identifier=None,
                       columns=None,
                       rows=None,
                       column_counter=None,
                       row_counter=None,
                       annotations=None):
        """Create a new dataset in the data store for the given data.

        Raises ValueError if (1) any of the column or row identifiers have a
        negative value, or (2) if the given column or row counter have a value
        lower or equal to any of the column or row identifiers.

        Parameters
        ----------
        identifier: string, optional
            Unique dataset identifier
        columns: list(vizier.datastore.base.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.base.DatasetRow)
            List of dataset rows.
        column_counter: int, optional
            Counter to generate unique column identifier
        row_counter: int, optional
            Counter to generate unique row identifier
        annotations: vizier.datastore.metadata.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.fs.FileSystemDatasetHandle
        """
        # Set columns and rows if not given
        if columns is None:
            columns = list()
        if rows is None:
            rows = list()
        else:
            # Validate the number of values in the given rows
            validate_schema(columns, rows)
        # Validate that all column identifiers are smaller than the given
        # column counter
        if column_counter is not None:
            for col in columns:
                if col.identifier >= column_counter:
                    raise ValueError('invalid column counter')
        else:
            # Set column counter to max. column identifier + 1
            column_counter = -1
            for col in columns:
                if col.identifier > column_counter:
                    column_counter = col.identifier
            column_counter += 1
        # Validate that all row ids are non-negative, unique, and lower than
        # the given row_counter
        max_rowid = -1
        row_ids = set()
        for row in rows:
            if row.identifier < 0:
                raise ValueError('invalid row identifier \'' +
                                 str(row.identifier) + '\'')
            elif row_counter is not None and row.identifier >= row_counter:
                raise ValueError('invalid row counter')
            elif row.identifier in row_ids:
                raise ValueError('duplicate row identifier \'' +
                                 str(row.identifier) + '\'')
            row_ids.add(row.identifier)
            if row_counter is None and row.identifier > max_rowid:
                max_rowid = row.identifier
        if row_counter is None:
            row_counter = max_rowid + 1
        # Get new identifier and create directory for new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        datafile = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(datafile).write(rows)
        # Create the dataset and write the dataset file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          row_count=len(rows),
                                          datafile=datafile,
                                          column_counter=column_counter,
                                          row_counter=row_counter,
                                          annotations=annotations)
        dataset.to_file(os.path.join(dataset_dir, HANDLE_FILE))
        # Write metadata file
        dataset.annotations.to_file(os.path.join(dataset_dir, METADATA_FILE))
        # Return handle for new dataset
        return dataset
Example 27
    def register_dataset(self,
                         table_name,
                         columns,
                         row_ids,
                         column_counter=None,
                         row_counter=None,
                         annotations=None,
                         update_rows=False):
        """Create a new record for a database table or view. Note that this
        method does not actually create the table or view in the database but
        adds the datasets metadata to the data store. The table or view will
        have been created by a load command or be the result from executing
        a lens or a VizUAL command.

        Parameters
        ----------
        table_name: string
            Name of relational database table or view containing the dataset.
        columns: list(vizier.datastore.mimir.MimirDatasetColumn)
            List of column names in the dataset schema and their corresponding
            names in the relational database table or view.
        row_ids: list(int)
            List of row ids. Determines the order of rows in the dataset
        column_counter: int
            Counter for unique column ids
        row_counter: int
            Counter for unique row ids
        annotations: vizier.datastore.metadata.DatasetMetadata
            Annotations for dataset components
        update_rows: bool, optional
            Flag indicating that the number of rows may have changed and the
            list of row identifiers therefore needs to be checked.

        Returns
        -------
        vizier.datastore.mimir.MimirDatasetHandle
        """
        # Depending on whether we need to update row ids we either query the
        # database or just get the schema. In either case mimir_schema will
        # contain the returned Mimir schema information.
        sql = get_select_query(table_name, columns=columns)
        mimir_schema = json.loads(mimir._mimir.getSchema(sql))
        if update_rows:
            sql = get_select_query(table_name)
            rs = json.loads(
                mimir._mimir.vistrailsQueryMimirJson(sql, False, False))
            # Get list of row identifiers in the current dataset. Row ids are
            # expected to be the only values in the returned result set.
            dataset_row_ids = set()
            for row in rs['data']:
                dataset_row_ids.add(int(row[0]))
            modified_row_ids = list()
            # Remove row ids that are no longer in the data.
            for row_id in row_ids:
                if row_id in dataset_row_ids:
                    modified_row_ids.append(row_id)
            # Add new row ids
            for row_id in dataset_row_ids:
                if row_id not in modified_row_ids:
                    modified_row_ids.append(row_id)
            # Replace row ids with modified list
            row_ids = modified_row_ids
        # Create a mapping of column name (in database) to column type. This
        # mapping is then used to update the data type information for all
        # column descriptors.
        col_types = dict()
        for col in mimir_schema:
            col_types[col['name']] = col['base_type']
        for col in columns:
            col.data_type = col_types[col.name_in_rdb]
        # Create column for the row identifier
        rowid_column = MimirDatasetColumn(name_in_dataset=ROW_ID,
                                          data_type=col_types[ROW_ID])
        # Set column counter to max column id + 1 if None
        if column_counter is None:
            column_counter = max_column_id(columns) + 1
        # Set row counter to the number of rows in the table if None
        if row_counter is None:
            sql = 'SELECT COUNT(*) AS RECCNT FROM ' + table_name
            rs = json.loads(
                mimir._mimir.vistrailsQueryMimirJson(sql, False, False))
            row_counter = int(rs['data'][0][0])
        dataset = MimirDatasetHandle(
            identifier=get_unique_identifier(),
            columns=map(lambda cn: self.bad_col_names.get(cn, cn), columns),
            rowid_column=rowid_column,
            table_name=table_name,
            row_ids=row_ids,
            column_counter=column_counter,
            row_counter=row_counter,
            annotations=annotations)
        # Create a new directory for the dataset if it doesn't exist.
        dataset_dir = self.get_dataset_dir(dataset.identifier)
        if not os.path.isdir(dataset_dir):
            os.makedirs(dataset_dir)
        # Write dataset and annotation file to disk
        dataset.to_file(self.get_dataset_file(dataset.identifier))
        dataset.annotations.to_file(
            self.get_metadata_filename(dataset.identifier))
        return dataset
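
The update_rows branch above keeps the original ordering for row ids that survive and appends any newly discovered ids at the end. The same merge, written as a small standalone helper for illustration only (not part of the vizier API; the new ids are sorted here for deterministic output, whereas the method above appends them in set iteration order):

def merge_row_ids(old_order, current_ids):
    """Keep surviving row ids in their original order, append new ids after."""
    known = set(old_order)
    # Row ids that still exist keep their previous position.
    merged = [row_id for row_id in old_order if row_id in current_ids]
    # Row ids that appeared since the last snapshot go to the end.
    merged.extend(row_id for row_id in sorted(current_ids) if row_id not in known)
    return merged

# Rows 2 and 5 were deleted, rows 7 and 8 appeared since the last snapshot.
print(merge_row_ids([1, 2, 3, 5], {1, 3, 7, 8}))   # [1, 3, 7, 8]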
Exemplo n.º 28
0
    def create_dataset(
            self,
            columns: List[DatasetColumn],
            rows: List[DatasetRow],
            properties: Optional[Dict[str, Any]] = None,
            human_readable_name: str = "Untitled Dataset",
            backend_options: Optional[List[Tuple[str, str]]] = None,
            dependencies: Optional[List[str]] = None) -> DatasetDescriptor:
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Raises ValueError if (1) the column identifiers are not unique, (2) the
        row identifiers are not unique, (3) the number of columns and values in
        a row do not match, (4) any of the column or row identifiers have a
        negative value, or (5) the given column or row counter has a value
        lower than or equal to any of the column or row identifiers.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        properties: dict(string, ANY), optional
            Properties for dataset components

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Validate (i) that each column has a unique identifier, (ii) each row
        # has a unique identifier, and (iii) that every row has exactly one
        # value per column.
        properties = {} if properties is None else properties
        dependencies = [] if dependencies is None else dependencies
        identifiers = set(
            int(row.identifier) for row in rows
            if row.identifier is not None and int(row.identifier) >= 0)
        identifiers.add(0)
        max_row_id = max(identifiers)
        rows = [
            DatasetRow(identifier=row.identifier if row.identifier is not None
                       and int(row.identifier) >= 0 else str(idx + max_row_id),
                       values=row.values,
                       caveats=row.caveats) for idx, row in enumerate(rows)
        ]
        _, max_row_id = validate_dataset(columns=columns, rows=rows)
        # Get new identifier and create directory for new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Create dataset and write dataset file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=max_row_id,
                                          properties=properties)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        # Write the properties file if properties are given
        if properties is not None:
            dataset.write_properties_to_file(
                self.get_properties_filename(identifier))
        # Return handle for new dataset
        return DatasetDescriptor(identifier=dataset.identifier,
                                 name=human_readable_name,
                                 columns=dataset.columns)
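
A minimal call sketch for this typed variant, assuming a datastore instance store and the DatasetColumn/DatasetRow constructors referenced in the signature; the import path and keyword names are assumptions that may differ slightly between releases:

from vizier.datastore.dataset import DatasetColumn, DatasetRow  # assumed import path

columns = [
    DatasetColumn(identifier=0, name='city'),
    DatasetColumn(identifier=1, name='population')
]
rows = [
    DatasetRow(identifier='0', values=['Lisbon', 545000]),
    DatasetRow(identifier='1', values=['Porto', 237000])
]
descriptor = store.create_dataset(
    columns=columns,
    rows=rows,
    properties={'source': 'manual entry'},
    human_readable_name='cities'
)
print(descriptor.identifier, [col.name for col in descriptor.columns])

Rows whose identifier is missing or negative are re-assigned an identifier derived from the current maximum, as the list comprehension in the method shows.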
Exemplo n.º 29
0
    def create_branch(self,
                      viztrail_id,
                      source_branch=DEFAULT_BRANCH,
                      workflow_version=-1,
                      properties=None,
                      module_id=-1):
        """Create a new workflow branch in a given viztrail. The new branch is
        created from the specified workflow in the source branch starting at
        module module_id. If module_id is negative the new branch starts after
        the last module of the source branch head workflow.

        Returns the handle for the new branch or None if the given viztrail does
        not exist. Raises ValueError if (1) the source branch does not exist,
        (2) no module with the specified identifier exists, or (3) an attempt is
        made to branch from an empty workflow.

        Parameters
        ----------
        viztrail_id : string
            Unique viztrail identifier
        source_branch : string, optional
            Unique branch identifier for existing branch
        workflow_version: int, optional
            Version number of the workflow that is being modified. If negative
            the branch head is being used.
        properties: dict, optional
            Set of properties for the new branch
        module_id: int, optional
            Start branch from module with given identifier in source_branch.
            The new branch starts at the end of the source branch if module_id
            has a negative value.

        Returns
        -------
        vizier.workflow.base.ViztrailBranch
        """
        # Get viztrail. Return None if the viztrail does not exist
        if viztrail_id not in self.cache:
            return None
        viztrail = self.cache[viztrail_id]
        # Raise exception if source branch does not exist
        if source_branch not in viztrail.branches:
            raise ValueError('unknown branch \'' + source_branch + '\'')
        # Get the referenced workflow. Raise exception if the workflow does not
        # exist or is empty
        workflow = viztrail.get_workflow(source_branch, workflow_version)
        if workflow is None:
            raise ValueError('unknown workflow')
        if len(workflow.modules) == 0:
            raise ValueError('attempt to branch from empty workflow')
        # Copy list of workflow modules depending on value of module_id
        if module_id < 0:
            modules = workflow.modules
        else:
            modules = []
            found = False
            for m in workflow.modules:
                modules.append(m)
                if m.identifier == module_id:
                    found = True
                    break
            if not found:
                raise ValueError('unknown module \'' + str(module_id) + '\'')
        # Make a copy of the source workflow for the branch
        result = viztrail.engine.copy_workflow(viztrail.version_counter.inc(),
                                               modules)
        # Create file for new workflow
        created_at = viztrail.write_workflow(result)
        # Create new branch handle
        target_branch = get_unique_identifier()
        # Store provenance information for new branch in file
        prov_file = branch_prov_file(viztrail.fs_dir, target_branch)
        FileSystemBranchProvenance.to_file(prov_file, source_branch,
                                           workflow.version,
                                           result.modules[-1].identifier)
        branch = ViztrailBranch(
            target_branch,
            FilePropertiesHandler(branch_file(viztrail.fs_dir, target_branch),
                                  properties),
            FileSystemBranchProvenance(prov_file),
            workflows=[
                WorkflowVersionDescriptor(result.version,
                                          action=ACTION_CREATE,
                                          package_id=PACKAGE_SYS,
                                          command_id=SYS_CREATE_BRANCH,
                                          created_at=created_at)
            ])
        # Update the viztrail on disk
        viztrail.branches[target_branch] = branch
        viztrail.to_file()
        return branch
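
A usage sketch, assuming a repository instance repo of the class shown above and an existing viztrail handle; the property keys are illustrative:

# Branch off the head workflow of the default branch, after its last module.
branch = repo.create_branch(
    viztrail_id=viztrail.identifier,
    properties={'name': 'experiment-1'}
)

# Or branch from a specific module of a specific workflow version.
branch_at_module = repo.create_branch(
    viztrail_id=viztrail.identifier,
    source_branch=DEFAULT_BRANCH,
    workflow_version=4,
    module_id=2,
    properties={'name': 'continue-from-module-2'}
)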