def test_create_and_delete_folder(self):
    """Test default functionality of create_folder and delete_folder methods.

    Exercises explicit and factory-generated identifiers, the
    keep_deleted_files flag, and the force_delete override. The assertions
    depend on the exact order of operations because folders persist on disk
    across store instances.
    """
    store = DefaultObjectStore()
    # Creating a folder with an explicit identifier returns that identifier.
    self.assertEqual(store.create_folder(BASE_DIRECTORY, identifier='A'), 'A')
    self.assertTrue(store.exists(store.join(BASE_DIRECTORY, 'A')))
    self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, 'A')))
    # Without an identifier the store generates one.
    identifier = store.create_folder(BASE_DIRECTORY)
    self.assertTrue(store.exists(store.join(BASE_DIRECTORY, identifier)))
    self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, identifier)))
    # New store with short identifier factory
    store = DefaultObjectStore(identifier_factory=get_short_identifier)
    short_id = store.create_folder(BASE_DIRECTORY)
    self.assertTrue(store.exists(store.join(BASE_DIRECTORY, short_id)))
    self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, short_id)))
    # Delete folder with identifier
    store.delete_folder(store.join(BASE_DIRECTORY, identifier))
    self.assertFalse(store.exists(store.join(BASE_DIRECTORY, identifier)))
    self.assertFalse(
        os.path.isdir(os.path.join(BASE_DIRECTORY, identifier)))
    # Delete folder with short_id when the keep_deleted_files flag is True:
    # the folder path must still exist on disk afterwards.
    store = DefaultObjectStore(keep_deleted_files=True)
    store.delete_folder(store.join(BASE_DIRECTORY, short_id))
    self.assertTrue(store.exists(store.join(BASE_DIRECTORY, short_id)))
    self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, short_id)))
    # Delete folder 'A' overriding the keep_deleted_files flag: force_delete
    # removes the folder even though the store keeps deleted files.
    self.assertTrue(store.exists(store.join(BASE_DIRECTORY, 'A')))
    self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, 'A')))
    store.delete_folder(store.join(BASE_DIRECTORY, 'A'), force_delete=True)
    self.assertFalse(store.exists(store.join(BASE_DIRECTORY, 'A')))
    self.assertFalse(os.path.isdir(os.path.join(BASE_DIRECTORY, 'A')))
def test_error_on_missing(self):
    """Reading an object whose backing file was removed must raise a
    ValueError.
    """
    store = DefaultObjectStore()
    path = store.join(BASE_DIRECTORY, 'A.file')
    store.create_object(BASE_DIRECTORY, identifier='A.file', content={'A': 1})
    self.assertTrue(store.exists(path))
    # A brand-new store instance must still be able to read the object.
    store = DefaultObjectStore()
    store.read_object(path)
    # Remove the backing file; the next read is expected to fail.
    os.remove(path)
    with self.assertRaises(ValueError):
        store.read_object(path)
def test_exists(self):
    """Verify that exists() reports both files and directories."""
    store = DefaultObjectStore()
    file_path = store.join(BASE_DIRECTORY, 'A.file')
    dir_path = store.join(BASE_DIRECTORY, 'A.dir')
    # Nothing exists before it has been created.
    self.assertFalse(store.exists(file_path))
    store.create_object(BASE_DIRECTORY, identifier='A.file')
    self.assertTrue(store.exists(file_path))
    self.assertFalse(store.exists(dir_path))
    os.makedirs(dir_path)
    self.assertTrue(store.exists(dir_path))
    # A fresh store instance sees the same on-disk state.
    store = DefaultObjectStore()
    self.assertTrue(store.exists(file_path))
    self.assertTrue(store.exists(dir_path))
def __init__(self, identifier: str,
             is_default: bool,
             base_path: str,
             modules_folder: str,
             provenance: BranchProvenance,
             properties: ObjectAnnotationSet,
             workflows: Optional[List[WorkflowDescriptor]] = None,
             head: Optional[WorkflowHandle] = None,
             object_store: Optional[ObjectStore] = None,
             cache_size: int = DEFAULT_CACHE_SIZE):
    """Initialize the branch handle.

    Parameters
    ----------
    identifier: string
        Unique branch identifier
    is_default: bool
        True if this is the default branch for its viztrail
    base_path: string
        Path to branch resources folder
    modules_folder: string
        Path to module resources folder
    provenance: vizier.viztrail.branch.BranchProvenance
        Branch provenance information
    properties: vizier.core.annotation.base.ObjectAnnotationSet
        Branch property set
    workflows: list(vizier.viztrail.workflow.WorkflowDescriptor), optional
        List of descriptors for workflows in branch history
    head: vizier.viztrail.workflow.WorkflowHandle, optional
        Workflow at the head of the branch
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources
    cache_size: int, optional
        Maximum size of the in-memory workflow cache
    """
    super(OSBranchHandle, self).__init__(
        identifier=identifier,
        properties=properties,
        provenance=provenance
    )
    self.is_default = is_default
    self.base_path = base_path
    self.modules_folder = modules_folder
    # Fall back to the default object store if none is given.
    self.object_store = init_value(object_store, DefaultObjectStore())
    # NOTE: the previous signature used the mutable default `list()` for
    # workflows, which is shared across calls; None + init_value avoids
    # that pitfall with identical behavior for callers.
    self.workflows = init_value(workflows, list())
    self.head = head
    self.cache_size = cache_size if cache_size is not None else DEFAULT_CACHE_SIZE
    self.cache: List[WorkflowHandle] = list()
def __init__(self, object_path: str, object_store: Optional[ObjectStore] = None, properties: Optional[Dict[str, Any]] = None): """Initialize the file that maintains the properties. Annotations are read from file (if it exists). Provides the option to load an initial set of properties from a given dictionary. If the file exists and the properties dictionary is not None an exception is thrown. Parameters ---------- object_path: string Path to resource object_store: vizier.core.io.base.ObjectStore, optional Object store to materialize properties properties: dict, optional Dictionary with initial set of properties """ # Ensure that the object store is not None super().__init__() if object_store is None: object_store = DefaultObjectStore() if properties is not None: # Initialize properties from the given dictionary. The persistent # set can only be initialized once. if object_store.exists(object_path): raise ValueError('cannot initialize existing annotation set') # Initialize the default object annotation set super(PersistentAnnotationSet, self).__init__(writer=PersistentAnnotationStore( object_path=object_path, object_store=object_store)) for key in properties: value = properties[key] self.delete(key, persist=False) if isinstance(value, list): for val in value: self.add(key, val, persist=False) else: self.add(key, value, persist=False) cast(AnnotationStore, self.writer).store(self.elements) else: # Read properties from disk if the annotation file exists elements = dict() if object_store.exists(object_path): obj = cast(List[Dict[str, Any]], object_store.read_object(object_path)) for anno in obj: elements[anno['key']] = anno['value'] # Initialize the default object annotation set super(PersistentAnnotationSet, self).__init__( elements=elements, writer=PersistentAnnotationStore(object_path=object_path, object_store=object_store)) list.__init__(self, [{ 'key': k, 'value': v } for k, v in self.elements.items()])
def test_create_file_repeat(self):
    """Exercise create_object with an identifier factory that does not
    always produce unique identifiers: the store must retry while it can
    and raise RuntimeError once the factory runs out of fresh identifiers.
    """
    store = DefaultObjectStore(identifier_factory=IdFactory(
        max_attempts=MAX_ATTEMPS - 1))
    first = store.create_object(BASE_DIRECTORY)
    second = store.create_object(BASE_DIRECTORY)
    self.assertNotEqual(first, second)
    self.assertTrue(store.exists(store.join(BASE_DIRECTORY, first)))
    self.assertTrue(store.exists(store.join(BASE_DIRECTORY, second)))
    store.delete_object(store.join(BASE_DIRECTORY, first))
    store.delete_object(store.join(BASE_DIRECTORY, second))
    # With max_attempts above the limit the factory keeps colliding and the
    # second create must fail.
    store = DefaultObjectStore(identifier_factory=IdFactory(
        max_attempts=MAX_ATTEMPS + 1))
    first = store.create_object(BASE_DIRECTORY)
    with self.assertRaises(RuntimeError):
        store.create_object(BASE_DIRECTORY)
def __init__(self, object_path, object_store=None, annotations=None): """Initialize the file that maintains the annotations. Annotations are read from file (if it exists). Provides the option to load an initial set of annotations from a given dictionary. If the file exists and the annotations dictionary is not None an exception is thrown. Parameters ---------- object_path: string Path to resource object_store: vizier.core.io.base.ObjectStore, optional Object store to materialize annotations annotations: dict, optional Dictionary with initial set of annotations """ # Ensure that the object store is not None if object_store is None: object_store = DefaultObjectStore() if not annotations is None: # Initialize annotations from the given dictionary. The persistent # set can only be initialized once. if object_store.exists(object_path): raise ValueError('cannot initialize existing annotation set') # Initialize the default object annotation set super(PersistentAnnotationSet, self).__init__( writer=PersistentAnnotationStore( object_path=object_path, object_store=object_store ) ) for key in annotations: value = annotations[key] if isinstance(value, list): for val in value: self.add(key, val, persist=False) else: self.add(key, value, persist=False) self.writer.store(self.elements) else: # Read annotations from disk if the annotation file exists elements = dict() if object_store.exists(object_path): obj = object_store.read_object(object_path) for anno in obj: elements[anno['key']] = anno['value'] # Initialize the default object annotation set super(PersistentAnnotationSet, self).__init__( elements=elements, writer=PersistentAnnotationStore( object_path=object_path, object_store=object_store ) )
def __init__(
    self,
    viztrails: ViztrailRepository,
    container_file: str,
    config: AppConfig,
    datastores: DatastoreFactory,
    filestores: FilestoreFactory,
):
    """Initialize the cache components and load all projects in the given
    viztrails repository. Maintains all projects in an dictionary keyed by
    their identifier.

    Parameters
    ----------
    viztrails: vizier.vizual.repository.ViztrailRepository
        Repository for viztrails
    container_file: string
        Path to the container information file
    config: vizier.config.app.AppConfig
        Application object
    datastores: vizier.datastore.factory.DatastoreFactory
        Factory for per-project datastores
    filestores: vizier.filestore.factory.FilestoreFactory
        Factory for per-project filestores
    """
    self.viztrails = viztrails
    self.container_file = container_file
    self.config = config
    self.container_image = config.engine.backend.container.image
    # Keep track of the port numbers for the project containers.
    self.ports = config.engine.backend.container.ports
    # Instantiate the Docker daemon client using the default socket or
    # configuration in the environment. This may need to be adjusted for
    # production deployments.
    self.client = docker.from_env()
    # Read mapping of project identifier to container information
    self.store = DefaultObjectStore()
    containers = dict()
    if self.store.exists(self.container_file):
        for obj in cast(List[Dict[str, Any]],
                        self.store.read_object(self.container_file)):
            containers[obj['projectId']] = obj
    # Create index of project handles from existing viztrails. The project
    # handles do not have a reference to the datastore or filestore.
    self.projects = dict()
    self.datastores = datastores
    self.filestores = filestores
    for viztrail in self.viztrails.list_viztrails():
        # NOTE(review): raises KeyError if a viztrail has no entry in the
        # container file — confirm every viztrail is guaranteed a container
        # record.
        container = containers[viztrail.identifier]
        project = ContainerProjectHandle(
            viztrail=viztrail,
            container_api=container['url'],
            port=container['port'],
            container_id=container['containerId'],
            datastore=self.datastores.get_datastore(viztrail.identifier),
            filestore=self.filestores.get_filestore(viztrail.identifier))
        self.projects[viztrail.identifier] = project
def __init__(self, object_path, object_store=None):
    """Initialize the path to the resource in the object store.

    By default annotation sets are persisted as files on the local file
    system.

    Parameters
    ----------
    object_path: string
        Path to the resource
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store to materialize annotations
    """
    # Fall back to the default (file-system) object store when none given.
    self.object_store = init_value(object_store, DefaultObjectStore())
    self.object_path = object_path
def __init__(self, identifier: str,
             properties: PersistentAnnotationSet,
             base_path: str,
             branches: List[BranchHandle],
             default_branch: Optional[BranchHandle],
             object_store: Optional[ObjectStore] = None,
             created_at: Optional[datetime] = None,
             branch_index: Optional[str] = None,
             branch_folder: Optional[str] = None,
             modules_folder: Optional[str] = None):
    """Initialize the viztrail descriptor.

    Parameters
    ----------
    identifier : string
        Unique viztrail identifier
    properties: dict(string, any)
        Dictionary of user-defined properties
    base_path: string
        Identifier for folder containing viztrail resources
    branches: list(vizier.viztrail.branch.BranchHandle)
        List of branches in the viztrail
    default_branch: vizier.viztrail.branch.BranchHandle
        Default branch for the viztrail
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources.
        Defaults to a new DefaultObjectStore.
    created_at : datetime.datetime, optional
        Timestamp of project creation (UTC). Defaults to the current time.
    branch_index: string, optional
        Path to branch index list
    branch_folder: string, optional
        Path to branches folder
    modules_folder: string, optional
        Path to modules folder
    """
    # Resolve defaults at call time. The previous signature used
    # `object_store=DefaultObjectStore()` and `created_at=get_current_time()`
    # as default values, which Python evaluates once at import time: all
    # instances shared one store object and received the module-load time
    # instead of their actual creation time.
    if object_store is None:
        object_store = DefaultObjectStore()
    if created_at is None:
        created_at = get_current_time()
    super(OSViztrailHandle, self).__init__(identifier=identifier,
                                           properties=properties,
                                           branches=branches,
                                           default_branch=default_branch,
                                           created_at=created_at)
    # Initialize the object store and identifier for all subfolders.
    self.base_path = base_path
    self.object_store = object_store
    self.branch_folder = init_value(
        branch_folder, self.object_store.join(base_path, FOLDER_BRANCHES))
    self.branch_index = init_value(
        branch_index, self.object_store.join(self.branch_folder, OBJ_BRANCHINDEX))
    self.modules_folder = init_value(
        modules_folder, self.object_store.join(base_path, FOLDER_MODULES))
def __init__(self, identifier, command, external_form, module_path,
             state=None, timestamp=None, datasets=None, outputs=None,
             provenance=None, object_store=None):
    """Initialize the module handle. For new modules, datasets and outputs
    are initially empty.

    Parameters
    ----------
    identifier : string
        Unique module identifier
    command : vizier.viztrail.command.ModuleCommand
        Specification of the module (i.e., package, name, and arguments)
    external_form: string
        Printable representation of module command
    module_path: string
        Path to module resource in object store
    state: int
        Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS);
        defaults to PENDING
    timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp, optional
        Module timestamp
    datasets : dict(vizier.datastore.dataset.DatasetDescriptor), optional
        Dictionary of resulting datasets. Dataset descriptors are keyed by
        the user-specified dataset name.
    outputs: vizier.viztrail.module.output.ModuleOutputs, optional
        Module output streams STDOUT and STDERR
    provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
        Provenance information about datasets that were read and written by
        previous execution of the module.
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources
    """
    super(OSModuleHandle, self).__init__(
        identifier=identifier,
        command=command,
        external_form=external_form,
        # New modules start out pending unless a state is supplied.
        state=state if state is not None else mstate.MODULE_PENDING,
        timestamp=timestamp,
        datasets=datasets,
        outputs=outputs,
        provenance=provenance)
    self.module_path = module_path
    # Fall back to the default object store if none is given.
    self.object_store = object_store if object_store is not None else DefaultObjectStore()
def __init__(self, base_path: str, object_store: Optional[ObjectStore] = None): """Initialize the repository from a configuration dictionary. Expects a dictionary that contains at least the base path for the repository. The definition of the object store is optional. If none is given the default object store will be used. Parameters --------- base_path: string Path to the base directory for viztrail resources object_store: vizier.core.io.base.ObjectStore, optional Store for objects that represent viztrail resources not """ # Raise an exception if the base directory argument is not given if base_path is None: raise ValueError('missing path for base directory') # Create the base directory if it does not exist self.base_path = base_path if not os.path.isdir(self.base_path): os.makedirs(self.base_path) # The object store element is optional. If not given the default object # store is used. if object_store is not None: self.object_store: ObjectStore = object_store else: self.object_store = DefaultObjectStore() # Initialize the viztrails index. Create the index file if it does not # exist. self.viztrails_index = self.object_store.join(self.base_path, OBJ_VIZTRAILINDEX) if not self.object_store.exists(self.viztrails_index): self.object_store.create_object(parent_folder=self.base_path, identifier=OBJ_VIZTRAILINDEX, content=list()) # Load viztrails and intialize the remaining instance variables by # calling the constructor of the super class self.viztrails: Dict[str, OSViztrailHandle] = dict() for identifier in cast( Dict[str, Any], self.object_store.read_object(self.viztrails_index)): vt = OSViztrailHandle.load_viztrail( base_path=self.object_store.join(self.base_path, identifier), object_store=self.object_store) # We just got the identifier from the repository... the loaded # viztrail had better exist. assert vt is not None self.viztrails[vt.identifier] = vt
def test_list_folders(self):
    """Test list_folders method.

    Covers the create flag, listing after subfolders are added, and that
    plain files are not reported as folders.
    """
    store = DefaultObjectStore()
    # The result is an empty list even if the folder does not exist and
    # is not created using the create flag
    dirname = store.join(BASE_DIRECTORY, 'A')
    dirs = store.list_folders(parent_folder=dirname, create=False)
    self.assertEqual(len(dirs), 0)
    self.assertFalse(store.exists(dirname))
    # The result is an empty list after the folder is created using the
    # create flag
    dirs = store.list_folders(parent_folder=dirname, create=True)
    self.assertEqual(len(dirs), 0)
    self.assertTrue(store.exists(dirname))
    # Create directories and files
    os.makedirs(store.join(dirname, 'A'))
    dirs = store.list_folders(parent_folder=dirname)
    self.assertEqual(len(dirs), 1)
    self.assertTrue('A' in dirs)
    os.makedirs(store.join(dirname, 'B'))
    dirs = store.list_folders(parent_folder=dirname, create=True)
    self.assertEqual(len(dirs), 2)
    self.assertTrue('A' in dirs)
    self.assertTrue('B' in dirs)
    # Creating a file object in the base directory must not change the
    # folder listing of the 'A' subfolder.
    filename = store.join(BASE_DIRECTORY, 'A.file')
    store.create_object(BASE_DIRECTORY, identifier='A.file')
    dirs = store.list_folders(parent_folder=dirname, create=True)
    self.assertEqual(len(dirs), 2)
    self.assertTrue('A' in dirs)
    self.assertTrue('B' in dirs)
    # Re-create the store to ensure that this has no effect
    store = DefaultObjectStore()
    dirs = store.list_folders(parent_folder=dirname, create=True)
    self.assertEqual(len(dirs), 2)
    self.assertTrue('A' in dirs)
    self.assertTrue('B' in dirs)
def __init__(self, identifier, is_default, base_path, modules_folder,
             provenance, properties, workflows=None, head=None,
             object_store=None, cache_size=None):
    """Initialize the branch handle.

    Parameters
    ----------
    identifier: string
        Unique branch identifier
    is_default: bool
        True if this is the default branch for its viztrail
    base_path: string
        Path to branch resources folder
    modules_folder: string
        Path to module resources folder
    provenance: vizier.viztrail.branch.BranchProvenance
        Branch provenance information
    properties: vizier.core.annotation.base.ObjectAnnotationSet
        Branch property set
    workflows: list(vizier.viztrail.workflow.WorkflowDescriptor), optional
        List of descriptors for workflows in branch history
    head: vizier.viztrail.workflow.WorkflowHandle, optional
        Current workflow at the head of the branch
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources
    cache_size: int, optional
        Maximum size of the in-memory workflow cache; defaults to
        DEFAULT_CACHE_SIZE
    """
    super(OSBranchHandle, self).__init__(identifier=identifier,
                                         properties=properties,
                                         provenance=provenance)
    self.is_default = is_default
    self.base_path = base_path
    self.modules_folder = modules_folder
    # Fall back to defaults for the optional arguments.
    self.object_store = init_value(object_store, DefaultObjectStore())
    self.workflows = init_value(workflows, list())
    self.head = head
    self.cache_size = cache_size if cache_size is not None else DEFAULT_CACHE_SIZE
    self.cache = list()
def test_create_cache(self):
    """Test accessing and deleting projects for an empty repository.

    Creates two viztrails, writes a matching container-information file,
    and verifies that ContainerProjectCache wires each project to its
    container record (api url, port, container id).
    """
    viztrails = OSViztrailRepository(base_path=VIZTRAILS_DIR)
    vt1 = viztrails.create_viztrail(
        properties={PROPERTY_NAME: 'My Project'})
    vt2 = viztrails.create_viztrail(
        properties={PROPERTY_NAME: 'A Project'})
    # Materialize the container mapping the cache will read at startup.
    filename = os.path.join(SERVER_DIR, 'container.json')
    DefaultObjectStore().write_object(object_path=filename,
                                      content=[{
                                          'projectId': vt1.identifier,
                                          'url': 'API1',
                                          'port': 80,
                                          'containerId': 'ID1'
                                      }, {
                                          'projectId': vt2.identifier,
                                          'url': 'API2',
                                          'port': 81,
                                          'containerId': 'ID2'
                                      }])
    # Initialize the project cache
    viztrails = OSViztrailRepository(base_path=VIZTRAILS_DIR)
    filestores_dir = os.path.join(SERVER_DIR, DEFAULT_FILESTORES_DIR)
    datastores_dir = os.path.join(SERVER_DIR, DEFAULT_DATASTORES_DIR)
    projects = ContainerProjectCache(
        viztrails=viztrails,
        container_file=filename,
        config=AppConfig(),
        datastores=MimirDatastoreFactory(datastores_dir),
        filestores=FileSystemFilestoreFactory(filestores_dir))
    # Each project must reflect the container record written above.
    self.assertEqual(len(projects.list_projects()), 2)
    self.assertEqual(
        projects.get_project(vt1.identifier).container_api, 'API1')
    self.assertEqual(
        projects.get_project(vt2.identifier).container_api, 'API2')
    self.assertEqual(
        projects.get_project(vt1.identifier).container_id, 'ID1')
    self.assertEqual(
        projects.get_project(vt2.identifier).container_id, 'ID2')
    self.assertEqual(projects.get_project(vt1.identifier).port, 80)
    self.assertEqual(projects.get_project(vt2.identifier).port, 81)
def test_create_object_with_identifier(self):
    """Test creating a new object with a given identifier.

    Also verifies that re-creating an object under an existing identifier
    replaces its content (including replacing content with no content).
    """
    store = DefaultObjectStore()
    # An object created without content exists as a file but reading it
    # raises ValueError.
    store.create_object(BASE_DIRECTORY, identifier='A')
    self.assertTrue(os.path.isfile(os.path.join(BASE_DIRECTORY, 'A')))
    with self.assertRaises(ValueError):
        store.read_object(store.join(BASE_DIRECTORY, 'A'))
    store.create_object(BASE_DIRECTORY, identifier='B', content={'id': 100})
    self.assertTrue(os.path.isfile(os.path.join(BASE_DIRECTORY, 'B')))
    content = store.read_object(store.join(BASE_DIRECTORY, 'B'))
    self.assertEqual(content['id'], 100)
    # Re-creating 'A' with content overwrites the empty object.
    store.create_object(BASE_DIRECTORY, identifier='A', content={'id': 100})
    self.assertTrue(os.path.isfile(os.path.join(BASE_DIRECTORY, 'A')))
    content = store.read_object(store.join(BASE_DIRECTORY, 'A'))
    self.assertEqual(content['id'], 100)
    # Re-creating 'B' without content makes it unreadable again.
    store.create_object(BASE_DIRECTORY, identifier='B')
    self.assertTrue(os.path.isfile(os.path.join(BASE_DIRECTORY, 'B')))
    with self.assertRaises(ValueError):
        store.read_object(store.join(BASE_DIRECTORY, 'B'))
def load_module(identifier, module_path, prev_state=None, object_store=None):
    """Load module from given object store.

    Parameters
    ----------
    identifier: string
        Unique module identifier
    module_path: string
        Resource path for module object
    prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor)
        Dataset descriptors keyed by the user-provided name that exist in
        the database state of the previous module (in sequence of
        occurrence in the workflow)
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources

    Returns
    -------
    vizier.viztrail.objectstore.module.OSModuleHandle
    """
    # Make sure the object store is not None
    if object_store is None:
        object_store = DefaultObjectStore()
    # Read object from store. This may raise a ValueError to indicate that
    # the module does not exist (in a system error condition). In this
    # case we return a new module that is in error state.
    try:
        obj = object_store.read_object(object_path=module_path)
    except ValueError:
        return OSModuleHandle(
            identifier=identifier,
            command=ModuleCommand(package_id=UNKNOWN_ID,
                                  command_id=UNKNOWN_ID),
            external_form='fatal error: object not found',
            module_path=module_path,
            state=mstate.MODULE_ERROR,
            object_store=object_store)
    # Create module command
    command = ModuleCommand(package_id=obj[KEY_COMMAND][KEY_PACKAGE_ID],
                            command_id=obj[KEY_COMMAND][KEY_COMMAND_ID],
                            arguments=obj[KEY_COMMAND][KEY_ARGUMENTS])
    # Create module timestamps. Only created_at is guaranteed to be
    # present; started_at and finished_at are optional.
    created_at = to_datetime(obj[KEY_TIMESTAMP][KEY_CREATED_AT])
    if KEY_STARTED_AT in obj[KEY_TIMESTAMP]:
        started_at = to_datetime(obj[KEY_TIMESTAMP][KEY_STARTED_AT])
    else:
        started_at = None
    if KEY_FINISHED_AT in obj[KEY_TIMESTAMP]:
        finished_at = to_datetime(obj[KEY_TIMESTAMP][KEY_FINISHED_AT])
    else:
        finished_at = None
    timestamp = ModuleTimestamp(created_at=created_at,
                                started_at=started_at,
                                finished_at=finished_at)
    # Create module output streams
    outputs = ModuleOutputs(
        stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]),
        stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR]))
    # Create module provenance information. Each component is optional in
    # the serialized object; missing components stay None.
    read_prov = None
    if KEY_PROVENANCE_READ in obj[KEY_PROVENANCE]:
        read_prov = dict()
        for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_READ]:
            read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID]
    write_prov = None
    if KEY_PROVENANCE_WRITE in obj[KEY_PROVENANCE]:
        write_prov = dict()
        for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_WRITE]:
            descriptor = DatasetDescriptor(
                identifier=ds[KEY_DATASET_ID],
                columns=[
                    DatasetColumn(identifier=col[KEY_COLUMN_ID],
                                  name=col[KEY_COLUMN_NAME],
                                  data_type=col[KEY_COLUMN_TYPE])
                    for col in ds[KEY_DATASET_COLUMNS]
                ],
                row_count=ds[KEY_DATASET_ROWCOUNT])
            write_prov[ds[KEY_DATASET_NAME]] = descriptor
    delete_prov = None
    if KEY_PROVENANCE_DELETE in obj[KEY_PROVENANCE]:
        delete_prov = obj[KEY_PROVENANCE][KEY_PROVENANCE_DELETE]
    res_prov = None
    if KEY_PROVENANCE_RESOURCES in obj[KEY_PROVENANCE]:
        res_prov = obj[KEY_PROVENANCE][KEY_PROVENANCE_RESOURCES]
    charts_prov = None
    if KEY_PROVENANCE_CHARTS in obj[KEY_PROVENANCE]:
        charts_prov = [
            ChartViewHandle.from_dict(c)
            for c in obj[KEY_PROVENANCE][KEY_PROVENANCE_CHARTS]
        ]
    provenance = ModuleProvenance(read=read_prov,
                                  write=write_prov,
                                  delete=delete_prov,
                                  resources=res_prov,
                                  charts=charts_prov)
    # Create dictionary of dataset descriptors only if previous state is
    # given and the module is in SUCCESS state. Otherwise, the database
    # state is empty.
    if obj[KEY_STATE] == mstate.MODULE_SUCCESS and prev_state is not None:
        datasets = provenance.get_database_state(prev_state)
    else:
        datasets = dict()
    # Return module handle
    return OSModuleHandle(identifier=identifier,
                          command=command,
                          external_form=obj[KEY_EXTERNAL_FORM],
                          module_path=module_path,
                          state=obj[KEY_STATE],
                          timestamp=timestamp,
                          datasets=datasets,
                          outputs=outputs,
                          provenance=provenance,
                          object_store=object_store)
def create_module(command, external_form, state, timestamp, outputs,
                  provenance, module_folder, datasets=None,
                  object_store=None):
    """Create a new materialized module instance for the given values.

    Parameters
    ----------
    command : vizier.viztrail.command.ModuleCommand
        Specification of the module (i.e., package, name, and arguments)
    external_form: string
        Printable representation of module command
    state: int
        Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS)
    timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp
        Module timestamp
    outputs: vizier.viztrail.module.output.ModuleOutputs
        Module output streams STDOUT and STDERR
    provenance: vizier.viztrail.module.provenance.ModuleProvenance
        Provenance information about datasets that were read and written by
        previous execution of the module.
    module_folder: string
        Object store folder containing module resources
    datasets : dict(vizier.datastore.dataset.DatasetDescriptor), optional
        Dictionary of resulting datasets. Dataset descriptors are keyed by
        the user-specified dataset name.
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources

    Returns
    -------
    vizier.viztrail.objectstore.module.OSModuleHandle
    """
    # Make sure the object store is not None
    if object_store is None:
        object_store = DefaultObjectStore()
    # Serialize module components and materialize. The generated object
    # identifier becomes the module identifier.
    obj = serialize_module(command=command,
                           external_form=external_form,
                           state=state,
                           timestamp=timestamp,
                           outputs=outputs,
                           provenance=provenance)
    identifier = object_store.create_object(parent_folder=module_folder,
                                            content=obj)
    # Return handle for created module
    return OSModuleHandle(
        identifier=identifier,
        command=command,
        external_form=external_form,
        module_path=object_store.join(module_folder, identifier),
        state=state,
        timestamp=timestamp,
        datasets=datasets if datasets is not None else dict(),
        outputs=outputs,
        provenance=provenance,
        object_store=object_store)
def get_engine(config: AppConfig) -> VizierEngine:
    """Create instance of the default vizual engine using the default
    datastore, filestore and viztrails factories. The default engine may
    use a multi-process backend or a celery backend.

    Parameters
    ----------
    config: vizier.config.app.AppConfig
        Application configuration object

    Returns
    -------
    vizier.engine.base.VizierEngine

    Raises
    ------
    ValueError
        If the configured backend or engine identifier is unknown, or the
        engine/backend combination is invalid
    """
    # Get backend identifier. Raise ValueError if value does not identify
    # a valid backend.
    backend_id = config.engine.backend.identifier
    if backend_id not in base.BACKENDS:
        raise ValueError('unknown backend \'' + str(backend_id) + '\'')
    # Get the identifier factory for the viztrails repository and create
    # the object store. At this point we use the default object store only.
    # We could add another environment variable to use different object
    # stores (once implemented).
    if config.engine.use_short_ids:
        id_factory = get_short_identifier
    else:
        id_factory = get_unique_identifier
    object_store = DefaultObjectStore(identifier_factory=id_factory)
    # Create index of supported packages
    packages = load_packages(config.engine.package_path)
    # By default the vizier engine uses the objectstore implementation for
    # the viztrails repository. The datastore and filestore factories depend
    # on the values of engine identifier (DEV or MIMIR).
    base_dir = config.engine.data_dir
    # Create the local viztrails repository
    viztrails = OSViztrailRepository(
        base_path=os.path.join(base_dir, app.DEFAULT_VIZTRAILS_DIR),
        object_store=object_store
    )
    filestores_dir = os.path.join(base_dir, app.DEFAULT_FILESTORES_DIR)
    datastores_dir = os.path.join(base_dir, app.DEFAULT_DATASTORES_DIR)
    if config.engine.identifier in [base.DEV_ENGINE, base.MIMIR_ENGINE]:
        filestore_factory = FileSystemFilestoreFactory(filestores_dir)
        datastore_factory: DatastoreFactory
        if config.engine.identifier == base.DEV_ENGINE:
            datastore_factory = FileSystemDatastoreFactory(datastores_dir)
        else:
            datastore_factory = MimirDatastoreFactory(datastores_dir)
        # The default engine uses a common project cache.
        projects: ProjectCache = CommonProjectCache(
            datastores=datastore_factory,
            filestores=filestore_factory,
            viztrails=viztrails
        )
        # Get set of task processors for supported packages
        processors = load_processors(config.engine.processor_path)
        # Create an optional task processor for synchronous tasks if given.
        # The configuration value is a colon-separated list of
        # '<package>.<command>' entries.
        sync_commands_list = config.engine.sync_commands
        if sync_commands_list is not None:
            commands: Dict[str, Dict[str, TaskProcessor]] = dict()
            for el in sync_commands_list.split(':'):
                package_id, command_id = el.split('.')
                if package_id not in commands:
                    commands[package_id] = dict()
                commands[package_id][command_id] = processors[package_id]
            synchronous: TaskExecEngine = SynchronousTaskEngine(
                commands=commands,
                projects=projects
            )
        else:
            synchronous = NonSynchronousEngine()
        # Create the backend
        backend: VizierBackend
        if backend_id == base.BACKEND_MULTIPROCESS:
            backend = MultiProcessBackend(
                processors=processors,
                projects=projects,
                synchronous=synchronous
            )
        elif backend_id == base.BACKEND_CELERY:
            # Create and configure routing information (if given)
            backend = CeleryBackend(
                routes=config_routes(config),
                synchronous=synchronous
            )
        else:
            # Not all combinations of engine identifier and backend
            # identifier are valid.
            raise ValueError('invalid backend \'' + str(backend_id) + '\'')
    elif config.engine.identifier == base.CONTAINER_ENGINE:
        if backend_id == base.BACKEND_CONTAINER:
            projects = ContainerProjectCache(
                viztrails=viztrails,
                container_file=os.path.join(base_dir, app.DEFAULT_CONTAINER_FILE),
                config=config,
                datastores=MimirDatastoreFactory(datastores_dir),
                filestores=FileSystemFilestoreFactory(filestores_dir)
            )
            backend = ContainerBackend(projects=projects)
        else:
            # The container engine only supports a single backend type.
            raise ValueError('invalid backend \'' + str(backend_id) + '\'')
    else:
        raise ValueError('unknown vizier engine \'' + str(config.engine.identifier) + '\'')
    return VizierEngine(
        name=config.engine.identifier + ' (' + backend_id + ')',
        projects=projects,
        backend=backend,
        packages=packages
    )
def create_viztrail(identifier: str,
                    base_path: str,
                    object_store: Optional[ObjectStore] = None,
                    properties: Optional[Dict[str, Any]] = None):
    """Create a new viztrail resource.

    Creates subfolders for viztrail branches and modules under the (existing)
    base directory, writes the viztrail metadata object, creates an empty
    default branch, and materializes the branch index.

    Parameters
    ----------
    identifier: string
        Unique viztrail identifier
    base_path: string
        Identifier for folder containing viztrail resources
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources
    properties: dict(string, any), optional
        Dictionary of properties for the new viztrail

    Returns
    -------
    OSViztrailHandle
    """
    # Fall back to the default object store if none is given
    if object_store is None:
        object_store = DefaultObjectStore()
    # Create empty index file and subfolders for branches and modules.
    # The base path folder is expected to exist.
    branch_folder = object_store.join(base_path, FOLDER_BRANCHES)
    object_store.create_folder(base_path, identifier=FOLDER_BRANCHES)
    branch_index = object_store.join(branch_folder, OBJ_BRANCHINDEX)
    content: List[str] = []
    object_store.write_object(object_path=branch_index, content=content)
    modules_folder = object_store.join(base_path, FOLDER_MODULES)
    object_store.create_folder(base_path, identifier=FOLDER_MODULES)
    # Write viztrail metadata to disk
    created_at = get_current_time()
    object_store.write_object(
        object_path=object_store.join(base_path, OBJ_METADATA),
        content={
            KEY_IDENTIFIER: identifier,
            KEY_CREATED_AT: created_at.isoformat()
        }
    )
    # Create the default branch for the new viztrail
    default_branch = create_branch(
        provenance=BranchProvenance(created_at=created_at),
        properties={PROPERTY_NAME: DEFAULT_BRANCH},
        modules=None,
        branch_folder=branch_folder,
        modules_folder=modules_folder,
        object_store=object_store,
        is_default=True,
        created_at=created_at
    )
    # Materialize the updated branch index
    write_branch_index(
        branches={default_branch.identifier: default_branch},
        object_path=branch_index,
        object_store=object_store
    )
    # Return handle for new viztrail
    return OSViztrailHandle(
        identifier=identifier,
        properties=PersistentAnnotationSet(
            object_path=object_store.join(base_path, OBJ_PROPERTIES),
            object_store=object_store,
            properties=properties
        ),
        branches=[default_branch],
        default_branch=default_branch,
        created_at=created_at,
        base_path=base_path,
        object_store=object_store,
        branch_index=branch_index,
        branch_folder=branch_folder,
        modules_folder=modules_folder
    )
def load_viztrail(
        base_path: str,
        object_store: Optional[ObjectStore] = None
) -> Optional["OSViztrailHandle"]:
    """Load all viztrail resources from the given object store.

    Returns None if no metadata object exists at the given base path, i.e.,
    the viztrail does not exist.

    Parameters
    ----------
    base_path: string
        Identifier for folder containing viztrail resources
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources

    Returns
    -------
    OSViztrailHandle, or None if the metadata object is missing
    """
    # Fall back to the default object store if none is given. The explicit
    # None-check also narrows the type, so no cast is needed afterwards.
    if object_store is None:
        object_store = DefaultObjectStore()
    # Load viztrail metadata. A missing metadata object signals a missing
    # viztrail.
    metadata = object_store.read_object(
        object_store.join(base_path, OBJ_METADATA)
    )
    if metadata is None:
        return None
    metadata = cast(Dict[str, Any], metadata)
    identifier = metadata[KEY_IDENTIFIER]
    created_at = to_datetime(metadata[KEY_CREATED_AT])
    # Load active branches. The branch index resource contains a list of
    # active branch identifiers.
    branch_folder = object_store.join(base_path, FOLDER_BRANCHES)
    branch_index = object_store.join(branch_folder, OBJ_BRANCHINDEX)
    modules_folder = object_store.join(base_path, FOLDER_MODULES)
    branches = []
    default_branch: Optional[BranchHandle] = None
    branch_list = cast(
        List[Dict[str, Any]],
        object_store.read_object(branch_index)
    )
    for b in branch_list:
        branch_id = b[KEY_IDENTIFIER]
        is_default = b[KEY_DEFAULT]
        branches.append(
            OSBranchHandle.load_branch(
                identifier=branch_id,
                is_default=is_default,
                base_path=object_store.join(branch_folder, branch_id),
                modules_folder=modules_folder,
                object_store=object_store
            )
        )
        # Remember the (at most one) branch flagged as default
        if is_default:
            default_branch = branches[-1]
    # Return handle for the loaded viztrail
    return OSViztrailHandle(
        identifier=identifier,
        properties=PersistentAnnotationSet(
            object_path=object_store.join(base_path, OBJ_PROPERTIES),
            object_store=object_store
        ),
        branches=branches,
        default_branch=default_branch,
        created_at=created_at,
        base_path=base_path,
        object_store=object_store,
        branch_index=branch_index,
        branch_folder=branch_folder,
        modules_folder=modules_folder
    )
def load_module(
        identifier: str,
        module_path: str,
        prev_state: Optional[Dict[str, ArtifactDescriptor]] = None,
        object_store: Optional[ObjectStore] = None
) -> "OSModuleHandle":
    """Load module from given object store.

    If the module object cannot be read (e.g., it does not exist), a handle
    in error state is returned instead of raising the error.

    Parameters
    ----------
    identifier: string
        Unique module identifier
    module_path: string
        Resource path for module object
    prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor)
        Dataset descriptors keyed by the user-provided name that exist in
        the database state of the previous module (in sequence of occurrence
        in the workflow)
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources

    Returns
    -------
    vizier.viztrail.objectstore.module.OSModuleHandle
    """
    # Make sure the object store is not None. Previously the default was a
    # single DefaultObjectStore() instance created at import time (mutable
    # default argument); creating it per call avoids sharing one instance
    # across all callers.
    if object_store is None:
        object_store = DefaultObjectStore()
    # Read object from store. This may raise a ValueError to indicate that
    # the module does not exist (in a system error condition). In this
    # case we return a new module that is in error state.
    try:
        obj = cast(
            Dict[str, Any],
            object_store.read_object(object_path=module_path)
        )
    except ValueError:
        return OSModuleHandle(
            identifier=identifier,
            command=ModuleCommand(
                package_id=UNKNOWN_ID,
                command_id=UNKNOWN_ID,
                arguments=list(),
                packages=None
            ),
            external_form='fatal error: object not found',
            module_path=module_path,
            state=mstate.MODULE_ERROR,
            object_store=object_store
        )
    # Create module command
    command = ModuleCommand(
        package_id=obj[KEY_COMMAND][KEY_PACKAGE_ID],
        command_id=obj[KEY_COMMAND][KEY_COMMAND_ID],
        arguments=obj[KEY_COMMAND][KEY_ARGUMENTS],
        packages=None
    )
    # Create module timestamps. Started/finished timestamps are optional.
    created_at = to_datetime(obj[KEY_TIMESTAMP][KEY_CREATED_AT])
    if KEY_STARTED_AT in obj[KEY_TIMESTAMP]:
        started_at: Optional[datetime] = to_datetime(
            obj[KEY_TIMESTAMP][KEY_STARTED_AT]
        )
    else:
        started_at = None
    if KEY_FINISHED_AT in obj[KEY_TIMESTAMP]:
        finished_at: Optional[datetime] = to_datetime(
            obj[KEY_TIMESTAMP][KEY_FINISHED_AT]
        )
    else:
        finished_at = None
    timestamp = ModuleTimestamp(
        created_at=created_at,
        started_at=started_at,
        finished_at=finished_at
    )
    # Create module output streams.
    outputs = ModuleOutputs(
        stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]),
        stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR])
    )
    # Create module provenance information. The read dependencies map
    # dataset names to dataset identifiers.
    read_prov = None
    if KEY_PROVENANCE_READ in obj[KEY_PROVENANCE]:
        read_prov = dict()
        for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_READ]:
            read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID]
    # Write dependencies are either generic artifacts or full dataset
    # descriptors (distinguished by the presence of a data-object type).
    write_prov = None
    if KEY_PROVENANCE_WRITE in obj[KEY_PROVENANCE]:
        write_prov = dict()
        for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_WRITE]:
            if KEY_DATAOBJECT_TYPE in ds:
                descriptor = ArtifactDescriptor(
                    identifier=ds[KEY_DATAOBJECT_ID],
                    name=ds[KEY_DATAOBJECT_NAME],
                    artifact_type=ds[KEY_DATAOBJECT_TYPE])
            else:
                descriptor = DatasetDescriptor(
                    identifier=ds[KEY_DATASET_ID],
                    name=ds[KEY_DATASET_NAME],
                    columns=[
                        DatasetColumn(
                            identifier=col[KEY_COLUMN_ID],
                            name=col[KEY_COLUMN_NAME],
                            data_type=col[KEY_COLUMN_TYPE]
                        ) for col in ds[KEY_DATASET_COLUMNS]
                    ]
                )
            write_prov[ds[KEY_DATASET_NAME]] = descriptor
    if KEY_PROVENANCE_DELETE in obj[KEY_PROVENANCE]:
        delete_prov = set(obj[KEY_PROVENANCE][KEY_PROVENANCE_DELETE])
    else:
        delete_prov = set()
    if KEY_PROVENANCE_RESOURCES in obj[KEY_PROVENANCE]:
        res_prov = cast(
            Dict[str, Any],
            obj[KEY_PROVENANCE][KEY_PROVENANCE_RESOURCES]
        )
    else:
        res_prov = dict()
    # Charts are stored either as [name, chart] pairs or as a bare chart
    # document (legacy form), which gets the placeholder name "Chart".
    if KEY_PROVENANCE_CHARTS in obj[KEY_PROVENANCE]:
        charts_prov = [
            (
                c[0],
                ChartViewHandle.from_dict(c[1])  # type: ignore[no-untyped-call]
            ) if isinstance(c, list) else (
                "Chart",
                ChartViewHandle.from_dict(c)
            )
            for c in obj[KEY_PROVENANCE][KEY_PROVENANCE_CHARTS]
        ]
    else:
        charts_prov = list()
    provenance = ModuleProvenance(
        read=read_prov,
        write=write_prov,
        delete=delete_prov,
        resources=res_prov,
        charts=charts_prov
    )
    # Return module handle
    return OSModuleHandle(
        identifier=identifier,
        command=command,
        external_form=obj[KEY_EXTERNAL_FORM],
        module_path=module_path,
        state=obj[KEY_STATE],
        timestamp=timestamp,
        outputs=outputs,
        provenance=provenance,
        object_store=object_store,
    )
def get_engine(config):
    """Create instance of vizier engine using the default datastore,
    filestore and viztrails factories.

    The default engine may use a multi-process backend or a celery backend.

    Parameters
    ----------
    config: vizier.config.app.AppConfig
        Application configuration object

    Returns
    -------
    vizier.engine.base.VizierEngine

    Raises
    ------
    ValueError
        If the backend or engine identifier is unknown
    """
    # Get backend identifier. Raise ValueError if value does not identify
    # a valid backend.
    backend_id = config.engine.backend.identifier
    if backend_id not in base.BACKENDS:
        raise ValueError('unknown backend \'' + str(backend_id) + '\'')
    # Get the identifier factory for the viztrails repository and create
    # the object store. At this point we use the default object store only.
    # We could add another environment variable to use different object
    # stores (once implemented).
    if config.engine.use_short_ids:
        id_factory = get_short_identifier
    else:
        id_factory = get_unique_identifier
    # NOTE(review): object_store is created here but not referenced below in
    # this function — confirm whether it is still needed.
    object_store = DefaultObjectStore(
        identifier_factory=id_factory
    )
    # The datastore and filestore factories depend on the value of the
    # engine identifier (DEV or MIMIR).
    base_dir = config.engine.data_dir
    if config.engine.identifier in [base.DEV_ENGINE, base.MIMIR_ENGINE]:
        filestores_dir = os.path.join(base_dir, app.DEFAULT_FILESTORES_DIR)
        filestore_factory = FileSystemFilestoreFactory(filestores_dir)
        datastores_dir = os.path.join(base_dir, app.DEFAULT_DATASTORES_DIR)
        if config.engine.identifier == base.DEV_ENGINE:
            datastore_factory = FileSystemDatastoreFactory(datastores_dir)
        else:
            datastore_factory = MimirDatastoreFactory(datastores_dir)
    else:
        raise ValueError(
            'unknown vizier engine \'' + str(config.engine.identifier) + '\''
        )
    # This engine variant maintains a cache holding a single project.
    projects = SingleProjectCache(
        ProjectHandle(
            viztrail=ViztrailHandle(identifier=config.project_id),
            datastore=datastore_factory.get_datastore(config.project_id),
            filestore=filestore_factory.get_filestore(config.project_id)
        )
    )
    # Create workflow execution backend and processor for synchronous task
    packages = load_packages(config.engine.package_path)
    processors = load_processors(config.engine.processor_path)
    # Create the backend
    if backend_id == base.BACKEND_MULTIPROCESS:
        backend = MultiProcessBackend(
            processors=processors,
            projects=projects,
            synchronous=None
        )
    elif backend_id == base.BACKEND_CELERY:
        # Create and configure routing information (if given)
        backend = CeleryBackend(
            routes=config_routes(config),
            synchronous=None
        )
    else:
        # For completeness. Validity of the backend id has been checked
        # above.
        raise ValueError('unknown backend \'' + str(backend_id) + '\'')
    return VizierEngine(
        name=config.engine.identifier + ' (' + backend_id + ')',
        projects=projects,
        backend=backend,
        packages=packages
    )
def create_branch(
        identifier: str,
        base_path: str,
        modules_folder: str,
        is_default: bool = False,
        provenance: Optional[BranchProvenance] = None,
        properties: Optional[Dict[str, Any]] = None,
        created_at: Optional[datetime] = None,
        modules: Optional[List[str]] = None,
        object_store: Optional[ObjectStore] = None
):
    """Create a new branch.

    If the workflow is given the new branch contains exactly this workflow.
    Otherwise, the branch is empty. Raises ValueError if any of the modules
    in the given list is in an active state, or if the base path does not
    exist.

    Parameters
    ----------
    identifier: string
        Unique branch identifier
    base_path: string
        Path to the folder for branch resources
    modules_folder: string
        Path to module resources folder
    is_default: bool, optional
        True if this is the default branch for its viztrail
    provenance: vizier.viztrail.branch.BranchProvenance, optional
        Branch provenance information
    properties: dict, optional
        Initial set of branch properties
    created_at: datetime.datetime, optional
        Timestamp for branch creation
    modules: list(string), optional
        List of module identifier for the modules in the workflow at the
        head of the branch
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources

    Returns
    -------
    vizier.viztrail.objectstore.branch.OSBranchHandle
    """
    # Make sure the object store is not None
    if object_store is None:
        object_store = DefaultObjectStore()
    # If base path does not exist raise an exception
    if not object_store.exists(base_path):
        raise ValueError('base path does not exist')
    # Read module handles first to ensure that none of the modules is in
    # an active state
    if modules is not None:
        wf_modules = read_workflow_modules(
            modules_list=modules,
            modules_folder=modules_folder,
            object_store=object_store
        )
        for m in wf_modules:
            if m.is_active:
                raise ValueError('cannot branch from active workflow')
    # Set provenance object if not given
    if provenance is None:
        provenance = BranchProvenance()
    # Write provenance information to disk
    doc: Dict[str, Any] = {KEY_CREATED_AT: provenance.created_at.isoformat()}
    if provenance.source_branch is not None:
        # If one property is not None all are expected to be not None
        doc[KEY_SOURCE_BRANCH] = provenance.source_branch
        doc[KEY_WORKFLOW_ID] = provenance.workflow_id
        doc[KEY_MODULE_ID] = provenance.module_id
    object_store.write_object(
        object_path=object_store.join(base_path, OBJ_METADATA),
        content=doc
    )
    # Create the initial workflow if the list of modules is given
    workflows = []
    head = None
    if modules is not None:
        # Write handle for workflow at branch head
        descriptor = write_workflow_handle(
            modules=modules,
            workflow_count=0,
            base_path=base_path,
            object_store=object_store,
            action=ACTION_CREATE,
            created_at=provenance.created_at
        )
        workflows.append(descriptor)
        # Set the new workflow as the branch head
        head = WorkflowHandle(
            identifier=descriptor.identifier,
            branch_id=identifier,
            modules=wf_modules,
            descriptor=descriptor
        )
    # Return handle for new viztrail branch
    return OSBranchHandle(
        identifier=identifier,
        is_default=is_default,
        base_path=base_path,
        modules_folder=modules_folder,
        provenance=provenance,
        properties=PersistentAnnotationSet(
            object_path=object_store.join(base_path, OBJ_PROPERTIES),
            object_store=object_store,
            properties=properties
        ),
        workflows=workflows,
        head=head,
        object_store=object_store
    )
def load_branch(
        identifier: str,
        is_default: bool,
        base_path: str,
        modules_folder: str,
        object_store: Optional[ObjectStore] = None
):
    """Load branch from disk.

    Reads the branch provenance information and descriptors for all
    workflows in the branch history. If the branch history is not empty the
    modules for the workflow at the branch head will be read as well.

    Parameters
    ----------
    identifier: string
        Unique branch identifier
    is_default: bool
        True if this is the default branch for its viztrail
    base_path: string
        Path to folder containing branch resources
    modules_folder: string
        Path to folder containing workflow modules
    object_store: vizier.core.io.base.ObjectStore, optional
        Object store implementation to access and maintain resources

    Returns
    -------
    vizier.viztrail.objectstore.branch.OSBranchHandle
    """
    # Make sure the object store is not None
    if object_store is None:
        object_store = DefaultObjectStore()
    # Load branch provenance. The object will contain the created_at
    # timestamp and optionally the three entries that define the branch
    # point.
    doc = cast(Dict[str, Any], object_store.read_object(
        object_store.join(base_path, OBJ_METADATA)
    ))
    created_at = to_datetime(doc[KEY_CREATED_AT])
    # Four entries means the three branch-point keys are present in
    # addition to the created_at timestamp.
    if len(doc) == 4:
        provenance = BranchProvenance(
            source_branch=doc[KEY_SOURCE_BRANCH],
            workflow_id=doc[KEY_WORKFLOW_ID],
            module_id=doc[KEY_MODULE_ID],
            created_at=created_at
        )
    else:
        provenance = BranchProvenance(created_at=created_at)
    # Read descriptors for all branch workflows. Workflow descriptors are
    # objects in the base directory that do not match the name of any of
    # the predefined branch objects.
    workflows = []
    for resource in object_store.list_objects(base_path):
        if resource not in (OBJ_METADATA, OBJ_PROPERTIES):
            resource_path = object_store.join(base_path, resource)
            obj = cast(
                Dict[str, Any],
                object_store.read_object(resource_path)
            )
            desc = obj[KEY_WORKFLOW_DESCRIPTOR]
            workflows.append(
                WorkflowDescriptor(
                    identifier=obj[KEY_WORKFLOW_ID],
                    action=desc[KEY_ACTION],
                    package_id=desc[KEY_PACKAGE_ID],
                    command_id=desc[KEY_COMMAND_ID],
                    created_at=to_datetime(desc[KEY_CREATED_AT])
                )
            )
    # Sort workflows in ascending order of their identifier
    workflows.sort(key=lambda x: x.identifier)
    # Read all modules for the workflow at the branch head (if exists)
    head = None
    if len(workflows) > 0:
        # The workflow descriptor is the last element in the workflows list
        descriptor = workflows[-1]
        head = read_workflow(
            branch_id=identifier,
            workflow_descriptor=descriptor,
            workflow_path=object_store.join(
                base_path,
                descriptor.identifier
            ),
            modules_folder=modules_folder,
            object_store=object_store
        )
    return OSBranchHandle(
        identifier=identifier,
        is_default=is_default,
        base_path=base_path,
        modules_folder=modules_folder,
        provenance=provenance,
        properties=PersistentAnnotationSet(
            object_path=object_store.join(base_path, OBJ_PROPERTIES),
            object_store=object_store
        ),
        workflows=workflows,
        head=head,
        object_store=object_store
    )