def register_pipeline(self, config: Dict[Text, Any]):
    """
    Registers a pipeline in the artifact store as a YAML file.

    Args:
        config: dict representation of ZenML config.
    """
    Repository.get_instance().register_pipeline(
        file_name=self.file_name, config=config)

def __init__(self, name: Text, schema: Dict = None, _id: Text = None,
             *args, **kwargs):
    """
    Construct the datasource.

    Args:
        name (str): name of datasource.
        schema (dict): schema of datasource.
        _id: unique ID (for internal use).
    """
    if _id:
        # It's loaded from config
        self._id = _id
        logger.debug(f'Datasource {name} loaded.')
    else:
        # If None, then this is assumed to be 'new'. Check for duplicates.
        all_names = Repository.get_instance().get_datasource_names()
        if any(d == name for d in all_names):
            raise AlreadyExistsException(
                name=name, resource_type='datasource')
        self._id = str(uuid4())
        track(event=CREATE_DATASOURCE)
        logger.info(f'Datasource {name} created.')

    self.name = name
    self.schema = schema
    self._immutable = False
    self._source = source_utils.resolve_source_path(
        self.__class__.__module__ + '.' + self.__class__.__name__
    )

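# A minimal usage sketch of the duplicate check above, mirroring how the
# quickstart example later in this file handles it. The datasource name and
# path here are illustrative only: creating a datasource whose name already
# exists in the repository raises AlreadyExistsException, which callers can
# catch to fetch the existing instance instead.
try:
    ds = CSVDatasource(name='my_data', path='/tmp/my_data.csv')
except AlreadyExistsException:
    ds = Repository.get_instance().get_datasource_by_name('my_data')
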
def run(self, config: Dict[Text, Any]):
    # Extract the paths to create the tar
    logger.info('Orchestrating pipeline on Kubernetes..')

    repo: Repository = Repository.get_instance()
    repo_path = repo.path
    config_dir = repo.zenml_config.config_dir
    tar_file_name = \
        f'{EXTRACTED_TAR_DIR_NAME}_{str(int(time.time()))}.tar.gz'
    path_to_tar = os.path.join(config_dir, tar_file_name)

    # Create tarfile but exclude the .zenml folder if it exists
    path_utils.create_tarfile(repo_path, path_to_tar)
    logger.info(f'Created tar of current repository at: {path_to_tar}')

    # Upload tar to artifact store
    store_path = config[keys.GlobalKeys.ARTIFACT_STORE]
    store_staging_area = os.path.join(store_path, STAGING_AREA)
    store_path_to_tar = os.path.join(store_staging_area, tar_file_name)
    path_utils.copy(path_to_tar, store_path_to_tar)
    logger.info(f'Copied tar to artifact store at: {store_path_to_tar}')

    # Remove the local tar (a file, so rm_file rather than rm_dir)
    path_utils.rm_file(path_to_tar)
    logger.info(f'Removed tar at: {path_to_tar}')

    # Append path of tar in config orchestrator args
    config[keys.GlobalKeys.BACKEND][
        keys.BackendKeys.ARGS][TAR_PATH_ARG] = store_path_to_tar

    # Launch the job
    self.launch_job(config)

def _check_registered(self):
    if self.file_name in \
            Repository.get_instance().get_pipeline_file_paths(
                only_file_names=True):
        raise AssertionError(
            f'Pipeline names must be unique in the repository. There '
            f'is already a pipeline called {self.name}')

def wrapper():
    repo: Repository = Repository.get_instance()
    pipelines_dir = repo.zenml_config.get_pipelines_dir()
    for p_config in path_utils.list_dir(pipelines_dir):
        try:
            os.remove(p_config)
        except Exception as e:
            print(e)

def wrapper():
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_pipelines_dir(pipeline_root)
    for p_config in path_utils.list_dir(pipeline_root):
        y = yaml_utils.read_yaml(p_config)
        p: TrainingPipeline = TrainingPipeline.from_config(y)
        p.run()

def _get_one_pipeline(self):
    """Gets a representative pipeline from all associated pipelines."""
    pipelines = \
        Repository.get_instance().get_pipelines_by_datasource(self)
    if len(pipelines) == 0:
        raise EmptyDatasourceException
    return pipelines[0]

def _get_one_pipeline(self):
    """Gets a representative pipeline from all associated pipelines."""
    pipelines = \
        Repository.get_instance().get_pipelines_by_datasource(self)
    if len(pipelines) == 0:
        raise Exception('This datasource is not associated with any '
                        'pipelines, therefore there is no data!')
    return pipelines[0]

def list_config():
    """Print the current ZenML config to the command line."""
    try:
        repo: Repository = Repository.get_instance()
    except Exception as e:
        error(e)
        return
    click.echo(to_pretty_string(repo.zenml_config))

def load_source_path_class(source_path: Text) -> Type:
    """
    Loads a Python class from the path provided.

    Args:
        source_path (str): relative module path e.g. this.module.Class[@sha]
    """
    source = source_path.split('@')[0]
    pin = source_path.split('@')[-1]
    is_standard = is_standard_pin(pin)

    if '@' in source_path and not is_standard:
        logger.debug('Pinned step found with git sha. '
                     'Loading class from git history.')
        wrapper: GitWrapper = Repository.get_instance().get_git_wrapper()

        module_path = get_module_path_from_source(source_path)
        relative_module_path = get_relative_path_from_module(module_path)

        logger.warning('Found source with a pinned sha. Will now checkout '
                       f'module: {module_path}')

        # critical step
        if not wrapper.check_module_clean(source_path):
            raise Exception(f'One of the files at {relative_module_path} '
                            f'is not committed and we '
                            f'are trying to load that directory from git '
                            f'history due to a pinned step in the pipeline. '
                            f'Please commit the file and then run the '
                            f'pipeline.')

        # Check out the directory at that sha
        wrapper.checkout(sha_or_branch=pin, directory=relative_module_path)

        # After this point, all exceptions must first undo the checkout above
        try:
            class_ = import_class_by_path(source)
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)
        except Exception:
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)
            # re-raise the original exception rather than a bare Exception,
            # so the traceback is preserved
            raise
    elif '@' in source_path and is_standard:
        logger.debug(f'Default {APP_NAME} class used. Loading directly.')
        # TODO: [LOW] Check if ZenML version is installed before loading.
        class_ = import_class_by_path(source)
    else:
        logger.debug('Unpinned step found with no git sha. Attempting to '
                     'load class from current repository state.')
        class_ = import_class_by_path(source)

    return class_

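# A brief usage sketch of the two source-path formats the docstring above
# describes. The module path and sha below are illustrative only: an unpinned
# path loads the class from the current repository state, while a path pinned
# with a git sha triggers the checkout/restore flow above.
cls = load_source_path_class('my_project.steps.MyStep')
pinned_cls = load_source_path_class(
    'my_project.steps.MyStep@1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b')
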
def run_pipeline(self, config_b64: str):
    # Load config from base64
    config = json.loads(base64.b64decode(config_b64))

    # Remove tar_path arg from config
    tar_path = config[keys.GlobalKeys.BACKEND][keys.BackendKeys.ARGS].pop(
        TAR_PATH_ARG)

    # Copy it over locally because it will be remote
    path_utils.copy(tar_path, EXTRACTED_TAR_FILE_PATH)

    # Extract it to EXTRACTED_TAR_DIR
    path_utils.extract_tarfile(EXTRACTED_TAR_FILE_PATH, EXTRACTED_TAR_DIR)

    # Append to sys.path to make user code discoverable
    sys.path.append(EXTRACTED_TAR_DIR)

    # Make sure the Repository is initialized at the right path
    Repository.get_instance(EXTRACTED_TAR_DIR)

    # Change orchestrator of pipeline to local
    OrchestratorBaseBackend().run(config)

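# Counterpart encoding sketch: how a caller might produce the base64 payload
# that run_pipeline expects, given that it decodes with
# json.loads(base64.b64decode(...)). The config dict below is an illustrative
# placeholder; any JSON-serializable ZenML config dict would work.
import base64
import json

config = {'backend': {'args': {}}}  # illustrative placeholder config
config_b64 = base64.b64encode(
    json.dumps(config).encode('utf-8')).decode('ascii')
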
def resolve_source_path(source_path: Text) -> Text:
    """
    Resolves source path with an optional sha using Git.

    Args:
        source_path (str): relative module path e.g. this.module.Class
    """
    if is_standard_step(source_path):
        # use the standard version
        return resolve_standard_source_path(source_path)

    # otherwise resolve via Git
    wrapper: GitWrapper = Repository.get_instance().get_git_wrapper()
    source_path = wrapper.resolve_source_path(source_path)
    return source_path

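# Usage sketch, under the assumption (suggested by the 'this.module.Class[@sha]'
# format in load_source_path_class above) that Git resolution pins the path to
# the current commit. The module path is illustrative only.
resolved = resolve_source_path('my_project.steps.MyStep')
# e.g. 'my_project.steps.MyStep@<sha-of-current-commit>'
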
def set_metadata_store(store_type, args):
    """Set the metadata store for the local config."""
    try:
        parsed_args = parse_unknown_options(args)
    except AssertionError as e:
        click.echo(str(e))
        return

    # TODO: [LOW] Hard-coded
    config = {'type': store_type, 'args': parsed_args}

    from zenml.core.metadata.metadata_wrapper import ZenMLMetadataStore
    store = ZenMLMetadataStore.from_config(config)

    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_metadata_store(store)
    click.echo(f'Metadata store set to: {store.to_config()}')

def get_config(self):
    predictor_path = self.predictor.__module__ + '.' + \
                     self.predictor.__name__
    p_file_path = \
        get_path_from_source(get_class_path_from_source(predictor_path))
    repo: Repository = Repository.get_instance()
    return {
        "cortex_serving_args": {
            "env": self.env,
            "api_config": self.api_config,
            "predictor_path": os.path.join(repo.path, p_file_path),
            "requirements": self.requirements,
            "conda_packages": self.conda_packages,
            "force": self.force,
            "wait": self.wait,
        }
    }

def __init__(self, **params):
    super(Application, self).__init__(**params)

    # lists
    result_list = []
    hparam_list = []

    repo: Repository = Repository.get_instance()

    # get all pipelines in this workspace
    all_pipelines: List[TrainingPipeline] = repo.get_pipelines_by_type(
        [TrainingPipeline.PIPELINE_TYPE])

    # get a dataframe of all results + all hyperparameter combinations
    for p in all_pipelines:
        # This slows the comparison down, but is necessary to
        # update the status of each run.
        if p.get_status() == PipelineStatusTypes.Succeeded.name:
            eval_path = p.get_artifacts_uri_by_component(
                GDPComponent.Evaluator.name)[0]
            evaluation = tfma.load_eval_result(eval_path)
            for s, m in evaluation.slicing_metrics:
                result_list.append(
                    dict([('pipeline_name', '{}'.format(p.name)),
                          ('slice_name', s[0][0] if s else ''),
                          ('slice_value', s[0][1] if s else '')]))
                result_list[-1].update(
                    {f'metric_{k}': v[''] for k, v in m.items()})

            h_dict = p.get_hyperparameters()
            h_dict['pipeline_name'] = p.name
            hparam_list.append(h_dict)

    self.results = pd.DataFrame([parse_metrics(r) for r in result_list])
    self.hparam_info = pd.DataFrame(hparam_list)

    # set params
    self.param.pipeline_run_selector.objects = self.results[
        'pipeline_name'].unique()

def run(self, config: Dict[Text, Any]):
    """
    This run function essentially calls an underlying TFX orchestrator run.
    However, it is meant as a higher-level abstraction with some opinionated
    decisions built in.

    Args:
        config: a ZenML config dict
    """
    # Extract the paths to create the tar
    logger.info('Orchestrating pipeline on GCP..')

    repo: Repository = Repository.get_instance()
    repo_path = repo.path
    config_dir = repo.zenml_config.config_dir
    tar_file_name = \
        f'{EXTRACTED_TAR_DIR_NAME}_{str(int(time.time()))}.tar.gz'
    path_to_tar = os.path.join(config_dir, tar_file_name)

    # Create tarfile but exclude the .zenml folder if it exists
    path_utils.create_tarfile(repo_path, path_to_tar)
    logger.info(f'Created tar of current repository at: {path_to_tar}')

    # Upload tar to artifact store
    store_path = config[keys.GlobalKeys.ARTIFACT_STORE]
    store_staging_area = os.path.join(store_path, STAGING_AREA)
    store_path_to_tar = os.path.join(store_staging_area, tar_file_name)
    path_utils.copy(path_to_tar, store_path_to_tar)
    logger.info(f'Copied tar to artifact store at: {store_path_to_tar}')

    # Remove the local tar (a file, so rm_file rather than rm_dir)
    path_utils.rm_file(path_to_tar)
    logger.info(f'Removed tar at: {path_to_tar}')

    # Append path of tar in config orchestrator args
    config[keys.GlobalKeys.BACKEND][
        keys.BackendKeys.ARGS][TAR_PATH_ARG] = store_path_to_tar

    # Launch the instance
    self.launch_instance(config)

def __init__(self,
             name: Text = None,
             enable_cache: Optional[bool] = True,
             steps_dict: Dict[Text, BaseStep] = None,
             backend: OrchestratorBaseBackend = None,
             metadata_store: Optional[ZenMLMetadataStore] = None,
             artifact_store: Optional[ArtifactStore] = None,
             datasource: Optional[BaseDatasource] = None,
             pipeline_name: Optional[Text] = None,
             *args,
             **kwargs):
    """
    Construct a base pipeline. This is a base interface that is meant
    to be overridden in multiple other pipeline use cases.

    Args:
        name: Outward-facing name of the pipeline.
        pipeline_name: A unique name that identifies the pipeline after
         it is run.
        enable_cache: Boolean, indicates whether or not caching
         should be used.
        steps_dict: Optional dict of steps.
        backend: Orchestrator backend.
        metadata_store: Configured metadata store. If None, the default
         metadata store is used.
        artifact_store: Configured artifact store. If None, the default
         artifact store is used.
        datasource: Configured datasource. If None, the pipeline has no
         datasource attached.
    """
    # Generate a name if not given
    if name is None:
        name = str(round(time.time() * 1000))
    self.name = name
    self._immutable = False

    # Metadata store
    if metadata_store:
        self.metadata_store: ZenMLMetadataStore = metadata_store
    else:
        # use default
        self.metadata_store: ZenMLMetadataStore = \
            Repository.get_instance().get_default_metadata_store()

    if pipeline_name:
        # It has been loaded from YAML, so pick up its context
        self.pipeline_name = pipeline_name
        self.file_name = self.pipeline_name + '.yaml'
    else:
        # pipeline_name is None, so this is a new pipeline
        self.pipeline_name = self.create_pipeline_name_from_name()
        self.file_name = self.pipeline_name + '.yaml'
        # check for duplicates here, as it is a 'new' pipeline
        self._check_registered()
        track(event=CREATE_PIPELINE)
        logger.info(f'Pipeline {name} created.')

    self.enable_cache = enable_cache

    if steps_dict is None:
        self.steps_dict: Dict[Text, BaseStep] = {}
    else:
        self.steps_dict = steps_dict

    # Default to local
    if backend is None:
        self.backend = OrchestratorBaseBackend()
    else:
        self.backend = backend

    # Artifact store
    if artifact_store:
        self.artifact_store = artifact_store
    else:
        # use default
        self.artifact_store = \
            Repository.get_instance().get_default_artifact_store()

    # Datasource
    if datasource:
        self.datasource = datasource
    else:
        self.datasource = None

    self._source = source_utils.resolve_source_path(
        self.__class__.__module__ + '.' + self.__class__.__name__)
    self._kwargs = {
        keys.PipelineDetailKeys.NAME: self.pipeline_name,
        keys.PipelineDetailKeys.ENABLE_CACHE: self.enable_cache,
    }
    if kwargs:
        self._kwargs.update(kwargs)

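# A minimal construction sketch, assuming a TrainingPipeline subclass of this
# base class (as used elsewhere in this repository). The pipeline name is
# illustrative only; omitting the backend, metadata store, and artifact store
# falls back to the repository defaults resolved above.
pipeline = TrainingPipeline(name='my-experiment', enable_cache=True)
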
CORTEX_MODEL_NAME = os.getenv('CORTEX_MODEL_NAME', 'zenml-classifier')

# For this example, the ArtifactStore must be a GCP bucket, as the
# CortexDeployer step is using the GCP env.
from zenml.core.repo.repo import Repository

# Define the training pipeline
training_pipeline = TrainingPipeline()

# Add a datasource. This will automatically track and version it.
try:
    ds = CSVDatasource(name='Pima Indians Diabetes',
                       path='gs://zenml_quickstart/diabetes.csv')
except AlreadyExistsException:
    ds = Repository.get_instance().get_datasource_by_name(
        'Pima Indians Diabetes')

training_pipeline.add_datasource(ds)

# Add a split
training_pipeline.add_split(RandomSplit(split_map={'eval': 0.3,
                                                   'train': 0.7}))

# Add a preprocessing unit
training_pipeline.add_preprocesser(
    StandardPreprocesser(features=[
        'times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi',
        'pedigree', 'age'
    ],
        labels=['has_diabetes'],
        overwrite={
            'has_diabetes': {
                'transform': [{

def get_pipelines_dir():
    """Print the pipelines dir from the local config."""
    repo: Repository = Repository.get_instance()
    click.echo(f'Default pipelines dir points to: '
               f'{repo.get_default_pipelines_dir()}')

def __init__(self,
             name: Text,
             enable_cache: Optional[bool] = True,
             steps_dict: Dict[Text, BaseStep] = None,
             backends_dict: Dict[Text, BaseBackend] = None,
             metadata_store: Optional[ZenMLMetadataStore] = None,
             artifact_store: Optional[ArtifactStore] = None,
             datasource: Optional[BaseDatasource] = None,
             pipeline_name: Optional[Text] = None,
             *args,
             **kwargs):
    """
    Construct a base pipeline. This is a base interface that is meant
    to be overridden in multiple other pipeline use cases.

    Args:
        name: Outward-facing name of the pipeline.
        pipeline_name: A unique name that identifies the pipeline after
         it is run.
        enable_cache: Boolean, indicates whether or not caching
         should be used.
        steps_dict: Optional dict of steps.
        backends_dict: Optional dict of backends.
        metadata_store: Configured metadata store. If None, the default
         metadata store is used.
        artifact_store: Configured artifact store. If None, the default
         artifact store is used.
        datasource: Configured datasource. If None, the pipeline has no
         datasource attached.
    """
    self.name = name

    # Metadata store
    if metadata_store:
        self.metadata_store: ZenMLMetadataStore = metadata_store
    else:
        # use default
        self.metadata_store: ZenMLMetadataStore = \
            Repository.get_instance().get_default_metadata_store()

    if pipeline_name:
        # It has been loaded from YAML, so pick up its context
        if self.is_executed_in_metadata_store:
            self._immutable = True
            logger.debug(f'Pipeline {name} loaded and is immutable.')
        else:
            # if the metadata store does not have the pipeline_name, then
            # we can safely execute this again.
            self._immutable = False
            logger.debug(f'Pipeline {name} loaded and can be run.')
        self.pipeline_name = pipeline_name
        self.file_name = self.pipeline_name + '.yaml'
    else:
        # pipeline_name is None, so this is a new pipeline
        self._immutable = False
        self.pipeline_name = self.create_pipeline_name_from_name()
        self.file_name = self.pipeline_name + '.yaml'
        # check for duplicates here, as it is a 'new' pipeline
        if self.file_name in \
                Repository.get_instance().get_pipeline_file_paths(
                    only_file_names=True):
            raise AssertionError(
                f'Pipeline names must be unique in the repository. There '
                f'is already a pipeline called {self.name}')
        track(event=CREATE_PIPELINE)
        logger.info(f'Pipeline {name} created.')

    self.enable_cache = enable_cache

    if steps_dict is None:
        self.steps_dict: Dict[Text, BaseStep] = {}
    else:
        self.steps_dict = steps_dict

    # Backends
    if backends_dict is None:
        self.backends_dict: Dict[Text, BaseBackend] = \
            self.get_default_backends()
    else:
        self.backends_dict = backends_dict

    # Artifact store
    if artifact_store:
        self.artifact_store = artifact_store
    else:
        # use default
        self.artifact_store = \
            Repository.get_instance().get_default_artifact_store()

    # Datasource
    if datasource:
        self.datasource = datasource
    else:
        self.datasource = None

import os
import shutil

import pytest

import zenml
from zenml.core.repo.repo import Repository
from zenml.core.repo.zenml_config import ZenMLConfig, PIPELINES_DIR_KEY
from zenml.utils.exceptions import InitializationException
from zenml.utils import yaml_utils
from zenml.core.standards import standard_keys as keys
from zenml.core.repo.constants import ARTIFACT_STORE_DEFAULT_DIR, \
    ZENML_DIR_NAME, ML_METADATA_SQLITE_DEFAULT_NAME
from zenml.core.metadata.mock_metadata_wrapper import MockMetadataStore

ZENML_ROOT = zenml.__path__[0]
TEST_ROOT = os.path.join(ZENML_ROOT, "testing")

pipelines_dir = os.path.join(TEST_ROOT, "test_pipelines")
repo: Repository = Repository.get_instance()
repo.zenml_config.set_pipelines_dir(pipelines_dir)

config_root = os.path.dirname(ZENML_ROOT)
artifact_store_path = os.path.join(config_root, ZENML_DIR_NAME,
                                   ARTIFACT_STORE_DEFAULT_DIR)
sqlite_uri = os.path.join(artifact_store_path,
                          ML_METADATA_SQLITE_DEFAULT_NAME)


def test_zenml_config_init():
    # in the root, initialization should work
    _ = ZenMLConfig(config_root)

    # outside of an initialized repo path
    with pytest.raises(InitializationException):

def compare_pipelines():
    """Compares pipelines in the current repo."""
    click.echo('Comparing pipelines in repo: Starting app..')
    repo: Repository = Repository.get_instance()
    repo.compare_pipelines()

def get_metadata_store():
    """Print the metadata store from the local config."""
    repo: Repository = Repository.get_instance()
    click.echo(f'Metadata store: '
               f'{repo.get_default_metadata_store().to_config()}')

def set_artifact_store(path: Text = None):
    """Change the artifact store for the local config."""
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_artifact_store(path)
    click.echo(f'Default artifact store updated to {path}')

def get_artifact_store():
    """Print the artifact store from the local config."""
    repo: Repository = Repository.get_instance()
    click.echo(f'Default artifact store points to: '
               f'{repo.get_default_artifact_store().path}')

def _check_registered(self):
    if Repository.get_instance().get_pipeline_by_name(
            self.name) is not None:
        raise AlreadyExistsException(name=self.name,
                                     resource_type='pipeline')

def load_config(self) -> Dict[Text, Any]:
    """Loads a config dict from the pipeline's YAML file."""
    return Repository.get_instance().load_pipeline_config(
        file_name=self.file_name)

def wrapper(filename):
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_pipelines_dir(pipeline_root)
    cfg = os.path.join(pipeline_root, filename)
    path_utils.rm_file(cfg)

def repo():
    return Repository.get_instance()

def set_pipelines_dir(path: Text = None):
    """Change the pipelines dir for the local config."""
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_pipelines_dir(path)
    click.echo(f'Default pipelines dir updated to {path}')