Example #1
def init(repo_path: Text, pipelines_dir: Text = None,
         analytics_opt_in: bool = None):
    """Initialize ZenML on given path."""
    if repo_path is None:
        repo_path = os.getcwd()

    if analytics_opt_in is None:
        analytics_opt_in = confirmation(
            "ZenML collects anonymized usage information. This data helps us "
            "create a better product and understand the needs of the "
            "community better. You can find more information about exactly "
            "why, what and how we collect usage analytics statistics at: "
            "https://docs.zenml.io/misc/usage-analytics. "
            "Would you like to opt-in to usage analytics?")

    try:
        Repository.init_repo(
            repo_path,
            None,
            None,
            pipelines_dir,
            analytics_opt_in,
        )
        click.echo(f'ZenML repo initialized at {repo_path}')
    except git.InvalidGitRepositoryError:
        click.echo(f'{repo_path} is not a valid git repository! Please '
                   f'initialize ZenML within a git repository.')
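The same setup can be driven programmatically. A minimal sketch, assuming the keyword form of Repository.init_repo shown in Example #26 (exact signatures may vary across ZenML versions):

import os

from zenml.core.repo.repo import Repository

# Initialize ZenML in the current working directory. This must run inside a
# git repository, otherwise an InvalidGitRepositoryError is raised (see above).
Repository.init_repo(os.getcwd(), analytics_opt_in=False)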
Example #2
    def register_pipeline(self, config: Dict[Text, Any]):
        """
        Registers a pipeline in the artifact store as a YAML file.

        Args:
            config: dict representation of ZenML config.
        """
        Repository.get_instance().register_pipeline(file_name=self.file_name,
                                                    config=config)
Example #3
    def _check_registered(self):
        if self.file_name in \
                Repository.get_instance().get_pipeline_file_paths(
                    only_file_names=True):
            raise AssertionError(
                f'Pipeline names must be unique in the repository. There '
                f'is already a pipeline called {self.name}')
Example #4
    def __init__(self,
                 name: Text,
                 schema: Dict = None,
                 _id: Text = None,
                 *args, **kwargs):
        """
        Construct the datasource

        Args:
            name (str): name of datasource
            schema (dict): schema of datasource
            _id: unique ID (for internal use)
        """
        if _id:
            # It's loaded from config
            self._id = _id
            logger.debug(f'Datasource {name} loaded.')
        else:
            # If none, then this is assumed to be 'new'. Check dupes.
            all_names = Repository.get_instance().get_datasource_names()
            if any(d == name for d in all_names):
                raise AlreadyExistsException(
                    name=name,
                    resource_type='datasource')
            self._id = str(uuid4())
            track(event=CREATE_DATASOURCE)
            logger.info(f'Datasource {name} created.')

        self.name = name
        self.schema = schema
        self._immutable = False
        self._source = source_utils.resolve_source_path(
            self.__class__.__module__ + '.' + self.__class__.__name__
        )
Example #5
    def run(self, config: Dict[Text, Any]):
        # Extract the paths to create the tar
        logger.info('Orchestrating pipeline on Kubernetes..')

        repo: Repository = Repository.get_instance()
        repo_path = repo.path
        config_dir = repo.zenml_config.config_dir
        tar_file_name = \
            f'{EXTRACTED_TAR_DIR_NAME}_{str(int(time.time()))}.tar.gz'
        path_to_tar = os.path.join(config_dir, tar_file_name)

        # Create a tarfile, excluding the .zenml folder if it exists
        path_utils.create_tarfile(repo_path, path_to_tar)
        logger.info(f'Created tar of current repository at: {path_to_tar}')

        # Upload tar to artifact store
        store_path = config[keys.GlobalKeys.ARTIFACT_STORE]
        store_staging_area = os.path.join(store_path, STAGING_AREA)
        store_path_to_tar = os.path.join(store_staging_area, tar_file_name)
        path_utils.copy(path_to_tar, store_path_to_tar)
        logger.info(f'Copied tar to artifact store at: {store_path_to_tar}')

        # Remove tar
        path_utils.rm_dir(path_to_tar)
        logger.info(f'Removed tar at: {path_to_tar}')

        # Record the uploaded tar path in the backend args of the config
        config[keys.GlobalKeys.BACKEND][
            keys.BackendKeys.ARGS][TAR_PATH_ARG] = store_path_to_tar

        # Launch the instance
        self.launch_job(config)
Example #6
def get_datasource_by_name(repo: Repository, datasource_name: Text):
    """
    Gets a datasource from the current repository by matching a name
    identifier against the datasource name.
    """
    pretty_print(repo.get_datasource_by_name(datasource_name))
Example #7
    def _get_one_pipeline(self):
        """Gets representative pipeline from all pipelines associated."""
        pipelines = \
            Repository.get_instance().get_pipelines_by_datasource(self)

        if len(pipelines) == 0:
            raise EmptyDatasourceException
        return pipelines[0]
Example #8
File: conftest.py  Project: syllogy/zenml
    def wrapper():
        repo: Repository = Repository.get_instance()
        pipelines_dir = repo.zenml_config.get_pipelines_dir()
        for p_config in path_utils.list_dir(pipelines_dir):
            try:
                os.remove(p_config)
            except Exception as e:
                print(e)
Example #9
File: conftest.py  Project: syllogy/zenml
    def wrapper():
        repo: Repository = Repository.get_instance()
        repo.zenml_config.set_pipelines_dir(pipeline_root)

        for p_config in path_utils.list_dir(pipeline_root):
            y = yaml_utils.read_yaml(p_config)
            p: TrainingPipeline = TrainingPipeline.from_config(y)
            p.run()
Example #10
File: config.py  Project: zeta1999/zenml
def list_config():
    """Print the current ZenML config to the command line"""
    try:
        repo: Repository = Repository.get_instance()
    except Exception as e:
        error(e)
        return

    click.echo(to_pretty_string(repo.zenml_config))
Example #11
    def _get_one_pipeline(self):
        """Gets representative pipeline from all pipelines associated."""
        pipelines = \
            Repository.get_instance().get_pipelines_by_datasource(self)

        if len(pipelines) == 0:
            raise Exception('This datasource is not associated with any '
                            'pipelines, therefore there is no data!')
        return pipelines[0]
Example #12
def list_steps(repo: Repository):
    step_versions = repo.get_step_versions()
    name_version_data = []
    headers = ["step_name", "step_version"]
    for name, version_set in step_versions.items():
        names = [name] * len(version_set)
        versions = list(version_set)
        name_version_data.extend(list(zip(names, versions)))

    click.echo(tabulate(name_version_data, headers=headers))
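The listing is just (step name, version) pairs flattened out of a dict of version sets. A runnable illustration with made-up data in the shape get_step_versions is assumed to return:

from tabulate import tabulate

# Made-up data shaped like repo.get_step_versions(): step name -> set of versions.
step_versions = {'SplitStep': {'v1', 'v2'}, 'TrainerStep': {'v1'}}
rows = [(name, version) for name, version_set in step_versions.items()
        for version in sorted(version_set)]
print(tabulate(rows, headers=["step_name", "step_version"]))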
Example #13
File: pipeline.py  Project: zeta1999/zenml
def get_pipeline_by_name(repo: Repository, pipeline_name: Text):
    """
    Gets pipeline from current repository by matching a name against a
    pipeline name in the repository.
    """
    try:
        p = repo.get_pipeline_by_name(pipeline_name)
    except Exception as e:
        error(e)
        return

    pretty_print(p)
Example #14
def load_source_path_class(source_path: Text) -> Type:
    """
    Loads a Python class from the path provided.

    Args:
        source_path (str): relative module path e.g. this.module.Class[@sha]
    """
    source = source_path.split('@')[0]
    pin = source_path.split('@')[-1]
    is_standard = is_standard_pin(pin)

    if '@' in source_path and not is_standard:
        logger.debug('Pinned step found with git sha. '
                     'Loading class from git history.')
        wrapper: GitWrapper = Repository.get_instance().get_git_wrapper()

        module_path = get_module_path_from_source(source_path)
        relative_module_path = get_relative_path_from_module(module_path)

        logger.warning('Found source with a pinned sha. Will now checkout '
                       f'module: {module_path}')

        # critical step
        if not wrapper.check_module_clean(source_path):
            raise Exception(f'One of the files at {relative_module_path} '
                            f'is not committed and we '
                            f'are trying to load that directory from git '
                            f'history due to a pinned step in the pipeline. '
                            f'Please commit the file and then run the '
                            f'pipeline.')

        # Check out the directory at that sha
        wrapper.checkout(sha_or_branch=pin, directory=relative_module_path)

        # After this point, all exceptions will first undo the above
        try:
            class_ = import_class_by_path(source)
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)
        except Exception:
            # Clean up the checkout before propagating the original error
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)
            raise
    elif '@' in source_path and is_standard:
        logger.debug(f'Default {APP_NAME} class used. Loading directly.')
        # TODO: [LOW] Check if ZenML version is installed before loading.
        class_ = import_class_by_path(source)
    else:
        logger.debug('Unpinned step found with no git sha. Attempting to '
                     'load class from current repository state.')
        class_ = import_class_by_path(source)

    return class_
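The source_path convention splits on '@': the part before it is the importable class path, the part after it is either a git sha or a standard version pin. A self-contained illustration (module path and sha below are placeholders, not real artifacts):

# Placeholders only: neither the module nor the sha exists anywhere.
source_path = 'my_pipeline.steps.MyTrainer@b2c4e1d'
source = source_path.split('@')[0]   # 'my_pipeline.steps.MyTrainer'
pin = source_path.split('@')[-1]     # 'b2c4e1d'
assert source == 'my_pipeline.steps.MyTrainer' and pin == 'b2c4e1d'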
Example #15
    def run_pipeline(self, config_b64: str):
        # Load config from base64
        config = json.loads(base64.b64decode(config_b64))

        # Remove tar_path arg from config
        tar_path = config[keys.GlobalKeys.BACKEND][keys.BackendKeys.ARGS].pop(
            TAR_PATH_ARG)

        # Copy the tar over locally, since the artifact store is typically remote
        path_utils.copy(tar_path, EXTRACTED_TAR_FILE_PATH)

        # Extract it to EXTRACTED_TAR_DIR
        path_utils.extract_tarfile(EXTRACTED_TAR_FILE_PATH, EXTRACTED_TAR_DIR)

        # Append to sys to make user code discoverable
        sys.path.append(EXTRACTED_TAR_DIR)

        # Make sure the Repository is initialized at the right path
        Repository.get_instance(EXTRACTED_TAR_DIR)

        # Change orchestrator of pipeline to local
        OrchestratorBaseBackend().run(config)
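The config_b64 payload is simply the ZenML config dict serialized to JSON and base64-encoded. A hypothetical sketch of the encoding side (the config content here is illustrative only):

import base64
import json

# Illustrative stand-in; a real payload carries the full ZenML pipeline config.
config = {'backend': {'args': {}}}
config_b64 = base64.b64encode(json.dumps(config).encode()).decode()
# run_pipeline(config_b64) reverses this with base64.b64decode and json.loads.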
Example #16
def resolve_source_path(source_path: Text) -> Text:
    """
    Resolves source path with an optional sha using Git.

    Args:
        source_path (str): relative module path e.g. this.module.Class
    """
    if is_standard_step(source_path):
        # that means use standard version
        return resolve_standard_source_path(source_path)

    # otherwise use Git resolution
    wrapper: GitWrapper = Repository.get_instance().get_git_wrapper()
    source_path = wrapper.resolve_source_path(source_path)
    return source_path
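Resolution therefore takes one of two paths: standard steps get a standard version pin, while user code is pinned via git. A hedged sketch of what a caller sees (the exact pin format is version-dependent and not shown in this excerpt):

# Hypothetical module path; running this requires an initialized ZenML repo.
resolved = resolve_source_path('my_pipeline.steps.MyTrainer')
# Expected shape: 'my_pipeline.steps.MyTrainer@<pin>', where <pin> is the
# current git sha for user code, or a standard version pin for built-in steps.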
Example #17
    def get_config(self):
        predictor_path = self.predictor.__module__ + '.' + \
                         self.predictor.__name__
        p_file_path = \
            get_path_from_source(get_class_path_from_source(predictor_path))
        repo: Repository = Repository.get_instance()

        return {
            "cortex_serving_args": {
                "env": self.env,
                "api_config": self.api_config,
                "predictor_path": os.path.join(repo.path, p_file_path),
                "requirements": self.requirements,
                "conda_packages": self.conda_packages,
                "force": self.force,
                "wait": self.wait,
            }
        }
Example #18
File: pipeline.py  Project: zeta1999/zenml
def list_pipelines(repo: Repository):
    """Lists pipelines in the current repository."""
    try:
        pipelines = repo.get_pipelines()

        names = [p.name for p in pipelines]
        types = [p.PIPELINE_TYPE for p in pipelines]
        statuses = [p.get_status() for p in pipelines]
        cache_enabled = [p.enable_cache for p in pipelines]
        filenames = [p.file_name for p in pipelines]

        headers = ["name", "type", "cache enabled", "status", "file name"]

        click.echo(
            tabulate(zip(names, types, cache_enabled, statuses, filenames),
                     headers=headers))
    except Exception as e:
        error(e)
Example #19
File: config.py  Project: zeta1999/zenml
def set_metadata_store(store_type, args):
    """Set metadata store for local config."""

    try:
        parsed_args = parse_unknown_options(args)
    except AssertionError as e:
        click.echo(str(e))
        return

    # TODO: [LOW] Hard-coded
    config = {'type': store_type, 'args': parsed_args}
    from zenml.core.metadata.metadata_wrapper import ZenMLMetadataStore

    store = ZenMLMetadataStore.from_config(config)
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_metadata_store(store)

    click.echo(f'Metadata store set to: {store.to_config()}')
Example #20
    def run(self, config: Dict[Text, Any]):
        """
        This run function essentially calls an underlying TFX orchestrator
        run. However, it is meant as a higher-level abstraction with some
        opinionated decisions baked in.

        Args:
            config: a ZenML config dict
        """
        # Extract the paths to create the tar
        logger.info('Orchestrating pipeline on GCP..')

        repo: Repository = Repository.get_instance()
        repo_path = repo.path
        config_dir = repo.zenml_config.config_dir
        tar_file_name = \
            f'{EXTRACTED_TAR_DIR_NAME}_{str(int(time.time()))}.tar.gz'
        path_to_tar = os.path.join(config_dir, tar_file_name)

        # Create a tarfile, excluding the .zenml folder if it exists
        path_utils.create_tarfile(repo_path, path_to_tar)
        logger.info(f'Created tar of current repository at: {path_to_tar}')

        # Upload tar to artifact store
        store_path = config[keys.GlobalKeys.ARTIFACT_STORE]
        store_staging_area = os.path.join(store_path, STAGING_AREA)
        store_path_to_tar = os.path.join(store_staging_area, tar_file_name)
        path_utils.copy(path_to_tar, store_path_to_tar)
        logger.info(f'Copied tar to artifact store at: {store_path_to_tar}')

        # Remove tar
        path_utils.rm_dir(path_to_tar)
        logger.info(f'Removed tar at: {path_to_tar}')

        # Record the uploaded tar path in the backend args of the config
        config[keys.GlobalKeys.BACKEND][
            keys.BackendKeys.ARGS][TAR_PATH_ARG] = store_path_to_tar

        # Launch the instance
        self.launch_instance(config)
Example #21
    def __init__(self, **params):
        super(Application, self).__init__(**params)

        # lists
        result_list = []
        hparam_list = []
        repo: Repository = Repository.get_instance()

        # get all pipelines in this workspace
        all_pipelines: List[TrainingPipeline] = repo.get_pipelines_by_type(
            [TrainingPipeline.PIPELINE_TYPE])

        # get a dataframe of all results + all hyperparameter combinations
        for p in all_pipelines:
            # This is slowing the comparison down but
            # necessary to update the status of each run
            if p.get_status() == PipelineStatusTypes.Succeeded.name:
                eval_path = p.get_artifacts_uri_by_component(
                    GDPComponent.Evaluator.name)[0]

                evaluation = tfma.load_eval_result(eval_path)
                for s, m in evaluation.slicing_metrics:
                    result_list.append(
                        dict([('pipeline_name', '{}'.format(p.name)),
                              ('slice_name', s[0][0] if s else ''),
                              ('slice_value', s[0][1] if s else '')]))
                    result_list[-1].update(
                        {f'metric_{k}': v['']
                         for k, v in m.items()})

                h_dict = p.get_hyperparameters()
                h_dict['pipeline_name'] = p.name
                hparam_list.append(h_dict)

        self.results = pd.DataFrame([parse_metrics(r) for r in result_list])
        self.hparam_info = pd.DataFrame(hparam_list)

        # set params
        self.param.pipeline_run_selector.objects = self.results[
            'pipeline_name'].unique()
Example #22
import os
import shutil

import pytest

import zenml
from zenml.core.repo.repo import Repository
from zenml.core.repo.zenml_config import ZenMLConfig, PIPELINES_DIR_KEY
from zenml.utils.exceptions import InitializationException
from zenml.utils import yaml_utils
from zenml.core.standards import standard_keys as keys
from zenml.core.repo.constants import ARTIFACT_STORE_DEFAULT_DIR, \
    ZENML_DIR_NAME, ML_METADATA_SQLITE_DEFAULT_NAME
from zenml.core.metadata.mock_metadata_wrapper import MockMetadataStore

ZENML_ROOT = zenml.__path__[0]
TEST_ROOT = os.path.join(ZENML_ROOT, "testing")

pipelines_dir = os.path.join(TEST_ROOT, "test_pipelines")
repo: Repository = Repository.get_instance()
repo.zenml_config.set_pipelines_dir(pipelines_dir)

config_root = os.path.dirname(ZENML_ROOT)
artifact_store_path = os.path.join(config_root, ZENML_DIR_NAME,
                                   ARTIFACT_STORE_DEFAULT_DIR)

sqlite_uri = os.path.join(artifact_store_path, ML_METADATA_SQLITE_DEFAULT_NAME)


def test_zenml_config_init():
    # in the root initialization should work
    _ = ZenMLConfig(config_root)

    # outside of an initialized repo path
    with pytest.raises(InitializationException):
Example #23
    def __init__(self,
                 name: Text,
                 enable_cache: Optional[bool] = True,
                 steps_dict: Dict[Text, BaseStep] = None,
                 backends_dict: Dict[Text, BaseBackend] = None,
                 metadata_store: Optional[ZenMLMetadataStore] = None,
                 artifact_store: Optional[ArtifactStore] = None,
                 datasource: Optional[BaseDatasource] = None,
                 pipeline_name: Optional[Text] = None,
                 *args,
                 **kwargs):
        """
        Construct a base pipeline. This is a base interface that is meant
        to be overridden in multiple other pipeline use cases.

        Args:
            name: Outward-facing name of the pipeline.
            pipeline_name: A unique name that identifies the pipeline after
             it is run.
            enable_cache: Boolean, indicates whether or not caching
             should be used.
            steps_dict: Optional dict of steps.
            backends_dict: Optional dict of backends.
            metadata_store: Configured metadata store. If None,
             the default metadata store is used.
            artifact_store: Configured artifact store. If None,
             the default artifact store is used.
            datasource: Optional datasource to associate with the pipeline.
        """
        self.name = name

        # Metadata store
        if metadata_store:
            self.metadata_store: ZenMLMetadataStore = metadata_store
        else:
            # use default
            self.metadata_store: ZenMLMetadataStore = \
                Repository.get_instance().get_default_metadata_store()

        if pipeline_name:
            # This means it's been loaded in through YAML; try to get context
            if self.is_executed_in_metadata_store:
                self._immutable = True
                logger.debug(f'Pipeline {name} loaded and is immutable.')
            else:
                # if metadata store does not have the pipeline_name, then we
                # can safely execute this again.
                self._immutable = False
                logger.debug(f'Pipeline {name} loaded and can be run.')

            self.pipeline_name = pipeline_name
            self.file_name = self.pipeline_name + '.yaml'
        else:
            # if pipeline_name is None then it's a new pipeline
            self._immutable = False
            self.pipeline_name = self.create_pipeline_name_from_name()
            self.file_name = self.pipeline_name + '.yaml'
            # check duplicates here as it's a 'new' pipeline
            if self.file_name in \
                    Repository.get_instance().get_pipeline_file_paths(
                        only_file_names=True):
                raise AssertionError(
                    f'Pipeline names must be unique in the repository. There '
                    f'is already a pipeline called {self.name}')
            track(event=CREATE_PIPELINE)
            logger.info(f'Pipeline {name} created.')

        self.enable_cache = enable_cache

        if steps_dict is None:
            self.steps_dict: Dict[Text, BaseStep] = {}
        else:
            self.steps_dict = steps_dict

        # Backends
        if backends_dict is None:
            self.backends_dict: Dict[Text, BaseBackend] = \
                self.get_default_backends()
        else:
            self.backends_dict = backends_dict

        # Artifact store
        if artifact_store:
            self.artifact_store = artifact_store
        else:
            # use default
            self.artifact_store = \
                Repository.get_instance().get_default_artifact_store()

        # Datasource
        if datasource:
            self.datasource = datasource
        else:
            self.datasource = None
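In practice, subclasses such as TrainingPipeline are constructed with little more than a name, and everything else falls back to the Repository singleton defaults, as in Examples #25 and #26. A minimal sketch (import path assumed from the 0.x layout):

# Assumed 0.x import path; see Example #26.
from zenml.core.pipelines.training_pipeline import TrainingPipeline

# Only `name` is given: metadata store, artifact store, and backends are all
# pulled from the Repository defaults inside the base constructor above.
pipeline = TrainingPipeline(name='my-pipeline', enable_cache=True)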
Example #24
    def load_config(self) -> Dict[Text, Any]:
        """Loads a config dict from a YAML file."""
        return Repository.get_instance().load_pipeline_config(
            file_name=self.file_name)
Example #25
import os

from zenml.core.repo.repo import Repository
# The remaining import paths are assumed from the 0.x module layout seen in
# Example #26 and may differ across versions:
from zenml.core.datasources.csv_datasource import CSVDatasource
from zenml.core.pipelines.training_pipeline import TrainingPipeline
from zenml.core.steps.preprocesser.standard_preprocesser \
    .standard_preprocesser import StandardPreprocesser
from zenml.core.steps.split.random_split import RandomSplit
from zenml.utils.exceptions import AlreadyExistsException

CORTEX_MODEL_NAME = os.getenv('CORTEX_MODEL_NAME', 'zenml-classifier')

# For this example, the ArtifactStore must be a GCP bucket, as the
# CortexDeployer step is using the GCP env.

# Define the training pipeline
training_pipeline = TrainingPipeline()

# Add a datasource. This will automatically track and version it.
try:
    ds = CSVDatasource(name='Pima Indians Diabetes',
                       path='gs://zenml_quickstart/diabetes.csv')
except AlreadyExistsException:
    ds = Repository.get_instance().get_datasource_by_name(
        'Pima Indians Diabetes')
training_pipeline.add_datasource(ds)

# Add a split
training_pipeline.add_split(RandomSplit(split_map={'eval': 0.3, 'train': 0.7}))

# Add a preprocessing unit
training_pipeline.add_preprocesser(
    StandardPreprocesser(features=[
        'times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi', 'pedigree',
        'age'
    ],
                         labels=['has_diabetes'],
                         overwrite={
                             'has_diabetes': {
                                 'transform': [{
Example #26
import os
from pathlib import Path

import zenml
from zenml.core.repo.repo import Repository
# Assumed 0.x import path for TrainingPipeline; may differ across versions:
from zenml.core.pipelines.training_pipeline import TrainingPipeline

from zenml.core.steps.preprocesser.standard_preprocesser \
    .standard_preprocesser import \
    StandardPreprocesser
from zenml.core.steps.split.categorical_domain_split_step import \
    CategoricalDomainSplit
from zenml.core.steps.trainer.tensorflow_trainers.tf_ff_trainer import \
    FeedForwardTrainer
from zenml.utils import path_utils
from zenml.utils.logger import get_logger

logger = get_logger(__name__)

# reset pipeline root to redirect to tests so that it writes the yamls there
ZENML_ROOT = str(Path(zenml.__path__[0]).parent)
TEST_ROOT = os.path.join(ZENML_ROOT, "tests")
Repository.init_repo(TEST_ROOT, analytics_opt_in=False)

pipeline_root = os.path.join(TEST_ROOT, "pipelines")
csv_root = os.path.join(TEST_ROOT, "test_data")
image_root = os.path.join(csv_root, "images")

repo: Repository = Repository.get_instance()
if path_utils.is_dir(pipeline_root):
    path_utils.rm_dir(pipeline_root)
repo.zenml_config.set_pipelines_dir(pipeline_root)

try:
    for i in range(1, 6):
        training_pipeline = TrainingPipeline(name='csvtest{0}'.format(i))

        try:
Example #27
File: conftest.py  Project: syllogy/zenml
    def wrapper(filename):
        repo: Repository = Repository.get_instance()
        repo.zenml_config.set_pipelines_dir(pipeline_root)

        cfg = os.path.join(pipeline_root, filename)
        path_utils.rm_file(cfg)
Example #28
def compare_pipelines():
    """Compares pipelines in repo"""
    click.echo('Comparing pipelines in repo: Starting app..')
    repo: Repository = Repository.get_instance()
    repo.compare_pipelines()
Example #29
File: pipeline.py  Project: zeta1999/zenml
def compare_pipelines(repo: Repository):
    """Compares pipelines in repo"""
    click.echo('Comparing pipelines in repo: Starting app..')
    repo.compare_pipelines()
Example #30
def list_datasources(repo: Repository):
    datasources = repo.get_datasources()

    click.echo(tabulate([ds.to_config() for ds in datasources],
                        headers="keys"))