Example #1
def pull_pipeline(info, pipeline_id, output_path, no_docs):
    """Copy the configuration of a registered pipeline"""
    p_api = ce_api.PipelinesApi(utils.api_client(info))
    ws_api = ce_api.WorkspacesApi(utils.api_client(info))

    active_user = info[constants.ACTIVE_USER]
    ws_id = info[active_user][constants.ACTIVE_WORKSPACE]

    all_ps = utils.api_call(
        ws_api.get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        ws_id)
    p_uuid = utils.find_closest_uuid(pipeline_id, all_ps)

    utils.declare('Pulling pipeline: {}'.format(utils.format_uuid(p_uuid)))

    pp = utils.api_call(p_api.get_pipeline_api_v1_pipelines_pipeline_id_get,
                        pipeline_id=p_uuid)

    # Short term fix for these getting into the exp_config
    c = pp.pipeline_config
    if 'bq_args' in c:
        c.pop('bq_args')
    if 'ai_platform_training_args' in c:
        c.pop('ai_platform_training_args')

    utils.save_config(c, output_path, no_docs)
Example #2
def get_eval_dir(p_uuid, r_uuid, info, d_path=None):
    ws_id = info[info[constants.ACTIVE_USER]][constants.ACTIVE_WORKSPACE]

    if d_path is None:
        d_path = os.path.join(click.get_app_dir(constants.APP_NAME),
                              'eval_evaluator', str(ws_id), str(p_uuid),
                              str(r_uuid))

    if os.path.exists(os.path.join(d_path, 'eval_config.json')):
        return d_path

    api = ce_api.PipelinesApi(api_client(info))
    artifact = api_call(
        api.
        get_pipeline_artifacts_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_artifacts_component_type_get,
        pipeline_id=p_uuid,
        pipeline_run_id=r_uuid,
        component_type=GDPComponent.Evaluator.name)

    # TODO: [LOW] artifact[1] hard-coded because of upgrade to 0.21.4
    download_artifact(artifact[0].to_dict(), path=d_path)

    # Override the Google Cloud model path in the eval config with the
    # local path so that the evaluation can run locally
    with open(os.path.join(d_path, 'eval_config.json'), 'r') as f:
        eval_config = json.load(f)

    eval_config['modelLocations'][''] = d_path

    with open(os.path.join(d_path, 'eval_config.json'), 'w') as f:
        json.dump(eval_config, f)

    return d_path
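
A minimal usage sketch for the helper above, assuming it is imported from its module together with a populated info dict; the UUID strings are hypothetical placeholders:

# Hypothetical usage: the first call downloads and patches the Evaluator
# artifacts, later calls return the cached directory immediately.
eval_dir = get_eval_dir(p_uuid='pipeline-uuid', r_uuid='run-uuid', info=info)
print('Evaluator artifacts available at:', eval_dir)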
Example #3
def list_pipelines(info, pipeline_id, ignore_empty):
    """List of registered pipelines"""
    utils.notice('Fetching pipeline(s). This might take a few seconds... \n')
    active_user = info[constants.ACTIVE_USER]
    ws = info[active_user][constants.ACTIVE_WORKSPACE]
    ws_api = ce_api.WorkspacesApi(utils.api_client(info))
    p_api = ce_api.PipelinesApi(utils.api_client(info))
    d_api = ce_api.DatasourcesApi(utils.api_client(info))

    pipelines = utils.api_call(
        ws_api.get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        ws)

    if pipeline_id is not None:
        pipeline_id = utils.find_closest_uuid(pipeline_id, pipelines)

    pipelines.sort(key=lambda x: x.created_at)
    for p in pipelines:
        write_check = (len(p.pipeline_runs) > 0 or not ignore_empty) and \
                      (pipeline_id is None or pipeline_id == p.id)

        if write_check:
            # NOTE: this title block is intentional; do not remove it.
            title = 'PIPELINE NAME: {} PIPELINE ID: {}'.format(
                p.name, utils.format_uuid(p.id))
            utils.declare(title)
            utils.declare('-' * len(title))
            if len(p.pipeline_runs) == 0:
                click.echo('No runs for this pipeline yet!')
            else:
                table = []
                for r in p.pipeline_runs:
                    author = utils.api_call(
                        p_api.get_pipeline_run_user_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_user_get,
                        p.id,
                        r.id)

                    # Resolve datasource
                    ds_commit = utils.api_call(
                        d_api.get_single_commit_api_v1_datasources_commits_commit_id_get,
                        r.datasource_commit_id)
                    ds = utils.api_call(
                        d_api.get_datasource_api_v1_datasources_ds_id_get,
                        ds_commit.datasource_id)

                    table.append({
                        'RUN ID': utils.format_uuid(r.id),
                        'TYPE': r.pipeline_run_type,
                        'CPUs PER WORKER': r.cpus_per_worker,
                        'WORKERS': r.workers,
                        'DATASOURCE': '{}_{}'.format(
                            ds.name,
                            utils.format_uuid(r.datasource_commit_id)),
                        'AUTHOR': author.email,
                        'CREATED AT': utils.format_date(r.start_time),
                    })
                click.echo(tabulate(table, headers='keys', tablefmt='plain'))
            click.echo('\n')
Example #4
    def get_pipeline_run(self, pipeline_id, pipeline_run_id) -> PipelineRun:

        api = ce_api.PipelinesApi(self.client)
        pr = api_utils.api_call(
            api.
            get_pipeline_run_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_get,
            pipeline_id, pipeline_run_id)

        return PipelineRun(**pr.to_dict())
Example #5
    def get_pipeline_run_logs(self, pipeline_id,
                              pipeline_run_id) -> PipelineRun:

        api = ce_api.PipelinesApi(self.client)
        logs_url = api_utils.api_call(
            api.
            get_pipeline_logs_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_logs_get,
            pipeline_id, pipeline_run_id)

        return logs_url
Example #6
    def get_pipeline_runs(self, pipeline_id: Text,
                          **kwargs) -> List[PipelineRun]:

        api = ce_api.PipelinesApi(self.client)
        pr_list = api_utils.api_call(
            api.get_pipeline_runs_api_v1_pipelines_pipeline_id_runs_get,
            pipeline_id)

        runs = [PipelineRun(**pr.to_dict()) for pr in pr_list]

        if kwargs:
            runs = client_utils.filter_objects(runs, **kwargs)
        return runs
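
A short usage sketch for the method above; the client object and the UUID are hypothetical, and only the method name, its signature, and the pipeline_run_type attribute are taken from the examples on this page:

# Hypothetical usage: fetch every run, then narrow the result locally via
# the keyword arguments passed through to client_utils.filter_objects.
all_runs = client.get_pipeline_runs('pipeline-uuid')
training_runs = client.get_pipeline_runs('pipeline-uuid',
                                         pipeline_run_type='training')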
Example #7
    def test_pipeline(self,
                      pipeline_id: Text,
                      datasource_id: Text = None,
                      datasource_commit_id: Text = None,
                      orchestration_backend: Text = None,
                      orchestration_args: Dict = None,
                      processing_backend: Text = None,
                      processing_args: Dict = None,
                      training_backend: Text = None,
                      training_args: Dict = None,
                      serving_backend: Text = None,
                      serving_args: Dict = None) -> PipelineRun:

        if datasource_id is None and datasource_commit_id is None:
            raise ValueError('Please either define a datasource_id '
                             '(to pick the latest commit) or a '
                             'datasource_commit_id to define a source.')

        ds_api = ce_api.DatasourcesApi(self.client)

        if datasource_id is not None:
            commits = api_utils.api_call(
                ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
                datasource_id)

            commits.sort(key=lambda x: x.created_at)
            c_id = commits[-1].id

        elif datasource_commit_id is not None:
            c_id = datasource_commit_id
        else:
            raise LookupError('Unable to resolve a datasource commit.')

        run_create = PipelineRun.creator(
            pipeline_run_type=PipelineRunTypes.test.name,
            datasource_commit_id=c_id,
            orchestration_backend=orchestration_backend,
            orchestration_args=orchestration_args,
            processing_backend=processing_backend,
            processing_args=processing_args,
            additional_args={
                'training_backend': training_backend,
                'training_args': training_args,
                'serving_backend': serving_backend,
                'serving_args': serving_args
            })

        p_api = ce_api.PipelinesApi(self.client)
        return api_utils.api_call(
            p_api.create_pipeline_run_api_v1_pipelines_pipeline_id_runs_post,
            run_create, pipeline_id)
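
A hedged usage sketch of the two resolution modes supported above; the client object and the UUID strings are placeholders:

# Hypothetical usage: pin an exact datasource commit, or pass only the
# datasource ID and let the method pick its latest commit.
run = client.test_pipeline('pipeline-uuid',
                           datasource_commit_id='commit-uuid')
run = client.test_pipeline('pipeline-uuid',
                           datasource_id='datasource-uuid')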
Example #8
    def pull_pipeline(self, pipeline_id: Text) -> PipelineConfig:

        api = ce_api.PipelinesApi(self.client)
        pp = api_utils.api_call(
            api.get_pipeline_api_v1_pipelines_pipeline_id_get,
            pipeline_id=pipeline_id)

        c = pp.pipeline_config
        if GlobalKeys.BQ_ARGS_ in c:
            c.pop(GlobalKeys.BQ_ARGS_)
        if GlobalKeys.CUSTOM_CODE_ in c:
            c.pop(GlobalKeys.CUSTOM_CODE_)
        if 'ai_platform_training_args' in c:
            c.pop('ai_platform_training_args')

        return PipelineConfig(**c)
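
A hedged round-trip sketch combining this method with push_pipeline from Example #14 below; the client object, the pipeline name, and the UUIDs are placeholders:

# Hypothetical usage: pull a registered config, adjust it locally, then
# register it as a new pipeline in the same workspace.
config = client.pull_pipeline('pipeline-uuid')
new_pipeline = client.push_pipeline(name='my-pipeline-v2',
                                    workspace_id='workspace-uuid',
                                    config=config)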
Example #9
def logs_pipeline(info, source_id):
    """Get link to the logs of a pipeline"""

    p_uuid, r_uuid = utils.resolve_pipeline_runs(info, source_id)
    utils.notice(
        'Generating logs url for the pipeline run ID {}. Please visit the '
        'url for all your logs.'.format(utils.format_uuid(r_uuid)))

    api = ce_api.PipelinesApi(utils.api_client(info))
    logs_url = utils.api_call(
        api.get_pipeline_logs_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_logs_get,
        pipeline_id=p_uuid,
        pipeline_run_id=r_uuid
    )

    click.echo(logs_url)
Example #10
def statistics_pipeline(info, pipeline_):
    """Serve the statistics of a pipeline run"""

    p_uuid, r_uuid = utils.resolve_pipeline_runs(info,
                                                 pipeline_,
                                                 run_type=PipelineRunTypes.training.name)

    utils.notice('Generating statistics for the pipeline run ID {}. If your '
                 'browser opens up to a blank window, please refresh '
                 'the page once.'.format(utils.format_uuid(r_uuid)))

    api = ce_api.PipelinesApi(utils.api_client(info))
    stat_artifact = utils.api_call(
        api.get_pipeline_artifacts_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_artifacts_component_type_get,
        pipeline_id=p_uuid,
        pipeline_run_id=r_uuid,
        component_type=GDPComponent.SplitStatistics.name)

    ws_id = info[info[constants.ACTIVE_USER]][constants.ACTIVE_WORKSPACE]
    path = Path(click.get_app_dir(constants.APP_NAME),
                'statistics',
                str(ws_id),
                p_uuid,
                r_uuid)
    utils.download_artifact(artifact_json=stat_artifact[0].to_dict(),
                            path=path)

    import tensorflow as tf
    from tensorflow_metadata.proto.v0 import statistics_pb2
    import panel as pn

    result = {}
    for split in os.listdir(path):
        stats_path = os.path.join(path, split, 'stats_tfrecord')
        serialized_stats = next(tf.compat.v1.io.tf_record_iterator(stats_path))
        stats = statistics_pb2.DatasetFeatureStatisticsList()
        stats.ParseFromString(serialized_stats)
        dataset_list = statistics_pb2.DatasetFeatureStatisticsList()
        for d in stats.datasets:
            d.name = split
            dataset_list.datasets.append(d)
        result[split] = dataset_list
    h = utils.get_statistics_html(result)

    pn.serve(panels=pn.pane.HTML(h, width=1200), show=True)
Example #11
def resolve_pipeline_runs(info, source_id, run_type=None):
    ws_id = info[info[constants.ACTIVE_USER]][constants.ACTIVE_WORKSPACE]
    ws_api = ce_api.WorkspacesApi(api_client(info))
    p_api = ce_api.PipelinesApi(api_client(info))

    parts = source_id.split(':')
    if len(parts) not in (1, 2):
        raise ValueError('Unresolvable pipeline ID')

    # Resolve the pipeline first, then its runs (optionally type-filtered)
    pipelines = api_call(
        ws_api.
        get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        ws_id)
    p_id = find_closest_uuid(parts[0], pipelines)

    runs = api_call(
        p_api.get_pipeline_runs_api_v1_pipelines_pipeline_id_runs_get,
        p_id)

    if run_type:
        runs = [r for r in runs if r.pipeline_run_type == run_type]

    if len(parts) == 2:
        # 'pipeline_id:run_id' resolves the given run explicitly
        r_id = find_closest_uuid(parts[1], runs)
    else:
        # A bare pipeline ID falls back to the latest run, if any
        runs.sort(key=lambda x: x.start_time)
        r_id = runs[-1].id if runs else None

    return p_id, r_id
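
A small usage sketch of the two accepted source_id formats, assuming the function is imported with a populated info dict; the ID strings are placeholders:

# Hypothetical usage: 'pipeline:run' resolves both IDs explicitly, a bare
# pipeline ID falls back to the latest (optionally type-filtered) run.
p_id, r_id = resolve_pipeline_runs(info, 'pipeline-uuid:run-uuid')
p_id, r_id = resolve_pipeline_runs(info, 'pipeline-uuid',
                                   run_type='training')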
Example #12
def get_log_dir(p_uuid, r_uuid, info):
    # TODO: how do I know that the pipeline is in this workspace? Maybe I
    #   changed the workspace.
    ws_id = info[info[constants.ACTIVE_USER]][constants.ACTIVE_WORKSPACE]
    d_path = os.path.join(click.get_app_dir(constants.APP_NAME),
                          'eval_trainer', str(ws_id), str(p_uuid), str(r_uuid))

    if os.path.exists(os.path.join(d_path, 'eval_model_dir')):
        return d_path

    api = ce_api.PipelinesApi(api_client(info))
    artifact = api_call(
        api.
        get_pipeline_artifacts_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_artifacts_component_type_get,
        pipeline_id=p_uuid,
        pipeline_run_id=r_uuid,
        component_type=GDPComponent.Trainer.name)
    download_artifact(artifact[0].to_dict(), path=d_path)
    return d_path
Example #13
def resolve_pipeline_creation(info, pipeline_type, pipeline_, datasource,
                              orchestration_backend, orchestration_args,
                              processing_backend, processing_args, force,
                              additional_args):
    active_user = info[constants.ACTIVE_USER]

    # Initiate all required APIs
    p_api = ce_api.PipelinesApi(api_client(info))

    # Resolving the datasource connection
    if datasource is not None:
        ds_id, c_id = resolve_datasource_commits(info, datasource)
    elif constants.ACTIVE_DATASOURCE_COMMIT in info[active_user]:
        ds_id, c_id = info[active_user][
            constants.ACTIVE_DATASOURCE_COMMIT].split(':')
    else:
        raise AssertionError('Please either select an active datasource '
                             'commit to work with or explicitly define it.')

    declare('Using Datasource Commit: {}'.format(format_uuid(c_id)))

    # Resolving the pipeline uuid
    pipeline_id, _ = resolve_pipeline_runs(info, pipeline_)

    run_create = PipelineRunCreate(pipeline_run_type=pipeline_type,
                                   datasource_commit_id=c_id,
                                   orchestration_backend=orchestration_backend,
                                   orchestration_args=orchestration_args,
                                   processing_backend=processing_backend,
                                   processing_args=processing_args,
                                   additional_args=additional_args)

    notice('Provisioning required resources. This might take a few minutes..')

    r = api_call(
        p_api.create_pipeline_run_api_v1_pipelines_pipeline_id_runs_post,
        run_create, pipeline_id)

    declare('Run created with ID: {id}!\n'.format(id=format_uuid(r.id)))

    declare("Use 'cengine pipeline status -p {}' to check on its "
            "status".format(format_uuid(pipeline_id)))
Example #14
    def push_pipeline(self, name: Text, workspace_id: Text,
                      config: Union[Dict, PipelineConfig]) -> Pipeline:

        if isinstance(config, PipelineConfig):
            # config.check_completion()
            pass
        elif isinstance(config, dict):
            config = PipelineConfig(**config)
            # config.check_completion()
        else:
            raise ValueError('Please provide either a dict value or an '
                             'instance of cengine.PipelineConfig for '
                             'the config')

        api = ce_api.PipelinesApi(self.client)
        p = api_utils.api_call(func=api.create_pipeline_api_v1_pipelines_post,
                               body=Pipeline.creator(
                                   name=name,
                                   pipeline_config=config.to_serial(),
                                   workspace_id=workspace_id))
        return Pipeline(**p.to_dict())
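
A hedged usage sketch for the method above; the client object is a placeholder, and loading the config from YAML is an assumption mirroring the CLI variant in Example #16 below:

import yaml

# Hypothetical usage: a plain dict is coerced into a PipelineConfig
# before the pipeline is registered in the given workspace.
with open('pipeline_config.yaml', 'rt', encoding='utf8') as f:
    config_dict = yaml.safe_load(f)
pipeline = client.push_pipeline(name='my-pipeline',
                                workspace_id='workspace-uuid',
                                config=config_dict)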
Example #15
def model_pipeline(info, pipeline_, output_path):
    """Download the trained model to a specified location"""
    if os.path.exists(output_path) and os.path.isdir(output_path):
        # Any non-hidden file makes the target directory non-empty
        if [f for f in os.listdir(output_path) if not f.startswith('.')]:
            utils.error("Output path must be an empty directory!")
    if os.path.exists(output_path) and not os.path.isdir(output_path):
        utils.error("Output path must be an empty directory!")
    if not os.path.exists(output_path):
        utils.declare("Creating directory {}..".format(output_path))

    p_uuid, r_uuid = utils.resolve_pipeline_runs(info, pipeline_)

    utils.notice('Downloading the trained model from pipeline run '
                 'ID {}. This might take some time if the model '
                 'resources are significantly large in size.\nYour patience '
                 'is much appreciated!'.format(utils.format_uuid(r_uuid)))

    api = ce_api.PipelinesApi(utils.api_client(info))
    artifact = utils.api_call(
        api.get_pipeline_artifacts_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_artifacts_component_type_get,
        pipeline_id=p_uuid,
        pipeline_run_id=r_uuid,
        component_type=GDPComponent.Deployer.name)

    spin = utils.Spinner()
    spin.start()
    if len(artifact) == 1:
        utils.download_artifact(artifact_json=artifact[0].to_dict(),
                                path=output_path)
        spin.stop()
    else:
        utils.error('Something unexpected happened! Please contact '
                    '[email protected] to get further information.')

    utils.declare('Model downloaded to: {}'.format(output_path))
    # TODO: [LOW] Make the Tensorflow version more dynamic
    utils.declare('Please note that the model is saved as a SavedModel '
                  'Tensorflow artifact, trained on Tensorflow 2.1.0.')
Example #16
def push_pipeline(info, config_path, pipeline_name):
    """Register a pipeline with the selected configuration"""
    active_user = info[constants.ACTIVE_USER]
    ws_id = info[active_user][constants.ACTIVE_WORKSPACE]

    try:
        with open(config_path, 'rt', encoding='utf8') as f:
            config = yaml.safe_load(f)
    except yaml.YAMLError:
        utils.error('Badly formatted YAML!')

    api = ce_api.PipelinesApi(utils.api_client(info))
    p = utils.api_call(api.create_pipeline_api_v1_pipelines_post,
                       PipelineCreate(name=pipeline_name,
                                      pipeline_config=config,
                                      workspace_id=ws_id))

    utils.declare('Pipeline pushed successfully with ID: {id}!'.format(
        id=utils.format_uuid(p.id)))

    utils.declare(
        "Use `cengine pipeline train {} --datasource DS_COMMIT` "
        "to launch a training pipeline!".format(utils.format_uuid(p.id)))
Example #17
    def download_model(self, pipeline_id, pipeline_run_id, output_path):
        if os.path.exists(output_path) and os.path.isdir(output_path):
            # Any non-hidden file makes the target directory non-empty
            if [f for f in os.listdir(output_path)
                    if not f.startswith('.')]:
                raise NotADirectoryError("Output path must be an empty "
                                         "directory!")
        if os.path.exists(output_path) and not os.path.isdir(output_path):
            raise NotADirectoryError("Output path must be an empty directory!")
        if not os.path.exists(output_path):
            logging.info("Creating directory {}..".format(output_path))

        # Resolve the pipeline run: both IDs are required
        if pipeline_id is None or pipeline_run_id is None:
            raise ValueError('Please provide both a pipeline_id and a '
                             'pipeline_run_id to choose a trained model.')

        p_api = ce_api.PipelinesApi(self.client)

        artifact = api_utils.api_call(
            p_api.
            get_pipeline_artifacts_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_artifacts_component_type_get,
            pipeline_id=pipeline_id,
            pipeline_run_id=pipeline_run_id,
            component_type=GDPComponent.Deployer.name)

        spin = Spinner()
        spin.start()
        if len(artifact) == 1:
            download_artifact(artifact_json=artifact[0].to_dict(),
                              path=output_path)
            spin.stop()
        else:
            raise Exception('Something unexpected happened! Please contact '
                            '[email protected] to get further information.')

        logging.info('Model downloaded to: {}'.format(output_path))
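
A short usage sketch for the method above; the client object and the ID strings are placeholders:

# Hypothetical usage: both IDs are required, and the output directory
# must be empty or absent, otherwise the method raises before downloading.
client.download_model(pipeline_id='pipeline-uuid',
                      pipeline_run_id='run-uuid',
                      output_path='./model_out')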
Example #18
    def infer_pipeline(self,
                       pipeline_id: Text = None,
                       pipeline_run_id: Text = None,
                       datasource_id: Text = None,
                       datasource_commit_id: Text = None,
                       orchestration_backend: Text = None,
                       orchestration_args: Dict = None,
                       processing_backend: Text = None,
                       processing_args: Dict = None) -> PipelineRun:

        # Resolve the pipeline run_id
        if pipeline_id is None and pipeline_run_id is None:
            raise ValueError('Please either define a pipeline_id '
                             '(to pick the latest training run) or a '
                             'pipeline_run_id to choose a trained model.')

        p_api = ce_api.PipelinesApi(self.client)
        if pipeline_id is not None:
            runs = api_utils.api_call(
                p_api.get_pipeline_runs_api_v1_pipelines_pipeline_id_runs_get,
                pipeline_id)

            runs.sort(key=lambda x: x.run_time)
            training_runs = [
                r for r in runs
                if r.pipeline_run_type == PipelineRunTypes.training.name
            ]
            if len(training_runs) == 0:
                raise ValueError('You do not have any training runs with '
                                 'the pipeline {}'.format(pipeline_id))
            r_id = training_runs[-1].id
        elif pipeline_run_id is not None:
            # TODO: If you just have the pipeline_run_id, how do you get the
            #   run without the pipeline_id?
            # TODO: We need to check whether we have a training run here
            r_id = pipeline_run_id
        else:
            raise LookupError('Unable to resolve a pipeline run.')

        if datasource_id is None and datasource_commit_id is None:
            raise ValueError('Please either define a datasource_id '
                             '(to pick the latest commit) or a '
                             'datasource_commit_id to define a source.')

        ds_api = ce_api.DatasourcesApi(self.client)

        if datasource_id is not None:
            commits = api_utils.api_call(
                ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
                datasource_id)

            commits.sort(key=lambda x: x.created_at)
            c_id = commits[-1].id

        elif datasource_commit_id is not None:
            c_id = datasource_commit_id
        else:
            raise LookupError('Unable to resolve a datasource commit.')

        run_create = PipelineRun.creator(
            pipeline_run_type=PipelineRunTypes.infer.name,
            datasource_commit_id=c_id,
            orchestration_backend=orchestration_backend,
            orchestration_args=orchestration_args,
            processing_backend=processing_backend,
            processing_args=processing_args,
            additional_args={'run_id': r_id})

        return api_utils.api_call(
            p_api.create_pipeline_run_api_v1_pipelines_pipeline_id_runs_post,
            run_create, pipeline_id)
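
A hedged usage sketch of the most common call pattern above; the client object and the ID strings are placeholders:

# Hypothetical usage: with only pipeline_id, the latest training run is
# reused; with only datasource_id, its latest commit becomes the source.
run = client.infer_pipeline(pipeline_id='pipeline-uuid',
                            datasource_id='datasource-uuid')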
Example #19
    def get_pipeline_status(self,
                            workspace_id: Text,
                            pipeline_id: Text = None) -> Dict:

        ws_api = ce_api.WorkspacesApi(self.client)
        p_api = ce_api.PipelinesApi(self.client)
        d_api = ce_api.DatasourcesApi(self.client)

        status_dict = {}

        pipelines = api_utils.api_call(
            ws_api.
            get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
            workspace_id)

        pipelines.sort(key=lambda x: x.created_at)
        for p in pipelines:
            write_check = (len(p.pipeline_runs) > 0) and \
                          (pipeline_id is None or pipeline_id == p.id)

            if write_check:

                status_dict[p.id] = []
                for r in p.pipeline_runs:
                    run = api_utils.api_call(
                        p_api.
                        get_pipeline_run_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_get,
                        p.id, r.id)

                    # Resolve datasource
                    ds_commit = api_utils.api_call(
                        d_api.
                        get_single_commit_api_v1_datasources_commits_commit_id_get,
                        r.datasource_commit_id)
                    ds = api_utils.api_call(
                        d_api.get_datasource_api_v1_datasources_ds_id_get,
                        ds_commit.datasource_id)

                    if run.end_time:
                        td = run.end_time - run.start_time
                    else:
                        td = datetime.now(timezone.utc) - run.start_time

                    status_dict[p.id].append({
                        'RUN ID': run.id,
                        'TYPE': run.pipeline_run_type,
                        'STATUS': run.status,
                        'DATASOURCE': '{}_{}'.format(
                            ds.name, run.datasource_commit_id),
                        'DATAPOINTS': '{}'.format(ds_commit.n_datapoints),
                        'START TIME': print_utils.format_date(run.start_time),
                        'DURATION': print_utils.format_timedelta(td),
                    })

        return status_dict
Example #20
    def get_statistics(self,
                       pipeline_id: Text,
                       pipeline_run_id: Text,
                       magic: bool = False):

        api = ce_api.PipelinesApi(self.client)

        pipeline = api_utils.api_call(
            api.get_pipeline_api_v1_pipelines_pipeline_id_get,
            pipeline_id=pipeline_id)

        run = api_utils.api_call(
            api.
            get_pipeline_run_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_get,
            pipeline_id=pipeline_id,
            pipeline_run_id=pipeline_run_id)

        stat_artifact = api_utils.api_call(
            api.
            get_pipeline_artifacts_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_artifacts_component_type_get,
            pipeline_id=pipeline_id,
            pipeline_run_id=pipeline_run_id,
            component_type=GDPComponent.SplitStatistics.name)

        if run.pipeline_run_type != PipelineRunTypes.training.name:
            raise TypeError('The selected pipeline should be a training '
                            'pipeline')

        workspace_id = pipeline.workspace_id

        path = Path(click.get_app_dir(constants.APP_NAME), 'statistics',
                    workspace_id, pipeline_id, pipeline_run_id)

        download_artifact(artifact_json=stat_artifact[0].to_dict(), path=path)

        import tensorflow as tf
        from tensorflow_metadata.proto.v0 import statistics_pb2
        import panel as pn

        result = {}
        for split in os.listdir(path):
            stats_path = os.path.join(path, split, 'stats_tfrecord')
            serialized_stats = next(
                tf.compat.v1.io.tf_record_iterator(stats_path))
            stats = statistics_pb2.DatasetFeatureStatisticsList()
            stats.ParseFromString(serialized_stats)
            dataset_list = statistics_pb2.DatasetFeatureStatisticsList()
            for d in stats.datasets:
                d.name = split
                dataset_list.datasets.append(d)
            result[split] = dataset_list
        h = get_statistics_html(result)

        if magic:
            import sys
            if 'ipykernel' not in sys.modules:
                raise EnvironmentError('The magic functions are only usable '
                                       'in a Jupyter notebook.')
            from IPython.core.display import display, HTML
            display(HTML(h))

        else:
            pn.serve(panels=pn.pane.HTML(h, width=1200), show=True)
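
A brief usage sketch for the two rendering modes above; the client object and the ID strings are placeholders:

# Hypothetical usage: inside a Jupyter notebook, magic=True renders the
# statistics HTML inline; otherwise a local Panel server is started.
client.get_statistics('pipeline-uuid', 'run-uuid', magic=True)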
Example #21
def get_pipeline_status(info, pipeline_id):
    """Get status of started pipelines"""
    utils.notice('Fetching pipeline(s). This might take a few seconds... \n')
    active_user = info[constants.ACTIVE_USER]
    ws = info[active_user][constants.ACTIVE_WORKSPACE]

    ws_api = ce_api.WorkspacesApi(utils.api_client(info))
    p_api = ce_api.PipelinesApi(utils.api_client(info))
    d_api = ce_api.DatasourcesApi(utils.api_client(info))

    pipelines = utils.api_call(
        ws_api.get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        ws)

    if pipeline_id is not None:
        pipeline_id = utils.find_closest_uuid(pipeline_id, pipelines)

    pipelines.sort(key=lambda x: x.created_at)
    for p in pipelines:
        write_check = (len(p.pipeline_runs) > 0) and \
                      (pipeline_id is None or pipeline_id == p.id)

        if write_check:
            title = 'PIPELINE NAME: {} PIPELINE ID: {}'.format(
                p.name, utils.format_uuid(p.id))
            utils.declare(title)
            utils.declare('-' * len(title))

            table = []
            for r in p.pipeline_runs:
                run = utils.api_call(
                    p_api.get_pipeline_run_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_get,
                    p.id,
                    r.id)

                # Resolve datasource
                ds_commit = utils.api_call(
                    d_api.get_single_commit_api_v1_datasources_commits_commit_id_get,
                    r.datasource_commit_id)
                ds = utils.api_call(
                    d_api.get_datasource_api_v1_datasources_ds_id_get,
                    ds_commit.datasource_id)

                if run.end_time:
                    td = run.end_time - run.start_time
                else:
                    td = datetime.now(timezone.utc) - run.start_time

                # # Resolve component status
                # stage = utils.get_run_stage(run.pipeline_components)

                table.append({
                    'RUN ID': utils.format_uuid(run.id),
                    'TYPE': run.pipeline_run_type,
                    'STATUS': run.status,
                    # 'STAGE': stage,
                    'DATASOURCE': '{}_{}'.format(
                        ds.name, utils.format_uuid(run.datasource_commit_id)),
                    'DATAPOINTS': '{}'.format(ds_commit.n_datapoints),
                    # 'RUNNING STAGE': stage,
                    'START TIME': utils.format_date(run.start_time),
                    'DURATION': utils.format_timedelta(td),
                })

            click.echo(tabulate(table, headers='keys', tablefmt='plain'))
            click.echo('\n')
Example #22
    def evaluate_single_pipeline(self,
                                 pipeline_id: Text,
                                 pipeline_run_id: Text,
                                 magic: bool = False):
        # Resolve the pipeline run: both IDs are required
        if pipeline_id is None or pipeline_run_id is None:
            raise ValueError('Please provide both a pipeline_id and a '
                             'pipeline_run_id to choose a trained model.')

        p_api = ce_api.PipelinesApi(self.client)

        pipeline = api_utils.api_call(
            p_api.get_pipeline_api_v1_pipelines_pipeline_id_get,
            pipeline_id=pipeline_id)
        workspace_id = pipeline.workspace_id

        trainer_path = os.path.join(click.get_app_dir(constants.APP_NAME),
                                    'eval_trainer', workspace_id, pipeline_id,
                                    pipeline_run_id)

        eval_path = os.path.join(click.get_app_dir(constants.APP_NAME),
                                 'eval_evaluator', workspace_id, pipeline_id,
                                 pipeline_run_id)

        artifact = api_utils.api_call(
            p_api.
            get_pipeline_artifacts_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_artifacts_component_type_get,
            pipeline_id=pipeline_id,
            pipeline_run_id=pipeline_run_id,
            component_type=GDPComponent.Trainer.name)
        download_artifact(artifact[0].to_dict(), path=trainer_path)

        artifact = api_utils.api_call(
            p_api.
            get_pipeline_artifacts_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_artifacts_component_type_get,
            pipeline_id=pipeline_id,
            pipeline_run_id=pipeline_run_id,
            component_type=GDPComponent.Evaluator.name)
        download_artifact(artifact[0].to_dict(), path=eval_path)

        # Patch to make it work locally
        import json
        with open(os.path.join(eval_path, 'eval_config.json'), 'r') as f:
            eval_config = json.load(f)
        eval_config['modelLocations'][''] = eval_path
        with open(os.path.join(eval_path, 'eval_config.json'), 'w') as f:
            json.dump(eval_config, f)

        if magic:
            from cengine.utils.shell_utils import create_new_cell
            model_block = evaluation.get_model_block(trainer_path)
            eval_block = evaluation.get_eval_block(eval_path)

            create_new_cell(eval_block)
            create_new_cell(model_block)

        else:
            nb = nbf.v4.new_notebook()
            nb['cells'] = [
                nbf.v4.new_code_cell(evaluation.get_model_block(trainer_path)),
                nbf.v4.new_code_cell(evaluation.get_eval_block(eval_path))
            ]

            config_folder = click.get_app_dir(constants.APP_NAME)

            if not (os.path.exists(config_folder)
                    and os.path.isdir(config_folder)):
                os.makedirs(config_folder)

            final_out_path = os.path.join(config_folder,
                                          constants.EVALUATION_NOTEBOOK)
            s = nbf.writes(nb)
            if isinstance(s, bytes):
                s = s.decode('utf8')

            with open(final_out_path, 'w') as f:
                f.write(s)
            os.system('jupyter notebook "{}"'.format(final_out_path))