Example #1
    def _run(self, runobj: RunObject, execution: MLClientCtx):

        if runobj.metadata.iteration:
            self.store_run(runobj)

        meta = self._get_meta(runobj, True)

        job = self._generate_mpi_job(runobj, execution, meta)

        resp = self._submit_mpijob(job, meta.namespace)

        state = None
        timeout = int(config.submit_timeout) or 120
        for _ in range(timeout):
            resp = self.get_job(meta.name, meta.namespace)
            state = self._get_job_launcher_status(resp)
            if resp and state:
                break
            time.sleep(1)

        if resp:
            logger.info("MpiJob {} state={}".format(meta.name, state or "unknown"))
            if state:
                state = state.lower()
                launcher, _ = self._get_launcher(meta.name, meta.namespace)
                execution.set_hostname(launcher)
                execution.set_state("running" if state == "active" else state)
                if self.kfp:
                    writer = AsyncLogWriter(self._db_conn, runobj)
                    status = self._get_k8s().watch(
                        launcher, meta.namespace, writer=writer
                    )
                    logger.info(
                        "MpiJob {} finished with state {}".format(meta.name, status)
                    )
                    if status == "succeeded":
                        execution.set_state("completed")
                    else:
                        execution.set_state(
                            "error",
                            "MpiJob {} finished with state {}".format(
                                meta.name, status
                            ),
                        )
                else:
                    txt = "MpiJob {} launcher pod {} state {}".format(
                        meta.name, launcher, state
                    )
                    logger.info(txt)
                    runobj.status.status_text = txt
            else:
                txt = "MpiJob status unknown or failed, check pods: {}".format(
                    self.get_pods(meta.name, meta.namespace)
                )
                logger.warning(txt)
                runobj.status.status_text = txt
                if self.kfp:
                    execution.set_state("error", txt)

        return None
Example #2
def table_summary(context: MLClientCtx,
                  dask_client: Union[DataItem, str],
                  dask_key: str = 'my_dask_dataframe',
                  target_path: str = '',
                  name: str = 'table_summary.csv',
                  key: str = 'table_summary') -> None:
    """Summarize a table
    
    :param context:         the function context
    :param dask_client:     path to the dask client scheduler json file, as
                            string or artifact
    :param dask_key:        key of dataframe in dask client 'datasets' attribute
    :param target_path:     destination folder for table summary file
    :param name:            name of table summary file (with extension like .csv)
    :param key:             key of table summary in artifact store
    """
    print(context.__dict__)
    dask_client = Client(scheduler_file=str(dask_client))
    df = dask_client.get_dataset(dask_key)  # fetch the published dataframe by its key
    print(df.head())
    dscr = df.describe()

    filepath = os.path.join(target_path, name)
    dd.to_csv(dscr, filepath, single_file=True, index=False)
    context.log_artifact(key, target_path=filepath)
Example #3
def create_classification_data(context: MLClientCtx,
                               n_samples: int,
                               m_features: int,
                               k_classes: int,
                               header: Optional[List[str]],
                               label_column: Optional[str] = 'labels',
                               weight: float = 0.5,
                               random_state: int = 1,
                               filename: Optional[str] = None,
                               key: str = 'classifier-data',
                               file_ext: str = 'pqt',
                               sk_params={}):
    """Create a binary classification sample dataset and save.
    If no filename is given it will default to:
    'simdata-{n_samples}X{m_features}.parquet'.
    
    Additional scikit-learn parameters can be set using **sk_params; see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.
    
    :param context:       function context
    :param n_samples:     number of rows/samples
    :param m_features:    number of cols/features
    :param k_classes:     number of classes
    :param header:        header for features array
    :param label_column:  column name of ground-truth series
    :param weight:        fraction of sample negative value (ground-truth=0)
    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param filename:      optional name for saving simulated data file
    :param key:           key of data in artifact store
    :param file_ext:      (pqt) extension for parquet file
    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`
    
    outputs filename of created data (includes path) in the artifact store.
    """
    if not filename:
        name = f"simdata-{n_samples:0.0e}X{m_features}.{file_ext}".replace(
            "+", "")
        filename = os.path.join(context.artifact_path, name)
    else:
        # keep the bare file name so it can be passed to log_artifact below
        name = filename
        filename = os.path.join(context.artifact_path, filename)

    features, labels = make_classification(n_samples=n_samples,
                                           n_features=m_features,
                                           weights=[weight],  # sklearn expects a sequence of class proportions
                                           n_classes=k_classes,
                                           random_state=random_state,
                                           **sk_params)

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if not header:
        X.columns = ["feat_" + str(x) for x in range(m_features)]
    else:
        X.columns = header

    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)

    pq.write_table(pa.Table.from_pandas(data), filename)
    context.log_artifact(key, local_path=name)
Example #4
def get_toy_data(context: MLClientCtx,
                 dataset: str,
                 params: dict = {}) -> None:
    """Loads a scikit-learn toy dataset for classification or regression
    
    The following datasets are available ('name' : description):
    
        'boston'   : boston house-prices dataset (regression)
        'iris'     : iris dataset (classification)
        'diabetes' : diabetes dataset (regression)
        'digits'   : digits dataset (classification)
        'linnerud' : linnerud dataset (multivariate regression)
        'wine'     : wine dataset (classification)
        'cancer'   : breast cancer wisconsin dataset (classification)
    
    The scikit-learn functions return a data bunch including the following items:
    - data              the features matrix
    - target            the ground truth labels
    - DESCR             a description of the dataset
    - feature_names     header for data
    
    The features (and their names) are stored with the target labels in a DataFrame.

    For further details see https://scikit-learn.org/stable/datasets/index.html#toy-datasets
    
    :param context:    function execution context
    :param dataset:    name of the dataset to load 
    :param params:     params of the sklearn load_data method
    """
    filepath = os.path.join(context.artifact_path, dataset) + '.pqt'

    # check whether the file has already been downloaded
    if not os.path.isfile(filepath):
        artifact_path = context.artifact_path

        # reach into module and import the appropriate load_xxx function
        pkg_module = 'sklearn.datasets'
        fname = f'load_{dataset}'

        pkg_module = __import__(pkg_module, fromlist=[fname])
        load_data_fn = getattr(pkg_module, fname)

        data = load_data_fn(**params)
        feature_names = data['feature_names']

        # save
        xy = np.concatenate([data['data'], data['target'].reshape(-1, 1)],
                            axis=1)
        feature_names.append('labels')
        df = pd.DataFrame(data=xy, columns=feature_names)
        df.to_parquet(filepath, engine='pyarrow', index=False)

    # either we just downloaded the file or it already exists; log it:
    context.log_artifact(dataset, local_path=filepath.split('/')[-1])
Example #5
def load_dataset(
    context: MLClientCtx,
    dataset: str,
    name: str = "",
    file_ext: str = "parquet",
    params: dict = {},
) -> None:
    """Loads a scikit-learn toy dataset for classification or regression

    The following datasets are available ('name' : description):

        'boston'          : boston house-prices dataset (regression)
        'iris'            : iris dataset (classification)
        'diabetes'        : diabetes dataset (regression)
        'digits'          : digits dataset (classification)
        'linnerud'        : linnerud dataset (multivariate regression)
        'wine'            : wine dataset (classification)
        'breast_cancer'   : breast cancer wisconsin dataset (classification)

    The scikit-learn functions return a data bunch including the following items:
    - data              the features matrix
    - target            the ground truth labels
    - DESCR             a description of the dataset
    - feature_names     header for data

    The features (and their names) are stored with the target labels in a DataFrame.

    For further details see https://scikit-learn.org/stable/datasets/index.html#toy-datasets

    :param context:    function execution context
    :param dataset:    name of the dataset to load
    :param name:       artifact name (defaults to dataset)
    :param file_ext:   output file_ext: parquet or csv
    :param params:     params of the sklearn load_data method
    """
    dataset = str(dataset)
    pkg_module = "sklearn.datasets"
    fname = f"load_{dataset}"

    pkg_module = __import__(pkg_module, fromlist=[fname])
    load_data_fn = getattr(pkg_module, fname)

    data = load_data_fn(**params)
    feature_names = data["feature_names"]

    xy = np.concatenate([data["data"], data["target"].reshape(-1, 1)], axis=1)
    if hasattr(feature_names, "append"):
        feature_names.append("labels")
    else:
        feature_names = np.append(feature_names, "labels")
    df = pd.DataFrame(data=xy, columns=feature_names)

    context.log_dataset(name or dataset, df=df, format=file_ext, index=False)
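
The dynamic-import pattern above (building a `load_{dataset}` name and resolving it against `sklearn.datasets`) also works outside MLRun. A minimal sketch, assuming only scikit-learn, numpy and pandas are installed (the helper name is illustrative):

import importlib

import numpy as np
import pandas as pd


def toy_dataset_to_df(dataset: str, **params) -> pd.DataFrame:
    # resolve sklearn.datasets.load_<dataset> dynamically and return features + labels
    load_fn = getattr(importlib.import_module("sklearn.datasets"), f"load_{dataset}")
    data = load_fn(**params)
    xy = np.concatenate([data["data"], data["target"].reshape(-1, 1)], axis=1)
    # feature_names may be a list or an ndarray depending on the dataset
    columns = list(data["feature_names"]) + ["labels"]
    return pd.DataFrame(data=xy, columns=columns)


print(toy_dataset_to_df("iris").head())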
Example #6
def parquet_to_dask(context: MLClientCtx,
                    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
                    inc_cols: Optional[List[str]] = None,
                    index_cols: Optional[List[str]] = None,
                    shards: int = 4,
                    threads_per: int = 4,
                    processes: bool = False,
                    memory_limit: str = '2GB',
                    persist: bool = True,
                    dask_key: str = 'my_dask_dataframe',
                    target_path: str = '') -> None:
    """Load parquet dataset into dask cluster
    
    If no cluster is found loads a new one and persist the data to it. It
    shouold not be necessary to create a new cluster when the function
    is run as a 'dask' job.
    
    :param context:         the function context
    :param parquet_url:     url of the parquet file or partitioned dataset as either
                            artifact DataItem, string, or path object (see pandas read_csv)
    :param inc_cols:        include only these columns (very fast)
    :param index_cols:      list of index column names (can be a long-running process)
    :param shards:          number of workers to launch
    :param threads_per:     number of threads per worker
    :param processes:       
    """
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(n_workers=shards,
                               threads_per_worker=threads_per,
                               processes=processes,
                               memory_limit=memory_limit)
        dask_client = Client(cluster)

    context.logger.info(dask_client)

    df = dd.read_parquet(parquet_url)

    if persist and context:
        df = dask_client.persist(df)
        dask_client.publish_dataset(df, name=dask_key)  # publish under the configured key, not the literal "dask_key"
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

        print(df.head())
Example #7
    def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx, meta: client.V1ObjectMeta) -> dict:
        pod_labels = deepcopy(meta.labels)
        pod_labels['mlrun/job'] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)

        # configuration for both launcher and workers
        for pod_template in [launcher_pod_template, worker_pod_template]:
            if self.spec.image:
                self._update_container(pod_template, 'image', self.full_image_path())
            self._update_container(pod_template, 'volumeMounts', self.spec.volume_mounts)
            extra_env = {'MLRUN_EXEC_CONFIG': runobj.to_json()}
            # if self.spec.rundb:
            #     extra_env['MLRUN_DBPATH'] = self.spec.rundb
            extra_env = [{'name': k, 'value': v} for k, v in extra_env.items()]
            self._update_container(pod_template, 'env', extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template, 'imagePullPolicy', self.spec.image_pull_policy)
            if self.spec.workdir:
                self._update_container(pod_template, 'workingDir', self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(pod_template, 'spec.imagePullSecrets',
                          [{'name': self.spec.image_pull_secret}])
            update_in(pod_template, 'metadata.labels', pod_labels)
            update_in(pod_template, 'spec.volumes', self.spec.volumes)

        # configuration for workers only
        # update resources only for workers because the launcher doesn't require
        # special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template, worker_pod_template)

        # update the replicas only for workers
        update_in(job, 'spec.mpiReplicaSpecs.Worker.replicas', self.spec.replicas or 1)

        if execution.get_param('slots_per_worker'):
            update_in(job, 'spec.slotsPerWorker', execution.get_param('slots_per_worker'))

        update_in(job, 'metadata', meta.to_dict())

        return job
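
The `update_in` helper used above is an MLRun utility; conceptually it assigns a value at a dotted path inside a nested dict, creating intermediate levels as needed. A rough, hypothetical re-implementation of that idea (not the library code):

def set_in(obj: dict, path: str, value) -> None:
    # set obj["a"]["b"]["c"] = value for path "a.b.c", creating missing levels
    keys = path.split(".")
    for key in keys[:-1]:
        obj = obj.setdefault(key, {})
    obj[keys[-1]] = value


job = {"spec": {}}
set_in(job, "spec.mpiReplicaSpecs.Worker.replicas", 4)
set_in(job, "metadata.labels", {"mlrun/job": "train"})
print(job)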
Example #8
def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1: A model parameter.
    :param p2: Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f'Run: {context.name} (uid={context.uid})')
    print(f'Params: p1={p1}, p2={p2}')
    context.logger.info('started training')

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result('accuracy', p1 * 2)
    context.log_result('loss', p1 * 3)

    # add a label/tag to this run
    context.set_label('category', 'tests')

    # log a simple artifact + label the artifact
    # If you want to upload a local file to the artifact repo add src_path=<local-path>
    context.log_artifact('model',
                         body=b'abc is 123',
                         local_path='model.txt',
                         labels={'framework': 'tfkeras'})
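
A handler like this is normally executed through an MLRun runtime, but for quick local experiments it can also be called directly with a context obtained from `get_or_create_ctx`; a small sketch (the run name is illustrative):

from mlrun import get_or_create_ctx

context = get_or_create_ctx("training-demo")
training(context, p1=5, p2=10)
print(context.results)   # expect {'accuracy': 10, 'loss': 15}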
Example #9
def validation(context: MLClientCtx, model: DataItem) -> None:
    """Model validation.
    
    Dummy validation function.
    
    :param context: The runtime context object.
    :param model: The estimated model object.
    """
    # access input metadata, values, files, and secrets (passwords)
    print(f'Run: {context.name} (uid={context.uid})')
    print(f'file - {model.url}:\n{model.get()}\n')
    context.logger.info('started validation')
    context.log_artifact('validation',
                         body=b'<b> validated </b>',
                         format='html')
Example #10
def load_dask(
        context: MLClientCtx,
        src_data: DataItem,
        dask_key: str = "dask_key",
        inc_cols: Optional[List[str]] = None,
        index_cols: Optional[List[str]] = None,
        dask_persist: bool = True,
        refresh_data: bool = True,
        scheduler_key: str = "scheduler"
) -> None:
    """Load dataset into an existing dask cluster

    Dask jobs define the dask client parameters at the job level; this method will raise an error if no client is detected.

    :param context:         the function context
    :param src_data:        url of the data file or partitioned dataset as either
                            artifact DataItem, string, or path object (similar to
                            pandas read_csv)
    :param dask_key:        destination key of data on dask cluster and artifact store
    :param inc_cols:        include only these columns (very fast)
    :param index_cols:      list of index column names (can be a long-running process)
    :param dask_persist:    (True) should the data be persisted (through the `client.persist` op)
    :param refresh_data:    (True) replace the data if dask_key is already published on the dask
                            cluster; when False, an existing key raises an Exception instead
    :param scheduler_key:   (scheduler) the dask scheduler configuration, json also logged as an artifact
    """
    if hasattr(context, "dask_client"):
        dask_client = context.dask_client
    else:
        raise Exception("a dask client was not found in the execution context")

    df = src_data.as_df(df_module=dd)

    if dask_persist:
        df = dask_client.persist(df)
        if dask_client.datasets and dask_key in dask_client.datasets:
            if not refresh_data:
                raise Exception(
                    f"dataset {dask_key} already published, set refresh_data=True to replace it")
            dask_client.unpublish_dataset(dask_key)
        dask_client.publish_dataset(df, name=dask_key)

    if context:
        context.dask_client = dask_client

    # share the scheduler, whether data is persisted or not
    dask_client.write_scheduler_file(scheduler_key + ".json")

    # we don't use log_dataset here until it can take into account
    # dask origin and apply dask describe.
    context.log_artifact(scheduler_key, local_path=scheduler_key + ".json")
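
As a standalone illustration of the persist/publish pattern used here, the sketch below starts a local Dask cluster, persists a small dataframe, and publishes it under a key so other clients of the same scheduler can fetch it (assumes dask and distributed are installed; names are illustrative):

import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

ddf = dd.from_pandas(pd.DataFrame({"a": range(10), "b": range(10)}), npartitions=2)
ddf = client.persist(ddf)          # keep the partitions in worker memory

dask_key = "my_dask_dataframe"
if dask_key in client.datasets:    # replace a previously published dataset
    client.unpublish_dataset(dask_key)
client.publish_dataset(ddf, name=dask_key)

# any client connected to the same scheduler can now fetch it by name
print(client.get_dataset(dask_key).head())

client.close()
cluster.close()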
Example #11
def gen_class_data(context: MLClientCtx,
                   n_samples: int,
                   m_features: int,
                   k_classes: int,
                   header: Optional[List[str]],
                   label_column: Optional[str] = "labels",
                   weight: float = 0.5,
                   random_state: int = 1,
                   key: str = "classifier-data",
                   file_ext: str = "parquet",
                   sk_params={}):
    """Create a binary classification sample dataset and save.
    If no filename is given it will default to:
    "simdata-{n_samples}X{m_features}.parquet".

    Additional scikit-learn parameters can be set using **sk_params; see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.

    :param context:       function context
    :param n_samples:     number of rows/samples
    :param m_features:    number of cols/features
    :param k_classes:     number of classes
    :param header:        header for features array
    :param label_column:  column name of ground-truth series
    :param weight:        fraction of sample negative value (ground-truth=0)
    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param key:           key of data in artifact store
    :param file_ext:      (parquet) output file format, e.g. parquet or csv
    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`
    """
    features, labels = make_classification(n_samples=n_samples,
                                           n_features=m_features,
                                           weights=[weight],  # sklearn expects a sequence of class proportions
                                           n_classes=k_classes,
                                           random_state=random_state,
                                           **sk_params)

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if not header:
        X.columns = ["feat_" + str(x) for x in range(m_features)]
    else:
        X.columns = header

    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)

    context.log_dataset(key, df=data, format=file_ext, index=False)
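
Outside of an MLRun context the core of gen_class_data is just make_classification plus a concat; a minimal sketch (note that scikit-learn expects `weights` as a sequence of class proportions rather than a scalar):

import pandas as pd
from sklearn.datasets import make_classification

features, labels = make_classification(n_samples=1000,
                                        n_features=5,
                                        n_classes=2,
                                        weights=[0.5],      # proportion of class 0
                                        random_state=1)

X = pd.DataFrame(features, columns=[f"feat_{i}" for i in range(5)])
y = pd.DataFrame(labels, columns=["labels"])
data = pd.concat([X, y], axis=1)
print(data["labels"].value_counts(normalize=True))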
Example #12
def learning_curves(context: MLClientCtx,
                    results: dict,
                    figsz: Tuple[int, int] = (10, 10),
                    plots_dest: str = "plots") -> None:
    """plot xgb learning curves

    this will also log a model's learning curves
    """
    plt.clf()
    plt.figure(figsize=figsz)
    plt.plot(results["train"]["my_rmsle"], label="train-my-rmsle")
    plt.plot(results["valid"]["my_rmsle"], label="valid-my-rmsle")
    plt.title(f"learning curves")
    plt.legend()

    context.log_artifact(PlotArtifact(f"learning-curves", body=plt.gcf()),
                         local_path=f"{plots_dest}/learning-curves.html")
Example #13
def pandas_profiling_report(
    context: MLClientCtx,
    data: DataItem,
) -> None:
    """Create a Pandas Profiling Report for a dataset.
    :param context:         the function context
    :param data:            Dataset to create report for
    """

    df = data.as_df()

    profile = df.profile_report(title="Pandas Profiling Report")

    context.log_artifact(
        "Pandas Profiling Report",
        body=profile.to_html(),
        local_path="pandas_profiling_report.html",
    )
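
The `df.profile_report` accessor is added by the pandas-profiling package (renamed ydata-profiling in newer releases); the same report can be built explicitly, roughly like this, assuming the package is installed:

import pandas as pd
from pandas_profiling import ProfileReport

df = pd.DataFrame({"age": [23, 35, 41, 29], "income": [40000, 52000, 61000, 48000]})
profile = ProfileReport(df, title="Pandas Profiling Report")
html = profile.to_html()               # the string logged above as the artifact body
profile.to_file("pandas_profiling_report.html")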
Example #14
def sql_to_file(
    context: MLClientCtx,
    sql_query: str,
    database_url: str,
    file_ext: str = "parquet",
) -> None:
    """SQL Ingest - Ingest data using SQL query

    :param context:           the function context
    :param sql_query:         the sql query used to retrieve the data
    :param database_url:      database connection URL
    :param file_ext:          ("parquet") format for result file
    """

    engine = create_engine(database_url)
    df = pd.read_sql(sql_query, engine)

    context.log_dataset(
        "query result",
        df=df,
        format=file_ext,
        artifact_path=context.artifact_subpath("data"),
    )
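
The ingest itself is plain SQLAlchemy plus pandas; a self-contained sketch against an in-memory SQLite database (table and column names are illustrative):

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("sqlite:///:memory:")

# seed a tiny table so the query has something to read
pd.DataFrame({"id": [1, 2, 3], "value": [10.0, 20.0, 30.0]}).to_sql(
    "measurements", engine, index=False)

df = pd.read_sql("SELECT id, value FROM measurements WHERE value > 15", engine)
print(df)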
Example #15
def validation(context: MLClientCtx, model: DataItem) -> None:
    """Model validation.

    Dummy validation function.

    :param context: The runtime context object.
    :param model: The estimated model object.
    """
    # access input metadata, values, files, and secrets (passwords)
    print(f"Run: {context.name} (uid={context.uid})")
    context.logger.info("started validation")

    # get the model file, class (metadata), and extra_data (dict of key: DataItem)
    model_file, model_obj, _ = get_model(model)

    # update model object elements and data
    update_model(model_obj, parameters={"one_more": 5})

    print(f"path to local copy of model file - {model_file}")
    print("parameters:", model_obj.parameters)
    print("metrics:", model_obj.metrics)
    context.log_artifact("validation",
                         body=b"<b> validated </b>",
                         format="html")
Example #16
def plot_confusion_matrix(context: MLClientCtx,
                          labels,
                          predictions,
                          key: str = "confusion_matrix",
                          plots_dir: str = "plots",
                          colormap: str = "Blues",
                          fmt: str = "png",
                          sample_weight=None):
    """Create a confusion matrix.
    Plot and save a confusion matrix using test data from a
    pipeline step.
    
    See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
    
    TODO: fix label alignment
    TODO: consider using another packaged version
    TODO: refactor to take params dict for plot options

    :param context:         function context
    :param labels:          validation data ground-truth labels
    :param predictions:     validation data predictions
    :param key:             key of the confusion matrix plot in the artifact store
    :param plots_dir:       relative path of plots in artifact store
    :param colormap:        colourmap for confusion matrix
    :param fmt:             plot format
    :param sample_weight:   sample weights
    """
    _gcf_clear(plt)

    cm = metrics.confusion_matrix(labels, predictions, sample_weight=sample_weight)
    sns.heatmap(cm, annot=True, cmap=colormap, square=True)

    fig = plt.gcf()
    fname = f"{plots_dir}/{key}.{fmt}"
    fig.savefig(os.path.join(context.artifact_path, fname))
    context.log_artifact(PlotArtifact(key, body=fig), local_path=fname)
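
Stripped of the MLRun plumbing, the plot is sklearn's confusion_matrix rendered as a seaborn heatmap; a minimal sketch with made-up labels:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

labels = [0, 0, 1, 1, 1, 0, 1, 0]
predictions = [0, 1, 1, 1, 0, 0, 1, 0]

cm = metrics.confusion_matrix(labels, predictions)
sns.heatmap(cm, annot=True, cmap="Blues", square=True)
plt.xlabel("predicted")
plt.ylabel("actual")
plt.savefig("confusion_matrix.png")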
Example #17
def open_archive(
    context: MLClientCtx,
    archive_url: DataItem,
    subdir: str = "content",
    key: str = "content",
    target_path: str = None,
):
    """Open a file/object archive into a target directory

    Currently supports zip and tar.gz

    :param context:      function execution context
    :param archive_url:  url of archive file
    :param subdir:       path within artifact store where extracted files
                         are stored
    :param key:          key of archive contents in artifact store
    :param target_path:  file system path to store extracted files (use either this or subdir)
    """
    os.makedirs(target_path or subdir, exist_ok=True)

    archive_url = archive_url.local()
    if archive_url.endswith("gz"):
        with tarfile.open(archive_url, mode="r|gz") as ref:
            ref.extractall(target_path or subdir)
    elif archive_url.endswith("zip"):
        with zipfile.ZipFile(archive_url, "r") as ref:
            ref.extractall(target_path or subdir)
    else:
        raise ValueError(f"unsupported archive type in {archive_url}")

    kwargs = {}
    if target_path:
        kwargs = {"target_path": target_path}
    else:
        kwargs = {"local_path": subdir}
    context.log_artifact(key, **kwargs)
Example #18
def send_email(
    context: MLClientCtx,
    sender: str,
    to: str,
    subject: str,
    content: str = "",
    server_addr: str = None,
    attachments: List[str] = [],
) -> None:
    """Send an email.
    :param sender: Sender email address
    :param context: The function context
    :param to: Email address of mail recipient
    :param subject: Email subject
    :param content: Optional mail text
    :param server_addr: Address of SMTP server to use. Use format <addr>:<port>
    :param attachments: List of attachments to add.
    """

    email_user = context.get_secret("SMTP_USER")
    email_pass = context.get_secret("SMTP_PASSWORD")
    if email_user is None or email_pass is None:
        context.logger.error(
            "Missing sender email or password - cannot send email.")
        return

    if server_addr is None:
        context.logger.error("Server not specified - cannot send email.")
        return

    msg = EmailMessage()
    msg["From"] = sender
    msg["Subject"] = subject
    msg["To"] = to
    msg.set_content(content)

    for filename in attachments:
        context.logger.info(f"Looking at attachment: {filename}")
        if not os.path.isfile(filename):
            context.logger.warning(f"Filename does not exist {filename}")
            continue
        ctype, encoding = mimetypes.guess_type(filename)
        if ctype is None or encoding is not None:
            ctype = "application/octet-stream"
        maintype, subtype = ctype.split("/", 1)
        with open(filename, "rb") as fp:
            msg.add_attachment(
                fp.read(),
                maintype=maintype,
                subtype=subtype,
                filename=os.path.basename(filename),
            )
            context.logger.info(
                f"Added attachment: Filename: {filename}, of mimetype: {maintype}, {subtype}"
            )

    try:
        s = smtplib.SMTP(host=server_addr)
        s.starttls()
        s.login(email_user, email_pass)
        s.send_message(msg)
        context.logger.info("Email sent successfully.")
    except smtplib.SMTPException as exp:
        context.logger.error(f"SMTP exception caught in SMTP code: {exp}")
    except ConnectionError as ce:
        context.logger.error(f"Connection error caught in SMTP code: {ce}")
Example #19
def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    encode_cols: List[str] = [],
    sample: int = -1,
    test_size: float = 0.30,
    train_val_split: float = 0.75,
    test_set_key: str = "test_set",
    model_evaluator=None,
    models_dest: str = "",
    plots_dest: str = "plots",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
    random_state: int = 1,
) -> None:
    """train a classifier
    
    An optional cutom model evaluator can be supplied that should have the signature:
    `my_custom_evaluator(context, xvalid, yvalid, model)` and return a dictionary of 
    scalar "results", a "plots" keys with a list of PlotArtifacts, and 
    and "tables" key containing a returned list of TableArtifacts.
    
    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g., "sklearn.neural_network.MLPClassifier",
                              or a json model config
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of names and prefixes for columns that are
                              to be one-hot encoded
    :param sample:            selects the first n rows, or, when negative, a random sample
    :param test_size:         (0.30) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param test_set_key:      key of held out data in artifact store
    :param model_evaluator:   (None) a custom model evaluator can be specified
    :param models_dest:       ("") models subfolder on artifact path
    :param plots_dest:        plot subfolder on artifact path
    :param file_ext:          ("parquet") format for test_set_key hold out data
    :param random_state:      (1) sklearn rng seed

    """
    models_dest = models_dest or "model"

    raw, labels, header = get_sample(dataset, sample, label_column)

    if encode_cols:
        raw = pd.get_dummies(raw,
                             columns=list(encode_cols.keys()),
                             prefix=list(encode_cols.values()),
                             drop_first=True)

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, 1 - train_val_split, random_state)

    context.log_dataset(test_set_key,
                        df=pd.concat([xtest, ytest.to_frame()], axis=1),
                        format=file_ext,
                        index=False,
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)
    if model_evaluator:
        eval_metrics = model_evaluator(context,
                                       xvalid,
                                       yvalid,
                                       model,
                                       plots_artifact_path=plots_path)
    else:
        eval_metrics = eval_model_v2(context,
                                     xvalid,
                                     yvalid,
                                     model,
                                     plots_artifact_path=plots_path)

    context.set_label('class', model_pkg_class)
    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      extra_data=eval_metrics,
                      model_file="model.pkl",
                      metrics=context.results,
                      labels={"class": model_pkg_class})
Example #20
def arc_to_parquet(context: MLClientCtx,
                   archive_url: DataItem,
                   header: List[str] = [None],
                   chunksize: int = 0,
                   dtype=None,
                   encoding: str = "latin-1",
                   key: str = "data",
                   dataset: str = "None",
                   part_cols=[],
                   file_ext: str = "parquet",
                   index: bool = False,
                   refresh_data: bool = False,
                   stats: bool = False) -> None:
    """Open a file/object archive and save as a parquet file or dataset

    Notes
    -----
    * this function is typically for large files, please be sure to check all settings
    * partitioning requires precise specification of column types.
    * the archive_url can be any file readable by pandas read_csv, which includes tar files
    * if the `dataset` parameter is not empty, then a partitioned dataset will be created
    instead of a single file in the folder `dataset`
    * if a key exists already then it will not be re-acquired unless the `refresh_data` param
    is set to `True`.  This is in case the original file is corrupt, or a refresh is
    required.

    :param context:        the function context
    :param archive_url:    MLRun data input (DataItem object)
    :param chunksize:      (0) when > 0, row size (chunk) to retrieve
                           per iteration
    :param dtype:          destination data type of specified columns
    :param encoding:       ("latin-1") file encoding
    :param key:            key of the dataset in the artifact store
    :param dataset:        (None) if not None then "target_path/dataset"
                           is folder for partitioned files
    :param part_cols:      ([]) list of partitioning columns
    :param file_ext:       (parquet) csv/parquet file extension
    :param index:          (False) pandas save index option
    :param refresh_data:   (False) overwrite existing data at that location
    :param stats:          (False) calculate table stats when logging the artifact
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)

    archive_url = archive_url.local()

    # the default is the string "None", so treat both empty and "None" as "no dataset"
    if dataset and dataset != "None":
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key + f".{file_ext}")
        exists = os.path.isfile(dest_path)

    if not exists:
        context.logger.info("destination file does not exist, downloading")
        if chunksize > 0:
            header = _chunk_readwrite(archive_url, dest_path, chunksize,
                                      encoding, dtype, dataset)
            context.log_dataset(key=key,
                                stats=stats,
                                format='parquet',
                                target_path=dest_path)
        else:
            df = pd.read_csv(archive_url)
            context.log_dataset(key, df=df, format=file_ext, index=index)
    else:
        context.logger.info("destination file already exists, nothing done")
Example #21
def summarize(
    context: MLClientCtx,
    table: DataItem,
    label_column: str = None,
    class_labels: List[str] = [],
    plot_hist: bool = True,
    plots_dest: str = "plots",
    update_dataset=False,
) -> None:
    """Summarize a table

    :param context:         the function context
    :param table:           MLRun input pointing to pandas dataframe (csv/parquet file path)
    :param label_column:    ground truth column label
    :param class_labels:    label for each class in tables and plots
    :param plot_hist:       (True) set this to False for large tables
    :param plots_dest:      destination folder of summary plots (relative to artifact_path)
    :param update_dataset:  when the table is a registered dataset update the charts in-place
    """
    df = table.as_df()
    header = df.columns.values
    extra_data = {}

    try:
        gcf_clear(plt)
        snsplt = sns.pairplot(df, hue=label_column)  # , diag_kws={"bw": 1.5})
        extra_data["histograms"] = context.log_artifact(
            PlotArtifact("histograms", body=plt.gcf()),
            local_path=f"{plots_dest}/hist.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.error(
            f"Failed to create pairplot histograms due to: {e}")

    try:
        gcf_clear(plt)
        plot_cols = 3
        plot_rows = int((len(header) - 1) / plot_cols) + 1
        fig, ax = plt.subplots(plot_rows, plot_cols, figsize=(15, 4))
        fig.tight_layout(pad=2.0)
        for i in range(plot_rows * plot_cols):
            if i < len(header):
                sns.violinplot(
                    x=df[header[i]],
                    ax=ax[int(i / plot_cols)][i % plot_cols],
                    orient="h",
                    width=0.7,
                    inner="quartile",
                )
            else:
                fig.delaxes(ax[int(i / plot_cols)][i % plot_cols])
        extra_data["violin"] = context.log_artifact(
            PlotArtifact("violin", body=plt.gcf(), title="Violin Plot"),
            local_path=f"{plots_dest}/violin.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.warn(
            f"Failed to create violin distribution plots due to: {e}")

    if label_column:
        labels = df.pop(label_column)
        imbtable = labels.value_counts(normalize=True).sort_index()
        try:
            gcf_clear(plt)
            balancebar = imbtable.plot(kind="bar",
                                       title="class imbalance - labels")
            balancebar.set_xlabel("class")
            balancebar.set_ylabel("proportion of total")
            extra_data["imbalance"] = context.log_artifact(
                PlotArtifact("imbalance", body=plt.gcf()),
                local_path=f"{plots_dest}/imbalance.html",
            )
        except Exception as e:
            context.logger.warn(
                f"Failed to create class imbalance plot due to: {e}")
        context.log_artifact(
            TableArtifact("imbalance-weights-vec",
                          df=pd.DataFrame({"weights": imbtable})),
            local_path=f"{plots_dest}/imbalance-weights-vec.csv",
            db_key=False,
        )

    tblcorr = df.corr()
    mask = np.zeros_like(tblcorr, dtype=bool)  # np.bool was removed in recent numpy versions
    mask[np.triu_indices_from(mask)] = True

    dfcorr = pd.DataFrame(data=tblcorr, columns=header, index=header)
    dfcorr = dfcorr[
        np.arange(dfcorr.shape[0])[:, None] > np.arange(dfcorr.shape[1])]
    context.log_artifact(
        TableArtifact("correlation-matrix", df=tblcorr, visible=True),
        local_path=f"{plots_dest}/correlation-matrix.csv",
        db_key=False,
    )

    try:
        gcf_clear(plt)
        ax = plt.axes()
        sns.heatmap(tblcorr, ax=ax, mask=mask, annot=False, cmap=plt.cm.Reds)
        ax.set_title("features correlation")
        extra_data["correlation"] = context.log_artifact(
            PlotArtifact("correlation",
                         body=plt.gcf(),
                         title="Correlation Matrix"),
            local_path=f"{plots_dest}/corr.html",
            db_key=False,
        )
    except Exception as e:
        context.logger.warn(
            f"Failed to create features correlation plot due to: {e}")

    gcf_clear(plt)
    if update_dataset and table.meta and table.meta.kind == "dataset":
        from mlrun.artifacts import update_dataset_meta

        update_dataset_meta(table.meta, extra_data=extra_data)
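
The upper-triangle mask used for the correlation heatmap is a common seaborn pattern; in isolation it looks roughly like this (random data for illustration):

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

df = pd.DataFrame(np.random.default_rng(1).normal(size=(100, 4)),
                  columns=["a", "b", "c", "d"])

tblcorr = df.corr()
mask = np.zeros_like(tblcorr, dtype=bool)   # hide the redundant upper triangle
mask[np.triu_indices_from(mask)] = True

ax = plt.axes()
sns.heatmap(tblcorr, ax=ax, mask=mask, annot=False, cmap=plt.cm.Reds)
ax.set_title("features correlation")
plt.savefig("correlation.png")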
Example #22
def describe_spark(context: MLClientCtx,
                   dataset: DataItem,
                   artifact_path,
                   bins: int = 30,
                   describe_extended: bool = True):

    location = dataset.local()

    spark = SparkSession.builder.appName("Spark job").getOrCreate()

    df = spark.read.csv(location, header=True, inferSchema=True)

    kwargs = []

    float_cols = [
        item[0] for item in df.dtypes
        if item[1].startswith('float') or item[1].startswith('double')
    ]

    if describe_extended:

        table, variables, freq = describe(df, bins, float_cols, kwargs)

        tbl_1 = variables.reset_index()

        if len(freq) != 0:
            tbl_2 = pd.DataFrame.from_dict(
                freq, orient="index").sort_index().stack().reset_index()
            tbl_2.columns = ['col', 'key', 'val']
            tbl_2['Merged'] = [{
                key: val
            } for key, val in zip(tbl_2.key, tbl_2.val)]
            tbl_2 = tbl_2.groupby(
                'col',
                as_index=False).agg(lambda x: tuple(x))[['col', 'Merged']]

            summary = pd.merge(tbl_1,
                               tbl_2,
                               how='left',
                               left_on='index',
                               right_on='col')

        else:
            summary = tbl_1

        context.log_dataset("summary_stats",
                            df=summary,
                            format="csv",
                            index=False,
                            artifact_path=context.artifact_subpath('data'))

        context.log_results(table)

    else:
        tbl_1 = df.describe().toPandas()

        summary = tbl_1.T

        context.log_dataset("summary_stats",
                            df=summary,
                            format="csv",
                            index=False,
                            artifact_path=context.artifact_subpath('data'))

    spark.stop()
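
The non-extended branch is just Spark's built-in describe converted to pandas; a minimal local sketch, assuming pyspark is installed (data is illustrative):

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("describe demo").getOrCreate()

pdf = pd.DataFrame({"height": [1.62, 1.75, 1.80], "weight": [61.0, 74.5, 80.2]})
df = spark.createDataFrame(pdf)

summary = df.describe().toPandas().T   # one row per statistic before the transpose
print(summary)

spark.stop()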
Example #23
def data_clean(
    context: MLClientCtx,
    src: DataItem,
    file_ext: str = "csv",
    models_dest: str = "models/encoders",
    cleaned_key: str = "cleaned-data",
    encoded_key: str = "encoded-data",
):
    """process a raw churn data file

    Data has 3 states here: `raw`, `cleaned` and `encoded`

    * `raw` kept by default, the pipeline begins with a raw data artifact
    * `cleaned` kept for charts, presentations
    * `encoded` is input for a cross validation and training function

    steps (not necessarily in correct order, some parallel)
    * column name maps
    * deal with nans and other types of missings/junk
    * label encode binary and ordinal category columns
    * create category ranges from numerical columns
    And finally,
    * test

    Why don't we one-hot encode here? One-hot encoding isn't a necessary
    step for all algorithms. It can also generate a very large feature
    matrix that doesn't need to be serialized (even if sparse).
    So we leave one-hot encoding for the training step.

    What about scaling numerical columns? Same as why we don't one hot
    encode here. Do we scale before train-test split?  IMHO, no.  Scaling
    before splitting introduces a type of data leakage.  In addition,
    many estimators are completely immune to the monotonic transformations
    implied by scaling, so why waste the cycles?

    TODO:
        * parallelize where possible
        * more abstraction (more parameters, chain sklearn transformers)
        * convert to marketplace function

    :param context:          the function execution context
    :param src:              an artifact or file path
    :param file_ext:         file type for artifacts
    :param models_dest:      destination for label encoders and other preprocessing
                             steps, saved together with other pipeline models
    :param cleaned_key:      key of cleaned data table in artifact store
    :param encoded_key:      key of encoded data table in artifact store
    """
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels",
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

    # add drop column to logs:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)

    # VALUE transformations

    # clean
    # truncate reply to "No"
    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    tenure_map = dict(zip(bins, labels))
    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact(
        "preproc-numcat_map.json",
        body=bytes(json.dumps(tenure_map).encode("utf-8")),
        local_path=tp,
    )

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)

    # label encoding - generate model for each column saved in dict
    # some of these columns may be hot encoded in the training step
    fix_cols = [
        "gender",
        "partner",
        "deps",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "PhoneService",
        "MultipleLines",
        "PaperlessBilling",
        "InternetService",
        "Contract",
        "PaymentMethod",
        "labels",
    ]

    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))
    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    model_bin = dumps(d)
    context.log_model(
        "model",
        body=model_bin,
        artifact_path=os.path.join(context.artifact_path, models_dest),
        model_file="model.pkl",
    )
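
The per-column label encoding relies on defaultdict handing each column its own LabelEncoder keyed by the column name; a small standalone sketch with made-up churn-like columns:

from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({"gender": ["F", "M", "F"],
                   "contract": ["monthly", "yearly", "monthly"],
                   "labels": ["No", "Yes", "No"]})

d = defaultdict(LabelEncoder)
encoded = df.apply(lambda col: d[col.name].fit_transform(col.astype(str)))

print(encoded)
print(d["contract"].classes_)   # the fitted encoder for a single column
# the dict of fitted encoders is what gets pickled and logged as the "model" above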
Example #24
    def _generate_mpi_job(
        self,
        runobj: RunObject,
        execution: MLClientCtx,
        meta: client.V1ObjectMeta,
    ) -> dict:
        pod_labels = deepcopy(meta.labels)
        pod_labels["mlrun/job"] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)

        # configuration for both launcher and workers
        for pod_template in [launcher_pod_template, worker_pod_template]:
            if self.spec.image:
                self._update_container(pod_template, "image",
                                       self.full_image_path())
            self._update_container(pod_template, "volumeMounts",
                                   self.spec.volume_mounts)
            extra_env = self._generate_runtime_env(runobj)
            extra_env = [{"name": k, "value": v} for k, v in extra_env.items()]
            self._update_container(pod_template, "env",
                                   extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template,
                    "imagePullPolicy",
                    self.spec.image_pull_policy,
                )
            if self.spec.workdir:
                self._update_container(pod_template, "workingDir",
                                       self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(
                    pod_template,
                    "spec.imagePullSecrets",
                    [{
                        "name": self.spec.image_pull_secret
                    }],
                )
            update_in(pod_template, "metadata.labels", pod_labels)
            update_in(pod_template, "spec.volumes", self.spec.volumes)

        # configuration for workers only
        # update resources only for workers because the launcher
        # doesn't require special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template,
                                              worker_pod_template)

        # update the replicas only for workers
        update_in(
            job,
            "spec.mpiReplicaSpecs.Worker.replicas",
            self.spec.replicas or 1,
        )

        update_in(
            job,
            "spec.cleanPodPolicy",
            self.spec.clean_pod_policy,
        )

        if execution.get_param("slots_per_worker"):
            update_in(
                job,
                "spec.slotsPerWorker",
                execution.get_param("slots_per_worker"),
            )

        update_in(job, "metadata", meta.to_dict())

        return job
Example #25
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask
    
    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g., "sklearn.ensemble.RandomForestClassifier",
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Select sample from dataset (n-rows/% of total), randomize rows by default.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:            (False) Should the data be persisted (through the `client.persist`)
    :param scheduler_key:           (scheduler) Dask scheduler configuration, json also logged as an artifact.
    :param file_ext:                (parquet) format for test_set_key hold out data
    :param random_state:            (42) sklearn seed
    """

    if scheduler_key:
        client = Client(scheduler_key)

    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute():
        raise Exception('NA values found in the dataset')

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):

        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):

        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed,
                y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed,
                  y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })

        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:

                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=list(df_header.drop(label_column)) + [label_column]),  # features first, label last, matching column_stack
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
Exemplo n.º 26
0
def permutation_importance(
    context: MLClientCtx,
    model: DataItem,
    dataset: DataItem,
    labels: str,
    figsz=(10, 5),
    plots_dest: str = "plots",
    fitype: str = "permute",
) -> pd.DataFrame:
    """calculate change in metric

    type 'permute' uses a pre-estimated model
    type 'dropcol' uses a re-estimates model

    :param context:     the function's execution context
    :param model:       a trained model
    :param dataset:     features and ground truths, regression targets
    :param labels       name of the ground truths column
    :param figsz:       matplotlib figure size
    :param plots_dest:  path within artifact store
    :
    """
    model_file, model_data, _ = get_model(model.url, suffix=".pkl")
    model = load(open(str(model_file), "rb"))

    X = dataset.as_df()
    y = X.pop(labels)
    header = X.columns

    # _oob_classifier_accuracy is defined elsewhere in the project; any callable
    # metric(model, X, y) -> float works here (see the sketch after this function)
    metric = _oob_classifier_accuracy

    baseline = metric(model, X, y)

    imp = []
    for col in X.columns:
        if fitype == "permute":
            save = X[col].copy()
            X[col] = np.random.permutation(X[col])
            m = metric(model, X, y)
            X[col] = save
            imp.append(baseline - m)
        elif fitype == "dropcol":
            X_ = X.drop(col, axis=1)
            model_ = clone(model)
            #model_.random_state = random_state
            model_.fit(X_, y)
            o = model_.oob_score_
            imp.append(baseline - o)
        else:
            raise ValueError(
                "unknown fitype, only 'permute' or 'dropcol' permitted")

    zipped = zip(imp, header)
    feature_imp = pd.DataFrame(sorted(zipped),
                               columns=["importance", "feature"])
    feature_imp.sort_values(by="importance", ascending=False, inplace=True)

    plt.clf()
    plt.figure(figsize=figsz)
    sns.barplot(x="importance", y="feature", data=feature_imp)
    plt.title(f"feature importances-{fitype}")
    plt.tight_layout()

    context.log_artifact(
        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
        local_path=f"{plots_dest}/feature-permutations.html",
    )
    context.log_dataset(f"feature-importances-{fitype}-tbl",
                        df=feature_imp,
                        index=False)
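
Any callable with the signature metric(model, X, y) returning a scalar score can be dropped in for the _oob_classifier_accuracy helper used above (it is defined elsewhere in the project). A minimal sketch, using plain hold-out accuracy rather than an out-of-bag estimate:

import numpy as np

def _plain_accuracy(model, X, y):
    # hypothetical drop-in metric: fraction of correct predictions on (X, y)
    return np.mean(model.predict(X) == np.asarray(y))

Note that the 'dropcol' branch additionally assumes the refit model exposes oob_score_, i.e. a bagging ensemble trained with oob_score=True.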
Exemplo n.º 27
0
def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Train a model.

    :param context: The runtime context object.
    :param p1: A model parameter.
    :param p2: Another model parameter.
    """
    # access input metadata, values, and inputs
    print(f"Run: {context.name} (uid={context.uid})")
    print(f"Params: p1={p1}, p2={p2}")
    context.logger.info("started training")

    # <insert training code here>

    # log the run results (scalar values)
    context.log_result("accuracy", p1 * 2)
    context.log_result("loss", p1 * 3)

    # add a label/tag to this run
    context.set_label("category", "tests")

    # log a simple artifact + label the artifact
    # If you want to upload a local file to the artifact repo add src_path=<local-path>
    context.log_artifact("somefile",
                         body=b"abc is 123",
                         local_path="myfile.txt")

    # create a dataframe artifact
    df = pd.DataFrame([{
        "A": 10,
        "B": 100
    }, {
        "A": 11,
        "B": 110
    }, {
        "A": 12,
        "B": 120
    }])
    context.log_dataset("mydf", df=df)

    # Log an ML Model artifact, add metrics, params, and labels to it
    # and place it in a subdir ('models') under artifacts path
    context.log_model(
        "mymodel",
        body=b"abc is 123",
        model_file="model.txt",
        metrics={"accuracy": 0.85},
        parameters={"xx": "abc"},
        labels={"framework": "xgboost"},
        artifact_path=context.artifact_subpath("models"),
    )
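
A hedged sketch of how a handler like this is typically wrapped and run; the file name, image, and parameter values are assumptions, and the exact entry points can differ between mlrun versions:

import mlrun

fn = mlrun.code_to_function("trainer", kind="job",
                            filename="training.py",   # hypothetical file holding the handler above
                            image="mlrun/mlrun")
run = fn.run(handler="training", params={"p1": 5, "p2": 10}, local=True)
print(run.outputs)  # accuracy / loss results plus links to the logged artifacts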
Exemplo n.º 28
0
def fit(context: MLClientCtx,
        dataset: DataItem,
        num_boost_round: int = 10,
        evals: List[Tuple[DMatrix, str]] = [],
        obj: Union[Callable, str] = "",
        feval: Union[Callable, str] = None,
        maximize: bool = False,
        early_stopping_rounds: int = None,
        evals_result: dict = {},
        verbose_eval: bool = True,
        xgb_model: DataItem = None,
        callbacks: List[Callable] = [],
        label_column: str = "labels",
        encode_cols: dict = {},
        sample: int = -1,
        test_size: float = 0.25,
        valid_size: float = 0.75,
        random_state: int = 1994,
        models_dest: str = "models",
        plots_dest: str = "plots",
        file_ext: str = "csv",
        test_set_key: str = "test-set",
        gpus: bool = False) -> None:
    """low level xgboost train api

    for the xgboost `train` params see:
    https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.train

    Note:  the first parameter of xgboost's `train` method is a dict of parameters
           supplied to the booster (engine).  To override one of those, simply
           add a task parameter (on the mlrun task you submit) prefixed with
           "XGB_". For example, to set the 'tree_method' parameter to 'approx',
           add {"XGB_tree_method": "approx"} to the task params
           (see the usage sketch after this function).

    :param context:           the function context
    :param dataset:           the full data set, train, valid and test will be extracted and
                              each converted to a DMatrix for input to xgboost's `train`
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of column names and prefixes for columns
                              that are to be one-hot encoded
    :param sample:            selects the first n rows; if negative (< -1),
                              selects a random sample
    :param test_size:         (0.25) test set size
    :param valid_size:        (0.75) once the test set has been removed, the
                              training set gets this proportion of the remainder
    :param random_state:      (1994) sklearn rng seed
    :param models_dest:       destination subfolder for model artifacts
    :param plots_dest:        destination subfolder for plot artifacts
    :param file_ext:          format for test_set_key hold out data
    :param test_set_key:      (test-set), key of held out data in artifact store
    :param gpus:              (False), run on gpus
    """
    raw, labels, header = get_sample(dataset, sample, label_column)

    # hot-encode
    if encode_cols:
        raw = pd.get_dummies(raw,
                             columns=list(encode_cols.keys()),
                             prefix=list(encode_cols.values()),
                             drop_first=True)

    # split the sample into train, validation and test sets:
    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = \
        get_splits(raw, labels, 3, test_size, valid_size, random_state)

    # save the test data as a regular dataframe since it may be used by other processes
    context.log_dataset(test_set_key,
                        df=pd.concat([xtest, ytest], axis=1),
                        format=file_ext,
                        index=False)

    # convert to xgboost DMatrix (todo - dask, gpu)
    dtrain = DMatrix(xtrain, label=ytrain)
    dvalid = DMatrix(xvalid, label=yvalid)

    boost_params = {
        "tree_method": "gpu_hist" if gpus else "hist",
        "seed": random_state,
        "disable_default_eval_metric": 1,
        "objective": "reg:squaredlogerror",
        "eval_metric": "rmsle"
    }

    # let the user override booster parameters via task params prefixed with "XGB_"
    for k, v in context.parameters.items():
        if k.startswith('XGB_'):
            boost_params[k[4:]] = v

    # collect learning curves / training history
    results = dict()

    booster = train(
        boost_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, "train"), (dvalid, "valid")],
        evals_result=results,
        obj=squared_log,
        feval=rmsle,
        maximize=maximize,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
        # xgb_model=xgb_model,
        # callbacks=callbacks,
    )

    context.log_model("model",
                      body=dumps(booster),
                      model_file="model.pkl",
                      artifact_path='/User/artifacts/tttt')

    learning_curves(context, results)
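
A hedged sketch of the "XGB_" prefix convention described in the docstring above; the file name, dataset URI, and parameter values are assumptions:

import mlrun

fn = mlrun.code_to_function("xgb-trainer", kind="job",
                            filename="xgb_fit.py",                 # hypothetical file holding `fit`
                            image="mlrun/mlrun")
run = fn.run(
    handler="fit",
    inputs={"dataset": "store://my-project/raw-data"},             # hypothetical dataset artifact URI
    params={"XGB_tree_method": "approx", "XGB_max_depth": 6},      # "XGB_" is stripped and merged into boost_params
    local=True,
)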
Exemplo n.º 29
0
    def _generate_mpi_job(
        self,
        runobj: RunObject,
        execution: MLClientCtx,
        meta: client.V1ObjectMeta,
    ) -> dict:
        pod_labels = deepcopy(meta.labels)
        pod_labels["mlrun/job"] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)
        command, args, extra_env = self._get_cmd_args(runobj)

        # configuration for both launcher and workers
        for pod_template in [launcher_pod_template, worker_pod_template]:
            if self.spec.image:
                self._update_container(pod_template, "image",
                                       self.full_image_path())
            self._update_container(pod_template, "volumeMounts",
                                   self.spec.volume_mounts)
            self._update_container(pod_template, "env",
                                   extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template,
                    "imagePullPolicy",
                    self.spec.image_pull_policy,
                )
            if self.spec.workdir:
                self._update_container(pod_template, "workingDir",
                                       self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(
                    pod_template,
                    "spec.imagePullSecrets",
                    [{
                        "name": self.spec.image_pull_secret
                    }],
                )
            update_in(pod_template, "metadata.labels", pod_labels)
            update_in(pod_template, "spec.volumes", self.spec.volumes)
            update_in(pod_template, "spec.nodeName", self.spec.node_name)
            update_in(pod_template, "spec.nodeSelector",
                      self.spec.node_selector)
            update_in(pod_template, "spec.affinity",
                      self.spec._get_sanitized_affinity())
            if self.spec.priority_class_name and len(
                    mlconf.get_valid_function_priority_class_names()):
                update_in(
                    pod_template,
                    "spec.priorityClassName",
                    self.spec.priority_class_name,
                )

        # configuration for workers only
        # update resources only for workers because the launcher
        # doesn't require special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template,
                                             [command] + args)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template,
                                              worker_pod_template)

        # update the replicas only for workers
        update_in(
            job,
            "spec.mpiReplicaSpecs.Worker.replicas",
            self.spec.replicas or 1,
        )

        update_in(
            job,
            "spec.cleanPodPolicy",
            self.spec.clean_pod_policy,
        )

        if execution.get_param("slots_per_worker"):
            update_in(
                job,
                "spec.slotsPerWorker",
                execution.get_param("slots_per_worker"),
            )

        update_in(job, "metadata", meta.to_dict())

        return job
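
For orientation, the object assembled here roughly follows the mpi-operator MPIJob CRD; the apiVersion and defaults vary by operator release, so the sketch below is indicative only:

mpijob_sketch = {
    "apiVersion": "kubeflow.org/v1",      # assumption: v1 mpi-operator; older deployments use alpha versions
    "kind": "MPIJob",
    "metadata": {"name": "<job-name>", "namespace": "<namespace>",
                 "labels": {"mlrun/job": "<job-name>"}},
    "spec": {
        "slotsPerWorker": 1,
        "cleanPodPolicy": "All",
        "mpiReplicaSpecs": {
            "Launcher": {"replicas": 1, "template": "<launcher pod template>"},
            "Worker": {"replicas": 4, "template": "<worker pod template, incl. GPU/memory requests>"},
        },
    },
}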
Exemplo n.º 30
0
def data_clean(context: MLClientCtx,
               src: DataItem,
               file_ext: str = "csv",
               models_dest: str = "models/encoders",
               cleaned_key: str = "cleaned-data",
               encoded_key: str = "encoded-data"):
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    old_cols = df.columns
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels"
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

    # record the dropped columns in the same map so they appear in the log:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)
    # collapse values like "No internet service" / "No phone service" to a plain "No"
    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    tenure = df.tenure.copy(deep=True)
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    tenure_map = dict(zip(bins, labels))
    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact("preproc-numcat_map.json",
                         body=bytes(json.dumps(tenure_map).encode("utf-8")),
                         local_path=tp)

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)
    fix_cols = [
        "gender", "partner", "deps", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
        "PhoneService", "MultipleLines", "PaperlessBilling", "InternetService",
        "Contract", "PaymentMethod", "labels"
    ]

    # fit one LabelEncoder per categorical column, keyed by column name
    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))
    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    model_bin = dumps(d)
    context.log_model("model",
                      body=model_bin,
                      artifact_path=os.path.join(context.artifact_path,
                                                 models_dest),
                      model_file="model.pkl")