Example #1
def test_signature_inference_infers_input_and_output_as_expected():
    sig0 = infer_signature(np.array([1]))
    assert sig0.inputs is not None
    assert sig0.outputs is None
    sig1 = infer_signature(np.array([1]), np.array([1]))
    assert sig1.inputs == sig0.inputs
    assert sig1.outputs == sig0.inputs
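For reference, a quick sketch of what the call above produces: infer_signature returns a ModelSignature whose inputs and outputs are Schema objects that can be printed or serialized. The exact spec type inferred for a NumPy array (column spec vs. tensor spec) depends on the MLflow version, so treat the output as illustrative.

import numpy as np
from mlflow.models.signature import infer_signature

sig = infer_signature(np.array([1]), np.array([1.0]))
print(sig.inputs)     # Schema inferred from the model input example
print(sig.outputs)    # Schema inferred from the model output example
print(sig.to_dict())  # JSON-serializable form written into the MLmodel file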
Example #2
def test_input_examples(pandas_df_with_all_types):
    sig = infer_signature(pandas_df_with_all_types)
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()
        # the frame read without schema should match except for the binary values
        assert (parsed_df.drop(columns=["binary"]) == _dataframe_from_json(tmp.path(filename))
                .drop(columns=["binary"])).all().all()

    # pass the input as dictionary instead
    with TempDir() as tmp:
        d = {name: pandas_df_with_all_types[name].values
             for name in pandas_df_with_all_types.columns}
        example = _Example(d)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename), sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()

    # input passed as numpy array
    sig = infer_signature(pandas_df_with_all_types.values)
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types.values)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("data",))
        parsed_ary = _dataframe_from_json(tmp.path(filename), schema=sig.inputs).values
        assert (pandas_df_with_all_types.values == parsed_ary).all().all()

    # pass multidimensional array
    with TempDir() as tmp:
        example = np.array([[[1, 2, 3]]])
        with pytest.raises(TensorsNotSupportedException):
            _Example(example)

    # pass multidimensional array
    with TempDir() as tmp:
        example = np.array([[1, 2, 3]])
        with pytest.raises(TensorsNotSupportedException):
            _Example({"x": example, "y": example})

    # pass dict with scalars
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename))
        assert example == parsed_df.to_dict(orient="records")[0]
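The test above exercises MLflow's private _Example and _dataframe_from_json helpers. In application code, the equivalent behavior is reached by passing input_example to log_model, which serializes the example next to the model. A minimal sketch, assuming a fitted sklearn estimator named model and the same pandas_df_with_all_types frame:

import mlflow
from mlflow.models.signature import infer_signature

with mlflow.start_run():
    mlflow.sklearn.log_model(
        model,  # hypothetical fitted sklearn estimator
        "model",
        signature=infer_signature(pandas_df_with_all_types),
        input_example=pandas_df_with_all_types.head(5),
    )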
Example #3
def train_model(params):
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
    mlflow.xgboost.autolog()
    with mlflow.start_run(nested=True):
        train = xgb.DMatrix(data=X_train, label=y_train)
        test = xgb.DMatrix(data=X_test, label=y_test)
        # Pass in the test set so xgb can track an evaluation metric. XGBoost terminates training when the evaluation metric
        # is no longer improving.
        booster = xgb.train(params=params, dtrain=train, num_boost_round=1000,
                            evals=[(test, "test")], early_stopping_rounds=50)
        predictions_test = booster.predict(test)
        auc_score = roc_auc_score(y_test, predictions_test)
        mlflow.log_metric('auc', auc_score)

        signature = infer_signature(X_train, booster.predict(train))
        mlflow.xgboost.log_model(booster, "model", signature=signature)
        # log_artifact expects a local file path, so persist the DMatrix objects before logging them
        train.save_binary("train.dmatrix")
        test.save_binary("test.dmatrix")
        mlflow.log_artifact("train.dmatrix")
        mlflow.log_artifact("test.dmatrix")

        # Set the loss to -1*auc_score so fmin maximizes the auc_score
        return {
            'status': STATUS_OK,
            'loss': -1 * auc_score,
            'booster': booster.attributes()
        }
Example #4
def train_model(params):
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
    mlflow.xgboost.autolog()
    with mlflow.start_run(nested=True):
        train = xgb.DMatrix(data=X_train, label=y_train)
        test = xgb.DMatrix(data=X_test, label=y_test)

        # Train
        booster = xgb.train(params=params, dtrain=train, num_boost_round=20,
                            evals=[(test, "test")], early_stopping_rounds=10)
        # Evaluate on test set
        predictions_test = booster.predict(test)

        # Calculate AUC, F1 & log values
        auc_score = roc_auc_score(y_test, predictions_test)
        mlflow.log_metric('auc', auc_score)
        f1_score_ = f1_eval(predictions_test, test)
        mlflow.log_metric('F1', f1_score_)

        # Log model signature (for traceability)
        signature = infer_signature(X_train, booster.predict(train))
        mlflow.xgboost.log_model(booster, "model", signature=signature)

        # Add tag for searchability
        mlflow.set_tag("model", model_name)

        # Set the loss to -1*auc_score so fmin maximizes the auc_score
        return {
            'status': STATUS_OK,
            'loss': -1 * auc_score,
            'booster': booster.attributes()
        }
Example #5
def model_signature(dataset: tuple) -> ModelSignature:
    X, y = dataset
    signature = infer_signature(X, y)

    signature.inputs.inputs[0]._name = "foo"
    signature.outputs.inputs[0]._name = "bar"

    return signature
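Because the snippet above reaches into the private _name attribute, a hedged alternative is to build the signature explicitly with the public Schema/ColSpec API so the column names are set up front (assuming single double-typed input and output columns):

from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

signature = ModelSignature(
    inputs=Schema([ColSpec("double", name="foo")]),
    outputs=Schema([ColSpec("double", name="bar")]),
)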
Example #6
def convert_sklearn_mlflow(clf, x_sample):

    signature = infer_signature(x_sample, clf.predict(x_sample))
    input_example = {}
    for i in x_sample.columns:
        input_example[i] = x_sample[i][0]

    mlflow.sklearn.save_model(clf, "best_model", signature=signature, input_example=input_example)

    return
Example #7
 def _get_signature_and_example(self, model_name, inputs, model):
     model_config = self.config.models[model_name]
     if model_config.input in inputs:
         input_example_el = get_first_element(inputs[model_config.input])
         signature = infer_signature(input_example_el,
                                     model.predict(input_example_el))
     else:
         input_example_el = None
         signature = None
     return signature, input_example_el
Example #8
def test_signature_inference_infers_datetime_types_as_expected():
    col_name = "datetime_col"
    test_datetime = np.datetime64("2021-01-01")
    test_series = pd.Series(pd.to_datetime([test_datetime]))
    test_df = test_series.to_frame(col_name)

    signature = infer_signature(test_series)
    assert signature.inputs == Schema([ColSpec(DataType.datetime)])

    signature = infer_signature(test_df)
    assert signature.inputs == Schema([ColSpec(DataType.datetime, name=col_name)])

    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.range(1).selectExpr(
        "current_timestamp() as timestamp", "current_date() as date"
    )
    signature = infer_signature(spark_df)
    assert signature.inputs == Schema(
        [ColSpec(DataType.datetime, name="timestamp"), ColSpec(DataType.datetime, name="date")]
    )
Example #9
def test_input_examples_with_nan(df_with_nan, dict_of_ndarrays_with_nans):
    # test setting example with data frame with NaN values in it
    sig = infer_signature(df_with_nan)
    with TempDir() as tmp:
        example = _Example(df_with_nan)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)
        # by definition of NaN, NaN == NaN is False but NaN != NaN is True
        assert (
            ((df_with_nan == parsed_df) | ((df_with_nan != df_with_nan) & (parsed_df != parsed_df)))
            .all()
            .all()
        )
        # the frame read without schema should match except for the binary values
        no_schema_df = _dataframe_from_json(tmp.path(filename))
        a = parsed_df.drop(columns=["binary"])
        b = no_schema_df.drop(columns=["binary"])
        assert ((a == b) | ((a != a) & (b != b))).all().all()

    # pass multidimensional array
    for col in dict_of_ndarrays_with_nans:
        input_example = dict_of_ndarrays_with_nans[col]
        sig = infer_signature(input_example)
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename), schema=sig.inputs)
            assert np.array_equal(parsed_ary, input_example, equal_nan=True)

            # without a schema/dtype specified, the resulting tensor will keep the None type
            no_schema_df = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(
                no_schema_df, np.where(np.isnan(input_example), None, input_example)
            )
Example #10
def main(name):
   iris = load_iris()
   sk_model = tree.DecisionTreeClassifier()
   sk_model = sk_model.fit(iris.data, iris.target)
   predictions = sk_model.predict(iris.data[0:5])
   signature = infer_signature(iris.data, predictions)

   # Log model params
   mlflow.log_param("criterion", sk_model.criterion)
   mlflow.log_param("splitter", sk_model.splitter)

   # Log model and register the model with signature
   mlflow.sklearn.log_model(sk_model=sk_model,
                            artifact_path="sklearn-cls-model",
                            registered_model_name=name,
                            signature=signature)
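As a hypothetical follow-up, the registered model can be loaded back through a models:/ URI and the stored signature inspected (a sketch that assumes version 1 of the model registered above, with name taken from the function argument):

import mlflow.pyfunc

loaded = mlflow.pyfunc.load_model(f"models:/{name}/1")
print(loaded.metadata.signature)  # the ModelSignature logged with the model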
Example #11
def train(trainer, experiment_name, version="1", *args, **kwargs):

    owd = os.getcwd()
    os.chdir(Paths().root_dir)

    mlflow.set_experiment(experiment_name)
    timestamp = time.strftime("%Y%m%d%H%M")
    run_name = f"{experiment_name}_{timestamp}"

    with mlflow.start_run(run_name=run_name):
        run_uuid = mlflow.active_run().info.run_uuid
        logging.info(f"MLflow Run ID: {run_uuid}")

        trainer.fit(*args, **kwargs)

        # Get training params
        params = trainer.get_params()

        # Log parameters
        mlflow.log_params(params)

        # calculate metrics
        metrics = {}
        for metric in trainer.metrics:
            metrics[metric] = trainer.history[metric][-1]
            metrics[f"val_{metric}"] = trainer.history[f"val_{metric}"][-1]
        metrics["loss"] = trainer.history["loss"][-1]
        metrics["val_loss"] = trainer.history["val_loss"][-1]

        # log metrics
        mlflow.log_metrics(metrics)

        # log model
        model = trainer.model.model
        model_name = trainer.model.name
        X_train = trainer.X_train
        y_pred = trainer.model.model.predict(X_train)
        signature = infer_signature(X_train, y_pred)
        mlflow.keras.log_model(model, model_name, signature=signature)
        models_path = Paths().model / "models"
        if not models_path.exists():
            models_path.mkdir()
        model_path = models_path / model_name / version
        model.save(model_path)
        logging.info(f"Model exported at {model_path}.")

    os.chdir(owd)
Example #12
def test_autolog_logs_signature_and_input_example(data_type):
    mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

    X, y = get_iris()
    X = data_type(X)
    y = data_type(y)
    model = sklearn.linear_model.LinearRegression()

    with mlflow.start_run() as run:
        model.fit(X, y)
        model_path = os.path.join(run.info.artifact_uri, MODEL_DIR)

    model_conf = get_model_conf(run.info.artifact_uri)
    input_example = _read_example(model_conf, model_path)
    pyfunc_model = mlflow.pyfunc.load_model(model_path)

    assert model_conf.signature == infer_signature(X, model.predict(X[:5]))
    np.testing.assert_array_equal(pyfunc_model.predict(input_example), model.predict(X[:5]))
Example #13
def mlflow_register(dfXy, model_dict: dict, stats: dict, mlflow_pars: dict):
    log("#### Using mlflow #########################################################"
        )
    # def register(run_name, params, metrics, signature, model_class, tracking_uri= "sqlite:///local.db"):
    from run_mlflow import register
    from mlflow.models.signature import infer_signature

    train_signature = dfXy[model_dict['data_pars']['cols_model']]
    y_signature = dfXy[model_dict['data_pars']['coly']]
    signature = infer_signature(train_signature, y_signature)

    register(run_name=model_dict['global_pars']['config_name'],
             params=model_dict['global_pars'],
             metrics=stats["metrics_test"],
             signature=signature,
             model_class=model_dict['model_pars']["model_class"],
             tracking_uri=mlflow_pars.get('tracking_db',
                                          "sqlite:///mlflow_local.db"))
Example #14
def train(data):
    df = pd.read_csv(data)
    train, test = train_test_split(df,
                                   test_size=0.1,
                                   shuffle=True,
                                   random_state=42)
    X_train = train.text
    X_test = test.text
    xgboost = xgboost_model()
    with mlflow.start_run(run_name="xgboost_experiment") as run:
        tic = time.time()
        model_path = os.path.join('models', run.info.run_id)
        xgboost.fit(X_train, train['label'])
        duration_training = time.time() - tic
        mlflow.pyfunc.save_model(path=model_path, python_model=xgboost)
        loaded_model = mlflow.pyfunc.load_pyfunc(model_path)
        tic = time.time()

        model_output = loaded_model.predict(X_test)
        # acc = accuracy_score(test['label'],[i['label'] for i in model_output])
        class_report = classification_report(test['label'],
                                             model_output['label'],
                                             output_dict=True)

        print(model_path)

        #     ocrmodel.predict()
        duration_prediction = time.time() - tic
        mlflow.log_metric("Load Model Time", duration_training)
        mlflow.log_metric("predict Time", duration_prediction)
        # confusion_matrices = confusion_matrix(test['label'], model_output['label'])
        mlflow.log_metric("accuracy_score", class_report['accuracy'])
        mlflow.log_metric('precision',
                          class_report['weighted avg']['precision'])
        mlflow.log_metric("recall", class_report['weighted avg']['recall'])

        mlflow.log_param('input', data)
        signature = infer_signature(X_train, loaded_model.predict(X_train))
        # mlflow.pyfunc.log_model(loaded_model, "model")
        mlflow.sklearn.log_model(loaded_model, "model", signature=signature)
        mlflow.end_run()
Example #15
def build_model(algorithm, sample_size):
    with mlflow.start_run():
        train_df = pd.read_csv(
            f'datasets/processed/aws/train_{sample_size}.csv')
        test_df = pd.read_csv(f'datasets/processed/aws/test_{sample_size}.csv')

        mlflow.log_param("algorithm", algorithm)
        mlflow.log_param("use_tokenizer", False)
        mlflow.log_param("remove_stop_words", False)
        mlflow.log_param("sample_size", sample_size)

        model = modelUtils.build_model(algorithm)
        model.fit(train_df['text'], train_df['label'])

        y_pred = model.predict(test_df['text'])

        signature = infer_signature(train_df['text'], y_pred)
        mlflow.sklearn.log_model(model, algorithm, signature=signature)

        mlflow.log_metric("f1_score",
                          f1_score(test_df['label'], y_pred, average='micro'))
Example #16
def fit_model(model_feature_lookups, n_iter=10):

  with mlflow.start_run():
    training_set = fs.create_training_set(outputDF,
                                          model_feature_lookups,
                                          label=label,
                                          exclude_columns=key)

    # Convert to pandas Dataframe
    training_pd = training_set.load_df().toPandas()
    X = training_pd.drop(label, axis=1)
    y = training_pd[label]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Add weights given class imbalance
    damage_weight = 1.0 / y_train.sum()
    healthy_weight = 1.0 / (len(y) - y_train.sum())
    sample_weight = y_train.map(lambda damaged: damage_weight if damaged else healthy_weight)

    # Not attempting to tune the model at all for purposes here
    gb_classifier = GradientBoostingClassifier(n_iter_no_change=n_iter)
    
    # Encode categorical cols (if any)
#     encoders = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), X.columns[X.dtypes == 'object'])])
    
    pipeline = Pipeline([("gb_classifier", gb_classifier)])
    pipeline_model = pipeline.fit(X_train, y_train, gb_classifier__sample_weight=sample_weight)

    mlflow.log_metric('test_accuracy', pipeline_model.score(X_test, y_test))
    # mlflow.shap.log_explanation(gb_classifier.predict, encoders.transform(X))

    fs.log_model(pipeline_model,
                "model",
                flavor=mlflow.sklearn,
                training_set=training_set,
                registered_model_name=model_name,
                input_example=X[:100],
                signature=infer_signature(X, y))
Example #17
def test_autolog_logs_signature_and_input_example(data_type):
    mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

    X, y = get_iris()
    X = data_type(X)
    y = data_type(y)
    model = sklearn.linear_model.LinearRegression()

    with mlflow.start_run() as run:
        model.fit(X, y)
        model_path = os.path.join(run.info.artifact_uri, MODEL_DIR)

    model_conf = get_model_conf(run.info.artifact_uri)
    input_example = _read_example(model_conf, model_path)
    pyfunc_model = mlflow.pyfunc.load_model(model_path)

    assert model_conf.signature == infer_signature(X, model.predict(X[:5]))

    # On GitHub Actions, `pyfunc_model.predict` and `model.predict` sometimes return
    # slightly different results:
    #
    # >>> pyfunc_model.predict(input_example)
    # [[0.171504346208176  ]
    #  [0.34346150441640155]  <- diff
    #  [0.06895096846585114]  <- diff
    #  [0.05925789882165455]
    #  [0.03424907823290102]]
    #
    # >>> model.predict(X[:5])
    # [[0.171504346208176  ]
    #  [0.3434615044164018 ]  <- diff
    #  [0.06895096846585136]  <- diff
    #  [0.05925789882165455]
    #  [0.03424907823290102]]
    #
    # As a workaround, use `assert_array_almost_equal` instead of `assert_array_equal`
    np.testing.assert_array_almost_equal(pyfunc_model.predict(input_example),
                                         model.predict(X[:5]))
Example #18
def build_mlflow_model(homedir):
    from sklearn import datasets
    from sklearn.ensemble import RandomForestClassifier
    import mlflow
    import mlflow.sklearn
    from mlflow.models.signature import infer_signature
    import pandas as pd
    from mlflow.tracking._model_registry import fluent

    mlflow.set_tracking_uri('http://localhost:5000')
    with mlflow.start_run() as run:
        iris = datasets.load_iris()
        iris_train = pd.DataFrame(iris.data, columns=iris.feature_names)
        clf = RandomForestClassifier(max_depth=7, random_state=0)
        clf.fit(iris_train, iris.target)
        signature = infer_signature(iris_train, clf.predict(iris_train))
        model_name = "iris_rf"
        mlflow.sklearn.log_model(clf,
                                 model_name,
                                 signature=signature,
                                 registered_model_name=model_name)
        logging.info('runs: %s', list(os.fwalk(homedir)))
        return fluent.MlflowClient().get_model_version_download_uri(
            name=model_name, version=1)
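A side note: the same download-URI lookup is available through the public client import, which avoids relying on the private fluent module (sketch):

from mlflow.tracking import MlflowClient

download_uri = MlflowClient().get_model_version_download_uri(name="iris_rf", version="1")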
Example #19
def run_train(config_name, config_path="source/config_model.py", n_sample=5000,
              mode="run_preprocess", model_dict=None, return_mode='file', **kw):
    """
      Configuration of the model is in config_model.py file
    :param config_name:
    :param config_path:
    :param n_sample:
    :return:
    """
    model_dict  = model_dict_load(model_dict, config_path, config_name, verbose=True)

    mlflow_pars = model_dict.get('compute_pars', {}).get('mlflow_pars', None)


    m           = model_dict['global_pars']
    path_data_train   = m['path_data_train']
    path_train_X      = m.get('path_train_X', path_data_train + "/features.zip") #.zip
    path_train_y      = m.get('path_train_y', path_data_train + "/target.zip")   #.zip

    path_output         = m['path_train_output']
    # path_model          = m.get('path_model',          path_output + "/model/" )
    path_pipeline       = m.get('path_pipeline',       path_output + "/pipeline/" )
    path_features_store = m.get('path_features_store', path_output + '/features_store/' )  #path_data_train replaced with path_output, because preprocessed files are stored there
    path_check_out      = m.get('path_check_out',      path_output + "/check/" )
    log(path_output)


    log("#### load input column family  ##################################################")
    try :
        cols_group = model_dict['data_pars']['cols_input_type']  ### the model config file
    except :
        cols_group = json.load(open(path_data_train + "/cols_group.json", mode='r'))
    log(cols_group)


    log("#### Preprocess  ################################################################")        
    preprocess_pars = model_dict['model_pars']['pre_process_pars']
     
    if mode == "run_preprocess" :
        dfXy, cols      = preprocess(path_train_X, path_train_y,
                                     path_pipeline,    ### path to save preprocessing pipeline
                                     cols_group,       ### dict of column family
                                     n_sample,
                                     preprocess_pars,
                                     path_features_store  ### Store intermediate dataframe
                                     )

    elif mode == "load_preprocess"  :  #### Load existing data
        dfXy, cols      = preprocess_load(path_train_X, path_train_y, path_pipeline, cols_group, n_sample,
                                          preprocess_pars,  path_features_store=path_features_store)


    ### Actual column names for label y and Input X (colnum , colcat) 
    model_dict['data_pars']['coly']       = cols['coly']
    model_dict['data_pars']['cols_model'] = sum([  cols[colgroup] for colgroup in model_dict['data_pars']['cols_model_group'] ]   , [])


    #### Col Group to model input: sparse, continuous, ... (i.e. Neural Network)
    ## 'coldense' = [ 'colnum' ]     'colsparse' = ['colcat' ]
    ##
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type', {}).items() :
        model_dict['data_pars']['cols_model_type2'][colg] = sum([  cols[colgroup] for colgroup in colg_list ]   , [])


   
    log("#### Train model: #############################################################")
    log(str(model_dict)[:1000])
    post_process_fun      = model_dict['model_pars']['post_process_fun']
    dfXy, dfXytest,stats  = train(model_dict, dfXy, cols, post_process_fun)

    if mlflow_pars is not None:
        log("#### Using mlflow #########################################################")
        # def register(run_name, params, metrics, signature, model_class, tracking_uri= "sqlite:///local.db"):
        from run_mlflow import register
        from mlflow.models.signature import infer_signature

        train_signature = dfXy[model_dict['data_pars']['cols_model']]
        y_signature     = dfXy[model_dict['data_pars']['coly']]
        signature       = infer_signature(train_signature, y_signature)

        register( run_name    = model_dict['global_pars']['config_name'],
                 params       = model_dict['global_pars'],
                 metrics      = stats["metrics_test"],
                 signature    = signature,
                 model_class  = model_dict['model_pars']["model_class"],
                 tracking_uri = mlflow_pars.get( 'tracking_db', "sqlite:///mlflow_local.db")
                )


    if return_mode == 'dict' :
        return { 'dfXy' : dfXy, 'dfXytest': dfXytest, 'stats' : stats   }

    else :
        log("#### Export ##################################################################")
        os.makedirs(path_check_out, exist_ok=True)
        colexport = [cols['colid'], cols['coly'], cols['coly'] + "_pred"]
        dfXy[colexport].reset_index().to_csv(path_check_out + "/pred_check.csv")  # Only results
        dfXy.to_parquet(path_check_out + "/dfX.parquet")  # train input data generate parquet
        #dfXy.to_csv(path_check_out + "/dfX.csv")  # train input data generate csv
        dfXytest.to_parquet(path_check_out + "/dfXtest.parquet")  # Test input data  generate parquet
        #dfXytest.to_csv(path_check_out + "/dfXtest.csv")  # Test input data  generate csv
        log("######### Finish #############################################################", )
Example #20
        mlflow.set_tag("features", str(X_train.columns.values.tolist()))

        # Log tracked parameters only
        mlflow.log_params(run_parameters)

        mlflow.log_metrics({
            'RMSE_CV': score_cv.mean(),
            'RMSE': score,
        })

        # log training loss
        for s in model.train_score_:
            mlflow.log_metric("Train Loss", s)

        # get model signature
        signature = infer_signature(model_input=X_train, model_output=model.predict(X_train))

        # Save model to artifacts
        mlflow.sklearn.log_model(model, "model", signature=signature)

        # log charts
        mlflow.log_artifacts(model_artifacts_dir)

        # optional: auto-logging for scikit-learn estimators
        # mlflow.sklearn.autolog()

        # optional: log all model parameters
        # mlflow.log_params(model.get_params())

        print(f"Run {run_id}:", "Logging completed")
Example #21
    # Starts runs with different XGBoost parameters
    for md in args.max_depth.split(','):
        for lr in args.learning_rate.split(','):
            for ssr in args.subsample.split(','):
                # Creates an execution context for a single run with given parameters (`md`, `lr`, and `ssr`)
                with mlflow.start_run(run_name=args.run_name) as run:
                    clf = xgb.XGBClassifier(max_depth=int(md), learning_rate=float(lr), nthread=-1, subsample=float(ssr))
                    clf.fit(X_train, y_train)

                    # Computes a metric for the built model
                    pred = clf.predict(X_test)
                    rmse = np.sqrt(mean_squared_error(y_test, pred))

                    # For better tracking, stores the training logs and the built model
                    # in the MLflow logging framework
                    # TODO: Saves a graphviz image for feature importances in XGBoost
                    from mlflow.models.signature import infer_signature
                    signature = infer_signature(X_train, y_test)

                    # This feature implemented in MLflow v1.12.0
                    # mlflow.shap.log_explanation(clf, X_train)

                    mlflow.set_tag('training algorithm', 'xgboost')
                    mlflow.log_metrics({'RMSE': rmse})
                    mlflow.xgboost.log_model(clf, 'model', signature=signature)

                    print('XGBoost model (max_depth=%s, learning_rate=%s, subsample=%s):' % (md, lr, ssr))
                    print('  RMSE: %f' % rmse)

Example #22
def test_parameter_search_estimators_produce_expected_outputs(
        cv_class, search_space, backend):
    mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)

    svc = sklearn.svm.SVC()
    cv_model = cv_class(svc, search_space, n_jobs=5, return_train_score=True)
    X, y = get_iris()

    def train_cv_model():
        if backend is None:
            cv_model.fit(X, y)
        else:
            with sklearn.utils.parallel_backend(backend=backend):
                cv_model.fit(X, y)

    with mlflow.start_run() as run:
        train_cv_model()
        run_id = run.info.run_id

    params, metrics, tags, artifacts = get_run_data(run_id)
    expected_cv_params = truncate_dict(
        stringify_dict_values(cv_model.get_params(deep=False)))
    expected_cv_params.update({
        "best_{}".format(param_name): str(param_value)
        for param_name, param_value in cv_model.best_params_.items()
    })
    assert params == expected_cv_params
    assert {
        TRAINING_SCORE: cv_model.score(X, y),
        "best_cv_score": cv_model.best_score_,
    }.items() <= metrics.items()
    assert tags == get_expected_class_tags(cv_model)
    assert MODEL_DIR in artifacts
    assert "best_estimator" in artifacts
    assert "cv_results.csv" in artifacts

    best_estimator = mlflow.sklearn.load_model(
        "runs:/{}/best_estimator".format(run_id))
    assert isinstance(best_estimator, sklearn.svm.SVC)
    cv_model = mlflow.sklearn.load_model("runs:/{}/{}".format(
        run_id, MODEL_DIR))
    assert isinstance(cv_model, cv_class)

    # Ensure that a signature and input example are produced for the best estimator
    best_estimator_conf = get_model_conf(run.info.artifact_uri,
                                         "best_estimator")
    assert best_estimator_conf.signature == infer_signature(
        X, best_estimator.predict(X[:5]))

    best_estimator_path = os.path.join(run.info.artifact_uri, "best_estimator")
    input_example = _read_example(best_estimator_conf, best_estimator_path)
    best_estimator.predict(
        input_example)  # Ensure that input example evaluation succeeds

    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id,
        "tags.`mlflow.parentRunId` = '{}'".format(run_id))
    cv_results = pd.DataFrame.from_dict(cv_model.cv_results_)
    # We expect to have created a child run for each point in the parameter search space
    assert len(child_runs) == len(cv_results)

    # Verify that each set of parameter search results has a corresponding MLflow run
    # with the expected data
    for _, result in cv_results.iterrows():
        result_params = result.get("params", {})
        params_search_clause = " and ".join([
            "params.`{}` = '{}'".format(key, value)
            for key, value in result_params.items()
        ])
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause)
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        _, child_metrics, child_tags, _ = get_run_data(child_run.info.run_id)
        assert child_tags == get_expected_class_tags(svc)
        assert "mean_test_score" in child_metrics.keys()
        assert "std_test_score" in child_metrics.keys()
        # Ensure that we do not capture separate metrics for each cross validation split, which
        # would produce very noisy metrics results
        assert len([
            metric for metric in child_metrics.keys()
            if metric.startswith("split")
        ]) == 0
Example #23
def train(
    model: str,
    experiment_name: str = None,
    data_dir=None,
    root_dir=None,
    best_metric="val_accuracy",
    **kwargs,
):
    """Base method to train a model. Will train the model input based on `MODEL_DICT` correspondance, and define the `experiment_name` in MlFlow tracking.

    Args:
        model (str): the model to train. Only two choices: `model1` or `model2`.
        experiment_name (str, optional): The experiment name to define in MlFlow tracking server. Defaults to None. If None, will be define with `model` value.
        best_metric (str, optional): The metrics on which performing evaluation of the model, and to check if performance has improved since best last model. Defaults to "val_accuracy".
    """
    _check_input(model)

    if experiment_name is None:
        experiment_name = model

    owd = os.getcwd()
    root_dir = Paths(root_dir=root_dir).root_dir
    os.chdir(root_dir)

    mlflow.set_experiment(experiment_name)
    tracker = MlFlowTracker(root_dir=root_dir)
    print(tracker.root_dir)

    timestamp = time.strftime("%Y%m%d%H%M")
    run_name = f"{experiment_name}_{timestamp}"

    learner = MODEL_DICT.get(model)(data_dir=data_dir)
    print(learner.name)

    version = tracker.get_new_version(experiment_name)
    logging.info(version)

    with mlflow.start_run(run_name=run_name):
        run_uuid = mlflow.active_run().info.run_uuid
        logging.info(f"MLflow Run ID: {run_uuid}")

        learner.train(**kwargs)

        # Get training params
        params = learner.get_params()

        # Log parameters
        mlflow.log_params(params)

        # calculate metrics
        metrics = {}
        for metric in learner.metrics:
            metrics[metric] = learner.history[metric][-1]
            metrics[f"val_{metric}"] = learner.history[f"val_{metric}"][-1]
        metrics["loss"] = learner.history["loss"][-1]
        metrics["val_loss"] = learner.history["val_loss"][-1]

        final_metric = metrics.get(best_metric)

        # log metrics
        mlflow.log_metrics(metrics)

        # log model
        model_name = learner.model.name
        X_train = learner.X_train
        y_pred = learner.predict(X_train)
        signature = infer_signature(X_train, y_pred)
        mlflow.keras.log_model(learner.model.model,
                               model_name,
                               signature=signature,
                               save_format="tf")

    models_path = Paths(root_dir=root_dir).model / "models"
    if not models_path.exists():
        models_path.mkdir()

    final_metric_best = tracker.get_best_model_metric(experiment_name,
                                                      metric=best_metric)

    if final_metric >= final_metric_best:
        logging.info(
            "Best model found. Saving to model dir to use with Tensorflow Serving"
        )
        model_path = os.path.join(str(models_path), model)
        if not os.path.exists(model_path):
            os.mkdir(model_path)
            logging.info(f"Folder ")
        if model == "model2":
            tfmodel = TFModel(learner.model.model)
            tf.saved_model.save(
                tfmodel.model,
                os.path.join(model_path, "0"),
                signatures={"serving_default": tfmodel.prediction},
            )
            print(tfmodel)
        else:
            learner.model.model.save(os.path.join(model_path, "0"))
        logging.info(f"Model exported at {model_path}.")
    else:
        logging.info(
            f"Model logged but best performance not improved for experiment {experiment_name} (current version: {version})."
        )

    os.chdir(owd)
Example #24
def test_input_examples(pandas_df_with_all_types, dict_of_ndarrays):
    sig = infer_signature(pandas_df_with_all_types)
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()
        # the frame read without schema should match except for the binary values
        assert ((parsed_df.drop(columns=["binary"]) == _dataframe_from_json(
            tmp.path(filename)).drop(columns=["binary"])).all().all())

    # NB: Drop columns that cannot be encoded by NumpyEncoder in proto_json_utils.py
    new_df = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])

    # pass the input as dictionary instead
    with TempDir() as tmp:
        d = {name: new_df[name].values for name in new_df.columns}
        example = _Example(d)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        parsed_dict = _read_tensor_input_from_json(tmp.path(filename))
        assert d.keys() == parsed_dict.keys()
        # Asserting binary will fail since it is converted to base64 encoded strings.
        # The check above suffices that the binary input is stored.
        del d["binary"]
        for key in d:
            assert np.array_equal(d[key], parsed_dict[key])

    # input passed as numpy array
    new_df = pandas_df_with_all_types.drop(columns=["binary"])
    for col in new_df:
        input_example = new_df[col].to_numpy()
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(parsed_ary, input_example)

    # pass multidimensional array
    for col in dict_of_ndarrays:
        input_example = dict_of_ndarrays[col]
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(parsed_ary, input_example)

    # pass multidimensional array as a list
    example = np.array([[1, 2, 3]])
    with pytest.raises(TensorsNotSupportedException):
        _Example([example, example])

    # pass dict with scalars
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename))
        assert example == parsed_df.to_dict(orient="records")[0]
Example #25
def train(data_conf, model_conf, **kwargs):

    try:
        print()
        print("-----------------------------------")
        print("         Model Training            ")
        print("-----------------------------------")
        print()

        # ==============================
        # 1.0 Data Loading
        # ==============================

        # Loading of dataset
        iris = load_iris()                  #The Iris dataset is available through the scikit-learn API
        idx = list(range(len(iris.target)))
        np.random.shuffle(idx)              #We shuffle it (important if we want to split in train and test sets)
        X = iris.data[idx]
        y = iris.target[idx]

        # Load data in Pandas dataFrame
        data_pd = pd.DataFrame(data=np.column_stack((X,y)), columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
        data_pd.loc[data_pd['label']==0,'species'] = 'setosa'
        data_pd.loc[data_pd['label']==1,'species'] = 'versicolor'
        data_pd.loc[data_pd['label']==2,'species'] = 'virginica'
        data_pd.head()
        
        # Feature selection
        feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        target       = 'label'   
        
        X = data_pd[feature_cols].values
        y = data_pd[target].values

        # Creation of train and test datasets
        x_train, x_test, y_train, y_test = train_test_split(X,y,train_size=0.7, stratify=y) #stratify=y ensures that the same proportion of labels are in both train and test sets! 
        
        # Save test dataset
        test_pd = pd.DataFrame(data=np.column_stack((x_test,y_test)), columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
        test_pd.loc[test_pd['label']==0,'species'] = 'setosa'
        test_pd.loc[test_pd['label']==1,'species'] = 'versicolor'
        test_pd.loc[test_pd['label']==2,'species'] = 'virginica'
        test_df = spark.createDataFrame(test_pd)
        test_df.write.format("delta").mode("overwrite").save("/mnt/delta/{0}".format('test_data_sklearn_rf'))

        print("Step 1.0 completed: Loaded Iris dataset in Pandas")      

    except Exception as e:
        print("Errored on 1.0: data loading")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # 1.1 Model training
        # ========================================
        
        with mlflow.start_run() as run:          

            # Model definition
            max_depth = int(model_conf['hyperparameters']['max_depth'])
            n_estimators = int(model_conf['hyperparameters']['n_estimators'])
            max_features = model_conf['hyperparameters']['max_features']
            criterion = model_conf['hyperparameters']['criterion']
            class_weight = model_conf['hyperparameters']['class_weight']
            bootstrap = bool(model_conf['hyperparameters']['bootstrap'])
            clf = RandomForestClassifier(max_depth=max_depth,
                                       n_estimators=n_estimators,
                                       max_features=max_features,
                                       criterion=criterion,
                                       class_weight=class_weight,
                                       bootstrap=bootstrap,
                                       random_state=21,
                                       n_jobs=-1)          
            
            # Fit of the model on the training set
            model = clf.fit(x_train, y_train) 
            
            # Log the model within the MLflow run
            mlflow.log_param("max_depth", str(max_depth))
            mlflow.log_param("n_estimators", str(n_estimators))  
            mlflow.log_param("max_features", str(max_features))             
            mlflow.log_param("criterion", str(criterion))  
            mlflow.log_param("class_weight", str(class_weight))  
            mlflow.log_param("bootstrap", str(bootstrap))  
            mlflow.log_param("max_features", str(max_features)) 
            signature = infer_signature(x_train, clf.predict(x_train))
            mlflow.sklearn.log_model(model, 
                                   "model",
                                   registered_model_name="sklearn-rf",
                                   signature=signature)                        

        print("Step 1.1 completed: model training and saved to MLFlow")                  

    except Exception as e:
        print("Errored on step 1.1: model training")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e       

    print()     
Example #26
    def train(self, max_depth, max_leaf_nodes, model_name, output_path):
        with mlflow.start_run(run_name=self.run_origin) as run:  # NOTE: mlflow CLI ignores run_name
            if self.autolog:
                mlflow.sklearn.autolog()

            run_id = run.info.run_uuid
            experiment_id = run.info.experiment_id
            print("MLflow:")
            print("  run_id:", run_id)
            print("  experiment_id:", experiment_id)
            print("  experiment_name:", client.get_experiment(experiment_id).name)

            # MLflow tags
            mlflow.set_tag("autolog",self.autolog)
            mlflow.set_tag("save_signature",self.save_signature)
            mlflow.set_tag("mlflow.runName", self.run_origin) # mlflow CLI picks this up
            mlflow.set_tag("data_path", self.data_path)
            mlflow.set_tag("run_origin", self.run_origin)
            mlflow.set_tag("version.mlflow", mlflow.__version__)
            mlflow.set_tag("version.sklearn", sklearn.__version__)
            mlflow.set_tag("version.platform", platform.platform())
            mlflow.set_tag("version.python", platform.python_version())
            mlflow.set_tag("model_name",model_name)

            # Create model
            dt = DecisionTreeRegressor(max_depth=max_depth, max_leaf_nodes=max_leaf_nodes)
            print("Model:\n ", dt)

            # Fit and predict
            dt.fit(self.X_train, self.y_train)
            predictions = dt.predict(self.X_test)
            signature = infer_signature(self.X_train, predictions) if self.save_signature else None
            print("signature:",signature)

            # MLflow params
            print("Parameters:")
            print("  max_depth:", max_depth)
            print("  max_leaf_nodes:", max_leaf_nodes)
            if not self.autolog:
                mlflow.log_param("max_depth", max_depth)
                mlflow.log_param("max_leaf_nodes", max_leaf_nodes)

                # MLflow metrics
                rmse = np.sqrt(mean_squared_error(self.y_test, predictions))
                mae = mean_absolute_error(self.y_test, predictions)
                r2 = r2_score(self.y_test, predictions)
                print("Metrics:")
                print("  rmse:", rmse)
                print("  mae:", mae)
                print("  r2:", r2)
                mlflow.log_metric("rmse", rmse)
                mlflow.log_metric("r2", r2)
                mlflow.log_metric("mae", mae)
            
                # MLflow log model -  autolog creates a model called "model"
                mlflow.sklearn.log_model(dt, "sklearn-model", registered_model_name=model_name, signature=signature)

            # Convert sklearn model to ONNX and log model
            if self.log_as_onnx:
                from wine_quality import onnx_utils
                onnx_utils.log_model(dt, "onnx-model", model_name, self.X_test)

            # MLflow artifact - plot file
            plot_file = "plot.png"
            plot_utils.create_plot_file(self.y_test, predictions, plot_file)
            mlflow.log_artifact(plot_file)

            # Write run ID to file
            if (output_path):
                mlflow.set_tag("output_path", output_path)
                output_path = output_path.replace("dbfs:","/dbfs")
                with open(output_path, "w") as f:
                    f.write(run_id)

        return (experiment_id,run_id)
Example #27
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

iris = datasets.load_iris()
iris_train = pd.DataFrame(iris.data, columns=iris.feature_names)
clf = RandomForestClassifier(max_depth=7, random_state=0)
clf.fit(iris_train, iris.target)
signature = infer_signature(iris_train, clf.predict(iris_train))
mlflow.sklearn.log_model(clf, "iris_rf", signature=signature)
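The same call can optionally carry an input example alongside the signature; a small slice of the training frame is enough (a sketch built directly on the snippet above):

mlflow.sklearn.log_model(
    clf,
    "iris_rf",
    signature=signature,
    input_example=iris_train.iloc[:5],
)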
Example #28
    mlflow.log_param("min_impurity_decrease", "0.0")
    mlflow.log_param("min_impurity_split", "None")
    mlflow.log_param("min_samples_leaf", "1")
    mlflow.log_param("min_samples_split", "2")
    mlflow.log_param("min_weight_fraction_leaf", "0.0")
    mlflow.log_param("n_jobs", "None")
    mlflow.log_param("verbose", "0")
    mlflow.log_param("warm_start", "False")

    mlflow.set_tag("estimator_class",
                   "sklearn.ensemble._forest.RandomForestRegressor")
    mlflow.set_tag("estimator_name", "RandomForestRegressor")
    mlflow.set_tag("sparkDatasourceInfo",
                   "path=dbfs:/mnt/delta/flights/gold,version=4,format=delta")

    sig = infer_signature(X_train[:100], y_train[:100])
    mlflow.sklearn.log_model(
        rfr,
        'model_signature',
        signature=sig,
        input_example=X_train.head(10),
        registered_model_name=
        '2020-10-27_clemens_mewald@databricks_com_based on clemens_flightdelays_gold'
    )

    import shap
    shap_values = shap.TreeExplainer(rfr).shap_values(X_train[:10])
    shap_plt = shap.summary_plot(shap_values,
                                 X_train[:10],
                                 plot_type="bar",
                                 show=False)
Example #29
from mlflow.models.signature import infer_signature

plt.close()

with mlflow.start_run() as run:
    best_iteration = int(
        spark_trials.best_trial['result']['booster']['best_iteration'])
    booster = xgb.train(params=params_to_xgb(best_params),
                        dtrain=xgb.DMatrix(data=X, label=y),
                        num_boost_round=best_iteration)
    mlflow.log_params(best_params)
    mlflow.log_param('best_iteration', best_iteration)
    mlflow.xgboost.log_model(booster,
                             "xgboost",
                             input_example=X.head(),
                             signature=infer_signature(X, y))

    shap_values = shap.TreeExplainer(booster).shap_values(X, y=y)
    shap.summary_plot(shap_values,
                      X,
                      feature_names=display_cols,
                      plot_size=(14, 6),
                      max_display=10,
                      show=False)
    plt.savefig("summary_plot.png", bbox_inches="tight")
    plt.close()
    mlflow.log_artifact("summary_plot.png")

    best_run = run.info

# COMMAND ----------
Example #30
# COMMAND ----------

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])

# COMMAND ----------

# Split the dataset randomly into 70% for training and 30% for testing. Passing a seed for deterministic behavior
train, test = silverDepDF_1.randomSplit([0.7, 0.3], seed=0)
print("Departure Delay: There are %d training examples and %d test examples." %
      (train.count(), test.count()))

# COMMAND ----------

from mlflow.models.signature import infer_signature
signature = infer_signature(train.drop("DEP_DELAY"), train.select("DEP_DELAY"))

# COMMAND ----------

signature

# COMMAND ----------

display(silverDepDF_1)

# COMMAND ----------

import mlflow
import mlflow.spark
from mlflow.models.signature import infer_signature
# turn on autologging