Example #1
    def mlflow_run(self, df, r_name="Lab-1:RF Petrol Regression Experiment"):
        """
        This method trains, computes metrics, and logs all metrics, parameters,
        and artifacts for the current run
        :param df: pandas dataFrame
        :param r_name: Name of the experiment as logged by MLflow
        :return: MLflow Tuple (ExperimentID, runID)
        """

        with mlflow.start_run(run_name=r_name) as run:
            # get all rows and columns but the last column
            X = df.iloc[:, 0:4].values
            # get the last column, which is what we want to predict
            y = df.iloc[:, 4].values

            # create train and test data
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=0)

            # Feature Scaling
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            X_test = sc.transform(X_test)

            # train and predict
            self.rf.fit(X_train, y_train)
            y_pred = self.rf.predict(X_test)

            # Log model and params using the MLflow sklearn APIs
            mlflow.sklearn.log_model(self.rf, "random-forest-reg-model")
            mlflow.log_params(self.params)

            # compute  metrics
            mae = metrics.mean_absolute_error(y_test, y_pred)
            mse = metrics.mean_squared_error(y_test, y_pred)
            rsme = np.sqrt(mse)
            r2 = metrics.r2_score(y_test, y_pred)

            # Log metrics
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("rsme", rsme)
            mlflow.log_metric("r2", r2)

            # update global class instance variable with values
            self.rsme.append(rsme)
            self.r2.append(r2)
            self.estimators.append(self.params["n_estimators"])

            # plot RSME graph and save as artifacts
            (fig, ax) = Utils.plot_graphs(self.estimators, self.rsme,
                                          "Random Forest Estimators",
                                          "Root Mean Square",
                                          "Root Mean Square vs Estimators")

            # get current run and experiment id
            runID = run.info.run_uuid
            experimentID = run.info.experiment_id

            # create temporary artifact file name and log artifact
            temp_file_name = Utils.get_temporary_directory_path(
                "rsme_estimators-", ".png")
            temp_name = temp_file_name.name
            try:
                fig.savefig(temp_name)
                mlflow.log_artifact(temp_name, "rsme_estimators_plots")
            finally:
                temp_file_name.close()  # Delete the temp file

            # plot R2 graph and save as artifacts
            (fig_2, ax) = Utils.plot_graphs(self.estimators, self.r2,
                                            "Random Forest Estimators", "R2",
                                            "R2 vs Estimators")

            # create temporary artifact file name and log artifact
            temp_file_name = Utils.get_temporary_directory_path(
                "r2-estimators-", ".png")
            temp_name = temp_file_name.name
            try:
                fig_2.savefig(temp_name)
                mlflow.log_artifact(temp_name, "r2_estimators_plots")
            finally:
                temp_file_name.close()  # Delete the temp file

            # print some data
            print("-" * 100)
            print(
                "Inside MLflow Run with run_id {} and experiment_id {}".format(
                    runID, experimentID))
            print("Estimator trees        :", self.params["n_estimators"])
            print("Estimator trees depth  :", self.params["max_depth"])
            print('Mean Absolute Error    :', mae)
            print('Mean Squared Error     :', mse)
            print('Root Mean Squared Error:', rsme)
            print('R2                     :', r2)

            return (experimentID, runID)
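The method above relies on instance attributes (self.rf, self.params, self.rsme, self.r2, self.estimators) and a new_instance factory that are defined elsewhere in the class and not shown in this excerpt. A minimal sketch of what that surrounding class could look like, assuming a plain RandomForestRegressor wrapper; this is an illustration, not the original class definition:

from sklearn.ensemble import RandomForestRegressor


class RFRModel:
    # illustrative constructor matching the attributes used in mlflow_run above
    def __init__(self, params):
        self.params = params                       # hyperparameters logged per run
        self.rf = RandomForestRegressor(**params)  # the model trained and logged
        self.rsme = []                             # RMSE per run (author's spelling kept)
        self.r2 = []                               # R2 per run
        self.estimators = []                       # n_estimators per run (x-axis of the plots)

    @classmethod
    def new_instance(cls, params):
        return cls(params)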
Example #2
        'batch_size': 128
    }, {
        'input_units': 256,
        'input_shape': (4, ),
        'activation': 'relu',
        'optimizer': 'adam',
        'loss': 'mse',
        'epochs': 300,
        'batch_size': 128
    }, {
        'input_units': 512,
        'input_shape': (4, ),
        'activation': 'relu',
        'optimizer': 'adam',
        'loss': 'mse',
        'epochs': 500,
        'batch_size': 256
    }]

    dataset = Utils.load_data("data/petrol_consumption.csv")
    # get all the independent feature attributes
    X = dataset.iloc[:, 0:4].values
    # get the values of the last column, the dependent variable,
    # which is what we want to predict: the petrol consumption
    y = dataset.iloc[:, 4].values
    for params in params_list:
        keras_model = KerasRegModel(params)
        (runID, experimentID) = keras_model.train_model(X, y)
        print("MLflow completed with run_id {} and experiment_id {}".format(
            runID, experimentID))
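The fragment above begins mid-list, so the first entries of params_list are cut off. Each entry is a dictionary of Keras hyperparameters with the keys shown; a single illustrative entry (the values here are placeholders, not the missing originals) looks like this:

example_params = {
    'input_units': 128,     # width of the input Dense layer
    'input_shape': (4, ),   # four feature columns in the petrol dataset
    'activation': 'relu',
    'optimizer': 'adam',
    'loss': 'mse',
    'epochs': 200,
    'batch_size': 128
}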
Example #3
    def mlflow_run(self, df, r_name="Lab-4:RF Experiment Model"):
        """
        Override the base class mlflow_run for these experimental runs.
        This method trains the model, evaluates it, computes the metrics, and
        logs all the relevant metrics, artifacts, and models.
        :param df: pandas dataFrame
        :param r_name: name of the experiment run
        :return:  MLflow Tuple (ExperimentID, runID)
        """

        with mlflow.start_run(run_name=r_name) as run:
            X_train, X_test, y_train, y_test = train_test_split(
                df.drop(["price"], axis=1),
                df[["price"]].values.ravel(),
                random_state=42)
            self.rf.fit(X_train, y_train)
            predictions = self.rf.predict(X_test)

            # Log model and parameters
            mlflow.sklearn.log_model(self.rf, "random-forest-model")

            # Note we log all params as a single dictionary instead of logging each parameter individually
            mlflow.log_params(self.params)

            # Log params
            #[mlflow.log_param(param, value) for param, value in self.params.items()]

            # Create metrics
            mse = metrics.mean_squared_error(y_test, predictions)
            rmse = np.sqrt(mse)
            mae = metrics.mean_absolute_error(y_test, predictions)
            r2 = metrics.r2_score(y_test, predictions)

            # Log metrics
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("rsme", rmse)
            mlflow.log_metric("r2", r2)

            # get experimentalID and runID
            runID = run.info.run_uuid
            experimentID = run.info.experiment_id

            # Create feature importances and save them as an artifact.
            # This allows us to remove the least important features from the
            # dataset with each iteration if they have no effect on the
            # model's predictive power. Note the target column ("price") is
            # excluded so the names line up with feature_importances_.
            importance = pd.DataFrame(
                list(zip(df.drop(["price"], axis=1).columns,
                         self.rf.feature_importances_)),
                columns=["Feature", "Importance"]).sort_values("Importance",
                                                               ascending=False)

            # Log importance file as feature artifact
            temp_file_name = Utils.get_temporary_directory_path(
                "feature-importance-", ".csv")
            temp_name = temp_file_name.name
            try:
                importance.to_csv(temp_name, index=False)
                mlflow.log_artifact(temp_name, "feature-importance-files")
            finally:
                temp_file_name.close()  # Delete the temp file

            # Create residual plots and image directory
            # Residuals R = observed value - predicted value
            (plt, fig, ax) = Utils.plot_residual_graphs(
                predictions, y_test, "Predicted values for Price ($)",
                "Residual", "Residual Plot")

            # Log residuals images
            temp_file_name = Utils.get_temporary_directory_path(
                "residuals-", ".png")
            temp_name = temp_file_name.name
            try:
                fig.savefig(temp_name)
                mlflow.log_artifact(temp_name, "residuals-plots")
            finally:
                temp_file_name.close()  # Delete the temp file

            print("-" * 100)
            print("Inside MLflow {} Run with run_id {} and experiment_id {}".
                  format(r_name, runID, experimentID))
            print("  mse: {}".format(mse))
            print(" rmse: {}".format(rmse))
            print("  mae: {}".format(mae))
            print("  R2 : {}".format(r2))

            return (experimentID, runID)
Example #4
            return (experimentID, runID)


#
# TODO in Lab/Homework for Some Experimental runs
#
# 1. Consult the RandomForestRegressor documentation
# 2. Change or add parameters, such as the depth of the tree or random_state (e.g., 42)
# 3. Change or alter the range of runs and the increments of n_estimators
# 4. Check in the MLflow UI whether the metrics are affected
# challenge-1: create mean squared error and R2 artifacts and save them for each run (see the sketch after this example)

if __name__ == '__main__':
    # load and print dataset
    dataset = Utils.load_data("data/petrol_consumption.csv")
    Utils.print_pandas_dataset(dataset)
    # iterate over several runs with different parameters, stepping up by 50
    # limiting to 300
    max_depth = 0
    for n in range(50, 350, 50):
        max_depth = max_depth + 2
        params = {
            "n_estimators": n,
            "max_depth": max_depth,
            "random_state": 42
        }
        rfr = RFRModel.new_instance(params)
        (experimentID, runID) = rfr.mlflow_run(dataset)
        print(
            "MLflow Run completed with run_id {} and experiment_id {}".format(
                runID, experimentID))
        print("-" * 100)
Example #5
#
# Lab/Homework for Some Experimental runs
#
# 1. Consult the RandomForestRegressor documentation
# 2. Change or add parameters, such as the depth of the tree or random_state (e.g., 42)
# 3. Change or alter the range of runs and the increments of n_estimators
# 4. Check in the MLflow UI whether the metrics are affected

if __name__ == '__main__':
    # TODO: add more parameter dictionaries to this list to create
    # four experiments with different parameters; each experiment runs
    # its own instance of the model with the supplied parameters.
    params_list = [{"n_estimators": 200, "max_depth": 6, "random_state": 42}]
    # load the data
    dataset = Utils.load_data("data/airbnb-cleaned-mlflow.csv")

    # run these experiments, each with its own instance of model with the supplied parameters.
    for params in params_list:
        rfr = RFFExperimentModel.new_instance(params)
        experiment = "Experiment with {} trees".format(params['n_estimators'])
        (experimentID, runID) = rfr.mlflow_run(dataset, experiment)
        print(
            "MLflow Run completed with run_id {} and experiment_id {}".format(
                runID, experimentID))
        print("-" * 100)

    # Use MLflowClient API to query programmatically any previous run info under an experiment ID
    # consult https://mlflow.org/docs/latest/python_api/mlflow.tracking.html
    client = MlflowClient()
    run_list = client.list_run_infos(experimentID)
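    # The run_list above is not consumed in this excerpt. A minimal sketch of
    # how the returned objects could be inspected (assuming MLflow 1.x, where
    # list_run_infos returns mlflow.entities.RunInfo instances):
    for run_info in run_list:
        print("run_id: {}  status: {}  artifact_uri: {}".format(
            run_info.run_id, run_info.status, run_info.artifact_uri))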
Example #6
    def mlflow_run(self,
                   df,
                   r_name="Lab-2:RF Bank Note Classification Experiment"):
        """
        This method trains, computes metrics, and logs all metrics, parameters,
        and artifacts for the current run
        :param df: pandas dataFrame
        :param r_name: Name of the experiment as logged by MLflow
        :return: MLflow Tuple (ExperimentID, runID)
        """

        with mlflow.start_run(run_name=r_name) as run:
            # get all rows and columns but the last column, which is our class
            X = df.iloc[:, 0:4].values
            # get all observed values in the last column, which is what we want to predict
            y = df.iloc[:, 4].values

            # create train and test data
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=0)

            # Feature Scaling
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            X_test = sc.transform(X_test)

            # train and predict
            self.rf.fit(X_train, y_train)
            y_pred = self.rf.predict(X_test)

            # Log model and params using the MLflow sklearn APIs
            mlflow.sklearn.log_model(self.rf, "random-forest-class-model")
            mlflow.log_params(self.params)

            # compute evaluation metrics
            acc = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            conf_matrix = confusion_matrix(y_test, y_pred)

            # get confusion matrix values; scikit-learn lays out the matrix
            # with true labels as rows and predicted labels as columns, so with
            # class 1 as the positive class the layout is [[tn, fp], [fn, tp]]
            true_negative = conf_matrix[0][0]
            true_positive = conf_matrix[1][1]
            false_positive = conf_matrix[0][1]
            false_negative = conf_matrix[1][0]

            # get the classification report as a dictionary
            class_report = classification_report(y_test,
                                                 y_pred,
                                                 output_dict=True)
            recall_0 = class_report['0']['recall']
            f1_score_0 = class_report['0']['f1-score']
            recall_1 = class_report['1']['recall']
            f1_score_1 = class_report['1']['f1-score']

            # log metrics
            mlflow.log_metric("accuracy_score", acc)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("true_positive", true_positive)
            mlflow.log_metric("true_negative", true_negative)
            mlflow.log_metric("false_positive", false_positive)
            mlflow.log_metric("false_negative", false_negative)
            mlflow.log_metric("recall_0", recall_0)
            mlflow.log_metric("f1_score_0", f1_score_0)
            mlflow.log_metric("recall_1", recall_1)
            mlflow.log_metric("f1_score_1", f1_score_1)

            # get current run and experiment id
            runID = run.info.run_uuid
            experimentID = run.info.experiment_id

            # create confusion matrix images
            (plt, fig, ax) = Utils.plot_confusion_matrix(
                y_test,
                y_pred,
                y,
                title="Bank Note Classification Confusion Matrix")

            # create temporary artifact file name and log artifact
            temp_file_name = Utils.get_temporary_directory_path(
                "confusion_matrix-", ".png")
            temp_name = temp_file_name.name
            try:
                fig.savefig(temp_name)
                mlflow.log_artifact(temp_name, "confusion_matrix_plots")
            finally:
                temp_file_name.close()  # Delete the temp file

            # print some data
            print("-" * 100)
            print(
                "Inside MLflow Run with run_id {} and experiment_id {}".format(
                    runID, experimentID))
            print("Estimators trees:", self.params["n_estimators"])
            print(conf_matrix)
            print(classification_report(y_test, y_pred))
            print("Accuracy Score:", acc)
            print("Precision     :", precision)

            return (experimentID, runID)
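As a cross-check on the index handling above: scikit-learn lays out a binary confusion matrix with true labels as rows and predicted labels as columns, so it can be unpacked directly with ravel(). A small self-contained sketch with made-up labels:

from sklearn.metrics import confusion_matrix

# toy labels; class 1 is treated as the positive class
y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]

# ravel() flattens the 2x2 matrix row by row: [[tn, fp], [fn, tp]]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("tn={} fp={} fn={} tp={}".format(tn, fp, fn, tp))  # tn=1 fp=1 fn=1 tp=2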
Example #7
            return (experimentID, runID)


#
# Lab/Homework for Some Experimental runs
#
# 1. Consult the RandomForestClassifier documentation
# 2. Change or add parameters, such as the depth of the tree or random_state (e.g., 42)
# 3. Change or alter the range of runs and the increments of n_estimators
# 4. Check in the MLflow UI whether the metrics are affected
# 5. Log the confusion matrix, recall, and F1-score as metrics
# Nice blog: https://joshlawman.com/metrics-classification-report-breakdown-precision-recall-f1/

if __name__ == '__main__':
    # load and print dataset
    dataset = Utils.load_data("data/bill_authentication.csv")
    Utils.print_pandas_dataset(dataset)
    # iterate over several runs with different parameters
    # TODO in the Lab: change these parameters (n_estimators and random_state)
    # with each iteration.
    # Does that change the metrics and accuracy?
    # start with n=10 and step by 10 while n < 120
    for n in range(10, 120, 10):
        params = {"n_estimators": n, "random_state": 42}
        rfr = RFCModel.new_instance(params)
        (experimentID, runID) = rfr.mlflow_run(dataset)
        print(
            "MLflow Run completed with run_id {} and experiment_id {}".format(
                runID, experimentID))
        print("-" * 100)
Example #8
            print("Estimator trees        :", self.params["n_estimators"])
            print('Mean Absolute Error    :', mae)
            print('Mean Squared Error     :', mse)
            print('Root Mean Squared Error:', rsme)
            print('R2                     :', r2)

            return (experimentID, runID)


#
# TODO in Lab/Homework for Some Experimental runs
#
# 1. Consult RandomForest documentation
# 2. Run the baseline model
# 3. Check in MLflow UI for parameters, metrics, and artifacts

if __name__ == '__main__':
    # load and print dataset
    dataset = Utils.load_data("data/airbnb-cleaned-mlflow.csv")
    Utils.print_pandas_dataset(dataset)
    #
    # create the baseline model parameters;
    # this is our benchmark model to compare experimental results against
    #
    params = {"n_estimators": 100, "max_depth": 3, "random_state": 0}
    rfr = RFRBaseModel.new_instance(params)
    (experimentID, runID) = rfr.mlflow_run(dataset)
    print("MLflow completed with run_id {} and experiment_id {}".format(
        runID, experimentID))
    print("-" * 100)
Example #9
    def mlflow_run(self, df, r_name="Lab-1:RF Petrol Regression Experiment"):
        """
        This method trains, computes metrics, and logs all metrics, parameters,
        and artifacts for the current run
        :param df: pandas dataFrame
        :param r_name: Name of the run as logged by MLflow
        :return: MLflow Tuple (ExperimentID, runID)
        """

        with mlflow.start_run(run_name=r_name) as run:
            # get all the independent feature attributes
            X = df.iloc[:, 0:4].values
            # get the values of the last column, the dependent variable,
            # which is what we want to predict: the petrol consumption
            y = df.iloc[:, 4].values

            # create train and test data
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=0)

            # Feature scaling, though not strictly necessary for Random Forests.
            # z = (X - u) / s, where u is the mean and s the standard deviation
            # get a handle to the transformer
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            X_test = sc.transform(X_test)

            # train and predict
            self.rf.fit(X_train, y_train)
            y_pred = self.rf.predict(X_test)

            # Log model and params using the MLflow sklearn APIs
            mlflow.sklearn.log_model(self.rf, "random-forest-reg-model")
            mlflow.log_params(self.params)

            # compute metrics; R2 is a statistical measure of how well the
            # data fit the model: the higher the value, the better the fit.
            mae = metrics.mean_absolute_error(y_test, y_pred)
            mse = metrics.mean_squared_error(y_test, y_pred)
            rsme = np.sqrt(mse)
            r2 = metrics.r2_score(y_test, y_pred)

            # Log metrics
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("rsme", rsme)
            mlflow.log_metric("r2", r2)

            # update global class instance variable with values
            self.rsme.append(rsme)
            self.estimators.append(self.params["n_estimators"])

            # plot graphs and save as artifacts
            (fig, ax) = Utils.plot_graphs(self.estimators, self.rsme,
                                          "Random Forest Estimators",
                                          "Root Mean Square",
                                          "Root Mean Square vs Estimators")

            # get current run and experiment id
            runID = run.info.run_uuid
            experimentID = run.info.experiment_id

            # create temporary artifact file name and log artifact
            temp_file_name = Utils.get_temporary_directory_path(
                "rsme_estimators-", ".png")
            temp_name = temp_file_name.name
            try:
                fig.savefig(temp_name)
                mlflow.log_artifact(temp_name, "rsme_estimators_plots")
            finally:
                temp_file_name.close()  # Delete the temp file

            # print some data
            print("-" * 100)
            print(
                "Inside MLflow Run with run_id {} and experiment_id {}".format(
                    runID, experimentID))
            print("Estimator trees        :", self.params["n_estimators"])
            print('Mean Absolute Error    :', mae)
            print('Mean Squared Error     :', mse)
            print('Root Mean Squared Error:', rsme)
            print('R2                     :', r2)

            return (experimentID, runID)
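For reference on the R2 comment in the example above: metrics.r2_score compares the residual sum of squares to the total variance of the target. A minimal numpy sketch of the same quantity:

import numpy as np


def r2_manual(y_true, y_pred):
    # R2 = 1 - SS_res / SS_tot, where SS_res is the residual sum of squares
    # and SS_tot is the total sum of squares around the mean of y_true
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    return 1.0 - ss_res / ss_tot


print(r2_manual([3.0, 5.0, 7.0], [2.5, 5.0, 7.5]))  # 0.9375, close to 1.0 for a good fit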