Example #1
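Reports per-category inference statistics to MCenter and raises a health alert whenever any inference in the reporting interval fell below the confidence threshold. The snippets on this page are excerpts and omit their imports; as a minimal sketch of the alert pattern they share, assuming the ParallelM `parallelm.mlops` package (the import path and the values below are illustrative, not taken from the example):

from parallelm.mlops import mlops

mlops.init()  # attach this process to the MCenter agent

low_conf = 7       # illustrative: count of low-confidence inferences
total = 100        # illustrative: total inferences in the interval
conf_thresh = 0.5  # illustrative: confidence threshold

if low_conf > 0:
    # A title plus a free-form description, as in the examples below
    mlops.health_alert(
        "Low confidence alert",
        "{}% of inferences had confidence below {}%".format(
            low_conf * 100.0 / total, conf_thresh * 100))

mlops.done()  # flush stats/events and detach
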
    def report_stats(self):

        if self.get_low_conf() > 0:
            mlops.health_alert(
                "Low confidence alert",
                "{}% of inferences had confidence below {}%".format(
                    self.get_low_conf() * 100.0 / self.get_total(),
                    self._conf_thresh * 100))

        for i in range(0, self._num_categories):
            print(i, "label_total =", self._label_hist[i], "infer_total = ",
                  self._infer_hist[i])

        print("total = ", self.get_total(), "total_correct = ",
              self.get_correct())

        self._infer_tbl.add_row(str(self.get_total()), [
            self._infer_hist[0], self._infer_hist[1], self._infer_hist[2],
            self._infer_hist[3], self._infer_hist[4], self._infer_hist[5],
            self._infer_hist[6], self._infer_hist[7], self._infer_hist[8],
            self._infer_hist[9]
        ])

        if self._stats_type != "none":
            mlops.set_stat("correct_percent",
                           self.get_correct() * 100.0 / self.get_total())
            mlops.set_stat(self._infer_tbl)
Example #2
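Publishes the confidence histogram as a bar graph via `mlops.set_stat`, fires a health alert when the share of low-confidence predictions in the reporting interval exceeds the configured percentage, and then resets the interval counters.
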
    def report_confidence(self, total_predictions):

        if self._track_conf == 0:
            return

        ## MLOps start
        # Show the prediction distribution as a bar graph
        self._conf_graph.data(self._conf_hist)
        mlops.set_stat(self._conf_graph)
        ## MLOps end

        # Percentage of low confidence predictions in this reporting interval
        low_conf_percent = self._low_confidence_predictions * 100.0 / total_predictions

        print("low confidence predictions: {} ({})%".format(
            self._low_confidence_predictions, low_conf_percent))

        if low_conf_percent > self._conf_percent:
            msg = "Low confidence: {}% of inferences had confidence below {}%".format(
                low_conf_percent, self._conf_thresh)
            print(msg)

            ## MLOps start
            mlops.health_alert("Low confidence alert", msg)
            ## MLOps end

        # reset counters for next round
        for i in range(0, 9):
            self._conf_hist[i] = 0
        self._low_confidence_predictions = 0
Example #3
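A basic test that emits one alert of each kind, first through the convenience helpers (`health_alert`, `data_alert`, `system_alert`) and again through explicit `mlops.event` calls with alert objects.
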
def test_events_basic():

    print("Testing events")
    mlops.health_alert("Health_Alert", "Alert while running test")
    mlops.data_alert("Data_Alert", "Alert from test")
    mlops.system_alert("System_Alert", "Operational alert from test")

    mlops.event(
        SystemAlert(label="sys_alert_1", description="sys_alert_1 desc"))
    mlops.event(
        HealthAlert(label="health_alert_1", description="health_alert_1 desc"))
    mlops.event(
        DataAlert(label="data_alert_1", description="data_alert_1 desc"))
Example #4
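Reads a prediction file with pandas (note the regex separator `' |,'`), reports the prediction distribution as a `BarGraph`, and, when the `alert` parameter is set, raises a low-confidence health alert based on the configured confidence and sample thresholds.
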
    def _report_stats(self, file_path):
        self._logger.info(" *** generate stats .. params:{}".format(
            self._params))
        self._logger.info(" *** Source file {}".format(file_path))

        # Read the file
        data = pd.read_csv(file_path, sep=' |,', header=None, skiprows=1)
        data = data.rename(index=str,
                           columns={
                               1: "label",
                               2: "confidence0",
                               3: "confidence1"
                           })
        prediction_distribution = data['label'].value_counts()
        column_names = np.array(
            prediction_distribution.index).astype(str).tolist()

        # Initialize mlops
        mlops.init()

        # Report a bar graph
        bar = BarGraph().name("Prediction Distribution").cols(
            np.array(prediction_distribution.index).astype(str).tolist()).data(
                prediction_distribution.values.tolist())
        mlops.set_stat(bar)

        # Generate an alert on low confidence if the argument is set to true
        if (self._params["alert"]):
            index = data.values[:, 1].astype(int)
            confidence = data.values[:, 2:4]
            confidence_per_prediction = confidence[:, index][:, 0] * 100
            low_conf_percent = len(confidence_per_prediction[
                confidence_per_prediction < self._params["confidence"]]) / len(
                    confidence_per_prediction) * 100
            if low_conf_percent > self._params["samples"]:
                msg = "Low confidence: {}% of inferences had confidence below {}%".format(
                    low_conf_percent, self._params["confidence"])
                print(msg)
                mlops.health_alert("Low confidence alert", msg)

        mlops.done()

        return []
Example #5
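Emits one alert of each kind, then polls `mlops.get_events` with retries until the alerts have propagated to the database, and finally prints the active alerts alongside any older ones.
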
def test_alerts_fetching():

    print("test_alerts_fetching")
    mlops.health_alert("Health_Alert-2",
                       "Health alert generated by 'test_alerts_fetching'")
    mlops.data_alert("Data_Alert-2",
                     "Data alert generated by 'test_alerts_fetching'")
    mlops.system_alert(
        "System_Alert-2",
        "Operational (System) alert generated by 'test_alerts_fetching'")

    # It takes time for the alerts to propagate up to the database
    active_ion_alerts = None
    for counter in range(FETCH_ALERTS_NUM_RETRIES):
        active_ion_alerts = mlops.get_events()
        if active_ion_alerts is None or active_ion_alerts.empty:
            print("Did not find alerts, trying again...")
            time.sleep(SLEEP_TIME_PER_RETRY_SEC)
            continue
        break

    assert active_ion_alerts is not None and not active_ion_alerts.empty

    all_alerts = mlops.get_events()
    print("\n\n\nALL ALERTS\n{}".format(all_alerts))

    num_all_alerts = len(all_alerts)
    num_active_ion_alerts = len(active_ion_alerts)

    assert num_active_ion_alerts <= num_all_alerts

    active_ion_alerts_ids = active_ion_alerts['id'].tolist()
    print("List of ides: {}".format(active_ion_alerts_ids))
    print("Active ion alerts ({}):".format(num_active_ion_alerts))

    for index, alert in active_ion_alerts.iterrows():
        _print_alert(alert)

    print("Other alerts ({}):".format(num_all_alerts - num_active_ion_alerts))
    for index, alert in all_alerts.iterrows():
        if alert["id"] not in active_ion_alerts_ids:
            _print_alert(alert)
Example #6
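A fuller variant of Example #1: it alerts only when the low-confidence percentage exceeds a threshold, reports both a table and a bar graph, updates the predefined prediction counter, and resets its counters at the end of each reporting interval.
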
    def report_stats(self):

        # what percentage of the predictions had confidences less than the threshold
        low_conf_percent = (self.get_low_conf() * 100.0 /
                            self.get_report_interval())

        if low_conf_percent > self._conf_percent:
            mlops.health_alert(
                "Low confidence alert",
                "{}% of inferences had confidence below {}%".format(
                    low_conf_percent, self._conf_thresh * 100))

        for i in range(0, self._num_categories):
            print(i, "label_total =", self._label_hist[i], "infer_total = ",
                  self._infer_hist[i])

        print("total = ", self.get_total(), "total_correct = ",
              self.get_correct())

        category_data = [
            self._infer_hist[0], self._infer_hist[1], self._infer_hist[2],
            self._infer_hist[3], self._infer_hist[4], self._infer_hist[5],
            self._infer_hist[6], self._infer_hist[7], self._infer_hist[8],
            self._infer_hist[9]
        ]

        self._infer_tbl.add_row(str(self.get_cum_total()), category_data)
        self._infer_bar.data(category_data)

        if self._stats_type != "none":
            mlops.set_stat("correct_percent",
                           self.get_correct() * 100.0 / self.get_total())
            mlops.set_stat(self._infer_tbl)
            mlops.set_stat(self._infer_bar)
            # Update the total prediction count with all the new predictions since we last reported.
            mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT,
                           self.get_report_interval())
            print("Completed {} predictions".format(
                self.get_report_interval()))

        self.reset()
Example #7
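Extends Example #3 with canary alerts, issued both through `mlops.canary_alert` and through a `CanaryAlert` event object, plus a raw `mlops.set_event` call.
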
def test_events_basic():

    print("Testing events")
    mlops.health_alert("Health_Alert", "Alert while running test")
    mlops.data_alert("Data_Alert", "Alert from test")
    mlops.system_alert("System_Alert", "Operational alert from test")
    mlops.canary_alert("Canary_Alert", True, 5, 1)

    mlops.event(
        SystemAlert(label="sys_alert_1", description="sys_alert_1 desc"))
    mlops.event(
        HealthAlert(label="health_alert_1", description="health_alert_1 desc"))
    mlops.event(
        DataAlert(label="data_alert_1", description="data_alert_1 desc"))
    mlops.event(
        CanaryAlert(label="canary_alert_1",
                    is_healthy=False,
                    score=0.0,
                    threshold=0.1))

    mlops.set_event(name="system alert",
                    type=EventType.System,
                    data="blablabla")
Example #8
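An end-to-end training script: it trains XGBoost and Random Forest pipelines on loan data and reports accuracy, label and prediction distributions, confusion matrices, classification reports, ROC/AUC, KS, and PSI to MCenter, raising health alerts when the AUC, KS, or PSI thresholds are violated. The script calls an `export_bar_table` helper that is not shown on this page; a plausible definition, inferred from the inline `BarGraph` usage in Examples #4 and #12 (the import paths are assumptions):

import numpy as np
from parallelm.mlops import mlops
from parallelm.mlops.stats.bar_graph import BarGraph

def export_bar_table(bar_names, bar_data, title_name):
    # Report a labeled bar graph to MCenter, mirroring the inline
    # BarGraph usage elsewhere on this page.
    bar = BarGraph().name(title_name).cols(
        np.array(bar_names).astype(str).tolist()).data(
            np.array(bar_data).tolist())
    mlops.set_stat(bar)
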
def main():
    pm_options = parse_args()

    print("PM: Configuration:")

    print("PM: # Validation Split:          [{}]".format(pm_options.validation_split))

    print("PM: # AUC Threshold:             [{}]".format(pm_options.auc_threshold))
    print("PM: # KS Threshold:              [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(pm_options.psi_threshold))

    print("PM: # Estimators:                [{}]".format(pm_options.n_estimators))
    print("PM: # Max Depth:                 [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate:             [{}]".format(pm_options.learning_rate))
    print("PM: # Min Child Weight:          [{}]".format(pm_options.min_child_weight))
    print("PM: # Objective:                 [{}]".format(pm_options.objective))
    print("PM: # Gamma:                     [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step:            [{}]".format(pm_options.max_delta_step))
    print("PM: # Subsample:                 [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha:                 [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda:                [{}]".format(pm_options.reg_lambda))
    print("PM: # Scale Pos Weight:          [{}]".format(pm_options.scale_pos_weight))

    print("PM: # Input File:                [{}]".format(pm_options.input_file))
    print("PM: Output model:                [{}]".format(pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # mlops Init
    mlops.init()

    # Loading and cleaning the data
    # This section goes through the various stages of loading and cleaning the data:
    loan_df = pd.read_csv(pm_options.input_file)

    # Cleaning NAs
    print("dataset_size = ", loan_df.shape[0])
    mlops.set_data_distribution_stat(loan_df)
    print("number of NAs per columns = ",  loan_df.isnull().sum())
    loan_df = loan_df.dropna()
    print("dataset_size without NA rows= ", loan_df.shape[0])

    # Marking the label field. remove it from the features set:
    y = loan_df["bad_loan"]
    X = loan_df.drop("bad_loan", axis=1)

    from sklearn_pandas import DataFrameMapper

    # Splitting the data to train and test sets:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=float(pm_options.validation_split),
                                                        random_state=42)

    All_columns = X_train.columns.tolist()
    categorical_columns = ["verification_status", "addr_state", "purpose", "home_ownership", "term"]
    mapper_list = []
    for d in All_columns:
        if d in categorical_columns:
            mapper_list.append(([d], OneHotEncoder(handle_unknown='ignore')))
        else:
            mapper_list.append(([d], MinMaxScaler()))

    mapper = DataFrameMapper(mapper_list)

    # ## Training
    # XGBoost Training:
    import xgboost as xgb
    xgboost_model = xgb.XGBClassifier(max_depth=int(pm_options.max_depth),
                                    min_child_weight=int(pm_options.min_child_weight),
                                    learning_rate=float(pm_options.learning_rate),
                                    n_estimators=int(pm_options.n_estimators),
                                    silent=True,
                                    objective=pm_options.objective,
                                    gamma=float(pm_options.gamma),
                                    max_delta_step=int(pm_options.max_delta_step),
                                    subsample=float(pm_options.subsample),
                                    colsample_bytree=1,
                                    colsample_bylevel=1,
                                    reg_alpha=float(pm_options.reg_alpha),
                                    reg_lambda=float(pm_options.reg_lambda),
                                    scale_pos_weight=float(pm_options.scale_pos_weight),
                                    seed=1,
                                    n_jobs=1,
                                    missing=None)

    final_model = Pipeline([("mapper", mapper), ("xgboost", xgboost_model)])

    final_model.fit(X_train, y_train)
    # Random Forest Training
    from sklearn.ensemble import RandomForestClassifier
    rf_only_model = RandomForestClassifier(
        n_estimators=int(pm_options.n_estimators),
        max_depth=int(pm_options.max_depth) + 3,
        random_state=42,
        n_jobs=1,
        class_weight="balanced")
    rf_model = Pipeline([("mapper", mapper), ("rf", rf_only_model)])

    rf_model.fit(X_train, y_train)

    # ## Statistics on Test Dataset

    # Prediction and prediction distribution
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)
    rf_pred_labels = rf_model.predict(X_test)
    rf_pred_probs = rf_model.predict_proba(X_test)

    # Accuracy calculation
    # Accuracy for the xgboost model
    accuracy = accuracy_score(y_test, pred_labels)
    print("XGBoost Accuracy value: {0}".format(accuracy))
    #     Output accuracy of the chosen model using MCenter
    mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

    # Accuracy for the RF model
    rf_accuracy = accuracy_score(y_test, rf_pred_labels)
    print("RF Accuracy value: {0}".format(rf_accuracy))
    #     Output accuracy of the chosen model using MCenter
    mlops.set_stat("RF Accuracy", rf_accuracy, st.TIME_SERIES)

    # Label distribution:
    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    print("Validation Actual Label distributions: \n {0}".format(label_distribution))
    # Output Label distribution as a BarGraph using MCenter
    export_bar_table(label_distribution[:,0], label_distribution[:,1], "Validation - Actual Label Distribution")

    # Prediction distribution and prediction confidence distribution
    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Validation Prediction Label Distributions: \n {0}".format(pred_label_distribution))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_label_distribution[:,0], pred_label_distribution[:,1], "Validation - XGBoost Prediction Distribution")

    rf_pred_value, rf_pred_counts = np.unique(rf_pred_labels, return_counts=True)
    rf_pred_label_distribution = np.asarray((rf_pred_value, rf_pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("RF Validation Prediction Label Distributions: \n {0}".format(rf_pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(rf_pred_label_distribution[:,0], rf_pred_label_distribution[:,1], "Validation - RF Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class])/(float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".format(average_confidence))

    #  Pred confidence per label
    rf_label_number = len(rf_pred_counts)
    rf_average_confidence = np.zeros(rf_label_number)
    rf_max_pred_probs = rf_pred_probs.max(axis=1)
    for i in range(0, rf_label_number):
        rf_index_class = np.where(rf_pred_labels == i)[0]
        if rf_pred_counts[i] > 0:
            rf_average_confidence[i] = np.sum(rf_max_pred_probs[rf_index_class])/(float(rf_pred_counts[i]))
        else:
            rf_average_confidence[i] = 0
    print("RF Validation Average Prediction confidence per label: \n {0}".format(rf_average_confidence))

    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence, "Validation - XGBoost Average confidence per class")
    export_bar_table(rf_pred_value, rf_average_confidence, "Validation - RF Average confidence per class")

    # Confusion Matrix
    # XGBoost Confusion Matrix
    confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
    print("Confusion Matrix for XGBoost: \n {0}".format(confmat))
    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(confmat, "XGBoost")
    # RF Confusion Matrix
    rf_confmat = confusion_matrix(y_true=y_test, y_pred=rf_pred_labels)
    print("Confusion Matrix for RF: \n {0}".format(rf_confmat))
    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(rf_confmat, "RF")

    # Classification Report
    # XGBoost Classification Report
    class_rep = classification_report(y_true=y_test, y_pred=pred_labels, output_dict=True)
    print("XGBoost Classification Report: \n {0}".format(class_rep))
    # RF Classification Report
    rf_class_rep = classification_report(y_true=y_test, y_pred=rf_pred_labels, output_dict=True)
    print("RF Classification Report: \n {0}".format(rf_class_rep))
    # Output Classification Report as a Table using MCenter
    export_classification_report(class_rep, "XGBoost")
    export_classification_report(rf_class_rep, "RF")

    # AUC and ROC Curves
    # ROC for XGBoost model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("XGBoost ROC AUC value: {}".format(roc_auc))
    rf_roc_auc = roc_auc_score(y_test, rf_pred_probs[:, 1])
    print("RF ROC AUC value:  {}".format(rf_roc_auc))
    # Output ROC of the chosen model using MCenter
    mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)
    mlops.set_stat("RF ROC AUC", rf_roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert("[Training] AUC Violation From Training Node",
                           "AUC Went Below {}. Current AUC Is {}".format(min_auc_requirement, roc_auc))

    # ROC curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    rf_fpr, rf_tpr, rf_thr = roc_curve(y_test, rf_pred_probs[:, 1])

    cg = MultiGraph().name("Receiver Operating Characteristic ").set_continuous()
    cg.add_series(label='Random curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'.format(roc_auc),
                  x=fpr.tolist(), y=tpr.tolist())
    cg.add_series(label='RF ROC curve (area = {0:0.2f})'.format(rf_roc_auc),
                  x=rf_fpr.tolist(), y=rf_tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    # Feature importance comparison
    # XGBoost Feature importance
    export_feature_importance(final_model, list(X_train.columns), 5, "XGBoost")
    export_feature_importance(rf_model, list(X_train.columns), 5, "RF")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(y_test == 0)[0]
    y_test1 = np.where(y_test == 1)[0]
    rf_max_pred_probs = rf_pred_probs.max(axis=1)

    # KS for the XGBoost model
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))
    # KS for the RF model
    rf_ks = ks_2samp(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1])
    rf_ks_stat = rf_ks.statistic
    rf_ks_pvalue = rf_ks.pvalue
    print("RF KS values: \n Statistics: {} \n pValue: {}\n".format(rf_ks_stat, rf_ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for RF", rf_ks_stat, st.TIME_SERIES)

    # raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert("[Training] KS Violation From Training Node",
                           "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats for XGBoost").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test0], max_pred_probs[y_test1])
    rf_total_psi, rf_psi_table = get_psi(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats for XGBoost").cols(
        ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent", "Curr Percent",
         "Segment PSI"])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total XGBoost PSI values: \n {}".format(total_psi))
    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES)

    if total_psi >= min_psi_requirement:
        mlops.health_alert("[Training] PSI Violation From Training Node",
                           "PSI Went Above {}. Current PSI Is {}".format(min_psi_requirement,
                                                                         total_psi))

    print("Total RF PSI values: \n {}".format(rf_total_psi))
    rf_psi_table_stat = Table().name("PSI Stats for RF").cols(
        ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent", "Curr Percent",
         "Segment PSI"])
    row_num = 1
    for each_value in rf_psi_table.values:
        str_values = [str(i) for i in each_value]
        rf_psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(rf_psi_table_stat)
    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total RF PSI ", rf_total_psi, st.TIME_SERIES)

    # ## Save the XGBoost Model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # ## Finish the program
    mlops.done()
Example #9
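A class-based variant of the training flow in Example #8 that reads its configuration from `self._params`, trains only the XGBoost pipeline, and pickles the model to the path given by the `output-model` parameter.
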
    def _prep_and_train(self, df_dataset):
        self.min_auc_requirement = self._params["auc_threshold"]
        self.max_ks_requirement = self._params["ks_threshold"]
        self.min_psi_requirement = self._params["psi_threshold"]
        train_on_col = self._params["train_on_column"]

        #mlops Init
        mlops.init()

        y = df_dataset[train_on_col]
        self._logger.info("train_on_col= {}".format(train_on_col))
        self._logger.info("df_dataset {}".format(df_dataset.shape[1]))
        X = df_dataset.drop(train_on_col, axis=1)
        mlops.set_data_distribution_stat(X)
        self._logger.info("df_dataset {}".format(X.shape[1]))

        # Splitting the data to train and test sets:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self._params["validation_split"], random_state=42)
        All_columns = X_train.columns.tolist()
        categorical_columns = self._params["categorical_cols"]
        mapper_list = []
        for d in All_columns:
            if d in categorical_columns:
                mapper_list.append(
                    ([d], OneHotEncoder(handle_unknown='ignore')))
            else:
                mapper_list.append(([d], MinMaxScaler()))

        mapper = DataFrameMapper(mapper_list)

        ## Training
        # XGBoost Training:
        n_cpu = multiprocessing.cpu_count()

        xgboost_model = xgb.XGBClassifier(
            max_depth=int(self._params["max_depth"]),
            min_child_weight=int(self._params["min_child_weight"]),
            learning_rate=float(self._params["learning_rate"]),
            n_estimators=int(self._params["n_estimators"]),
            silent=True,
            objective=self._params["objective"],
            gamma=float(self._params["gamma"]),
            max_delta_step=int(self._params["max_delta_step"]),
            subsample=float(self._params["subsample"]),
            colsample_bytree=1,
            colsample_bylevel=1,
            reg_alpha=float(self._params["reg_alpha"]),
            reg_lambda=float(self._params["reg_lambda"]),
            scale_pos_weight=float(self._params["scale_pos_weight"]),
            seed=1,
            n_jobs=n_cpu,
            missing=None)

        final_model = Pipeline([("mapper", mapper),
                                ("xgboost", xgboost_model)])
        final_model.fit(X_train, y_train)

        # Prediction and prediction distribution
        pred_labels = final_model.predict(X_test)
        pred_probs = final_model.predict_proba(X_test)

        # Accuracy calculation
        # Accuracy for the xgboost model
        accuracy = accuracy_score(y_test, pred_labels)
        self._logger.info("XGBoost Accuracy value: {0}".format(accuracy))
        #     Output accuracy of the chosen model using MCenter
        mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

        # Label distribution:
        # Label distribution in training
        value, counts = np.unique(y_test, return_counts=True)
        label_distribution = np.asarray((value, counts)).T
        self._logger.info(
            "Validation Actual Label distributions: \n {0}".format(
                label_distribution))
        # Output Label distribution as a BarGraph using MCenter
        export_bar_table(label_distribution[:, 0], label_distribution[:, 1],
                         "Validation - Actual Label Distribution")

        # Prediction distribution and prediction confidence distribution
        # Pred Label distribution in training
        pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
        pred_label_distribution = np.asarray((pred_value, pred_counts)).T
        self._logger.info(
            "XGBoost Validation Prediction Label Distributions: \n {0}".format(
                pred_label_distribution))
        # Output Pred label distribution as a BarGraph using MCenter
        export_bar_table(pred_label_distribution[:, 0],
                         pred_label_distribution[:, 1],
                         "Validation - XGBoost Prediction Distribution")

        # Pred confidence per label
        label_number = len(pred_counts)
        average_confidence = np.zeros(label_number)
        max_pred_probs = pred_probs.max(axis=1)
        for i in range(0, label_number):
            index_class = np.where(pred_labels == i)[0]
            if pred_counts[i] > 0:
                average_confidence[i] = np.sum(
                    max_pred_probs[index_class]) / (float(pred_counts[i]))
            else:
                average_confidence[i] = 0
        self._logger.info(
            "XGBoost Validation Average Prediction confidence per label: \n {0}"
            .format(average_confidence))

        # Output Pred label distribution as a BarGraph using MCenter
        export_bar_table(pred_value, average_confidence,
                         "Validation - XGBoost Average confidence per class")

        # Confusion Matrix
        # XGBoost Confusion Matrix
        confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
        self._logger.info(
            "Confusion Matrix for XGBoost: \n {0}".format(confmat))
        # Output Confusion Matrix as a Table using MCenter
        export_confusion_table(confmat, "XGBoost")

        # Classification Report
        # XGBoost Classification Report
        class_rep = classification_report(y_true=y_test,
                                          y_pred=pred_labels,
                                          output_dict=True)
        self._logger.info(
            "XGBoost Classification Report: \n {0}".format(class_rep))

        # AUC and ROC Curves
        # ROC for XGBoost model
        roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
        self._logger.info("XGBoost ROC AUC value: {}".format(roc_auc))

        # Output ROC of the chosen model using MCenter
        mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)

        if roc_auc <= self.min_auc_requirement:
            mlops.health_alert(
                "[Training] AUC Violation From Training Node",
                "AUC Went Below {}. Current AUC Is {}".format(
                    self.min_auc_requirement, roc_auc))

        # ROC curve
        fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])

        cg = MultiGraph().name(
            "Receiver Operating Characteristic ").set_continuous()
        cg.add_series(label='Random curve', x=fpr.tolist(), y=fpr.tolist())
        cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'.format(roc_auc),
                      x=fpr.tolist(),
                      y=tpr.tolist())
        cg.x_title('False Positive Rate')
        cg.y_title('True Positive Rate')
        mlops.set_stat(cg)

        # Feature importance comparison
        # XGBoost Feature importance
        export_feature_importance(final_model, list(X_train.columns), 5,
                                  "XGBoost")

        # KS Analysis
        max_pred_probs = pred_probs.max(axis=1)
        y_test0 = np.where(y_test == 0)[0]
        y_test1 = np.where(y_test == 1)[0]

        # KS for the XGBoost model
        ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
        ks_stat = ks.statistic
        ks_pvalue = ks.pvalue
        self._logger.info(
            "KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(
                ks_stat, ks_pvalue))

        # Output KS Stat of the chosen model using MCenter
        mlops.set_stat("KS Stats for CGBoost", ks_stat, st.TIME_SERIES)

        # raising alert if ks-stat goes above required threshold
        if ks_stat >= self.max_ks_requirement:
            mlops.health_alert(
                "[Training] KS Violation From Training Node",
                "KS Stat Went Above {}. Current KS Stat Is {}".format(
                    self.max_ks_requirement, ks_stat))

        ks_table = Table().name("KS Stats for XGBoost").cols(
            ["Statistic", "pValue"])
        ks_table.add_row([ks_stat, ks_pvalue])
        mlops.set_stat(ks_table)

        # PSI Analysis
        # Calculating PSI
        total_psi, psi_table = get_psi(self, max_pred_probs[y_test0],
                                       max_pred_probs[y_test1])
        psi_table_stat = Table().name("PSI Stats for XGBoost").cols([
            "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
            "Base Percent", "Curr Percent", "Segment PSI"
        ])
        row_num = 1
        for each_value in psi_table.values:
            str_values = [str(i) for i in each_value]
            psi_table_stat.add_row(str(row_num), str_values)
            row_num += 1
        mlops.set_stat(psi_table_stat)
        self._logger.info("Total XGBoost PSI values: \n {}".format(total_psi))
        #     Output Total PSI of the chosen model using MCenter
        mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES)

        if total_psi >= self.min_psi_requirement:
            mlops.health_alert(
                "[Training] PSI Violation From Training Node",
                "PSI Went Above {}. Current PSI Is {}".format(
                    self.min_psi_requirement, total_psi))

        # ## Save the XGBoost Model
        model_file = open(self._params["output-model"], 'wb')
        pickle.dump(final_model, model_file)
        model_file.close()

        # ## Finish the program
        mlops.done()

        return model_file
Example #10
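Extends Example #5 with a canary alert, fetching events over a ten-hour time window, and a check that every expected event type appears among the fetched alerts.
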
def test_alerts_fetching():

    print("test_alerts_fetching")
    mlops.health_alert("Health_Alert-2",
                       "Health alert generated by 'test_alerts_fetching'")
    mlops.data_alert("Data_Alert-2",
                     "Data alert generated by 'test_alerts_fetching'")
    mlops.system_alert(
        "System_Alert-2",
        "Operational (System) alert generated by 'test_alerts_fetching'")
    mlops.canary_alert("Canary_Alert-2",
                       is_healthy=False,
                       score=0.2,
                       threshold=0.1)

    # It takes time for the alerts to propagate up to the database
    active_ion_alerts = None
    for counter in range(FETCH_ALERTS_NUM_RETRIES):
        active_ion_alerts = mlops.get_events()
        if active_ion_alerts is None or active_ion_alerts.empty:
            print("Did not find alerts, trying again...")
            time.sleep(SLEEP_TIME_PER_RETRY_SEC)
            continue
        break

    assert active_ion_alerts is not None and not active_ion_alerts.empty

    time.sleep(3)
    all_alerts = mlops.get_events()
    print("\n\n\nALL ALERTS\n{}".format(all_alerts))

    num_all_alerts = len(all_alerts)
    num_active_ion_alerts = len(active_ion_alerts)

    assert num_active_ion_alerts <= num_all_alerts

    active_ion_alerts_ids = active_ion_alerts['id'].tolist()
    print("List of ides: {}".format(active_ion_alerts_ids))
    print("Active ion alerts ({}):".format(num_active_ion_alerts))

    nodes = active_ion_alerts["node"].tolist()
    print("Nodes col: {}".format(nodes))

    for index, alert in active_ion_alerts.iterrows():
        _print_alert(alert)

    print("Other alerts ({}):".format(num_all_alerts - num_active_ion_alerts))
    for index, alert in all_alerts.iterrows():
        if alert["id"] not in active_ion_alerts_ids:
            _print_alert(alert)

    # Fetching by time window
    now = datetime.utcnow()
    last_10_hour = (now - timedelta(hours=10))
    print("Getting last 10 hours events")
    events_df_per_time_window = mlops.get_events(start_time=last_10_hour,
                                                 end_time=now)
    num_time_window_alerts = len(events_df_per_time_window)
    print("Last 10 hours events num {}  - all events num {}".format(
        num_time_window_alerts, num_all_alerts))
    assert num_time_window_alerts == num_all_alerts

    # Verifying types:
    type_only_df = events_df_per_time_window[['type']]
    unique_types_list = type_only_df.drop_duplicates()['type'].tolist()
    print("Unique types:")
    print(unique_types_list)

    for event_type in EVENTS_TYPE:
        if event_type not in unique_types_list:
            msg = "Error: event {} is not in unique event list".format(
                event_type)
            print(msg)
            raise Exception(msg)
    print("Done event fetching test")
Example #11
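Inference-side monitoring: loads a pickled XGBoost pipeline, scores a CSV input, and reports the prediction distribution, per-class confidence, feature importance, KS, and PSI, raising health alerts on threshold violations.
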
def main():
    pm_options = parse_args()
    print("PM: Configuration:")

    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: # Model File:                [{}]".format(
        pm_options.input_model))

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0
    else:
        # No model path was supplied, so there is nothing to score with
        print("Model Not Found")
        mlops.set_stat("# Model Files Used", 0)
        mlops.done()
        return 0

    final_model = pickle.load(model_file_obj)

    # Loading the data
    loan_df = pd.read_csv(pm_options.input_file)

    # Cleaning NAs (drop them before taking the feature matrix)
    mlops.set_data_distribution_stat(loan_df)
    print("dataset_size = ", loan_df.shape[0])
    print("number of NAs per columns = \n", loan_df.isnull().sum())
    loan_df = loan_df.dropna()
    print("dataset_size without NA rows= ", loan_df.shape[0])
    X = loan_df

    # ## Inference
    pred_labels = final_model.predict(X)
    pred_probs = final_model.predict_proba(X)

    # Prediction distribution and prediction confidence distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Inference Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))
    export_bar_table(pred_label_distribution[:, 0],
                     pred_label_distribution[:, 1],
                     "Inference - XGBoost Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".
          format(average_confidence))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence,
                     "Validation - XGBoost Average confidence per class")

    # Feature importance comparison
    export_feature_importance(final_model, list(X.columns), 5, "XGBoost")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(pred_labels == 0)[0]
    y_test1 = np.where(pred_labels == 1)[0]
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    # raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))
    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    total_psi, psi_table = get_psi(max_pred_probs[y_test0],
                                   max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total XGBoost PSI values: \n {}".format(total_psi))
    print("XGBoost PSI Stats: \n {}".format(psi_table))
    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    if total_psi >= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Above {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # ## Finish the program
    mlops.done()
Example #12
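Inference on either a file-based or a synthetic dataset (falling back to `make_classification` when the input file cannot be read), reporting prediction counts, per-class confidence, and KS/PSI statistics, with health alerts on threshold violations.
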
def main():
    pm_options = parse_args()
    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: # Model File:                [{}]".format(
        pm_options.input_model))

    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0
    else:
        # No model path was supplied, so there is nothing to score with
        print("Model Not Found")
        mlops.set_stat("# Model Files Used", 0)
        mlops.done()
        return 0

    final_model = pickle.load(model_file_obj)

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data  # use all loaded columns as features

    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            # binary classification only!
            n_classes=2,
            random_state=42)

        # Add random noise to the data randomly
        import random
        if random.randint(1, 21) // 2 == 0:  # integer division: true only when the draw is 1
            print("Adding Random Noise!")

            noisy_features = np.random.uniform(0, 1) * \
                             np.random.normal(0, 1,
                                              (num_samples, num_features))
            X = X + noisy_features

    # Separate into features and labels
    features = X

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    # and compare it automatically with the training-time distributions
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features),
                   st.TIME_SERIES)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(features)
    pred_probs = final_model.predict_proba(features)

    print("Pred Labels: ", pred_labels)  # Remove printout can be huge
    print("Pred Probabilities: ", pred_probs)  # Remove printout can be huge

    # Pred Label distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Pred Label distributions: \n {0}".format(pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Pred Label Distribution").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
            (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # Pred Label confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        print(" np.sum(confidence[index_class])",
              np.sum(max_pred_probs[index_class]))
        print("counts_elements[i] ", pred_counts[i])
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0

    # BarGraph showing confidence per class
    pred_values1 = [str(i) for i in pred_value]
    bar = BarGraph().name("Average Confidence Per Class").cols(
        pred_values1).data(average_confidence.tolist())
    mlops.set_stat(bar)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[pred_labels == 1],
                  max_pred_probs[pred_labels == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue

    print("KS values: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    if not np.isnan(ks_stat):
        print("printing KS_stat ")
        mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)
    else:
        print("not printing KS_stat ")

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[pred_labels == 1],
                                   max_pred_probs[pred_labels == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])

    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1

    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    #     Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Terminate MLOPs
    mlops.done()
Example #13
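The training counterpart of Example #12: it fits an XGBoost classifier on file-based or synthetic data, reports accuracy, label and prediction distributions, ROC/AUC, KS, and PSI with threshold alerts, and pickles the trained model.
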
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: # Validation Split:          [{}]".format(
        pm_options.validation_split))

    print("PM: # AUC Threshold:             [{}]".format(
        pm_options.auc_threshold))
    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Estimators:                [{}]".format(
        pm_options.n_estimators))
    print("PM: # Max Depth:                 [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate:             [{}]".format(
        pm_options.learning_rate))
    print("PM: # Min Child Weight:          [{}]".format(
        pm_options.min_child_weight))
    print("PM: # Objective:                 [{}]".format(pm_options.objective))
    print("PM: # Gamma:                     [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step:            [{}]".format(
        pm_options.max_delta_step))
    print("PM: # Subsample:                 [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha:                 [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda:                [{}]".format(
        pm_options.reg_lambda))
    print("PM: # Scale Pos Weight:          [{}]".format(
        pm_options.scale_pos_weight))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data[:, 1:]  # select columns 1 through end
        y = data[:, 0]

    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            # binary classification only!
            n_classes=2,
            random_state=42)

        print("Adding Random Noise!")

        noisy_features = np.random.uniform(0, 1) * \
                         np.random.normal(0, 1,
                                          (num_samples, num_features))
        X = X + noisy_features

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(pm_options.validation_split), random_state=42)

    import xgboost as xgb

    # Create a model that should be deployed into production
    final_model = xgb.XGBClassifier(
        max_depth=int(pm_options.max_depth),
        min_child_weight=int(pm_options.min_child_weight),
        learning_rate=float(pm_options.learning_rate),
        n_estimators=int(pm_options.n_estimators),
        silent=True,
        objective=str(pm_options.objective),
        gamma=float(pm_options.gamma),
        max_delta_step=int(pm_options.max_delta_step),
        subsample=float(pm_options.subsample),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(pm_options.reg_alpha),
        reg_lambda=float(pm_options.reg_lambda),
        scale_pos_weight=float(pm_options.scale_pos_weight),
        seed=1,
        missing=None)

    final_model.fit(X_train, y_train)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(X_train)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    print("Pred Labels: ", pred_labels)
    print("Pred Probabilities: ", pred_probs)

    accuracy = accuracy_score(y_test, pred_labels)
    print("Accuracy values: \n {0}".format(accuracy))
    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    # column_names = value.astype(str).tolist()
    print("Validation Actual Label distributions: \n {0}".format(
        label_distribution))

    # Output Label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Validation Actual Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Validation Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name(
        "Validation Prediction Label Distributions").cols(
            (pred_label_distribution[:, 0]).astype(str).tolist()).data(
                (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # ROC for the chosen model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("ROC AUC values: \n {}".format(roc_auc))

    #     Output ROC of the chosen model using MCenter
    mlops.set_stat("ROC AUC", roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(
                min_auc_requirement, roc_auc))

    # ROC Curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name(
        "Receiver Operating Characteristic ").set_continuous()
    cg.add_series(label='Random Curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='ROC Curve (Area = {0:0.2f})'.format(roc_auc),
                  x=fpr.tolist(),
                  y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    max_pred_probs = pred_probs.max(axis=1)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[y_test == 1], max_pred_probs[y_test == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue

    print("KS values: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test == 1],
                                   max_pred_probs[y_test == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])

    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1

    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()
    # Terminate MLOPs
    mlops.done()