Example #1
def main():
    pm_options = parse_args()
    # Initialize MLOps Library
    mlops.init()
    # Load the model
    # Exit gracefully if no model file was provided; otherwise file_obj below
    # would be undefined.
    if pm_options.input_model is None:
        print("Model file path not provided")
        mlops.set_stat("model_file", 0)
        mlops.done()
        return 0

    try:
        filename = pm_options.input_model
        file_obj = open(filename, 'rb')
        mlops.set_stat("model_file", 1)
    except Exception as e:
        print("Model not found")
        print("Got exception: {}".format(e))
        mlops.set_stat("model_file", 0)
        mlops.done()
        return 0

    classifier = pickle.load(file_obj)

    # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0],
                                               num_samples,
                                               replace=False)]

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and to
    # compare them automatically with the distributions reported during training, producing
    # a similarity score.
    mlops.set_data_distribution_stat(test_features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples,
                   st.TIME_SERIES)

    # Predict labels
    result = classifier.predict(test_features)

    # Label distribution in prediction
    value, counts = np.unique(result, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Terminate MLOPs
    mlops.done()
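
# The example above also relies on a parse_args() helper and on library imports
# that are not shown in this excerpt. The sketch below is a hypothetical,
# minimal version of that helper, assuming argparse-style options named after
# the attributes the example reads (input_model, num_samples, num_features);
# the real component may define them differently.
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    # Option names are illustrative assumptions only.
    parser.add_argument("--input-model", dest="input_model", default=None)
    parser.add_argument("--num-samples", dest="num_samples", default=100)
    parser.add_argument("--num-features", dest="num_features", default=20)
    return parser.parse_args()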
Example #2
def main():
    print("Starting example")
    mlops.init(run_in_non_pm_mode=True, mlops_mode=MLOpsMode.PYTHON)

    # Line graphs
    mlops.set_stat("myCounterDouble", 5.5)
    mlops.set_stat("myCounterDouble2", 7.3)

    # Multi-line graphs
    mlt = MultiLineGraph().name("Multi Line").labels(["l1",
                                                      "l2"]).data([5, 16])
    mlops.set_stat(mlt)

    tbl = Table().name("MyTable").cols(["Date", "Some number"])
    tbl.add_row(["2001Q1", "55"])
    tbl.add_row(["2001Q2", "66"])
    tbl.add_row(["2003Q3", "33"])
    tbl.add_row(["2003Q2", "22"])
    mlops.set_stat(tbl)

    bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd",
                                         "ee"]).data([10, 15, 12, 9, 8])
    mlops.set_stat(bar)

    mlops.done()
    print("Example done")
Example #3
    def feature_importance(self,
                           model_obj,
                           feature_importance_vector=None,
                           feature_names=None,
                           model=None, df=None,
                           num_significant_features=100):
        """
         present feature importance, either according to the provided vector or generated from
         the provided model if available.
         Feature importance bar graph is attached to the current model and can be fetched later for
          this model.
         this function implements:
         1) use feature_importance_vector if exists
         2) feature_names from the model if available

         3) get feature names vector if exists
         4) extract feature name from pipeline model or dataframe if exists -
          (code different to pyspark and sklearn)

         5) sort the vector.
         6) take first k elements
         7) create a bar graph for feature importance

         :param model_obj: model  object
         :param feature_importance_vector: feature importance vector optional
         :param feature_names: feature names vector optional
         :param model: optional pipeline model for pyspark, sklearn model for python
         :param df: optional dataframe for analysis
         :param num_significant_features: Number of significant features
         :raises: MLOpsException
         """

        self._validate_feature_importance_inputs(feature_importance_vector, feature_names, model, df)

        important_named_features = self._output_channel.feature_importance(feature_importance_vector, feature_names,
                                                                           model, df)

        if important_named_features:
            # Sort the feature importance vector
            important_named_features_sorted = sorted(important_named_features,
                                                     key=lambda x: x[1], reverse=True)
            self._logger.info("Full important_named_features_sorted = {}"
                              .format(important_named_features_sorted))

            # output k significant features
            if int(num_significant_features) < len(important_named_features_sorted):
                important_named_features_sorted = important_named_features_sorted[0:int(num_significant_features)]

            # Plot results in a bar graph
            self._logger.info("Important_named_features_sorted = {}"
                              .format(important_named_features_sorted))
            col_names = [v[0] for v in important_named_features_sorted]
            col_value = [v[1] for v in important_named_features_sorted]
            bar = BarGraph().name("Feature Importance").cols(col_names).data(col_value)
            model_obj.set_stat(bar)
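
# A minimal standalone sketch of steps 5-7 from the docstring above (sort the
# vector, take the top-k features, build a bar graph). It reuses only the
# BarGraph API already shown in these examples; the helper name and the
# (feature, importance) tuple format are illustrative assumptions.
def top_k_feature_importance_bar(named_importances, k=10):
    top_k = sorted(named_importances, key=lambda x: x[1], reverse=True)[:k]
    names = [name for name, _ in top_k]
    values = [value for _, value in top_k]
    return BarGraph().name("Feature Importance").cols(names).data(values)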
Example #4
def export_bar_table(bar_names, bar_data, title_name):
    """
    This function provides a bar_graph for a bar type data at MCenter data scientist view
    :param bar_names: Bar graph names
    :param bar_data: Bar graph data.
    :param title_name: Title of the bar Graph
    :return:
    """
    bar_graph_data = BarGraph().name(title_name).cols(
        bar_names.astype(str).tolist()).data(bar_data.tolist())
    mlops.set_stat(bar_graph_data)
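
# A hedged usage sketch for export_bar_table(); it assumes bar_names and
# bar_data are numpy arrays, since the function calls .astype() and .tolist()
# on them, and that mlops.init() has already been called.
import numpy as np

category_names = np.array(["cat_a", "cat_b", "cat_c"])
category_counts = np.array([12, 7, 3])
export_bar_table(category_names, category_counts, "Example Category Counts")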
Example #5
    def _report_bar_graph_metric(self, metric_meta, metrics):
        cols = []
        data = []
        for related_m, bar_name in metric_meta.related_metric:
            cols.append(bar_name)
            data.append(metrics[related_m.metric_name])

        if not all(v == 0
                   for v in data) or not metric_meta.metric_already_displayed:
            metric_meta.metric_already_displayed = True
            mlt = BarGraph().name(metric_meta.title).cols(cols).data(data)
            mlops.set_stat(mlt)
Example #6
def test_bar_graph():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)

    with pytest.raises(MLOpsException):
        BarGraph().name("bar").cols(["g1", "g2"]).data(["aa", "bb"])

    with pytest.raises(MLOpsException):
        BarGraph().name("bar").data(["aa", "bb"])

    with pytest.raises(MLOpsException):
        mlt = BarGraph().name("mlt").cols(["g1"]).data([55, 66])
        pm.set_stat(mlt)

    with pytest.raises(MLOpsException):
        mlt_cont = BarGraph().name("mlt").cols([1,
                                                2]).data([55,
                                                          66]).as_continuous()
        pm.set_stat(mlt_cont)

    mlt = BarGraph().name("mlt").cols(["g1", "g2"]).data([55, 66])
    pm.set_stat(mlt)

    mlt_cont = BarGraph().name("mlt").cols([1, 2,
                                            3]).data([55, 66]).as_continuous()
    pm.set_stat(mlt_cont)

    pm.done()
Example #7
    def __init__(self, track_conf, conf_thresh, conf_percent, output):
        self._track_conf = track_conf
        self._conf_thresh = conf_thresh
        self._conf_percent = conf_percent
        self._output_low_confidence_predictions = output
        self._low_confidence_predictions = 0

        print("track_conf: {}".format(track_conf))
        if track_conf > 0:
            print("conf_thresh: {}".format(conf_thresh))
            print("conf_percent: {}".format(conf_percent))

        categories = [
            "10", "20", "30", "40", "50", "60", "70", "80", "90", "100"
        ]
        self._conf_hist = []
        for i in range(0, 10):
            self._conf_hist.append(0)

        ## MLOps start
        self._conf_graph = BarGraph().name(
            "Confidence Distribution Bar Graph").cols(categories)
Example #8
    def __init__(self,
                 print_interval,
                 stats_type,
                 num_categories,
                 conf_thresh,
                 conf_percent,
                 hot_label=True):
        super(CategoricalStatistics, self).__init__(print_interval)
        self._num_categories = num_categories
        self._hot_label = hot_label
        self._stats_type = stats_type
        self._conf_thresh = conf_thresh / 100.0
        self._conf_percent = conf_percent

        # These are useful for development, but should be replaced by mlops library functions
        self._label_hist = []
        self._infer_hist = []
        for i in range(0, self._num_categories):
            self._label_hist.append(0)
            self._infer_hist.append(0)

        if self._stats_type == "python":
            mlops.init(ctx=None,
                       connect_mlops=True,
                       mlops_mode=MLOpsMode.AGENT)
        elif self._stats_type == "file":
            mlops.init(ctx=None,
                       connect_mlops=False,
                       mlops_mode=MLOpsMode.STAND_ALONE)
        else:
            self._stats_type = "none"

        if self._stats_type != "none":
            column_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
            self._infer_tbl = Table().name("categories").cols(column_names)
            self._infer_bar = BarGraph().name("categories bar").cols(
                column_names)
Example #9
    def _materialize(self, parent_data_objs, user_data):
        for param in parent_data_objs:
            parent_param = "parent param is: {param}".format(param=param)
            print(parent_param)
            self._logger.info(parent_param)

        for k, v in self._params.items():
            params_info = "key: {key} ==> value: {value}".format(key=k, value=v)
            print(params_info)
            self._logger.info(params_info)

        mlt = BarGraph().name("Kenshoo Bar graph example").cols(["bar", "bar2"]).data([1500, 2000])
        mlops.set_stat(mlt)

        return []
Example #10
    def _report_stats(self, file_path):
        self._logger.info(" *** generate stats .. params:{}".format(
            self._params))
        self._logger.info(" *** Source file {}".format(file_path))

        # Read the file
        data = pd.read_csv(file_path, sep=' |,', header=None, skiprows=1,
                           engine='python')  # regex separator requires the python engine
        data = data.rename(index=str,
                           columns={
                               1: "label",
                               2: "confidence0",
                               3: "confidence1"
                           })
        prediction_distribution = data['label'].value_counts()
        column_names = np.array(
            prediction_distribution.index).astype(str).tolist()

        # Initialize mlops
        mlops.init()

        # Report a bar graph
        bar = BarGraph().name("Prediction Distribution").cols(column_names).data(
            prediction_distribution.values.tolist())
        mlops.set_stat(bar)

        # Generate an alert on low confidence if the argument is set to true
        if (self._params["alert"]):
            index = data.values[:, 1].astype(int)
            confidence = data.values[:, 2:4]
            confidence_per_prediction = confidence[:, index][:, 0] * 100
            low_conf_percent = len(confidence_per_prediction[
                confidence_per_prediction < self._params["confidence"]]) / len(
                    confidence_per_prediction) * 100
            if low_conf_percent > self._params["samples"]:
                msg = "Low confidence: {}% of inferences had confidence below {}%".format(
                    low_conf_percent, self._params["confidence"])
                print(msg)
                mlops.health_alert("Low confidence alert", msg)

        mlops.done()

        return []
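
# A hedged helper that writes a tiny input file matching what the parser above
# appears to expect: one header line (skipped via skiprows=1) followed by rows
# whose columns 1-3 hold the predicted label and the two class confidences.
# The exact field layout is an assumption inferred from the column renaming.
def write_sample_prediction_file(path):
    rows = [
        "id label confidence0 confidence1",
        "0 1 0.12 0.88",
        "1 0 0.97 0.03",
    ]
    with open(path, "w") as f:
        f.write("\n".join(rows) + "\n")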
Example #11
class ConfidenceTracker(object):
    def __init__(self, track_conf, conf_thresh, conf_percent, output):
        self._track_conf = track_conf
        self._conf_thresh = conf_thresh
        self._conf_percent = conf_percent
        self._output_low_confidence_predictions = output
        self._low_confidence_predictions = 0

        print("track_conf: {}".format(track_conf))
        if track_conf > 0:
            print("conf_thresh: {}".format(conf_thresh))
            print("conf_percent: {}".format(conf_percent))

        categories = [
            "10", "20", "30", "40", "50", "60", "70", "80", "90", "100"
        ]
        self._conf_hist = []
        for i in range(0, 10):
            self._conf_hist.append(0)

        ## MLOps start
        self._conf_graph = BarGraph().name(
            "Confidence Distribution Bar Graph").cols(categories)
        ## MLOps end

    def check_confidence(self, confidence, sample, total_predictions, prediction):
        # total_predictions and prediction are used only to name the image saved
        # for a low-confidence sample (see below).

        if self._track_conf == 0:
            return

        conf_bin = int(math.floor(confidence / 10))

        # include 100% confidence in the 90-100 range
        if conf_bin == 10:
            conf_bin = 9

        self._conf_hist[conf_bin] += 1

        if confidence < self._conf_thresh:
            self._low_confidence_predictions += 1

            if self._output_low_confidence_predictions != 0:
                import matplotlib
                matplotlib.use('Agg')
                import matplotlib.pyplot as plt
                # Reshape the flat sample back into a 28x28 image and save it
                plot_data = sample.reshape(28, 28)
                plt.gray()  # render in grayscale rather than color
                plt.imshow(plot_data)
                plt.savefig(
                    "/opt/data-lake/image{}_conf{}_prediction{}.png".format(
                        total_predictions, int(round(confidence)), prediction))

    def report_confidence(self, total_predictions):

        if self._track_conf == 0:
            return

        ## MLOps start
        # Show the prediction distribution as a bar graph
        self._conf_graph.data(self._conf_hist)
        mlops.set_stat(self._conf_graph)
        ## MLOps end

        # Percentage of low confidence predictions in this reporting interval
        low_conf_percent = self._low_confidence_predictions * 100.0 / total_predictions

        print("low confidence predictions: {} ({})%".format(
            self._low_confidence_predictions, low_conf_percent))

        if low_conf_percent > self._conf_percent:
            msg = "Low confidence: {}% of inferences had confidence below {}%".format(
                low_conf_percent, self._conf_thresh)
            print(msg)

            ## MLOps start
            mlops.health_alert("Low confidence alert", msg)
            ## MLOps end

        # reset counters for the next round
        for i in range(len(self._conf_hist)):
            self._conf_hist[i] = 0
        self._low_confidence_predictions = 0
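
# A hedged sketch of how ConfidenceTracker might be driven from an inference
# loop, using the four-argument check_confidence signature shown above. The
# predict_fn callable (returning a prediction and a confidence in the 0-100
# range) and the reporting interval are illustrative assumptions only.
def run_inference_loop(tracker, samples, predict_fn, report_every=100):
    total = 0
    for sample in samples:
        prediction, confidence = predict_fn(sample)
        total += 1
        tracker.check_confidence(confidence, sample, total, prediction)
        if total % report_every == 0:
            # report on the last `report_every` predictions; the tracker resets
            # its per-interval counters after reporting
            tracker.report_confidence(report_every)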
Example #12
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))
    print("PM: # Classes:                   [{}]".format(
        pm_options.num_classes))

    print("PM: C:                           [{}]".format(pm_options.C))
    print("PM: Kernel:                      [{}]".format(pm_options.kernel))
    print("PM: Degree:                      [{}]".format(pm_options.degree))
    print("PM: Gamma:                       [{}]".format(pm_options.gamma))
    print("PM: Tolerance:                   [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:          [{}]".format(pm_options.max_iter))

    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)
    num_classes = int(pm_options.num_classes)

    # Create synthetic data using scikit learn
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=num_classes,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (num_samples, num_features))
    features = features + noisy_features

    # Create a model that should be deployed into production
    final_model = SVC(C=float(pm_options.C),
                      probability=True,
                      kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T

    # Output actual label distribution as a BarGraph using MCenter
    bar = BarGraph().name("User Defined: Actual Label Distribution") \
        .cols((label_distribution[:, 0]).astype(str).tolist()) \
        .data((label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    pos_label = 1

    # calculate classification prediction
    labels_pred = final_model.predict(features)
    # calculate decision scores [n_sample, n_class]
    labels_decision_score = final_model.decision_function(features)
    # calculate classification probabilities [n_sample, n_class]
    labels_prob = final_model.predict_proba(features)
    # calculate classification probabilities of positive labels
    label_pos_class_prob = list(map(lambda x: x[pos_label], labels_prob))
    # list of sorted labels. i.e. [0, 1, 2, ..]
    labels_ordered = sorted(set(labels))

    value_pred, counts_pred = np.unique(labels_pred, return_counts=True)
    label_distribution_pred = np.asarray((value_pred, counts_pred)).T

    # Output prediction label distribution as a BarGraph using MCenter
    bar_pred = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols((label_distribution_pred[:, 0]).astype(str).tolist()) \
        .data((label_distribution_pred[:, 1]).tolist())
    mlops.set_stat(bar_pred)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    ################################################################
    #################### Start: Output Accuracy ####################
    ################################################################

    accuracy = final_model.score(features, labels)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output accuracy of the chosen model using MCenter
    # mlops.set_stat("User Defined: Accuracy", accuracy, st.TIME_SERIES)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.ACCURACY_SCORE, accuracy)

    # OR

    # Third Way
    mlops.metrics.accuracy_score(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output Accuracy ####################
    ##############################################################

    ################################################################
    #################### Start: Output AUC ####################
    ################################################################

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels,
                                                     labels_pred,
                                                     pos_label=pos_label)
    auc = sklearn.metrics.auc(fpr, tpr)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output auc of the chosen model using MCenter
    # mlops.set_stat("User Defined: AUC", auc)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.AUC, auc)

    # OR

    # Third Way
    mlops.metrics.auc(x=fpr, y=tpr)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output AUC ####################
    ##############################################################

    ###############################################################################
    #################### Start: Output Average Precision Score ####################
    ###############################################################################

    # average precision is not supported for multiclass
    if len(labels_ordered) <= 2:
        aps = sklearn.metrics.average_precision_score(labels,
                                                      labels_decision_score)

        #################### OLD WAY ####################
        # First Way
        #
        # # Output aps of the chosen model using MCenter
        # mlops.set_stat("User Defined: Average Precision Score", aps)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.AVERAGE_PRECISION_SCORE, aps)

        # OR

        # Third Way
        mlops.metrics.average_precision_score(y_true=labels,
                                              y_score=labels_decision_score)
        #################### DONE NEW WAY ####################

    #############################################################################
    #################### End: Output Average Precision Score ####################
    #############################################################################

    #########################################################################
    #################### Start: Output Balanced Accuracy ####################
    #########################################################################

    bas = sklearn.metrics.balanced_accuracy_score(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output bas of the chosen model using MCenter
    # mlops.set_stat("User Defined: Balanced Accuracy Score", bas)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.BALANCED_ACCURACY_SCORE, data=bas)

    # OR

    # Third Way
    mlops.metrics.balanced_accuracy_score(y_true=labels, y_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #######################################################################
    #################### End: Output Balanced Accuracy ####################
    #######################################################################

    ########################################################################
    #################### Start: Output Brier Score Loss ####################
    ########################################################################

    bsl = sklearn.metrics.brier_score_loss(labels,
                                           label_pos_class_prob,
                                           pos_label=pos_label)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output bsl of the chosen model using MCenter
    # mlops.set_stat("User Defined: Brier Score Loss", bsl)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.BRIER_SCORE_LOSS, data=bsl)

    # OR

    # Third Way
    mlops.metrics.brier_score_loss(y_true=labels,
                                   y_prob=label_pos_class_prob,
                                   pos_label=pos_label)
    #################### DONE NEW WAY ####################

    ######################################################################
    #################### End: Output Brier Score Loss ####################
    ######################################################################

    #############################################################################
    #################### Start: Output Classification Report ####################
    #############################################################################
    cr = sklearn.metrics.classification_report(labels, labels_pred)
    print("Classification Report\n{}".format(cr))
    #################### OLD WAY ####################
    # First Way
    #
    # from parallelm.mlops.stats.table import Table
    #
    # arrayReport = list()
    # for row in cr.split("\n"):
    #     parsed_row = [x for x in row.split("  ") if len(x) > 0]
    #     if len(parsed_row) > 0:
    #         arrayReport.append(parsed_row)
    #
    # header = arrayReport[0]
    # cr_table = Table().name("User Defined: Classification Report").cols(header)
    #
    # for index in range(1, len(arrayReport)):
    #     row_title = arrayReport[index][0]
    #     row_value = arrayReport[index][:-1]
    #     cr_table.add_row(row_title, row_value)
    #
    # # output classification report using MCenter
    # mlops.set_stat(cr_table)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.CLASSIFICATION_REPORT, data=cr)

    # OR

    # Third Way
    mlops.metrics.classification_report(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ###########################################################################
    #################### End: Output Classification Report ####################
    ###########################################################################

    #########################################################################
    #################### Start: Output Cohen Kappa Score ####################
    #########################################################################

    cks = sklearn.metrics.cohen_kappa_score(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output cks of the chosen model using MCenter
    # mlops.set_stat("User Defined: Cohen Kappa Score", cks)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.COHEN_KAPPA_SCORE, data=cks)

    # OR

    # Third Way
    mlops.metrics.cohen_kappa_score(labels, labels_pred)
    #################### DONE NEW WAY ####################

    #######################################################################
    #################### End: Output Cohen Kappa Score ####################
    #######################################################################

    ########################################################################
    #################### Start: Output Confusion Matrix ####################
    ########################################################################

    cm = sklearn.metrics.confusion_matrix(labels,
                                          labels_pred,
                                          labels=labels_ordered)

    #################### OLD WAY ####################
    # First Way
    # from parallelm.mlops.stats.table import Table

    # labels_string = [str(i) for i in labels_ordered]
    # cm_matrix = Table().name("User Defined: Confusion Matrix").cols(labels_string)
    #
    # for index in range(len(cm)):
    #     cm_matrix.add_row(labels_string[index], list(cm[index]))
    #
    # mlops.set_stat(cm_matrix)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.CONFUSION_MATRIX,
                   cm,
                   labels=labels_ordered)

    # OR

    # Third Way
    mlops.metrics.confusion_matrix(y_true=labels,
                                   y_pred=labels_pred,
                                   labels=labels_ordered)
    #################### DONE NEW WAY ####################

    ######################################################################
    #################### End: Output Confusion Matrix ####################
    ######################################################################

    ################################################################
    #################### Start: Output F1 Score ####################
    ################################################################

    f1 = sklearn.metrics.f1_score(labels,
                                  labels_pred,
                                  pos_label=pos_label,
                                  average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output f1 score of the chosen model using MCenter
    # mlops.set_stat("User Defined: F1 Score", f1)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.F1_SCORE, data=f1)

    # OR

    # Third Way
    mlops.metrics.f1_score(labels,
                           labels_pred,
                           pos_label=pos_label,
                           average=None)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output F1 Score ####################
    ##############################################################

    ################################################################
    #################### Start: Output FBeta Score ####################
    ################################################################

    fbeta = sklearn.metrics.fbeta_score(labels,
                                        labels_pred,
                                        beta=0.5,
                                        average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output fbeta score of the chosen model using MCenter
    # mlops.set_stat("User Defined: F-beta Score", fbeta)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.FBETA_SCORE, data=fbeta)

    # OR

    # Third Way
    mlops.metrics.fbeta_score(labels,
                              labels_pred,
                              pos_label=pos_label,
                              beta=0.5,
                              average=None)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Output FBeta Score ####################
    #################################################################

    ####################################################################
    #################### Start: Output Hamming Loss ####################
    ####################################################################

    hamming_loss = sklearn.metrics.hamming_loss(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output hamming loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Hamming Loss", hamming_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.HAMMING_LOSS, data=hamming_loss)

    # OR

    # Third Way
    mlops.metrics.hamming_loss(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ##################################################################
    #################### End: Output Hamming Loss ####################
    ##################################################################

    ##################################################################
    #################### Start: Output Hinge Loss ####################
    ##################################################################

    hinge_loss = sklearn.metrics.hinge_loss(labels, labels_decision_score)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output hinge loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Hinge Loss", hinge_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.HINGE_LOSS, data=hinge_loss)

    # OR

    # Third Way
    mlops.metrics.hinge_loss(labels, labels_decision_score)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Output Hinge Loss ####################
    ################################################################

    ##############################################################################
    #################### Start: Output Jaccard Similarity Score ####################
    ##############################################################################

    jaccard_sim_score = sklearn.metrics.jaccard_similarity_score(
        labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output jaccard similarity score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Jaccard Similarity Score", jaccard_sim_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.JACCARD_SIMILARITY_SCORE,
                   data=jaccard_sim_score)

    # OR

    # Third Way
    mlops.metrics.jaccard_similarity_score(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ############################################################################
    #################### End: Output Jaccard Similarity Score ####################
    ############################################################################

    ################################################################
    #################### Start: Output Log Loss ####################
    ################################################################

    log_loss = sklearn.metrics.log_loss(labels, labels_prob)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output log loss of the chosen model using MCenter
    # mlops.set_stat("User Defined: Log Loss", log_loss)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.LOG_LOSS, data=log_loss)

    # OR

    # Third Way
    mlops.metrics.log_loss(labels, labels_prob)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: Output Log Loss ####################
    ##############################################################

    ########################################################################################
    #################### Start: Output Matthews Correlation Coefficient ####################
    ########################################################################################

    mcc = sklearn.metrics.matthews_corrcoef(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output mcc of the chosen model using MCenter
    # mlops.set_stat("User Defined: Matthews Correlation Coefficient", mcc)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.MATTHEWS_CORRELATION_COEFFICIENT,
                   data=mcc)

    # OR

    # Third Way
    mlops.metrics.matthews_corrcoef(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ######################################################################################
    #################### End: Output Matthews Correlation Coefficient ####################
    ######################################################################################

    ##############################################################################
    #################### Start: Output Precision Recall Curve ####################
    ##############################################################################

    # precision_recall_curve is not supported for multiclass
    if len(labels_ordered) <= 2:
        precision, recall, thresholds = sklearn.metrics.precision_recall_curve(
            labels, labels_decision_score, pos_label=pos_label)
        classes = len(labels_ordered)
        average_precision = sklearn.metrics.average_precision_score(
            labels, labels_decision_score, average="macro")

        graph_label_str = "{}-class Precision Recall Curve -- AP: {}".format(
            classes, average_precision)

        #################### OLD WAY ####################
        # First Way
        # from parallelm.mlops.stats.graph import Graph
        #
        # p_r_curve = Graph() \
        #     .name("User Defined: Precision Recall Curve") \
        #     .set_x_series(list(recall)) \
        #     .add_y_series(label="User Defined: {}".format(graph_label_str), data=list(precision))
        #
        # p_r_curve.x_title("Recall")
        # p_r_curve.y_title("Precision")
        # mlops.set_stat(p_r_curve)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.PRECISION_RECALL_CURVE,
                       [precision, recall],
                       legend=graph_label_str)

        # OR

        # Third Way
        mlops.metrics.precision_recall_curve(y_true=labels,
                                             probas_pred=labels_decision_score,
                                             pos_label=pos_label,
                                             average="macro")

        #################### DONE NEW WAY ####################

    ############################################################################
    #################### End: Output Precision Recall Curve ####################
    ############################################################################

    #######################################################################
    #################### Start: Output Precision Score ####################
    #######################################################################

    precision_score = sklearn.metrics.precision_score(labels,
                                                      labels_pred,
                                                      pos_label=pos_label,
                                                      average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output precision score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Precision Score", precision_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.PRECISION_SCORE, data=precision_score)

    # OR

    # Third Way
    mlops.metrics.precision_score(labels,
                                  labels_pred,
                                  pos_label=pos_label,
                                  average=None)
    #################### DONE NEW WAY ####################

    ############################################################################
    #################### End: Output Precision Score ###########################
    ############################################################################

    ####################################################################
    #################### Start: Output Recall Score ####################
    ####################################################################

    recall_score = sklearn.metrics.recall_score(labels,
                                                labels_pred,
                                                pos_label=pos_label,
                                                average=None)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output recall score of the chosen model using MCenter
    # mlops.set_stat("User Defined: Recall Score", recall_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.RECALL_SCORE, data=recall_score)

    # OR

    # Third Way
    mlops.metrics.recall_score(labels,
                               labels_pred,
                               pos_label=pos_label,
                               average=None)
    #################### DONE NEW WAY ####################

    #########################################################################
    #################### End: Output Recall Score ###########################
    #########################################################################

    #####################################################################
    #################### Start: Output ROC AUC Score ####################
    #####################################################################

    # roc_auc_score is not supported for multiclass
    if len(labels_ordered) <= 2:
        roc_auc_score = sklearn.metrics.roc_auc_score(labels,
                                                      labels_decision_score)

        #################### OLD WAY ####################
        # First Way
        #
        # # Output roc auc score of the chosen model using MCenter
        # mlops.set_stat("User Defined: ROC AUC Score", roc_auc_score)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        # Second Way
        mlops.set_stat(ClassificationMetrics.ROC_AUC_SCORE, data=roc_auc_score)

        # OR

        # Third Way
        mlops.metrics.roc_auc_score(labels, labels_decision_score)
        #################### DONE NEW WAY ####################

    ###################################################################
    #################### End: Output ROC AUC Score ####################
    ###################################################################

    #################################################################
    #################### Start: Output ROC Curve ####################
    #################################################################

    # roc_curve and roc_auc_score are not supported for multiclass
    if len(labels_ordered) <= 2:
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(labels,
                                                         labels_decision_score,
                                                         pos_label=pos_label)

        roc_auc_score = sklearn.metrics.roc_auc_score(labels,
                                                      labels_decision_score)

        graph_label_str = "ROC Curve, AUC: {}".format(roc_auc_score)

        #################### OLD WAY ####################
        # First Way
        # from parallelm.mlops.stats.graph import Graph
        #
        # roc_curve = Graph() \
        #     .name("User Defined: ROC Curve") \
        #     .set_x_series(list(fpr)) \
        #     .add_y_series(label="User Defined: {}".format(graph_label_str), data=list(tpr))
        #
        # roc_curve.x_title("False Positive Rate")
        # roc_curve.y_title("True Positive Rate")
        #
        # mlops.set_stat(roc_curve)
        #################### DONE OLD WAY ####################

        #################### NEW WAY ####################
        mlops.set_stat(ClassificationMetrics.ROC_CURVE, [tpr, fpr],
                       legend=graph_label_str)

        # OR

        # Third Way
        mlops.metrics.roc_curve(y_true=labels,
                                y_score=labels_decision_score,
                                pos_label=pos_label)
        #################### DONE NEW WAY ####################

    ###############################################################
    #################### End: Output ROC Curve ####################
    ###############################################################

    #####################################################################
    #################### Start: Output Zero One Loss ####################
    #####################################################################

    zol = sklearn.metrics.zero_one_loss(labels, labels_pred)

    #################### OLD WAY ####################
    # First Way
    #
    # # Output zol of the chosen model using MCenter
    # mlops.set_stat("User Defined: Zero One Loss", zol)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClassificationMetrics.ZERO_ONE_LOSS, data=zol)

    # OR

    # Third Way
    mlops.metrics.zero_one_loss(labels, labels_pred)
    #################### DONE NEW WAY ####################

    ###################################################################
    #################### End: Output Zero One Loss ####################
    ###################################################################

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()
    # Terminate MLOPs
    mlops.done()
Example #13
def main():
    # Initialize spark and MLOps
    spark = SparkSession.builder.appName(
        "RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # parse the arguments to component
    options = parse_args()

    # Load the model, exit gracefully if model is not found
    try:
        model_rf = \
            SparkPipelineModelHelper() \
                .set_shared_context(spark_context=spark.sparkContext) \
                .set_local_path(local_path=options.input_model) \
                .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
                .load_sparkml_model()
    except Exception as e:
        print(e)
        mlops.done()
        spark.sparkContext.stop()
        exit()

    # Generate synthetic data for inference (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = 50
    num_features = 20

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))
    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    test_features = test_data[np.random.choice(test_data.shape[0],
                                               num_samples,
                                               replace=False)]
    feature_names = [ascii_lowercase[a] for a in range(num_features + 1)]

    # Create a spark dataframe from the synthetic data generated
    inferenceData = spark.createDataFrame(
        pd.DataFrame(test_features, columns=feature_names[1:num_features + 1]))

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and to
    # compare them automatically with the distributions reported during training, producing
    # a similarity score
    mlops.set_data_distribution_stat(inferenceData)

    num_samples = inferenceData.count()

    # Report the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples,
                   st.TIME_SERIES)

    # Make inference predictions
    predicted_df = model_rf.transform(inferenceData)

    # Create a bar graph with label and confidence distributions
    histogram_predictions = predicted_df.groupby("prediction").count()
    prediction_values = np.array(
        histogram_predictions.select("prediction").collect())
    prediction_counts = np.array(
        histogram_predictions.select("count").collect())

    # Report label distribution as a BarGraph using MCenter
    bar_predictions = BarGraph().name("Prediction Distribution").cols(
        prediction_values.ravel().astype(str).tolist()).data(
            prediction_counts.ravel().tolist())
    mlops.set_stat(bar_predictions)

    # Stop spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
Example #14
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: C:                           [{}]".format(pm_options.C))
    print("PM: Kernel:                      [{}]".format(pm_options.kernel))
    print("PM: Degree:                      [{}]".format(pm_options.degree))
    print("PM: Gamma:                       [{}]".format(pm_options.gamma))
    print("PM: Tolerance:                   [{}]".format(pm_options.tol))
    print("PM: Maximum iterations:          [{}]".format(pm_options.max_iter))

    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)
    # Create synthetic data using scikit learn
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_features,
                               n_informative=2,
                               n_redundant=1,
                               n_classes=3,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (num_samples, num_features))
    features = features + noisy_features

    # Create a model that should be deployed into production
    final_model = SVC(C=float(pm_options.C),
                      kernel=pm_options.kernel,
                      degree=int(pm_options.degree),
                      gamma=str(pm_options.gamma),
                      tol=float(pm_options.tol),
                      max_iter=int(pm_options.max_iter))

    final_model.fit(features, labels)

    # Accuracy for the chosen model
    accuracy = final_model.score(features, labels)
    print("Accuracy values: \n {0}".format(accuracy))

    # Label distribution in training
    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()
    # Terminate MLOPs
    mlops.done()
Example #15
def canary_comparator(options, start_time, end_time, mode):
    sc = None
    if mode == RunModes.PYSPARK:
        from pyspark import SparkContext
        sc = SparkContext(appName="canary-comparator")
        mlops.init(sc)
    elif mode == RunModes.PYTHON:
        mlops.init()
    else:
        raise Exception("Invalid mode " + mode)

    not_enough_data = False

    # Following are main and canary component names
    main_prediction_component_name = options.nodeA
    canary_prediction_component_name = options.nodeB

    main_stat_name = options.predictionHistogramA
    canary_stat_name = options.predictionHistogramB

    main_agent = utils._get_agent_id(main_prediction_component_name,
                                     options.agentA)
    canary_agent = utils._get_agent_id(canary_prediction_component_name,
                                       options.agentB)
    if main_agent is None or canary_agent is None:
        print("Invalid agent provided {} or {}".format(options.agentA,
                                                       options.agentB))
        mlops.system_alert(
            "PyException",
            "Invalid Agent {} or {}".format(options.agentA, options.agentB))
        return

    try:
        main_data_frame = mlops.get_stats(
            name=main_stat_name,
            mlapp_node=main_prediction_component_name,
            agent=main_agent,
            start_time=start_time,
            end_time=end_time)

        canary_data_frame = mlops.get_stats(
            name=canary_stat_name,
            mlapp_node=canary_prediction_component_name,
            agent=canary_agent,
            start_time=start_time,
            end_time=end_time)

        main_pdf = pd.DataFrame(main_data_frame)
        canary_pdf = pd.DataFrame(canary_data_frame)

        try:
            row1 = main_pdf.tail(1).iloc[0]
            row2 = canary_pdf.tail(1).iloc[0]
        except Exception as e:
            not_enough_data = True
            print("Not enough histograms produced in pipelines")
            raise ValueError("Not enough data to compare")

        if row1['hist_type'] != row2['hist_type']:
            raise ValueError(
                'Canary and Main pipelines do not produce histograms '
                'of the same type: {} != {}'.format(row1['hist_type'],
                                                    row2['hist_type']))

        if row1['hist_type'] == 'continuous':
            rmse = _compare_cont_hist(row1['bin_edges'], row2['bin_edges'],
                                      row1['hist_values'], row2['hist_values'])
            gg2 = MultiGraph().name("Prediction Histograms").set_categorical()

            gg2.x_title("Predictions")
            gg2.y_title("Normalized Frequency")

            gg2.add_series(label="Main",
                           x=[float(x) for x in row1['bin_edges']][:-1],
                           y=[y for y in row1['hist_values']])
            gg2.add_series(label="Canary",
                           x=[float(x) for x in row2['bin_edges']][:-1],
                           y=[y for y in row2['hist_values']])
            mlops.set_stat(gg2)

            bar1 = BarGraph().name("Main Pipeline").cols([
                "{} to {}".format(x, y)
                for (x, y) in pairwise(row1['bin_edges'])
            ]).data([x for x in row1['hist_values']])
            mlops.set_stat(bar1)

            bar2 = BarGraph().name("Canary Pipeline").cols([
                "{} to {}".format(x, y)
                for (x, y) in pairwise(row2['bin_edges'])
            ]).data([x for x in row2['hist_values']])
            mlops.set_stat(bar2)

        elif row1['hist_type'] == 'categorical':
            rmse = _compare_cat_hist(row1['bin_edges'], row2['bin_edges'],
                                     row1['hist_values'], row2['hist_values'])

            gg2 = MultiGraph().name("Prediction Histograms").set_categorical()

            gg2.x_title("Predictions")
            gg2.y_title("Normalized Frequency")

            gg2.add_series(label="Main",
                           x=row1['bin_edges'],
                           y=[y for y in row1['hist_values']])
            gg2.add_series(label="Canary",
                           x=row2['bin_edges'],
                           y=[y for y in row2['hist_values']])
            mlops.set_stat(gg2)

            bar1 = BarGraph().name("Main Pipeline").cols([
                "{}".format(x) for x in row1['bin_edges']
            ]).data([x for x in row1['hist_values']])
            mlops.set_stat(bar1)

            bar2 = BarGraph().name("Canary Pipeline").cols([
                "{}".format(x) for x in row2['bin_edges']
            ]).data([x for x in row2['hist_values']])
            mlops.set_stat(bar2)
        else:
            raise ValueError('Invalid histogram type: {}'.format(
                row1['hist_type']))

        mlops.set_stat("RMSE", rmse, st.TIME_SERIES)

        print("mlops policy {}".format(mlops.mlapp_policy))

        if mlops.mlapp_policy.canary_threshold is None:
            print("Canary health threshold not set")
            raise ValueError("Canary health threshold not set in config")

        # The following code compares the two histograms.
        # You can insert your own comparison logic here.
        if rmse > mlops.mlapp_policy.canary_threshold:
            print("Canary Alert {} > {}".format(
                rmse, mlops.mlapp_policy.canary_threshold))
            mlops.event(
                CanaryAlert(label="CanaryAlert",
                            is_healthy=False,
                            score=rmse,
                            threshold=mlops.mlapp_policy.canary_threshold))
        else:
            print("Data matches {}".format(rmse))
            mlops.event(
                CanaryAlert(label="CanaryAlert",
                            is_healthy=True,
                            score=rmse,
                            threshold=mlops.mlapp_policy.canary_threshold))

    except Exception as e:
        if not_enough_data is False:
            print("Got exception while getting stats: {}".format(e))
            mlops.system_alert(
                "PyException",
                "Got exception {}".format(traceback.format_exc()))

    if mode == RunModes.PYSPARK:
        sc.stop()
    mlops.done()
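Note: the canary comparator above relies on three helpers that are not shown in this example: pairwise, _compare_cont_hist and _compare_cat_hist. The names come from the code above; the bodies below are only a plausible sketch (RMSE over aligned, normalized histogram values), not the original ParallelM implementations.

import itertools

import numpy as np


def pairwise(iterable):
    # (e0, e1, e2, ...) -> (e0, e1), (e1, e2), ... ; used to label histogram bins.
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)


def _compare_cont_hist(edges_a, edges_b, values_a, values_b):
    # Assumes both pipelines report the same bin edges and compares bar heights.
    a = np.asarray(values_a, dtype=float)
    b = np.asarray(values_b, dtype=float)
    return float(np.sqrt(np.mean((a - b) ** 2)))


def _compare_cat_hist(cats_a, cats_b, values_a, values_b):
    # Aligns categories by name, treating a missing category as frequency 0.
    map_a = dict(zip(cats_a, values_a))
    map_b = dict(zip(cats_b, values_b))
    cats = sorted(set(map_a) | set(map_b))
    diffs = [map_a.get(c, 0.0) - map_b.get(c, 0.0) for c in cats]
    return float(np.sqrt(np.mean(np.square(diffs))))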
Example #16
def infer_loop(model, input, output_file, stats_interval, conf_thresh, conf_percent):

    output = open(output_file, "w")

    # Initialize statistics
    total_predictions = 0
    low_confidence_predictions = 0
    categories = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    prediction_hist = []
    for i in range(0, len(categories)):
        prediction_hist.append(0)

    ### MLOPS start
    # Create a bar graph and table for reporting prediction distributions and set the column names
    infer_bar = BarGraph().name("Prediction Distribution Bar Graph").cols(categories)
    infer_tbl = Table().name("Prediction Distribution Table").cols(categories)
    ### MLOPS end
    
    while True:
        try:
            sample, label = input.get_next_input()
            sample_np = ny.array(sample).reshape(1, -1)

            # The prediction is the class with the highest probability
            prediction = model.predict(sample_np)

            # Append the prediction to the output file
            output.write("{}\n".format(prediction))

            # Calculate statistics
            total_predictions += 1
            prediction_hist[int(prediction[0])] += 1

            # Report statistics
            if total_predictions % stats_interval == 0:

                # Report the prediction distribution
                for i in range(0, len(categories)):
                    print("category: {} predictions: {}".format(categories[i], prediction_hist[i]))


                ### MLOPS start


                # Show the prediction distribution as a table
                infer_tbl.add_row(str(total_predictions), prediction_hist)

                # Show the prediction distribution as a bar graph
                infer_bar.data(prediction_hist)

        except EOFError:
            # stop when we hit end of input
            # Report the stats
            mlops.set_stat(infer_tbl)
            mlops.set_stat(infer_bar)

            ### MLOPS end
            output.close()

            ### MLOPS start
            mlops.done()
            ### MLOPS end

            break
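Note: the input argument of infer_loop only needs a get_next_input() method that returns a (sample, label) pair and raises EOFError when the data is exhausted, and ny is assumed to be NumPy (import numpy as ny). A hypothetical, minimal file-backed source (illustrative only, not part of the original component):

import numpy as np


class FileInputSource(object):
    """Yields (sample, label) rows from a CSV file whose first column is the label."""

    def __init__(self, path):
        self._rows = iter(np.loadtxt(path, delimiter=","))

    def get_next_input(self):
        try:
            row = next(self._rows)
        except StopIteration:
            raise EOFError("no more input samples")
        return row[1:], int(row[0])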
Example #17
File: LRTrain.py  Project: theromis/mlhub
def main():
    pm_options = parse_args()
    print("PM: Configuration:")
    print("PM: Data file:            [{}]".format(pm_options.data_file))
    print("PM: Output model:         [{}]".format(pm_options.output_model))
    print("PM: regularization_range:         [{}]".format(
        pm_options.regularization_range))

    mlops.init()

    # Read the Samsung datafile
    dataset = pd.read_csv(pm_options.data_file)

    # Separate into features and labels
    features = dataset.iloc[:, 1:].values
    labels = dataset.iloc[:, 0].values

    # Hyper-parameter search using k-fold cross-validation
    # Applying k_fold cross validation
    regularization_range = pm_options.regularization_range.split(',')
    regularization = [
        float(regularization_var)
        for regularization_var in regularization_range
    ]
    tune_parameters = [{'C': regularization}]

    # Initialize logistic regression algorithm
    LR = LogisticRegression(class_weight='balanced',
                            multi_class='multinomial',
                            solver='lbfgs')
    clf = GridSearchCV(LR, tune_parameters, cv=5, scoring='accuracy')
    clf.fit(features, labels)
    print("best parameter = ", clf.best_params_)
    accuracy = clf.cv_results_['mean_test_score']
    print(
        'Accuracy values: \n {0} \n for Regularization values: \n{1}'.format(
            accuracy, regularization))

    ########## Start of ParallelM instrumentation ##############
    # Report Hyper-parameter Table
    tbl = Table().name("Hyper-parameter Search Results").cols(
        ["Mean accuracy from k-fold cross-validation"])
    print("length of regularization", len(regularization))
    index_max = np.argmax(accuracy)
    for a in range(0, len(regularization)):
        print("adding row", regularization[a])
        if a == index_max:
            tbl.add_row("[Best] Regularization = " + str(regularization[a]),
                        [accuracy[a]])
        else:
            tbl.add_row("Regularization = " + str(regularization[a]),
                        [accuracy[a]])
    mlops.set_stat(tbl)
    ########## End of ParallelM instrumentation ##############

    # Label distribution in training
    label_distribution = dataset['label'].value_counts()
    column_names = np.array(label_distribution.index).astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    ########## Start of ParallelM instrumentation ##############
    # Report label distribution as a BarGraph
    bar = BarGraph().name("Label Distribution").cols(
        np.array(label_distribution.index).astype(str).tolist()).data(
            label_distribution.values.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ##############

    #################### Start of ParallelM instrumentation ################
    # Report accuracy of the chosen model
    mlops.set_stat("K-fold cross-validation Accuracy", accuracy[index_max],
                   st.TIME_SERIES)
    #################### End of ParallelM instrumentation ################

    # Histogram input
    mlops.set_data_distribution_stat(dataset)

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(clf, model_file)
    model_file.close()
    mlops.done()
Example #18
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    # If Data doesn't have headers Create column names c0-cn
    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))

    input_data = input_data.cache()

    # Set both train and test data to the entire dataset
    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    # Create column names for vector assembler. Handle exclude columns for vector assembler
    exclude_cols = [] # No columns to exclude - kmeans of all columns
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    # Set the hyper-parameter search range
    k_range = pm_options.KRange.split(',')
    db_index_max = np.finfo(np.float64).max
    k_max = k_range[0]
    db_index_array = np.zeros(len(k_range))

    for index_hs in range(0, len(k_range)):
        vector_assembler = VectorAssembler(
                inputCols=input_col_names,
                outputCol="features")
        kmeans_pipe = KMeans(
            k=int(k_range[index_hs]),
            initMode="k-means||",
            initSteps=5,
            tol=1e-4,
            maxIter=100,
            featuresCol="features")
        full_pipe = [vector_assembler, kmeans_pipe]
        model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

        # Test validation and statistics collection
        ############################################################
        predicted_df = model_kmeans.transform(input_test)

        print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

        sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
        print("Sum of Errors for Kmeans = " + str(sum_errors))

        kmeans_centers = model_kmeans.stages[1].clusterCenters()
        print("Kmeans Centers: ")
        for center in kmeans_centers:
            print(center)

        # calculating stats
        ############################################################

        # Calculating Inter cluster distance
        inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))

        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                inter_cluster_distance[centerIndex1, centerIndex2] = \
                    eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])

        print("inter_cluster_distance = ", inter_cluster_distance)
        
        # Calculating Intra cluster distances and the bars for the cluster distribution
        intra_cluster_distance = np.zeros(len(kmeans_centers))
        cluster_dist = np.zeros(len(kmeans_centers))

        for centerIndex1 in range(0, len(kmeans_centers)):
            filtered_df = predicted_df.filter(predicted_df["prediction"] == centerIndex1)
            cluster_dist[centerIndex1] = filtered_df.count()
            if cluster_dist[centerIndex1] == 0:
                intra_cluster_distance[centerIndex1] = 0
            else:
                filtered_df = \
                    filtered_df.withColumn('distance',
                                           udf(eq_dist, FloatType())(col("features"),
                                                                     array([lit(v) for v in kmeans_centers[centerIndex1]])))
                intra_cluster_distance[centerIndex1] = \
                    filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

        # Calculating the Davies-Bouldin index
        ############################################################
        # R[i,j] = (S[i] + S[j])/M[i,j]
        # D[i] = max(R[i,j]) for i !=j
        # DB = (1/K) * sum(D[i])
        r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                r_index[centerIndex1, centerIndex2] = 0
                if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                    r_index[centerIndex1, centerIndex2] = \
                        (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2]) \
                        / inter_cluster_distance[centerIndex1, centerIndex2]
        d_index = np.max(r_index, axis=0)
        db_index = np.sum(d_index, axis=0) / len(kmeans_centers)
        db_index_array[index_hs] = db_index

        # Keep the hyper-parameter value with the lowest Davies-Bouldin index
        if (db_index < db_index_max):
            db_index_max = db_index
            k_max = k_range[index_hs]
            model_kmeans_max = model_kmeans
            sum_errors_max = sum_errors
            kmeans_centers_max = kmeans_centers
            inter_cluster_distance_max = inter_cluster_distance
            intra_cluster_distance_max = intra_cluster_distance
            cluster_dist_max = cluster_dist

    # PM stats
    ############################################################
    print("Optimal K = " + str(k_max))
    pm.set_stat("Optimal number of clusters", k_max, st.TIME_SERIES)

    print("Sum of Errors for Kmeans = " + str(sum_errors_max))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors_max, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index_max))
    pm.set_stat("Davies-Bouldin index", db_index_max, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(k_range)):
        tbl_col_name.append(str(k_range[j]))
    tbl = Table().name("Davies-Bouldin index for hyper parameter Search").cols(tbl_col_name)
    tbl.add_row("Davies-Bouldin index:", ["%.2f" % x for x in db_index_array])
    pm.set_stat(tbl)

    tbl_col_name = []
    for j in range(0, len(kmeans_centers_max)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers_max)):
        tbl.add_row(str(j) + ":", ["%.2f" % x for x in inter_cluster_distance_max[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance_max])
    pm.set_stat(tbl)

    if (len(kmeans_centers_max) < 6) & (len(kmeans_centers_max[0]) < 12):
        tbl_col_name1 = []
        for j in range(0, len(kmeans_centers_max[0])):
            tbl_col_name1.append(str(j))
        tbl = Table().name("Centers (for K<6, Attr<12)").cols(tbl_col_name1)
        for j in range(0, len(kmeans_centers_max)):
            tbl.add_row("center" + str(j) + ":", ["%.2f" % x for x in kmeans_centers_max[j]])
        pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Destribution").cols(tbl_col_name).data(cluster_dist_max.tolist())
    pm.set_stat(bar)


    return model_kmeans_max
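Note: both KMeans examples call an eq_dist helper, used directly on NumPy arrays and as a Spark UDF over the "features" column, but its definition is not included. A plain Euclidean distance is a reasonable assumption:

import numpy as np


def eq_dist(x1, x2):
    # Euclidean distance between two points; iterating the inputs also works
    # for Spark ML vectors passed through the UDF.
    a = np.asarray([float(v) for v in x1])
    b = np.asarray([float(v) for v in x2])
    return float(np.sqrt(np.sum((a - b) ** 2)))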
Example #19
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()
    print("PM: Configuration:")
    print("PM: Step size:                  [{}]".format(args.step_size))
    print("PM: Iterations:                 [{}]".format(args.iterations))
    print("PM: Model version:              [{}]".format(args.model_version))
    print("PM: Stats interval:             [{}]".format(args.stats_interval))
    print("PM: Save dir:                   [{}]".format(args.save_dir))

    # Initialize MLOps Library
    mlops.init()

    # print the number of iteration used by optimization algorithm
    print('Training for %i iterations' % args.iterations)

    # Create synthetic data using scikit-learn
    num_samples = 50
    num_features = 20

    features, labels = make_classification(n_samples=50,
                                           n_features=20,
                                           n_informative=2,
                                           n_redundant=1,
                                           n_classes=3,
                                           n_clusters_per_class=1,
                                           random_state=42)

    # Add noise to the data
    noisy_features = np.random.uniform(0, 5) * np.random.normal(
        0, 1, (num_samples, num_features))
    features = features + noisy_features

    num_features = (features.shape[1])
    num_labels = len(np.unique(labels))

    # One-hot encode labels for all data
    onehot_labels = np.eye(num_labels)[labels]

    # Label distribution in training
    value, counts = np.unique(labels, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(features)

    # Algorithm parameters parsed from arguments
    learning_rate = args.step_size
    training_epochs = args.iterations
    display_step = args.stats_interval

    # tf Graph Input
    x = tf.placeholder(tf.float32, [None, num_features], name="features")
    y = tf.placeholder(tf.float32, [None, num_labels], name="labels")

    # Set model weights
    W = tf.Variable(tf.zeros([num_features, num_labels]))
    b = tf.Variable(tf.zeros([num_labels]))

    # Store values for saving model
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')

    # Construct model
    pred = tf.nn.softmax(tf.matmul(x, W) + b, name="predictions")  # Softmax

    # Minimize error using cross entropy
    cost = tf.reduce_mean(-tf.reduce_sum(y *
                                         tf.log(pred), reduction_indices=1))

    # Gradient Descent
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

    # Evaluation
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(pred, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # Start timer
    training_start_time = time.time()

    # Initialize the variables in a tf session
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    iteration_array = []
    cost_array = []
    accuracy_array = []

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0
        temp, c, a = sess.run([optimizer, cost, accuracy],
                              feed_dict={
                                  x: features,
                                  y: onehot_labels
                              })
        # Compute average loss
        avg_cost += c / num_samples
        # Display logs per epoch step
        if (epoch + 1) % display_step == 0:
            iteration_array.append(epoch)
            cost_array.append(avg_cost)
            accuracy_array.append(float(a))
            print("accuracy", a)
            print("Epoch:", '%04d' % (epoch + 1), "cost=",
                  "{:.9f}".format(avg_cost))

    # Plot the cost function using MCenter
    gg = Graph().name("Cost function across epochs").set_x_series(
        iteration_array).add_y_series(label="Cost Function Across Iterations",
                                      data=cost_array)
    gg.x_title("Average Cost")
    gg.y_title('Iterations')
    mlops.set_stat(gg)

    # Plot the accuracy function using MCenter
    gg1 = Graph().name("Accuracy across epochs").set_x_series(
        iteration_array).add_y_series(label="Accuracy Across Iterations",
                                      data=accuracy_array)
    gg1.x_title("Accuracy")
    gg1.y_title('Iterations')
    mlops.set_stat(gg1)

    # Plot accuracy and cost across epochs using MCenter
    mg = MultiGraph().name("Cost and Accuracy Progress Across Epochs")
    mg.add_series(x=iteration_array,
                  label="Cost Function Across Iterations",
                  y=cost_array)
    mg.add_series(x=iteration_array,
                  label="Accuracy across epochs",
                  y=accuracy_array)
    mlops.set_stat(mg)

    # Plot final cost and accuracy in this session using MCenter
    mlt = MultiLineGraph().name("Final Accuracy and Cost").labels(
        ["Cost", "Accuracy"])
    mlt.data([cost_array[-1], accuracy_array[-1]])
    mlops.set_stat(mlt)

    # Save the model
    export_path = args.save_dir
    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    values, indices = tf.nn.top_k(y, num_labels)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(
        tf.constant([str(i) for i in range(num_labels)]))
    prediction_classes = table.lookup(tf.to_int64(indices))

    # Build the signature_def_map.
    classification_inputs = tf.saved_model.utils.build_tensor_info(
        serialized_tf_example)
    classification_outputs_classes = tf.saved_model.utils.build_tensor_info(
        prediction_classes)
    classification_outputs_scores = tf.saved_model.utils.build_tensor_info(
        values)

    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                classification_inputs
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                classification_outputs_classes,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                classification_outputs_scores
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME
        ))

    tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
    tensor_info_y = tf.saved_model.utils.build_tensor_info(y)

    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': tensor_info_x},
            outputs={'outputs': tensor_info_y},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    )

    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')

    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images':
            prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save(as_text=args.use_text)
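Note: add_parameters(parser) is referenced here and in Example #26 but not shown. A hypothetical version covering the attributes these examples actually read (flag names and defaults are illustrative assumptions):

def add_parameters(parser):
    parser.add_argument("--step_size", type=float, default=0.01,
                        help="Gradient descent step size")
    parser.add_argument("--iterations", type=int, default=100,
                        help="Number of training iterations")
    parser.add_argument("--model_version", default="1",
                        help="Version tag for the exported model")
    parser.add_argument("--stats_interval", type=int, default=10,
                        help="Report statistics every N epochs")
    parser.add_argument("--save_dir", default="/tmp/tf_model",
                        help="SavedModel export directory")
    parser.add_argument("--use_text", action="store_true",
                        help="Export the SavedModel as text")
    parser.add_argument("--model_dir", default=None,
                        help="SavedModel directory to load (inference)")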
Example #20
    def _prep_and_infer(self, df_dataset):
        # Get number of features
        self.num_features = df_dataset.shape[1]
        # Get number of samples
        self.num_samples = df_dataset.shape[0]
        # Get input model
        self.input_model = self._params["input-model"]

        self._logger.info("PM: Configuration:")
        self._logger.info("PM: # Sample:                    [{}]".format(
            self.num_samples))
        self._logger.info("PM: # Features:                  [{}]".format(
            self.num_features))
        self._logger.info("PM: # Input-Model:               [{}]".format(
            self.input_model))

        # Initialize MLOps Library
        mlops.init()
        # Load the model
        if self.input_model is not None:
            try:
                filename = self._params["input-model"]
                model_file_obj = open(filename, 'rb')
                mlops.set_stat("# Model Files Used", 1)
            except Exception as e:
                #self._logger.error("Model Not Found")
                self._logger.error("Got Exception: {}".format(e))
                mlops.set_stat("# Model Files Used", 0)
                mlops.done()
                return 0

        final_model = pickle.load(model_file_obj)
        features = df_dataset

        # Output Health Statistics to MCenter
        # MLOps API to report the distribution statistics of each feature in the data
        # and compare it automatically with the ones reported during training
        mlops.set_data_distribution_stat(features)

        # Output the number of samples being processed using MCenter
        mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features),
                       st.TIME_SERIES)

        # Accuracy for the chosen model
        pred_labels = final_model.predict(features)
        pred_probs = final_model.predict_proba(features)

        self._logger.info("Pred Labels: {}".format(
            pred_labels))  # Remove printout can be huge
        self._logger.info("Pred Probabilities: {}".format(
            pred_probs))  # Remove printout can be huge

        # Pred Label distribution
        pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
        pred_label_distribution = np.asarray((pred_value, pred_counts)).T
        # pred_column_names = pred_value.astype(str).tolist()
        self._logger.info(
            "Pred Label distributions: \n {}".format(pred_label_distribution))

        # Output Pred label distribution as a BarGraph using MCenter
        pred_bar = BarGraph().name("Pred Label Distribution").cols(
            (pred_label_distribution[:, 0]).astype(str).tolist()).data(
                (pred_label_distribution[:, 1]).tolist())
        mlops.set_stat(pred_bar)

        # Pred Label confidence per label
        label_number = len(pred_counts)
        average_confidence = np.zeros(label_number)
        max_pred_probs = pred_probs.max(axis=1)
        for i in range(0, label_number):
            index_class = np.where(pred_labels == i)[0]
            self._logger.info("np.sum(confidence[index_class]) {}".format(
                np.sum(max_pred_probs[index_class])))
            self._logger.info("counts_elements[i] {}".format(pred_counts[i]))
            if pred_counts[i] > 0:
                average_confidence[i] = np.sum(
                    max_pred_probs[index_class]) / (float(pred_counts[i]))
            else:
                average_confidence[i] = 0

        # BarGraph showing confidence per class
        pred_values1 = [str(i) for i in pred_value]
        bar = BarGraph().name("Average Confidence Per Class").cols(
            pred_values1).data(average_confidence.tolist())
        mlops.set_stat(bar)
        # Terminate MLOPs
        mlops.done()

        df_result = pd.concat([
            df_dataset,
            pd.DataFrame({'predict': pred_labels}),
            pd.DataFrame({
                'probs-0': pred_probs[:, 0],
                'probs-1': pred_probs[:, 1]
            })
        ],
                              axis=1)

        df_result.insert(0,
                         'idx', [x for x in range(1, df_result.shape[0] + 1)],
                         allow_duplicates=False)

        return df_result
Example #21
def main():
    pm_options = parse_args()
    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: # Model File:                [{}]".format(
        pm_options.input_model))

    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data  # use all columns as features

    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            n_classes=2,  # binary classification only!
            random_state=42)

        # Randomly decide whether to add noise to the data
        import random
        if random.randint(1, 21) % 2 == 0:
            print("Adding Random Noise!")

            noisy_features = np.random.uniform(0, 1) * \
                             np.random.normal(0, 1,
                                              (num_samples, num_features))
            X = X + noisy_features

    # Separate into features and labels
    features = X

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    # and compare it automatically with the ones reported during training
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features),
                   st.TIME_SERIES)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(features)
    pred_probs = final_model.predict_proba(features)

    print("Pred Labels: ", pred_labels)  # Remove printout can be huge
    print("Pred Probabilities: ", pred_probs)  # Remove printout can be huge

    # Pred Label distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Pred Label distributions: \n {0}".format(pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Pred Label Distribution").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
            (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # Pred Label confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        print(" np.sum(confidence[index_class])",
              np.sum(max_pred_probs[index_class]))
        print("counts_elements[i] ", pred_counts[i])
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(
                max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0

    # BarGraph showing confidence per class
    pred_values1 = [str(i) for i in pred_value]
    bar = BarGraph().name("Average Confidence Per Class").cols(
        pred_values1).data(average_confidence.tolist())
    mlops.set_stat(bar)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[pred_labels == 1],
                  max_pred_probs[pred_labels == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue

    print("KS values: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    if not np.isnan(ks_stat):
        print("printing KS_stat ")
        mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)
    else:
        print("not printing KS_stat ")

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[pred_labels == 1],
                                   max_pred_probs[pred_labels == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])

    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1

    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Terminate MLOPs
    mlops.done()
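Note: get_psi is used here and in Example #22 to compute the Population Stability Index between two score populations, but it is not defined in these examples. A sketch under the standard definition, PSI = sum over segments of (curr% - base%) * ln(curr% / base%); the column layout mirrors the "PSI Stats" table above, though the original implementation may differ:

import numpy as np
import pandas as pd


def get_psi(base_scores, curr_scores, num_bins=10):
    base = np.asarray(base_scores, dtype=float)
    curr = np.asarray(curr_scores, dtype=float)

    # Segment boundaries are taken from the base population's range.
    edges = np.linspace(base.min(), base.max(), num_bins + 1)
    edges[0], edges[-1] = -np.inf, np.inf

    rows = []
    total_psi = 0.0
    for lower, upper in zip(edges[:-1], edges[1:]):
        base_pop = int(((base > lower) & (base <= upper)).sum())
        curr_pop = int(((curr > lower) & (curr <= upper)).sum())
        # Clip percentages away from zero so the log term stays finite.
        base_pct = max(base_pop / float(len(base)), 1e-6)
        curr_pct = max(curr_pop / float(len(curr)), 1e-6)
        segment_psi = (curr_pct - base_pct) * np.log(curr_pct / base_pct)
        total_psi += segment_psi
        rows.append([base_pop, curr_pop, lower, upper,
                     base_pct, curr_pct, segment_psi])

    psi_table = pd.DataFrame(rows, columns=["Base Pop", "Curr Pop", "Lower Bound",
                                            "Upper Bound", "Base Percent",
                                            "Curr Percent", "Segment PSI"])
    return total_psi, psi_table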
Example #22
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))

    print("PM: # Validation Split:          [{}]".format(
        pm_options.validation_split))

    print("PM: # AUC Threshold:             [{}]".format(
        pm_options.auc_threshold))
    print("PM: # KS Threshold:              [{}]".format(
        pm_options.ks_threshold))
    print("PM: # PSI Threshold:             [{}]".format(
        pm_options.psi_threshold))

    print("PM: # Estimators:                [{}]".format(
        pm_options.n_estimators))
    print("PM: # Max Depth:                 [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate:             [{}]".format(
        pm_options.learning_rate))
    print("PM: # Min Child Weight:          [{}]".format(
        pm_options.min_child_weight))
    print("PM: # Objective:                 [{}]".format(pm_options.objective))
    print("PM: # Gamma:                     [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step:            [{}]".format(
        pm_options.max_delta_step))
    print("PM: # Subsample:                 [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha:                 [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda:                [{}]".format(
        pm_options.reg_lambda))
    print("PM: # Scale Pos Weight:          [{}]".format(
        pm_options.scale_pos_weight))

    print("PM: # Input File:                [{}]".format(
        pm_options.input_file))
    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data[:, 1:]  # select columns 1 through end
        y = data[:, 0]

    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            # binary classification only!
            n_classes=2,
            random_state=42)

        print("Adding Random Noise!")

        noisy_features = np.random.uniform(0, 1) * \
                         np.random.normal(0, 1,
                                          (num_samples, num_features))
        X = X + noisy_features

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(pm_options.validation_split), random_state=42)

    import xgboost as xgb

    # Create a model that should be deployed into production
    final_model = xgb.XGBClassifier(
        max_depth=int(pm_options.max_depth),
        min_child_weight=int(pm_options.min_child_weight),
        learning_rate=float(pm_options.learning_rate),
        n_estimators=int(pm_options.n_estimators),
        silent=True,
        objective=str(pm_options.objective),
        gamma=float(pm_options.gamma),
        max_delta_step=int(pm_options.max_delta_step),
        subsample=float(pm_options.subsample),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(pm_options.reg_alpha),
        reg_lambda=float(pm_options.reg_lambda),
        scale_pos_weight=float(pm_options.scale_pos_weight),
        seed=1,
        missing=None)

    final_model.fit(X_train, y_train)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(X_train)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    print("Pred Labels: ", pred_labels)
    print("Pred Probabilities: ", pred_probs)

    accuracy = accuracy_score(y_test, pred_labels)
    print("Accuracy values: \n {0}".format(accuracy))
    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    # column_names = value.astype(str).tolist()
    print("Validation Actual Label distributions: \n {0}".format(
        label_distribution))

    # Output Label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Validation Actual Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
            (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Validation Prediction Label Distributions: \n {0}".format(
        pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name(
        "Validation Prediction Label Distributions").cols(
            (pred_label_distribution[:, 0]).astype(str).tolist()).data(
                (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # ROC for the chosen model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("ROC AUC values: \n {}".format(roc_auc))

    # Output ROC of the chosen model using MCenter
    mlops.set_stat("ROC AUC", roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(
                min_auc_requirement, roc_auc))

    # ROC Curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name(
        "Receiver Operating Characteristic").set_continuous()
    cg.add_series(label='Random Curve', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='ROC Curve (Area = {0:0.2f})'.format(roc_auc),
                  x=fpr.tolist(),
                  y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    max_pred_probs = pred_probs.max(axis=1)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[y_test == 1], max_pred_probs[y_test == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue

    print("KS values: \n Statistics: {} \n pValue: {}\n".format(
        ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(
                max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test == 1],
                                   max_pred_probs[y_test == 0])

    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent",
        "Curr Percent", "Segment PSI"
    ])

    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1

    mlops.set_stat(psi_table_stat)

    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(
                min_psi_requirement, total_psi))

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()
    # Terminate MLOPs
    mlops.done()
Example #23
                                                      "l2"]).data([5, 16])
    mlops.set_stat(mlt)

    # Example of sending a table to pm system.
    # Multi-line graphs
    mlt = MultiLineGraph().name("Multi Line").labels(["l1",
                                                      "l2"]).data([5, 16])
    mlops.set_stat(mlt)

    # Table example
    tbl = Table().name("MyTable").cols(["", "Date"])
    tbl.add_row(["line 1", "2001Q1"])
    tbl.add_row(["line 2", "2014Q3"])
    mlops.set_stat(tbl)

    bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd",
                                         "ee"]).data([10, 15, 12, 9, 8])
    mlops.set_stat(bar)

    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    def f(_):
        x = random() * 2 - 1
        y = random() * 2 - 1
        return 1 if x**2 + y**2 <= 1 else 0

    count = spark.sparkContext.parallelize(range(1, n + 1),
                                           partitions).map(f).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))

    spark.stop()
Example #24
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv").option(
        "header", pm_options.with_headers).option(
            "ignoreLeadingWhiteSpace",
            "true").option("ignoreTrailingWhiteSpace", "true").option(
                "inferschema",
                "true").load(pm_options.data_file)).repartition(10)

    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(
                column_names_all[col_index], 'c' + str(col_index))

    input_data = input_data.cache()

    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names,
                                       outputCol="features")
    kmeans_pipe = KMeans(k=int(pm_options.K),
                         initMode="k-means||",
                         initSteps=2,
                         tol=1e-4,
                         maxIter=100,
                         featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    # Test validation and statistics collection
    ############################################################
    predicted_df = model_kmeans.transform(input_test)

    print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

    sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
    print("Sum of Errors for Kmeans = " + str(sum_errors))

    # Shows the result.
    kmeans_centers = model_kmeans.stages[1].clusterCenters()
    print("Kmeans Centers: ")
    for center in kmeans_centers:
        print(center)

    # calculating stats
    ############################################################

    # Calculating Inter cluster distance
    inter_cluster_distance = np.zeros(
        (len(kmeans_centers), len(kmeans_centers)))

    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            inter_cluster_distance[centerIndex1, centerIndex2] =\
                eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])

    print("inter_cluster_distance = ", inter_cluster_distance)
    # Calculating Intra cluster distances and the bars for the cluster distribution
    intra_cluster_distance = np.zeros(len(kmeans_centers))
    cluster_dist = np.zeros(len(kmeans_centers))

    for centerIndex1 in range(0, len(kmeans_centers)):
        filtered_df = predicted_df.filter(
            predicted_df["prediction"] == centerIndex1)
        cluster_dist[centerIndex1] = filtered_df.count()
        if cluster_dist[centerIndex1] == 0:
            intra_cluster_distance[centerIndex1] = 0
        else:
            filtered_df =\
                filtered_df.withColumn('distance',
                                       udf(eq_dist, FloatType())(col("features"),
                                            array([lit(v) for v in kmeans_centers[centerIndex1]])))
            intra_cluster_distance[centerIndex1] =\
                filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

    # Calculating the Davies-Bouldin index
    ############################################################
    # R[i,j] = (S[i] + S[j])/M[i,j]
    # D[i] = max(R[i,j]) for i !=j
    # DB = (1/K) * sum(D[i])
    r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            r_index[centerIndex1, centerIndex2] = 0
            if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                r_index[centerIndex1, centerIndex2] =\
                    (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2])\
                    / inter_cluster_distance[centerIndex1, centerIndex2]
    d_index = np.max(r_index, axis=0)
    db_index = np.sum(d_index, axis=0) / len(kmeans_centers)

    # pmml model generation
    ############################################################
    pmml_file = toPMMLBytes(spark, input_train, model_kmeans).decode("UTF-8")

    # PM stats
    ############################################################
    print("Sum of Errors for Kmeans = " + str(sum_errors))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index))
    pm.set_stat("Davies-Bouldin index", db_index, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(kmeans_centers)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row(
            str(j) + ":", ["%.2f" % x for x in inter_cluster_distance[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance])
    pm.set_stat(tbl)

    tbl_col_name1 = []
    for j in range(0, len(kmeans_centers[0])):
        tbl_col_name1.append(str(j))
    tbl = Table().name("Centers (for K<6, Attr<11)").cols(tbl_col_name1)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row("center" + str(j) + ":",
                    ["%.2f" % x for x in kmeans_centers[j]])
    pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Destribution").cols(tbl_col_name).data(
        cluster_dist.tolist())
    pm.stat(bar)

    print("PM: generating histogram from data-frame and model")
    print("PM:" + pmml_file)
    try:
        pm.set_data_distribution_stat(data=input_train, model=pmml_file)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.stat")
        print(e)

    return pmml_file
Example #25
def main():
    # Initialize spark and MLOps
    spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()
    mlops.init(spark.sparkContext)

    # parse the arguments to component
    options = parse_args()
    print("PM: Configuration:")
    print("PM: Number of trees:                [{}]".format(options.num_trees))
    print("PM: Maximum depth:                  [{}]".format(options.max_depth))
    print("PM: Output model:                   [{}]".format(options.output_model))
    print("PM: Temp shared path:               [{}]".format(options.temp_shared_path))

    # Generate synthetic data using scikit learn
    num_samples = 50
    num_features = 20
    num_classes = 3
    X, y = make_classification(n_samples=num_samples, n_features=num_features, n_informative=2, n_redundant=1,
                               n_classes=num_classes, n_clusters_per_class=1, random_state=42)
    X = X + np.random.uniform(0, 5) * np.random.normal(0, 1, (num_samples, num_features))

    feature_names = ["".join(ascii_lowercase[a]) for a in range(num_features + 1)]
    feature_names[0] = "label"

    # Create a spark dataframe from the synthetic data generated 
    trainingData = spark.createDataFrame(
        pd.DataFrame(np.concatenate((y.reshape(-1, 1), X), axis=1), columns=feature_names))

    # Histogram of label distribution
    value, counts = np.unique(y, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    column_names = value.astype(str).tolist()
    print("Label distributions: \n {0}".format(label_distribution))

    # Output label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Label Distribution").cols((label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Output Health Statistics to MCenter
    # Report features whose distribution should be compared during inference
    mlops.set_data_distribution_stat(trainingData)

    # Fit a random forest classification model
    assembler = VectorAssembler(inputCols=feature_names[1:num_features + 1], outputCol="features")
    layers = [num_features, 5, 4, num_classes]
    classifier = RandomForestClassifier(numTrees=int(options.num_trees), maxDepth=int(options.max_depth))

    pipeline = Pipeline(stages=[assembler, classifier])
    model = pipeline.fit(trainingData)
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute training error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Report accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Save the spark model 
    SparkPipelineModelHelper() \
        .set_shared_context(spark_context=spark.sparkContext) \
        .set_local_path(local_path=options.output_model) \
        .set_shared_path_prefix(shared_path_prefix=options.temp_shared_path) \
        .save_sparkml_model(model)

    # Stop spark context and MLOps
    spark.sparkContext.stop()
    mlops.done()
Example #26
def main(args):
    # Parse arguments
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    # Initialize MLOps Library
    mlops.init()

    # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = 50
    num_features = 20

    np.random.seed(0)
    g = np.random.normal(0, 1, (num_samples, num_features))
    p = np.random.poisson(0.7, (num_samples, num_features))
    b = np.random.beta(2, 2, (num_samples, num_features))

    test_data = np.concatenate((g, p, b), axis=0)
    np.random.seed()
    features = test_data[np.random.choice(test_data.shape[0],
                                          num_samples,
                                          replace=False)]

    # Start tensorflow session
    sess = tf.InteractiveSession()
    tag_set = ["serve"]
    if args.model_dir is not None:
        try:
            print("args.model_dir = ", args.model_dir)
            tf.saved_model.loader.load(sess, tag_set, args.model_dir)
        except Exception as e:
            print("Model not found")
            print("Got exception: " + str(e))
            return 0

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and compare it automatically with the ones
    # reported during training to generate the similarity score.
    mlops.set_data_distribution_stat(data=features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features))

    graph = tf.get_default_graph()
    x = graph.get_tensor_by_name("features:0")
    y_pred = graph.get_tensor_by_name("predictions:0")
    predictions = sess.run(y_pred, {x: features})
    print('predictions', np.array(predictions))

    # Output prediction distribution as a BarGraph using MCenter
    predict_int = np.argmax(predictions, axis=1)
    unique, counts = np.unique(predict_int, return_counts=True)
    counts = list(map(int, counts))
    x_series = list(map(str, unique))
    mlt = BarGraph().name("Prediction Distribution").cols(x_series).data(
        list(counts))
    mlops.set_stat(mlt)

    # Show average prediction probability value for each prediction
    num_labels = len(np.unique(predict_int))
    probability = np.zeros((num_labels, ))
    for a in range(0, num_labels):
        temp = predictions[np.argmax(predictions, axis=1) == a, :]
        print(temp)
        probability[a] = np.mean(temp[:, a])
    print("probability", list(np.squeeze(probability)))

    # Plot average probability in each class using MCenter
    bg = BarGraph().name("Probability of Each Label").cols(x_series).data(
        list(np.squeeze(probability)))
    mlops.set_stat(bg)

    # Terminate MLOps
    mlops.done()
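The last block of Example #26 averages, for each predicted class, the probability the model assigned to that class over the samples predicted as that class. A small self-contained numpy sketch of the same computation on toy data (the values are made up for illustration):

import numpy as np

# Toy prediction matrix: 4 samples x 3 classes
predictions = np.array([[0.7, 0.2, 0.1],
                        [0.1, 0.8, 0.1],
                        [0.6, 0.3, 0.1],
                        [0.2, 0.2, 0.6]])
predict_int = np.argmax(predictions, axis=1)
num_labels = len(np.unique(predict_int))
probability = np.zeros((num_labels,))
for a in range(num_labels):
    rows = predictions[predict_int == a, :]
    # Mean probability of class a among the samples predicted as class a
    probability[a] = np.mean(rows[:, a])
print(list(probability))  # [0.65, 0.8, 0.6] for the toy data above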
Example #27
def main():
    pm_options = parse_args()
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    classifier = pickle.load(file_obj)

    # Load the data
    test_dataset = pd.read_csv(pm_options.input_file)

    mlops.set_data_distribution_stat(test_dataset)
    # Extract numpy array
    test_features = test_dataset.values
    # Predict labels
    result = classifier.predict(test_features)
    # Predict probability
    class_probability = classifier.predict_proba(test_features)
    maximum_prob = np.max(class_probability, axis=1)

    # Tag samples that are below a certain probability and write to a file
    confidence = 0.8
    low_prob_samples = test_features[np.where(maximum_prob < confidence)]
    low_prob_predictions = result[np.where(maximum_prob < confidence)]
    unique_elements_low, counts_elements_low = np.unique(low_prob_predictions,
                                                         return_counts=True)
    unique_elements_low = [str(i) for i in unique_elements_low]
    print("Low confidence predictions: \n {0} \n with frequency {1}".format(
        unique_elements_low, counts_elements_low))

    ########## Start of ParallelM instrumentation ##############
    # BarGraph showing distribution of low confidence labels
    bar = BarGraph().name("Low confidence label distribution").cols(
        unique_elements_low).data(counts_elements_low.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ################

    # Samples with high probability
    high_prob_samples = test_features[np.where(maximum_prob >= confidence)]
    high_prob_predictions = result[np.where(maximum_prob >= confidence)]
    unique_elements_high, counts_elements_high = np.unique(
        high_prob_predictions, return_counts=True)
    unique_elements_high = [str(i) for i in unique_elements_high]
    print("High confidence predictions: \n {0} \n with frequency {1}".format(
        unique_elements_high, counts_elements_high))

    ########## Start of ParallelM instrumentation ##############
    # BarGraph showing distribution of high confidence labels
    bar = BarGraph().name("High confidence label distribution").cols(
        unique_elements_high).data(counts_elements_high.tolist())
    mlops.set_stat(bar)
    ########## End of ParallelM instrumentation ################

    mlops.done()
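Example #27's comment mentions writing the low-confidence samples to a file, but the snippet only reports their label distribution. A minimal sketch of that missing step, reusing the `low_prob_samples` and `low_prob_predictions` arrays computed above (the output path is a placeholder, not part of the original example):

import pandas as pd

# Assumes low_prob_samples, low_prob_predictions and test_dataset from the example above
low_conf_df = pd.DataFrame(low_prob_samples, columns=test_dataset.columns)
low_conf_df["prediction"] = low_prob_predictions
low_conf_df.to_csv("/tmp/low_confidence_samples.csv", index=False)  # placeholder path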
Example #28
def main():
    pm_options = parse_args()
    # Initialize MLOps Library
    mlops.init()
    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            file_obj = open(filename, 'rb')
            mlops.set_stat("model_file", 1)
        except Exception as e:
            print("Model not found")
            print("Got exception: {}".format(e))
            mlops.set_stat("model_file", 0)
            mlops.done()
            return 0

    regression = pickle.load(file_obj)

    # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
    num_samples = int(pm_options.num_samples)
    num_features = int(pm_options.num_features)

    mae_threshold = float(pm_options.threshold)

    # Create synthetic data using scikit learn
    X, y = make_regression(n_samples=num_samples,
                           n_features=num_features,
                           n_informative=2,
                           random_state=42)

    # Shift labels so they are all non-negative
    y = y - np.min(y)

    # Separate into features and labels
    features = X
    labels = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (num_samples, num_features))
    features = features + noisy_features

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and compare it automatically with the ones
    # reported during training to generate the similarity score.
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, num_samples,
                   st.TIME_SERIES)

    # Predict labels
    labels_pred = regression.predict(features)

    hist_pred, bin_edges_pred = np.histogram(labels_pred)

    # Output prediction label distribution as a BarGraph using MCenter
    pred_label_bar = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols(bin_edges_pred.astype(str).tolist()) \
        .data(hist_pred.tolist()) \
        .as_continuous()

    mlops.set_stat(pred_label_bar)

    ##########################################################################
    #################### Start: Output Sample/Conversions ####################
    ##########################################################################
    mae = np.absolute(labels_pred - labels)
    conversions = sum(i < mae_threshold for i in mae)
    samples = num_samples

    mlops.set_stat("samples", samples)

    mlops.set_stat("conversions", conversions)

    ########################################################################
    #################### End: Output Sample/Conversions ####################
    ########################################################################

    # Terminate MLOPs
    mlops.done()
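The conversions count above loops over `mae` in pure Python; an equivalent vectorized form with numpy gives the same result and scales better for large sample counts:

import numpy as np

mae = np.absolute(labels_pred - labels)
# Number of predictions whose absolute error is below the threshold
conversions = int(np.sum(mae < mae_threshold))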
Example #29
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample:                    [{}]".format(
        pm_options.num_samples))
    print("PM: # Features:                  [{}]".format(
        pm_options.num_features))
    print("PM: # Classes:                   [{}]".format(
        pm_options.num_cluster))

    print("PM: Init:                        [{}]".format(pm_options.init))
    print("PM: N Init:                      [{}]".format(pm_options.n_init))
    print("PM: Tolerance:                   [{}]".format(pm_options.tol))
    print("PM: Maximum Iterations:          [{}]".format(pm_options.max_iter))
    print("PM: Pre-Compute Distances:       [{}]".format(
        pm_options.precompute_distances))
    print("PM: Algorithm:                   [{}]".format(pm_options.algorithm))

    print("PM: Output model:                [{}]".format(
        pm_options.output_model))

    # Initialize MLOps Library
    mlops.init()

    n_samples = int(pm_options.num_samples)
    n_features = int(pm_options.num_features)
    n_clusters = int(pm_options.num_cluster)

    init = str(pm_options.init)
    n_init = int(pm_options.n_init)
    max_iter = int(pm_options.max_iter)
    tol = float(pm_options.tol)
    precompute_distances = str(pm_options.precompute_distances)
    algorithm = str(pm_options.algorithm)
    verbose = 0
    n_jobs = 1

    # Create synthetic data using scikit learn
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=10,
                               n_redundant=1,
                               n_classes=n_clusters,
                               n_clusters_per_class=1,
                               random_state=42)

    # Separate into features and labels
    features = X
    labels_true = y

    # Add noise to the data
    noisy_features = np.random.uniform(0, 10) * \
                     np.random.normal(0, 1,
                                      (n_samples, n_features))
    features = features + noisy_features

    kmeans_model = KMeans(n_clusters=n_clusters,
                          init=init,
                          n_init=n_init,
                          max_iter=max_iter,
                          tol=tol,
                          precompute_distances=precompute_distances,
                          verbose=verbose,
                          random_state=None,
                          copy_x=True,
                          n_jobs=n_jobs,
                          algorithm=algorithm).fit(features, labels_true)

    mlops.set_stat("User Defined: Training Inertia", kmeans_model.inertia_)
    mlops.set_stat("User Defined: Training Iteration", kmeans_model.n_iter_)

    value, counts = np.unique(labels_true, return_counts=True)
    label_distribution = np.asarray((value, counts)).T

    # Output actual label distribution as a BarGraph using MCenter
    bar_true = BarGraph().name("User Defined: Actual Label Distribution") \
        .cols((label_distribution[:, 0]).astype(str).tolist()) \
        .data((label_distribution[:, 1]).tolist())
    mlops.set_stat(bar_true)

    # prediction labels
    labels_pred = kmeans_model.predict(features)

    value_pred, counts_pred = np.unique(labels_pred, return_counts=True)
    label_distribution_pred = np.asarray((value_pred, counts_pred)).T

    # Output prediction label distribution as a BarGraph using MCenter
    bar_pred = BarGraph().name("User Defined: Prediction Label Distribution") \
        .cols((label_distribution_pred[:, 0]).astype(str).tolist()) \
        .data((label_distribution_pred[:, 1]).tolist())
    mlops.set_stat(bar_pred)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(features)

    ###########################################################################
    #################### Start: Adjusted Mutual Info Score ####################
    ###########################################################################

    adjusted_mutual_info_score = sklearn.metrics \
        .adjusted_mutual_info_score(labels_true=labels_true,
                                    labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Adjusted Mutual Info Score", adjusted_mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.ADJUSTED_MUTUAL_INFO_SCORE,
                   adjusted_mutual_info_score)

    # OR

    # Third Way
    mlops.metrics.adjusted_mutual_info_score(labels_true=labels_true,
                                             labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #########################################################################
    #################### End: Adjusted Mutual Info Score ####################
    #########################################################################

    ####################################################################
    #################### Start: Adjusted Rand Score ####################
    ####################################################################

    adjusted_rand_score = sklearn.metrics \
        .adjusted_rand_score(labels_true=labels_true,
                             labels_pred=labels_pred)

    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Adjusted Rand Score", adjusted_rand_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.ADJUSTED_RAND_SCORE, adjusted_rand_score)

    # OR

    # Third Way
    mlops.metrics.adjusted_rand_score(labels_true=labels_true,
                                      labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##################################################################
    #################### End: Adjusted Rand Score ####################
    ##################################################################

    #######################################################################
    #################### Start: Calinski Harabaz Score ####################
    #######################################################################

    calinski_harabaz_score = sklearn.metrics \
        .calinski_harabaz_score(X=features, labels=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Calinski Harabaz Score", calinski_harabaz_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.CALINSKI_HARABAZ_SCORE,
                   calinski_harabaz_score)

    # OR

    # Third Way
    mlops.metrics.calinski_harabaz_score(X=features, labels=labels_pred)
    #################### DONE NEW WAY ####################

    #####################################################################
    #################### End: Calinski Harabaz Score ####################
    #####################################################################

    ###################################################################
    #################### Start: Completeness Score ####################
    ###################################################################

    completeness_score = sklearn.metrics \
        .completeness_score(labels_true=labels_true, labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Completeness Score", completeness_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.COMPLETENESS_SCORE, completeness_score)

    # OR

    # Third Way
    mlops.metrics.completeness_score(labels_true=labels_true,
                                     labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Completeness Score ####################
    #################################################################

    ###################################################################
    #################### Start: Contingency Matrix ####################
    ###################################################################

    contingency_matrix = sklearn.metrics.cluster \
        .contingency_matrix(labels_true, labels_pred)

    # list of sorted labels. i.e. [0, 1, 2, ..]
    pred_labels_list = sorted(set(labels_pred))
    true_labels_list = sorted(set(labels_true))

    #################### OLD WAY ####################
    # First Way
    # from parallelm.mlops.stats.table import Table
    #
    # cm_cols_ordered_string = [str(i) for i in pred_labels_list]
    # cm_rows_ordered_string = [str(i) for i in true_labels_list]
    # cm_matrix = Table().name("User Defined: Contingency Matrix").cols(cm_cols_ordered_string)
    #
    # for index in range(len(contingency_matrix)):
    #     cm_matrix.add_row(cm_rows_ordered_string[index], list(contingency_matrix[index]))
    #
    # mlops.set_stat(cm_matrix)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.CONTINGENCY_MATRIX,
                   data=contingency_matrix,
                   true_labels=true_labels_list,
                   pred_labels=pred_labels_list)

    # OR

    # Third Way
    mlops.metrics.cluster.contingency_matrix(labels_true, labels_pred)
    #################### DONE NEW WAY ####################

    #################################################################
    #################### End: Contingency Matrix ####################
    #################################################################

    ######################################################################
    #################### Start: Fowlkes Mallows Score ####################
    ######################################################################

    fowlkes_mallows_score = \
        sklearn.metrics.fowlkes_mallows_score(labels_true=labels_true,
                                              labels_pred=labels_pred,
                                              sparse=False)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Fowlkes Mallows Score", fowlkes_mallows_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.FOWLKES_MALLOWS_SCORE,
                   fowlkes_mallows_score)

    # OR

    # Third Way
    mlops.metrics.fowlkes_mallows_score(labels_true=labels_true,
                                        labels_pred=labels_pred,
                                        sparse=False)
    #################### DONE NEW WAY ####################

    ####################################################################
    #################### End: Fowlkes Mallows Score ####################
    ####################################################################

    #####################################################################################
    #################### Start: Homogeneity, Completeness, V Measure ####################
    #####################################################################################

    homogeneity, completeness, v_measure = sklearn.metrics \
        .homogeneity_completeness_v_measure(labels_true=labels_true, labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # multiline_object = MultiLineGraph() \
    #     .name("User Defined: Homogeneity - Completeness - V Measure") \
    #     .labels(["Homogeneity", "Completeness", "V Measure"])
    #
    # multiline_object.data([homogeneity, completeness, v_measure])
    #
    # mlops.set_stat(multiline_object)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.HOMOGENEITY_COMPLETENESS_V_MEASURE,
                   data=[homogeneity, completeness, v_measure])

    # OR

    # Third Way
    mlops.metrics \
        .homogeneity_completeness_v_measure(labels_true=labels_true, labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ###################################################################################
    #################### End: Homogeneity, Completeness, V Measure ####################
    ###################################################################################

    ##################################################################
    #################### Start: Homogeneity Score ####################
    ##################################################################

    homogeneity_score = sklearn.metrics \
        .homogeneity_score(labels_true=labels_true, labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Homogeneity Score", homogeneity_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.HOMOGENEITY_SCORE, homogeneity_score)

    # OR

    # Third Way
    mlops.metrics \
        .homogeneity_score(labels_true=labels_true, labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Homogeneity Score ####################
    ################################################################

    ##################################################################
    #################### Start: Mutual Info Score ####################
    ##################################################################

    mutual_info_score = sklearn.metrics \
        .mutual_info_score(labels_true=labels_true, labels_pred=labels_pred, contingency=None)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Mutual Info Score", mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.MUTUAL_INFO_SCORE, mutual_info_score)

    # OR

    # Third Way
    mlops.metrics \
        .mutual_info_score(labels_true=labels_true, labels_pred=labels_pred, contingency=None)
    #################### DONE NEW WAY ####################

    ################################################################
    #################### End: Mutual Info Score ####################
    ################################################################

    #############################################################################
    #################### Start: Normalized Mutual Info Score ####################
    #############################################################################

    normalized_mutual_info_score = sklearn.metrics \
        .normalized_mutual_info_score(labels_true=labels_true,
                                      labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Normalized Mutual Info Score", normalized_mutual_info_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.NORMALIZED_MUTUAL_INFO_SCORE,
                   normalized_mutual_info_score)

    # OR

    # Third Way
    mlops.metrics \
        .normalized_mutual_info_score(labels_true=labels_true,
                                      labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ###########################################################################
    #################### End: Normalized Mutual Info Score ####################
    ###########################################################################

    #################################################################
    #################### Start: Silhouette Score ####################
    #################################################################

    silhouette_score = sklearn.metrics \
        .silhouette_score(X=features, labels=labels_pred, metric="euclidean", sample_size=None, random_state=None)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: Silhouette Score", silhouette_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.SILHOUETTE_SCORE, silhouette_score)

    # OR

    # Third Way
    mlops.metrics \
        .silhouette_score(X=features, labels=labels_pred, metric="euclidean", sample_size=None, random_state=None)
    #################### DONE NEW WAY ####################

    ###############################################################
    #################### End: Silhouette Score ####################
    ###############################################################

    ################################################################
    #################### Start: V Measure Score ####################
    ################################################################

    v_measure_score = sklearn.metrics.v_measure_score(labels_true=labels_true,
                                                      labels_pred=labels_pred)
    #################### OLD WAY ####################
    # First Way
    # mlops.set_stat("User Defined: V Measure Score", v_measure_score)
    #################### DONE OLD WAY ####################

    #################### NEW WAY ####################
    # Second Way
    mlops.set_stat(ClusteringMetrics.V_MEASURE_SCORE, v_measure_score)

    # OR

    # Third Way
    mlops.metrics \
        .v_measure_score(labels_true=labels_true, labels_pred=labels_pred)
    #################### DONE NEW WAY ####################

    ##############################################################
    #################### End: V Measure Score ####################
    ##############################################################

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(kmeans_model, model_file)
    model_file.close()
    # Terminate MLOPs
    mlops.done()
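Example #29 ends by pickling the fitted KMeans model. On the inference side, a matching load-and-predict step would mirror the model-loading pattern used in Examples #27 and #28 (a sketch; `pm_options.output_model` is assumed to be the same path used when saving):

import pickle

# Load the KMeans model pickled by the training example and reuse it for prediction
with open(pm_options.output_model, 'rb') as model_file:
    kmeans_model = pickle.load(model_file)
labels_pred = kmeans_model.predict(features)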
def infer_loop(model, input, output_file, stats_interval, conf_tracker):

    output = open(output_file, "w")

    # Initialize statistics
    total_predictions = 0
    low_confidence_predictions = 0
    categories = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    prediction_hist = []
    for i in range(0, model.get_num_categories()):
        prediction_hist.append(0)

    ### MLOPS start
    # Create a bar graph and table for reporting prediction distributions and set the column names
    infer_bar = BarGraph().name("Prediction Distribution Bar Graph").cols(
        categories)
    infer_tbl = Table().name("Prediction Distribution Table").cols(categories)
    ### MLOPS end

    while True:
        try:
            sample, label = input.get_next_input()

            # Get the inference. This is an array of probabilities for each output value.
            inference = model.infer(sample)

            # The prediction is the class with the highest probability
            prediction = ny.argmax(inference)

            # The confidence for that prediction
            confidence = inference[prediction] * 100

            # Append the prediction to the output file
            output.write("{}\n".format(prediction))

            # Calculate statistics
            total_predictions += 1
            prediction_hist[prediction] += 1

            conf_tracker.check_confidence(confidence, sample)

            # Report statistics
            if total_predictions % stats_interval == 0:

                # Report the prediction distribution
                for i in range(0, model.get_num_categories()):
                    print("category: {} predictions: {}".format(
                        categories[i], prediction_hist[i]))

                ### MLOPS start

                # Update total prediction count with the all new predictions since we last reported
                mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT,
                               stats_interval)

                # Show the prediction distribution as a table
                infer_tbl.add_row(str(total_predictions), prediction_hist)

                # Show the prediction distribution as a bar graph
                infer_bar.data(prediction_hist)

                # Report the stats
                mlops.set_stat(infer_tbl)
                mlops.set_stat(infer_bar)

                ### MLOPS end

                conf_tracker.report_confidence(stats_interval)

        except EOFError:
            # stop when we hit end of input
            print("Reached end of input")
            output.close()

            break
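infer_loop depends on a `conf_tracker` object exposing `check_confidence(confidence, sample)` and `report_confidence(stats_interval)`, which is not defined in this listing. A minimal hypothetical implementation consistent with those calls (the class name, threshold, and stat name are assumptions, not the original code):

class ConfidenceTracker(object):
    """Hypothetical tracker matching the calls made in infer_loop."""

    def __init__(self, threshold=50.0):
        self._threshold = threshold          # confidence threshold, in percent
        self._low_confidence_count = 0

    def check_confidence(self, confidence, sample):
        # Count predictions whose confidence falls below the threshold
        if confidence < self._threshold:
            self._low_confidence_count += 1

    def report_confidence(self, stats_interval):
        # Report the number of low-confidence predictions since the last report
        mlops.set_stat("Low Confidence Predictions", self._low_confidence_count)
        self._low_confidence_count = 0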