def _report_avg_response_time_metrics(self):
    self._logger.debug("Reporting about workers average response time ...")
    tbl = Table().name(StatsConstants.AVG_RESP_TIME_TABLE_NAME).cols(
        [StatsConstants.AVG_RESP_TIME_COL_NAME])
    for col, rt in self._curr_stats_snapshot.avg_workers_response_time:
        tbl.add_row(col, rt)
    mlops.set_stat(tbl)
def export_classification_report(class_rep, algo):
    """
    This function provides the classification report as a table in the MCenter data scientist view
    :param class_rep: Classification report data
    :param algo: text for the algorithm type
    :return:
    """
    col_keys = []
    row_keys = []
    class_tlb = []
    add_col_keys = True
    for row_key in class_rep.keys():
        row_keys.append(str(row_key))
        class_tlb_row = []
        class_row = class_rep[row_key]
        for col_key in class_row.keys():
            if add_col_keys:
                col_keys.append(str(col_key))
            class_tlb_row.append(str(class_row[col_key]))
        add_col_keys = False
        class_tlb.append(class_tlb_row)
    tbl = Table().name("Classification Report " + str(algo)).cols(col_keys)
    for i in range(len(row_keys)):
        tbl.add_row(row_keys[i], class_tlb[i])
    mlops.set_stat(tbl)
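# Minimal usage sketch (not from the original sources): export_classification_report
# expects a dict shaped like sklearn's classification_report(output_dict=True), which is
# how the loan-training snippet further below produces it. Values here are illustrative.
class_rep_example = {
    "0": {"precision": 0.91, "recall": 0.88, "f1-score": 0.89, "support": 120},
    "1": {"precision": 0.64, "recall": 0.70, "f1-score": 0.67, "support": 40},
}
export_classification_report(class_rep_example, "XGBoost")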
def get_table_value_stat_object(name, list_2d, match_header_pattern=None):
    """
    Create a Table Value stat object from a list of lists, where the first element of the
    2d list is the header and, for each remaining list, the first index is the row's header.
    :param name: Name of stat
    :param list_2d: 2d representation of table to output
    :param match_header_pattern: If not None, then the header of the table should match the pattern provided
    :return: MLOps Table Value object, general stat category
    """
    category = StatCategory.GENERAL
    try:
        header = list(map(lambda x: str(x).strip(), list_2d[0]))
        if match_header_pattern is not None:
            assert header == match_header_pattern, \
                "headers {} do not match expected header pattern {}" \
                .format(header, match_header_pattern)
        len_of_header = len(header)
        table_object = Table().name(name).cols(header)
        for index in range(1, len(list_2d)):
            assert len(list_2d[index]) - 1 == len_of_header, \
                "length of row value does not match with headers length"
            row_title = str(list_2d[index][0]).strip()
            row_value = list(map(lambda x: str(x).strip(), list_2d[index][1:]))
            table_object.add_row(row_title, row_value)
        return table_object, category
    except Exception as e:
        raise MLOpsStatisticsException(
            "error happened while outputting table object from list_2d: {}. error: {}".format(list_2d, e))
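# Minimal usage sketch (not from the original sources): per the docstring above, the first
# inner list is the header row and every following list starts with its row title. Names
# and values are illustrative only.
list_2d_example = [
    ["Precision", "Recall"],
    ["class_0", 0.91, 0.88],
    ["class_1", 0.64, 0.70],
]
table_object, category = get_table_value_stat_object("Per Class Metrics", list_2d_example)
mlops.set_stat(table_object)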
def test_table():
    pm.init(ctx=None, mlops_mode=MLOpsMode.STAND_ALONE)
    with pytest.raises(MLOpsException):
        Table().name("mytable").cols(["a", "b", "c"]).add_row([1, 2, 3]).add_row([1, 2])

    with pytest.raises(MLOpsException):
        tbl = Table().name("mytable").cols(["a", "b"])
        pm.set_stat(tbl)

    tbl = Table().name("good-1").cols(["a", "b", "c"]).add_rows([[1, 2, 3], [1, 2, 3]])
    pm.set_stat(tbl)

    tbl = Table().name("good-2").cols(["a", "b", "c"])
    tbl.add_row("r1", [1, 2, 3])
    tbl.add_row("r2", [3, 4, 5])
    pm.set_stat(tbl)

    tbl = Table().name("good-3").cols(["a", "b", "c"])
    tbl.add_row([6, 7, 8])
    tbl.add_row([9, 0, 1])
    pm.set_stat(tbl)

    pm.done()
def job_secondary_transitions(rows):
    tbl = Table().name("SageMaker Job Transitions") \
        .cols(["Start Time", "End Time", "Time Span", "Status", "Description"])
    for row in rows:
        tbl.add_row(row)
    mlops.set_stat(tbl)
def main(): print("Starting example") mlops.init(run_in_non_pm_mode=True, mlops_mode=MLOpsMode.PYTHON) # Line graphs mlops.set_stat("myCounterDouble", 5.5) mlops.set_stat("myCounterDouble2", 7.3) # Multi-line graphs mlt = MultiLineGraph().name("Multi Line").labels(["l1", "l2"]).data([5, 16]) mlops.set_stat(mlt) tbl = Table().name("MyTable").cols(["Date", "Some number"]) tbl.add_row(["2001Q1", "55"]) tbl.add_row(["2001Q2", "66"]) tbl.add_row(["2003Q3", "33"]) tbl.add_row(["2003Q2", "22"]) mlops.set_stat(tbl) bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd", "ee"]).data([10, 15, 12, 9, 8]) mlops.set_stat(bar) mlops.done() print("Example done")
def job_host_metrics(job_name, metrics_data):
    tbl = Table().name("Job Host Metrics").cols(["Metric", "Value"])
    for metric_data in metrics_data:
        tbl.add_row([
            metric_data['Label'],
            metric_data['Values'][0] if metric_data['Values'] else 0
        ])
    mlops.set_stat(tbl)
def job_status(job_name, running_time_sec, billing_time_sec, status=""):
    Report._last_metric_values[job_name] = status
    tbl = Table().name("SageMaker Job Status").cols(
        ["Job Name", "Total Running Time", "Time for Billing", "Status"])
    tbl.add_row([
        job_name,
        Report.seconds_fmt(running_time_sec),
        Report.seconds_fmt(billing_time_sec),
        status
    ])
    mlops.set_stat(tbl)
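# Minimal usage sketch (not from the original sources): job_status appears to live on a
# Report helper class (it references Report._last_metric_values and Report.seconds_fmt).
# The job name, timings and status below are placeholder values.
Report.job_status("sagemaker-training-job-001",
                  running_time_sec=322,
                  billing_time_sec=300,
                  status="Completed")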
def _materialize(self, parent_data_objs, user_data):
    for param in parent_data_objs:
        parent_param = "parent param is: {param}".format(param=param)
        print(parent_param)
        self._logger.info(parent_param)
    tbl = Table().name("Table example").cols(["Worker", "Requests"])
    for index in range(0, 10):
        tbl.add_row(["kenshoo-worker-{}".format(index), index + 3])
    mlops.set_stat(tbl)
    return ["s3://Kenshoo/this is the logistic model path/model.pmml"]
def _report_acc_requests_and_status(self):
    self._logger.debug("Reporting about workers requests & status ...")
    tbl = Table().name(StatsConstants.ACC_REQS_TABLE_NAME).cols([
        StatsConstants.ACC_REQS_NUM_REQS_COL_NAME,
        StatsConstants.ACC_REQS_STATUS_COL_NAME
    ])
    for col, value, status in self._curr_stats_snapshot.sorted_worker_stats:
        tbl.add_row(col, [value, status])
    tbl.add_row(StatsConstants.ACC_REQS_LAST_ROW_NAME,
                [self._curr_stats_snapshot.total_requests, "---"])
    mlops.set_stat(tbl)
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT,
                   self._curr_stats_snapshot.total_requests_diff)
def export_confusion_table(confmat, algo):
    """
    This function provides the confusion matrix as a table in the MCenter data scientist view
    :param confmat: Confusion matrix
    :param algo: text for the algorithm type
    :return:
    """
    tbl = Table() \
        .name("Confusion Matrix for " + str(algo)) \
        .cols(["Predicted label: " + str(i) for i in range(0, confmat.shape[0])])
    for i in range(confmat.shape[1]):
        tbl.add_row("True Label: " + str(i),
                    [str(confmat[i, j]) for j in range(0, confmat.shape[0])])
    mlops.set_stat(tbl)
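# Minimal usage sketch (not from the original sources): building the confusion matrix
# with sklearn and exporting it, as the training snippets further below do. Labels are
# illustrative only.
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1])
export_confusion_table(confusion_matrix(y_true=y_true, y_pred=y_pred), "XGBoost")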
def _report_acc_requests_and_status(self):
    self._logger.debug("Reporting about workers requests & status ...")
    try:
        predict_reqs = self._curr_stats_snapshot.total_requests - \
            self._curr_stats_snapshot.uwsgi_pm_metric_by_name(PredefinedStats.PM_STAT_REQUESTS)
        mlops.set_stat("Number of Predict Requests", predict_reqs)
    except:
        self._logger.error("Failed to retrieve pm stat requests")
        predict_reqs = self._curr_stats_snapshot.total_requests

    tbl = Table().name(StatsConstants.ACC_REQS_TABLE_NAME).cols([
        StatsConstants.ACC_REQS_NUM_REQS_COL_NAME,
        StatsConstants.ACC_REQS_STATUS_COL_NAME
    ])
    for col, value, status in self._curr_stats_snapshot.sorted_worker_stats:
        tbl.add_row(col, [value, status])
    tbl.add_row(StatsConstants.ACC_REQS_LAST_ROW_NAME,
                [self._curr_stats_snapshot.total_requests, "---"])
    mlops.set_stat(tbl)
    mlops.set_stat(PredefinedStats.WORKER_STATS,
                   len(self._curr_stats_snapshot.worker_ids))
def infer_loop(model, input, output_file, stats_interval, conf_thresh, conf_percent):
    output = open(output_file, "w")

    # Initialize statistics
    total_predictions = 0
    low_confidence_predictions = 0
    categories = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    prediction_hist = []
    for i in range(0, len(categories)):
        prediction_hist.append(0)

    ### MLOPS start
    # Create a bar graph and table for reporting prediction distributions and set the column names
    infer_bar = BarGraph().name("Prediction Distribution Bar Graph").cols(categories)
    infer_tbl = Table().name("Prediction Distribution Table").cols(categories)
    ### MLOPS end

    while True:
        try:
            sample, label = input.get_next_input()
            sample_np = ny.array(sample).reshape(1, -1)

            # The prediction is the class with the highest probability
            prediction = model.predict(sample_np)

            # Append the prediction to the output file
            output.write("{}\n".format(prediction))

            # Calculate statistics
            total_predictions += 1
            prediction_hist[ny.int(prediction[0])] += 1

            # Report statistics
            if total_predictions % stats_interval == 0:
                # Report the prediction distribution
                for i in range(0, len(categories)):
                    print("category: {} predictions: {}".format(categories[i], prediction_hist[i]))

                ### MLOPS start
                # Show the prediction distribution as a table
                infer_tbl.add_row(str(total_predictions), prediction_hist)

                # Show the prediction distribution as a bar graph
                infer_bar.data(prediction_hist)
        except EOFError:
            # stop when we hit end of input
            # Report the stats
            mlops.set_stat(infer_tbl)
            mlops.set_stat(infer_bar)
            ### MLOPS end

            output.close()

            ### MLOPS start
            mlops.done()
            ### MLOPS end
            break
class CategoricalStatistics(InferenceStatistics):
    def __init__(self, print_interval, stats_type, num_categories, conf_thresh, hot_label=True):
        super(CategoricalStatistics, self).__init__(print_interval)
        self._num_categories = num_categories
        self._hot_label = hot_label
        self._stats_type = stats_type
        self._conf_thresh = conf_thresh / 100.0

        # These are useful for development, but should be replaced by mlops library functions
        self._label_hist = []
        self._infer_hist = []
        for i in range(0, self._num_categories):
            self._label_hist.append(0)
            self._infer_hist.append(0)

        if self._stats_type == "python":
            mlops.init(ctx=None, connect_mlops=True, mlops_mode=MLOpsMode.AGENT)
        elif self._stats_type == "file":
            mlops.init(ctx=None, connect_mlops=False, mlops_mode=MLOpsMode.STAND_ALONE)
        else:
            self._stats_type = "none"

        if self._stats_type != "none":
            self._infer_tbl = Table().name("inferences").cols(
                ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])

    def infer_stats(self, sample, label, inference):
        # for now, we only process 1 inference at a time
        inference = inference[0]
        prediction = ny.argmax(inference)
        confidence = inference[prediction]
        if confidence < self._conf_thresh:
            self.increment_low_conf()

        self._infer_hist[prediction] += 1
        if label is not None:
            if (self._hot_label):
                label = ny.argmax(label)
            self._label_hist[label] += 1

            if prediction == label:
                self.increment_correct()

        self.increment_total()
        if self.is_time_to_report():
            self.report_stats()

        return prediction

    def report_stats(self):
        if self.get_low_conf() > 0:
            mlops.health_alert(
                "Low confidence alert",
                "{}% of inferences had confidence below {}%".format(
                    self.get_low_conf() * 100.0 / self.get_total(),
                    self._conf_thresh * 100))

        for i in range(0, self._num_categories):
            print(i, "label_total =", self._label_hist[i], "infer_total = ", self._infer_hist[i])
        print("total = ", self.get_total(), "total_correct = ", self.get_correct())

        self._infer_tbl.add_row(str(self.get_total()), [
            self._infer_hist[0], self._infer_hist[1], self._infer_hist[2],
            self._infer_hist[3], self._infer_hist[4], self._infer_hist[5],
            self._infer_hist[6], self._infer_hist[7], self._infer_hist[8],
            self._infer_hist[9]
        ])

        if self._stats_type != "none":
            mlops.set_stat("correct_percent", self.get_correct() * 100.0 / self.get_total())
            mlops.set_stat(self._infer_tbl)

    def __del__(self):
        mlops.done()
        super(CategoricalStatistics, self).__del__()
def _prep_and_train(self, df_dataset):
    self.min_auc_requirement = self._params["auc_threshold"]
    self.max_ks_requirement = self._params["ks_threshold"]
    self.min_psi_requirement = self._params["psi_threshold"]
    train_on_col = self._params["train_on_column"]

    # mlops Init
    mlops.init()

    y = df_dataset[train_on_col]
    self._logger.info("train_on_col= {}".format(train_on_col))
    self._logger.info("df_dataset {}".format(df_dataset.shape[1]))
    X = df_dataset.drop(train_on_col, axis=1)
    mlops.set_data_distribution_stat(X)
    self._logger.info("df_dataset {}".format(X.shape[1]))

    # Splitting the data to train and test sets:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self._params["validation_split"], random_state=42)

    All_columns = X_train.columns.tolist()
    categorical_columns = self._params["categorical_cols"]
    mapper_list = []
    for d in All_columns:
        if d in categorical_columns:
            mapper_list.append(([d], OneHotEncoder(handle_unknown='ignore')))
        else:
            mapper_list.append(([d], MinMaxScaler()))
    mapper = DataFrameMapper(mapper_list)

    # ## Training
    # XGBoost Training:
    n_cpu = multiprocessing.cpu_count()
    xgboost_model = xgb.XGBClassifier(
        max_depth=int(self._params["max_depth"]),
        min_child_weight=int(self._params["min_child_weight"]),
        learning_rate=float(self._params["learning_rate"]),
        n_estimators=int(self._params["n_estimators"]),
        silent=True,
        objective=self._params["objective"],
        gamma=float(self._params["gamma"]),
        max_delta_step=int(self._params["max_delta_step"]),
        subsample=float(self._params["subsample"]),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(self._params["reg_alpha"]),
        reg_lambda=float(self._params["reg_lambda"]),
        scale_pos_weight=float(self._params["scale_pos_weight"]),
        seed=1,
        n_jobs=n_cpu,
        missing=None)

    final_model = Pipeline([("mapper", mapper), ("xgboost", xgboost_model)])
    final_model.fit(X_train, y_train)

    # Prediction and prediction distribution
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    # Accuracy calculation
    # Accuracy for the xgboost model
    accuracy = accuracy_score(y_test, pred_labels)
    self._logger.info("XGBoost Accuracy value: {0}".format(accuracy))
    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution:
    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    self._logger.info("Validation Actual Label distributions: \n {0}".format(label_distribution))
    # Output Label distribution as a BarGraph using MCenter
    export_bar_table(label_distribution[:, 0], label_distribution[:, 1],
                     "Validation - Actual Label Distribution")

    # Prediction distribution and prediction confidence distribution
    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    self._logger.info("XGBoost Validation Prediction Label Distributions: \n {0}".format(pred_label_distribution))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_label_distribution[:, 0], pred_label_distribution[:, 1],
                     "Validation - XGBoost Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    self._logger.info("XGBoost Validation Average Prediction confidence per label: \n {0}".format(average_confidence))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence,
                     "Validation - XGBoost Average confidence per class")

    # Confusion Matrix
    # XGBoost Confusion Matrix
    confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels)
    self._logger.info("Confusion Matrix for XGBoost: \n {0}".format(confmat))
    # Output Confusion Matrix as a Table using MCenter
    export_confusion_table(confmat, "XGBoost")

    # Classification Report
    # XGBoost Classification Report
    class_rep = classification_report(y_true=y_test, y_pred=pred_labels, output_dict=True)
    self._logger.info("XGBoost Classification Report: \n {0}".format(class_rep))

    # AUC and ROC Curves
    # ROC for XGBoost model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    self._logger.info("XGBoost ROC AUC value: {}".format(roc_auc))
    # Output ROC of the chosen model using MCenter
    mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES)
    if roc_auc <= self.min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(self.min_auc_requirement, roc_auc))

    # ROC curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name("Receiver Operating Characteristic ").set_continuous()
    cg.add_series(label='Random curve ' '', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})' ''.format(roc_auc),
                  x=fpr.tolist(), y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    # Feature importance comparison
    # XGBoost Feature importance
    export_feature_importance(final_model, list(X_train.columns), 5, "XGBoost")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(y_test == 0)[0]
    y_test1 = np.where(y_test == 1)[0]

    # KS for the XGBoost model
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    self._logger.info("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    # raising alert if ks-stat goes above required threshold
    if ks_stat >= self.max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(self.max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats for XGBoost").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    # Calculating PSI
    total_psi, psi_table = get_psi(self, max_pred_probs[y_test0], max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats for XGBoost").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
        "Base Percent", "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    self._logger.info("Total XGBoost PSI values: \n {}".format(total_psi))
    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES)
    if total_psi >= self.min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(self.min_psi_requirement, total_psi))

    # ## Save the XGBoost Model
    model_file = open(self._params["output-model"], 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # ## Finish the program
    mlops.done()

    return (model_file)
def main():
    parser = argparse.ArgumentParser()
    add_parameters(parser)
    args = parser.parse_args()

    if args.training_iteration <= 0:
        print('Please specify a positive value for training iteration.')
        sys.exit(-1)

    # Read the train and test data sets
    mnist = mnist_input_data.read_data_sets(args.input_cache_dir, one_hot=True)

    ## MLOps start
    # Initialize the mlops library
    mlops.init()

    # Report the feature distribution for the training data
    train_images = mnist.train.images
    mlops.set_data_distribution_stat(train_images)

    # Initialize a table to track training accuracy and cost
    train_table = Table().name("Training Stats").cols(["Accuracy", "Cost"])
    ## MLOps end

    # Create the model
    sess = tf.InteractiveSession()
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
    feature_configs = {
        'x': tf.FixedLenFeature(shape=[784], dtype=tf.float32),
    }
    tf_example = tf.parse_example(serialized_tf_example, feature_configs)
    x = tf.identity(tf_example['x'], name='x')  # use tf.identity() to assign name
    y_ = tf.placeholder('float', shape=[None, 10])
    w = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    sess.run(tf.global_variables_initializer())
    y = tf.nn.softmax(tf.matmul(x, w) + b, name='y')

    # Set the cost function and optimizer
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
    values, indices = tf.nn.top_k(y, 10)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(
        tf.constant([str(i) for i in range(10)]))
    prediction_classes = table.lookup(tf.to_int64(indices))
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # Train the model
    print('Training model...')
    for i in range(args.training_iteration):
        batch = mnist.train.next_batch(50)
        _, train_cost, train_acc = sess.run(
            [train_step, cross_entropy, accuracy],
            feed_dict={
                x: batch[0],
                y_: batch[1]
            })

        # Display stats
        if (i + 1) % args.display_step == 0 or i + 1 == args.training_iteration:
            # Report training accuracy and cost
            print("Training. step={}, accuracy={}, cost={}".format(i + 1, train_acc, train_cost))

            # MLOps start
            # multiply by 1 to convert into double
            train_table.add_row("Iterations: {}".format(i + 1), [train_acc * 100, train_cost * 1])
            mlops.set_stat(train_table)
            # MLOps end

    print('Done training!')

    # Report final cost and accuracy on test set
    test_cost, test_acc = sess.run(
        [cross_entropy, accuracy],
        feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })
    print("Testing. accuracy={}, cost={}".format(test_acc, test_cost))

    ## MLOps start
    acc_table = Table().name("Test Accuracy").cols(["Accuracy"])
    acc_table.add_row("Total iterations: {}".format(args.training_iteration), [test_acc])
    mlops.set_stat(acc_table)

    # Release mlops resources
    mlops.done()
    ## MLOps end

    # Export the trained model so it can be used for inference
    # WARNING(break-tutorial-inline-code): The following code snippet is
    # in-lined in tutorials, please update tutorial documents accordingly
    # whenever code changes.
    export_path = args.save_dir
    print('Exporting trained model to', export_path)
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)

    # Build the signature_def_map.
    classification_inputs = tf.saved_model.utils.build_tensor_info(serialized_tf_example)
    classification_outputs_classes = tf.saved_model.utils.build_tensor_info(prediction_classes)
    classification_outputs_scores = tf.saved_model.utils.build_tensor_info(values)
    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    classification_inputs
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    classification_outputs_classes,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                    classification_outputs_scores
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

    tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
    tensor_info_y = tf.saved_model.utils.build_tensor_info(y)
    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'inputs': tensor_info_x},
            outputs={'outputs': tensor_info_y},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images':
                prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save()
    print('Done exporting!')
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold))
    print("PM: # Input File: [{}]".format(pm_options.input_file))
    print("PM: # Model File: [{}]".format(pm_options.input_model))

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    # Loading the data
    loan_df = pd.read_csv(pm_options.input_file)
    X = loan_df

    # Cleaning NAs
    mlops.set_data_distribution_stat(loan_df)
    print("dataset_size = ", loan_df.shape[0])
    print("number of NAs per columns = \n", loan_df.isnull().sum())
    loan_df = loan_df.dropna()
    print("dataset_size without NA rows= ", loan_df.shape[0])

    # ## Inference
    pred_labels = final_model.predict(X)
    pred_probs = final_model.predict_proba(X)

    # Prediction distribution and prediction confidence distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    print("XGBoost Inference Prediction Label Distributions: \n {0}".format(pred_label_distribution))
    export_bar_table(pred_label_distribution[:, 0], pred_label_distribution[:, 1],
                     "Inference - XGBoost Prediction Distribution")

    # Pred confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0
    print("XGBoost Validation Average Prediction confidence per label: \n {0}".format(average_confidence))
    # Output Pred label distribution as a BarGraph using MCenter
    export_bar_table(pred_value, average_confidence, "Validation - XGBoost Average confidence per class")

    # Feature importance comparison
    export_feature_importance(final_model, list(X.columns), 5, "XGBoost")

    # KS Analysis
    max_pred_probs = pred_probs.max(axis=1)
    y_test0 = np.where(pred_labels == 0)[0]
    y_test1 = np.where(pred_labels == 1)[0]
    ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES)
    # raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # PSI Analysis
    total_psi, psi_table = get_psi(max_pred_probs[y_test0], max_pred_probs[y_test1])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
        "Base Percent", "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total XGBoost PSI values: \n {}".format(total_psi))
    print("XGBoost PSI Stats: \n {}".format(psi_table))
    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)
    if total_psi >= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(min_psi_requirement, total_psi))

    # ## Finish the program
    mlops.done()
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))
    input_data = input_data.cache()

    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    exclude_cols = []
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    vector_assembler = VectorAssembler(inputCols=input_col_names, outputCol="features")
    kmeans_pipe = KMeans(
        k=int(pm_options.K),
        initMode="k-means||",
        initSteps=2,
        tol=1e-4,
        maxIter=100,
        featuresCol="features")
    full_pipe = [vector_assembler, kmeans_pipe]
    model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

    # Test validation and statistics collection
    ############################################################
    predicted_df = model_kmeans.transform(input_test)

    print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

    sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
    print("Sum of Errors for Kmeans = " + str(sum_errors))

    # Shows the result.
    kmeans_centers = model_kmeans.stages[1].clusterCenters()
    print("Kmeans Centers: ")
    for center in kmeans_centers:
        print(center)

    # calculating stats
    ############################################################

    # Calculating Inter cluster distance
    inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            inter_cluster_distance[centerIndex1, centerIndex2] = \
                eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])
    print("inter_cluster_distance = ", inter_cluster_distance)

    # Calculating Intra cluster distances and the bars for the cluster distribution
    intra_cluster_distance = np.zeros(len(kmeans_centers))
    cluster_dist = np.zeros(len(kmeans_centers))
    for centerIndex1 in range(0, len(kmeans_centers)):
        filtered_df = predicted_df.filter(predicted_df["prediction"] == centerIndex1)
        cluster_dist[centerIndex1] = filtered_df.count()
        if cluster_dist[centerIndex1] == 0:
            intra_cluster_distance[centerIndex1] = 0
        else:
            filtered_df = \
                filtered_df.withColumn('distance',
                                       udf(eq_dist, FloatType())(col("features"),
                                                                 array([lit(v) for v in kmeans_centers[centerIndex1]])))
            intra_cluster_distance[centerIndex1] = \
                filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

    # calculating Davies-Bouldin Index
    ############################################################
    # R[i,j] = (S[i] + S[j])/M[i,j]
    # D[i] = max(R[i,j]) for i != j
    # DB = (1/K) * sum(D[i])
    r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
    for centerIndex1 in range(0, len(kmeans_centers)):
        for centerIndex2 in range(0, len(kmeans_centers)):
            r_index[centerIndex1, centerIndex2] = 0
            if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                r_index[centerIndex1, centerIndex2] = \
                    (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2]) \
                    / inter_cluster_distance[centerIndex1, centerIndex2]
    d_index = np.max(r_index, axis=0)
    db_index = np.sum(d_index, axis=0) / len(kmeans_centers)

    # pmml model generation
    ############################################################
    pmml_file = toPMMLBytes(spark, input_train, model_kmeans).decode("UTF-8")

    # PM stats
    ############################################################
    print("Sum of Errors for Kmeans = " + str(sum_errors))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index))
    pm.set_stat("Davies-Bouldin index", db_index, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(kmeans_centers)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row(str(j) + ":", ["%.2f" % x for x in inter_cluster_distance[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance])
    pm.set_stat(tbl)

    tbl_col_name1 = []
    for j in range(0, len(kmeans_centers[0])):
        tbl_col_name1.append(str(j))
    tbl = Table().name("Centers (for K<6, Attr<11)").cols(tbl_col_name1)
    for j in range(0, len(kmeans_centers)):
        tbl.add_row("center" + str(j) + ":", ["%.2f" % x for x in kmeans_centers[j]])
    pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Distribution").cols(tbl_col_name).data(cluster_dist.tolist())
    pm.stat(bar)

    print("PM: generating histogram from data-frame and model")
    print("PM:" + pmml_file)
    try:
        pm.set_data_distribution_stat(data=input_train, model=pmml_file)
        print("PM: done generating histogram")
    except Exception as e:
        print("PM: failed to generate histogram using pm.stat")
        print(e)

    return pmml_file
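# eq_dist is used above both directly and as a Spark UDF but is not shown. A minimal
# sketch, assuming it is the Euclidean distance between two equal-length vectors:
import numpy as np

def eq_dist(x1, x2):
    return float(np.sqrt(np.sum((np.array(x1) - np.array(x2)) ** 2)))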
def kmeans_train(pm_options, spark):
    """
    Kmeans Training function
    :param pm_options:
    :param spark:
    :return:
    """

    # Import Data
    ##################################
    input_data = (spark.read.format("csv")
                  .option("header", pm_options.with_headers)
                  .option("ignoreLeadingWhiteSpace", "true")
                  .option("ignoreTrailingWhiteSpace", "true")
                  .option("inferschema", "true")
                  .load(pm_options.data_file)).repartition(10)

    # If Data doesn't have headers Create column names c0-cn
    column_names_all = input_data.columns
    if not pm_options.with_headers == "true":
        for col_index in range(0, len(column_names_all)):
            input_data = input_data.withColumnRenamed(column_names_all[col_index],
                                                      'c' + str(col_index))

    input_data = input_data.cache()

    # Set both train and test data to the entire dataset
    input_train = input_data
    input_test = input_data

    # SparkML pipeline
    ##################################
    # Create column names for vector assembler. Handle exclude columns for vector assembler
    exclude_cols = []  # No columns to exclude - kmeans of all columns
    column_names = input_train.columns
    input_col_names = []
    for elmts in column_names:
        ind = True
        for excludes in exclude_cols:
            if elmts == excludes:
                ind = False
        if ind:
            input_col_names.append(elmts)
    print(input_col_names)

    # Set hyper parameters search parameters
    k_range = pm_options.KRange.split(',')
    db_index_max = np.finfo(np.float64).max
    k_max = k_range[0]
    db_index_array = np.zeros(len(k_range))

    for index_hs in range(0, len(k_range)):
        vector_assembler = VectorAssembler(inputCols=input_col_names, outputCol="features")
        kmeans_pipe = KMeans(
            k=int(k_range[index_hs]),
            initMode="k-means||",
            initSteps=5,
            tol=1e-4,
            maxIter=100,
            featuresCol="features")
        full_pipe = [vector_assembler, kmeans_pipe]
        model_kmeans = Pipeline(stages=full_pipe).fit(input_train)

        # Test validation and statistics collection
        ############################################################
        predicted_df = model_kmeans.transform(input_test)

        print("model_kmeans.stages(1) = ", model_kmeans.stages[1])

        sum_errors = model_kmeans.stages[1].computeCost(predicted_df)
        print("Sum of Errors for Kmeans = " + str(sum_errors))

        kmeans_centers = model_kmeans.stages[1].clusterCenters()
        print("Kmeans Centers: ")
        for center in kmeans_centers:
            print(center)

        # calculating stats
        ############################################################

        # Calculating Inter cluster distance
        inter_cluster_distance = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                inter_cluster_distance[centerIndex1, centerIndex2] = \
                    eq_dist(kmeans_centers[centerIndex1], kmeans_centers[centerIndex2])
        print("inter_cluster_distance = ", inter_cluster_distance)

        # Calculating Intra cluster distances and the bars for the cluster distribution
        intra_cluster_distance = np.zeros(len(kmeans_centers))
        cluster_dist = np.zeros(len(kmeans_centers))
        for centerIndex1 in range(0, len(kmeans_centers)):
            filtered_df = predicted_df.filter(predicted_df["prediction"] == centerIndex1)
            cluster_dist[centerIndex1] = filtered_df.count()
            if cluster_dist[centerIndex1] == 0:
                intra_cluster_distance[centerIndex1] = 0
            else:
                filtered_df = \
                    filtered_df.withColumn('distance',
                                           udf(eq_dist, FloatType())(col("features"),
                                                                     array([lit(v) for v in kmeans_centers[centerIndex1]])))
                intra_cluster_distance[centerIndex1] = \
                    filtered_df.agg(sum("distance")).first()[0] / cluster_dist[centerIndex1]

        # calculating Davies-Bouldin Index
        ############################################################
        # R[i,j] = (S[i] + S[j])/M[i,j]
        # D[i] = max(R[i,j]) for i != j
        # DB = (1/K) * sum(D[i])
        r_index = np.zeros((len(kmeans_centers), len(kmeans_centers)))
        for centerIndex1 in range(0, len(kmeans_centers)):
            for centerIndex2 in range(0, len(kmeans_centers)):
                r_index[centerIndex1, centerIndex2] = 0
                if not inter_cluster_distance[centerIndex1, centerIndex2] == 0:
                    r_index[centerIndex1, centerIndex2] = \
                        (intra_cluster_distance[centerIndex1] + intra_cluster_distance[centerIndex2]) \
                        / inter_cluster_distance[centerIndex1, centerIndex2]
        d_index = np.max(r_index, axis=0)
        db_index = np.sum(d_index, axis=0) / len(kmeans_centers)
        db_index_array[index_hs] = db_index

        # Check Hyper Parameter Search max
        if (db_index < db_index_max):
            db_index_max = db_index
            k_max = k_range[index_hs]
            model_kmeans_max = model_kmeans
            sum_errors_max = sum_errors
            kmeans_centers_max = kmeans_centers
            inter_cluster_distance_max = inter_cluster_distance
            intra_cluster_distance_max = intra_cluster_distance
            cluster_dist_max = cluster_dist

    # PM stats
    ############################################################
    print("Optimal K = " + str(k_max))
    pm.set_stat("Optimal number of clusters", k_max, st.TIME_SERIES)

    print("Sum of Errors for Kmeans = " + str(sum_errors_max))
    pm.set_stat("Sum of Errors for Kmeans", sum_errors_max, st.TIME_SERIES)

    print("Davies-Bouldin index = " + str(db_index_max))
    pm.set_stat("Davies-Bouldin index", db_index_max, st.TIME_SERIES)

    # Tables
    tbl_col_name = []
    for j in range(0, len(k_range)):
        tbl_col_name.append(str(k_range[j]))
    tbl = Table().name("Davies-Bouldin index for hyper parameter Search").cols(tbl_col_name)
    tbl.add_row("Davies-Bouldin index:", ["%.2f" % x for x in db_index_array])
    pm.set_stat(tbl)

    tbl_col_name = []
    for j in range(0, len(kmeans_centers_max)):
        tbl_col_name.append(str(j))
    tbl = Table().name("Inter cluster distance").cols(tbl_col_name)
    for j in range(0, len(kmeans_centers_max)):
        tbl.add_row(str(j) + ":", ["%.2f" % x for x in inter_cluster_distance_max[j, :]])
    pm.set_stat(tbl)

    tbl = Table().name("Intra cluster avg. distance").cols(tbl_col_name)
    tbl.add_row("Distances:", ["%.2f" % x for x in intra_cluster_distance_max])
    pm.set_stat(tbl)

    if (len(kmeans_centers_max) < 6) & (len(kmeans_centers_max[0]) < 12):
        tbl_col_name1 = []
        for j in range(0, len(kmeans_centers_max[0])):
            tbl_col_name1.append(str(j))
        tbl = Table().name("Centers (for K<6, Attr<12)").cols(tbl_col_name1)
        for j in range(0, len(kmeans_centers_max)):
            tbl.add_row("center" + str(j) + ":", ["%.2f" % x for x in kmeans_centers_max[j]])
        pm.set_stat(tbl)

    # BarGraph
    bar = BarGraph().name("Cluster Distribution").cols(tbl_col_name).data(cluster_dist_max.tolist())
    pm.set_stat(bar)

    return model_kmeans_max
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample: [{}]".format(pm_options.num_samples))
    print("PM: # Features: [{}]".format(pm_options.num_features))
    print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold))
    print("PM: # Input File: [{}]".format(pm_options.input_file))
    print("PM: # Model File: [{}]".format(pm_options.input_model))

    # Initialize MLOps Library
    mlops.init()

    # Load the model
    if pm_options.input_model is not None:
        try:
            filename = pm_options.input_model
            model_file_obj = open(filename, 'rb')
            mlops.set_stat("# Model Files Used", 1)
        except Exception as e:
            print("Model Not Found")
            print("Got Exception: {}".format(e))
            mlops.set_stat("# Model Files Used", 0)
            mlops.done()
            return 0

    final_model = pickle.load(model_file_obj)

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data  # select columns 1 through end
    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            # binary classification only!
            n_classes=2,
            random_state=42)

        # Add random noise to the data randomly
        import random
        if random.randint(1, 21) / 2 == 0:
            print("Adding Random Noise!")
            noisy_features = np.random.uniform(0, 1) * \
                np.random.normal(0, 1, (num_samples, num_features))
            X = X + noisy_features

    # Separate into features and labels
    features = X

    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data and compare it automatically with the ones
    mlops.set_data_distribution_stat(features)

    # Output the number of samples being processed using MCenter
    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(features), st.TIME_SERIES)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(features)
    pred_probs = final_model.predict_proba(features)

    print("Pred Labels: ", pred_labels)  # Remove printout can be huge
    print("Pred Probabilities: ", pred_probs)  # Remove printout can be huge

    # Pred Label distribution
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Pred Label distributions: \n {0}".format(pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Pred Label Distribution").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
        (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # Pred Label confidence per label
    label_number = len(pred_counts)
    average_confidence = np.zeros(label_number)
    max_pred_probs = pred_probs.max(axis=1)
    for i in range(0, label_number):
        index_class = np.where(pred_labels == i)[0]
        print(" np.sum(confidence[index_class])", np.sum(max_pred_probs[index_class]))
        print("counts_elements[i] ", pred_counts[i])
        if pred_counts[i] > 0:
            average_confidence[i] = np.sum(max_pred_probs[index_class]) / (float(pred_counts[i]))
        else:
            average_confidence[i] = 0

    # BarGraph showing confidence per class
    pred_values1 = [str(i) for i in pred_value]
    bar = BarGraph().name("Average Confidence Per Class").cols(pred_values1).data(average_confidence.tolist())
    mlops.set_stat(bar)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[pred_labels == 1], max_pred_probs[pred_labels == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))

    # Output KS Stat of the chosen model using MCenter
    if not np.isnan(ks_stat):
        print("printing KS_stat ")
        mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)
    else:
        print("not printing KS_stat ")

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Inference] KS Violation From Inference Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[pred_labels == 1], max_pred_probs[pred_labels == 0])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
        "Base Percent", "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total PSI values: \n {}".format(total_psi))

    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Inference] PSI Violation From Inference Node",
            "PSI Went Below {}. Current PSI Is {}".format(min_psi_requirement, total_psi))

    # Terminate MLOPs
    mlops.done()
def main():
    pm_options = parse_args()

    print("PM: Configuration:")
    print("PM: # Sample: [{}]".format(pm_options.num_samples))
    print("PM: # Features: [{}]".format(pm_options.num_features))
    print("PM: # Validation Split: [{}]".format(pm_options.validation_split))
    print("PM: # AUC Threshold: [{}]".format(pm_options.auc_threshold))
    print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold))
    print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold))
    print("PM: # Estimators: [{}]".format(pm_options.n_estimators))
    print("PM: # Max Depth: [{}]".format(pm_options.max_depth))
    print("PM: # Learning Rate: [{}]".format(pm_options.learning_rate))
    print("PM: # Min Child Weight: [{}]".format(pm_options.min_child_weight))
    print("PM: # Objective: [{}]".format(pm_options.objective))
    print("PM: # Gamma: [{}]".format(pm_options.gamma))
    print("PM: # Max Delta Step: [{}]".format(pm_options.max_delta_step))
    print("PM: # Subsample: [{}]".format(pm_options.subsample))
    print("PM: # Reg Alpha: [{}]".format(pm_options.reg_alpha))
    print("PM: # Reg Lambda: [{}]".format(pm_options.reg_lambda))
    print("PM: # Scale Pos Weight: [{}]".format(pm_options.scale_pos_weight))
    print("PM: # Input File: [{}]".format(pm_options.input_file))
    print("PM: Output model: [{}]".format(pm_options.output_model))

    min_auc_requirement = float(pm_options.auc_threshold)
    max_ks_requirement = float(pm_options.ks_threshold)
    min_psi_requirement = float(pm_options.psi_threshold)

    # Initialize MLOps Library
    mlops.init()

    try:
        data_filename = pm_options.input_file
        data_file_obj = open(data_filename, 'rb')
        data = np.loadtxt(data_file_obj)

        X = data[:, 1:]  # select columns 1 through end
        y = data[:, 0]
    except Exception as e:
        print("Generating Synthetic Data Because {}".format(e))

        # Create synthetic data (Gaussian Distribution, Poisson Distribution and Beta Distribution)
        num_samples = int(pm_options.num_samples)
        num_features = int(pm_options.num_features)

        # Create synthetic data using scikit learn
        X, y = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            # binary classification only!
            n_classes=2,
            random_state=42)

        print("Adding Random Noise!")

        noisy_features = np.random.uniform(0, 1) * \
            np.random.normal(0, 1, (num_samples, num_features))
        X = X + noisy_features

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(pm_options.validation_split), random_state=42)

    import xgboost as xgb

    # Create a model that should be deployed into production
    final_model = xgb.XGBClassifier(
        max_depth=int(pm_options.max_depth),
        min_child_weight=int(pm_options.min_child_weight),
        learning_rate=float(pm_options.learning_rate),
        n_estimators=int(pm_options.n_estimators),
        silent=True,
        objective=str(pm_options.objective),
        gamma=float(pm_options.gamma),
        max_delta_step=int(pm_options.max_delta_step),
        subsample=float(pm_options.subsample),
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=float(pm_options.reg_alpha),
        reg_lambda=float(pm_options.reg_lambda),
        scale_pos_weight=float(pm_options.scale_pos_weight),
        seed=1,
        missing=None)

    final_model.fit(X_train, y_train)

    # Output Health Statistics to MCenter
    # MLOps API to report the distribution statistics of each feature in the data
    mlops.set_data_distribution_stat(X_train)

    # Accuracy for the chosen model
    pred_labels = final_model.predict(X_test)
    pred_probs = final_model.predict_proba(X_test)

    print("Pred Labels: ", pred_labels)
    print("Pred Probabilities: ", pred_probs)

    accuracy = accuracy_score(y_test, pred_labels)
    print("Accuracy values: \n {0}".format(accuracy))
    # Output accuracy of the chosen model using MCenter
    mlops.set_stat("Accuracy", accuracy, st.TIME_SERIES)

    # Label distribution in training
    value, counts = np.unique(y_test, return_counts=True)
    label_distribution = np.asarray((value, counts)).T
    # column_names = value.astype(str).tolist()
    print("Validation Actual Label distributions: \n {0}".format(label_distribution))

    # Output Label distribution as a BarGraph using MCenter
    bar = BarGraph().name("Validation Actual Label Distribution").cols(
        (label_distribution[:, 0]).astype(str).tolist()).data(
        (label_distribution[:, 1]).tolist())
    mlops.set_stat(bar)

    # Pred Label distribution in training
    pred_value, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_label_distribution = np.asarray((pred_value, pred_counts)).T
    # pred_column_names = pred_value.astype(str).tolist()
    print("Validation Prediction Label Distributions: \n {0}".format(pred_label_distribution))

    # Output Pred label distribution as a BarGraph using MCenter
    pred_bar = BarGraph().name("Validation Prediction Label Distributions").cols(
        (pred_label_distribution[:, 0]).astype(str).tolist()).data(
        (pred_label_distribution[:, 1]).tolist())
    mlops.set_stat(pred_bar)

    # ROC for the chosen model
    roc_auc = roc_auc_score(y_test, pred_probs[:, 1])
    print("ROC AUC values: \n {}".format(roc_auc))
    # Output ROC of the chosen model using MCenter
    mlops.set_stat("ROC AUC", roc_auc, st.TIME_SERIES)

    if roc_auc <= min_auc_requirement:
        mlops.health_alert(
            "[Training] AUC Violation From Training Node",
            "AUC Went Below {}. Current AUC Is {}".format(min_auc_requirement, roc_auc))

    # ROC Curve
    fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1])
    cg = MultiGraph().name("Receiver Operating Characteristic ").set_continuous()
    cg.add_series(label='Random Curve ' '', x=fpr.tolist(), y=fpr.tolist())
    cg.add_series(label='ROC Curve (Area = {0:0.2f})' ''.format(roc_auc),
                  x=fpr.tolist(), y=tpr.tolist())
    cg.x_title('False Positive Rate')
    cg.y_title('True Positive Rate')
    mlops.set_stat(cg)

    max_pred_probs = pred_probs.max(axis=1)

    # KS for the chosen model
    ks = ks_2samp(max_pred_probs[y_test == 1], max_pred_probs[y_test == 0])
    ks_stat = ks.statistic
    ks_pvalue = ks.pvalue
    print("KS values: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue))
    # Output KS Stat of the chosen model using MCenter
    mlops.set_stat("KS Stat", ks_stat, st.TIME_SERIES)

    # Raising alert if ks-stat goes above required threshold
    if ks_stat >= max_ks_requirement:
        mlops.health_alert(
            "[Training] KS Violation From Training Node",
            "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat))

    ks_table = Table().name("KS Stats").cols(["Statistic", "pValue"])
    ks_table.add_row([ks_stat, ks_pvalue])
    mlops.set_stat(ks_table)

    # Calculating PSI
    total_psi, psi_table = get_psi(max_pred_probs[y_test == 1], max_pred_probs[y_test == 0])
    psi_table_stat = Table().name("PSI Stats").cols([
        "Base Pop", "Curr Pop", "Lower Bound", "Upper Bound",
        "Base Percent", "Curr Percent", "Segment PSI"
    ])
    row_num = 1
    for each_value in psi_table.values:
        str_values = [str(i) for i in each_value]
        psi_table_stat.add_row(str(row_num), str_values)
        row_num += 1
    mlops.set_stat(psi_table_stat)
    print("Total PSI values: \n {}".format(total_psi))
    # Output Total PSI of the chosen model using MCenter
    mlops.set_stat("Total PSI ", total_psi, st.TIME_SERIES)

    # Raising alert if total_psi goes below required threshold
    if total_psi <= min_psi_requirement:
        mlops.health_alert(
            "[Training] PSI Violation From Training Node",
            "PSI Went Below {}. Current PSI Is {}".format(min_psi_requirement, total_psi))

    # Save the model
    import pickle
    model_file = open(pm_options.output_model, 'wb')
    pickle.dump(final_model, model_file)
    model_file.close()

    # Terminate MLOPs
    mlops.done()
class CategoricalStatistics(InferenceStatistics):
    def __init__(self, print_interval, stats_type, num_categories, conf_thresh,
                 conf_percent, hot_label=True):
        super(CategoricalStatistics, self).__init__(print_interval)
        self._num_categories = num_categories
        self._hot_label = hot_label
        self._stats_type = stats_type
        self._conf_thresh = conf_thresh / 100.0
        self._conf_percent = conf_percent

        # These are useful for development, but should be replaced by mlops library functions
        self._label_hist = []
        self._infer_hist = []
        for i in range(0, self._num_categories):
            self._label_hist.append(0)
            self._infer_hist.append(0)

        if self._stats_type == "python":
            mlops.init(ctx=None, connect_mlops=True, mlops_mode=MLOpsMode.AGENT)
        elif self._stats_type == "file":
            mlops.init(ctx=None, connect_mlops=False, mlops_mode=MLOpsMode.STAND_ALONE)
        else:
            self._stats_type = "none"

        if self._stats_type != "none":
            column_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
            self._infer_tbl = Table().name("categories").cols(column_names)
            self._infer_bar = BarGraph().name("categories bar").cols(column_names)

    def infer_stats(self, sample, label, inference):
        # for now, we only process 1 inference at a time
        inference = inference[0]
        prediction = ny.argmax(inference)
        confidence = inference[prediction]
        if confidence < self._conf_thresh:
            self.increment_low_conf()

        self._infer_hist[prediction] += 1
        if label is not None:
            if (self._hot_label):
                label = ny.argmax(label)
            self._label_hist[label] += 1

            if prediction == label:
                self.increment_correct()

        self.increment_total()
        if self.is_time_to_report():
            self.report_stats()

        return prediction

    def report_stats(self):
        # what percentage of the predictions had confidences less than the threshold
        low_conf_percent = self.get_low_conf() * 100.0 / self.get_report_interval()

        if low_conf_percent > self._conf_percent:
            mlops.health_alert(
                "Low confidence alert",
                "{}% of inferences had confidence below {}%".format(
                    low_conf_percent, self._conf_thresh * 100))

        for i in range(0, self._num_categories):
            print(i, "label_total =", self._label_hist[i], "infer_total = ", self._infer_hist[i])
        print("total = ", self.get_total(), "total_correct = ", self.get_correct())

        category_data = [
            self._infer_hist[0], self._infer_hist[1], self._infer_hist[2],
            self._infer_hist[3], self._infer_hist[4], self._infer_hist[5],
            self._infer_hist[6], self._infer_hist[7], self._infer_hist[8],
            self._infer_hist[9]
        ]

        self._infer_tbl.add_row(str(self.get_cum_total()), category_data)
        self._infer_bar.data(category_data)

        if self._stats_type != "none":
            mlops.set_stat("correct_percent", self.get_correct() * 100.0 / self.get_total())
            mlops.set_stat(self._infer_tbl)
            mlops.set_stat(self._infer_bar)
            # Update total prediction count with all new predictions since we last reported.
            mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, self.get_report_interval())
            print("Completed {} predictions".format(self.get_report_interval()))
        self.reset()

    def __del__(self):
        mlops.done()
        super(CategoricalStatistics, self).__del__()
def main(): pm_options = parse_args() print("PM: Configuration:") print("PM: # Validation Split: [{}]".format(pm_options.validation_split)) print("PM: # AUC Threshold: [{}]".format(pm_options.auc_threshold)) print("PM: # KS Threshold: [{}]".format(pm_options.ks_threshold)) print("PM: # PSI Threshold: [{}]".format(pm_options.psi_threshold)) print("PM: # Estimators: [{}]".format(pm_options.n_estimators)) print("PM: # Max Depth: [{}]".format(pm_options.max_depth)) print("PM: # Learning Rate: [{}]".format(pm_options.learning_rate)) print("PM: # Min Child Weight: [{}]".format(pm_options.min_child_weight)) print("PM: # Objective: [{}]".format(pm_options.objective)) print("PM: # Gamma: [{}]".format(pm_options.gamma)) print("PM: # Max Delta Step: [{}]".format(pm_options.max_delta_step)) print("PM: # Subsample: [{}]".format(pm_options.subsample)) print("PM: # Reg Alpha: [{}]".format(pm_options.reg_alpha)) print("PM: # Reg Lambda: [{}]".format(pm_options.reg_lambda)) print("PM: # Scale Pos Weight: [{}]".format(pm_options.scale_pos_weight)) print("PM: # Input File: [{}]".format(pm_options.input_file)) print("PM: Output model: [{}]".format(pm_options.output_model)) min_auc_requirement = float(pm_options.auc_threshold) max_ks_requirement = float(pm_options.ks_threshold) min_psi_requirement = float(pm_options.psi_threshold) # mlops Init mlops.init() # Loading and cleaning the data # This section goes though the various stages of loading and cleaning the data: loan_df = pd.read_csv(pm_options.input_file) # Cleaning NAs print("dataset_size = ", loan_df.shape[0]) mlops.set_data_distribution_stat(loan_df) print("number of NAs per columns = ", loan_df.isnull().sum()) loan_df = loan_df.dropna() print("dataset_size without NA rows= ", loan_df.shape[0]) # Marking the label field. 
remove it from the features set: y = loan_df["bad_loan"] X = loan_df.drop("bad_loan", axis=1) from sklearn_pandas import DataFrameMapper # Splitting the data to train and test sets: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=float(pm_options.validation_split), random_state=42) All_columns = X_train.columns.tolist() categorical_columns = ["verification_status", "addr_state", "purpose", "home_ownership", "term"] mapper_list =[] for d in All_columns: if d in categorical_columns: mapper_list.append(([d], OneHotEncoder(handle_unknown='ignore'))) else: mapper_list.append(([d], MinMaxScaler())) mapper = DataFrameMapper(mapper_list) # ## Training # XGBoost Training: import xgboost as xgb xgboost_model = xgb.XGBClassifier(max_depth=int(pm_options.max_depth), min_child_weight=int(pm_options.min_child_weight), learning_rate=float(pm_options.learning_rate), n_estimators=int(pm_options.n_estimators), silent=True, objective=pm_options.objective, gamma=float(pm_options.gamma), max_delta_step=int(pm_options.max_delta_step), subsample=float(pm_options.subsample), colsample_bytree=1, colsample_bylevel=1, reg_alpha=float(pm_options.reg_alpha), reg_lambda=float(pm_options.reg_lambda), scale_pos_weight=float(pm_options.scale_pos_weight), seed=1, n_jobs=1, missing=None) final_model = Pipeline([("mapper", mapper), ("xgboost", xgboost_model)]) final_model.fit(X_train, y_train) # Random Forest Training from sklearn.ensemble import RandomForestClassifier rf_only_model = RandomForestClassifier(n_estimators=int(pm_options.n_estimators), max_depth=int(pm_options.max_depth)+3, random_state=42, n_jobs=1, class_weight="balanced") rf_model = Pipeline([("mapper", mapper), ("rf", rf_only_model)]) rf_model.fit(X_train, y_train) # ## Statistics on Test Dataset # Prediction and prediction distribution pred_labels = final_model.predict(X_test) pred_probs = final_model.predict_proba(X_test) rf_pred_labels = rf_model.predict(X_test) rf_pred_probs = rf_model.predict_proba(X_test) # Accuracy calculation # Accuracy for the xgboost model accuracy = accuracy_score(y_test, pred_labels) print("XGBoost Accuracy value: {0}".format(accuracy)) # Output accuracy of the chosen model using MCenter mlops.set_stat("XGBoost Accuracy", accuracy, st.TIME_SERIES) # Accuracy for the RF model rf_accuracy = accuracy_score(y_test, rf_pred_labels) print("RF Accuracy value: {0}".format(rf_accuracy)) # Output accuracy of the chosen model using MCenter mlops.set_stat("RF Accuracy", rf_accuracy, st.TIME_SERIES) # Label distribution: # Label distribution in training value, counts = np.unique(y_test, return_counts=True) label_distribution = np.asarray((value, counts)).T print("Validation Actual Label distributions: \n {0}".format(label_distribution)) # Output Label distribution as a BarGraph using MCenter export_bar_table(label_distribution[:,0], label_distribution[:,1], "Validation - Actual Label Distribution") # Prediction distribution and prediction confidence distribution # Pred Label distribution in training pred_value, pred_counts = np.unique(pred_labels, return_counts=True) pred_label_distribution = np.asarray((pred_value, pred_counts)).T print("XGBoost Validation Prediction Label Distributions: \n {0}".format(pred_label_distribution)) # Output Pred label distribution as a BarGraph using MCenter export_bar_table(pred_label_distribution[:,0], pred_label_distribution[:,1], "Validation - XGBoost Prediction Distribution") rf_pred_value, rf_pred_counts = np.unique(rf_pred_labels, return_counts=True) rf_pred_label_distribution = 
np.asarray((rf_pred_value, rf_pred_counts)).T # pred_column_names = pred_value.astype(str).tolist() print("RF Validation Prediction Label Distributions: \n {0}".format(rf_pred_label_distribution)) # Output Pred label distribution as a BarGraph using MCenter export_bar_table(rf_pred_label_distribution[:,0], rf_pred_label_distribution[:,1], "Validation - RF Prediction Distribution") # Pred confidence per label label_number = len(pred_counts) average_confidence = np.zeros(label_number) max_pred_probs = pred_probs.max(axis=1) for i in range(0, label_number): index_class = np.where(pred_labels == i)[0] if pred_counts[i] > 0: average_confidence[i] = np.sum(max_pred_probs[index_class])/(float(pred_counts[i])) else: average_confidence[i] = 0 print("XGBoost Validation Average Prediction confidence per label: \n {0}".format(average_confidence)) # Pred confidence per label rf_label_number = len(rf_pred_counts) rf_average_confidence = np.zeros(rf_label_number) rf_max_pred_probs = rf_pred_probs.max(axis=1) for i in range(0, rf_label_number): rf_index_class = np.where(rf_pred_labels == i)[0] if rf_pred_counts[i] > 0: rf_average_confidence[i] = np.sum(rf_max_pred_probs[rf_index_class])/(float(rf_pred_counts[i])) else: rf_average_confidence[i] = 0 print("RF Validation Average Prediction confidence per label: \n {0}".format(rf_average_confidence)) # Output Pred label distribution as a BarGraph using MCenter export_bar_table(pred_value, average_confidence, "Validation - XGBoost Average confidence per class") export_bar_table(rf_pred_value, rf_average_confidence, "Validation - RF Average confidence per class") # Confusion Matrix # XGBoost Confusion Matrix confmat = confusion_matrix(y_true=y_test, y_pred=pred_labels) print("Confusion Matrix for XGBoost: \n {0}".format(confmat)) # Output Confusion Matrix as a Table using MCenter export_confusion_table(confmat, "XGBoost") # RF Confusion Matrix rf_confmat = confusion_matrix(y_true=y_test, y_pred=rf_pred_labels) print("Confusion Matrix for RF: \n {0}".format(rf_confmat)) # Output Confusion Matrix as a Table using MCenter export_confusion_table(rf_confmat, "RF") # Classification Report # XGBoost Classification Report class_rep = classification_report(y_true=y_test, y_pred=pred_labels, output_dict=True) print("XGBoost Classification Report: \n {0}".format(class_rep)) # RF Classification Report rf_class_rep = classification_report(y_true=y_test, y_pred=rf_pred_labels, output_dict=True) print("RF Classification Report: \n {0}".format(rf_class_rep)) # Output Classification Report as a Table using MCenter export_classification_report(class_rep, "XGBoost") export_classification_report(rf_class_rep, "RF") # AUC and ROC Curves # ROC for XGBoost model roc_auc = roc_auc_score(y_test, pred_probs[:, 1]) print("XGBoost ROC AUC value: {}".format(roc_auc)) rf_roc_auc = roc_auc_score(y_test, rf_pred_probs[:, 1]) print("RF ROC AUC value: {}".format(rf_roc_auc)) # Output ROC of the chosen model using MCenter mlops.set_stat("XGBoost ROC AUC", roc_auc, st.TIME_SERIES) mlops.set_stat("RF ROC AUC", rf_roc_auc, st.TIME_SERIES) if roc_auc <= min_auc_requirement: mlops.health_alert("[Training] AUC Violation From Training Node", "AUC Went Below {}. 
Current AUC Is {}".format(min_auc_requirement, roc_auc)) # ROC curve fpr, tpr, thr = roc_curve(y_test, pred_probs[:, 1]) rf_fpr, rf_tpr, rf_thr = roc_curve(y_test, rf_pred_probs[:, 1]) cg = MultiGraph().name("Receiver Operating Characteristic ").set_continuous() cg.add_series(label='Random curve ''', x=fpr.tolist(), y=fpr.tolist()) cg.add_series(label='XGBoost ROC curve (area = {0:0.2f})'''.format(roc_auc), x=fpr.tolist(), y=tpr.tolist()) cg.add_series(label='RF ROC curve (area = {0:0.2f})'''.format(rf_roc_auc), x=rf_fpr.tolist(), y=rf_tpr.tolist()) cg.x_title('False Positive Rate') cg.y_title('True Positive Rate') mlops.set_stat(cg) # Feature importance comparison # XGBoost Feature importance export_feature_importance(final_model, list(X_train.columns), 5, "XGBoost") export_feature_importance(rf_model, list(X_train.columns), 5, "RF") # KS Analysis max_pred_probs = pred_probs.max(axis=1) y_test0=np.where(y_test == 0)[0] y_test1=np.where(y_test == 1)[0] rf_max_pred_probs = rf_pred_probs.max(axis=1) # KS for the XGBoost model ks = ks_2samp(max_pred_probs[y_test0], max_pred_probs[y_test1]) ks_stat = ks.statistic ks_pvalue = ks.pvalue print("KS values for XGBoost: \n Statistics: {} \n pValue: {}\n".format(ks_stat, ks_pvalue)) # KS for the RF model rf_ks = ks_2samp(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1]) rf_ks_stat = rf_ks.statistic rf_ks_pvalue = rf_ks.pvalue print("RF KS values: \n Statistics: {} \n pValue: {}\n".format(rf_ks_stat, rf_ks_pvalue)) # Output KS Stat of the chosen model using MCenter mlops.set_stat("KS Stats for XGBoost", ks_stat, st.TIME_SERIES) # Output KS Stat of the chosen model using MCenter mlops.set_stat("KS Stats for RF", rf_ks_stat, st.TIME_SERIES) # raising alert if ks-stat goes above required threshold if ks_stat >= max_ks_requirement: mlops.health_alert("[Training] KS Violation From Training Node", "KS Stat Went Above {}. Current KS Stat Is {}".format(max_ks_requirement, ks_stat)) ks_table = Table().name("KS Stats for XGBoost").cols(["Statistic", "pValue"]) ks_table.add_row([ks_stat, ks_pvalue]) mlops.set_stat(ks_table) # PSI Analysis # Calculating PSI total_psi, psi_table = get_psi(max_pred_probs[y_test0], max_pred_probs[y_test1]) rf_total_psi, rf_psi_table = get_psi(rf_max_pred_probs[y_test0], rf_max_pred_probs[y_test1]) psi_table_stat = Table().name("PSI Stats for XGBoost").cols( ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent", "Curr Percent", "Segment PSI"]) row_num = 1 for each_value in psi_table.values: str_values = [str(i) for i in each_value] psi_table_stat.add_row(str(row_num), str_values) row_num += 1 mlops.set_stat(psi_table_stat) print("Total XGBoost PSI values: \n {}".format(total_psi)) # Output Total PSI of the chosen model using MCenter mlops.set_stat("Total XGBoost PSI ", total_psi, st.TIME_SERIES) if total_psi >= min_psi_requirement: mlops.health_alert("[Training] PSI Violation From Training Node", "PSI Went Below {}. 
Current PSI Is {}".format(min_psi_requirement, total_psi)) print("Total RF PSI values: \n {}".format(rf_total_psi)) rf_psi_table_stat = Table().name("PSI Stats for RF").cols( ["Base Pop", "Curr Pop", "Lower Bound", "Upper Bound", "Base Percent", "Curr Percent", "Segment PSI"]) row_num = 1 for each_value in rf_psi_table.values: str_values = [str(i) for i in each_value] rf_psi_table_stat.add_row(str(row_num), str_values) row_num += 1 mlops.set_stat(rf_psi_table_stat) # Output Total PSI of the chosen model using MCenter mlops.set_stat("Total RF PSI ", rf_total_psi, st.TIME_SERIES) # ## Save the XGBoost Model import pickle model_file = open(pm_options.output_model, 'wb') pickle.dump(final_model, model_file) model_file.close() # ## Finish the program mlops.done()
def generate_health_and_heatmap_stat(stat_object_method, logger, features_values, features_names, model_stat, model_id, num_bins=13, # TODO: Have ability to get this argument from user! data_analysis=True): """ Method is highly responsible and creates continuous/categorical histograms. Also creates heatmap and compare two histogram if program is running on inference. :param stat_object_method: stat object method to output stat :param logger: logger to log :param features_values: feature array :param features_names: feature names :param model_stat: model stat :param num_bins: max number of bins for features. :return: """ # generating general stats like categorical/continuous features and contender histograms. general_hist_stat = GeneralHistogramStat() general_hist_stat \ .create_and_set_general_stat(set_of_features_values=features_values, set_of_features_names=features_names, model_stat=model_stat) # For Continuous Values # continuous feature names continuous_features_names = general_hist_stat.set_of_continuous_features # predefined bins of contender continuous hist pred_bins_continuous_hist = general_hist_stat.contender_continuous_hist_bins contender_continuous_histogram_representation = general_hist_stat.contender_continuous_histogram continuous_features_values = PythonChannelHealth. \ _create_feature_subset(features_values=features_values, features_names=features_names, selection_features_subset=continuous_features_names) current_continuous_histogram_representation = \ PythonChannelHealth._create_current_hist_rep( features_values=continuous_features_values, features_names=continuous_features_names, num_bins=num_bins, pred_bins_hist=pred_bins_continuous_hist, stat_object_method=stat_object_method, name_of_stat=PyHealth.CONTINUOUS_HISTOGRAM_KEY, model_id=model_id) # running data analysis for continuous dataset if data_analysis: continuous_data_analyst_result = ContinuousDataAnalyst \ .analyze(set_of_continuous_feature_names=continuous_features_names, set_of_continuous_feature_values=continuous_features_values) # outputting stat only if analysis result is there if len(continuous_data_analyst_result) > 0: cont_da = Table() \ .name("Continuous Data Analysis") \ .cols(["Count", "Missing", "Zeros", "Standard Deviation", "Min", "Mean", "Median", "Max"]) for f_n in continuous_data_analyst_result.keys(): f_v = continuous_data_analyst_result[f_n] cont_da.add_row(str(f_v.feature_name), [f_v.count, f_v.NAs, f_v.zeros, f_v.std, f_v.min, f_v.mean, f_v.median, f_v.max]) # outputting stat using stat object as stat message type stat_object_method(mlops_stat=cont_da.get_mlops_stat(model_id=model_id), reflex_event_message_type=ReflexEvent.StatsMessage) logger.debug("continuous features values: {}".format(continuous_features_values)) logger.debug("continuous features names: {}".format(continuous_features_names)) logger.debug( "current histogram representation: {}".format(current_continuous_histogram_representation)) logger.debug( "contender histogram representation: {}".format(contender_continuous_histogram_representation)) # For Categorical Values # categorical feature names categorical_features_names = general_hist_stat.set_of_categorical_features # predefined bins of contender categorical hist pred_bins_categorical_hist = general_hist_stat.contender_categorical_hist_bins contender_categorical_histogram_representation = general_hist_stat.contender_categorical_histogram categorical_features_values = PythonChannelHealth._create_feature_subset(features_values=features_values, features_names=features_names, 
selection_features_subset=categorical_features_names) current_categorical_histogram_representation = \ PythonChannelHealth._create_current_hist_rep( categorical_features_values, categorical_features_names, num_bins, pred_bins_categorical_hist, stat_object_method, name_of_stat=PyHealth.CATEGORICAL_HISTOGRAM_KEY, model_id=model_id) # running data analysis for categorical dataset if data_analysis: categorical_data_analyst_result = CategoricalDataAnalyst \ .analyze(set_of_categorical_feature_names=categorical_features_names, set_of_categorical_feature_values=categorical_features_values) # outputting stat only if analysis result is there if len(categorical_data_analyst_result) > 0: categ_da = Table() \ .name("Categorical Data Analysis") \ .cols(["Count", "Missing", "Uniques", "Top Frequently Occurring Category", "Top Frequency", "Average String Length"]) for f_n in categorical_data_analyst_result.keys(): f_v = categorical_data_analyst_result[f_n] categ_da. \ add_row(str(f_v.feature_name), [f_v.count, f_v.NAs, f_v.unique, f_v.top, f_v.freq_top, f_v.avg_str_len]) # outputting stat using stat object as stat message type stat_object_method(mlops_stat=categ_da.get_mlops_stat(model_id=model_id), reflex_event_message_type=ReflexEvent.StatsMessage) logger.debug("categorical features values: {}".format(categorical_features_values)) logger.debug("categorical features names: {}".format(categorical_features_names)) logger.debug( "current histogram representation: {}".format(current_categorical_histogram_representation)) logger.debug( "contender histogram representation: {}".format(contender_categorical_histogram_representation)) # If model_stat is given, it means it is inference program # so it needs to create heatmap and score too. if model_stat is not None: if continuous_features_values.shape[0] > 0: continuous_features_names, heat_map_values = PythonChannelHealth. \ _create_current_continuous_heatmap_rep(continuous_features_values=continuous_features_values, continuous_features_names=continuous_features_names, stat_object_method=stat_object_method, model_id=model_id) logger.debug("features: {}, heatmap values: {}".format(continuous_features_names, heat_map_values)) compared_continuous_feature_names, compared_continuous_feature_score = PythonChannelHealth. \ _compare_health( current_histogram_representation=current_continuous_histogram_representation, contender_histogram_representation=contender_continuous_histogram_representation, stat_object_method=stat_object_method, name_of_stat=PyHealth.CONTINUOUS_HISTOGRAM_OVERLAP_SCORE_KEY, model_id=model_id) logger.debug( "continuous features: {}, overlap scores: {}".format(compared_continuous_feature_names, compared_continuous_feature_score)) if categorical_features_values.shape[0] > 0: compared_categorical_feature_names, compared_categorical_feature_score = PythonChannelHealth. \ _compare_health( current_histogram_representation=current_categorical_histogram_representation, contender_histogram_representation=contender_categorical_histogram_representation, stat_object_method=stat_object_method, name_of_stat=PyHealth.CATEGORICAL_HISTOGRAM_OVERLAP_SCORE_KEY, model_id=model_id) logger.debug( "categorical features: {}, overlap scores: {}".format( compared_categorical_feature_names, compared_categorical_feature_score))
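# The overlap score emitted by PythonChannelHealth._compare_health() is internal to the library;
# as a rough mental model only (not the library's exact formula), an overlap between a current
# and a contender histogram defined on the same bins can be computed as the sum of per-bin
# minima of the normalized frequencies:
def histogram_overlap(current_freqs, contender_freqs):
    """Illustrative overlap score: 1.0 for identical distributions, 0.0 for disjoint ones."""
    cur_total = float(sum(current_freqs)) or 1.0
    con_total = float(sum(contender_freqs)) or 1.0
    return sum(min(c / cur_total, k / con_total)
               for c, k in zip(current_freqs, contender_freqs))

# Example: proportionally identical histograms overlap completely.
assert abs(histogram_overlap([5, 3, 2], [50, 30, 20]) - 1.0) < 1e-9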
def main(): pm_options = parse_args() print("PM: Configuration:") print("PM: Data file: [{}]".format(pm_options.data_file)) print("PM: Output model: [{}]".format(pm_options.output_model)) print("PM: regularization_range: [{}]".format( pm_options.regularization_range)) mlops.init() # Read the Samsung datafile dataset = pd.read_csv(pm_options.data_file) # Separate into features and labels features = dataset.iloc[:, 1:].values labels = dataset.iloc[:, 0].values # Hyper-parameter search using k-fold cross-validation # Applying k_fold cross validation regularization_range = pm_options.regularization_range.split(',') regularization = [ float(regularization_var) for regularization_var in regularization_range ] tune_parameters = [{'C': regularization}] # Initialize logistic regression algorithm LR = LogisticRegression(class_weight='balanced', multi_class='multinomial', solver='lbfgs') clf = GridSearchCV(LR, tune_parameters, cv=5, scoring='accuracy') clf.fit(features, labels) print("best parameter = ", clf.best_params_) accuracy = clf.cv_results_['mean_test_score'] print( 'Accuracy values: \n {0} \n for `Regularization values: \n{1}'.format( accuracy, regularization)) ########## Start of ParallelM instrumentation ############## # Report Hyper-parameter Table tbl = Table().name("Hyper-parameter Search Results").cols( ["Mean accuracy from k-fold cross-validation"]) print("length of regularization", len(regularization)) index_max = np.argmax(accuracy) for a in range(0, len(regularization)): print("adding row", regularization[a]) if a == index_max: tbl.add_row("[Best] Regularization = " + np.str(regularization[a]), [accuracy[a]]) else: tbl.add_row("Regularization = " + np.str(regularization[a]), [accuracy[a]]) mlops.set_stat(tbl) ########## End of ParallelM instrumentation ############## # Label distribution in training label_distribution = dataset['label'].value_counts() column_names = np.array(label_distribution.index).astype(str).tolist() print("Label distributions: \n {0}".format(label_distribution)) ########## Start of ParallelM instrumentation ############## # Report label distribution as a BarGraph bar = BarGraph().name("Label Distribution").cols( np.array(label_distribution.index).astype(str).tolist()).data( label_distribution.values.tolist()) mlops.set_stat(bar) ########## Start of ParallelM instrumentation ############## #################### Start of ParallelM instrumentation ################ # Report accuracy of the chosen model mlops.set_stat("K-fold cross-validation Accuracy", accuracy[index_max], st.TIME_SERIES) #################### End of ParallelM instrumentation ################ # Histogram input mlops.set_data_distribution_stat(dataset) # Save the model import pickle model_file = open(pm_options.output_model, 'wb') pickle.dump(clf, model_file) model_file.close() mlops.done()
def run_mlops_tests(package_to_scan, test_to_run=None): """ Given a directory, scan the directory and take all files starting with test_ in each file run all the functions starting with test_ TODO: find a way to use pytest here if possible. :param package_to_scan: package to scan for test modules :param test_to_run: If provided run only a specific test "module.func" :raise Exception: In case of error in the tests an Exception will be raised """ modules = detect_modules_in_package(package_to_scan) print("Detected modules: {}".format(modules)) print("Loading and running test_XXX methods inside") results = [] failed_tests = 0 total_tests = 0 for mod_name in modules: mod = importlib.import_module(package_to_scan.__name__ + "." + mod_name) mod_funcs = detect_module_methods(mod) module_results = dict() module_results["name"] = mod_name module_results["per_func"] = [] module_results["pass"] = True print("Module {} funcs {}".format(mod, mod_funcs)) for func_name in mod_funcs: if test_to_run is not None: full_test_name = "{}.{}".format(mod_name, func_name) if full_test_name != test_to_run: continue total_tests += 1 func_results = dict() func_results["name"] = func_name print("\n\nrunning test: {}.{}".format(mod_name, func_name)) try: method_to_call = getattr(mod, func_name) method_to_call() func_results["pass"] = True except Exception as e: func_results["pass"] = False func_results["traceback"] = "".join( traceback.format_exception(*sys.exc_info())) failed_tests += 1 module_results["pass"] = False module_results["per_func"].append(func_results) results.append(module_results) # Table tbl = Table().name("Test Results").cols(["Module", "Test", "Status"]) print("\n\n\n") print("Test Summary: total: {} ok: {} failed: {}".format( total_tests, total_tests - failed_tests, failed_tests)) print("=======================================================") idx = 0 for mod_res in results: print("Module: {}".format(mod_res["name"])) for func_res in mod_res["per_func"]: if test_to_run is not None: full_test_name = "{}.{}".format(mod_res["name"], func_res["name"]) if full_test_name != test_to_run: continue print("| {:<20} {}".format(func_res["name"], func_res["pass"])) if func_res["pass"] is False: print("\n{}\n".format(func_res["traceback"])) tbl.add_row(str(idx), [ mod_res["name"], func_res["name"], "pass" if func_res["pass"] else "fail" ]) idx += 1 pm.set_stat(tbl) pm.set_stat(E2EConstants.E2E_RUN_STAT, 1, st.TIME_SERIES) if failed_tests > 0: print("=======================================================\n") print("Aborting unit test due to errors") raise Exception( "Failed running unit tests! failed: {}\n".format(failed_tests))
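# Usage sketch for run_mlops_tests(): the runner is handed an already-imported package object.
# `my_e2e_tests` and the module/function names below are hypothetical, used only for illustration.
import my_e2e_tests

# Run every test_* function found in every test_* module of the package and report a
# "Test Results" table plus the E2E run stat.
run_mlops_tests(package_to_scan=my_e2e_tests)

# Or run a single test, addressed as "<module>.<function>".
run_mlops_tests(package_to_scan=my_e2e_tests, test_to_run="test_tables.test_table")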
def infer_loop(model, input, output_file, stats_interval, conf_tracker): output = open(output_file, "w") # Initialize statistics total_predictions = 0 low_confidence_predictions = 0 categories = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"] prediction_hist = [] for i in range(0, model.get_num_categories()): prediction_hist.append(0) ### MLOPS start # Create a bar graph and table for reporting prediction distributions and set the column names infer_bar = BarGraph().name("Prediction Distribution Bar Graph").cols( categories) infer_tbl = Table().name("Prediction Distribution Table").cols(categories) ### MLOPS end while True: try: sample, label = input.get_next_input() # Get the inference. This is an array of probabilities for each output value. inference = model.infer(sample) # The prediction is the class with the highest probability prediction = ny.argmax(inference) # The confidence for that prediction confidence = inference[prediction] * 100 # Append the prediction to the output file output.write("{}\n".format(prediction)) # Calculate statistics total_predictions += 1 prediction_hist[prediction] += 1 conf_tracker.check_confidence(confidence, sample) # Report statistics if total_predictions % stats_interval == 0: # Report the prediction distribution for i in range(0, model.get_num_categories()): print("category: {} predictions: {}".format( categories[i], prediction_hist[i])) ### MLOPS start # Update total prediction count with the all new predictions since we last reported mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, stats_interval) # Show the prediction distribution as a table infer_tbl.add_row(str(total_predictions), prediction_hist) # Show the prediction distribution as a bar graph infer_bar.data(prediction_hist) # Report the stats mlops.set_stat(infer_tbl) mlops.set_stat(infer_bar) ### MLOPS end conf_tracker.report_confidence(stats_interval) except EOFError: # stop when we hit end of input print("Reached end of input") output.close() break
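# infer_loop() only relies on two methods of conf_tracker: check_confidence(confidence, sample)
# per prediction and report_confidence(stats_interval) at each reporting tick. A minimal stand-in
# that counts low-confidence predictions is sketched below; the 80% threshold and the stat name
# are illustrative assumptions, not the example's actual tracker, and it assumes the same
# `mlops` and `st` imports used elsewhere in these examples.
class SimpleConfidenceTracker(object):
    """Counts predictions whose confidence falls below a threshold and reports the
    count to MCenter at every reporting interval."""

    def __init__(self, threshold_pct=80.0):
        self._threshold = threshold_pct
        self._low_conf_count = 0

    def check_confidence(self, confidence, sample):
        # `confidence` is a percentage (0-100), as computed in infer_loop().
        if confidence < self._threshold:
            self._low_conf_count += 1

    def report_confidence(self, stats_interval):
        mlops.set_stat("Low Confidence Predictions", self._low_conf_count, st.TIME_SERIES)
        self._low_conf_count = 0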
mlops.set_stat("myCounterDouble2", 7.3) # Multi-line graph mlt = MultiLineGraph().name("Multi Line").labels(["l1", "l2"]).data([5, 16]) mlops.set_stat(mlt) # Example of sending a table to pm system. # Multi-line graphs mlt = MultiLineGraph().name("Multi Line").labels(["l1", "l2"]).data([5, 16]) mlops.set_stat(mlt) # Table example tbl = Table().name("MyTable").cols(["", "Date"]) tbl.add_row(["line 1", "2001Q1"]) tbl.add_row(["line 2", "2014Q3"]) mlops.set_stat(tbl) bar = BarGraph().name("MyBar").cols(["aa", "bb", "cc", "dd", "ee"]).data([10, 15, 12, 9, 8]) mlops.set_stat(bar) partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2 n = 100000 * partitions def f(_): x = random() * 2 - 1 y = random() * 2 - 1 return 1 if x**2 + y**2 <= 1 else 0
def ab_test(options, start_time, end_time, mode): sc = None if mode == RunModes.PYSPARK: from pyspark import SparkContext sc = SparkContext(appName="pm-ab-testing") pm.init(sc) elif mode == RunModes.PYTHON: pm.init() else: raise Exception("Invalid mode " + mode) not_enough_data = False # Following are a and b component names a_prediction_component_name = options.nodeA b_prediction_component_name = options.nodeB conv_a_stat_name = options.conversionsA conv_b_stat_name = options.conversionsB samples_a_stat_name = options.samplesA samples_b_stat_name = options.samplesB a_agent = utils._get_agent_id(a_prediction_component_name, options.agentA) b_agent = utils._get_agent_id(b_prediction_component_name, options.agentB) if a_agent is None or b_agent is None: print("Invalid agent provided {} or {}".format(options.agentA, options.agentB)) pm.system_alert("PyException", "Invalid Agent {} or {}".format(options.agentA, options.agentB)) return try: a_samples = pm.get_stats(name=samples_a_stat_name, mlapp_node=a_prediction_component_name, agent=a_agent, start_time=start_time, end_time=end_time) b_samples = pm.get_stats(name=samples_b_stat_name, mlapp_node=b_prediction_component_name, agent=b_agent, start_time=start_time, end_time=end_time) a_samples_pdf = pd.DataFrame(a_samples) b_samples_pdf = pd.DataFrame(b_samples) try: rowa1 = int(a_samples_pdf.tail(1)['value']) rowb1 = int(b_samples_pdf.tail(1)['value']) except Exception as e: not_enough_data = True print("Not enough samples stats produced in pipelines") raise ValueError("Not enough data to compare") a_conv = pm.get_stats(name=conv_a_stat_name, mlapp_node=a_prediction_component_name, agent=a_agent, start_time=start_time, end_time=end_time) b_conv = pm.get_stats(name=conv_b_stat_name, mlapp_node=b_prediction_component_name, agent=b_agent, start_time=start_time, end_time=end_time) a_conv_pdf = pd.DataFrame(a_conv) b_conv_pdf = pd.DataFrame(b_conv) try: rowa2 = int(a_conv_pdf.tail(1)['value']) rowb2 = int(b_conv_pdf.tail(1)['value']) except Exception as e: not_enough_data = True print("Not enough conversion stats produced in pipelines") raise ValueError("Not enough data to compare") abHealth = statsCalculator() abHealth.exptOutcome(float(rowa1), float(rowa2), float(rowb1), float(rowb2), options.confidence) confidence = abHealth.calConfidence() out = abHealth.calSuccess(options.confidence) # calculate conversion rate convA = float(rowa2) / float(rowa1) convB = float(rowb2) / float(rowb1) if convA != 0.0: relUplift = (convB - convA) / (convA) else: relUplift = convB relUplift = relUplift * 100 # AB Graphs ab = MultiGraph().name("AB").set_continuous() ab.x_title("Conversion Rate (%)") ab.y_title(" ") # normalizing x and y axis for A for display dist_a_norm_x = [a_x * 100.0 / rowa1 for a_x in abHealth._distControl[0].tolist()] dist_a_norm_y = [a_y * rowa1 / 100.0 for a_y in abHealth._distControl[1].tolist()] ab.add_series(label="A", x=dist_a_norm_x, y=dist_a_norm_y) # normalizing x and y axis for B for display dist_b_norm_x = [b_x * 100.0 / rowb1 for b_x in abHealth._distB[0].tolist()] dist_b_norm_y = [b_y * rowb1 / 100.0 for b_y in abHealth._distB[1].tolist()] ab.add_series(label="B", x=dist_b_norm_x, y=dist_b_norm_y) # annotate confidence line on normalized x-axis ab.annotate(label="{} %".format(options.confidence), x=abHealth._verticalLine * 100.0 / rowa1) # for not overriding it in display # annotate CR line on normalized x-axis if convA != convB: ab.annotate(label="CR A {}".format(convA * 100.0), x=convA * 100.0) ab.annotate(label="CR B 
{}".format(convB * 100.0), x=convB * 100.0) else: ab.annotate(label="CR A & B {}".format(convA * 100.0), x=convA * 100.0) pm.set_stat(ab) # conversion rate cols = ["A", "B"] mlt = MultiLineGraph().name("ConversionRate").labels(cols).data( [convA * 100.0, convB * 100.0]) pm.set_stat(mlt) # emit table with all stats tbl2 = Table().name("AB Stats").cols( ["Samples Processed", "Conversions", "Conversion Rate (%)", "Improvement (%)", "Chance to beat baseline (%)"]) tbl2.add_row(options.champion, [str(rowa1), str(rowa2), "{0:.2f}".format(convA * 100), "-", "-"]) tbl2.add_row(options.challenger, [str(rowb1), str(rowb2), "{0:.2f}".format(convB * 100), "{0:.2f}".format(relUplift), "{0:.2f}".format(confidence)]) pm.set_stat(tbl2) # set cookie tbl = Table().name("cookie").cols(["uplift", "champion", "challenger", "conversionA", "conversionB", "realUplift", "success", "confidence", "realConfidence"]) tbl.add_row("1", [str(options.uplift), options.champion, options.challenger, "{0:.2f}".format(convA * 100), "{0:.2f}".format(convB * 100), "{0:.2f}".format(abHealth._uplift), str(out), str(options.confidence), "{0:.2f}".format(abHealth.calConfidence())]) pm.set_stat(tbl) if out == True: pm.data_alert("DataAlert", "AB Test Success zScore {}".format(abHealth._zScore)) pm.set_stat("Success", 1, st.TIME_SERIES) else: pm.set_stat("Success", 0, st.TIME_SERIES) except Exception as e: if not_enough_data is False: print("Got exception while getting stats: {}".format(e)) pm.system_alert("PyException", "Got exception {}".format(e)) if mode == RunModes.PYSPARK: sc.stop() pm.done()