def lift_chart(y_true: str, y_score: str, input_relation: str, cursor, pos_label = 1, nbins: int = 1000): query = "SELECT LIFT_TABLE(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, {}::float AS prob FROM {}) AS prediction_output" query = query.format(nbins, y_true, pos_label, y_score, input_relation) cursor.execute(query) query_result = cursor.fetchall() decision_boundary, positive_prediction_ratio, lift = [item[0] for item in query_result], [item[1] for item in query_result], [item[2] for item in query_result] decision_boundary.reverse() try: import matplotlib.pyplot as plt import matplotlib.patches as mpatches plt.figure(figsize = (10,8)) plt.rcParams['axes.facecolor'] = '#F5F5F5' plt.xlabel('Cumulative Data Fraction') plt.plot(decision_boundary, lift, color = "#214579") plt.plot(decision_boundary, positive_prediction_ratio, color = "#444444") plt.title("Lift Table") plt.gca().set_axisbelow(True) plt.grid() color1 = mpatches.Patch(color = "#214579", label = 'Cumulative Lift') color2 = mpatches.Patch(color = "#444444", label = 'Cumulative Capture Rate') plt.legend(handles = [color1, color2]) plt.show() except: pass return (tablesample(values = {"decision_boundary": decision_boundary, "positive_prediction_ratio": positive_prediction_ratio, "lift": lift}, table_info = False))
def confusion_matrix(self, pos_label=None, cutoff: float = 0.5): pos_label = self.classes[1] if ( pos_label == None and len(self.classes) == 2) else pos_label if (pos_label in self.classes and cutoff < 1 and cutoff > 0): input_relation = self.deploySQL( ) + " WHERE predict_knc = '{}'".format(pos_label) y_score = "(CASE WHEN proba_predict > {} THEN 1 ELSE 0 END)".format( cutoff) y_true = "DECODE({}, '{}', 1, 0)".format(self.y, pos_label) result = confusion_matrix(y_true, y_score, input_relation, self.cursor) if pos_label == 1: return (result) else: return (tablesample(values={ "index": ["Non-{}".format(pos_label), "{}".format(pos_label)], "Non-{}".format(pos_label): result.values[0], "{}".format(pos_label): result.values[1] }, table_info=False)) else: input_relation = "(SELECT *, ROW_NUMBER() OVER(PARTITION BY {}, row_id ORDER BY proba_predict DESC) AS pos FROM {}) knc_table_predict WHERE pos = 1".format( ", ".join(self.X), self.deploySQL()) return (multilabel_confusion_matrix(self.y, "predict_knc", input_relation, self.cursor, self.classes))
def elbow(X: list, input_relation: str, cursor, n_cluster = (1, 15), init = "kmeanspp", max_iter: int = 50, tol: float = 1e-4): import matplotlib.pyplot as plt from vertica_ml_python.learn.cluster import KMeans schema, relation = schema_relation(input_relation) schema = str_column(schema) relation_alpha = ''.join(ch for ch in relation if ch.isalnum()) all_within_cluster_SS = [] L = [i for i in range(n_cluster[0], n_cluster[1])] if not(type(n_cluster) == list) else n_cluster for i in L: cursor.execute("DROP MODEL IF EXISTS {}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha)) model = KMeans("{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha), cursor, i, init, max_iter, tol) model.fit(input_relation, X) all_within_cluster_SS += [float(model.metrics.values["value"][3])] model.drop() plt.figure(figsize = (10,8)) plt.rcParams['axes.facecolor'] = '#F4F4F4' plt.grid() plt.plot(L, all_within_cluster_SS, marker = "s", color = "#214579") plt.title("Elbow Curve") plt.xlabel('Number of Clusters') plt.ylabel('Between-Cluster SS / Total SS') plt.subplots_adjust(left = 0.2) plt.show() values = {"index": L, "Within-Cluster SS": all_within_cluster_SS} return tablesample(values = values, table_info = False)
def confusion_matrix(self, pos_label=None, cutoff: float = 0.5): pos_label = self.classes[1] if ( pos_label == None and len(self.classes) == 2) else pos_label if (pos_label in self.classes and cutoff < 1 and cutoff > 0): input_relation = self.deploySQL( ) + " WHERE predict_nc = '{}'".format(pos_label) y_score = "(CASE WHEN proba_predict > {} THEN 1 ELSE 0 END)".format( cutoff) y_true = "DECODE({}, '{}', 1, 0)".format(self.y, pos_label) result = confusion_matrix(y_true, y_score, input_relation, self.cursor) if pos_label == 1: return (result) else: return (tablesample(values={ "index": ["Non-{}".format(pos_label), "{}".format(pos_label)], "Non-{}".format(pos_label): result.values[0], "{}".format(pos_label): result.values[1] }, table_info=False)) else: return (multilabel_confusion_matrix(self.y, "predict_nc", self.deploySQL(predict=True), self.cursor, self.classes))
def regression_report(y_true: str, y_score: str, input_relation: str, cursor): query = "SELECT 1 - VARIANCE({} - {}) / VARIANCE({}), MAX(ABS({} - {})), ".format(y_true, y_score, y_true, y_true, y_score) query += "APPROXIMATE_MEDIAN(ABS({} - {})), AVG(ABS({} - {})), ".format(y_true, y_score, y_true, y_score) query += "AVG(POW({} - {}, 2)) FROM {}".format(y_true, y_score, input_relation) r2 = r2_score(y_true, y_score, input_relation, cursor) values = {"index": ["explained_variance", "max_error", "median_absolute_error", "mean_absolute_error", "mean_squared_error", "r2"]} cursor.execute(query) values["value"] = [item for item in cursor.fetchone()] + [r2] return (tablesample(values, table_info = False))
def classification_report(y_true: str = "", y_score: str = "", input_relation: str = "", cursor = None, labels: list = [], cutoff: float = 0.5, estimator = None): if (estimator): num_classes = len(estimator.classes) labels = labels if (num_classes > 2) else [estimator.classes[1]] else: labels = [1] if not (labels) else labels num_classes = len(labels) + 1 values = {"index": ["auc", "prc_auc", "accuracy", "log_loss", "precision", "recall", "f1-score", "mcc", "informedness", "markedness", "csi"]} for idx, elem in enumerate(labels): pos_label = elem non_pos_label = 0 if (elem == 1) else "Non-{}".format(elem) if (estimator): try: matrix = estimator.confusion_matrix(pos_label, cutoff) except: matrix = estimator.confusion_matrix(pos_label) else: y_s, y_p, y_t = y_score[0].format(elem), y_score[1], "DECODE({}, '{}', 1, 0)".format(y_true, elem) matrix = confusion_matrix(y_true, y_p, input_relation, cursor, pos_label) tn, fp, fn, tp = matrix.values[non_pos_label][0], matrix.values[non_pos_label][1], matrix.values[pos_label][0], matrix.values[pos_label][1] ppv = tp / (tp + fp) if (tp + fp != 0) else 0 # precision tpr = tp / (tp + fn) if (tp + fn != 0) else 0 # recall tnr = tn / (tn + fp) if (tn + fp != 0) else 0 npv = tn / (tn + fn) if (tn + fn != 0) else 0 f1 = 2 * (tpr * tnr) / (tpr + tnr) if (tpr + tnr != 0) else 0 # f1 csi = tp / (tp + fn + fp) if (tp + fn + fp != 0) else 0 # csi bm = tpr + tnr - 1 # informedness mk = ppv + npv - 1 # markedness mcc = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) if (tp + fp != 0) and (tp + fn != 0) and (tn + fp != 0) and (tn + fn != 0) else 0 if (estimator): try: accuracy = estimator.score(pos_label = pos_label, method = "acc", cutoff = cutoff) except: accuracy = estimator.score(pos_label = pos_label, method = "acc") auc_score, logloss, prc_auc_score = estimator.score(pos_label = pos_label, method = "auc"), estimator.score(pos_label = pos_label, method = "log_loss"), estimator.score(pos_label = pos_label, method = "prc_auc") else: auc_score = auc(y_t, y_s, input_relation, cursor, 1) prc_auc_score = prc_auc(y_t, y_s, input_relation, cursor, 1) y_p = "DECODE({}, '{}', 1, 0)".format(y_p, elem) logloss = log_loss(y_t, y_s, input_relation, cursor, 1) accuracy = accuracy_score(y_t, y_p, input_relation, cursor) elem = "value" if (len(labels) == 1) else elem values[elem] = [auc_score, prc_auc_score, accuracy, logloss, ppv, tpr, f1, mcc, bm, mk, csi] return (tablesample(values, table_info = False))
def features_importance(self): query = "SELECT predictor_name AS predictor, ROUND(100 * importance_value / SUM(importance_value) OVER (), 2) AS importance, SIGN(importance_value) AS sign FROM (SELECT RF_PREDICTOR_IMPORTANCE ( USING PARAMETERS model_name = '{}')) x ORDER BY 2 DESC;".format( self.name) self.cursor.execute(query) result = self.cursor.fetchall() coeff_importances, coeff_sign = {}, {} for elem in result: coeff_importances[elem[0]] = elem[1] coeff_sign[elem[0]] = elem[2] try: plot_importance(coeff_importances, coeff_sign, print_legend=False) except: pass importances = {"index": ["importance"]} for elem in coeff_importances: importances[elem] = [coeff_importances[elem]] return (tablesample(values=importances, table_info=False).transpose())
def fit(self, input_relation: str, X: list): self.input_relation = input_relation self.X = [str_column(column) for column in X] query = "SELECT KMEANS('{}', '{}', '{}', {} USING PARAMETERS max_iterations = {}, epsilon = {}".format(self.name, input_relation, ", ".join(self.X), self.n_cluster, self.max_iter, self.tol) name = "_vpython_kmeans_initial_centers_table_" schema, relation = schema_relation(input_relation) schema = str_column(schema) if (type(self.init) != str): self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name)) if (len(self.init) != self.n_cluster): raise ValueError("'init' must be a list of 'n_cluster' = {} points".format(self.n_cluster)) else: for item in self.init: if (len(X) != len(item)): raise ValueError("Each points of 'init' must be of size len(X) = {}".format(len(self.X))) temp_initial_centers = [item for item in self.init] for item in temp_initial_centers: del temp_initial_centers[0] if (item in temp_initial_centers): raise ValueError("All the points of 'init' must be different") query0 = [] for i in range(len(self.init)): line = [] for j in range(len(self.init[0])): line += [str(self.init[i][j]) + " AS " + X[j]] line = ",".join(line) query0 += ["SELECT " + line] query0 = " UNION ".join(query0) query0 = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS {}".format(schema, name, query0) self.cursor.execute(query0) query += ", initial_centers_table = '" + name + "'" else: query += ", init_method = '" + self.init + "'" query += ")" self.cursor.execute(query) self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name)) self.cluster_centers = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(self.name), cursor = self.cursor) self.cluster_centers.table_info = False query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'metrics')".format(self.name) self.cursor.execute(query) result = self.cursor.fetchone()[0] values = {"index": ["Between-Cluster Sum of Squares", "Total Sum of Squares", "Total Within-Cluster Sum of Squares", "Between-Cluster SS / Total SS", "converged"]} values["value"] = [float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]), float(result.split("Total Sum of Squares: ")[1].split("\n")[0]), float(result.split("Total Within-Cluster Sum of Squares: ")[1].split("\n")[0]), float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]) / float(result.split("Total Sum of Squares: ")[1].split("\n")[0]), result.split("Converged: ")[1].split("\n")[0] == "True"] self.metrics = tablesample(values, table_info = False) return (self)
def roc_curve(y_true: str, y_score: str, input_relation: str, cursor, pos_label = 1, nbins: int = 1000, auc_roc: bool = False, best_threshold: bool = False): query = "SELECT ROC(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, {}::float AS prob FROM {}) AS prediction_output" query = query.format(nbins, y_true, pos_label, y_score, input_relation) cursor.execute(query) query_result = cursor.fetchall() threshold, false_positive, true_positive = [item[0] for item in query_result], [item[1] for item in query_result], [item[2] for item in query_result] auc=0 for i in range(len(false_positive) - 1): if (false_positive[i + 1] - false_positive[i] != 0.0): a = (true_positive[i + 1] - true_positive[i]) / (false_positive[i + 1] - false_positive[i]) b = true_positive[i + 1] - a * false_positive[i + 1] auc = auc + a * (false_positive[i + 1] * false_positive[i + 1] - false_positive[i] * false_positive[i]) / 2 + b * (false_positive[i + 1] - false_positive[i]); auc = - auc auc = min(auc, 1.0) if (auc_roc): return (auc) if (best_threshold): best_threshold_arg = np.argmax([abs(y - x) for x, y in zip(false_positive, true_positive)]) return (threshold[best_threshold_arg]) try: import matplotlib.pyplot as plt plt.figure(figsize = (10,8)) plt.rcParams['axes.facecolor'] = '#F5F5F5' plt.xlabel('False Positive Rate (1-Specificity)') plt.ylabel('True Positive Rate (Sensitivity)') plt.plot(false_positive, true_positive, color = "#214579") plt.plot([0,1], [0,1], color = "#444444") plt.ylim(0,1) plt.xlim(0,1) plt.title("ROC Curve\nAUC = " + str(auc)) plt.gca().set_axisbelow(True) plt.grid() plt.show() except: pass return (tablesample(values = {"threshold": threshold, "false_positive": false_positive, "true_positive": true_positive}, table_info = False))
def features_importance(self): query = "SELECT predictor, ROUND(100 * importance / SUM(importance) OVER(), 2) AS importance, sign FROM " query += "(SELECT stat.predictor AS predictor, ABS(coefficient * (max - min)) AS importance, SIGN(coefficient) AS sign FROM " query += "(SELECT LOWER(\"column\") AS predictor, min, max FROM (SELECT SUMMARIZE_NUMCOL({}) OVER() ".format(", ".join(self.X)) query += " FROM {}) x) stat NATURAL JOIN (SELECT GET_MODEL_ATTRIBUTE (USING PARAMETERS model_name = '{}', ".format(self.input_relation, self.name) query += "attr_name = 'details')) coeff) importance_t ORDER BY 2 DESC;" self.cursor.execute(query) result = self.cursor.fetchall() coeff_importances, coeff_sign = {}, {} for elem in result: coeff_importances[elem[0]] = elem[1] coeff_sign[elem[0]] = elem[2] try: plot_importance(coeff_importances, coeff_sign) except: pass importances = {"index": ["importance"]} for elem in coeff_importances: importances[elem] = [coeff_importances[elem]] return (tablesample(values = importances, table_info = False).transpose())
def prc_curve(y_true: str, y_score: str, input_relation: str, cursor, pos_label = 1, nbins: int = 1000, auc_prc: bool = False): query = "SELECT PRC(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, {}::float AS prob FROM {}) AS prediction_output" query = query.format(nbins, y_true, pos_label, y_score, input_relation) cursor.execute(query) query_result = cursor.fetchall() threshold, recall, precision = [0] + [item[0] for item in query_result] + [1], [1] + [item[1] for item in query_result] + [0], [0] + [item[2] for item in query_result] + [1] auc=0 for i in range(len(recall) - 1): if (recall[i + 1] - recall[i] != 0.0): a = (precision[i + 1] - precision[i]) / (recall[i + 1] - recall[i]) b = precision[i + 1] - a * recall[i + 1] auc = auc + a * (recall[i + 1] * recall[i + 1] - recall[i] * recall[i]) / 2 + b * (recall[i + 1] - recall[i]); auc = - auc if (auc_prc): return (auc) try: import matplotlib.pyplot as plt plt.figure(figsize = (10,8)) plt.rcParams['axes.facecolor'] = '#F5F5F5' plt.xlabel('Recall') plt.ylabel('Precision') plt.plot(recall, precision, color = "#214579") plt.ylim(0,1) plt.xlim(0,1) plt.title("PRC Curve\nAUC = " + str(auc)) plt.gca().set_axisbelow(True) plt.grid() plt.show() except: pass return (tablesample(values = {"threshold": threshold, "recall": recall, "precision": precision}, table_info = False))
def cross_validate(estimator, input_relation: str, X: list, y: str, cv: int = 3, pos_label=None, cutoff: float = 0.5): if (estimator.type == "regressor"): result = { "index": [ "explained_variance", "max_error", "median_absolute_error", "mean_absolute_error", "mean_squared_error", "r2" ] } elif (estimator.type == "classifier"): result = { "index": [ "auc", "prc_auc", "accuracy", "log_loss", "precision", "recall", "f1-score", "mcc", "informedness", "markedness", "csi" ] } else: raise ValueError( "Cross Validation is only possible for Regressors and Classifiers") schema, relation = schema_relation(input_relation) schema = str_column(schema) relation_alpha = ''.join(ch for ch in relation if ch.isalnum()) test_name, train_name = "{}_{}".format(relation_alpha, int(1 / cv * 100)), "{}_{}".format( relation_alpha, int(100 - 1 / cv * 100)) estimator.cursor.execute( "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format( schema, relation_alpha)) query = "CREATE TEMPORARY TABLE {}.vpython_train_test_split_cv_{} ON COMMIT PRESERVE ROWS AS SELECT *, RANDOMINT({}) AS test FROM {}".format( schema, relation_alpha, cv, input_relation) estimator.cursor.execute(query) for i in range(cv): try: estimator.cursor.execute("DROP MODEL IF EXISTS {}".format( estimator.name)) except: pass estimator.cursor.execute( "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format( schema, test_name)) estimator.cursor.execute( "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format( schema, train_name)) query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test = {})".format( schema, test_name, "{}.vpython_train_test_split_cv_{}".format(schema, relation_alpha), i) estimator.cursor.execute(query) query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test != {})".format( schema, train_name, "{}.vpython_train_test_split_cv_{}".format(schema, relation_alpha), i) estimator.cursor.execute(query) estimator.fit( "{}.vpython_train_test_split_cv_{}".format(schema, train_name), X, y, "{}.vpython_train_test_split_cv_{}".format(schema, test_name)) if (estimator.type == "regressor"): result["{}-fold".format( i + 1)] = estimator.regression_report().values["value"] else: if (len(estimator.classes) > 2) and (pos_label not in estimator.classes): raise ValueError( "'pos_label' must be in the estimator classes, it must be the main class to study for the Cross Validation" ) try: result["{}-fold".format(i + 1)] = estimator.classification_report( labels=[pos_label], cutoff=cutoff).values["value"] except: result["{}-fold".format(i + 1)] = estimator.classification_report( cutoff=cutoff).values["value"] try: estimator.cursor.execute("DROP MODEL IF EXISTS {}".format( estimator.name)) except: pass n = 6 if (estimator.type == "regressor") else 11 total = [[] for item in range(n)] for i in range(cv): for k in range(n): total[k] += [result["{}-fold".format(i + 1)][k]] result["avg"] = [np.mean(item) for item in total] result["std"] = [np.std(item) for item in total] estimator.cursor.execute( "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format( schema, relation_alpha)) estimator.cursor.execute( "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format( schema, test_name)) estimator.cursor.execute( "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format( schema, train_name)) return (tablesample(values=result, table_info=False).transpose())