示例#1
0
def lift_chart(y_true: str, 
			   y_score: str, 
			   input_relation: str,
			   cursor,
			   pos_label = 1, 
			   nbins: int = 1000):
	query = "SELECT LIFT_TABLE(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, {}::float AS prob FROM {}) AS prediction_output"
	query = query.format(nbins, y_true, pos_label, y_score, input_relation)
	cursor.execute(query)
	query_result = cursor.fetchall()
	decision_boundary, positive_prediction_ratio, lift = [item[0] for item in query_result], [item[1] for item in query_result], [item[2] for item in query_result]
	decision_boundary.reverse()
	try:
		import matplotlib.pyplot as plt
		import matplotlib.patches as mpatches
		plt.figure(figsize = (10,8))
		plt.rcParams['axes.facecolor'] = '#F5F5F5'
		plt.xlabel('Cumulative Data Fraction')
		plt.plot(decision_boundary, lift, color = "#214579")
		plt.plot(decision_boundary, positive_prediction_ratio, color = "#444444")
		plt.title("Lift Table")
		plt.gca().set_axisbelow(True)
		plt.grid()
		color1 = mpatches.Patch(color = "#214579", label = 'Cumulative Lift')
		color2 = mpatches.Patch(color = "#444444", label = 'Cumulative Capture Rate')
		plt.legend(handles = [color1, color2])
		plt.show()
	except:
		pass
	return (tablesample(values = {"decision_boundary": decision_boundary, "positive_prediction_ratio": positive_prediction_ratio, "lift": lift}, table_info = False))
示例#2
0
 def confusion_matrix(self, pos_label=None, cutoff: float = 0.5):
     pos_label = self.classes[1] if (
         pos_label == None and len(self.classes) == 2) else pos_label
     if (pos_label in self.classes and cutoff < 1 and cutoff > 0):
         input_relation = self.deploySQL(
         ) + " WHERE predict_knc = '{}'".format(pos_label)
         y_score = "(CASE WHEN proba_predict > {} THEN 1 ELSE 0 END)".format(
             cutoff)
         y_true = "DECODE({}, '{}', 1, 0)".format(self.y, pos_label)
         result = confusion_matrix(y_true, y_score, input_relation,
                                   self.cursor)
         if pos_label == 1:
             return (result)
         else:
             return (tablesample(values={
                 "index":
                 ["Non-{}".format(pos_label), "{}".format(pos_label)],
                 "Non-{}".format(pos_label):
                 result.values[0],
                 "{}".format(pos_label):
                 result.values[1]
             },
                                 table_info=False))
     else:
         input_relation = "(SELECT *, ROW_NUMBER() OVER(PARTITION BY {}, row_id ORDER BY proba_predict DESC) AS pos FROM {}) knc_table_predict WHERE pos = 1".format(
             ", ".join(self.X), self.deploySQL())
         return (multilabel_confusion_matrix(self.y, "predict_knc",
                                             input_relation, self.cursor,
                                             self.classes))
示例#3
0
def elbow(X: list,
		  input_relation: str,
		  cursor,
		  n_cluster = (1, 15),
		  init = "kmeanspp",
		  max_iter: int = 50,
		  tol: float = 1e-4):
	import matplotlib.pyplot as plt
	from vertica_ml_python.learn.cluster import KMeans
	schema, relation = schema_relation(input_relation)
	schema = str_column(schema)
	relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
	all_within_cluster_SS = []
	L = [i for i in range(n_cluster[0], n_cluster[1])] if not(type(n_cluster) == list) else n_cluster
	for i in L:
		cursor.execute("DROP MODEL IF EXISTS {}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha))
		model = KMeans("{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha), cursor, i, init, max_iter, tol)
		model.fit(input_relation, X)
		all_within_cluster_SS += [float(model.metrics.values["value"][3])]
		model.drop()
	plt.figure(figsize = (10,8))
	plt.rcParams['axes.facecolor'] = '#F4F4F4'
	plt.grid()
	plt.plot(L, all_within_cluster_SS, marker = "s", color = "#214579")
	plt.title("Elbow Curve")
	plt.xlabel('Number of Clusters')
	plt.ylabel('Between-Cluster SS / Total SS')
	plt.subplots_adjust(left = 0.2)
	plt.show()
	values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
	return tablesample(values = values, table_info = False)
示例#4
0
 def confusion_matrix(self, pos_label=None, cutoff: float = 0.5):
     pos_label = self.classes[1] if (
         pos_label == None and len(self.classes) == 2) else pos_label
     if (pos_label in self.classes and cutoff < 1 and cutoff > 0):
         input_relation = self.deploySQL(
         ) + " WHERE predict_nc = '{}'".format(pos_label)
         y_score = "(CASE WHEN proba_predict > {} THEN 1 ELSE 0 END)".format(
             cutoff)
         y_true = "DECODE({}, '{}', 1, 0)".format(self.y, pos_label)
         result = confusion_matrix(y_true, y_score, input_relation,
                                   self.cursor)
         if pos_label == 1:
             return (result)
         else:
             return (tablesample(values={
                 "index":
                 ["Non-{}".format(pos_label), "{}".format(pos_label)],
                 "Non-{}".format(pos_label):
                 result.values[0],
                 "{}".format(pos_label):
                 result.values[1]
             },
                                 table_info=False))
     else:
         return (multilabel_confusion_matrix(self.y, "predict_nc",
                                             self.deploySQL(predict=True),
                                             self.cursor, self.classes))
示例#5
0
def regression_report(y_true: str, 
			 		  y_score: str, 
			 		  input_relation: str,
			 		  cursor):
	query  = "SELECT 1 - VARIANCE({} - {}) / VARIANCE({}), MAX(ABS({} - {})), ".format(y_true, y_score, y_true, y_true, y_score)
	query += "APPROXIMATE_MEDIAN(ABS({} - {})), AVG(ABS({} - {})), ".format(y_true, y_score, y_true, y_score)
	query += "AVG(POW({} - {}, 2)) FROM {}".format(y_true, y_score, input_relation)
	r2 = r2_score(y_true, y_score, input_relation, cursor)
	values = {"index": ["explained_variance", "max_error", "median_absolute_error", "mean_absolute_error", "mean_squared_error", "r2"]}
	cursor.execute(query)
	values["value"] = [item for item in cursor.fetchone()] + [r2]
	return (tablesample(values, table_info = False))
示例#6
0
def classification_report(y_true: str = "", 
						  y_score: str = "", 
						  input_relation: str = "",
						  cursor = None,
						  labels: list = [],
						  cutoff: float = 0.5,
						  estimator = None):
	if (estimator):
		num_classes = len(estimator.classes) 
		labels = labels if (num_classes > 2) else [estimator.classes[1]]
	else:
		labels = [1] if not (labels) else labels
		num_classes = len(labels) + 1
	values = {"index": ["auc", "prc_auc", "accuracy", "log_loss", "precision", "recall", "f1-score", "mcc", "informedness", "markedness", "csi"]}
	for idx, elem in enumerate(labels):
		pos_label = elem
		non_pos_label = 0 if (elem == 1) else "Non-{}".format(elem)
		if (estimator):
			try:
				matrix = estimator.confusion_matrix(pos_label, cutoff) 
			except:
				matrix = estimator.confusion_matrix(pos_label) 
		else:
			y_s, y_p, y_t = y_score[0].format(elem), y_score[1], "DECODE({}, '{}', 1, 0)".format(y_true, elem)
			matrix = confusion_matrix(y_true, y_p, input_relation, cursor, pos_label)
		tn, fp, fn, tp = matrix.values[non_pos_label][0], matrix.values[non_pos_label][1], matrix.values[pos_label][0], matrix.values[pos_label][1]
		ppv = tp / (tp + fp) if (tp + fp != 0) else 0 # precision
		tpr = tp / (tp + fn) if (tp + fn != 0) else 0 # recall
		tnr = tn / (tn + fp) if (tn + fp != 0) else 0 
		npv = tn / (tn + fn) if (tn + fn != 0) else 0
		f1 = 2 * (tpr * tnr) / (tpr + tnr) if (tpr + tnr != 0) else 0 # f1
		csi = tp / (tp + fn + fp) if (tp + fn + fp != 0) else 0 # csi
		bm = tpr + tnr - 1 # informedness
		mk = ppv + npv - 1 # markedness
		mcc = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) if (tp + fp != 0) and (tp + fn != 0) and (tn + fp != 0) and (tn + fn != 0) else 0
		if (estimator):
			try:
				accuracy = estimator.score(pos_label = pos_label, method = "acc", cutoff = cutoff)
			except:
				accuracy = estimator.score(pos_label = pos_label, method = "acc")
			auc_score, logloss, prc_auc_score = estimator.score(pos_label = pos_label, method = "auc"), estimator.score(pos_label = pos_label, method = "log_loss"), estimator.score(pos_label = pos_label, method = "prc_auc")
		else:
			auc_score = auc(y_t, y_s, input_relation, cursor, 1)
			prc_auc_score = prc_auc(y_t, y_s, input_relation, cursor, 1)
			y_p = "DECODE({}, '{}', 1, 0)".format(y_p, elem)
			logloss = log_loss(y_t, y_s, input_relation, cursor, 1)
			accuracy = accuracy_score(y_t, y_p, input_relation, cursor)
		elem = "value" if (len(labels) == 1) else elem
		values[elem] = [auc_score, prc_auc_score, accuracy, logloss, ppv, tpr, f1, mcc, bm, mk, csi]
	return (tablesample(values, table_info = False))
示例#7
0
 def features_importance(self):
     query = "SELECT predictor_name AS predictor, ROUND(100 * importance_value / SUM(importance_value) OVER (), 2) AS importance, SIGN(importance_value) AS sign FROM (SELECT RF_PREDICTOR_IMPORTANCE ( USING PARAMETERS model_name = '{}')) x ORDER BY 2 DESC;".format(
         self.name)
     self.cursor.execute(query)
     result = self.cursor.fetchall()
     coeff_importances, coeff_sign = {}, {}
     for elem in result:
         coeff_importances[elem[0]] = elem[1]
         coeff_sign[elem[0]] = elem[2]
     try:
         plot_importance(coeff_importances, coeff_sign, print_legend=False)
     except:
         pass
     importances = {"index": ["importance"]}
     for elem in coeff_importances:
         importances[elem] = [coeff_importances[elem]]
     return (tablesample(values=importances, table_info=False).transpose())
示例#8
0
	def fit(self, input_relation: str, X: list):
		self.input_relation = input_relation
		self.X = [str_column(column) for column in X]
		query = "SELECT KMEANS('{}', '{}', '{}', {} USING PARAMETERS max_iterations = {}, epsilon = {}".format(self.name, input_relation, ", ".join(self.X), self.n_cluster, self.max_iter, self.tol)
		name = "_vpython_kmeans_initial_centers_table_" 
		schema, relation = schema_relation(input_relation)
		schema = str_column(schema)
		if (type(self.init) != str):
			self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
			if (len(self.init) != self.n_cluster):
				raise ValueError("'init' must be a list of 'n_cluster' = {} points".format(self.n_cluster))
			else:
				for item in self.init:
					if (len(X) != len(item)):
						raise ValueError("Each points of 'init' must be of size len(X) = {}".format(len(self.X)))
				temp_initial_centers = [item for item in self.init]
				for item in temp_initial_centers:
					del temp_initial_centers[0]
					if (item in temp_initial_centers):
						raise ValueError("All the points of 'init' must be different")
				query0 = []
				for i in range(len(self.init)):
					line = []
					for j in range(len(self.init[0])):
						line += [str(self.init[i][j]) + " AS " + X[j]]
					line = ",".join(line)
					query0 += ["SELECT " + line]
				query0 = " UNION ".join(query0)
				query0 = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS {}".format(schema, name, query0)
				self.cursor.execute(query0)
				query += ", initial_centers_table = '" + name + "'"
		else:
			query += ", init_method = '" + self.init + "'"
		query += ")"
		self.cursor.execute(query)
		self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
		self.cluster_centers = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(self.name), cursor = self.cursor)
		self.cluster_centers.table_info = False
		query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'metrics')".format(self.name)
		self.cursor.execute(query)
		result = self.cursor.fetchone()[0]
		values = {"index": ["Between-Cluster Sum of Squares", "Total Sum of Squares", "Total Within-Cluster Sum of Squares", "Between-Cluster SS / Total SS", "converged"]}
		values["value"] = [float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]), float(result.split("Total Sum of Squares: ")[1].split("\n")[0]), float(result.split("Total Within-Cluster Sum of Squares: ")[1].split("\n")[0]), float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]) / float(result.split("Total Sum of Squares: ")[1].split("\n")[0]), result.split("Converged: ")[1].split("\n")[0] == "True"] 
		self.metrics = tablesample(values, table_info = False)
		return (self)
示例#9
0
def roc_curve(y_true: str, 
			  y_score: str, 
			  input_relation: str,
			  cursor,
			  pos_label = 1, 
			  nbins: int = 1000,
			  auc_roc: bool = False,
			  best_threshold: bool = False):
	query = "SELECT ROC(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, {}::float AS prob FROM {}) AS prediction_output"
	query = query.format(nbins, y_true, pos_label, y_score, input_relation)
	cursor.execute(query)
	query_result = cursor.fetchall()
	threshold, false_positive, true_positive = [item[0] for item in query_result], [item[1] for item in query_result], [item[2] for item in query_result]
	auc=0
	for i in range(len(false_positive) - 1):
		if (false_positive[i + 1] - false_positive[i] != 0.0):
			a = (true_positive[i + 1] - true_positive[i]) / (false_positive[i + 1] - false_positive[i])
			b = true_positive[i + 1] - a * false_positive[i + 1]
			auc = auc + a * (false_positive[i + 1] * false_positive[i + 1] - false_positive[i] * false_positive[i]) / 2 + b * (false_positive[i + 1] - false_positive[i]);
	auc = - auc
	auc = min(auc, 1.0)
	if (auc_roc):
		return (auc)
	if (best_threshold):
		best_threshold_arg = np.argmax([abs(y - x) for x, y in zip(false_positive, true_positive)])
		return (threshold[best_threshold_arg])
	try:
		import matplotlib.pyplot as plt
		plt.figure(figsize = (10,8))
		plt.rcParams['axes.facecolor'] = '#F5F5F5'
		plt.xlabel('False Positive Rate (1-Specificity)')
		plt.ylabel('True Positive Rate (Sensitivity)')
		plt.plot(false_positive, true_positive, color = "#214579")
		plt.plot([0,1], [0,1], color = "#444444")
		plt.ylim(0,1)
		plt.xlim(0,1)
		plt.title("ROC Curve\nAUC = " + str(auc))
		plt.gca().set_axisbelow(True)
		plt.grid()
		plt.show()
	except:
		pass
	return (tablesample(values = {"threshold": threshold, "false_positive": false_positive, "true_positive": true_positive}, table_info = False))
示例#10
0
	def features_importance(self):
		query  = "SELECT predictor, ROUND(100 * importance / SUM(importance) OVER(), 2) AS importance, sign FROM "
		query += "(SELECT stat.predictor AS predictor, ABS(coefficient * (max - min)) AS importance, SIGN(coefficient) AS sign FROM "
		query += "(SELECT LOWER(\"column\") AS predictor, min, max FROM (SELECT SUMMARIZE_NUMCOL({}) OVER() ".format(", ".join(self.X))
		query += " FROM {}) x) stat NATURAL JOIN (SELECT GET_MODEL_ATTRIBUTE (USING PARAMETERS model_name = '{}', ".format(self.input_relation, self.name)
		query += "attr_name = 'details')) coeff) importance_t ORDER BY 2 DESC;"
		self.cursor.execute(query)
		result = self.cursor.fetchall()
		coeff_importances, coeff_sign = {}, {}
		for elem in result:
			coeff_importances[elem[0]] = elem[1]
			coeff_sign[elem[0]] = elem[2]
		try:
			plot_importance(coeff_importances, coeff_sign)
		except:
			pass
		importances = {"index": ["importance"]}
		for elem in coeff_importances:
			importances[elem] = [coeff_importances[elem]]
		return (tablesample(values = importances, table_info = False).transpose())
示例#11
0
def prc_curve(y_true: str, 
			  y_score: str, 
			  input_relation: str,
			  cursor,
			  pos_label = 1, 
			  nbins: int = 1000,
			  auc_prc: bool = False):
	query = "SELECT PRC(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, {}::float AS prob FROM {}) AS prediction_output"
	query = query.format(nbins, y_true, pos_label, y_score, input_relation)
	cursor.execute(query)
	query_result = cursor.fetchall()
	threshold, recall, precision = [0] + [item[0] for item in query_result] + [1], [1] + [item[1] for item in query_result] + [0], [0] +  [item[2] for item in query_result] + [1]
	auc=0
	for i in range(len(recall) - 1):
		if (recall[i + 1] - recall[i] != 0.0):
			a = (precision[i + 1] - precision[i]) / (recall[i + 1] - recall[i])
			b = precision[i + 1] - a * recall[i + 1]
			auc = auc + a * (recall[i + 1] * recall[i + 1] - recall[i] * recall[i]) / 2 + b * (recall[i + 1] - recall[i]);
	auc = - auc
	if (auc_prc):
		return (auc)
	try:
		import matplotlib.pyplot as plt
		plt.figure(figsize = (10,8))
		plt.rcParams['axes.facecolor'] = '#F5F5F5'
		plt.xlabel('Recall')
		plt.ylabel('Precision')
		plt.plot(recall, precision, color = "#214579")
		plt.ylim(0,1)
		plt.xlim(0,1)
		plt.title("PRC Curve\nAUC = " + str(auc))
		plt.gca().set_axisbelow(True)
		plt.grid()
		plt.show()
	except:
		pass
	return (tablesample(values = {"threshold": threshold, "recall": recall, "precision": precision}, table_info = False))
def cross_validate(estimator,
                   input_relation: str,
                   X: list,
                   y: str,
                   cv: int = 3,
                   pos_label=None,
                   cutoff: float = 0.5):
    if (estimator.type == "regressor"):
        result = {
            "index": [
                "explained_variance", "max_error", "median_absolute_error",
                "mean_absolute_error", "mean_squared_error", "r2"
            ]
        }
    elif (estimator.type == "classifier"):
        result = {
            "index": [
                "auc", "prc_auc", "accuracy", "log_loss", "precision",
                "recall", "f1-score", "mcc", "informedness", "markedness",
                "csi"
            ]
        }
    else:
        raise ValueError(
            "Cross Validation is only possible for Regressors and Classifiers")
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    test_name, train_name = "{}_{}".format(relation_alpha,
                                           int(1 / cv * 100)), "{}_{}".format(
                                               relation_alpha,
                                               int(100 - 1 / cv * 100))
    estimator.cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, relation_alpha))
    query = "CREATE TEMPORARY TABLE {}.vpython_train_test_split_cv_{} ON COMMIT PRESERVE ROWS AS SELECT *, RANDOMINT({}) AS test FROM {}".format(
        schema, relation_alpha, cv, input_relation)
    estimator.cursor.execute(query)
    for i in range(cv):
        try:
            estimator.cursor.execute("DROP MODEL IF EXISTS {}".format(
                estimator.name))
        except:
            pass
        estimator.cursor.execute(
            "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
                schema, test_name))
        estimator.cursor.execute(
            "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
                schema, train_name))
        query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test = {})".format(
            schema, test_name,
            "{}.vpython_train_test_split_cv_{}".format(schema,
                                                       relation_alpha), i)
        estimator.cursor.execute(query)
        query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test != {})".format(
            schema, train_name,
            "{}.vpython_train_test_split_cv_{}".format(schema,
                                                       relation_alpha), i)
        estimator.cursor.execute(query)
        estimator.fit(
            "{}.vpython_train_test_split_cv_{}".format(schema, train_name), X,
            y, "{}.vpython_train_test_split_cv_{}".format(schema, test_name))
        if (estimator.type == "regressor"):
            result["{}-fold".format(
                i + 1)] = estimator.regression_report().values["value"]
        else:
            if (len(estimator.classes) > 2) and (pos_label
                                                 not in estimator.classes):
                raise ValueError(
                    "'pos_label' must be in the estimator classes, it must be the main class to study for the Cross Validation"
                )
            try:
                result["{}-fold".format(i +
                                        1)] = estimator.classification_report(
                                            labels=[pos_label],
                                            cutoff=cutoff).values["value"]
            except:
                result["{}-fold".format(i +
                                        1)] = estimator.classification_report(
                                            cutoff=cutoff).values["value"]
        try:
            estimator.cursor.execute("DROP MODEL IF EXISTS {}".format(
                estimator.name))
        except:
            pass
    n = 6 if (estimator.type == "regressor") else 11
    total = [[] for item in range(n)]
    for i in range(cv):
        for k in range(n):
            total[k] += [result["{}-fold".format(i + 1)][k]]
    result["avg"] = [np.mean(item) for item in total]
    result["std"] = [np.std(item) for item in total]
    estimator.cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, relation_alpha))
    estimator.cursor.execute(
        "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, test_name))
    estimator.cursor.execute(
        "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, train_name))
    return (tablesample(values=result, table_info=False).transpose())