def _report_cutoff(cutoff, idx):
    """
    Returns the user supplied cutoff to apply to the label at position 'idx'.
    A scalar or a one-element list applies to every label; a longer list is
    indexed per label.
    """
    if isinstance(cutoff, Iterable):
        return cutoff[0] if len(cutoff) == 1 else cutoff[idx]
    return cutoff


def _report_confusion_cells(matrix, non_pos_label, pos_label):
    """
    Extracts (tn, fn, fp, tp) from a binary confusion matrix, trying the
    label keys first, then the integer keys 0/1 and finally the string
    keys "0"/"1". The last attempt is allowed to raise.
    """
    cells = matrix.values
    for neg_key, pos_key in ((non_pos_label, pos_label), (0, 1)):
        try:
            return (
                cells[neg_key][0],
                cells[neg_key][1],
                cells[pos_key][0],
                cells[pos_key][1],
            )
        except Exception:
            continue
    return (cells["0"][0], cells["0"][1], cells["1"][0], cells["1"][1])


def classification_report(
    y_true: str = "",
    y_score: list = [],
    input_relation: str = "",
    cursor=None,
    labels: list = [],
    cutoff=[],
    estimator=None,
):
    """
    ---------------------------------------------------------------------------
    Computes a classification report using multiple metrics (AUC, accuracy, PRC
    AUC, F1...). It will consider each category as positive and switch to the
    next one during the computation.

    Parameters
    ----------
    y_true: str, optional
        Response column.
    y_score: list, optional
        List containing the probability and the prediction.
    input_relation: str, optional
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor.
    labels: list, optional
        List of the response column categories to use.
    cutoff: float / list, optional
        Cutoff for which the tested category will be accepted as prediction.
        In case of multiclass classification, the list will represent the
        classes threshold. If it is empty, the best cutoff will be used.
    estimator: object, optional
        Estimator to use to compute the classification report.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    # NOTE: the list defaults are kept for interface compatibility; they are
    # only read or rebound here, never mutated in place.
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [list],),
            ("input_relation", input_relation, [str],),
            ("labels", labels, [list],),
            ("cutoff", cutoff, [int, float, list],),
        ]
    )
    if estimator:
        num_classes = len(estimator.classes_)
        labels = labels if (num_classes > 2) else [estimator.classes_[1]]
        conn = False
    else:
        if not cursor:
            conn = read_auto_connect()
            cursor = conn.cursor()
        else:
            conn = False
            check_cursor(cursor)
        labels = labels if labels else [1]
    values = {
        "index": [
            "auc",
            "prc_auc",
            "accuracy",
            "log_loss",
            "precision",
            "recall",
            "f1_score",
            "mcc",
            "informedness",
            "markedness",
            "csi",
            "cutoff",
        ]
    }
    try:
        for idx, elem in enumerate(labels):
            pos_label = elem
            non_pos_label = 0 if (elem == 1) else "Non-{}".format(elem)
            if estimator:
                if not cutoff:
                    current_cutoff = estimator.score(
                        method="best_cutoff", pos_label=pos_label
                    )
                else:
                    current_cutoff = _report_cutoff(cutoff, idx)
                # Some estimators do not accept a cutoff argument.
                try:
                    matrix = estimator.confusion_matrix(pos_label, current_cutoff)
                except Exception:
                    matrix = estimator.confusion_matrix(pos_label)
            else:
                y_s = y_score[0].format(elem)
                y_p = y_score[1]
                y_t = "DECODE({}, '{}', 1, 0)".format(y_true, elem)
                matrix = confusion_matrix(
                    y_true, y_p, input_relation, cursor, pos_label
                )
            tn, fn, fp, tp = _report_confusion_cells(matrix, non_pos_label, pos_label)
            ppv = tp / (tp + fp) if (tp + fp != 0) else 0  # precision
            tpr = tp / (tp + fn) if (tp + fn != 0) else 0  # recall
            tnr = tn / (tn + fp) if (tn + fp != 0) else 0  # specificity
            npv = tn / (tn + fn) if (tn + fn != 0) else 0
            # F1 is the harmonic mean of precision and recall (not of recall
            # and specificity).
            f1 = 2 * (ppv * tpr) / (ppv + tpr) if (ppv + tpr != 0) else 0
            csi = tp / (tp + fn + fp) if (tp + fn + fp != 0) else 0
            bm = tpr + tnr - 1  # informedness
            mk = ppv + npv - 1  # markedness
            if (tp + fp != 0) and (tp + fn != 0) and (tn + fp != 0) and (tn + fn != 0):
                mcc = (tp * tn - fp * fn) / math.sqrt(
                    (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
                )
            else:
                mcc = 0
            total = tp + tn + fp + fn
            # Same zero-denominator convention as the other metrics.
            accuracy = (tp + tn) / total if (total != 0) else 0
            if estimator:
                auc_score = estimator.score(pos_label=pos_label, method="auc")
                logloss = estimator.score(pos_label=pos_label, method="log_loss")
                prc_auc_score = estimator.score(pos_label=pos_label, method="prc_auc")
            else:
                auc_score = auc(y_t, y_s, input_relation, cursor, 1)
                prc_auc_score = prc_auc(y_t, y_s, input_relation, cursor, 1)
                y_p = "DECODE({}, '{}', 1, 0)".format(y_p, elem)
                logloss = log_loss(y_t, y_s, input_relation, cursor, 1)
                if not cutoff:
                    # NOTE(review): the best threshold is searched on the decoded
                    # prediction (y_p), not on the probability (y_s) - confirm
                    # that this is intended.
                    current_cutoff = roc_curve(
                        y_t, y_p, input_relation, cursor, best_threshold=True
                    )
                else:
                    current_cutoff = _report_cutoff(cutoff, idx)
            # A single label yields one column named "value".
            column = "value" if (len(labels) == 1) else elem
            values[column] = [
                auc_score,
                prc_auc_score,
                accuracy,
                logloss,
                ppv,
                tpr,
                f1,
                mcc,
                bm,
                mk,
                csi,
                current_cutoff,
            ]
    finally:
        # Only a connection opened by this function is closed, even on error.
        if conn:
            conn.close()
    return tablesample(values)
def regression_report(y_true: str, y_score: str, input_relation: str, cursor=None):
    """
    ---------------------------------------------------------------------------
    Computes a regression report using multiple metrics (r2, mse, max error...).

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    residual = "{} - {}".format(y_true, y_score)
    # One pass over the relation computes every metric except r2.
    query = (
        "SELECT 1 - VARIANCE({0}) / VARIANCE({1}), MAX(ABS({0})), "
        "APPROXIMATE_MEDIAN(ABS({0})), AVG(ABS({0})), "
        "AVG(POW({0}, 2)) FROM {2}"
    ).format(residual, y_true, input_relation)
    values = {
        "index": [
            "explained_variance",
            "max_error",
            "median_absolute_error",
            "mean_absolute_error",
            "mean_squared_error",
            "r2",
        ]
    }
    try:
        r2 = r2_score(y_true, y_score, input_relation, cursor)
        cursor.execute(query)
        values["value"] = list(cursor.fetchone()) + [r2]
    finally:
        # Release the auto-opened connection even if a query fails.
        if conn:
            conn.close()
    return tablesample(values)
def accuracy_score(y_true: str, y_score: str, input_relation: str, cursor=None, pos_label=1):
    """
    ---------------------------------------------------------------------------
    Computes the Accuracy Score.

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor.
    pos_label: int/float/str, optional
        Label to use to identify the positive class. If pos_label is None then
        the global accuracy will be computed.

    Returns
    -------
    float
        score
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        if pos_label is not None:
            matrix = confusion_matrix(y_true, y_score, input_relation, cursor, pos_label)
            # Same labelling rule as confusion_matrix: both 1 and "1" produce
            # the labels [0, 1].
            if pos_label in (1, "1"):
                neg_key, pos_key = 0, 1
            else:
                neg_key, pos_key = "Non-{}".format(pos_label), pos_label
            tn, fn = matrix.values[neg_key][0], matrix.values[neg_key][1]
            fp, tp = matrix.values[pos_key][0], matrix.values[pos_key][1]
            acc = (tp + tn) / (tp + tn + fn + fp)
        else:
            template = (
                "SELECT AVG(CASE WHEN {} = {} THEN 1 ELSE 0 END) AS accuracy "
                "FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL"
            )
            try:
                cursor.execute(
                    template.format(y_true, y_score, input_relation, y_true, y_score)
                )
            except Exception:
                # The direct comparison failed; retry after casting both
                # columns to varchar.
                cursor.execute(
                    template.format(
                        "{}::varchar".format(y_true),
                        "{}::varchar".format(y_score),
                        input_relation,
                        y_true,
                        y_score,
                    )
                )
            acc = cursor.fetchone()[0]
    finally:
        # Release the auto-opened connection even if a query fails.
        if conn:
            conn.close()
    return acc
def specificity_score(y_true: str, y_score: str, input_relation: str, cursor=None, pos_label=1):
    """
    ---------------------------------------------------------------------------
    Computes the Specificity Score.

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor.
    pos_label: int/float/str, optional
        To compute the Specificity Score, one of the response column class has
        to be the positive one. The parameter 'pos_label' represents this class.

    Returns
    -------
    float
        score (0 when there is no actual negative)
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        matrix = confusion_matrix(y_true, y_score, input_relation, cursor, pos_label)
    finally:
        # Release the auto-opened connection even if the query fails.
        if conn:
            conn.close()
    # Same labelling rule as confusion_matrix: both 1 and "1" produce the
    # labels [0, 1].
    if pos_label in (1, "1"):
        neg_key, pos_key = 0, 1
    else:
        neg_key, pos_key = "Non-{}".format(pos_label), pos_label
    tn = matrix.values[neg_key][0]
    fp = matrix.values[pos_key][0]
    return tn / (tn + fp) if (tn + fp != 0) else 0
def quantile_error(q: float, y_true: str, y_score: str, input_relation: str, cursor=None):
    """
    ---------------------------------------------------------------------------
    Computes the input Quantile of the Error.

    Parameters
    ----------
    q: float
        Input Quantile
    y_true: str
        Response column.
    y_score: str
        Prediction.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor.

    Returns
    -------
    float
        score
    """
    check_types(
        [
            ("q", q, [int, float],),
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    query = (
        "SELECT APPROXIMATE_PERCENTILE(ABS({} - {}) "
        "USING PARAMETERS percentile = {}) FROM {}"
    ).format(y_true, y_score, q, input_relation)
    try:
        cursor.execute(query)
        return cursor.fetchone()[0]
    finally:
        # Release the auto-opened connection even if the query fails.
        if conn:
            conn.close()
def lift_chart(
    y_true: str,
    y_score: str,
    input_relation: str,
    cursor=None,
    pos_label=1,
    nbins: int = 1000,
    ax=None,
):
    """
    ---------------------------------------------------------------------------
    Draws the Lift Chart.

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction Probability.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor.
    pos_label: int/float/str, optional
        To compute the Lift Chart, one of the response column class has to be
        the positive one. The parameter 'pos_label' represents this class.
    nbins: int, optional
        Curve number of bins.
    ax: Matplotlib axes object, optional
        The axes to plot on.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
            ("nbins", nbins, [int, float],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        query = (
            "SELECT LIFT_TABLE(obs, prob USING PARAMETERS num_bins = {}) OVER() "
            "FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, "
            "{}::float AS prob FROM {}) AS prediction_output"
        ).format(nbins, y_true, pos_label, y_score, input_relation)
        cursor.execute(query)
        rows = cursor.fetchall()
    finally:
        # Release the auto-opened connection even if the query fails.
        if conn:
            conn.close()
    decision_boundary = [row[0] for row in rows]
    positive_prediction_ratio = [row[1] for row in rows]
    lift = [row[2] for row in rows]
    # The first column is reversed before being used as the x axis.
    decision_boundary.reverse()
    if not ax:
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.set_xlabel("Cumulative Data Fraction")
    ax.plot(decision_boundary, lift, color="#FE5016")
    ax.plot(decision_boundary, positive_prediction_ratio, color="#444444")
    ax.set_title("Lift Table")
    ax.set_axisbelow(True)
    ax.grid()
    legend_handles = [
        mpatches.Patch(color="#FE5016", label="Cumulative Lift"),
        mpatches.Patch(color="#444444", label="Cumulative Capture Rate"),
    ]
    ax.legend(handles=legend_handles)
    return tablesample(
        values={
            "decision_boundary": decision_boundary,
            "positive_prediction_ratio": positive_prediction_ratio,
            "lift": lift,
        },
    )
def multilabel_confusion_matrix(y_true: str, y_score: str, input_relation: str, labels: list, cursor=None):
    """
    ---------------------------------------------------------------------------
    Computes the Multi Label Confusion Matrix.

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    labels: list
        List of the response column categories.
    cursor: DBcursor, optional
        Vertica DB cursor.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
            ("labels", labels, [list],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    # DECODE arguments mapping every label to its position in 'labels'.
    label_mapping = "".join(
        ", '{}', {}".format(item, idx) for idx, item in enumerate(labels)
    )
    query = (
        "SELECT CONFUSION_MATRIX(obs, response USING PARAMETERS num_classes = {}) "
        "OVER() FROM (SELECT DECODE({}{}) AS obs, DECODE({}{}) AS response "
        "FROM {}) VERTICAPY_SUBTABLE"
    ).format(
        len(labels), y_true, label_mapping, y_score, label_mapping, input_relation
    )
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        result = to_tablesample(query, cursor)
    finally:
        # Release the auto-opened connection even if the query fails.
        if conn:
            conn.close()
    del result.values["comment"]
    # Replace the numeric class codes by the labels: transpose, overwrite the
    # "actual_class" entry, transpose back.
    result = result.transpose()
    result.values["actual_class"] = labels
    result = result.transpose()
    matrix = {"index": labels}
    for key in result.values:
        if key != "actual_class":
            matrix[key] = result.values[key]
    result.values = matrix
    return result
def load_winequality(cursor=None, schema: str = "public", name: str = "winequality"):
    """
    ---------------------------------------------------------------------------
    Ingests the winequality dataset in the Vertica DB (Dataset ideal for
    Regression and Classification). If a table with the same name and schema
    already exists, this function will create a vDataFrame from the input
    relation.

    Parameters
    ----------
    cursor: DBcursor, optional
        Vertica DB cursor.
    schema: str, optional
        Schema of the new relation. The default schema is public.
    name: str, optional
        Name of the new relation.

    Returns
    -------
    vDataFrame
        the winequality vDataFrame.

    See Also
    --------
    load_amazon       : Ingests the amazon dataset in the Vertica DB.
        (Time Series / Regression).
    load_commodities  : Ingests the commodities dataset in the Vertica DB.
        (Time Series / Regression).
    load_iris         : Ingests the iris dataset in the Vertica DB.
        (Clustering / Classification).
    load_market       : Ingests the market dataset in the Vertica DB.
        (Basic Data Exploration).
    load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
        (Time Series / Regression).
    load_titanic      : Ingests the titanic dataset in the Vertica DB.
        (Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not cursor:
        # The connection stays open: the returned vDataFrame is built on this
        # cursor.
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    except Exception:
        # The relation could not be read: create and fill it. Only regular
        # errors trigger this, never KeyboardInterrupt / SystemExit.
        table = "{}.{}".format(str_column(schema), str_column(name))
        cursor.execute(
            'CREATE TABLE {}("fixed_acidity" Numeric(6,3), "volatile_acidity" Numeric(7,4), "citric_acid" Numeric(6,3), "residual_sugar" Numeric(7,3), "chlorides" Float, "free_sulfur_dioxide" Numeric(7,2), "total_sulfur_dioxide" Numeric(7,2), "density" Float, "pH" Numeric(6,3), "sulphates" Numeric(6,3), "alcohol" Float, "quality" Integer, "good" Integer, "color" Varchar(20));'.format(
                table
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/winequality.csv"
            copy_template = 'COPY {}("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", "pH", "sulphates", "alcohol", "quality", "good", "color") FROM {} DELIMITER \',\' NULL \'\' ENCLOSED BY \'"\' ESCAPE AS \'\\\' SKIP 1;'
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native client: stream the file through STDIN.
                with open(path, "r") as fs:
                    cursor.copy(copy_template.format(table, "STDIN"), fs)
            else:
                cursor.execute(
                    copy_template.format(table, "LOCAL '{}'".format(path))
                )
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except BaseException:
            # Never leave a half-loaded table behind; the error is re-raised.
            cursor.execute("DROP TABLE {}".format(table))
            raise
    return vdf
def load_amazon(cursor=None, schema: str = "public", name: str = "amazon"):
    """
    ---------------------------------------------------------------------------
    Ingests the amazon dataset in the Vertica DB (Dataset ideal for TS and
    Regression). If a table with the same name and schema already exists,
    this function will create a vDataFrame from the input relation.

    Parameters
    ----------
    cursor: DBcursor, optional
        Vertica DB cursor.
    schema: str, optional
        Schema of the new relation. The default schema is public.
    name: str, optional
        Name of the new relation.

    Returns
    -------
    vDataFrame
        the amazon vDataFrame.

    See Also
    --------
    load_commodities  : Ingests the commodities dataset in the Vertica DB.
        (Time Series / Regression).
    load_iris         : Ingests the iris dataset in the Vertica DB.
        (Clustering / Classification).
    load_market       : Ingests the market dataset in the Vertica DB.
        (Basic Data Exploration).
    load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
        (Time Series / Regression).
    load_titanic      : Ingests the titanic dataset in the Vertica DB.
        (Classification).
    load_winequality  : Ingests the winequality dataset in the Vertica DB.
        (Regression / Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not cursor:
        # The connection stays open: the returned vDataFrame is built on this
        # cursor.
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    except Exception:
        # The relation could not be read: create and fill it. Only regular
        # errors trigger this, never KeyboardInterrupt / SystemExit.
        table = "{}.{}".format(str_column(schema), str_column(name))
        cursor.execute(
            'CREATE TABLE {}("date" Date, "state" Varchar(32), "number" Integer);'.format(
                table
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/amazon.csv"
            copy_template = "COPY {}(\"date\", \"state\", \"number\") FROM {} DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native client: stream the file through STDIN.
                with open(path, "r") as fs:
                    cursor.copy(copy_template.format(table, "STDIN"), fs)
            else:
                cursor.execute(
                    copy_template.format(table, "LOCAL '{}'".format(path))
                )
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except BaseException:
            # Never leave a half-loaded table behind; the error is re-raised.
            cursor.execute("DROP TABLE {}".format(table))
            raise
    return vdf
def train_test_split(input_relation: str, cursor=None, test_size: float = 0.33, schema_writing: str = ""):
    """
    ---------------------------------------------------------------------------
    Creates a table and 2 views which can be used to evaluate a model. The
    table will include all the main relation information with a test column
    (boolean) which represents if the data belong to the test or train set.

    Parameters
    ----------
    input_relation: str
        Input Relation.
    cursor: DBcursor, optional
        Vertica DB cursor.
    test_size: float, optional
        Proportion of the test set compared to the training set.
    schema_writing: str, optional
        Schema to use to write the main relation.

    Returns
    -------
    tuple
        (name of the train view, name of the test view)
    """
    check_types(
        [
            ("test_size", test_size, [float],),
            ("schema_writing", schema_writing, [str],),
            ("input_relation", input_relation, [str],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        schema, relation = schema_relation(input_relation)
        schema = schema_writing if schema_writing else str_column(schema)
        relation_alpha = "".join(ch for ch in relation if ch.isalnum())
        # Object names embed the (truncated) test / train percentages. The
        # naming scheme is kept unchanged so existing objects are still found.
        test_name = "{}_{}".format(relation_alpha, int(test_size * 100))
        train_name = "{}_{}".format(relation_alpha, int(100 - test_size * 100))
        split_table = "{}.VERTICAPY_SPLIT_{}".format(schema, relation_alpha)
        test_view = "{}.VERTICAPY_SPLIT_{}_TEST".format(schema, test_name)
        train_view = "{}.VERTICAPY_SPLIT_{}_TRAIN".format(schema, train_name)
        try:
            cursor.execute("DROP TABLE IF EXISTS {}".format(split_table))
        except Exception:
            # Best effort: a failure here surfaces on CREATE TABLE below.
            pass
        cursor.execute("DROP VIEW IF EXISTS {}".format(test_view))
        cursor.execute("DROP VIEW IF EXISTS {}".format(train_view))
        cursor.execute(
            "CREATE TABLE {} AS SELECT *, (CASE WHEN RANDOM() < {} THEN True ELSE False END) AS test FROM {}".format(
                split_table, test_size, input_relation
            )
        )
        cursor.execute(
            "CREATE VIEW {} AS SELECT * FROM {} WHERE test".format(
                test_view, split_table
            )
        )
        cursor.execute(
            "CREATE VIEW {} AS SELECT * FROM {} WHERE NOT(test)".format(
                train_view, split_table
            )
        )
    finally:
        # Release the auto-opened connection even if a statement fails.
        if conn:
            conn.close()
    return (train_view, test_view)
def load_titanic(cursor=None, schema: str = "public", name: str = "titanic"):
    """
    ---------------------------------------------------------------------------
    Ingests the titanic dataset in the Vertica DB (Dataset ideal for
    Classification). If a table with the same name and schema already exists,
    this function will create a vDataFrame from the input relation.

    Parameters
    ----------
    cursor: DBcursor, optional
        Vertica DB cursor.
    schema: str, optional
        Schema of the new relation. The default schema is public.
    name: str, optional
        Name of the new relation.

    Returns
    -------
    vDataFrame
        the titanic vDataFrame.

    See Also
    --------
    load_amazon       : Ingests the amazon dataset in the Vertica DB.
        (Time Series / Regression).
    load_commodities  : Ingests the commodities dataset in the Vertica DB.
        (Time Series / Regression).
    load_iris         : Ingests the iris dataset in the Vertica DB.
        (Clustering / Classification).
    load_market       : Ingests the market dataset in the Vertica DB.
        (Basic Data Exploration).
    load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
        (Time Series / Regression).
    load_winequality  : Ingests the winequality dataset in the Vertica DB.
        (Regression / Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not cursor:
        # The connection stays open: the returned vDataFrame is built on this
        # cursor.
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    except Exception:
        # The relation could not be read: create and fill it. Only regular
        # errors trigger this, never KeyboardInterrupt / SystemExit.
        table = "{}.{}".format(str_column(schema), str_column(name))
        cursor.execute(
            'CREATE TABLE {}("pclass" Integer, "survived" Integer, "name" Varchar(164), "sex" Varchar(20), "age" Numeric(6,3), "sibsp" Integer, "parch" Integer, "ticket" Varchar(36), "fare" Numeric(10,5), "cabin" Varchar(30), "embarked" Varchar(20), "boat" Varchar(100), "body" Integer, "home.dest" Varchar(100));'.format(
                table
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/titanic.csv"
            copy_template = 'COPY {}("pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest") FROM {} DELIMITER \',\' NULL \'\' ENCLOSED BY \'"\' ESCAPE AS \'\\\' SKIP 1;'
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native client: stream the file through STDIN.
                with open(path, "r") as fs:
                    cursor.copy(copy_template.format(table, "STDIN"), fs)
            else:
                cursor.execute(
                    copy_template.format(table, "LOCAL '{}'".format(path))
                )
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except BaseException:
            # Never leave a half-loaded table behind; the error is re-raised.
            cursor.execute("DROP TABLE {}".format(table))
            raise
    return vdf
def roc_curve(
    y_true: str,
    y_score: str,
    input_relation: str,
    cursor=None,
    pos_label=1,
    nbins: int = 1000,
    auc_roc: bool = False,
    best_threshold: bool = False,
    ax=None,
):
    """
    ---------------------------------------------------------------------------
    Draws the ROC Curve.

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction Probability.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor.
    pos_label: int/float/str, optional
        To compute the ROC Curve, one of the response column class has to be
        the positive one. The parameter 'pos_label' represents this class.
    nbins: int, optional
        Curve number of bins.
    auc_roc: bool, optional
        If set to True, the function will return the ROC AUC without drawing
        the curve.
    best_threshold: bool, optional
        If set to True, the function will return the best threshold without
        drawing the curve. The best threshold is the threshold of the point
        which is the farthest from the random line. 'auc_roc' takes precedence
        when both flags are set.
    ax: Matplotlib axes object, optional
        The axes to plot on.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample. A float is returned instead when 'auc_roc' or
        'best_threshold' is set.
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
            ("nbins", nbins, [int, float],),
            ("auc_roc", auc_roc, [bool],),
            ("best_threshold", best_threshold, [bool],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        query = (
            "SELECT ROC(obs, prob USING PARAMETERS num_bins = {}) OVER() "
            "FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, "
            "{}::float AS prob FROM {}) AS prediction_output"
        ).format(nbins, y_true, pos_label, y_score, input_relation)
        cursor.execute(query)
        rows = cursor.fetchall()
    finally:
        # Release the auto-opened connection even if the query fails.
        if conn:
            conn.close()
    threshold = [row[0] for row in rows]
    false_positive = [row[1] for row in rows]
    true_positive = [row[2] for row in rows]
    # Integrate the piecewise-linear curve segment by segment. The local is
    # named 'area' so it does not shadow the module-level 'auc' function.
    area = 0
    segments = zip(
        false_positive, false_positive[1:], true_positive, true_positive[1:]
    )
    for fp0, fp1, tp0, tp1 in segments:
        if fp1 - fp0 != 0.0:
            slope = (tp1 - tp0) / (fp1 - fp0)
            intercept = tp1 - slope * fp1
            area = (
                area
                + slope * (fp1 * fp1 - fp0 * fp0) / 2
                + intercept * (fp1 - fp0)
            )
    # The sign of the accumulated sum is flipped and the result capped at 1.
    area = min(-area, 1.0)
    if auc_roc:
        return area
    if best_threshold:
        distances = [abs(tp - fp) for fp, tp in zip(false_positive, true_positive)]
        # Largest distance to the diagonal; ties resolve to the highest index.
        best_idx = max((dist, idx) for idx, dist in enumerate(distances))[1]
        # Keep the threshold inside [0.001, 0.999].
        return min(max(threshold[best_idx], 0.001), 0.999)
    if not ax:
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_xlabel("False Positive Rate (1-Specificity)")
    ax.set_ylabel("True Positive Rate (Sensitivity)")
    ax.plot(false_positive, true_positive, color="#FE5016")
    ax.plot([0, 1], [0, 1], color="#444444")
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    ax.set_title("ROC Curve\nAUC = " + str(area))
    ax.set_axisbelow(True)
    ax.grid()
    return tablesample(
        values={
            "threshold": threshold,
            "false_positive": false_positive,
            "true_positive": true_positive,
        },
    )
def best_k(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 100),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    elbow_score_stop: float = 0.8,
):
    """
    ---------------------------------------------------------------------------
    Finds the KMeans K based on a score.

    Parameters
    ----------
    X: list
        List of the predictor columns.
    input_relation: str
        Relation to use to train the model.
    cursor: DBcursor, optional
        Vertica DB cursor.
    n_cluster: tuple/list, optional
        Tuple representing the number of cluster to start with and to end with
        (the end is excluded). It can also be a customized list with the
        different K to test.
    init: str/list, optional
        The method to use to find the initial cluster centers.
            kmeanspp : Use the KMeans++ method to initialize the centers.
            random   : The initial centers
        It can be also a list with the initial cluster centers to use.
    max_iter: int, optional
        The maximum number of iterations the algorithm performs.
    tol: float, optional
        Determines whether the algorithm has converged. The algorithm is
        considered converged after no center has moved more than a distance
        of 'tol' from the previous iteration.
    elbow_score_stop: float, optional
        Stops the Parameters Search when this Elbow score is reached.

    Returns
    -------
    int
        the KMeans K

    Raises
    ------
    ValueError
        If 'n_cluster' does not describe any K to test.
    """
    check_types(
        [
            ("X", X, [list],),
            ("input_relation", input_relation, [str],),
            # The default value is a tuple, so tuples must be accepted too.
            ("n_cluster", n_cluster, [list, tuple],),
            ("init", init, ["kmeanspp", "random"],),
            ("max_iter", max_iter, [int, float],),
            ("tol", tol, [int, float],),
            ("elbow_score_stop", elbow_score_stop, [int, float],),
        ]
    )
    from verticapy.learn.cluster import KMeans

    if isinstance(n_cluster, tuple):
        # (start, end) bounds.
        candidates = range(n_cluster[0], n_cluster[1])
    else:
        # Explicit list of K: test in increasing order without mutating the
        # caller's list.
        candidates = sorted(n_cluster)
    if len(candidates) == 0:
        raise ValueError(
            "The parameter 'n_cluster' does not contain any K to test."
        )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        schema, relation = schema_relation(input_relation)
        schema = str_column(schema)
        relation_alpha = "".join(ch for ch in relation if ch.isalnum())
        model_name = "{}.__vpython_kmeans_tmp_model_{}__".format(
            schema, relation_alpha
        )
        for i in candidates:
            # The same temporary model name is reused for every K.
            cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
            model = KMeans(model_name, cursor, i, init, max_iter, tol,)
            model.fit(input_relation, X)
            # The fourth model metric is used as the elbow score.
            score = model.metrics.values["value"][3]
            if score > elbow_score_stop:
                return i
    finally:
        # Also runs on the early return above, so the auto-opened connection
        # is never leaked.
        if conn:
            conn.close()
    print(
        "\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
def prc_curve(
    y_true: str,
    y_score: str,
    input_relation: str,
    cursor=None,
    pos_label=1,
    nbins: int = 1000,
    auc_prc: bool = False,
    ax=None,
):
    """
    ---------------------------------------------------------------------------
    Draws the PRC Curve.

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction Probability.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor.
    pos_label: int/float/str, optional
        To compute the PRC Curve, one of the response column class has to be
        the positive one. The parameter 'pos_label' represents this class.
    nbins: int, optional
        Curve number of bins.
    auc_prc: bool, optional
        If set to True, the function will return the PRC AUC without drawing
        the curve.
    ax: Matplotlib axes object, optional
        The axes to plot on.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample. A float is returned instead when 'auc_prc' is
        set.
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
            ("nbins", nbins, [int, float],),
            ("auc_prc", auc_prc, [bool],),
        ]
    )
    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        version(cursor=cursor, condition=[9, 1, 0])
        query = (
            "SELECT PRC(obs, prob USING PARAMETERS num_bins = {}) OVER() "
            "FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, "
            "{}::float AS prob FROM {}) AS prediction_output"
        ).format(nbins, y_true, pos_label, y_score, input_relation)
        cursor.execute(query)
        rows = cursor.fetchall()
    finally:
        # Release the auto-opened connection even if the query fails.
        if conn:
            conn.close()
    # The curve is padded with the end points (recall=1, precision=0) and
    # (recall=0, precision=1).
    threshold = [0] + [row[0] for row in rows] + [1]
    recall = [1] + [row[1] for row in rows] + [0]
    precision = [0] + [row[2] for row in rows] + [1]
    # Integrate the piecewise-linear curve segment by segment. The local is
    # named 'area' so it does not shadow the module-level 'auc' function.
    area = 0
    for r0, r1, p0, p1 in zip(recall, recall[1:], precision, precision[1:]):
        if r1 - r0 != 0.0:
            slope = (p1 - p0) / (r1 - r0)
            intercept = p1 - slope * r1
            area = (
                area
                + slope * (r1 * r1 - r0 * r0) / 2
                + intercept * (r1 - r0)
            )
    # The sign of the accumulated sum is flipped to obtain the AUC.
    area = -area
    if auc_prc:
        return area
    if not ax:
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.plot(recall, precision, color="#FE5016")
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    ax.set_title("PRC Curve\nAUC = " + str(area))
    ax.set_axisbelow(True)
    ax.grid()
    return tablesample(
        values={"threshold": threshold, "recall": recall, "precision": precision},
    )
def confusion_matrix(
    y_true: str, y_score: str, input_relation: str, cursor=None, pos_label=1
):
    """
    ---------------------------------------------------------------------------
    Computes the Confusion Matrix.

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor. When omitted, a connection is opened with
        read_auto_connect() and closed again before the function returns or
        raises.
    pos_label: int/float/str, optional
        To compute the one dimension Confusion Matrix, one of the response column
        class has to be the positive one. The parameter 'pos_label' represents
        this class.

    Returns
    -------
    tablesample
        An object containing the result. Its "index" column holds [0, 1] when
        'pos_label' is 1 or "1", and ["Non-<pos_label>", pos_label] otherwise.
        For more information, see utilities.tablesample.
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
        ]
    )
    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        # Both columns are binarised against pos_label (1 = positive, 0 = any
        # other value, NULL stays NULL).
        # NOTE: pos_label is interpolated verbatim between single quotes, so a
        # label that itself contains a single quote breaks the statement.
        query = (
            "SELECT CONFUSION_MATRIX(obs, response USING PARAMETERS num_classes = 2) OVER() "
            "FROM (SELECT DECODE({}, '{}', 1, NULL, NULL, 0) AS obs, "
            "DECODE({}, '{}', 1, NULL, NULL, 0) AS response FROM {}) VERTICAPY_SUBTABLE"
        ).format(y_true, pos_label, y_score, pos_label, input_relation)
        result = to_tablesample(query, cursor)
    finally:
        # Close the connection we opened ourselves even when the query fails;
        # a caller-supplied cursor is never closed here.
        if conn:
            conn.close()
    if pos_label in [1, "1"]:
        labels = [0, 1]
    else:
        labels = ["Non-{}".format(pos_label), pos_label]
    del result.values["comment"]
    # Overwrite the 'actual_class' entry with the readable labels; the double
    # transpose is how the original code reaches that entry.
    result = result.transpose()
    result.values["actual_class"] = labels
    result = result.transpose()
    # Rebuild the value dictionary with the labels as index, leaving out the
    # now redundant 'actual_class' entry.
    matrix = {"index": labels}
    matrix.update(
        {
            column: result.values[column]
            for column in result.values
            if column != "actual_class"
        }
    )
    result.values = matrix
    return result
def log_loss(
    y_true: str, y_score: str, input_relation: str, cursor=None, pos_label=1
):
    """
    ---------------------------------------------------------------------------
    Computes the Log Loss.

    Parameters
    ----------
    y_true: str
        Response column.
    y_score: str
        Prediction Probability.
    input_relation: str
        Relation to use to do the scoring. The relation can be a view or a table
        or even a customized relation. For example, you could write:
        "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
        relation.
    cursor: DBcursor, optional
        Vertica DB cursor. When omitted, a connection is opened with
        read_auto_connect() and closed again before the function returns or
        raises.
    pos_label: int/float/str, optional
        To compute the log loss, one of the response column class has to be the
        positive one. The parameter 'pos_label' represents this class.

    Returns
    -------
    float
        score
    """
    check_types(
        [
            ("y_true", y_true, [str],),
            ("y_score", y_score, [str],),
            ("input_relation", input_relation, [str],),
        ]
    )
    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    # The 1e-90 offset keeps LOG away from 0 when a probability is exactly
    # 0 or 1.
    # NOTE: pos_label is interpolated verbatim between single quotes, so a
    # label that itself contains a single quote breaks the statement.
    query = (
        "SELECT AVG(CASE WHEN {} = '{}' THEN - LOG({}::float + 1e-90) "
        "else - LOG(1 - {}::float + 1e-90) END) FROM {};"
    ).format(y_true, pos_label, y_score, y_score, input_relation)
    try:
        cursor.execute(query)
        result = cursor.fetchone()[0]
    finally:
        # Close the connection we opened ourselves even when the query fails;
        # a caller-supplied cursor is never closed here.
        if conn:
            conn.close()
    return result
def _sql_magic_flag(value):
    """Return the boolean meaning of an option value such as 'True' or '0'."""
    # bool("False") is True, so the text itself has to be inspected. Anything
    # outside this deny-list still evaluates to True, as it always did.
    return str(value).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")


def _sql_magic_options(line):
    """Parse the '-name value' pairs of the option line into display options."""
    import re

    options = {"limit": 100, "columns": 100, "percent_bar": False, "vdf": False}
    tokens = [token for token in re.sub(" +", " ", line).split(" ") if token != ""]
    if len(tokens) % 2:
        # A trailing option without a value used to raise an IndexError.
        print(
            "\u26A0 Warning : The option '{}' has no value, it was skipped.".format(
                tokens[-1]
            )
        )
        tokens = tokens[:-1]
    # Same semantics as filling a dict pair by pair: a repeated option keeps
    # its first position and its last value.
    pairs = dict(zip(tokens[0::2], tokens[1::2]))
    for option, value in pairs.items():
        name = option.lower()
        if name == "-limit":
            options["limit"] = int(value)
        elif name == "-ncols":
            options["columns"] = int(value)
        elif name == "-percent":
            options["percent_bar"] = _sql_magic_flag(value)
        elif name == "-vdf":
            options["vdf"] = _sql_magic_flag(value)
        else:
            print(
                "\u26A0 Warning : The option '{}' doesn't exist, it was skipped."
                .format(option)
            )
    return options


def _sql_magic_statements(text):
    """Split a SQL script on the ';' found outside quotes; drop empty statements."""
    import re

    text = text.replace("\t", " ").replace("\n", " ")
    text = re.sub(" +", " ", text)
    text = text.rstrip("; \n")
    size = len(text)
    cuts, pos = [], 0
    while pos < size:
        char = text[pos]
        if char in ('"', "'"):
            # Skip to the matching closing quote so that a ';' inside an
            # identifier or a literal never splits the statement.
            pos += 1
            while pos < size and text[pos] != char:
                pos += 1
        elif char == ";":
            cuts.append(pos)
        pos += 1
    bounds = [0] + cuts + [size]
    statements = [
        text[start:stop].strip("; ") for start, stop in zip(bounds, bounds[1:])
    ]
    # A lone 'end' belongs to the statement before it (the ';' only closed
    # the inner block), so glue the two back together.
    merged, idx, count = [], 0, len(statements)
    while idx < count:
        statement = statements[idx]
        if (idx < count - 1) and (statements[idx + 1].lower() == "end"):
            statement += "; {}".format(statements[idx + 1])
            idx += 1
        merged.append(statement)
        idx += 1
    # Empty statements (empty input, ';;') used to crash with an IndexError
    # when their type was looked up; they are skipped instead.
    return [statement for statement in merged if statement]


def sql(line, cell=""):
    """
    ---------------------------------------------------------------------------
    Executes one or several SQL statements through an auto-connection and
    displays the result of the last one when it is a SELECT.

    The (line, cell) signature matches an IPython line/cell magic; the
    registration itself is not part of this function (NOTE(review): confirm
    where it is registered).

    Parameters
    ----------
    line: str
        When 'cell' is empty: the SQL to run. When 'cell' is given: a list of
        '-option value' pairs. Supported options are -limit (rows to display),
        -ncols (columns to display), -percent (display the percent bar) and
        -vdf (return a vDataFrame built with vdf_from_relation instead of the
        readSQL result). Boolean values such as 'False', '0', 'no' or 'off'
        are read as False. Unknown options and an option without value are
        reported and skipped.
    cell: str, optional
        The SQL to run. Statements are separated by ';'.

    Returns
    -------
    object
        The result of the last statement when it is a SELECT (readSQL output or
        a vDataFrame when -vdf is set), None otherwise.

    Raises
    ------
    QueryError
        When the last SELECT can neither be displayed nor executed.
    """
    from verticapy.connections.connect import read_auto_connect
    from verticapy.utilities import readSQL
    from verticapy.utilities import vdf_from_relation
    from IPython.core.display import HTML, display
    import time
    import re
    import vertica_python
    from verticapy.errors import QueryError

    client_version = [int(elem) for elem in vertica_python.__version__.split(".")]
    conn = read_auto_connect()
    keep_connection = False
    try:
        cursor = conn.cursor()
        script = line if (not (cell) and (line)) else cell
        options = _sql_magic_options(line if (cell) and (line) else "")
        queries = _sql_magic_statements(script)
        n = len(queries)
        start_time = time.time()
        result = None
        for i, query in enumerate(queries):
            query_type = query.split(" ")[0].upper()
            if (
                (query_type == "COPY")
                and ("from local" in query.lower())
                and (client_version[0] == 0)
                and (client_version[1] < 11)
            ):
                # vertica_python clients before 0.11 take this path: the local
                # file is streamed through STDIN with cursor.copy.
                parts = re.split("from local", query, flags=re.IGNORECASE)
                words = parts[1].split(" ")
                file_name = words[0] if words[0] else words[1]
                query = parts[0] + "FROM" + parts[1].replace(file_name, "STDIN")
                if (file_name[0] == file_name[-1]) and (file_name[0] in ('"', "'")):
                    file_name = file_name[1:-1]
                with open(file_name, "r") as fs:
                    cursor.copy(query, fs)
            elif (i < n - 1) or (query_type.lower() != "select"):
                cursor.execute(query)
                print(query_type)
            else:
                # Last statement and a SELECT: try to display it, and fall
                # back to a plain execution when the display helpers fail.
                error = ""
                try:
                    if options["vdf"]:
                        result = vdf_from_relation(
                            "({}) x".format(query), cursor=cursor
                        )
                        result.set_display_parameters(
                            rows=options["limit"],
                            columns=options["columns"],
                            percent_bar=options["percent_bar"],
                        )
                    else:
                        result = readSQL(
                            query,
                            cursor=cursor,
                            limit=options["limit"],
                            display_ncols=options["columns"],
                            percent_bar=options["percent_bar"],
                        )
                except Exception:
                    # 'Exception' rather than a bare except: an interrupt must
                    # not trigger a second execution of the query.
                    try:
                        cursor.execute(query)
                        final_result = cursor.fetchone()
                        if final_result:
                            print(final_result[0])
                        else:
                            print(query_type)
                    except Exception as e:
                        error = e
                # Raised outside the handlers, as before, so the QueryError
                # carries no chained exception context.
                if error:
                    raise QueryError(error)
        # With -vdf the connection stays open after a successful run, as in
        # the original, because the returned object was built on this cursor.
        keep_connection = options["vdf"]
    finally:
        # Also reached when a statement raises, so the connection no longer
        # leaks on errors.
        if not (keep_connection):
            conn.close()
    elapsed_time = time.time() - start_time
    display(
        HTML("<div><b>Execution: </b> {}s</div>".format(round(elapsed_time, 3)))
    )
    return result
def elbow(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 15),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
):
    """
    ---------------------------------------------------------------------------
    Draws an Elbow Curve.

    Parameters
    ----------
    X: list
        List of the predictor columns.
    input_relation: str
        Relation to use to train the model.
    cursor: DBcursor, optional
        Vertica DB cursor. When omitted, a connection is opened with
        read_auto_connect() and closed again once the models are evaluated,
        even if one of them fails.
    n_cluster: tuple/list, optional
        Tuple representing the number of cluster to start with and to end with
        (the end value itself is excluded, as with range). It can also be
        customized list with the different K to test; the list is evaluated in
        ascending order and is left untouched.
    init: str/list, optional
        The method to use to find the initial cluster centers.
            kmeanspp : Use the KMeans++ method to initialize the centers.
            random   : The initial centers
        It can be also a list with the initial cluster centers to use.
    max_iter: int, optional
        The maximum number of iterations the algorithm performs.
    tol: float, optional
        Determines whether the algorithm has converged. The algorithm is
        considered converged after no center has moved more than a distance of
        'tol' from the previous iteration.
    ax: Matplotlib axes object, optional
        The axes to plot on.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    check_types(
        [
            ("X", X, [list],),
            ("input_relation", input_relation, [str],),
            # The default value is a tuple, so tuple has to be accepted too.
            ("n_cluster", n_cluster, [list, tuple],),
            ("init", init, ["kmeanspp", "random"],),
            ("max_iter", max_iter, [int, float],),
            ("tol", tol, [int, float],),
        ]
    )
    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        schema, relation = schema_relation(input_relation)
        schema = str_column(schema)
        relation_alpha = "".join(ch for ch in relation if ch.isalnum())
        model_name = "{}.VERTICAPY_KMEANS_TMP_{}".format(schema, relation_alpha)
        if isinstance(n_cluster, tuple):
            L = list(range(n_cluster[0], n_cluster[1]))
        else:
            # sorted() works on a copy: the caller's list is not reordered.
            L = sorted(n_cluster)
        # Imported at call time, as in the original, but once instead of on
        # every loop iteration.
        from verticapy.learn.cluster import KMeans

        all_within_cluster_SS = []
        for i in L:
            cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
            model = KMeans(model_name, cursor, i, init, max_iter, tol)
            model.fit(input_relation, X)
            # NOTE(review): the metric at position 3 is plotted under the axis
            # label "Between-Cluster SS / Total SS" but returned under the key
            # "Within-Cluster SS" - confirm which name is the right one.
            all_within_cluster_SS += [float(model.metrics_.values["value"][3])]
            model.drop()
    finally:
        # Close the connection we opened ourselves even when a fit fails;
        # a caller-supplied cursor is never closed here.
        if conn:
            conn.close()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.grid()
    ax.plot(L, all_within_cluster_SS, marker="s", color="#FE5016")
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values=values)