def fit(self, input_relation: str, X: list): self.input_relation = input_relation self.X = [str_column(column) for column in X] query = "SELECT PCA('{}', '{}', '{}' USING PARAMETERS scale = {}, method = '{}'" query = query.format(self.name, input_relation, ", ".join(self.X), self.scale, self.method) if (self.n_components): query += ", num_components = {}".format(self.n_components) query += ")" self.cursor.execute(query) self.components = to_tablesample( query= "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'principal_components')" .format(self.name), cursor=self.cursor) self.components.table_info = False self.explained_variance = to_tablesample( query= "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'singular_values')" .format(self.name), cursor=self.cursor) self.explained_variance.table_info = False self.mean = to_tablesample( query= "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'columns')" .format(self.name), cursor=self.cursor) self.mean.table_info = False return (self)
def fit(self, input_relation: str, X: list): self.input_relation = input_relation self.X = [str_column(column) for column in X] query = "SELECT ONE_HOT_ENCODER_FIT('{}', '{}', '{}' USING PARAMETERS extra_levels = '{}')".format(self.name, input_relation, ", ".join(self.X), self.extra_levels) self.cursor.execute(query) try: self.param = to_tablesample(query = "SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) x UNION ALL SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(self.name, self.name), cursor = self.cursor) except: try: self.param = to_tablesample(query = "SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) x".format(self.name), cursor = self.cursor) except: self.param = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(self.name), cursor = self.cursor) self.param.table_info = False return (self)
def multilabel_confusion_matrix(y_true: str, y_score: str, input_relation: str, cursor, labels: list): num_classes = str(len(labels)) query = "SELECT CONFUSION_MATRIX(obs, response USING PARAMETERS num_classes = {}) OVER() FROM (SELECT DECODE({}".format(num_classes, y_true) for idx, item in enumerate(labels): query += ", '" + str(item) + "', " + str(idx) query += ") AS obs, DECODE({}".format(y_score) for idx,item in enumerate(labels): query += ", '" + str(item) + "', " + str(idx) query += ") AS response FROM {}) x".format(input_relation) result = to_tablesample(query, cursor) result.table_info = False del (result.values["comment"]) result = result.transpose() result.values["actual_class"] = labels result = result.transpose() matrix = {"index": labels} for elem in result.values: if (elem != "actual_class"): matrix[elem] = result.values[elem] result.values = matrix return (result)
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""): self.input_relation = input_relation self.test_relation = test_relation if ( test_relation) else input_relation self.X = [str_column(column) for column in X] self.y = str_column(y) query = "SELECT SVM_REGRESSOR('{}', '{}', '{}', '{}' USING PARAMETERS C = {}, epsilon = {}, max_iterations = {}" query = query.format(self.name, input_relation, self.y, ", ".join(self.X), self.C, self.tol, self.max_iter) query += ", error_tolerance = {}".format(self.acceptable_error_margin) if (self.fit_intercept): query += ", intercept_mode = '{}', intercept_scaling = {}".format( self.intercept_mode, self.intercept_scaling) query += ")" self.cursor.execute(query) self.coef = to_tablesample( query= "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'details')" .format(self.name), cursor=self.cursor) self.coef.table_info = False return (self)
def fit(self, input_relation: str, X: list): self.input_relation = input_relation self.X = [str_column(column) for column in X] query = "SELECT NORMALIZE_FIT('{}', '{}', '{}', '{}')".format(self.name, input_relation, ", ".join(self.X), self.method) self.cursor.execute(query) self.param = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'details')".format(self.name), cursor = self.cursor) self.param.table_info = False return (self)
def fit(self, input_relation: str, X: list): self.input_relation = input_relation self.X = [str_column(column) for column in X] query = "SELECT KMEANS('{}', '{}', '{}', {} USING PARAMETERS max_iterations = {}, epsilon = {}".format(self.name, input_relation, ", ".join(self.X), self.n_cluster, self.max_iter, self.tol) name = "_vpython_kmeans_initial_centers_table_" schema, relation = schema_relation(input_relation) schema = str_column(schema) if (type(self.init) != str): self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name)) if (len(self.init) != self.n_cluster): raise ValueError("'init' must be a list of 'n_cluster' = {} points".format(self.n_cluster)) else: for item in self.init: if (len(X) != len(item)): raise ValueError("Each points of 'init' must be of size len(X) = {}".format(len(self.X))) temp_initial_centers = [item for item in self.init] for item in temp_initial_centers: del temp_initial_centers[0] if (item in temp_initial_centers): raise ValueError("All the points of 'init' must be different") query0 = [] for i in range(len(self.init)): line = [] for j in range(len(self.init[0])): line += [str(self.init[i][j]) + " AS " + X[j]] line = ",".join(line) query0 += ["SELECT " + line] query0 = " UNION ".join(query0) query0 = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS {}".format(schema, name, query0) self.cursor.execute(query0) query += ", initial_centers_table = '" + name + "'" else: query += ", init_method = '" + self.init + "'" query += ")" self.cursor.execute(query) self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name)) self.cluster_centers = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(self.name), cursor = self.cursor) self.cluster_centers.table_info = False query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'metrics')".format(self.name) self.cursor.execute(query) result = self.cursor.fetchone()[0] values = {"index": ["Between-Cluster Sum of Squares", "Total Sum of Squares", "Total Within-Cluster Sum of Squares", "Between-Cluster SS / Total SS", "converged"]} values["value"] = [float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]), float(result.split("Total Sum of Squares: ")[1].split("\n")[0]), float(result.split("Total Within-Cluster Sum of Squares: ")[1].split("\n")[0]), float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]) / float(result.split("Total Sum of Squares: ")[1].split("\n")[0]), result.split("Converged: ")[1].split("\n")[0] == "True"] self.metrics = tablesample(values, table_info = False) return (self)
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""): self.input_relation = input_relation self.test_relation = test_relation if (test_relation) else input_relation self.X = [str_column(column) for column in X] self.y = str_column(y) query = "SELECT LOGISTIC_REG('{}', '{}', '{}', '{}' USING PARAMETERS optimizer = '{}', epsilon = {}, max_iterations = {}" query = query.format(self.name, input_relation, self.y, ", ".join(self.X), self.solver, self.tol, self.max_iter) query += ", regularization = '{}', lambda = {}".format(self.penalty, self.C) if (self.penalty == 'ENet'): query += ", alpha = {}".format(self.l1_ratio) query += ")" self.cursor.execute(query) self.coef = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'details')".format(self.name), cursor = self.cursor) self.coef.table_info = False return (self)
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""): func = "APPROXIMATE_MEDIAN" if (self.p == 1) else "AVG" self.input_relation = input_relation self.test_relation = test_relation if ( test_relation) else input_relation self.X = [str_column(column) for column in X] self.y = str_column(y) query = "SELECT {}, {} FROM {} WHERE {} IS NOT NULL GROUP BY {}".format( ", ".join([ "{}({}) AS {}".format(func, column, column) for column in self.X ]), self.y, input_relation, self.y, self.y) self.centroids = to_tablesample(query=query, cursor=self.cursor) self.centroids.table_info = False self.classes = self.centroids.values[y] return (self)
def confusion_matrix(y_true: str, y_score: str, input_relation: str, cursor, pos_label = 1): query = "SELECT CONFUSION_MATRIX(obs, response USING PARAMETERS num_classes = 2) OVER() FROM (SELECT DECODE({}".format(y_true) query += ", '{}', 1, NULL, NULL, 0) AS obs, DECODE({}, '{}', 1, NULL, NULL, 0) AS response FROM {}) x".format(pos_label, y_score, pos_label, input_relation) result = to_tablesample(query, cursor) if (pos_label == 1): labels = [0, 1] else: labels = ["Non-{}".format(pos_label), pos_label] result.table_info = False del (result.values["comment"]) result = result.transpose() result.values["actual_class"] = labels result = result.transpose() matrix = {"index": labels} for elem in result.values: if (elem != "actual_class"): matrix[elem] = result.values[elem] result.values = matrix return (result)
def get_tree(self, tree_id: int = 0): query = "SELECT READ_TREE ( USING PARAMETERS model_name = '{}', tree_id = {}, format = 'tabular');".format( self.name, tree_id) result = to_tablesample(query=query, cursor=self.cursor) result.table_info = False return (result)