def elbow(X: list, input_relation: str, cursor, n_cluster = (1, 15), init = "kmeanspp", max_iter: int = 50, tol: float = 1e-4):
    """Draw an Elbow curve: fit one KMeans model per candidate number of
    clusters and plot the elbow score against the cluster count.

    Parameters
    ----------
    X: list
        List of the predictor column names.
    input_relation: str
        Relation (table/view) used to train the models.
    cursor
        DB-API cursor used to run the queries.
    n_cluster: tuple or list, default (1, 15)
        If a tuple, the (inclusive, exclusive) range of cluster counts to try;
        if a list, the exact cluster counts to try.
    init: str or list, default "kmeanspp"
        KMeans initialization method (or explicit initial centers).
    max_iter: int, default 50
        Maximum number of KMeans iterations.
    tol: float, default 1e-4
        KMeans convergence tolerance.

    Returns
    -------
    tablesample
        One row per tried cluster count with the corresponding score.
    """
    import matplotlib.pyplot as plt
    from vertica_ml_python.learn.cluster import KMeans
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    all_within_cluster_SS = []
    # Fix: use isinstance instead of comparing type objects.
    L = n_cluster if isinstance(n_cluster, list) else [i for i in range(n_cluster[0], n_cluster[1])]
    for i in L:
        cursor.execute("DROP MODEL IF EXISTS {}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha))
        model = KMeans("{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha), cursor, i, init, max_iter, tol)
        model.fit(input_relation, X)
        # NOTE(review): metrics value[3] is "Between-Cluster SS / Total SS"
        # (matching the y-axis label below), not the within-cluster SS the
        # variable/key name suggests; the key is kept for compatibility.
        all_within_cluster_SS += [float(model.metrics.values["value"][3])]
        model.drop()
    plt.figure(figsize = (10, 8))
    plt.rcParams['axes.facecolor'] = '#F4F4F4'
    plt.grid()
    plt.plot(L, all_within_cluster_SS, marker = "s", color = "#214579")
    plt.title("Elbow Curve")
    plt.xlabel('Number of Clusters')
    plt.ylabel('Between-Cluster SS / Total SS')
    plt.subplots_adjust(left = 0.2)
    plt.show()
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values = values, table_info = False)
def best_k(X: list, input_relation: str, cursor, n_cluster=(1, 100), init="kmeanspp", max_iter: int = 50, tol: float = 1e-4, elbow_score_stop=0.8):
    """Return the smallest number of clusters whose elbow score
    (Between-Cluster SS / Total SS) exceeds ``elbow_score_stop``.

    Parameters
    ----------
    X: list
        List of the predictor column names.
    input_relation: str
        Relation (table/view) used to train the models.
    cursor
        DB-API cursor used to run the queries.
    n_cluster: tuple or list, default (1, 100)
        If a tuple, the (inclusive, exclusive) range of cluster counts to try;
        if a list, the exact cluster counts to try.
    init: str or list, default "kmeanspp"
        KMeans initialization method (or explicit initial centers).
    max_iter: int, default 50
        Maximum number of KMeans iterations.
    tol: float, default 1e-4
        KMeans convergence tolerance.
    elbow_score_stop: float, default 0.8
        Search stops at the first k whose elbow score is strictly greater.

    Returns
    -------
    int
        The chosen number of clusters (the last tried k when none reached
        the threshold).
    """
    from vertica_ml_python.learn.cluster import KMeans
    # Fix: use isinstance instead of comparing type objects.
    L = n_cluster if isinstance(n_cluster, list) else range(n_cluster[0], n_cluster[1])
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    # NOTE(review): if L is empty, the final print/return would raise a
    # NameError on `i` — callers are expected to pass a non-empty range.
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}._vpython_kmeans_tmp_model_{}".format(
                schema, relation_alpha))
        model = KMeans(
            "{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha),
            cursor, i, init, max_iter, tol)
        model.fit(input_relation, X)
        score = model.metrics.values["value"][3]
        if (score > elbow_score_stop):
            return i
        # Fix: removed unused `score_prev` assignment.
    print(
        "/!\\ The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
def train_test_split(input_relation: str, cursor, test_size: float = 0.33):
    """Randomly split a relation into a train view and a test view.

    A base table is created with a boolean ``test`` column (RANDOM() <
    test_size), then two views are built on top of it: one keeping the
    test rows and one keeping the train rows.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) to split.
    cursor
        DB-API cursor used to run the queries.
    test_size: float, default 0.33
        Fraction of the rows expected to land in the test view.

    Returns
    -------
    tuple
        (train view name, test view name), both schema-qualified.
    """
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    clean_relation = ''.join(c for c in relation if c.isalnum())
    test_name = "{}_{}".format(clean_relation, int(test_size * 100))
    train_name = "{}_{}".format(clean_relation, int(100 - test_size * 100))
    base_table = "{}.vpython_train_test_split_{}".format(schema, clean_relation)
    # Drop leftovers from a previous run (base table first, then both views).
    cursor.execute("DROP TABLE IF EXISTS {}.vpython_train_test_split_{}".format(
        schema, clean_relation))
    for suffix in (test_name, train_name):
        cursor.execute("DROP VIEW IF EXISTS {}.vpython_train_test_split_{}".format(
            schema, suffix))
    # Materialize the random split flag once so both views agree.
    cursor.execute(
        "CREATE TABLE {}.vpython_train_test_split_{} AS SELECT *, (CASE WHEN RANDOM() < {} THEN True ELSE False END) AS test FROM {}".format(
            schema, clean_relation, test_size, input_relation))
    cursor.execute(
        "CREATE VIEW {}.vpython_train_test_split_{} AS SELECT * FROM {} WHERE test".format(
            schema, test_name, base_table))
    cursor.execute(
        "CREATE VIEW {}.vpython_train_test_split_{} AS SELECT * FROM {} WHERE NOT(test)".format(
            schema, train_name, base_table))
    return ("{}.vpython_train_test_split_{}".format(schema, train_name),
            "{}.vpython_train_test_split_{}".format(schema, test_name))
def fit(self, input_relation: str, X: list):
    """Train the KMeans model in-database.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) used to train the model.
    X: list
        List of the predictor column names.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.init`` is a list whose length differs from
        ``self.n_cluster``, whose points have the wrong dimension, or
        which contains duplicate points.
    """
    self.input_relation = input_relation
    self.X = [str_column(column) for column in X]
    query = "SELECT KMEANS('{}', '{}', '{}', {} USING PARAMETERS max_iterations = {}, epsilon = {}".format(
        self.name, input_relation, ", ".join(self.X), self.n_cluster, self.max_iter, self.tol)
    name = "_vpython_kmeans_initial_centers_table_"
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    if not isinstance(self.init, str):
        # Explicit initial centers: validate them, then stage them in a
        # temporary table the KMEANS function can read.
        self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
        if len(self.init) != self.n_cluster:
            raise ValueError("'init' must be a list of 'n_cluster' = {} points".format(self.n_cluster))
        for item in self.init:
            if len(X) != len(item):
                raise ValueError("Each points of 'init' must be of size len(X) = {}".format(len(self.X)))
        # BUG FIX: the original duplicate check deleted from the list while
        # iterating it, which both skipped elements and compared each point
        # against the wrong remainder (falsely rejecting distinct centers).
        # A straightforward pairwise comparison is correct.
        for a in range(len(self.init)):
            for b in range(a + 1, len(self.init)):
                if self.init[a] == self.init[b]:
                    raise ValueError("All the points of 'init' must be different")
        selects = []
        for i in range(len(self.init)):
            line = []
            for j in range(len(self.init[0])):
                line += [str(self.init[i][j]) + " AS " + X[j]]
            selects += ["SELECT " + ",".join(line)]
        query0 = " UNION ".join(selects)
        query0 = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS {}".format(schema, name, query0)
        self.cursor.execute(query0)
        query += ", initial_centers_table = '" + name + "'"
    else:
        query += ", init_method = '" + self.init + "'"
    query += ")"
    self.cursor.execute(query)
    # Harmless when no centers table was created (IF EXISTS).
    self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
    self.cluster_centers = to_tablesample(
        query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(self.name),
        cursor = self.cursor)
    self.cluster_centers.table_info = False
    query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'metrics')".format(self.name)
    self.cursor.execute(query)
    result = self.cursor.fetchone()[0]

    def _metric(label):
        # The metrics attribute is a text blob of "Label: value" lines.
        return result.split(label + ": ")[1].split("\n")[0]

    between_ss = float(_metric("Between-Cluster Sum of Squares"))
    total_ss = float(_metric("Total Sum of Squares"))
    within_ss = float(_metric("Total Within-Cluster Sum of Squares"))
    values = {"index": ["Between-Cluster Sum of Squares", "Total Sum of Squares", "Total Within-Cluster Sum of Squares", "Between-Cluster SS / Total SS", "converged"]}
    values["value"] = [between_ss, total_ss, within_ss, between_ss / total_ss, _metric("Converged") == "True"]
    self.metrics = tablesample(values, table_info = False)
    return (self)
def fit(self, input_relation: str, X: list):
    """Build the text index backing this vectorizer and learn its
    vocabulary and stop words from ``input_relation``.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) containing the text columns.
    X: list
        List of the text column names to concatenate and index.

    Returns
    -------
    self
    """
    self.input_relation = input_relation
    self.X = [str_column(elem) for elem in X]
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    # Recreate the staging table holding one concatenated text row per input row.
    self.cursor.execute("DROP TABLE IF EXISTS {}.{}_countvectorizer_vpython CASCADE".format(schema, relation_alpha))
    create_sql = "CREATE TABLE {}.{}_countvectorizer_vpython(id identity(2000) primary key, text varchar({})) ORDER BY id SEGMENTED BY HASH(id) ALL NODES KSAFE;"
    self.cursor.execute(create_sql.format(schema, relation_alpha, self.max_text_size))
    # Build the text expression: concatenation, optional lowercasing,
    # optional removal of non-alphanumeric characters.
    concatenated = " || ".join(self.X)
    if self.lowercase:
        text = "LOWER({})".format(concatenated)
    else:
        text = concatenated
    if (self.ignore_special):
        text = "REGEXP_REPLACE({}, '[^a-zA-Z0-9\\s]+', '')".format(text)
    self.cursor.execute(
        "INSERT INTO {}.{}_countvectorizer_vpython(text) SELECT {} FROM {}".format(
            schema, relation_alpha, text, input_relation))
    self.cursor.execute(
        "CREATE TEXT INDEX {} ON {}.{}_countvectorizer_vpython(id, text) stemmer NONE;".format(
            self.name, schema, relation_alpha))
    # Stop words: tokens whose document frequency lies outside [min_df, max_df],
    # plus (optionally) tokens ranked beyond max_features.
    stop_words = "SELECT token FROM (SELECT token, cnt / SUM(cnt) OVER () AS df, rnk FROM (SELECT token, COUNT(*) AS cnt, RANK() OVER (ORDER BY COUNT(*) DESC) AS rnk FROM {} GROUP BY 1) x) y WHERE not(df BETWEEN {} AND {})".format(self.name, self.min_df, self.max_df)
    if (self.max_features > 0):
        stop_words += " OR (rnk > {})".format(self.max_features)
    self.cursor.execute(stop_words)
    self.stop_words = [row[0] for row in self.cursor.fetchall()]
    self.cursor.execute(self.deploySQL())
    self.vocabulary = [row[0] for row in self.cursor.fetchall()]
    return (self)
def fit(self, input_relation: str, X: list, key_columns: list = [], index=""):
    """Compute Local Outlier Factor scores in-database and store them in
    the result table ``self.name``.

    The computation is staged through temporary tables: pairwise
    distances with k-NN ranks, local reachability density (lrd), then
    the LOF score itself. All temporary tables are dropped at the end.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) used to train the model.
    X: list
        List of the predictor column names.
    key_columns: list, optional
        Columns carried through (not used as predictors).
        NOTE(review): mutable default is safe here — the list is only read.
    index: str, optional
        Name of an existing unique row identifier column; when empty, a
        temporary table with a generated ``id`` is created.

    Returns
    -------
    self
    """
    X = [str_column(column) for column in X]
    self.X = X
    self.key_columns = [str_column(column) for column in key_columns]
    self.input_relation = input_relation
    cursor = self.cursor
    n_neighbors = self.n_neighbors
    p = self.p
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    # FIX: relation_alpha was computed twice (once from input_relation, once
    # from relation, shadowing the first). Compute it once from the relation
    # name, consistent with the other fit() implementations in this file.
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    if not (index):
        index = "id"
        main_table = "main_{}_vpython".format(relation_alpha)
        cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, main_table))
        sql = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
            schema, main_table, ", ".join(X + key_columns), input_relation,
            " AND ".join(["{} IS NOT NULL".format(item) for item in X]))
        cursor.execute(sql)
    else:
        main_table = input_relation
    # Minkowski distance of order p between rows x and y.
    sql = [
        "POWER(ABS(x.{} - y.{}), {})".format(X[i], X[i], p)
        for i in range(len(X))
    ]
    distance = "POWER({}, 1 / {})".format(" + ".join(sql), p)
    # All pairwise distances with a per-node neighbor rank (knn).
    sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance, ROW_NUMBER() OVER(PARTITION BY x.{} ORDER BY {}) AS knn FROM {}.{} AS x CROSS JOIN {}.{} AS y".format(
        index, index, distance, index, distance, schema, main_table, schema, main_table)
    # Keep n_neighbors + 1 rows per node (the closest "neighbor" is the node itself).
    sql = "SELECT node_id, nn_id, distance, knn FROM ({}) distance_table WHERE knn <= {}".format(
        sql, n_neighbors + 1)
    cursor.execute("DROP TABLE IF EXISTS {}.distance_{}_vpython".format(
        schema, relation_alpha))
    sql = "CREATE TEMPORARY TABLE {}.distance_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
        schema, relation_alpha, sql)
    cursor.execute(sql)
    # k-distance of each node, then local reachability density.
    kdistance = "(SELECT node_id, nn_id, distance AS distance FROM {}.distance_{}_vpython WHERE knn = {}) AS kdistance_table".format(
        schema, relation_alpha, n_neighbors + 1)
    lrd = "SELECT distance_table.node_id, {} / SUM(CASE WHEN distance_table.distance > kdistance_table.distance THEN distance_table.distance ELSE kdistance_table.distance END) AS lrd FROM ({}.distance_{}_vpython AS distance_table LEFT JOIN {} ON distance_table.nn_id = kdistance_table.node_id) x GROUP BY 1".format(
        n_neighbors, schema, relation_alpha, kdistance)
    cursor.execute("DROP TABLE IF EXISTS {}.lrd_{}_vpython".format(
        schema, relation_alpha))
    sql = "CREATE TEMPORARY TABLE {}.lrd_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
        schema, relation_alpha, lrd)
    cursor.execute(sql)
    # LOF score: mean neighbor lrd divided by the node's own lrd.
    sql = "SELECT x.node_id, SUM(y.lrd) / (MAX(x.node_lrd) * {}) AS LOF FROM (SELECT n_table.node_id, n_table.nn_id, lrd_table.lrd AS node_lrd FROM {}.distance_{}_vpython AS n_table LEFT JOIN {}.lrd_{}_vpython AS lrd_table ON n_table.node_id = lrd_table.node_id) x LEFT JOIN {}.lrd_{}_vpython AS y ON x.nn_id = y.node_id GROUP BY 1".format(
        n_neighbors, schema, relation_alpha, schema, relation_alpha, schema, relation_alpha)
    cursor.execute("DROP TABLE IF EXISTS {}.lof_{}_vpython".format(
        schema, relation_alpha))
    sql = "CREATE TEMPORARY TABLE {}.lof_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
        schema, relation_alpha, sql)
    cursor.execute(sql)
    # Final result: 0 replaces infinite/NaN scores (lof != lof detects NaN).
    sql = "SELECT {}, (CASE WHEN lof > 1e100 OR lof != lof THEN 0 ELSE lof END) AS lof_score FROM {} AS x LEFT JOIN {}.lof_{}_vpython AS y ON x.{} = y.node_id".format(
        ", ".join(X + self.key_columns), main_table, schema, relation_alpha, index)
    sql = "CREATE TABLE {} AS {}".format(self.name, sql)
    cursor.execute(sql)
    sql = "SELECT COUNT(*) FROM {}.lof_{}_vpython z WHERE lof > 1e100 OR lof != lof".format(
        schema, relation_alpha)
    cursor.execute(sql)
    self.n_errors = cursor.fetchone()[0]
    # Clean up every staging table.
    for prefix in ("main", "distance", "lrd", "lof"):
        cursor.execute("DROP TABLE IF EXISTS {}.{}_{}_vpython".format(
            schema, prefix, relation_alpha))
    return (self)
def cross_validate(estimator, input_relation: str, X: list, y: str, cv: int = 3, pos_label=None, cutoff: float = 0.5):
    """Compute metrics of a k-fold cross validation for a regressor or
    classifier estimator.

    Parameters
    ----------
    estimator
        vertica_ml_python estimator exposing ``type``, ``cursor``,
        ``name``, ``fit`` and the relevant report method.
    input_relation: str
        Relation (table/view) used for the cross validation.
    X: list
        List of the predictor column names.
    y: str
        Response column name.
    cv: int, default 3
        Number of folds.
    pos_label, optional
        Main class to study (required for multiclass classifiers).
    cutoff: float, default 0.5
        Probability cutoff for classifiers.

    Returns
    -------
    tablesample
        Per-fold metrics plus their average and standard deviation.

    Raises
    ------
    ValueError
        If the estimator is neither a regressor nor a classifier, or if
        ``pos_label`` is missing for a multiclass classifier.
    """
    if (estimator.type == "regressor"):
        result = {
            "index": [
                "explained_variance", "max_error", "median_absolute_error",
                "mean_absolute_error", "mean_squared_error", "r2"
            ]
        }
    elif (estimator.type == "classifier"):
        result = {
            "index": [
                "auc", "prc_auc", "accuracy", "log_loss", "precision",
                "recall", "f1-score", "mcc", "informedness", "markedness",
                "csi"
            ]
        }
    else:
        raise ValueError(
            "Cross Validation is only possible for Regressors and Classifiers")
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    test_name, train_name = "{}_{}".format(relation_alpha, int(1 / cv * 100)), "{}_{}".format(
        relation_alpha, int(100 - 1 / cv * 100))
    estimator.cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, relation_alpha))
    # Assign each row a random fold id once; views below select by fold.
    query = "CREATE TEMPORARY TABLE {}.vpython_train_test_split_cv_{} ON COMMIT PRESERVE ROWS AS SELECT *, RANDOMINT({}) AS test FROM {}".format(
        schema, relation_alpha, cv, input_relation)
    estimator.cursor.execute(query)
    for i in range(cv):
        try:
            estimator.cursor.execute("DROP MODEL IF EXISTS {}".format(
                estimator.name))
        except Exception:
            # Fix: narrowed from a bare except. Best effort — a failed drop
            # (e.g. model never created) should not abort the fold.
            pass
        estimator.cursor.execute(
            "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
                schema, test_name))
        estimator.cursor.execute(
            "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
                schema, train_name))
        query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test = {})".format(
            schema, test_name,
            "{}.vpython_train_test_split_cv_{}".format(schema, relation_alpha), i)
        estimator.cursor.execute(query)
        query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test != {})".format(
            schema, train_name,
            "{}.vpython_train_test_split_cv_{}".format(schema, relation_alpha), i)
        estimator.cursor.execute(query)
        estimator.fit(
            "{}.vpython_train_test_split_cv_{}".format(schema, train_name), X,
            y, "{}.vpython_train_test_split_cv_{}".format(schema, test_name))
        if (estimator.type == "regressor"):
            result["{}-fold".format(
                i + 1)] = estimator.regression_report().values["value"]
        else:
            if (len(estimator.classes) > 2) and (pos_label not in estimator.classes):
                raise ValueError(
                    "'pos_label' must be in the estimator classes, it must be the main class to study for the Cross Validation"
                )
            try:
                result["{}-fold".format(i + 1)] = estimator.classification_report(
                    labels=[pos_label], cutoff=cutoff).values["value"]
            except Exception:
                # Fix: narrowed from a bare except. Binary classifiers may not
                # accept a labels argument — fall back to the default report.
                result["{}-fold".format(i + 1)] = estimator.classification_report(
                    cutoff=cutoff).values["value"]
    try:
        estimator.cursor.execute("DROP MODEL IF EXISTS {}".format(
            estimator.name))
    except Exception:
        pass
    n = 6 if (estimator.type == "regressor") else 11
    total = [[] for _ in range(n)]
    for i in range(cv):
        for k in range(n):
            total[k] += [result["{}-fold".format(i + 1)][k]]
    result["avg"] = [np.mean(item) for item in total]
    result["std"] = [np.std(item) for item in total]
    estimator.cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, relation_alpha))
    estimator.cursor.execute(
        "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, test_name))
    estimator.cursor.execute(
        "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, train_name))
    return (tablesample(values=result, table_info=False).transpose())
def fit(self, input_relation: str, X: list, key_columns: list = [], index: str = ""):
    """Run DBSCAN in-database and store the clustered rows in the result
    table ``self.name`` (noise points get cluster -1).

    The neighborhood graph is computed in SQL, the cluster propagation is
    done client-side, and the node->cluster mapping is loaded back via a
    temporary CSV file.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) used to train the model.
    X: list
        List of the predictor column names.
    key_columns: list, optional
        Columns carried through (not used as predictors).
        NOTE(review): mutable default is safe here — the list is only read.
    index: str, optional
        Name of an existing unique row identifier column; when empty, a
        temporary table with a generated ``id`` is created.

    Returns
    -------
    self
    """
    X = [str_column(column) for column in X]
    self.X = X
    self.key_columns = [str_column(column) for column in key_columns]
    self.input_relation = input_relation
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    cursor = self.cursor
    if not(index):
        index = "id"
        main_table = "{}.main_{}_vpython_".format(schema, relation_alpha)
        cursor.execute("DROP TABLE IF EXISTS {}".format(main_table))
        sql = "CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
            main_table, ", ".join(X + key_columns), input_relation,
            " AND ".join(["{} IS NOT NULL".format(item) for item in X]))
        cursor.execute(sql)
    else:
        main_table = input_relation
    # Minkowski distance of order p between rows x and y.
    sql = ["POWER(ABS(x.{} - y.{}), {})".format(X[i], X[i], self.p) for i in range(len(X))]
    distance = "POWER({}, 1 / {})".format(" + ".join(sql), self.p)
    sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance FROM {} AS x CROSS JOIN {} AS y".format(
        index, index, distance, main_table, main_table)
    # density = number of neighbors within eps of each node.
    sql = "SELECT node_id, nn_id, SUM(CASE WHEN distance <= {} THEN 1 ELSE 0 END) OVER (PARTITION BY node_id) AS density, distance FROM ({}) distance_table".format(self.eps, sql)
    # NOTE(review): this graph table is never created here — the drop looks
    # like cleanup for an older version of the algorithm; kept as-is.
    cursor.execute("DROP TABLE IF EXISTS {}.graph_{}_vpython_".format(schema, relation_alpha))
    # Edges between core points (density > min_samples) and close neighbors.
    sql = "SELECT node_id, nn_id FROM ({}) x WHERE density > {} AND distance < {} AND node_id != nn_id".format(
        sql, self.min_samples, self.eps)
    cursor.execute(sql)
    graph = cursor.fetchall()
    main_nodes = list(dict.fromkeys([elem[0] for elem in graph] + [elem[1] for elem in graph]))
    clusters = {}
    for elem in main_nodes:
        clusters[elem] = None
    i = 0
    # Propagate cluster ids along the edges, in query order.
    # FIX: the original used `while graph: ... del graph[0]`, an accidental
    # O(n^2) front-pop; plain iteration visits the same edges in the same
    # order. Also replaced `== None` comparisons with `is None`.
    for node, node_neighbor in graph:
        if (clusters[node] is None) and (clusters[node_neighbor] is None):
            clusters[node] = i
            clusters[node_neighbor] = i
            i = i + 1
        elif clusters[node] is not None:
            clusters[node_neighbor] = clusters[node]
        else:
            clusters[node] = clusters[node_neighbor]
    # Ship the node -> cluster mapping back to the database via a CSV file.
    with open("dbscan_id_cluster_vpython.csv", 'w') as f:
        for elem in clusters:
            f.write("{}, {}\n".format(elem, clusters[elem]))
    try:
        cursor.execute("DROP TABLE IF EXISTS {}.dbscan_clusters".format(schema))
        cursor.execute("CREATE TEMPORARY TABLE {}.dbscan_clusters(node_id int, cluster int) ON COMMIT PRESERVE ROWS".format(schema))
        cursor.execute("COPY {}.dbscan_clusters(node_id, cluster) FROM LOCAL './dbscan_id_cluster_vpython.csv' DELIMITER ',' ESCAPE AS '\\'".format(schema))
        cursor.execute("COMMIT")
    finally:
        # FIX: single cleanup path (the original duplicated os.remove in a
        # bare except, which could mask errors). The file is removed whether
        # the COPY succeeded or raised.
        os.remove("dbscan_id_cluster_vpython.csv")
    self.n_cluster = i
    cursor.execute("CREATE TABLE {} AS SELECT {}, COALESCE(cluster, -1) AS dbscan_cluster FROM {} AS x LEFT JOIN {}.dbscan_clusters AS y ON x.{} = y.node_id".format(
        self.name, ", ".join(self.X + self.key_columns), main_table, schema, index))
    cursor.execute("SELECT COUNT(*) FROM {} WHERE dbscan_cluster = -1".format(self.name))
    self.n_noise = cursor.fetchone()[0]
    cursor.execute("DROP TABLE IF EXISTS {}.main_{}_vpython_".format(schema, relation_alpha))
    cursor.execute("DROP TABLE IF EXISTS {}.dbscan_clusters".format(schema))
    return (self)