def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""):
    """Train the SVM regressor in-database and load its coefficients.

    Builds a ``SVM_REGRESSOR`` statement from the estimator's attributes,
    executes it on ``self.cursor`` and stores the model's ``details``
    attribute in ``self.coef``.

    Args:
        input_relation: Relation used for training.
        X: Predictor column names.
        y: Response column name.
        test_relation: Stored in ``self.test_relation``; ``input_relation`` is
            stored instead when this is empty.

    Returns:
        The estimator itself.
    """
    self.input_relation = input_relation
    self.test_relation = test_relation or input_relation
    self.X = [str_column(predictor) for predictor in X]
    self.y = str_column(y)

    # The statement is assembled piece by piece; the intercept options are
    # only emitted when an intercept is requested.
    pieces = [
        "SELECT SVM_REGRESSOR('{}', '{}', '{}', '{}' USING PARAMETERS C = {}, epsilon = {}, max_iterations = {}".format(
            self.name, input_relation, self.y, ", ".join(self.X),
            self.C, self.tol, self.max_iter),
        ", error_tolerance = {}".format(self.acceptable_error_margin),
    ]
    if self.fit_intercept:
        pieces.append(
            ", intercept_mode = '{}', intercept_scaling = {}".format(
                self.intercept_mode, self.intercept_scaling))
    pieces.append(")")
    self.cursor.execute("".join(pieces))

    details = to_tablesample(
        query="SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'details')".format(self.name),
        cursor=self.cursor)
    details.table_info = False
    self.coef = details
    return self
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""):
    """Train the random forest classifier in-database and record its classes.

    Executes an ``RF_CLASSIFIER`` statement built from the estimator's
    attributes, then stores the distinct non-null values of the response
    column in ``self.classes``.

    Args:
        input_relation: Relation used for training.
        X: Predictor column names.
        y: Response column name.
        test_relation: Stored in ``self.test_relation``; ``input_relation`` is
            stored instead when this is empty.

    Returns:
        The estimator itself.
    """
    self.input_relation = input_relation
    self.test_relation = test_relation or input_relation
    self.X = [str_column(predictor) for predictor in X]
    self.y = str_column(y)

    # The symbolic settings are resolved in place: after the first fit,
    # ``self.max_features`` holds the resulting integer.
    n_predictors = len(self.X)
    if self.max_features == "auto":
        self.max_features = int(n_predictors / 3 + 1)
    elif self.max_features == "max":
        self.max_features = n_predictors

    head = "SELECT RF_CLASSIFIER('{}', '{}', '{}', '{}' USING PARAMETERS ntree = {}, mtry = {}, sampling_size = {}".format(
        self.name, input_relation, self.y, ", ".join(self.X),
        self.n_estimators, self.max_features, self.sample)
    tail = ", max_depth = {}, max_breadth = {}, min_leaf_size = {}, min_info_gain = {}, nbins = {})".format(
        self.max_depth, int(self.max_leaf_nodes), self.min_samples_leaf,
        self.min_info_gain, self.nbins)
    self.cursor.execute(head + tail)

    self.cursor.execute(
        "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1".format(
            self.y, input_relation, self.y))
    self.classes = [row[0] for row in self.cursor.fetchall()]
    return self
def lof_plot(input_relation: str, cursor, columns: list, lof: str, tablesample: float = -1):
    """Scatter-plot the data points with rings sized by their LOF score.

    The points are read from ``input_relation`` (rows with a NULL in any of
    ``columns`` are excluded) and drawn in 1, 2 or 3 dimensions depending on
    ``len(columns)``. Each point gets a second, hollow marker whose size is
    the score rescaled to the range [0, 1000].

    Args:
        input_relation: Relation holding the columns and the score column.
        cursor: DB cursor used to run the query.
        columns: One, two or three column names to plot.
        lof: Name of the column holding the LOF score (inserted into the
            query as given).
        tablesample: Sampling percentage; a ``TABLESAMPLE`` clause is added
            only when the value is strictly between 0 and 100.

    Raises:
        ValueError: If ``columns`` does not hold 1, 2 or 3 names.
    """
    n_dims = len(columns)
    # Validate first so a bad call fails before importing matplotlib or
    # touching the database.
    if n_dims not in (1, 2, 3):
        raise ValueError("LocalOutlierFactor Plot is available for a maximum of 3 columns")

    import matplotlib.pyplot as plt

    sample_clause = "TABLESAMPLE({})".format(tablesample) if (0 < tablesample < 100) else ""
    # Column names are quoted the same way for every dimensionality.
    columns = [str_column(column) for column in columns]
    query = "SELECT {}, {} FROM {} {} WHERE {}".format(
        ", ".join(columns), lof, input_relation, sample_clause,
        " AND ".join("{} IS NOT NULL".format(column) for column in columns))
    cursor.execute(query)
    rows = cursor.fetchall()
    if n_dims == 3:
        # The 3D branch has always converted the fetched values to float.
        rows = [[float(value) for value in row[:4]] for row in rows]

    coords = [[row[k] for row in rows] for k in range(n_dims)]
    scores = [row[n_dims] for row in rows]

    # Rescale the scores to marker sizes. min/max are computed once, and a
    # constant score column yields size 0 instead of a division by zero.
    radius = []
    if scores:
        lowest, highest = min(scores), max(scores)
        spread = highest - lowest
        if spread:
            radius = [1000 * (score - lowest) / spread for score in scores]
        else:
            radius = [0] * len(scores)

    title = 'Local Outlier Factor (LOF)'
    if n_dims == 3:
        # Imported for its side effect: older matplotlib releases need it to
        # make the '3d' projection available.
        from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')
        plt.title(title)
        ax.set_xlabel(columns[0])
        ax.set_ylabel(columns[1])
        ax.set_zlabel(columns[2])
        ax.scatter(coords[0], coords[1], coords[2], color="#214579", label='Data points')
        ax.scatter(coords[0], coords[1], coords[2], color="#FFCC01", s=radius, facecolors='none')
        # ``w_xaxis``/``w_yaxis``/``w_zaxis`` are gone from recent matplotlib;
        # the plain axis attributes refer to the same objects.
        for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
            axis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    else:
        if n_dims == 1:
            # One-dimensional data is drawn on a flat line at y = 0.
            coords.append([0] * len(rows))
            plt.figure(figsize=(10, 2))
        else:
            plt.figure(figsize=(10, 8))
        plt.gca().grid()
        plt.gca().set_axisbelow(True)
        plt.title(title)
        if n_dims == 2:
            plt.ylabel(columns[1])
        plt.xlabel(columns[0])
        plt.scatter(coords[0], coords[1], color="#214579", s=14, label='Data points')
        plt.scatter(coords[0], coords[1], color="#FFCC01", s=radius, label='Outlier scores', facecolors='none')
    plt.show()
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""):
    """Record the training setup on the estimator; no query is executed.

    Args:
        input_relation: Relation used for training.
        X: Predictor column names.
        y: Response column name.
        test_relation: Stored in ``self.test_relation``; ``input_relation`` is
            stored instead when this is empty.

    Returns:
        The estimator itself.
    """
    self.input_relation = input_relation
    self.test_relation = test_relation or input_relation
    self.X = [str_column(predictor) for predictor in X]
    self.y = str_column(y)
    return self
def load_amazon(cursor, schema: str = 'public', name='amazon'):
    """Load the bundled ``amazon.csv`` dataset and return a vDataframe on it.

    Tries to create ``schema.name`` and to ``COPY`` the CSV shipped in the
    ``vertica_ml_python`` package directory into it. Any failure of that step
    is ignored and the vDataframe is built on whatever relation exists.

    Args:
        cursor: DB cursor used to run the statements.
        schema: Schema of the target table.
        name: Name of the target table.

    Returns:
        A ``vDataframe`` on ``schema.name``.
    """
    try:
        query = "CREATE TABLE {}.{}(\"number\" Integer, \"date\" Date, \"state\" Varchar(32));"
        query += "COPY {}.{}(\"number\", \"date\", \"state\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name),
            str_column(schema), str_column(name),
            os.path.dirname(vertica_ml_python.__file__) + "/learn/data/amazon.csv")
        cursor.execute(query)
    except Exception:
        # Deliberate best effort (presumably the table already exists).
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass
    return vDataframe(name, cursor, schema=schema)
def load_titanic(cursor, schema: str = 'public', name='titanic'):
    """Load the bundled ``titanic.csv`` dataset and return a vDataframe on it.

    Tries to create ``schema.name`` and to ``COPY`` the CSV shipped in the
    ``vertica_ml_python`` package directory into it. Any failure of that step
    is ignored and the vDataframe is built on whatever relation exists.

    Args:
        cursor: DB cursor used to run the statements.
        schema: Schema of the target table.
        name: Name of the target table.

    Returns:
        A ``vDataframe`` on ``schema.name``.
    """
    try:
        query = "CREATE TABLE {}.{}(\"pclass\" Integer, \"survived\" Integer, \"name\" Varchar(164), \"sex\" Varchar(20), \"age\" Numeric(6,3), \"sibsp\" Integer, \"parch\" Integer, \"ticket\" Varchar(36), \"fare\" Numeric(10,5), \"cabin\" Varchar(30), \"embarked\" Varchar(20), \"boat\" Varchar(100), \"body\" Integer, \"home.dest\" Varchar(100));"
        query += "COPY {}.{}(\"pclass\", \"survived\", \"name\", \"sex\", \"age\", \"sibsp\", \"parch\", \"ticket\", \"fare\", \"cabin\", \"embarked\", \"boat\", \"body\", \"home.dest\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name),
            str_column(schema), str_column(name),
            os.path.dirname(vertica_ml_python.__file__) + "/learn/data/titanic.csv")
        cursor.execute(query)
    except Exception:
        # Deliberate best effort (presumably the table already exists).
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass
    return vDataframe(name, cursor, schema=schema)
def load_smart_meters(cursor, schema: str = 'public', name='smart_meters'):
    """Load the bundled ``smart_meters.csv`` dataset and return a vDataframe.

    Tries to create ``schema.name`` and to ``COPY`` the CSV shipped in the
    ``vertica_ml_python`` package directory into it. Any failure of that step
    is ignored and the vDataframe is built on whatever relation exists.

    Args:
        cursor: DB cursor used to run the statements.
        schema: Schema of the target table.
        name: Name of the target table.

    Returns:
        A ``vDataframe`` on ``schema.name``.
    """
    try:
        query = "CREATE TABLE {}.{}(\"time\" Timestamp, \"val\" Numeric(11,7), \"id\" Integer);"
        query += "COPY {}.{}(\"time\", \"val\", \"id\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name),
            str_column(schema), str_column(name),
            os.path.dirname(vertica_ml_python.__file__) + "/learn/data/smart_meters.csv")
        cursor.execute(query)
    except Exception:
        # Deliberate best effort (presumably the table already exists).
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass
    return vDataframe(name, cursor, schema=schema)
def load_iris(cursor, schema: str = 'public', name='iris'):
    """Load the bundled ``iris.csv`` dataset and return a vDataframe on it.

    Tries to create ``schema.name`` and to ``COPY`` the CSV shipped in the
    ``vertica_ml_python`` package directory into it (the file's leading
    ``Id`` field is declared as a FILLER and not stored). Any failure of that
    step is ignored and the vDataframe is built on whatever relation exists.

    Args:
        cursor: DB cursor used to run the statements.
        schema: Schema of the target table.
        name: Name of the target table.

    Returns:
        A ``vDataframe`` on ``schema.name``.
    """
    try:
        query = "CREATE TABLE {}.{}(\"SepalLengthCm\" Numeric(5,2), \"SepalWidthCm\" Numeric(5,2), \"PetalLengthCm\" Numeric(5,2), \"PetalWidthCm\" Numeric(5,2), \"Species\" Varchar(30));"
        query += "COPY {}.{}(\"Id\" FILLER Integer, \"SepalLengthCm\", \"SepalWidthCm\", \"PetalLengthCm\", \"PetalWidthCm\", \"Species\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name),
            str_column(schema), str_column(name),
            os.path.dirname(vertica_ml_python.__file__) + "/learn/data/iris.csv")
        cursor.execute(query)
    except Exception:
        # Deliberate best effort (presumably the table already exists).
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass
    return vDataframe(name, cursor, schema=schema)
def load_winequality(cursor, schema: str = 'public', name='winequality'):
    """Load the bundled ``winequality.csv`` dataset and return a vDataframe.

    Tries to create ``schema.name`` and to ``COPY`` the CSV shipped in the
    ``vertica_ml_python`` package directory into it. Any failure of that step
    is ignored and the vDataframe is built on whatever relation exists.

    Args:
        cursor: DB cursor used to run the statements.
        schema: Schema of the target table.
        name: Name of the target table.

    Returns:
        A ``vDataframe`` on ``schema.name``.
    """
    try:
        query = "CREATE TABLE {}.{}(\"fixed_acidity\" Numeric(6,3), \"volatile_acidity\" Numeric(7,4), \"citric_acid\" Numeric(6,3), \"residual_sugar\" Numeric(7,3), \"chlorides\" Float, \"free_sulfur_dioxide\" Numeric(7,2), \"total_sulfur_dioxide\" Numeric(7,2), \"density\" Float, \"pH\" Numeric(6,3), \"sulphates\" Numeric(6,3), \"alcohol\" Float, \"quality\" Integer, \"good\" Integer, \"color\" Varchar(20));"
        query += "COPY {}.{}(\"fixed_acidity\", \"volatile_acidity\", \"citric_acid\", \"residual_sugar\", \"chlorides\", \"free_sulfur_dioxide\", \"total_sulfur_dioxide\", \"density\", \"pH\", \"sulphates\", \"alcohol\", \"quality\", \"good\", \"color\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name),
            str_column(schema), str_column(name),
            os.path.dirname(vertica_ml_python.__file__) + "/learn/data/winequality.csv")
        cursor.execute(query)
    except Exception:
        # Deliberate best effort (presumably the table already exists).
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass
    return vDataframe(name, cursor, schema=schema)
def fit(self, input_relation: str, X: list):
    """Train the PCA model in-database and load its attributes.

    Executes a ``PCA`` statement, then reads three model attributes:
    ``principal_components`` into ``self.components``, ``singular_values``
    into ``self.explained_variance`` and ``columns`` into ``self.mean``.

    Args:
        input_relation: Relation used for training.
        X: Column names to decompose.

    Returns:
        The model itself.
    """
    self.input_relation = input_relation
    self.X = [str_column(predictor) for predictor in X]

    pieces = [
        "SELECT PCA('{}', '{}', '{}' USING PARAMETERS scale = {}, method = '{}'".format(
            self.name, input_relation, ", ".join(self.X), self.scale, self.method)
    ]
    if self.n_components:
        pieces.append(", num_components = {}".format(self.n_components))
    pieces.append(")")
    self.cursor.execute("".join(pieces))

    # (attribute set on self, query returning the matching model attribute)
    attribute_queries = (
        ("components",
         "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'principal_components')"),
        ("explained_variance",
         "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'singular_values')"),
        ("mean",
         "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'columns')"),
    )
    for attribute, template in attribute_queries:
        table = to_tablesample(query=template.format(self.name), cursor=self.cursor)
        setattr(self, attribute, table)
        table.table_info = False
    return self
def elbow(X: list, input_relation: str, cursor, n_cluster = (1, 15), init = "kmeanspp", max_iter: int = 50, tol: float = 1e-4):
    """Fit one temporary KMeans model per candidate K and plot the elbow curve.

    For every candidate K a temporary model is (re)created, fitted on
    ``input_relation``, its fourth metric (index 3 of ``model.metrics``) is
    recorded, and the model is dropped. The recorded values are plotted and
    returned.

    Args:
        X: Column names used for clustering.
        input_relation: Relation to cluster.
        cursor: DB cursor.
        n_cluster: Either a list of K values to try, or a ``(start, stop)``
            pair expanded with ``range(start, stop)`` (``stop`` excluded).
        init: Initialisation passed to ``KMeans``.
        max_iter: Maximum number of iterations passed to ``KMeans``.
        tol: Tolerance passed to ``KMeans``.

    Returns:
        A ``tablesample`` with the tried K values (``index``) and the
        recorded score per K.
    """
    import matplotlib.pyplot as plt
    from vertica_ml_python.learn.cluster import KMeans

    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    model_name = "{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha)

    # isinstance (not ``type(...) == list``) so list subclasses are also
    # treated as an explicit list of candidates.
    if isinstance(n_cluster, list):
        L = n_cluster
    else:
        L = list(range(n_cluster[0], n_cluster[1]))

    all_within_cluster_SS = []
    for k in L:
        cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
        model = KMeans(model_name, cursor, k, init, max_iter, tol)
        model.fit(input_relation, X)
        all_within_cluster_SS.append(float(model.metrics.values["value"][3]))
        model.drop()

    plt.figure(figsize = (10,8))
    plt.rcParams['axes.facecolor'] = '#F4F4F4'
    plt.grid()
    plt.plot(L, all_within_cluster_SS, marker = "s", color = "#214579")
    plt.title("Elbow Curve")
    plt.xlabel('Number of Clusters')
    plt.ylabel('Between-Cluster SS / Total SS')
    plt.subplots_adjust(left = 0.2)
    plt.show()

    # NOTE(review): the plotted axis is labelled "Between-Cluster SS / Total
    # SS" while the returned column is keyed "Within-Cluster SS". The key is
    # kept as-is because callers may read it; confirm which label is intended.
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values = values, table_info = False)
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""):
    """Record the training setup and the classes of the response column.

    No model is trained by a query here; the only statement executed fetches
    the distinct non-null values of ``y`` into ``self.classes``.

    Args:
        input_relation: Relation used for training.
        X: Predictor column names.
        y: Response column name.
        test_relation: Stored in ``self.test_relation``; ``input_relation`` is
            stored instead when this is empty.

    Returns:
        The estimator itself.
    """
    self.input_relation = input_relation
    self.test_relation = test_relation or input_relation
    self.X = [str_column(predictor) for predictor in X]
    self.y = str_column(y)

    classes_query = "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1".format(
        self.y, input_relation, self.y)
    self.cursor.execute(classes_query)
    self.classes = [row[0] for row in self.cursor.fetchall()]
    return self
def train_test_split(input_relation: str, cursor, test_size: float = 0.33):
    """Split a relation into a train view and a test view.

    A table copying ``input_relation`` with an extra boolean ``test`` column
    (``RANDOM() < test_size``) is created, and two views filter it on that
    column. Any previous table/views with the same names are dropped first.

    Args:
        input_relation: Relation to split.
        cursor: DB cursor used to run the statements.
        test_size: Probability for a row to be flagged as test.

    Returns:
        A ``(train_view, test_view)`` pair of qualified view names.
    """
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())

    test_name = "{}_{}".format(relation_alpha, int(test_size * 100))
    train_name = "{}_{}".format(relation_alpha, int(100 - test_size * 100))
    if test_name == train_name:
        # e.g. test_size = 0.5 gives "<rel>_50" twice; the second CREATE VIEW
        # would then hit the view just created. Disambiguate only in that
        # case so every other split keeps its historical names.
        test_name += "_test"
        train_name += "_train"

    prefix = "{}.vpython_train_test_split_".format(schema)
    split_table = prefix + relation_alpha
    test_view = prefix + test_name
    train_view = prefix + train_name

    cursor.execute("DROP TABLE IF EXISTS {}".format(split_table))
    cursor.execute("DROP VIEW IF EXISTS {}".format(test_view))
    cursor.execute("DROP VIEW IF EXISTS {}".format(train_view))
    cursor.execute(
        "CREATE TABLE {} AS SELECT *, (CASE WHEN RANDOM() < {} THEN True ELSE False END) AS test FROM {}".format(
            split_table, test_size, input_relation))
    cursor.execute(
        "CREATE VIEW {} AS SELECT * FROM {} WHERE test".format(test_view, split_table))
    cursor.execute(
        "CREATE VIEW {} AS SELECT * FROM {} WHERE NOT(test)".format(train_view, split_table))
    return (train_view, test_view)
def best_k(X: list, input_relation: str, cursor, n_cluster=(1, 100), init="kmeanspp", max_iter: int = 50, tol: float = 1e-4, elbow_score_stop=0.8):
    """Return the first K whose elbow score exceeds ``elbow_score_stop``.

    For each candidate K a temporary KMeans model is (re)created and fitted;
    its fourth metric (index 3 of ``model.metrics``) is used as the score.
    The search stops at the first score above the threshold. If none
    qualifies, a warning is printed and the last K tried is returned.

    Args:
        X: Column names used for clustering.
        input_relation: Relation to cluster.
        cursor: DB cursor.
        n_cluster: Either a list of K values to try, or a ``(start, stop)``
            pair expanded with ``range(start, stop)`` (``stop`` excluded).
        init: Initialisation passed to ``KMeans``.
        max_iter: Maximum number of iterations passed to ``KMeans``.
        tol: Tolerance passed to ``KMeans``.
        elbow_score_stop: Score threshold that ends the search.

    Returns:
        The selected K.

    Raises:
        ValueError: If ``n_cluster`` yields no candidate at all.
    """
    from vertica_ml_python.learn.cluster import KMeans

    if isinstance(n_cluster, list):
        candidates = n_cluster
    else:
        candidates = range(n_cluster[0], n_cluster[1])
    if not candidates:
        # Previously this fell through to the final print with unbound
        # loop variables.
        raise ValueError("'n_cluster' does not contain any number of clusters to try")

    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    model_name = "{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha)

    for k in candidates:
        cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
        model = KMeans(model_name, cursor, k, init, max_iter, tol)
        model.fit(input_relation, X)
        score = model.metrics.values["value"][3]
        # Drop the temporary model once its score is read (as elbow() does),
        # so no "_tmp_" model is left behind after returning.
        model.drop()
        if score > elbow_score_stop:
            return k

    print(
        "/!\\ The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(k, score))
    return k
def deployInverseSQL(self, key_columns: list = []):
    """Return the ``APPLY_INVERSE_PCA`` SQL expression for this model.

    Args:
        key_columns: Optional column names emitted in the ``key_columns``
            parameter. The list is only read, never modified.

    Returns:
        The SQL expression as a string, applied to ``self.X`` with
        ``model_name = self.name``.
    """
    # Each parameter is rendered on its own and joined afterwards, so a
    # column name containing "{" or "}" can no longer be misread as a
    # placeholder by a later str.format call.
    parameters = [
        "model_name = '{}'".format(self.name),
        "match_by_pos = 'true'",
    ]
    if key_columns:
        parameters.append("key_columns = '{}'".format(
            ", ".join([str_column(item) for item in key_columns])))
    return "APPLY_INVERSE_PCA({} USING PARAMETERS {})".format(
        ", ".join(self.X), ", ".join(parameters))
def fit(self, input_relation: str, X: list):
    """Fit the normalizer in-database and load its parameters.

    Executes ``NORMALIZE_FIT`` with ``self.method`` and stores the model's
    ``details`` attribute in ``self.param``.

    Args:
        input_relation: Relation used for fitting.
        X: Column names to normalize.

    Returns:
        The model itself.
    """
    self.input_relation = input_relation
    self.X = [str_column(predictor) for predictor in X]

    fit_statement = "SELECT NORMALIZE_FIT('{}', '{}', '{}', '{}')".format(
        self.name, input_relation, ", ".join(self.X), self.method)
    self.cursor.execute(fit_statement)

    details = to_tablesample(
        query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'details')".format(self.name),
        cursor = self.cursor)
    details.table_info = False
    self.param = details
    return self
def fit(self, input_relation: str, X: list):
    """Train the KMeans model in-database and load its centers and metrics.

    When ``self.init`` is a string it is passed as ``init_method``. Otherwise
    it is taken as a sequence of ``self.n_cluster`` initial centers, which are
    validated, written to a temporary table and passed as
    ``initial_centers_table``. After training, the ``centers`` attribute is
    stored in ``self.cluster_centers`` and the textual ``metrics`` attribute
    is parsed into ``self.metrics``.

    Args:
        input_relation: Relation to cluster.
        X: Column names used for clustering.

    Returns:
        The model itself.

    Raises:
        ValueError: If the initial centers are not ``n_cluster`` points, are
            not of size ``len(X)``, or are not all different.
    """
    self.input_relation = input_relation
    self.X = [str_column(column) for column in X]
    query = "SELECT KMEANS('{}', '{}', '{}', {} USING PARAMETERS max_iterations = {}, epsilon = {}".format(
        self.name, input_relation, ", ".join(self.X), self.n_cluster, self.max_iter, self.tol)
    name = "_vpython_kmeans_initial_centers_table_"
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)

    if not isinstance(self.init, str):
        self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
        if len(self.init) != self.n_cluster:
            raise ValueError("'init' must be a list of 'n_cluster' = {} points".format(self.n_cluster))
        centers = list(self.init)
        for center in centers:
            if len(X) != len(center):
                raise ValueError("Each points of 'init' must be of size len(X) = {}".format(len(self.X)))
        # Compare every center with the ones after it. The previous check
        # deleted from the list while iterating over it, which skipped
        # elements and rejected any set of three or more distinct centers.
        for position, center in enumerate(centers):
            if center in centers[position + 1:]:
                raise ValueError("All the points of 'init' must be different")

        # One "SELECT v1 AS col1, v2 AS col2, ..." per center.
        selects = [
            "SELECT " + ",".join(
                str(value) + " AS " + column for value, column in zip(center, X))
            for center in centers
        ]
        query0 = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS {}".format(
            schema, name, " UNION ".join(selects))
        self.cursor.execute(query0)
        query += ", initial_centers_table = '" + name + "'"
    else:
        query += ", init_method = '" + self.init + "'"
    query += ")"
    self.cursor.execute(query)
    self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))

    self.cluster_centers = to_tablesample(
        query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(self.name),
        cursor = self.cursor)
    self.cluster_centers.table_info = False

    query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'metrics')".format(self.name)
    self.cursor.execute(query)
    result = self.cursor.fetchone()[0]

    def metric(label):
        # Text between ``label`` and the end of its line in the metrics blob.
        return result.split(label)[1].split("\n")[0]

    between_ss = float(metric("Between-Cluster Sum of Squares: "))
    total_ss = float(metric("Total Sum of Squares: "))
    within_ss = float(metric("Total Within-Cluster Sum of Squares: "))
    values = {
        "index": ["Between-Cluster Sum of Squares", "Total Sum of Squares", "Total Within-Cluster Sum of Squares", "Between-Cluster SS / Total SS", "converged"],
        "value": [between_ss, total_ss, within_ss, between_ss / total_ss, metric("Converged: ") == "True"],
    }
    self.metrics = tablesample(values, table_info = False)
    return self
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""):
    """Train the logistic regression in-database and load its coefficients.

    Executes a ``LOGISTIC_REG`` statement built from the estimator's
    attributes (the ``alpha`` parameter is only emitted for the ``'ENet'``
    penalty) and stores the model's ``details`` attribute in ``self.coef``.

    Args:
        input_relation: Relation used for training.
        X: Predictor column names.
        y: Response column name.
        test_relation: Stored in ``self.test_relation``; ``input_relation`` is
            stored instead when this is empty.

    Returns:
        The estimator itself.
    """
    self.input_relation = input_relation
    self.test_relation = test_relation or input_relation
    self.X = [str_column(predictor) for predictor in X]
    self.y = str_column(y)

    pieces = [
        "SELECT LOGISTIC_REG('{}', '{}', '{}', '{}' USING PARAMETERS optimizer = '{}', epsilon = {}, max_iterations = {}".format(
            self.name, input_relation, self.y, ", ".join(self.X),
            self.solver, self.tol, self.max_iter),
        ", regularization = '{}', lambda = {}".format(self.penalty, self.C),
    ]
    if self.penalty == 'ENet':
        pieces.append(", alpha = {}".format(self.l1_ratio))
    pieces.append(")")
    self.cursor.execute("".join(pieces))

    details = to_tablesample(
        query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'details')".format(self.name),
        cursor = self.cursor)
    details.table_info = False
    self.coef = details
    return self
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""):
    """Train the naive Bayes model in-database and record its classes.

    Executes a ``NAIVE_BAYES`` statement with ``alpha = self.alpha``, then
    stores the distinct non-null values of the response column in
    ``self.classes``.

    Args:
        input_relation: Relation used for training.
        X: Predictor column names.
        y: Response column name.
        test_relation: Stored in ``self.test_relation``; ``input_relation`` is
            stored instead when this is empty.

    Returns:
        The estimator itself.
    """
    self.input_relation = input_relation
    self.test_relation = test_relation or input_relation
    self.X = [str_column(predictor) for predictor in X]
    self.y = str_column(y)

    train_statement = "SELECT NAIVE_BAYES('{}', '{}', '{}', '{}' USING PARAMETERS alpha = {})".format(
        self.name, input_relation, self.y, ", ".join(self.X), self.alpha)
    self.cursor.execute(train_statement)

    classes_query = "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1".format(
        self.y, input_relation, self.y)
    self.cursor.execute(classes_query)
    self.classes = [row[0] for row in self.cursor.fetchall()]
    return self
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""):
    """Compute one centroid per class and store them on the estimator.

    Each predictor is aggregated per value of ``y`` (rows where ``y`` is NULL
    are excluded) with ``APPROXIMATE_MEDIAN`` when ``self.p == 1`` and
    ``AVG`` otherwise. The result is stored in ``self.centroids`` and its
    ``y`` column in ``self.classes``.

    Args:
        input_relation: Relation used for training.
        X: Predictor column names.
        y: Response column name.
        test_relation: Stored in ``self.test_relation``; ``input_relation`` is
            stored instead when this is empty.

    Returns:
        The estimator itself.
    """
    if self.p == 1:
        aggregate = "APPROXIMATE_MEDIAN"
    else:
        aggregate = "AVG"
    self.input_relation = input_relation
    self.test_relation = test_relation or input_relation
    self.X = [str_column(predictor) for predictor in X]
    self.y = str_column(y)

    aggregations = ", ".join(
        "{}({}) AS {}".format(aggregate, predictor, predictor)
        for predictor in self.X)
    centroid_query = "SELECT {}, {} FROM {} WHERE {} IS NOT NULL GROUP BY {}".format(
        aggregations, self.y, input_relation, self.y, self.y)

    centroids = to_tablesample(query=centroid_query, cursor=self.cursor)
    centroids.table_info = False
    self.centroids = centroids
    # NOTE(review): the lookup uses the raw ``y`` argument, not ``self.y``;
    # presumably the result keys are unquoted column names - confirm.
    self.classes = centroids.values[y]
    return self
def deploySQL(self, n_components: int = 0, cutoff: float = 1, key_columns: list = []):
    """Return the ``APPLY_PCA`` SQL expression for this model.

    Args:
        n_components: When non-zero, emitted as ``num_components``;
            otherwise ``cutoff`` is emitted instead.
        cutoff: Value of the ``cutoff`` parameter, used only when
            ``n_components`` is zero.
        key_columns: Optional column names emitted in the ``key_columns``
            parameter. The list is only read, never modified.

    Returns:
        The SQL expression as a string, applied to ``self.X`` with
        ``model_name = self.name``.
    """
    # Each parameter is rendered on its own and joined afterwards, so a
    # column name containing "{" or "}" can no longer be misread as a
    # placeholder by a later str.format call.
    parameters = [
        "model_name = '{}'".format(self.name),
        "match_by_pos = 'true'",
    ]
    if key_columns:
        parameters.append("key_columns = '{}'".format(
            ", ".join([str_column(item) for item in key_columns])))
    if n_components:
        parameters.append("num_components = {}".format(n_components))
    else:
        parameters.append("cutoff = {}".format(cutoff))
    return "APPLY_PCA({} USING PARAMETERS {})".format(
        ", ".join(self.X), ", ".join(parameters))
def fit(self, input_relation: str, X: list):
    """Fit the one-hot encoder in-database and load its categories.

    Executes ``ONE_HOT_ENCODER_FIT`` and then fills ``self.param`` with the
    model's categories. Three queries are tried in order, the first that
    succeeds wins: integer and varchar categories combined, integer
    categories only, varchar categories only.

    Args:
        input_relation: Relation used for fitting.
        X: Column names to encode.

    Returns:
        The model itself.
    """
    self.input_relation = input_relation
    self.X = [str_column(column) for column in X]
    query = "SELECT ONE_HOT_ENCODER_FIT('{}', '{}', '{}' USING PARAMETERS extra_levels = '{}')".format(
        self.name, input_relation, ", ".join(self.X), self.extra_levels)
    self.cursor.execute(query)

    combined_query = "SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) x UNION ALL SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(self.name, self.name)
    integer_query = "SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) x".format(self.name)
    varchar_query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(self.name)

    for attempt in (combined_query, integer_query):
        try:
            self.param = to_tablesample(query = attempt, cursor = self.cursor)
            break
        except Exception:
            # Fall back to the next query. ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt / SystemExit still propagate.
            continue
    else:
        # Last resort: a failure here is not caught and reaches the caller.
        self.param = to_tablesample(query = varchar_query, cursor = self.cursor)
    self.param.table_info = False
    return self
def fit(self, input_relation: str, X: list, y: str, test_relation: str = ""):
    """Train the random forest regressor in-database.

    Executes an ``RF_REGRESSOR`` statement built from the estimator's
    attributes.

    Args:
        input_relation: Relation used for training.
        X: Predictor column names.
        y: Response column name.
        test_relation: Stored in ``self.test_relation``; ``input_relation`` is
            stored instead when this is empty.

    Returns:
        The estimator itself.
    """
    self.input_relation = input_relation
    self.test_relation = test_relation or input_relation
    self.X = [str_column(predictor) for predictor in X]
    self.y = str_column(y)

    # The symbolic settings are resolved in place: after the first fit,
    # ``self.max_features`` holds the resulting integer.
    n_predictors = len(self.X)
    if self.max_features == "auto":
        self.max_features = int(n_predictors / 3 + 1)
    elif self.max_features == "max":
        self.max_features = n_predictors

    head = "SELECT RF_REGRESSOR('{}', '{}', '{}', '{}' USING PARAMETERS ntree = {}, mtry = {}, sampling_size = {}".format(
        self.name, input_relation, self.y, ", ".join(self.X),
        self.n_estimators, self.max_features, self.sample)
    tail = ", max_depth = {}, max_breadth = {}, min_leaf_size = {}, min_info_gain = {}, nbins = {})".format(
        self.max_depth, int(self.max_leaf_nodes), self.min_samples_leaf,
        self.min_info_gain, self.nbins)
    self.cursor.execute(head + tail)
    return self
def fit(self, input_relation: str, X: list):
    """Build the text index and derive the stop words and vocabulary.

    Steps, all run on ``self.cursor``:
      1. (Re)create a ``<relation>_countvectorizer_vpython`` table and fill it
         with the concatenation of the ``X`` columns (lower-cased when
         ``self.lowercase``; stripped with a regular expression when
         ``self.ignore_special``).
      2. Create a text index named ``self.name`` on that table.
      3. Store in ``self.stop_words`` the tokens whose frequency share is
         outside ``[self.min_df, self.max_df]`` or, when
         ``self.max_features > 0``, whose rank exceeds it.
      4. Store in ``self.vocabulary`` the first column of the rows returned
         by ``self.deploySQL()``.

    Args:
        input_relation: Relation holding the text columns.
        X: Text column names.

    Returns:
        The model itself.
    """
    self.input_relation = input_relation
    self.X = [str_column(text_column) for text_column in X]
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    run = self.cursor.execute

    run("DROP TABLE IF EXISTS {}.{}_countvectorizer_vpython CASCADE".format(schema, relation_alpha))
    create_template = "CREATE TABLE {}.{}_countvectorizer_vpython(id identity(2000) primary key, text varchar({})) ORDER BY id SEGMENTED BY HASH(id) ALL NODES KSAFE;"
    run(create_template.format(schema, relation_alpha, self.max_text_size))

    # SQL expression producing the text to index.
    expression = " || ".join(self.X)
    if self.lowercase:
        expression = "LOWER({})".format(expression)
    if self.ignore_special:
        expression = "REGEXP_REPLACE({}, '[^a-zA-Z0-9\\s]+', '')".format(expression)
    run("INSERT INTO {}.{}_countvectorizer_vpython(text) SELECT {} FROM {}".format(
        schema, relation_alpha, expression, input_relation))
    run("CREATE TEXT INDEX {} ON {}.{}_countvectorizer_vpython(id, text) stemmer NONE;".format(
        self.name, schema, relation_alpha))

    stop_words_query = "SELECT token FROM (SELECT token, cnt / SUM(cnt) OVER () AS df, rnk FROM (SELECT token, COUNT(*) AS cnt, RANK() OVER (ORDER BY COUNT(*) DESC) AS rnk FROM {} GROUP BY 1) x) y WHERE not(df BETWEEN {} AND {})".format(
        self.name, self.min_df, self.max_df)
    if self.max_features > 0:
        stop_words_query += " OR (rnk > {})".format(self.max_features)
    run(stop_words_query)
    self.stop_words = [row[0] for row in self.cursor.fetchall()]

    run(self.deploySQL())
    self.vocabulary = [row[0] for row in self.cursor.fetchall()]
    return self
def fast_cv(algorithm: str, input_relation: str, cursor, X: list, y: str, cv: int = 3, metrics: list = [], params: dict = {}, cutoff: float = -1):
    """Run an in-database ``CROSS_VALIDATE`` and return its raw result.

    Args:
        algorithm: Algorithm name; a few common aliases (case-insensitive)
            are mapped to the names used in the SQL call, anything else is
            passed through unchanged.
        input_relation: Relation used for the cross validation.
        cursor: DB cursor used to run the statement.
        X: Predictor column names.
        y: Response column name (inserted as given).
        cv: Number of folds.
        metrics: Metrics to compute; when empty, a default set is chosen from
            the algorithm (classification or regression metrics).
        params: Hyper-parameters for ``cv_hyperparams``. A dict is serialised
            to JSON; a string is passed through unchanged.
        cutoff: Emitted as ``cv_prediction_cutoff`` only when it lies in
            ``[0, 1]``.

    Returns:
        The first column of the first row returned by the statement.
    """
    import json

    aliases = {
        "logistic_reg": ("logistic_reg", "logistic_regression", "logisticregression"),
        "linear_reg": ("linear_reg", "linear_regression", "linearregression"),
        "svm_classifier": ("svm_classifier", "svmclassifier", "linearsvc"),
        "svm_regressor": ("svm_regressor", "svmregressor", "linearsvr"),
        "naive_bayes": ("naive_bayes", "naivebayes", "multinomialnb"),
    }
    lowered = algorithm.lower()
    for canonical, names in aliases.items():
        if lowered in names:
            algorithm = canonical
            break

    # ``metrics`` / ``params`` defaults are never mutated, only read or rebound.
    if not metrics:
        if algorithm in ("naive_bayes", "svm_classifier", "logistic_reg"):
            metrics = ["accuracy", "auc_roc", "auc_prc", "fscore"]
        elif algorithm in ("svm_regressor", "linear_reg"):
            metrics = ["MSE", "MAE", "rsquared", "explained_variance"]

    sql = "SELECT CROSS_VALIDATE('{}', '{}', '{}', '{}' USING PARAMETERS cv_fold_count = {}, cv_metrics = '{}'".format(
        algorithm, input_relation, y,
        ", ".join([str_column(item) for item in X]), cv, ", ".join(metrics))
    if params:
        # The dict repr uses single quotes, which would end the surrounding
        # SQL string literal early; JSON uses double quotes.
        hyperparams = params if isinstance(params, str) else json.dumps(params)
        sql += ", cv_hyperparams = '{}'".format(hyperparams)
    if 0 <= cutoff <= 1:
        sql += ", cv_prediction_cutoff = '{}'".format(cutoff)
    sql += ')'
    cursor.execute(sql)
    return cursor.fetchone()[0]
def fit(self, input_relation: str, X: list, key_columns: list = [], index: str = ""):
    """Run DBSCAN and materialise the labelled points in table ``self.name``.

    The neighbourhood graph is computed in SQL (pairwise distances of order
    ``self.p`` through a cross join, filtered with ``self.eps`` and
    ``self.min_samples``), fetched, and labelled in Python. The labels are
    loaded back through a local CSV file and joined to the data; rows
    without a label get ``dbscan_cluster = -1``.

    Sets ``self.n_cluster`` (number of labels created) and ``self.n_noise``
    (number of rows labelled -1).

    Args:
        input_relation: Relation to cluster.
        X: Column names used for the distance.
        key_columns: Extra columns carried into the result table. The list is
            only read, never modified.
        index: Column identifying each row. When empty, a temporary copy of
            the relation (rows with a NULL in ``X`` removed) with a
            generated ``id`` column is used instead.

    Returns:
        The model itself.
    """
    X = [str_column(column) for column in X]
    self.X = X
    self.key_columns = [str_column(column) for column in key_columns]
    self.input_relation = input_relation
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    cursor = self.cursor

    if not index:
        index = "id"
        main_table = "{}.main_{}_vpython_".format(schema, relation_alpha)
        cursor.execute("DROP TABLE IF EXISTS {}".format(main_table))
        # The quoted key columns are used here, like in the final CREATE
        # TABLE below (the raw names were used before).
        sql = "CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
            main_table, ", ".join(X + self.key_columns), input_relation,
            " AND ".join(["{} IS NOT NULL".format(item) for item in X]))
        cursor.execute(sql)
    else:
        main_table = input_relation

    terms = ["POWER(ABS(x.{} - y.{}), {})".format(column, column, self.p) for column in X]
    distance = "POWER({}, 1 / {})".format(" + ".join(terms), self.p)
    sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance FROM {} AS x CROSS JOIN {} AS y".format(
        index, index, distance, main_table, main_table)
    sql = "SELECT node_id, nn_id, SUM(CASE WHEN distance <= {} THEN 1 ELSE 0 END) OVER (PARTITION BY node_id) AS density, distance FROM ({}) distance_table".format(
        self.eps, sql)
    cursor.execute("DROP TABLE IF EXISTS {}.graph_{}_vpython_".format(schema, relation_alpha))
    sql = "SELECT node_id, nn_id FROM ({}) x WHERE density > {} AND distance < {} AND node_id != nn_id".format(
        sql, self.min_samples, self.eps)
    cursor.execute(sql)
    graph = cursor.fetchall()

    # Every node appearing in an edge starts unlabelled; insertion order is
    # all source nodes first, then the neighbours not seen yet.
    clusters = dict.fromkeys([edge[0] for edge in graph] + [edge[1] for edge in graph])
    i = 0
    # Plain iteration instead of repeatedly deleting graph[0], which made the
    # loop quadratic; the edges are visited in the same order.
    # NOTE(review): when both ends already carry different labels the
    # neighbour is simply relabelled, the two clusters are not merged -
    # confirm this is intended.
    for edge in graph:
        node, node_neighbor = edge[0], edge[1]
        if clusters[node] is None and clusters[node_neighbor] is None:
            clusters[node] = i
            clusters[node_neighbor] = i
            i = i + 1
        elif clusters[node] is not None:
            clusters[node_neighbor] = clusters[node]
        else:
            clusters[node] = clusters[node_neighbor]

    csv_path = "dbscan_id_cluster_vpython.csv"
    try:
        with open(csv_path, 'w') as f:
            f.writelines("{}, {}\n".format(node, label) for node, label in clusters.items())
        cursor.execute("DROP TABLE IF EXISTS {}.dbscan_clusters".format(schema))
        cursor.execute("CREATE TEMPORARY TABLE {}.dbscan_clusters(node_id int, cluster int) ON COMMIT PRESERVE ROWS".format(schema))
        cursor.execute("COPY {}.dbscan_clusters(node_id, cluster) FROM LOCAL './dbscan_id_cluster_vpython.csv' DELIMITER ',' ESCAPE AS '\\'".format(schema))
        cursor.execute("COMMIT")
    finally:
        # Always clean up the scratch file; the existence check keeps a
        # failed open() from being masked by a FileNotFoundError here.
        if os.path.exists(csv_path):
            os.remove(csv_path)

    self.n_cluster = i
    cursor.execute("CREATE TABLE {} AS SELECT {}, COALESCE(cluster, -1) AS dbscan_cluster FROM {} AS x LEFT JOIN {}.dbscan_clusters AS y ON x.{} = y.node_id".format(
        self.name, ", ".join(self.X + self.key_columns), main_table, schema, index))
    cursor.execute("SELECT COUNT(*) FROM {} WHERE dbscan_cluster = -1".format(self.name))
    self.n_noise = cursor.fetchone()[0]
    cursor.execute("DROP TABLE IF EXISTS {}.main_{}_vpython_".format(schema, relation_alpha))
    cursor.execute("DROP TABLE IF EXISTS {}.dbscan_clusters".format(schema))
    return self
def fit(self, input_relation: str, X: list, key_columns: list = [], index=""):
    """Compute Local Outlier Factor scores into table ``self.name``.

    Everything is computed in SQL through temporary tables: the
    ``self.n_neighbors`` nearest neighbours of each row (distance of order
    ``self.p``, via a cross join), the local reachability densities, and the
    LOF scores. The result table holds ``X``, the key columns and
    ``lof_score``; scores that are NaN or above 1e100 are stored as 0 and
    counted in ``self.n_errors``. The temporary tables are dropped at the end.

    Args:
        input_relation: Relation to score.
        X: Column names used for the distance.
        key_columns: Extra columns carried into the result table. The list is
            only read, never modified.
        index: Column identifying each row. When empty, a temporary copy of
            the relation (rows with a NULL in ``X`` removed) with a
            generated ``id`` column is used instead.

    Returns:
        The model itself.
    """
    X = [str_column(column) for column in X]
    self.X = X
    self.key_columns = [str_column(column) for column in key_columns]
    self.input_relation = input_relation
    cursor = self.cursor
    n_neighbors = self.n_neighbors
    p = self.p
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    # Derived from the bare relation name in every case (it used to include
    # the schema characters when ``index`` was given).
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())

    temp_main = "{}.main_{}_vpython".format(schema, relation_alpha)
    distance_table = "{}.distance_{}_vpython".format(schema, relation_alpha)
    lrd_table = "{}.lrd_{}_vpython".format(schema, relation_alpha)
    lof_table = "{}.lof_{}_vpython".format(schema, relation_alpha)

    # ``main_table`` is always a complete reference: the schema-qualified
    # temporary copy, or the caller's relation exactly as given. Before, the
    # schema was prepended to ``input_relation`` in the distance query
    # (double qualification) and omitted from the temporary table in the
    # final query.
    if not index:
        index = "id"
        main_table = temp_main
        cursor.execute("DROP TABLE IF EXISTS {}".format(temp_main))
        sql = "CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
            temp_main, ", ".join(X + self.key_columns), input_relation,
            " AND ".join(["{} IS NOT NULL".format(item) for item in X]))
        cursor.execute(sql)
    else:
        main_table = input_relation

    terms = ["POWER(ABS(x.{} - y.{}), {})".format(column, column, p) for column in X]
    distance = "POWER({}, 1 / {})".format(" + ".join(terms), p)
    sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance, ROW_NUMBER() OVER(PARTITION BY x.{} ORDER BY {}) AS knn FROM {} AS x CROSS JOIN {} AS y".format(
        index, index, distance, index, distance, main_table, main_table)
    # n_neighbors + 1 rows are kept per node because the cross join also
    # pairs every row with itself.
    sql = "SELECT node_id, nn_id, distance, knn FROM ({}) distance_table WHERE knn <= {}".format(
        sql, n_neighbors + 1)
    cursor.execute("DROP TABLE IF EXISTS {}".format(distance_table))
    cursor.execute("CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS {}".format(distance_table, sql))

    kdistance = "(SELECT node_id, nn_id, distance AS distance FROM {} WHERE knn = {}) AS kdistance_table".format(
        distance_table, n_neighbors + 1)
    lrd = "SELECT distance_table.node_id, {} / SUM(CASE WHEN distance_table.distance > kdistance_table.distance THEN distance_table.distance ELSE kdistance_table.distance END) AS lrd FROM ({} AS distance_table LEFT JOIN {} ON distance_table.nn_id = kdistance_table.node_id) x GROUP BY 1".format(
        n_neighbors, distance_table, kdistance)
    cursor.execute("DROP TABLE IF EXISTS {}".format(lrd_table))
    cursor.execute("CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS {}".format(lrd_table, lrd))

    sql = "SELECT x.node_id, SUM(y.lrd) / (MAX(x.node_lrd) * {}) AS LOF FROM (SELECT n_table.node_id, n_table.nn_id, lrd_table.lrd AS node_lrd FROM {} AS n_table LEFT JOIN {} AS lrd_table ON n_table.node_id = lrd_table.node_id) x LEFT JOIN {} AS y ON x.nn_id = y.node_id GROUP BY 1".format(
        n_neighbors, distance_table, lrd_table, lrd_table)
    cursor.execute("DROP TABLE IF EXISTS {}".format(lof_table))
    cursor.execute("CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS {}".format(lof_table, sql))

    # ``lof != lof`` is the NaN test.
    sql = "SELECT {}, (CASE WHEN lof > 1e100 OR lof != lof THEN 0 ELSE lof END) AS lof_score FROM {} AS x LEFT JOIN {} AS y ON x.{} = y.node_id".format(
        ", ".join(X + self.key_columns), main_table, lof_table, index)
    cursor.execute("CREATE TABLE {} AS {}".format(self.name, sql))

    cursor.execute("SELECT COUNT(*) FROM {} z WHERE lof > 1e100 OR lof != lof".format(lof_table))
    self.n_errors = cursor.fetchone()[0]

    for temporary in (temp_main, distance_table, lrd_table, lof_table):
        cursor.execute("DROP TABLE IF EXISTS {}".format(temporary))
    return self
def cross_validate(estimator, input_relation: str, X: list, y: str, cv: int = 3, pos_label=None, cutoff: float = 0.5):
    """
    Evaluate ``estimator`` with a ``cv``-fold cross validation run in-database.

    Every row of ``input_relation`` is tagged with ``RANDOMINT(cv)`` in a
    temporary table. For each fold ``i`` the estimator is fitted on the rows
    whose tag differs from ``i`` and evaluated on the rows whose tag equals
    ``i``.

    Parameters
    ----------
    estimator
        Object exposing ``type`` ("regressor" or "classifier"), ``name``,
        ``cursor``, ``fit`` and ``regression_report`` /
        ``classification_report`` (plus ``classes`` for classifiers).
    input_relation: str
        Relation holding the data.
    X: list
        Predictor columns, forwarded to ``estimator.fit``.
    y: str
        Response column, forwarded to ``estimator.fit``.
    cv: int, optional
        Number of folds.
    pos_label: optional
        Class to study. Must belong to ``estimator.classes`` when the
        estimator has more than two classes.
    cutoff: float, optional
        Cutoff forwarded to ``classification_report``.

    Returns
    -------
    The transposed ``tablesample`` holding one entry per fold plus the
    ``avg`` and ``std`` of every metric.

    Raises
    ------
    ValueError
        If the estimator is neither a regressor nor a classifier, or if
        ``pos_label`` is not one of the classes of a multiclass estimator.
    """
    if estimator.type == "regressor":
        metrics = [
            "explained_variance", "max_error", "median_absolute_error",
            "mean_absolute_error", "mean_squared_error", "r2",
        ]
    elif estimator.type == "classifier":
        metrics = [
            "auc", "prc_auc", "accuracy", "log_loss", "precision", "recall",
            "f1-score", "mcc", "informedness", "markedness", "csi",
        ]
    else:
        raise ValueError(
            "Cross Validation is only possible for Regressors and Classifiers")
    result = {"index": metrics}

    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = "".join(ch for ch in relation if ch.isalnum())
    cursor = estimator.cursor

    split_table = "{}.vpython_train_test_split_cv_{}".format(
        schema, relation_alpha)
    # The view names carry an explicit test/train tag: the percentages alone
    # are identical for cv=2 (50 and 50), which made both views share one
    # name and the second CREATE VIEW fail.
    test_view = "{}.vpython_train_test_split_cv_{}_test_{}".format(
        schema, relation_alpha, int(1 / cv * 100))
    train_view = "{}.vpython_train_test_split_cv_{}_train_{}".format(
        schema, relation_alpha, int(100 - 1 / cv * 100))

    def drop_model():
        # Best effort: a failure to drop the model must not abort the run.
        try:
            cursor.execute("DROP MODEL IF EXISTS {}".format(estimator.name))
        except Exception:
            pass

    cursor.execute("DROP TABLE IF EXISTS {}".format(split_table))
    try:
        cursor.execute(
            "CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS "
            "SELECT *, RANDOMINT({}) AS test FROM {}".format(
                split_table, cv, input_relation))
        for i in range(cv):
            fold = "{}-fold".format(i + 1)
            drop_model()
            cursor.execute("DROP VIEW IF EXISTS {}".format(test_view))
            cursor.execute("DROP VIEW IF EXISTS {}".format(train_view))
            cursor.execute(
                "CREATE VIEW {} AS SELECT * FROM {} WHERE (test = {})".format(
                    test_view, split_table, i))
            cursor.execute(
                "CREATE VIEW {} AS SELECT * FROM {} WHERE (test != {})".format(
                    train_view, split_table, i))
            estimator.fit(train_view, X, y, test_view)
            if estimator.type == "regressor":
                result[fold] = estimator.regression_report().values["value"]
                continue
            if len(estimator.classes) > 2 and pos_label not in estimator.classes:
                raise ValueError(
                    "'pos_label' must be in the estimator classes, it must be the main class to study for the Cross Validation"
                )
            try:
                report = estimator.classification_report(
                    labels=[pos_label], cutoff=cutoff)
            except Exception:
                # Fall back to the report without explicit labels when the
                # labelled call is rejected by the estimator.
                report = estimator.classification_report(cutoff=cutoff)
            result[fold] = report.values["value"]
    finally:
        # Clean up the model and the split objects even when a fold failed.
        drop_model()
        cursor.execute("DROP TABLE IF EXISTS {}".format(split_table))
        cursor.execute("DROP VIEW IF EXISTS {}".format(test_view))
        cursor.execute("DROP VIEW IF EXISTS {}".format(train_view))

    # Gather, for every metric, its value in each fold.
    fold_values = [result["{}-fold".format(i + 1)] for i in range(cv)]
    per_metric = [
        [values[k] for values in fold_values] for k in range(len(metrics))
    ]
    result["avg"] = [np.mean(item) for item in per_metric]
    result["std"] = [np.std(item) for item in per_metric]
    return tablesample(values=result, table_info=False).transpose()