def elbow(X: list, input_relation: str, cursor, n_cluster = (1, 15), init = "kmeanspp", max_iter: int = 50, tol: float = 1e-4):
    """Draw an Elbow curve: fit one KMeans model per candidate number of
    clusters and plot the elbow score against the cluster count.

    Parameters
    ----------
    X: list
        List of the predictor column names.
    input_relation: str
        Relation (table/view) used to train the models.
    cursor
        DB-API cursor used to run the queries.
    n_cluster: tuple or list, default (1, 15)
        If a tuple, the (inclusive, exclusive) range of cluster counts to try;
        if a list, the exact cluster counts to try.
    init: str or list, default "kmeanspp"
        KMeans initialization method (or explicit initial centers).
    max_iter: int, default 50
        Maximum number of KMeans iterations.
    tol: float, default 1e-4
        KMeans convergence tolerance.

    Returns
    -------
    tablesample
        One row per tried cluster count with the corresponding score.
    """
    import matplotlib.pyplot as plt
    from vertica_ml_python.learn.cluster import KMeans
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    all_within_cluster_SS = []
    # Fix: use isinstance instead of comparing type objects.
    L = n_cluster if isinstance(n_cluster, list) else [i for i in range(n_cluster[0], n_cluster[1])]
    for i in L:
        cursor.execute("DROP MODEL IF EXISTS {}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha))
        model = KMeans("{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha), cursor, i, init, max_iter, tol)
        model.fit(input_relation, X)
        # NOTE(review): metrics value[3] is "Between-Cluster SS / Total SS"
        # (matching the y-axis label below), not the within-cluster SS the
        # variable/key name suggests; the key is kept for compatibility.
        all_within_cluster_SS += [float(model.metrics.values["value"][3])]
        model.drop()
    plt.figure(figsize = (10, 8))
    plt.rcParams['axes.facecolor'] = '#F4F4F4'
    plt.grid()
    plt.plot(L, all_within_cluster_SS, marker = "s", color = "#214579")
    plt.title("Elbow Curve")
    plt.xlabel('Number of Clusters')
    plt.ylabel('Between-Cluster SS / Total SS')
    plt.subplots_adjust(left = 0.2)
    plt.show()
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values = values, table_info = False)
def best_k(X: list, input_relation: str, cursor, n_cluster=(1, 100), init="kmeanspp", max_iter: int = 50, tol: float = 1e-4, elbow_score_stop=0.8):
    """Return the smallest number of clusters whose elbow score
    (Between-Cluster SS / Total SS) exceeds ``elbow_score_stop``.

    Parameters
    ----------
    X: list
        List of the predictor column names.
    input_relation: str
        Relation (table/view) used to train the models.
    cursor
        DB-API cursor used to run the queries.
    n_cluster: tuple or list, default (1, 100)
        If a tuple, the (inclusive, exclusive) range of cluster counts to try;
        if a list, the exact cluster counts to try.
    init: str or list, default "kmeanspp"
        KMeans initialization method (or explicit initial centers).
    max_iter: int, default 50
        Maximum number of KMeans iterations.
    tol: float, default 1e-4
        KMeans convergence tolerance.
    elbow_score_stop: float, default 0.8
        Search stops at the first k whose elbow score is strictly greater.

    Returns
    -------
    int
        The chosen number of clusters (the last tried k when none reached
        the threshold).
    """
    from vertica_ml_python.learn.cluster import KMeans
    # Fix: use isinstance instead of comparing type objects.
    L = n_cluster if isinstance(n_cluster, list) else range(n_cluster[0], n_cluster[1])
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    # NOTE(review): if L is empty, the final print/return would raise a
    # NameError on `i` — callers are expected to pass a non-empty range.
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}._vpython_kmeans_tmp_model_{}".format(
                schema, relation_alpha))
        model = KMeans(
            "{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha),
            cursor, i, init, max_iter, tol)
        model.fit(input_relation, X)
        score = model.metrics.values["value"][3]
        if (score > elbow_score_stop):
            return i
        # Fix: removed unused `score_prev` assignment.
    print(
        "/!\\ The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
def train_test_split(input_relation: str, cursor, test_size: float = 0.33):
    """Randomly split a relation into a train view and a test view.

    A base table is created with a boolean ``test`` column (RANDOM() <
    test_size), then two views are built on top of it: one keeping the
    test rows and one keeping the train rows.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) to split.
    cursor
        DB-API cursor used to run the queries.
    test_size: float, default 0.33
        Fraction of the rows expected to land in the test view.

    Returns
    -------
    tuple
        (train view name, test view name), both schema-qualified.
    """
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    clean_relation = ''.join(c for c in relation if c.isalnum())
    test_name = "{}_{}".format(clean_relation, int(test_size * 100))
    train_name = "{}_{}".format(clean_relation, int(100 - test_size * 100))
    base_table = "{}.vpython_train_test_split_{}".format(schema, clean_relation)
    # Drop leftovers from a previous run (base table first, then both views).
    cursor.execute("DROP TABLE IF EXISTS {}.vpython_train_test_split_{}".format(
        schema, clean_relation))
    for suffix in (test_name, train_name):
        cursor.execute("DROP VIEW IF EXISTS {}.vpython_train_test_split_{}".format(
            schema, suffix))
    # Materialize the random split flag once so both views agree.
    cursor.execute(
        "CREATE TABLE {}.vpython_train_test_split_{} AS SELECT *, (CASE WHEN RANDOM() < {} THEN True ELSE False END) AS test FROM {}".format(
            schema, clean_relation, test_size, input_relation))
    cursor.execute(
        "CREATE VIEW {}.vpython_train_test_split_{} AS SELECT * FROM {} WHERE test".format(
            schema, test_name, base_table))
    cursor.execute(
        "CREATE VIEW {}.vpython_train_test_split_{} AS SELECT * FROM {} WHERE NOT(test)".format(
            schema, train_name, base_table))
    return ("{}.vpython_train_test_split_{}".format(schema, train_name),
            "{}.vpython_train_test_split_{}".format(schema, test_name))
def fit(self, input_relation: str, X: list):
    """Train the KMeans model in-database.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) used to train the model.
    X: list
        List of the predictor column names.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.init`` is a list whose length differs from
        ``self.n_cluster``, whose points have the wrong dimension, or
        which contains duplicate points.
    """
    self.input_relation = input_relation
    self.X = [str_column(column) for column in X]
    query = "SELECT KMEANS('{}', '{}', '{}', {} USING PARAMETERS max_iterations = {}, epsilon = {}".format(
        self.name, input_relation, ", ".join(self.X), self.n_cluster, self.max_iter, self.tol)
    name = "_vpython_kmeans_initial_centers_table_"
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    if not isinstance(self.init, str):
        # Explicit initial centers: validate them, then stage them in a
        # temporary table the KMEANS function can read.
        self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
        if len(self.init) != self.n_cluster:
            raise ValueError("'init' must be a list of 'n_cluster' = {} points".format(self.n_cluster))
        for item in self.init:
            if len(X) != len(item):
                raise ValueError("Each points of 'init' must be of size len(X) = {}".format(len(self.X)))
        # BUG FIX: the original duplicate check deleted from the list while
        # iterating it, which both skipped elements and compared each point
        # against the wrong remainder (falsely rejecting distinct centers).
        # A straightforward pairwise comparison is correct.
        for a in range(len(self.init)):
            for b in range(a + 1, len(self.init)):
                if self.init[a] == self.init[b]:
                    raise ValueError("All the points of 'init' must be different")
        selects = []
        for i in range(len(self.init)):
            line = []
            for j in range(len(self.init[0])):
                line += [str(self.init[i][j]) + " AS " + X[j]]
            selects += ["SELECT " + ",".join(line)]
        query0 = " UNION ".join(selects)
        query0 = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS {}".format(schema, name, query0)
        self.cursor.execute(query0)
        query += ", initial_centers_table = '" + name + "'"
    else:
        query += ", init_method = '" + self.init + "'"
    query += ")"
    self.cursor.execute(query)
    # Harmless when no centers table was created (IF EXISTS).
    self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
    self.cluster_centers = to_tablesample(
        query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(self.name),
        cursor = self.cursor)
    self.cluster_centers.table_info = False
    query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'metrics')".format(self.name)
    self.cursor.execute(query)
    result = self.cursor.fetchone()[0]

    def _metric(label):
        # The metrics attribute is a text blob of "Label: value" lines.
        return result.split(label + ": ")[1].split("\n")[0]

    between_ss = float(_metric("Between-Cluster Sum of Squares"))
    total_ss = float(_metric("Total Sum of Squares"))
    within_ss = float(_metric("Total Within-Cluster Sum of Squares"))
    values = {"index": ["Between-Cluster Sum of Squares", "Total Sum of Squares", "Total Within-Cluster Sum of Squares", "Between-Cluster SS / Total SS", "converged"]}
    values["value"] = [between_ss, total_ss, within_ss, between_ss / total_ss, _metric("Converged") == "True"]
    self.metrics = tablesample(values, table_info = False)
    return (self)
def fit(self, input_relation: str, X: list):
    """Build the text index backing this vectorizer and learn its
    vocabulary and stop words from ``input_relation``.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) containing the text columns.
    X: list
        List of the text column names to concatenate and index.

    Returns
    -------
    self
    """
    self.input_relation = input_relation
    self.X = [str_column(elem) for elem in X]
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    # Recreate the staging table holding one concatenated text row per input row.
    self.cursor.execute("DROP TABLE IF EXISTS {}.{}_countvectorizer_vpython CASCADE".format(schema, relation_alpha))
    create_sql = "CREATE TABLE {}.{}_countvectorizer_vpython(id identity(2000) primary key, text varchar({})) ORDER BY id SEGMENTED BY HASH(id) ALL NODES KSAFE;"
    self.cursor.execute(create_sql.format(schema, relation_alpha, self.max_text_size))
    # Build the text expression: concatenation, optional lowercasing,
    # optional removal of non-alphanumeric characters.
    concatenated = " || ".join(self.X)
    if self.lowercase:
        text = "LOWER({})".format(concatenated)
    else:
        text = concatenated
    if (self.ignore_special):
        text = "REGEXP_REPLACE({}, '[^a-zA-Z0-9\\s]+', '')".format(text)
    self.cursor.execute(
        "INSERT INTO {}.{}_countvectorizer_vpython(text) SELECT {} FROM {}".format(
            schema, relation_alpha, text, input_relation))
    self.cursor.execute(
        "CREATE TEXT INDEX {} ON {}.{}_countvectorizer_vpython(id, text) stemmer NONE;".format(
            self.name, schema, relation_alpha))
    # Stop words: tokens whose document frequency lies outside [min_df, max_df],
    # plus (optionally) tokens ranked beyond max_features.
    stop_words = "SELECT token FROM (SELECT token, cnt / SUM(cnt) OVER () AS df, rnk FROM (SELECT token, COUNT(*) AS cnt, RANK() OVER (ORDER BY COUNT(*) DESC) AS rnk FROM {} GROUP BY 1) x) y WHERE not(df BETWEEN {} AND {})".format(self.name, self.min_df, self.max_df)
    if (self.max_features > 0):
        stop_words += " OR (rnk > {})".format(self.max_features)
    self.cursor.execute(stop_words)
    self.stop_words = [row[0] for row in self.cursor.fetchall()]
    self.cursor.execute(self.deploySQL())
    self.vocabulary = [row[0] for row in self.cursor.fetchall()]
    return (self)
def fit(self, input_relation: str, X: list, key_columns: list = [], index=""):
    """Compute Local Outlier Factor scores in-database and store them in
    the result table ``self.name``.

    The computation is staged through temporary tables: pairwise
    distances with k-NN ranks, local reachability density (lrd), then
    the LOF score itself. All temporary tables are dropped at the end.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) used to train the model.
    X: list
        List of the predictor column names.
    key_columns: list, optional
        Columns carried through (not used as predictors).
        NOTE(review): mutable default is safe here — the list is only read.
    index: str, optional
        Name of an existing unique row identifier column; when empty, a
        temporary table with a generated ``id`` is created.

    Returns
    -------
    self
    """
    X = [str_column(column) for column in X]
    self.X = X
    self.key_columns = [str_column(column) for column in key_columns]
    self.input_relation = input_relation
    cursor = self.cursor
    n_neighbors = self.n_neighbors
    p = self.p
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    # FIX: relation_alpha was computed twice (once from input_relation, once
    # from relation, shadowing the first). Compute it once from the relation
    # name, consistent with the other fit() implementations in this file.
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    if not (index):
        index = "id"
        main_table = "main_{}_vpython".format(relation_alpha)
        cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, main_table))
        sql = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
            schema, main_table, ", ".join(X + key_columns), input_relation,
            " AND ".join(["{} IS NOT NULL".format(item) for item in X]))
        cursor.execute(sql)
    else:
        main_table = input_relation
    # Minkowski distance of order p between rows x and y.
    sql = [
        "POWER(ABS(x.{} - y.{}), {})".format(X[i], X[i], p)
        for i in range(len(X))
    ]
    distance = "POWER({}, 1 / {})".format(" + ".join(sql), p)
    # All pairwise distances with a per-node neighbor rank (knn).
    sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance, ROW_NUMBER() OVER(PARTITION BY x.{} ORDER BY {}) AS knn FROM {}.{} AS x CROSS JOIN {}.{} AS y".format(
        index, index, distance, index, distance, schema, main_table, schema, main_table)
    # Keep n_neighbors + 1 rows per node (the closest "neighbor" is the node itself).
    sql = "SELECT node_id, nn_id, distance, knn FROM ({}) distance_table WHERE knn <= {}".format(
        sql, n_neighbors + 1)
    cursor.execute("DROP TABLE IF EXISTS {}.distance_{}_vpython".format(
        schema, relation_alpha))
    sql = "CREATE TEMPORARY TABLE {}.distance_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
        schema, relation_alpha, sql)
    cursor.execute(sql)
    # k-distance of each node, then local reachability density.
    kdistance = "(SELECT node_id, nn_id, distance AS distance FROM {}.distance_{}_vpython WHERE knn = {}) AS kdistance_table".format(
        schema, relation_alpha, n_neighbors + 1)
    lrd = "SELECT distance_table.node_id, {} / SUM(CASE WHEN distance_table.distance > kdistance_table.distance THEN distance_table.distance ELSE kdistance_table.distance END) AS lrd FROM ({}.distance_{}_vpython AS distance_table LEFT JOIN {} ON distance_table.nn_id = kdistance_table.node_id) x GROUP BY 1".format(
        n_neighbors, schema, relation_alpha, kdistance)
    cursor.execute("DROP TABLE IF EXISTS {}.lrd_{}_vpython".format(
        schema, relation_alpha))
    sql = "CREATE TEMPORARY TABLE {}.lrd_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
        schema, relation_alpha, lrd)
    cursor.execute(sql)
    # LOF score: mean neighbor lrd divided by the node's own lrd.
    sql = "SELECT x.node_id, SUM(y.lrd) / (MAX(x.node_lrd) * {}) AS LOF FROM (SELECT n_table.node_id, n_table.nn_id, lrd_table.lrd AS node_lrd FROM {}.distance_{}_vpython AS n_table LEFT JOIN {}.lrd_{}_vpython AS lrd_table ON n_table.node_id = lrd_table.node_id) x LEFT JOIN {}.lrd_{}_vpython AS y ON x.nn_id = y.node_id GROUP BY 1".format(
        n_neighbors, schema, relation_alpha, schema, relation_alpha, schema, relation_alpha)
    cursor.execute("DROP TABLE IF EXISTS {}.lof_{}_vpython".format(
        schema, relation_alpha))
    sql = "CREATE TEMPORARY TABLE {}.lof_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
        schema, relation_alpha, sql)
    cursor.execute(sql)
    # Final result: 0 replaces infinite/NaN scores (lof != lof detects NaN).
    sql = "SELECT {}, (CASE WHEN lof > 1e100 OR lof != lof THEN 0 ELSE lof END) AS lof_score FROM {} AS x LEFT JOIN {}.lof_{}_vpython AS y ON x.{} = y.node_id".format(
        ", ".join(X + self.key_columns), main_table, schema, relation_alpha, index)
    sql = "CREATE TABLE {} AS {}".format(self.name, sql)
    cursor.execute(sql)
    sql = "SELECT COUNT(*) FROM {}.lof_{}_vpython z WHERE lof > 1e100 OR lof != lof".format(
        schema, relation_alpha)
    cursor.execute(sql)
    self.n_errors = cursor.fetchone()[0]
    # Clean up every staging table.
    for prefix in ("main", "distance", "lrd", "lof"):
        cursor.execute("DROP TABLE IF EXISTS {}.{}_{}_vpython".format(
            schema, prefix, relation_alpha))
    return (self)
def cross_validate(estimator, input_relation: str, X: list, y: str, cv: int = 3, pos_label=None, cutoff: float = 0.5):
    """Compute metrics of a k-fold cross validation for a regressor or
    classifier estimator.

    Parameters
    ----------
    estimator
        vertica_ml_python estimator exposing ``type``, ``cursor``,
        ``name``, ``fit`` and the relevant report method.
    input_relation: str
        Relation (table/view) used for the cross validation.
    X: list
        List of the predictor column names.
    y: str
        Response column name.
    cv: int, default 3
        Number of folds.
    pos_label, optional
        Main class to study (required for multiclass classifiers).
    cutoff: float, default 0.5
        Probability cutoff for classifiers.

    Returns
    -------
    tablesample
        Per-fold metrics plus their average and standard deviation.

    Raises
    ------
    ValueError
        If the estimator is neither a regressor nor a classifier, or if
        ``pos_label`` is missing for a multiclass classifier.
    """
    if (estimator.type == "regressor"):
        result = {
            "index": [
                "explained_variance", "max_error", "median_absolute_error",
                "mean_absolute_error", "mean_squared_error", "r2"
            ]
        }
    elif (estimator.type == "classifier"):
        result = {
            "index": [
                "auc", "prc_auc", "accuracy", "log_loss", "precision",
                "recall", "f1-score", "mcc", "informedness", "markedness",
                "csi"
            ]
        }
    else:
        raise ValueError(
            "Cross Validation is only possible for Regressors and Classifiers")
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    test_name, train_name = "{}_{}".format(relation_alpha, int(1 / cv * 100)), "{}_{}".format(
        relation_alpha, int(100 - 1 / cv * 100))
    estimator.cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, relation_alpha))
    # Assign each row a random fold id once; views below select by fold.
    query = "CREATE TEMPORARY TABLE {}.vpython_train_test_split_cv_{} ON COMMIT PRESERVE ROWS AS SELECT *, RANDOMINT({}) AS test FROM {}".format(
        schema, relation_alpha, cv, input_relation)
    estimator.cursor.execute(query)
    for i in range(cv):
        try:
            estimator.cursor.execute("DROP MODEL IF EXISTS {}".format(
                estimator.name))
        except Exception:
            # Fix: narrowed from a bare except. Best effort — a failed drop
            # (e.g. model never created) should not abort the fold.
            pass
        estimator.cursor.execute(
            "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
                schema, test_name))
        estimator.cursor.execute(
            "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
                schema, train_name))
        query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test = {})".format(
            schema, test_name,
            "{}.vpython_train_test_split_cv_{}".format(schema, relation_alpha), i)
        estimator.cursor.execute(query)
        query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test != {})".format(
            schema, train_name,
            "{}.vpython_train_test_split_cv_{}".format(schema, relation_alpha), i)
        estimator.cursor.execute(query)
        estimator.fit(
            "{}.vpython_train_test_split_cv_{}".format(schema, train_name), X,
            y, "{}.vpython_train_test_split_cv_{}".format(schema, test_name))
        if (estimator.type == "regressor"):
            result["{}-fold".format(
                i + 1)] = estimator.regression_report().values["value"]
        else:
            if (len(estimator.classes) > 2) and (pos_label not in estimator.classes):
                raise ValueError(
                    "'pos_label' must be in the estimator classes, it must be the main class to study for the Cross Validation"
                )
            try:
                result["{}-fold".format(i + 1)] = estimator.classification_report(
                    labels=[pos_label], cutoff=cutoff).values["value"]
            except Exception:
                # Fix: narrowed from a bare except. Binary classifiers may not
                # accept a labels argument — fall back to the default report.
                result["{}-fold".format(i + 1)] = estimator.classification_report(
                    cutoff=cutoff).values["value"]
    try:
        estimator.cursor.execute("DROP MODEL IF EXISTS {}".format(
            estimator.name))
    except Exception:
        pass
    n = 6 if (estimator.type == "regressor") else 11
    total = [[] for _ in range(n)]
    for i in range(cv):
        for k in range(n):
            total[k] += [result["{}-fold".format(i + 1)][k]]
    result["avg"] = [np.mean(item) for item in total]
    result["std"] = [np.std(item) for item in total]
    estimator.cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, relation_alpha))
    estimator.cursor.execute(
        "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, test_name))
    estimator.cursor.execute(
        "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, train_name))
    return (tablesample(values=result, table_info=False).transpose())
def fit(self, input_relation: str, X: list, key_columns: list = [], index: str = ""):
    """Run DBSCAN in-database and store the clustered rows in the result
    table ``self.name`` (noise points get cluster -1).

    The neighborhood graph is computed in SQL, the cluster propagation is
    done client-side, and the node->cluster mapping is loaded back via a
    temporary CSV file.

    Parameters
    ----------
    input_relation: str
        Relation (table/view) used to train the model.
    X: list
        List of the predictor column names.
    key_columns: list, optional
        Columns carried through (not used as predictors).
        NOTE(review): mutable default is safe here — the list is only read.
    index: str, optional
        Name of an existing unique row identifier column; when empty, a
        temporary table with a generated ``id`` is created.

    Returns
    -------
    self
    """
    X = [str_column(column) for column in X]
    self.X = X
    self.key_columns = [str_column(column) for column in key_columns]
    self.input_relation = input_relation
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    cursor = self.cursor
    if not(index):
        index = "id"
        main_table = "{}.main_{}_vpython_".format(schema, relation_alpha)
        cursor.execute("DROP TABLE IF EXISTS {}".format(main_table))
        sql = "CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
            main_table, ", ".join(X + key_columns), input_relation,
            " AND ".join(["{} IS NOT NULL".format(item) for item in X]))
        cursor.execute(sql)
    else:
        main_table = input_relation
    # Minkowski distance of order p between rows x and y.
    sql = ["POWER(ABS(x.{} - y.{}), {})".format(X[i], X[i], self.p) for i in range(len(X))]
    distance = "POWER({}, 1 / {})".format(" + ".join(sql), self.p)
    sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance FROM {} AS x CROSS JOIN {} AS y".format(
        index, index, distance, main_table, main_table)
    # density = number of neighbors within eps of each node.
    sql = "SELECT node_id, nn_id, SUM(CASE WHEN distance <= {} THEN 1 ELSE 0 END) OVER (PARTITION BY node_id) AS density, distance FROM ({}) distance_table".format(self.eps, sql)
    # NOTE(review): this graph table is never created here — the drop looks
    # like cleanup for an older version of the algorithm; kept as-is.
    cursor.execute("DROP TABLE IF EXISTS {}.graph_{}_vpython_".format(schema, relation_alpha))
    # Edges between core points (density > min_samples) and close neighbors.
    sql = "SELECT node_id, nn_id FROM ({}) x WHERE density > {} AND distance < {} AND node_id != nn_id".format(
        sql, self.min_samples, self.eps)
    cursor.execute(sql)
    graph = cursor.fetchall()
    main_nodes = list(dict.fromkeys([elem[0] for elem in graph] + [elem[1] for elem in graph]))
    clusters = {}
    for elem in main_nodes:
        clusters[elem] = None
    i = 0
    # Propagate cluster ids along the edges, in query order.
    # FIX: the original used `while graph: ... del graph[0]`, an accidental
    # O(n^2) front-pop; plain iteration visits the same edges in the same
    # order. Also replaced `== None` comparisons with `is None`.
    for node, node_neighbor in graph:
        if (clusters[node] is None) and (clusters[node_neighbor] is None):
            clusters[node] = i
            clusters[node_neighbor] = i
            i = i + 1
        elif clusters[node] is not None:
            clusters[node_neighbor] = clusters[node]
        else:
            clusters[node] = clusters[node_neighbor]
    # Ship the node -> cluster mapping back to the database via a CSV file.
    with open("dbscan_id_cluster_vpython.csv", 'w') as f:
        for elem in clusters:
            f.write("{}, {}\n".format(elem, clusters[elem]))
    try:
        cursor.execute("DROP TABLE IF EXISTS {}.dbscan_clusters".format(schema))
        cursor.execute("CREATE TEMPORARY TABLE {}.dbscan_clusters(node_id int, cluster int) ON COMMIT PRESERVE ROWS".format(schema))
        cursor.execute("COPY {}.dbscan_clusters(node_id, cluster) FROM LOCAL './dbscan_id_cluster_vpython.csv' DELIMITER ',' ESCAPE AS '\\'".format(schema))
        cursor.execute("COMMIT")
    finally:
        # FIX: single cleanup path (the original duplicated os.remove in a
        # bare except, which could mask errors). The file is removed whether
        # the COPY succeeded or raised.
        os.remove("dbscan_id_cluster_vpython.csv")
    self.n_cluster = i
    cursor.execute("CREATE TABLE {} AS SELECT {}, COALESCE(cluster, -1) AS dbscan_cluster FROM {} AS x LEFT JOIN {}.dbscan_clusters AS y ON x.{} = y.node_id".format(
        self.name, ", ".join(self.X + self.key_columns), main_table, schema, index))
    cursor.execute("SELECT COUNT(*) FROM {} WHERE dbscan_cluster = -1".format(self.name))
    self.n_noise = cursor.fetchone()[0]
    cursor.execute("DROP TABLE IF EXISTS {}.main_{}_vpython_".format(schema, relation_alpha))
    cursor.execute("DROP TABLE IF EXISTS {}.dbscan_clusters".format(schema))
    return (self)