def test_model_from_vDF(self, base, iris_vd):
    """Fitting a KMeans model from a vDataFrame should register it in Vertica."""
    base.cursor.execute("DROP MODEL IF EXISTS kmeans_vDF")
    km = KMeans("kmeans_vDF", cursor=base.cursor, init="random")
    km.fit(iris_vd, ["SepalLengthCm", "SepalWidthCm"])
    base.cursor.execute(
        "SELECT model_name FROM models WHERE model_name = 'kmeans_vDF'"
    )
    assert base.cursor.fetchone()[0] == "kmeans_vDF"
    km.drop()
def test_get_plot(self, base, winequality_vd):
    """The scatter plot of a fitted model should contain the expected artists."""
    base.cursor.execute("DROP MODEL IF EXISTS model_test_plot")
    km = KMeans("model_test_plot", cursor=base.cursor)
    km.fit(winequality_vd, ["alcohol", "quality"])
    chart = km.plot(color="b")
    assert len(chart.get_default_bbox_extra_artists()) == 16
    plt.close("all")
    km.drop()
def test_get_voronoi_plot(self, iris_vd):
    """The Voronoi plot of a fitted model should contain the expected artists."""
    current_cursor().execute("DROP MODEL IF EXISTS model_test_plot")
    km = KMeans("model_test_plot")
    km.fit(iris_vd, ["SepalLengthCm", "SepalWidthCm"])
    chart = km.plot_voronoi(color="b")
    assert len(chart.gca().get_default_bbox_extra_artists()) == 21
    plt.close("all")
    km.drop()
def test_set_cursor(self, base):
    """set_cursor should leave the model usable for fit/drop with the new cursor."""
    km = KMeans("kmeans_cursor_test", cursor=base.cursor, init="kmeanspp")
    # TODO: create a new cursor instead of reusing the fixture's one
    km.set_cursor(base.cursor)
    km.drop()
    km.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])
    base.cursor.execute(
        "SELECT model_name FROM models WHERE model_name = 'kmeans_cursor_test'"
    )
    assert base.cursor.fetchone()[0] == "kmeans_cursor_test"
    km.drop()
def test_drop(self, base):
    """drop() should remove the model from the Vertica model catalog."""
    cur = base.cursor
    cur.execute("DROP MODEL IF EXISTS kmeans_model_test_drop")
    km = KMeans("kmeans_model_test_drop", cursor=cur)
    km.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])
    cur.execute(
        "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
    )
    assert cur.fetchone()[0] == "kmeans_model_test_drop"
    km.drop()
    cur.execute(
        "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
    )
    assert cur.fetchone() is None
def test_drop(self):
    """drop() should remove the model from the Vertica model catalog."""
    current_cursor().execute("DROP MODEL IF EXISTS kmeans_model_test_drop")
    km = KMeans("kmeans_model_test_drop")
    km.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])
    current_cursor().execute(
        "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
    )
    assert current_cursor().fetchone()[0] == "kmeans_model_test_drop"
    km.drop()
    current_cursor().execute(
        "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
    )
    assert current_cursor().fetchone() is None
def model(iris_vd):
    """Fixture: a KMeans model fitted on iris with fixed initial centers; dropped on teardown."""
    km = KMeans(
        "kmeans_model_test",
        n_cluster=3,
        max_iter=10,
        init=[[7.2, 3.0, 5.8, 1.6], [6.9, 3.1, 4.9, 1.5], [5.7, 4.4, 1.5, 0.4]],
    )
    km.drop()
    km.fit(
        "public.iris",
        ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    )
    yield km
    km.drop()
def model(base, iris_vd):
    """Fixture: a KMeans model fitted on iris with fixed initial centers; dropped on teardown."""
    base.cursor.execute("DROP MODEL IF EXISTS kmeans_model_test")
    km = KMeans(
        "kmeans_model_test",
        cursor=base.cursor,
        n_cluster=3,
        max_iter=10,
        init=[[7.2, 3.0, 5.8, 1.6], [6.9, 3.1, 4.9, 1.5], [5.7, 4.4, 1.5, 0.4]],
    )
    km.fit(
        "public.iris",
        ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    )
    yield km
    km.drop()
def test_init_method(self):
    """Both 'kmeanspp' and 'random' initializations should produce a stored model."""
    km_pp = KMeans("kmeanspp_test", init="kmeanspp")
    km_pp.drop()
    km_pp.fit("public.iris")
    current_cursor().execute(
        "SELECT model_name FROM models WHERE model_name = 'kmeanspp_test'"
    )
    assert current_cursor().fetchone()[0] == "kmeanspp_test"
    km_pp.drop()

    km_rand = KMeans("random_test", init="random")
    km_rand.drop()
    km_rand.fit("public.iris")
    current_cursor().execute(
        "SELECT model_name FROM models WHERE model_name = 'random_test'"
    )
    assert current_cursor().fetchone()[0] == "random_test"
    km_rand.drop()
def best_k(
    input_relation: (str, vDataFrame),
    X: list = [],
    cursor=None,
    n_cluster: (tuple, list) = (1, 100),
    init: (str, list) = "kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    elbow_score_stop: float = 0.8,
):
    """
---------------------------------------------------------------------------
Finds the KMeans K based on a score.

Parameters
----------
input_relation: str/vDataFrame
    Relation to use to train the model.
X: list, optional
    List of the predictor columns. If empty, all the numerical columns will
    be used.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with.
    It can also be customized list with the different K to test.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is
    considered converged after no center has moved more than a distance of
    'tol' from the previous iteration.
elbow_score_stop: float, optional
    Stops the Parameters Search when this Elbow score is reached.

Returns
-------
int
    the KMeans K
    """
    check_types([
        ("X", X, [list],),
        ("input_relation", input_relation, [str, vDataFrame],),
        ("n_cluster", n_cluster, [list],),
        ("init", init, ["kmeanspp", "random"],),
        ("max_iter", max_iter, [int, float],),
        ("tol", tol, [int, float],),
        ("elbow_score_stop", elbow_score_stop, [int, float],),
    ])
    from verticapy.learn.cluster import KMeans

    cursor, conn = check_cursor(cursor, input_relation)[0:2]
    # BUG FIX: range objects have no .sort() method, so the original crashed
    # for the (default) tuple input. A range is already sorted; for an
    # explicit list, sorted() also avoids mutating the caller's list in place.
    if isinstance(n_cluster, tuple):
        L = list(range(n_cluster[0], n_cluster[1]))
    else:
        L = sorted(n_cluster)
    schema, relation = schema_relation(input_relation)
    if isinstance(input_relation, vDataFrame):
        if not (schema):
            schema = "public"
    schema = str_column(schema)
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.__VERTICAPY_TEMP_MODEL_KMEANS_{}__".
            format(schema, get_session(cursor)))
        model = KMeans(
            "{}.__VERTICAPY_TEMP_MODEL_KMEANS_{}__".format(
                schema, get_session(cursor)),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        # metrics_ row 3 holds the elbow (Between-Cluster SS / Total SS) score.
        score = model.metrics_.values["value"][3]
        # Drop the temporary model as soon as its score has been read so no
        # temp model is left behind (the original only dropped it implicitly
        # on the next iteration, and never after the last one).
        model.drop()
        if score > elbow_score_stop:
            # BUG FIX: the original returned here without closing the
            # auto-opened connection, leaking it.
            if conn:
                conn.close()
            return i
    if conn:
        conn.close()
    print(
        "\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
def elbow(
    input_relation: (str, vDataFrame),
    X: list = [],
    cursor=None,
    n_cluster: (tuple, list) = (1, 15),
    init: (str, list) = "kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws an Elbow Curve.

Parameters
----------
input_relation: str/vDataFrame
    Relation to use to train the model.
X: list, optional
    List of the predictor columns. If empty all the numerical vcolumns will
    be used.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with.
    It can also be customized list with the different K to test.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is
    considered converged after no center has moved more than a distance of
    'tol' from the previous iteration.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        ("X", X, [list],),
        ("input_relation", input_relation, [str, vDataFrame],),
        ("n_cluster", n_cluster, [list],),
        ("init", init, ["kmeanspp", "random"],),
        ("max_iter", max_iter, [int, float],),
        ("tol", tol, [int, float],),
    ])
    from verticapy.learn.cluster import KMeans

    cursor, conn = check_cursor(cursor, input_relation)[0:2]
    version(cursor=cursor, condition=[8, 0, 0])
    # BUG FIX: the original first computed L as a range and called .sort() on
    # it — an AttributeError for the (default) tuple input — before a second,
    # correct computation. Keep a single computation; sorted() also avoids
    # mutating a caller-supplied list in place.
    if isinstance(n_cluster, tuple):
        L = list(range(n_cluster[0], n_cluster[1]))
    else:
        L = sorted(n_cluster)
    schema, relation = schema_relation(input_relation)
    all_within_cluster_SS = []
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.VERTICAPY_KMEANS_TMP_{}".format(
                schema, get_session(cursor)))
        model = KMeans(
            "{}.VERTICAPY_KMEANS_TMP_{}".format(schema, get_session(cursor)),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        # metrics_ row 3 holds the elbow (Between-Cluster SS / Total SS) score.
        all_within_cluster_SS += [float(model.metrics_.values["value"][3])]
        model.drop()
    if conn:
        conn.close()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.grid(axis="y")
    param = {
        "color": gen_colors()[0],
        "marker": "o",
        "markerfacecolor": "white",
        "markersize": 7,
        "markeredgecolor": "black",
    }
    ax.plot(
        L,
        all_within_cluster_SS,
        **updated_dict(param, style_kwds),
    )
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values=values)
def best_k(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 100),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    elbow_score_stop: float = 0.8,
):
    """
---------------------------------------------------------------------------
Finds the KMeans K based on a score.

Parameters
----------
X: list
    List of the predictor columns.
input_relation: str
    Relation to use to train the model.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with.
    It can also be customized list with the different K to test.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is
    considered converged after no center has moved more than a distance of
    'tol' from the previous iteration.
elbow_score_stop: float, optional
    Stops the Parameters Search when this Elbow score is reached.

Returns
-------
int
    the KMeans K
    """
    check_types([
        ("X", X, [list],),
        ("input_relation", input_relation, [str],),
        ("n_cluster", n_cluster, [list],),
        ("init", init, ["kmeanspp", "random"],),
        ("max_iter", max_iter, [int, float],),
        ("tol", tol, [int, float],),
        ("elbow_score_stop", elbow_score_stop, [int, float],),
    ])
    from verticapy.learn.cluster import KMeans

    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    # BUG FIX: the original tested `not isinstance(n_cluster, Iterable)` to
    # pick the range branch, but a tuple IS iterable, so the default (1, 100)
    # fell through to `tuple.sort()` and crashed. Expand tuples to the list of
    # K values; sorted() avoids mutating a caller-supplied list in place.
    if isinstance(n_cluster, tuple):
        L = list(range(n_cluster[0], n_cluster[1]))
    else:
        L = sorted(n_cluster)
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = "".join(ch for ch in relation if ch.isalnum())
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.__vpython_kmeans_tmp_model_{}__".format(
                schema, relation_alpha))
        model = KMeans(
            "{}.__vpython_kmeans_tmp_model_{}__".format(
                schema, relation_alpha),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        # BUG FIX: `model.metrics` -> `model.metrics_`, consistent with every
        # other use of the fitted-model metrics attribute in this module.
        score = model.metrics_.values["value"][3]
        if score > elbow_score_stop:
            # BUG FIX: close the auto-opened connection before the early
            # return (the original leaked it).
            if conn:
                conn.close()
            return i
    if conn:
        conn.close()
    print(
        "\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
def elbow(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 15),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
):
    """
---------------------------------------------------------------------------
Draws an Elbow Curve.

Parameters
----------
X: list
    List of the predictor columns.
input_relation: str
    Relation to use to train the model.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with.
    It can also be customized list with the different K to test.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is
    considered converged after no center has moved more than a distance of
    'tol' from the previous iteration.
ax: Matplotlib axes object, optional
    The axes to plot on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        ("X", X, [list],),
        ("input_relation", input_relation, [str],),
        ("n_cluster", n_cluster, [list],),
        ("init", init, ["kmeanspp", "random"],),
        ("max_iter", max_iter, [int, float],),
        ("tol", tol, [int, float],),
    ])
    from verticapy.learn.cluster import KMeans

    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    version(cursor=cursor, condition=[8, 0, 0])
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = "".join(ch for ch in relation if ch.isalnum())
    all_within_cluster_SS = []
    # BUG FIX: sorted() instead of an in-place .sort(), which mutated a
    # caller-supplied n_cluster list as a side effect.
    if isinstance(n_cluster, tuple):
        L = list(range(n_cluster[0], n_cluster[1]))
    else:
        L = sorted(n_cluster)
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.VERTICAPY_KMEANS_TMP_{}".format(
                schema, relation_alpha))
        model = KMeans(
            "{}.VERTICAPY_KMEANS_TMP_{}".format(schema, relation_alpha),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        # metrics_ row 3 holds the elbow (Between-Cluster SS / Total SS) score.
        all_within_cluster_SS += [float(model.metrics_.values["value"][3])]
        model.drop()
    if conn:
        conn.close()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.grid()
    ax.plot(L, all_within_cluster_SS, marker="s", color="#FE5016")
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values=values)