Exemplo n.º 1
0
 def test_model_from_vDF(self, base, iris_vd):
     """Fit a randomly initialised KMeans on a vDataFrame and check that the
     model shows up in the ``models`` table."""
     lookup_query = (
         "SELECT model_name FROM models WHERE model_name = 'kmeans_vDF'")
     predictors = ["SepalLengthCm", "SepalWidthCm"]

     # Start from a clean slate in case a previous run left the model behind.
     base.cursor.execute("DROP MODEL IF EXISTS kmeans_vDF")

     kmeans_model = KMeans("kmeans_vDF", cursor=base.cursor, init="random")
     kmeans_model.fit(iris_vd, predictors)

     base.cursor.execute(lookup_query)
     fetched_row = base.cursor.fetchone()
     assert fetched_row[0] == "kmeans_vDF"

     kmeans_model.drop()
Exemplo n.º 2
0
 def test_get_plot(self, base, winequality_vd):
     """Fit KMeans on two wine-quality columns, plot it, and check the number
     of artists reported by the returned object."""
     # Remove any leftover model with the same name before fitting.
     base.cursor.execute("DROP MODEL IF EXISTS model_test_plot")

     kmeans_model = KMeans("model_test_plot", cursor=base.cursor)
     kmeans_model.fit(winequality_vd, ["alcohol", "quality"])

     plotted = kmeans_model.plot(color="b")
     artists = plotted.get_default_bbox_extra_artists()
     assert len(artists) == 16

     # Close every figure so later plotting tests start clean.
     plt.close("all")
     kmeans_model.drop()
Exemplo n.º 3
0
 def test_get_voronoi_plot(self, iris_vd):
     """Fit KMeans on two iris columns, draw the Voronoi plot, and check the
     number of artists on the current axes of the returned figure."""
     # Remove any leftover model with the same name before fitting.
     current_cursor().execute("DROP MODEL IF EXISTS model_test_plot")

     kmeans_model = KMeans("model_test_plot")
     kmeans_model.fit(iris_vd, ["SepalLengthCm", "SepalWidthCm"])

     voronoi_figure = kmeans_model.plot_voronoi(color="b")
     artists = voronoi_figure.gca().get_default_bbox_extra_artists()
     assert len(artists) == 21

     # Close every figure so later plotting tests start clean.
     plt.close("all")
     kmeans_model.drop()
Exemplo n.º 4
0
    def test_set_cursor(self, base):
        """Re-assign the cursor on a KMeans model, then check that the model
        can still be fitted and is listed in the ``models`` table."""
        lookup_query = (
            "SELECT model_name FROM models WHERE model_name = 'kmeans_cursor_test'"
        )

        kmeans_model = KMeans(
            "kmeans_cursor_test", cursor=base.cursor, init="kmeanspp")
        # TODO: create a new cursor instead of re-using the same one
        kmeans_model.set_cursor(base.cursor)

        # Drop first so the fit below does not collide with a stale model.
        kmeans_model.drop()
        kmeans_model.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])

        base.cursor.execute(lookup_query)
        fetched_row = base.cursor.fetchone()
        assert fetched_row[0] == "kmeans_cursor_test"

        kmeans_model.drop()
Exemplo n.º 5
0
    def test_drop(self, base):
        """Check that ``drop`` removes a fitted model from the ``models``
        table."""
        lookup_query = (
            "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
        )

        base.cursor.execute("DROP MODEL IF EXISTS kmeans_model_test_drop")
        kmeans_model = KMeans("kmeans_model_test_drop", cursor=base.cursor)
        kmeans_model.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])

        # The model must be listed right after fitting...
        base.cursor.execute(lookup_query)
        row_after_fit = base.cursor.fetchone()
        assert row_after_fit[0] == "kmeans_model_test_drop"

        # ...and must be gone once it has been dropped.
        kmeans_model.drop()
        base.cursor.execute(lookup_query)
        row_after_drop = base.cursor.fetchone()
        assert row_after_drop is None
Exemplo n.º 6
0
    def test_drop(self):
        """Check that ``drop`` removes a fitted model from the ``models``
        table, using the current global cursor."""
        lookup_query = (
            "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
        )

        current_cursor().execute("DROP MODEL IF EXISTS kmeans_model_test_drop")
        kmeans_model = KMeans("kmeans_model_test_drop")
        kmeans_model.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])

        # The model must be listed right after fitting...
        current_cursor().execute(lookup_query)
        row_after_fit = current_cursor().fetchone()
        assert row_after_fit[0] == "kmeans_model_test_drop"

        # ...and must be gone once it has been dropped.
        kmeans_model.drop()
        current_cursor().execute(lookup_query)
        row_after_drop = current_cursor().fetchone()
        assert row_after_drop is None
Exemplo n.º 7
0
def model(iris_vd):
    """Yield a 3-cluster KMeans model fitted on ``public.iris`` and drop it
    once the generator is resumed.

    ``iris_vd`` is not used in the body; it is only requested as an argument.
    """
    # Explicit starting centers, one row per cluster.
    initial_centers = [
        [7.2, 3.0, 5.8, 1.6],
        [6.9, 3.1, 4.9, 1.5],
        [5.7, 4.4, 1.5, 0.4],
    ]
    predictors = [
        "SepalLengthCm",
        "SepalWidthCm",
        "PetalLengthCm",
        "PetalWidthCm",
    ]

    kmeans_model = KMeans(
        "kmeans_model_test",
        n_cluster=3,
        max_iter=10,
        init=initial_centers,
    )
    # Drop first so a stale model from an earlier run cannot block the fit.
    kmeans_model.drop()
    kmeans_model.fit("public.iris", predictors)

    yield kmeans_model

    kmeans_model.drop()
Exemplo n.º 8
0
def model(base, iris_vd):
    """Yield a 3-cluster KMeans model fitted on ``public.iris`` through
    ``base.cursor`` and drop it once the generator is resumed.

    ``iris_vd`` is not used in the body; it is only requested as an argument.
    """
    # Explicit starting centers, one row per cluster.
    initial_centers = [
        [7.2, 3.0, 5.8, 1.6],
        [6.9, 3.1, 4.9, 1.5],
        [5.7, 4.4, 1.5, 0.4],
    ]
    predictors = [
        "SepalLengthCm",
        "SepalWidthCm",
        "PetalLengthCm",
        "PetalWidthCm",
    ]

    # Remove any stale model from an earlier run before fitting.
    base.cursor.execute("DROP MODEL IF EXISTS kmeans_model_test")

    kmeans_model = KMeans(
        "kmeans_model_test",
        cursor=base.cursor,
        n_cluster=3,
        max_iter=10,
        init=initial_centers,
    )
    kmeans_model.fit("public.iris", predictors)

    yield kmeans_model

    kmeans_model.drop()
Exemplo n.º 9
0
    def test_init_method(self):
        """Fit KMeans with the ``kmeanspp`` and then the ``random`` init
        method and check each model is listed in the ``models`` table."""
        lookup_template = (
            "SELECT model_name FROM models WHERE model_name = '{}'")
        scenarios = (
            ("kmeanspp_test", "kmeanspp"),
            ("random_test", "random"),
        )

        for model_name, init_method in scenarios:
            kmeans_model = KMeans(model_name, init=init_method)
            # Drop first so a stale model cannot block the fit.
            kmeans_model.drop()
            kmeans_model.fit("public.iris")

            current_cursor().execute(lookup_template.format(model_name))
            fetched_row = current_cursor().fetchone()
            assert fetched_row[0] == model_name

            kmeans_model.drop()
Exemplo n.º 10
0
def best_k(
    input_relation: (str, vDataFrame),
    X: list = [],
    cursor=None,
    n_cluster: (tuple, list) = (1, 100),
    init: (str, list) = "kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    elbow_score_stop: float = 0.8,
):
    """
---------------------------------------------------------------------------
Finds the KMeans K based on a score.

A temporary KMeans model is fitted for each candidate K, in increasing
order, and the first K whose elbow score exceeds 'elbow_score_stop' is
returned. The temporary model is dropped after each fit.

Parameters
----------
input_relation: str/vDataFrame
    Relation to use to train the model.
X: list, optional
    List of the predictor columns. If empty, all the numerical columns will
    be used.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with
    (the end value itself is excluded, as with range).
    It can also be customized list with the different K to test. The list
    is not modified; a sorted copy is used.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers are chosen at random.
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is considered
    converged after no center has moved more than a distance of 'tol' from the
    previous iteration.
elbow_score_stop: float, optional
    Stops the Parameters Search when this Elbow score is reached.

Returns
-------
int
    the KMeans K. If no K exceeds 'elbow_score_stop', a warning is printed
    and the last K tested is returned.

Raises
------
ValueError
    If 'n_cluster' yields no K to test.
    """
    check_types([
        (
            "X",
            X,
            [list],
        ),
        (
            "input_relation",
            input_relation,
            [str, vDataFrame],
        ),
        (
            "n_cluster",
            n_cluster,
            [list],
        ),
        (
            "init",
            init,
            ["kmeanspp", "random"],
        ),
        (
            "max_iter",
            max_iter,
            [int, float],
        ),
        (
            "tol",
            tol,
            [int, float],
        ),
        (
            "elbow_score_stop",
            elbow_score_stop,
            [int, float],
        ),
    ])

    from verticapy.learn.cluster import KMeans

    cursor, conn = check_cursor(cursor, input_relation)[0:2]
    # try/finally guarantees the connection (if any was returned above) is
    # closed on every exit path: early return, fall-through, or exception.
    try:
        if isinstance(n_cluster, tuple):
            candidates = range(n_cluster[0], n_cluster[1])
        else:
            # sorted() works on a copy, so the caller's list is left untouched.
            candidates = sorted(n_cluster)
        if not candidates:
            raise ValueError(
                "Parameter 'n_cluster' does not contain any K to test.")
        schema, relation = schema_relation(input_relation)
        if isinstance(input_relation, vDataFrame) and not schema:
            schema = "public"
        schema = str_column(schema)
        # The temporary model name is the same for every K: build it once.
        model_name = "{}.__VERTICAPY_TEMP_MODEL_KMEANS_{}__".format(
            schema, get_session(cursor))
        for k in candidates:
            cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
            model = KMeans(
                model_name,
                cursor,
                k,
                init,
                max_iter,
                tol,
            )
            model.fit(input_relation, X)
            # Row 3 of the metrics is the value compared against the
            # elbow score threshold.
            score = model.metrics_.values["value"][3]
            # Do not leave the temporary model behind in the database.
            model.drop()
            if score > elbow_score_stop:
                return k
    finally:
        if conn:
            conn.close()
    print(
        "\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(k, score))
    return k
Exemplo n.º 11
0
def elbow(
    input_relation: (str, vDataFrame),
    X: list = [],
    cursor=None,
    n_cluster: (tuple, list) = (1, 15),
    init: (str, list) = "kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws an Elbow Curve.

A temporary KMeans model is fitted and dropped for each K to test; the
score of each fit is plotted against K.

Parameters
----------
input_relation: str/vDataFrame
    Relation to use to train the model.
X: list, optional
    List of the predictor columns. If empty all the numerical vcolumns will
    be used.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with
    (the end value itself is excluded, as with range).
    It can also be customized list with the different K to test. The list
    is not modified; a sorted copy is used.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers are chosen at random.
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is considered
    converged after no center has moved more than a distance of 'tol' from the
    previous iteration.
ax: Matplotlib axes object, optional
    The axes to plot on. If omitted, a new figure and axes are created.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions. They are
    combined with the default line style through updated_dict.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        (
            "X",
            X,
            [list],
        ),
        (
            "input_relation",
            input_relation,
            [str, vDataFrame],
        ),
        (
            "n_cluster",
            n_cluster,
            [list],
        ),
        (
            "init",
            init,
            ["kmeanspp", "random"],
        ),
        (
            "max_iter",
            max_iter,
            [int, float],
        ),
        (
            "tol",
            tol,
            [int, float],
        ),
    ])

    from verticapy.learn.cluster import KMeans

    cursor, conn = check_cursor(cursor, input_relation)[0:2]
    # try/finally guarantees the connection (if any was returned above) is
    # closed even when the version check or a model fit raises.
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        if isinstance(n_cluster, tuple):
            k_values = list(range(n_cluster[0], n_cluster[1]))
        else:
            # sorted() works on a copy, so the caller's list is left untouched.
            k_values = sorted(n_cluster)
        schema, relation = schema_relation(input_relation)
        # The temporary model name is the same for every K: build it once.
        model_name = "{}.VERTICAPY_KMEANS_TMP_{}".format(
            schema, get_session(cursor))
        elbow_scores = []
        for k in k_values:
            cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
            model = KMeans(
                model_name,
                cursor,
                k,
                init,
                max_iter,
                tol,
            )
            model.fit(input_relation, X)
            elbow_scores.append(float(model.metrics_.values["value"][3]))
            model.drop()
    finally:
        if conn:
            conn.close()
    if not ax:
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
        ax.grid(axis="y")
    param = {
        "color": gen_colors()[0],
        "marker": "o",
        "markerfacecolor": "white",
        "markersize": 7,
        "markeredgecolor": "black",
    }
    ax.plot(
        k_values,
        elbow_scores,
        **updated_dict(param, style_kwds),
    )
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    # NOTE(review): the result key says "Within-Cluster SS" while the axis
    # label says "Between-Cluster SS / Total SS"; the key is kept unchanged
    # because callers may read it - confirm which name is the intended one.
    values = {"index": k_values, "Within-Cluster SS": elbow_scores}
    return tablesample(values=values)
Exemplo n.º 12
0
def best_k(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 100),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    elbow_score_stop: float = 0.8,
):
    """
---------------------------------------------------------------------------
Finds the KMeans K based on a score.

A temporary KMeans model is fitted for each candidate K, in increasing
order, and the first K whose elbow score exceeds 'elbow_score_stop' is
returned. The temporary model is dropped after each fit.

Parameters
----------
X: list
    List of the predictor columns.
input_relation: str
    Relation to use to train the model.
cursor: DBcursor, optional
    Vertica DB cursor. If omitted, a connection is opened with
    read_auto_connect and closed before returning.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with
    (the end value itself is excluded, as with range).
    It can also be customized list with the different K to test. The list
    is not modified; a sorted copy is used.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers are chosen at random.
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is considered
    converged after no center has moved more than a distance of 'tol' from the
    previous iteration.
elbow_score_stop: float, optional
    Stops the Parameters Search when this Elbow score is reached.

Returns
-------
int
    the KMeans K. If no K exceeds 'elbow_score_stop', a warning is printed
    and the last K tested is returned.

Raises
------
ValueError
    If 'n_cluster' yields no K to test.
    """
    check_types([
        (
            "X",
            X,
            [list],
        ),
        (
            "input_relation",
            input_relation,
            [str],
        ),
        (
            "n_cluster",
            n_cluster,
            [list],
        ),
        (
            "init",
            init,
            ["kmeanspp", "random"],
        ),
        (
            "max_iter",
            max_iter,
            [int, float],
        ),
        (
            "tol",
            tol,
            [int, float],
        ),
        (
            "elbow_score_stop",
            elbow_score_stop,
            [int, float],
        ),
    ])

    from verticapy.learn.cluster import KMeans

    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    # try/finally guarantees a connection opened above is closed on every
    # exit path: early return, fall-through, or exception.
    try:
        # A tuple is a (start, end) pair; anything else is an explicit
        # collection of K values. The previous test was inverted and sent the
        # default tuple to list.sort(), which tuples do not have.
        if isinstance(n_cluster, tuple):
            candidates = range(n_cluster[0], n_cluster[1])
        else:
            # sorted() works on a copy, so the caller's list is left untouched.
            candidates = sorted(n_cluster)
        if not candidates:
            raise ValueError(
                "Parameter 'n_cluster' does not contain any K to test.")
        schema, relation = schema_relation(input_relation)
        schema = str_column(schema)
        # Only alphanumeric characters of the relation go into the model name.
        relation_alpha = "".join(ch for ch in relation if ch.isalnum())
        model_name = "{}.__vpython_kmeans_tmp_model_{}__".format(
            schema, relation_alpha)
        for k in candidates:
            cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
            model = KMeans(
                model_name,
                cursor,
                k,
                init,
                max_iter,
                tol,
            )
            model.fit(input_relation, X)
            # NOTE(review): this reads `model.metrics`, whereas the other
            # helpers in this file read `model.metrics_` - confirm which
            # attribute the targeted KMeans version exposes.
            score = model.metrics.values["value"][3]
            # Do not leave the temporary model behind in the database.
            model.drop()
            if score > elbow_score_stop:
                return k
    finally:
        if conn:
            conn.close()
    print(
        "\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(k, score))
    return k
Exemplo n.º 13
0
def elbow(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 15),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
):
    """
---------------------------------------------------------------------------
Draws an Elbow Curve.

A temporary KMeans model is fitted and dropped for each K to test; the
score of each fit is plotted against K.

Parameters
----------
X: list
    List of the predictor columns.
input_relation: str
    Relation to use to train the model.
cursor: DBcursor, optional
    Vertica DB cursor. If omitted, a connection is opened with
    read_auto_connect and closed once all the models are fitted.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with
    (the end value itself is excluded, as with range).
    It can also be customized list with the different K to test. The list
    is not modified; a sorted copy is used.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers are chosen at random.
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is considered
    converged after no center has moved more than a distance of 'tol' from the
    previous iteration.
ax: Matplotlib axes object, optional
    The axes to plot on. If omitted, a new figure and axes are created.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        (
            "X",
            X,
            [list],
        ),
        (
            "input_relation",
            input_relation,
            [str],
        ),
        (
            "n_cluster",
            n_cluster,
            [list],
        ),
        (
            "init",
            init,
            ["kmeanspp", "random"],
        ),
        (
            "max_iter",
            max_iter,
            [int, float],
        ),
        (
            "tol",
            tol,
            [int, float],
        ),
    ])

    from verticapy.learn.cluster import KMeans

    if not cursor:
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    # try/finally guarantees a connection opened above is closed even when
    # the version check or a model fit raises.
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        schema, relation = schema_relation(input_relation)
        schema = str_column(schema)
        # Only alphanumeric characters of the relation go into the model name.
        relation_alpha = "".join(ch for ch in relation if ch.isalnum())
        model_name = "{}.VERTICAPY_KMEANS_TMP_{}".format(
            schema, relation_alpha)
        if isinstance(n_cluster, tuple):
            k_values = list(range(n_cluster[0], n_cluster[1]))
        else:
            # sorted() works on a copy, so the caller's list is left untouched.
            k_values = sorted(n_cluster)
        elbow_scores = []
        for k in k_values:
            cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
            model = KMeans(
                model_name,
                cursor,
                k,
                init,
                max_iter,
                tol,
            )
            model.fit(input_relation, X)
            elbow_scores.append(float(model.metrics_.values["value"][3]))
            model.drop()
    finally:
        if conn:
            conn.close()
    if not ax:
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.grid()
    ax.plot(k_values, elbow_scores, marker="s", color="#FE5016")
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    # NOTE(review): the result key says "Within-Cluster SS" while the axis
    # label says "Between-Cluster SS / Total SS"; the key is kept unchanged
    # because callers may read it - confirm which name is the intended one.
    values = {"index": k_values, "Within-Cluster SS": elbow_scores}
    return tablesample(values=values)