示例#1
0
def color_dict(d: dict, idx: int = 0):
    """
---------------------------------------------------------------------------
Picks one color from a style dictionary or, failing that, from the palette 
returned by gen_colors.

Parameters
----------
d: dict
    Style dictionary. Only its "color" entry is looked at.
idx: int, optional
    Position of the color to pick. It wraps around the number of available 
    colors.

Returns
-------
object
    d["color"] itself when it is a str, otherwise the element of d["color"] 
    at position idx modulo its length. Without a "color" entry, the element 
    of gen_colors() at position idx modulo its length.
    """
    if "color" not in d:
        # Lazy import, only needed when no color was supplied.
        from verticapy.plot import gen_colors

        return gen_colors()[idx % len(gen_colors())]
    chosen = d["color"]
    if isinstance(chosen, str):
        return chosen
    return chosen[idx % len(chosen)]
示例#2
0
def regression_tree_plot(
    X: list,
    y: str,
    input_relation: str,
    cursor=None,
    max_nb_points: int = 10000,
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws the response as a scatter plot and X[1] as a step curve, both against 
X[0].

Parameters
----------
X: list
    Two column names. X[0] is the x-axis of both drawings and X[1] is the 
    column drawn as a step curve (presumably the tree prediction - to be 
    confirmed against the callers).
y: str
    Response column, drawn as scatter points.
input_relation: str
    Relation to query.
cursor: DBcursor, optional
    Vertica DB cursor.
max_nb_points: int, optional
    Maximum number of rows to fetch. The rows are taken in random order and 
    rows having a NULL in one of the three columns are ignored.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions. When "color" 
    is not a str and has more than one element, its second element colors 
    the step curve (black otherwise).

Returns
-------
ax
    Matplotlib axes object
    """
    check_types(
        [
            ("X", X, [list],),
            ("y", y, [str],),
            ("input_relation", input_relation, [str],),
            ("max_nb_points", max_nb_points, [int, float],),
        ]
    )
    cursor, conn = check_cursor(cursor)[0:2]
    try:
        query = "SELECT {}, {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} IS NOT NULL ORDER BY RANDOM() LIMIT {}".format(
            X[0], X[1], y, input_relation, X[0], X[1], y, int(max_nb_points),
        )
        cursor.execute(query)
        all_points = cursor.fetchall()
    finally:
        # All the data is fetched here, so the connection handed back by
        # check_cursor (if any) is released even when the query fails.
        if conn:
            conn.close()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
        ax.set_axisbelow(True)
        ax.grid()
    # Row layout: (X[0], X[1], y). Both series are sorted on X[0]; building
    # the pairs first keeps an empty result set from breaking the unpacking.
    scatter_points = sorted((float(row[0]), float(row[2])) for row in all_points)
    step_points = sorted((float(row[0]), float(row[1])) for row in all_points)
    x0 = [point[0] for point in scatter_points]
    y0 = [point[1] for point in scatter_points]
    x1 = [point[0] for point in step_points]
    y1 = [point[1] for point in step_points]
    color = "black"
    if "color" in style_kwds:
        if not (isinstance(style_kwds["color"], str)) and len(style_kwds["color"]) > 1:
            color = style_kwds["color"][1]
    ax.step(x1, y1, color=color)
    param = {
        "marker": "o",
        "color": gen_colors()[0],
        "s": 50,
        "edgecolors": "black",
    }
    ax.scatter(
        x0, y0, **updated_dict(param, style_kwds,),
    )
    ax.set_xlabel(X[0])
    ax.set_ylabel(y)
    return ax
示例#3
0
def plot_stepwise_ml(
    x: list,
    y: list,
    z: list = [],
    w: list = [],
    var: list = [],
    x_label: str = "n_features",
    y_label: str = "score",
    direction = "forward",
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws the path followed by a stepwise selection: one square per kept step, 
arrows between consecutive steps and the initial / final variables as text.

Parameters
----------
x: list
    X coordinate of each step.
y: list
    Y coordinate of each step.
z: list, optional
    Text written along the arrow reaching each step (one str per step).
w: list, optional
    One entry per step. Except for the first step, a step is kept only if 
    the first element of its entry equals "+" when direction is "forward" 
    and "-" otherwise.
var: list, optional
    var[0] is the list of the initial variables and var[1] the list of the 
    final ones. Lists of more than 3 elements are shortened to their first 
    two elements, "..." and their last element.
x_label: str, optional
    Label of the x-axis.
y_label: str, optional
    Label of the y-axis.
direction: str, optional
    "forward" or "backward". It drives the sign, the text placement and, 
    for "backward", the reversed x-axis limits.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions. "color" can 
    be a str (intermediate steps only) or a sequence whose first two 
    elements color the intermediate steps and the two end points.

Returns
-------
ax
    Matplotlib axes object
    """
    colors = gen_colors()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
        ax.grid(axis="y")
        ax.set_axisbelow(True)
    forward = direction == "forward"
    sign = "+" if forward else "-"
    # The first step is always kept; the others only when flagged with the
    # sign of the current direction.
    kept = [idx for idx in range(len(x)) if idx == 0 or w[idx][0] == sign]
    x_new = [x[idx] for idx in kept]
    y_new = [y[idx] for idx in kept]
    z_new = [z[idx] for idx in kept]

    def shorten(names):
        # Long lists are displayed as: first two names, "...", last name.
        if len(names) > 3:
            return names[0:2] + ["..."] + names[-1:]
        return names

    var0 = shorten(var[0])
    var1 = shorten(var[1])
    if "color" in style_kwds:
        # "color" is consumed here and must not reach ax.scatter.
        picked = style_kwds.pop("color")
        if isinstance(picked, str):
            c0, c1 = picked, colors[1]
        else:
            c0, c1 = picked[0], picked[1]
    else:
        c0, c1 = colors[0], colors[1]
    delta_ini, delta_final = (0.1, -0.15) if forward else (0.35, -0.3)
    rot_ini, rot_final = (-90, 90) if forward else (90, -90)
    horizontalalignment = "center" if forward else "left"
    verticalalignment_init, verticalalignment_final = "top", "bottom"
    param = {"marker": "s", "alpha": 0.5, "edgecolors": "black", "s": 400}
    # Intermediate steps first, then the two end points in the second color.
    ax.scatter(
        x_new[1:-1], y_new[1:-1], c=c0, **updated_dict(param, style_kwds,),
    )
    ax.scatter(
        [x_new[0], x_new[-1]],
        [y_new[0], y_new[-1]],
        c=c1,
        **updated_dict(param, style_kwds,),
    )
    ax.text(
        x_new[0] + delta_ini,
        y_new[0],
        "Initial Variables: {}".format("[" + ", ".join(var0) + "]"),
        rotation=rot_ini,
        verticalalignment=verticalalignment_init,
    )
    for idx in range(1, len(x_new)):
        x_prev, y_prev = x_new[idx - 1], y_new[idx - 1]
        x_cur, y_cur = x_new[idx], y_new[idx]
        ax.arrow(
            x_prev, y_prev, x_cur - x_prev, y_cur - y_prev, fc='k', ec='k', alpha=0.2,
        )
        ax.text(
            (x_cur + x_prev) / 2,
            (y_cur + y_prev) / 2,
            sign + " " + z_new[idx],
            rotation=rot_ini,
        )
    if direction == "backward":
        # Reversed limits: the largest x is displayed on the left.
        x_min, x_max = min(x), max(x)
        margin = 0.1 * (1 + x_max - x_min)
        ax.set_xlim(x_max + margin, x_min - 0.1 - margin)
    ax.text(
        x_new[-1] + delta_final,
        y_new[-1],
        "Final Variables: {}".format("[" + ", ".join(var1) + "]"),
        rotation=rot_final,
        verticalalignment=verticalalignment_final,
        horizontalalignment=horizontalalignment,
    )
    ax.set_xticks(x_new)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    return ax
示例#4
0
def plot_pca_circle(
        x: list,
        y: list,
        variable_names: list = [],
        explained_variance: tuple = (None, None),
        dimensions: tuple = (1, 2),
        ax=None,
        **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws a unit circle with one arrow per variable going from the origin to 
(x[i], y[i]).

Parameters
----------
x: list
    X coordinate of each variable.
y: list
    Y coordinate of each variable.
variable_names: list, optional
    Text written at the tip of each arrow. Variables without a matching 
    name are drawn without any text.
explained_variance: tuple, optional
    Ratios (between 0 and 1) appended to the axis labels as percentages. 
    A falsy element leaves the corresponding label without percentage.
dimensions: tuple, optional
    The two numbers written in the axis labels ("Dim<number>").
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Only "color" is used: it is the edge color of the circle.

Returns
-------
ax
    Matplotlib axes object
    """
    # The list returned by gen_colors() is only read: writing the user color
    # into it could alter a palette shared with other plots.
    circle_color = style_kwds.get("color", gen_colors()[0])
    circle1 = plt.Circle((0, 0), 1, edgecolor=circle_color, facecolor="none")
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(6, 6)
        ax.set_axisbelow(True)
    ax.add_patch(circle1)
    n_names = len(variable_names)
    for i in range(len(x)):
        ax.arrow(0,
                 0,
                 x[i],
                 y[i],
                 head_width=0.05,
                 color="black",
                 length_includes_head=True)
        # The default 'variable_names' is empty: only label what has a name.
        if i < n_names:
            ax.text(x[i], y[i], variable_names[i])
    ax.plot([-1.1, 1.1], [0.0, 0.0], linestyle="--", color="black")
    ax.plot([0.0, 0.0], [-1.1, 1.1], linestyle="--", color="black")

    def axis_label(dimension, variance):
        # "Dim<k> " followed by the percentage when a variance is available.
        percentage = "" if not (variance) else "({}%)".format(
            round(variance * 100, 1))
        return "Dim{} {}".format(dimension, percentage)

    ax.set_xlabel(axis_label(dimensions[0], explained_variance[0]))
    ax.set_ylabel(axis_label(dimensions[1], explained_variance[1]))
    ax.xaxis.set_ticks_position("bottom")
    ax.yaxis.set_ticks_position("left")
    ax.set_xlim(-1.1, 1.1)
    ax.set_ylim(-1.1, 1.1)
    return ax
示例#5
0
def lift_chart(
    y_true: str,
    y_score: str,
    input_relation: (str, vDataFrame),
    cursor=None,
    pos_label: (int, float, str) = 1,
    nbins: int = 30,
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws the Lift Chart.

Parameters
----------
y_true: str
    Response column.
y_score: str
    Prediction Probability.
input_relation: str/vDataFrame
    Relation to use to do the scoring. The relation can be a view or a table
    or even a customized relation. For example, you could write:
    "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
    relation.
cursor: DBcursor, optional
    Vertica DB cursor.
pos_label: int/float/str, optional
    To compute the Lift Chart, one of the response column class has to be the 
    positive one. The parameter 'pos_label' represents this class.
nbins: int, optional
    Curve number of bins.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions.

Returns
-------
tablesample
    An object containing the result (columns "decision_boundary", 
    "positive_prediction_ratio" and "lift"). For more information, see
    utilities.tablesample.
    """
    check_types([
        ("y_true", y_true, [str]),
        ("y_score", y_score, [str]),
        ("input_relation", input_relation, [str, vDataFrame]),
        ("nbins", nbins, [int, float]),
    ])
    cursor, conn, input_relation = check_cursor(cursor, input_relation)
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        query = "SELECT LIFT_TABLE(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, {}::float AS prob FROM {}) AS prediction_output"
        query = query.format(nbins, y_true, pos_label, y_score, input_relation)
        executeSQL(cursor, query, "Computing the Lift Table.")
        query_result = cursor.fetchall()
    finally:
        # The connection handed back by check_cursor (if any) is released
        # whether the computation succeeded or not.
        if conn:
            conn.close()
    decision_boundary = [row[0] for row in query_result]
    positive_prediction_ratio = [row[1] for row in query_result]
    lift = [row[2] for row in query_result]
    decision_boundary.reverse()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_xlabel("Cumulative Data Fraction")
    # NaN is the only value different from itself: NaN lifts are replaced by
    # the largest valid lift. 'default' keeps an empty table from raising.
    max_value = max((0 if elem != elem else elem for elem in lift), default=0)
    lift = [max_value if elem != elem else elem for elem in lift]
    param1 = {"color": gen_colors()[0]}
    ax.plot(
        decision_boundary,
        lift,
        **updated_dict(param1, style_kwds, 0),
    )
    param2 = {"color": gen_colors()[1]}
    ax.plot(
        decision_boundary,
        positive_prediction_ratio,
        **updated_dict(param2, style_kwds, 1),
    )
    color1, color2 = color_dict(style_kwds, 0), color_dict(style_kwds, 1)
    if color1 == color2:
        # A single user color would make both areas indistinguishable.
        color2 = gen_colors()[1]
    ax.fill_between(decision_boundary,
                    positive_prediction_ratio,
                    lift,
                    facecolor=color1,
                    alpha=0.2)
    ax.fill_between(
        decision_boundary,
        [0 for elem in decision_boundary],
        positive_prediction_ratio,
        facecolor=color2,
        alpha=0.2,
    )
    ax.set_title("Lift Table")
    ax.set_axisbelow(True)
    ax.grid()
    lift_patch = mpatches.Patch(color=color1, label="Cumulative Lift")
    capture_patch = mpatches.Patch(color=color2,
                                   label="Cumulative Capture Rate")
    ax.legend(handles=[lift_patch, capture_patch],
              loc="center left",
              bbox_to_anchor=[1, 0.5])
    ax.set_xlim(0, 1)
    ax.set_ylim(0)
    return tablesample(values={
        "decision_boundary": decision_boundary,
        "positive_prediction_ratio": positive_prediction_ratio,
        "lift": lift,
    }, )
示例#6
0
def elbow(
    input_relation: (str, vDataFrame),
    X: list = [],
    cursor=None,
    n_cluster: (tuple, list) = (1, 15),
    init: (str, list) = "kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws an Elbow Curve.

Parameters
----------
input_relation: str/vDataFrame
    Relation to use to train the model.
X: list, optional
    List of the predictor columns. If empty all the numerical vcolumns will
    be used.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with 
    (the end value itself is excluded).
    It can also be customized list with the different K to test. The list 
    is tested in ascending order and is left untouched.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is considered 
    converged after no center has moved more than a distance of 'tol' from the 
    previous iteration.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions.

Returns
-------
tablesample
    An object containing the result (columns "index" and 
    "Within-Cluster SS"). For more information, see utilities.tablesample.
    """
    check_types([
        ("X", X, [list]),
        ("input_relation", input_relation, [str, vDataFrame]),
        # The default value is a tuple and tuples have their own branch
        # below, so both types are valid.
        ("n_cluster", n_cluster, [tuple, list]),
        ("init", init, ["kmeanspp", "random"]),
        ("max_iter", max_iter, [int, float]),
        ("tol", tol, [int, float]),
    ])
    cursor, conn = check_cursor(cursor, input_relation)[0:2]
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        if isinstance(n_cluster, tuple):
            L = list(range(n_cluster[0], n_cluster[1]))
        else:
            # sorted() builds a new list: the caller's list is not reordered.
            L = sorted(n_cluster)
        schema, _ = schema_relation(input_relation)
        # Imported here rather than at module level, once for all the models.
        from verticapy.learn.cluster import KMeans

        # The same temporary model name is reused by every iteration.
        model_name = "{}.VERTICAPY_KMEANS_TMP_{}".format(
            schema, get_session(cursor))
        all_within_cluster_SS = []
        for i in L:
            cursor.execute("DROP MODEL IF EXISTS {}".format(model_name))
            model = KMeans(
                model_name,
                cursor,
                i,
                init,
                max_iter,
                tol,
            )
            model.fit(input_relation, X)
            # NOTE(review): the 4th metric is plotted under the label
            # "Between-Cluster SS / Total SS" but returned under the key
            # "Within-Cluster SS" - confirm which name is the right one.
            all_within_cluster_SS += [float(model.metrics_.values["value"][3])]
            model.drop()
    finally:
        # The connection handed back by check_cursor (if any) is released
        # even when one of the models fails.
        if conn:
            conn.close()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
        ax.grid(axis="y")
    param = {
        "color": gen_colors()[0],
        "marker": "o",
        "markerfacecolor": "white",
        "markersize": 7,
        "markeredgecolor": "black",
    }
    ax.plot(
        L,
        all_within_cluster_SS,
        **updated_dict(param, style_kwds),
    )
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values=values)
示例#7
0
def roc_curve(
    y_true: str,
    y_score: str,
    input_relation: (str, vDataFrame),
    cursor=None,
    pos_label: (int, float, str) = 1,
    nbins: int = 30,
    auc_roc: bool = False,
    best_threshold: bool = False,
    cutoff_curve: bool = False,
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws the ROC Curve.

Parameters
----------
y_true: str
    Response column.
y_score: str
    Prediction Probability.
input_relation: str/vDataFrame
    Relation to use to do the scoring. The relation can be a view or a table
    or even a customized relation. For example, you could write:
    "(SELECT ... FROM ...) x" as long as an alias is given at the end of the
    relation.
cursor: DBcursor, optional
    Vertica DB cursor.
pos_label: int/float/str, optional
    To compute the ROC Curve, one of the response column class has to be the 
    positive one. The parameter 'pos_label' represents this class.
nbins: int, optional
    Curve number of bins.
auc_roc: bool, optional
    If set to true, the function will return the ROC AUC without drawing the 
    curve.
best_threshold: bool, optional
    If set to True, the function will return the best threshold without drawing 
    the curve. The best threshold is the threshold of the point which is the 
    farthest from the random line. It is kept between 0.001 and 0.999. 
    It is ignored when 'auc_roc' is True.
cutoff_curve: bool, optional
    If set to True, the Cutoff curve will be drawn.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions.

Returns
-------
float
    The ROC AUC if 'auc_roc' is True, the best threshold if 'best_threshold' 
    is True.
tablesample
    Otherwise, an object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        ("y_true", y_true, [str]),
        ("y_score", y_score, [str]),
        ("input_relation", input_relation, [str, vDataFrame]),
        ("nbins", nbins, [int, float]),
        ("auc_roc", auc_roc, [bool]),
        ("best_threshold", best_threshold, [bool]),
        ("cutoff_curve", cutoff_curve, [bool]),
    ])
    cursor, conn, input_relation = check_cursor(cursor, input_relation)
    try:
        version(cursor=cursor, condition=[8, 0, 0])
        query = "SELECT decision_boundary, false_positive_rate, true_positive_rate FROM (SELECT ROC(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, {}::float AS prob FROM {}) AS prediction_output) x"
        query = query.format(nbins, y_true, pos_label, y_score, input_relation)
        executeSQL(cursor, query, "Computing the ROC Table.")
        query_result = cursor.fetchall()
    finally:
        # The connection handed back by check_cursor (if any) is released
        # whether the computation succeeded or not.
        if conn:
            conn.close()
    threshold = [row[0] for row in query_result]
    false_positive = [row[1] for row in query_result]
    true_positive = [row[2] for row in query_result]
    # Area under the curve: each segment is integrated exactly as the line
    # y = a * x + b joining two consecutive points (vertical segments add
    # nothing and are skipped).
    auc = 0
    for i in range(len(false_positive) - 1):
        if false_positive[i + 1] - false_positive[i] != 0.0:
            a = (true_positive[i + 1] - true_positive[i]) / (
                false_positive[i + 1] - false_positive[i])
            b = true_positive[i + 1] - a * false_positive[i + 1]
            auc = (auc + a * (false_positive[i + 1] * false_positive[i + 1] -
                              false_positive[i] * false_positive[i]) / 2 + b *
                   (false_positive[i + 1] - false_positive[i]))
    # The sum is negated before use (presumably because the false positive
    # rates come back in decreasing order - confirm) and capped at 1.
    auc = -auc
    auc = min(auc, 1.0)
    if auc_roc:
        return auc
    if best_threshold:
        # Distance to the random line for each point; ties go to the last one.
        distances = [abs(tp - fp) for fp, tp in zip(false_positive, true_positive)]
        best_threshold_arg = max(zip(distances, range(len(distances))))[1]
        best = max(threshold[best_threshold_arg], 0.001)
        best = min(best, 0.999)
        return best
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    color1, color2 = color_dict(style_kwds, 0), color_dict(style_kwds, 1)
    if color1 == color2:
        color2 = gen_colors()[1]
    if cutoff_curve:
        specificity = [1 - item for item in false_positive]
        ax.plot(
            threshold,
            specificity,
            label="Specificity",
            **updated_dict({"color": gen_colors()[0]}, style_kwds, 0),
        )
        # Second curve, second color: same indexing as the one used for the
        # two curves of lift_chart, so that a list of colors is not reduced
        # to its first element for both curves.
        ax.plot(
            threshold,
            true_positive,
            label="Sensitivity",
            **updated_dict({"color": gen_colors()[1]}, style_kwds, 1),
        )
        ax.fill_between(
            threshold,
            specificity,
            true_positive,
            facecolor="black",
            alpha=0.02,
        )
        ax.set_xlabel("Decision Boundary")
        ax.set_title("Cutoff Curve")
        ax.legend(loc="center left", bbox_to_anchor=[1, 0.5])
    else:
        ax.set_xlabel("False Positive Rate (1-Specificity)")
        ax.set_ylabel("True Positive Rate (Sensitivity)")
        ax.plot(
            false_positive,
            true_positive,
            **updated_dict({"color": gen_colors()[0]}, style_kwds),
        )
        ax.fill_between(false_positive,
                        false_positive,
                        true_positive,
                        facecolor=color1,
                        alpha=0.1)
        ax.fill_between([0, 1], [0, 0], [0, 1], facecolor=color2, alpha=0.1)
        ax.plot([0, 1], [0, 1], color=color2)
        ax.set_title("ROC Curve")
        ax.text(
            0.995,
            0,
            # Rounding after the scaling avoids labels such as
            # 81.22999999999999%.
            "AUC = " + str(round(auc * 100, 2)) + "%",
            verticalalignment="bottom",
            horizontalalignment="right",
            fontsize=11.5,
        )
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    ax.set_axisbelow(True)
    ax.grid()
    return tablesample(values={
        "threshold": threshold,
        "false_positive": false_positive,
        "true_positive": true_positive,
    }, )
示例#8
0
def plot_acf_pacf(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    p: (int, list) = 15,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws the ACF and PACF Charts.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Response column.
ts: str
    vcolumn used as timeline. It will be to use to order the data. 
    It can be a numerical or type date like (date, datetime, timestamp...) 
    vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int/list, optional
    Int equals to the maximum number of lag to consider during the computation
    or List of the different lags to include during the computation.
    p must be positive or a list of positive integers.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions. "color" is 
    used for the markers and the zero line; "colors" is ignored.

Returns
-------
tablesample
    An object containing the result (columns "index", "acf", "pacf" and 
    "confidence"). For more information, see utilities.tablesample.
    """
    check_types([
        ("column", column, [str]),
        ("ts", ts, [str]),
        ("by", by, [list]),
        # Lists of lags are part of the documented contract.
        ("p", p, [int, float, list]),
        ("vdf", vdf, [vDataFrame]),
    ])
    # The color entries are handled separately from the other style options.
    tmp_style = {
        key: value
        for key, value in style_kwds.items()
        if key not in ("color", "colors")
    }
    color = style_kwds["color"] if "color" in style_kwds else gen_colors()[0]
    columns_check([column, ts] + by, vdf)
    by = vdf_columns_names(by, vdf)
    column, ts = vdf_columns_names([column, ts], vdf)
    acf = vdf.acf(ts=ts, column=column, by=by, p=p, show=False)
    pacf = vdf.pacf(ts=ts, column=column, by=by, p=p, show=False)
    result = tablesample(
        {
            "index": list(range(0, len(acf.values["value"]))),
            "acf": acf.values["value"],
            "pacf": pacf.values["value"],
            "confidence": pacf.values["confidence"],
        }, )
    # Same figure size inside and outside a notebook.
    fig = plt.figure(figsize=(10, 6))
    # NOTE(review): this changes the global Matplotlib settings, so later
    # figures inherit the face color too - confirm this is intended.
    plt.rcParams["axes.facecolor"] = "#FCFCFC"
    x = result.values["index"]
    confidence = result.values["confidence"]
    param = {
        "s": 90,
        "marker": "o",
        "facecolors": color,
        "edgecolors": "black",
        "zorder": 2,
    }

    def draw_panel(panel, values, title):
        # Thin bars topped by markers, a zero line and the confidence band
        # mirrored around zero.
        panel.bar(
            x,
            values,
            width=0.007 * len(x),
            color="#444444",
            zorder=1,
            linewidth=0,
        )
        panel.scatter(
            x,
            values,
            **updated_dict(
                param,
                tmp_style,
            ),
        )
        panel.plot(
            [-1] + x + [x[-1] + 1],
            [0 for elem in range(len(x) + 2)],
            color=color,
            zorder=0,
        )
        panel.fill_between(x, confidence, color="#FE5016", alpha=0.1)
        panel.fill_between(x, [-elem for elem in confidence],
                           color="#FE5016",
                           alpha=0.1)
        panel.set_title(title)

    ax1 = fig.add_subplot(211)
    # Only the upper chart gets explicit x limits.
    ax1.set_xlim(-1, x[-1] + 1)
    draw_panel(ax1, result.values["acf"], "Autocorrelation")
    ax2 = fig.add_subplot(212)
    draw_panel(ax2, result.values["pacf"], "Partial Autocorrelation")
    plt.show()
    return result
示例#9
0
def regression_plot(
    X: list,
    y: str,
    input_relation: str,
    coefficients: list,
    max_nb_points: int = 50,
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws the points of a relation together with a linear model: a line for 
one predictor, a plane in 3D for two predictors.

Parameters
----------
X: list
    Predictor columns (one or two).
y: str
    Response column.
input_relation: str
    Relation to query.
coefficients: list
    Model coefficients: the intercept followed by one coefficient per 
    predictor.
max_nb_points: int, optional
    Maximum number of rows to fetch. Rows having a NULL in one of the 
    columns are ignored.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions.

Returns
-------
ax
    Matplotlib axes object
    """
    check_types([
        ("X", X, [list]),
        ("y", y, [str]),
        ("input_relation", input_relation, [str]),
        ("coefficients", coefficients, [list]),
        ("max_nb_points", max_nb_points, [int, float]),
    ])
    param = {
        "marker": "o",
        "color": gen_colors()[0],
        "s": 50,
        "edgecolors": "black",
    }
    n_predictors = len(X)
    if n_predictors not in (1, 2):
        raise ParameterError("The number of predictors is too big.")
    limit = int(max_nb_points)
    if n_predictors == 1:
        # One predictor: 2D scatter plot and regression line.
        query = "SELECT {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL LIMIT {}".format(
            X[0], y, input_relation, X[0], y, limit)
        rows = executeSQL(query, method="fetchall", print_time_sql=False)
        if not (ax):
            fig, ax = plt.subplots()
            if isnotebook():
                fig.set_size_inches(8, 6)
            ax.set_axisbelow(True)
            ax.grid()
        xs = [float(row[0]) for row in rows]
        ys = [float(row[1]) for row in rows]
        # The line only needs its two end points.
        line_x = [min(xs), max(xs)]
        line_y = [coefficients[0] + coefficients[1] * value for value in line_x]
        ax.plot(line_x, line_y, alpha=1, color="black")
        ax.scatter(xs, ys, **updated_dict(param, style_kwds, 0))
        ax.set_xlabel(X[0])
        ax.set_ylabel(y)
        return ax
    # Two predictors: 3D scatter plot and regression plane.
    query = "(SELECT {}, {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} IS NOT NULL LIMIT {})".format(
        X[0], X[1], y, input_relation, X[0], X[1], y, limit)
    rows = executeSQL(query, method="fetchall", print_time_sql=False)
    xs = [float(row[0]) for row in rows]
    ys = [float(row[1]) for row in rows]
    zs = [float(row[2]) for row in rows]
    x_low, x_high = min(xs), max(xs)
    x_step = (x_high - x_low) / 40.0
    y_low, y_high = min(ys), max(ys)
    y_step = (y_high - y_low) / 40.0
    # The grid goes 5 steps beyond the data on each side; a constant column
    # is reduced to its single value.
    if x_step > 0:
        grid_x = arange(x_low - 5 * x_step, x_high + 5 * x_step, x_step)
    else:
        grid_x = [x_high]
    if y_step > 0:
        grid_y = arange(y_low - 5 * y_step, y_high + 5 * y_step, y_step)
    else:
        grid_y = [y_high]
    grid_x, grid_y = np.meshgrid(grid_x, grid_y)
    plane = coefficients[0] + coefficients[1] * grid_x + coefficients[2] * grid_y
    if not (ax):
        if isnotebook():
            plt.figure(figsize=(8, 6))
        ax = plt.axes(projection="3d")
    ax.plot_surface(
        grid_x, grid_y, plane, rstride=1, cstride=1, alpha=0.5, color="gray",
    )
    ax.scatter(xs, ys, zs, **updated_dict(param, style_kwds, 0))
    ax.set_xlabel(X[0])
    ax.set_ylabel(X[1])
    ax.set_zlabel(y + " = f(" + X[0] + ", " + X[1] + ")")
    return ax
示例#10
0
def plot_var(
        x: list,
        y: list,
        variable_names: list = [],
        explained_variance: tuple = (None, None),
        dimensions: tuple = (1, 2),
        bar_name: str = "",
        ax=None,
        **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws one labelled marker per variable at (x[i], y[i]), with dashed lines 
for the two axes going through the origin.

Parameters
----------
x: list
    X coordinate of each variable.
y: list
    Y coordinate of each variable.
variable_names: list, optional
    Text written above each marker. Variables without a matching name are 
    drawn without any text.
explained_variance: tuple, optional
    Ratios (between 0 and 1) appended to the axis labels as percentages. 
    A falsy element leaves the corresponding label without percentage.
dimensions: tuple, optional
    The two numbers written in the axis labels ("Dim<number>").
bar_name: str, optional
    Label of the color bar, which is only drawn when "c" is part of the 
    style parameters.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions.

Returns
-------
ax
    Matplotlib axes object
    """
    # The list returned by gen_colors() is only read: writing the user color
    # into it could alter a palette shared with other plots.
    marker_color = style_kwds.get("color", gen_colors()[0])
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(6, 6)
        ax.set_axisbelow(True)
        ax.grid()
    x_min, x_max = min(x), max(x)
    y_min, y_max = min(y), max(y)
    delta_x = (x_max - x_min) * 0.04
    delta_y = (y_max - y_min) * 0.04
    # The default 'variable_names' is empty: only label what has a name.
    for i, name in enumerate(variable_names[:len(x)]):
        ax.text(x[i],
                y[i] + delta_y,
                name,
                horizontalalignment="center")
    param = {"marker": "^", "s": 100, "edgecolors": "black"}
    if "c" not in style_kwds:
        # "c" and "color" are not set together on the same scatter call.
        param["color"] = marker_color
    img = ax.scatter(x, y, **updated_dict(param, style_kwds, 0))
    x_limits = [x_min - 5 * delta_x, x_max + 5 * delta_x]
    y_limits = [y_min - 5 * delta_y, y_max + 5 * delta_y]
    ax.plot(x_limits, [0.0, 0.0], linestyle="--", color="black")
    ax.plot([0.0, 0.0], y_limits, linestyle="--", color="black")
    ax.set_xlim(x_limits[0], x_limits[1])
    ax.set_ylim(y_limits[0], y_limits[1])

    def axis_label(dimension, variance):
        # "Dim<k> " followed by the percentage when a variance is available.
        percentage = "" if not (variance) else "({}%)".format(
            round(variance * 100, 1))
        return "Dim{} {}".format(dimension, percentage)

    ax.set_xlabel(axis_label(dimensions[0], explained_variance[0]))
    ax.set_ylabel(axis_label(dimensions[1], explained_variance[1]))
    ax.xaxis.set_ticks_position("bottom")
    ax.yaxis.set_ticks_position("left")
    if "c" in style_kwds:
        # The color bar is attached to the figure owning 'ax', which is not
        # necessarily the current pyplot figure when 'ax' was supplied.
        ax.get_figure().colorbar(img, ax=ax).set_label(bar_name)
    return ax
示例#11
0
def logit_plot(
    X: list,
    y: str,
    input_relation: str,
    coefficients: list,
    max_nb_points=50,
    ax=None,
    **style_kwds,
):
    """
    ---------------------------------------------------------------------------
    Draws a logistic model built on one or two predictors: the fitted curve
    (one predictor) or surface (two predictors), together with a sample of
    the data placed on it and coloured by the value of the response.

    Parameters
    ----------
    X: list
        One or two predictors. They are inserted as-is in the SQL query.
    y: str
        Response column; rows where it equals 0 and rows where it equals 1
        are sampled separately.
    input_relation: str
        Relation the points are read from.
    coefficients: list
        Model coefficients: constant term first, then one coefficient per
        predictor, in the same order as X.
    max_nb_points: int / float, optional
        Maximum number of points to draw; at most int(max_nb_points / 2)
        rows are fetched for each value of the response.
    ax: Matplotlib axes object, optional
        The axes to plot on. A new one is created when omitted (3D axes
        for two predictors).
    **style_kwds
        Any optional parameter to pass to the Matplotlib scatter function.

    Returns
    -------
    ax
        Matplotlib axes object

    Raises
    ------
    ParameterError
        When X does not contain exactly one or two predictors.
    """
    check_types([
        ("X", X, [list]),
        ("y", y, [str]),
        ("input_relation", input_relation, [str]),
        ("coefficients", coefficients, [list]),
        ("max_nb_points", max_nb_points, [int, float]),
    ])
    param0 = {
        "marker": "o",
        "s": 50,
        "color": gen_colors()[0],
        "edgecolors": "black",
        "alpha": 0.8,
    }
    param1 = {
        "marker": "o",
        "s": 50,
        "color": gen_colors()[1],
        "edgecolors": "black",
    }

    def logit(x):
        # Numerically stable logistic function. The naive form
        # 1 / (1 + exp(-x)) makes math.exp overflow for strongly negative
        # x, so the exponential is always taken of a non-positive number.
        if x >= 0:
            return 1 / (1 + math.exp(-x))
        exp_x = math.exp(x)
        return exp_x / (1 + exp_x)

    # Number of rows fetched for each value of the response.
    limit = int(max_nb_points / 2)
    if len(X) == 1:
        query = "(SELECT {}, {} FROM {} WHERE {} IS NOT NULL AND {} = 0 LIMIT {})".format(
            X[0], y, input_relation, X[0], y, limit)
        query += " UNION ALL (SELECT {}, {} FROM {} WHERE {} IS NOT NULL AND {} = 1 LIMIT {})".format(
            X[0], y, input_relation, X[0], y, limit)
        all_points = executeSQL(query, method="fetchall", print_time_sql=False)
        if not (ax):
            fig, ax = plt.subplots()
            if isnotebook():
                fig.set_size_inches(8, 6)
            ax.set_axisbelow(True)
            ax.grid()
        # Split the sample according to the value of the response.
        x0, x1 = [], []
        for item in all_points:
            if item[1] == 0:
                x0.append(float(item[0]))
            else:
                x1.append(float(item[0]))
        min_logit, max_logit = min(x0 + x1), max(x0 + x1)
        step = (max_logit - min_logit) / 40.0
        # The curve extends 5 steps beyond the data on both sides; a single
        # abscissa is used when all the points share the same value.
        x_logit = (arange(min_logit - 5 * step, max_logit + 5 * step, step) if
                   (step > 0) else [max_logit])
        y_logit = [
            logit(coefficients[0] + coefficients[1] * item) for item in x_logit
        ]
        ax.plot(x_logit, y_logit, alpha=1, color="black")
        all_scatter = [
            ax.scatter(
                x0,
                [
                    logit(coefficients[0] + coefficients[1] * item)
                    for item in x0
                ],
                **updated_dict(param1, style_kwds, 1),
            ),
            ax.scatter(
                x1,
                [
                    logit(coefficients[0] + coefficients[1] * item)
                    for item in x1
                ],
                **updated_dict(param0, style_kwds, 0),
            ),
        ]
        ax.set_xlabel(X[0])
        ax.set_ylabel(y)
        ax.legend(
            all_scatter,
            [0, 1],
            scatterpoints=1,
            loc="center left",
            bbox_to_anchor=[1, 0.5],
        )
        # Shrink the axes to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    elif len(X) == 2:
        query = "(SELECT {}, {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} = 0 LIMIT {})".format(
            X[0], X[1], y, input_relation, X[0], X[1], y, limit)
        query += " UNION (SELECT {}, {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} = 1 LIMIT {})".format(
            X[0], X[1], y, input_relation, X[0], X[1], y, limit)
        all_points = executeSQL(query, method="fetchall", print_time_sql=False)
        x0, x1, y0, y1 = [], [], [], []
        for item in all_points:
            if item[2] == 0:
                x0.append(float(item[0]))
                y0.append(float(item[1]))
            else:
                x1.append(float(item[0]))
                y1.append(float(item[1]))
        min_logit_x, max_logit_x = min(x0 + x1), max(x0 + x1)
        step_x = (max_logit_x - min_logit_x) / 40.0
        min_logit_y, max_logit_y = min(y0 + y1), max(y0 + y1)
        step_y = (max_logit_y - min_logit_y) / 40.0
        X_logit = (arange(min_logit_x - 5 * step_x, max_logit_x +
                          5 * step_x, step_x) if
                   (step_x > 0) else [max_logit_x])
        Y_logit = (arange(min_logit_y - 5 * step_y, max_logit_y +
                          5 * step_y, step_y) if
                   (step_y > 0) else [max_logit_y])
        X_logit, Y_logit = np.meshgrid(X_logit, Y_logit)
        Z_logit = 1 / (1 + np.exp(-(coefficients[0] + coefficients[1] *
                                    X_logit + coefficients[2] * Y_logit)))
        if not (ax):
            if isnotebook():
                plt.figure(figsize=(8, 6))
            ax = plt.axes(projection="3d")
        ax.plot_surface(X_logit,
                        Y_logit,
                        Z_logit,
                        rstride=1,
                        cstride=1,
                        alpha=0.5,
                        color="gray")
        all_scatter = [
            ax.scatter(
                x0,
                y0,
                [
                    logit(coefficients[0] + coefficients[1] * x_i +
                          coefficients[2] * y_i) for x_i, y_i in zip(x0, y0)
                ],
                **updated_dict(param1, style_kwds, 1),
            ),
            ax.scatter(
                x1,
                y1,
                [
                    logit(coefficients[0] + coefficients[1] * x_i +
                          coefficients[2] * y_i) for x_i, y_i in zip(x1, y1)
                ],
                **updated_dict(param0, style_kwds, 0),
            ),
        ]
        ax.set_xlabel(X[0])
        ax.set_ylabel(X[1])
        ax.set_zlabel(y)
        ax.legend(
            all_scatter,
            [0, 1],
            scatterpoints=1,
            loc="center left",
            bbox_to_anchor=[1.1, 0.5],
            title=y,
            ncol=2,
            fontsize=8,
        )
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    else:
        raise ParameterError("The number of predictors is too big.")
    return ax
示例#12
0
def plot_bubble_ml(
        x: list,
        y: list,
        s: list = None,
        z: list = [],
        x_label: str = "time",
        y_label: str = "score",
        title: str = "Model Type",
        reverse: tuple = (True, True),
        plt_text=True,
        ax=None,
        **style_kwds,
):
    """
    ---------------------------------------------------------------------------
    Draws a bubble plot of (x, y) points, optionally sized by s and coloured
    by category z, with four descriptive labels placed at the corners.

    Parameters
    ----------
    x: list
        Abscissas of the bubbles.
    y: list
        Ordinates of the bubbles.
    s: list, optional
        Values driving the bubble sizes: a value v different from 0 gives
        the size min(250 + 5000 * v, 1200), a value of 0 gives 1000.
    z: list, optional
        Category of each bubble. One colour and one legend entry is used
        per category. The default list is never modified.
    x_label: str, optional
        Label of the X-axis.
    y_label: str, optional
        Label of the Y-axis.
    title: str, optional
        Title of the legend (only drawn when z is given).
    reverse: tuple, optional
        Two booleans telling whether the X-axis / Y-axis must be reversed.
    plt_text: bool, optional
        If True, the axes are centered and the labels "Modest",
        "Efficient", "Performant" and "Performant & Efficient" are drawn.
    ax: Matplotlib axes object, optional
        The axes to plot on. A new figure is created when omitted.
    **style_kwds
        Any optional parameter to pass to the Matplotlib scatter function.

    Returns
    -------
    ax
        Matplotlib axes object
    """
    if s:
        s = [min(250 + 5000 * elem, 1200) if elem != 0 else 1000 for elem in s]
    # Points are sorted by category so that each category forms a
    # contiguous run.
    if z and s:
        data = [(x[i], y[i], s[i], z[i]) for i in range(len(x))]
        data.sort(key=lambda tup: str(tup[3]))
        x = [elem[0] for elem in data]
        y = [elem[1] for elem in data]
        s = [elem[2] for elem in data]
        z = [elem[3] for elem in data]
    elif z:
        data = [(x[i], y[i], z[i]) for i in range(len(x))]
        data.sort(key=lambda tup: str(tup[2]))
        x = [elem[0] for elem in data]
        y = [elem[1] for elem in data]
        z = [elem[2] for elem in data]
    colors = gen_colors()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
        ax.grid(axis="y")
        ax.set_axisbelow(True)
    if z:
        all_categories = []
        tmp_colors = []
        nb_points = len(z)
        start = 0
        idx = 0
        # Each iteration draws one run [start, stop) of identical
        # categories. Starting from the first element makes a single point
        # (or a trailing one-element run) drawn like any other.
        while start < nb_points:
            current_cat = z[start]
            stop = start + 1
            while stop < nb_points and z[stop] == current_cat:
                stop += 1
            param = {
                "alpha": 0.8,
                "marker": "o",
                # Colours are reused cyclically when there are more
                # categories than colours.
                "color": colors[idx % len(colors)],
                "edgecolors": "black",
            }
            size = s[start:stop] if s else 50
            scatter_kwds = updated_dict(param, style_kwds, idx)
            ax.scatter(x[start:stop], y[start:stop], s=size, **scatter_kwds)
            tmp_colors.append(scatter_kwds["color"])
            all_categories.append(current_cat)
            start = stop
            idx += 1
        ax.legend(
            [
                Line2D(
                    [0],
                    [0],
                    marker="o",
                    color="black",
                    markerfacecolor=color,
                    markersize=8,
                ) for color in tmp_colors
            ],
            all_categories,
            bbox_to_anchor=[1, 0.5],
            loc="center left",
            title=title,
            labelspacing=1,
        )
        # Shrink the axes to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    else:
        param = {
            "alpha": 0.8,
            "marker": "o",
            "color": colors[0],
            "edgecolors": "black"
        }
        size = s if s else 300
        ax.scatter(x, y, s=size, **updated_dict(param, style_kwds, 0))
    if reverse[0]:
        x_min, x_max = min(x), max(x)
        x_margin = 0.1 * (1 + x_max - x_min)
        # Passing the larger bound first reverses the axis.
        ax.set_xlim(x_max + x_margin, x_min - 0.1 - x_margin)
    if reverse[1]:
        y_min, y_max = min(y), max(y)
        y_margin = 0.1 * (1 + y_max - y_min)
        ax.set_ylim(y_max + y_margin, y_min - y_margin)
    if plt_text:
        ax.set_xlabel(x_label, loc="right")
        ax.set_ylabel(y_label, loc="top")
        ax.spines["left"].set_position("center")
        ax.spines["bottom"].set_position("center")
        ax.spines["right"].set_color("none")
        ax.spines["top"].set_color("none")
        x_min, x_max = min(x), max(x)
        y_min, y_max = min(y), max(y)
        delta_x = (x_max - x_min) * 0.1
        delta_y = (y_max - y_min) * 0.1
        x_high, x_low = x_max + delta_x, x_min - delta_x
        y_high, y_low = y_max + delta_y, y_min - delta_y
        # Data coordinates of the two label columns / rows; which bound is
        # used depends on whether the axis is reversed.
        x_first, x_second = (x_high, x_low) if reverse[0] else (x_low, x_high)
        y_first, y_second = (y_high, y_low) if reverse[1] else (y_low, y_high)
        corner_labels = [
            ("Modest", x_first, y_first, 130.0),
            ("Efficient", x_first, y_second, 30.0),
            ("Performant", x_second, y_first, -130.0),
            ("Performant & Efficient", x_second, y_second, -30.0),
        ]
        for k, (label, x_pos, y_pos, rotation) in enumerate(corner_labels):
            # ax.text (not plt.text) so that the labels land on the axes
            # being drawn even when they are not pyplot's current axes.
            ax.text(
                x_pos,
                y_pos,
                label,
                size=15,
                rotation=rotation,
                ha="center",
                va="center",
                bbox=dict(boxstyle="round",
                          ec=colors[k],
                          fc=colors[k],
                          alpha=0.3),
            )
    else:
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
    return ax
示例#13
0
def lof_plot(
    input_relation: str,
    columns: list,
    lof: str,
    tablesample: float = -1,
    ax=None,
    **style_kwds,
):
    """
    ---------------------------------------------------------------------------
    Draws the data points of one to three columns and, around each point,
    a circle whose size grows with its outlier score.

    Parameters
    ----------
    input_relation: str
        Relation the points are read from.
    columns: list
        One, two or three columns used as coordinates.
    lof: str
        Column holding the outlier score of each point.
    tablesample: float, optional
        Sampling percentage; only applied when strictly between 0 and 100.
    ax: Matplotlib axes object, optional
        The axes to plot on. A new one is created when omitted (3D axes
        for three columns).
    **style_kwds
        Any optional parameter to pass to the Matplotlib scatter function.
        'color' (or 'colors') may be a string or a sequence of colours:
        the first one is used for the points, the second one for the
        circles. The sequence passed by the caller is not modified.

    Returns
    -------
    ax
        Matplotlib axes object

    Raises
    ------
    Exception
        When columns does not contain one, two or three elements.
    """

    def scaled_radius(scores, factor):
        # Min-max scale the scores to [0, factor]. min / max are computed
        # once instead of once per point.
        if not scores:
            return []
        low, high = min(scores), max(scores)
        span = high - low
        if span == 0:
            # All the scores are identical: no point stands out, so no
            # circle is drawn (and the division by zero is avoided).
            return [0] * len(scores)
        return [factor * (item - low) / span for item in scores]

    check_types([
        ("input_relation", input_relation, [str]),
        ("columns", columns, [list]),
        ("lof", lof, [str]),
        ("tablesample", tablesample, [int, float]),
    ])
    tablesample = ("TABLESAMPLE({})".format(tablesample) if
                   (tablesample > 0 and tablesample < 100) else "")
    # 'color' takes precedence over 'colors'; only the one that is used is
    # removed from style_kwds. The value is copied so that appending the
    # default colours never alters the caller's own list.
    user_colors = []
    for key in ("color", "colors"):
        if key in style_kwds:
            value = style_kwds.pop(key)
            user_colors = [value] if isinstance(value, str) else list(value)
            break
    colors = user_colors + list(gen_colors())
    param = {
        "s": 50,
        "edgecolors": "black",
        "color": colors[0],
    }
    if len(columns) == 1:
        column = quote_ident(columns[0])
        query = "SELECT {}, {} FROM {} {} WHERE {} IS NOT NULL".format(
            column, lof, input_relation, tablesample, column)
        query_result = executeSQL(query,
                                  method="fetchall",
                                  print_time_sql=False)
        column1 = [item[0] for item in query_result]
        scores = [item[1] for item in query_result]
        # With a single column, all the points are drawn on the line y = 0.
        column2 = [0] * len(column1)
        if not (ax):
            fig, ax = plt.subplots()
            if isnotebook():
                fig.set_size_inches(8, 2)
            ax.set_axisbelow(True)
            ax.grid()
        ax.set_xlabel(column)
        radius = scaled_radius(scores, 2 * 1000)
        ax.scatter(
            column1,
            column2,
            label="Data points",
            **updated_dict(param, style_kwds, 0),
        )
        ax.scatter(
            column1,
            column2,
            s=radius,
            label="Outlier scores",
            facecolors="none",
            color=colors[1],
        )
    elif len(columns) == 2:
        columns = [quote_ident(column) for column in columns]
        query = "SELECT {}, {}, {} FROM {} {} WHERE {} IS NOT NULL AND {} IS NOT NULL".format(
            columns[0],
            columns[1],
            lof,
            input_relation,
            tablesample,
            columns[0],
            columns[1],
        )
        query_result = executeSQL(query,
                                  method="fetchall",
                                  print_time_sql=False)
        column1 = [item[0] for item in query_result]
        column2 = [item[1] for item in query_result]
        scores = [item[2] for item in query_result]
        if not (ax):
            fig, ax = plt.subplots()
            if isnotebook():
                fig.set_size_inches(8, 6)
            ax.set_axisbelow(True)
            ax.grid()
        ax.set_ylabel(columns[1])
        ax.set_xlabel(columns[0])
        radius = scaled_radius(scores, 1000)
        ax.scatter(
            column1,
            column2,
            label="Data points",
            **updated_dict(param, style_kwds, 0),
        )
        ax.scatter(
            column1,
            column2,
            s=radius,
            label="Outlier scores",
            facecolors="none",
            color=colors[1],
        )
    elif len(columns) == 3:
        # NOTE(review): unlike the 1- and 2-column cases, the column names
        # are not passed through quote_ident here - confirm it is intended.
        query = "SELECT {}, {}, {}, {} FROM {} {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} IS NOT NULL".format(
            columns[0],
            columns[1],
            columns[2],
            lof,
            input_relation,
            tablesample,
            columns[0],
            columns[1],
            columns[2],
        )
        query_result = executeSQL(query,
                                  method="fetchall",
                                  print_time_sql=False)
        column1 = [float(item[0]) for item in query_result]
        column2 = [float(item[1]) for item in query_result]
        column3 = [float(item[2]) for item in query_result]
        scores = [float(item[3]) for item in query_result]
        if not (ax):
            if isnotebook():
                plt.figure(figsize=(8, 6))
            ax = plt.axes(projection="3d")
        ax.set_xlabel(columns[0])
        ax.set_ylabel(columns[1])
        ax.set_zlabel(columns[2])
        radius = scaled_radius(scores, 1000)
        ax.scatter(
            column1,
            column2,
            column3,
            label="Data points",
            **updated_dict(param, style_kwds, 0),
        )
        ax.scatter(
            column1,
            column2,
            column3,
            s=radius,
            facecolors="none",
            color=colors[1],
        )
        # White panes for the three planes of the 3D axes.
        ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
        ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
        ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    else:
        raise Exception(
            "LocalOutlierFactor Plot is available for a maximum of 3 columns")
    return ax
示例#14
0
def voronoi_plot(
    clusters: list,
    columns: list,
    input_relation: str,
    max_nb_points: int = 1000,
    plot_crosses: bool = True,
    ax=None,
    **style_kwds,
):
    """
    ---------------------------------------------------------------------------
    Draws the Voronoi diagram of 2D points (the clusters) with coloured
    cells, optionally on top of a random sample of the data.

    Parameters
    ----------
    clusters: list
        List of 2D points (first element: x, second element: y) used as
        the seeds of the Voronoi cells.
    columns: list
        The two columns of the relation used as X and Y coordinates.
    input_relation: str
        Relation the data points are sampled from.
    max_nb_points: int, optional
        Maximum number of data points to draw. Nothing is read from the
        relation when it is not positive.
    plot_crosses: bool, optional
        If True, a white cross is drawn on each cluster. Crosses are only
        drawn when data points are drawn too (max_nb_points > 0).
    ax: Matplotlib axes object, optional
        The axes to plot on.
    **style_kwds
        Any optional parameter to pass to scipy's voronoi_plot_2d.
        'color' (a string or a sequence of colours) sets the cell colours.

    Returns
    -------
    ax
        The axes given as input. When no axes are given, the
        matplotlib.pyplot module is returned instead (historical
        behaviour, kept for backward compatibility).
    """
    check_types([
        ("clusters", clusters, [list]),
        ("columns", columns, [list]),
        ("input_relation", input_relation, [str]),
        ("max_nb_points", max_nb_points, [int]),
    ])
    from scipy.spatial import voronoi_plot_2d, Voronoi

    clusters_x = [elem[0] for elem in clusters]
    clusters_y = [elem[1] for elem in clusters]
    min_x, max_x = min(clusters_x), max(clusters_x)
    min_y, max_y = min(clusters_y), max(clusters_y)
    # Four far-away dummy points are added around the clusters so that the
    # regions of the real clusters have no vertex at infinity (index -1).
    dummies_point = [
        [min_x - 999, min_y - 999],
        [min_x - 999, max_y + 999],
        [max_x + 999, min_y - 999],
        [max_x + 999, max_y + 999],
    ]
    v = Voronoi(clusters + dummies_point)
    param = {"show_vertices": False}
    voronoi_plot_2d(v, ax=ax, **updated_dict(param, style_kwds))
    if not (ax):
        # Draw on pyplot's current axes (the ones the pyplot functions
        # would act on), but keep returning the pyplot module.
        result = plt
        target = plt.gca()
    else:
        # A real Axes object has no xlim / ylim / xlabel / ylabel methods;
        # everything below goes through the Axes API so that user-supplied
        # axes work too.
        result = target = ax
    target.set_xlabel(columns[0])
    target.set_ylabel(columns[1])
    colors = gen_colors()
    for idx, region in enumerate(v.regions):
        # Regions containing -1 are unbounded and cannot be filled.
        if -1 not in region:
            polygon = [v.vertices[i] for i in region]
            if "color" in style_kwds:
                if isinstance(style_kwds["color"], str):
                    color = style_kwds["color"]
                else:
                    color = style_kwds["color"][idx % len(style_kwds["color"])]
            else:
                color = colors[idx % len(colors)]
            target.fill(*zip(*polygon), alpha=0.4, color=color)
    target.plot(clusters_x, clusters_y, "ko")
    margin_x = 0.05 * (max_x - min_x)
    margin_y = 0.05 * (max_y - min_y)
    target.set_xlim(min_x - margin_x, max_x + margin_x)
    target.set_ylim(min_y - margin_y, max_y + margin_y)
    if max_nb_points > 0:
        query = "SELECT {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL ORDER BY RANDOM() LIMIT {}".format(
            columns[0],
            columns[1],
            input_relation,
            columns[0],
            columns[1],
            int(max_nb_points),
        )
        all_points = executeSQL(query, method="fetchall", print_time_sql=False)
        x = [float(item[0]) for item in all_points]
        y = [float(item[1]) for item in all_points]
        target.scatter(
            x,
            y,
            color="black",
            s=10,
            alpha=1,
            zorder=3,
        )
        if plot_crosses:
            target.scatter(
                clusters_x,
                clusters_y,
                color="white",
                s=200,
                linewidths=5,
                alpha=1,
                zorder=4,
                marker="x",
            )
    return result
示例#15
0
def svm_classifier_plot(
    X: list,
    y: str,
    input_relation: str,
    coefficients: list,
    max_nb_points: int = 500,
    ax=None,
    **style_kwds,
):
    """
    ---------------------------------------------------------------------------
    Draws the separating boundary of a linear classifier built on one, two
    or three predictors, together with a sample of the data coloured by
    the value of the response.

    The boundary is the set of points where
    coefficients[0] + coefficients[1] * X[0] + ... = 0: a vertical line
    (one predictor), a line (two predictors) or a plane (three predictors).

    Parameters
    ----------
    X: list
        One to three predictors. They are inserted as-is in the SQL query.
    y: str
        Response column; rows where it equals 0 and rows where it equals 1
        are sampled separately.
    input_relation: str
        Relation the points are read from.
    coefficients: list
        Model coefficients: constant term first, then one coefficient per
        predictor, in the same order as X.
    max_nb_points: int / float, optional
        Maximum number of points to draw; at most int(max_nb_points / 2)
        rows are fetched for each value of the response.
    ax: Matplotlib axes object, optional
        The axes to plot on. A new one is created when omitted (3D axes
        for three predictors).
    **style_kwds
        Any optional parameter to pass to the Matplotlib scatter function.

    Returns
    -------
    ax
        Matplotlib axes object

    Raises
    ------
    ParameterError
        When X does not contain one, two or three predictors.
    """
    check_types([
        ("X", X, [list]),
        ("y", y, [str]),
        ("input_relation", input_relation, [str]),
        ("coefficients", coefficients, [list]),
        ("max_nb_points", max_nb_points, [int, float]),
    ])
    param0 = {
        "marker": "o",
        "color": gen_colors()[0],
        "s": 50,
        "edgecolors": "black",
    }
    param1 = {
        "marker": "o",
        "color": gen_colors()[1],
        "s": 50,
        "edgecolors": "black",
    }
    # Number of rows fetched for each value of the response.
    limit = int(max_nb_points / 2)
    if len(X) == 1:
        query = "(SELECT {}, {} FROM {} WHERE {} IS NOT NULL AND {} = 0 LIMIT {})".format(
            X[0], y, input_relation, X[0], y, limit)
        query += " UNION ALL (SELECT {}, {} FROM {} WHERE {} IS NOT NULL AND {} = 1 LIMIT {})".format(
            X[0], y, input_relation, X[0], y, limit)
        all_points = executeSQL(query, method="fetchall", print_time_sql=False)
        if not (ax):
            fig, ax = plt.subplots()
            if isnotebook():
                fig.set_size_inches(8, 6)
            ax.set_axisbelow(True)
            ax.grid()
        x0, x1 = [], []
        for item in all_points:
            if item[1] == 0:
                x0.append(float(item[0]))
            else:
                x1.append(float(item[0]))
        # Boundary: c0 + c1 * x = 0, drawn as a vertical segment.
        boundary = -coefficients[0] / coefficients[1]
        x_svm, y_svm = [boundary, boundary], [-1, 1]
        ax.plot(x_svm, y_svm, alpha=1, color="black")
        # The points themselves are all drawn on the line y = 0.
        all_scatter = [
            ax.scatter(x0, [0 for item in x0],
                       **updated_dict(param1, style_kwds, 1)),
            ax.scatter(x1, [0 for item in x1],
                       **updated_dict(param0, style_kwds, 0)),
        ]
        ax.set_xlabel(X[0])
        ax.legend(
            all_scatter,
            [0, 1],
            scatterpoints=1,
            loc="center left",
            bbox_to_anchor=[1, 0.5],
        )
        # Shrink the axes to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    elif len(X) == 2:
        query = "(SELECT {}, {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} = 0 LIMIT {})".format(
            X[0], X[1], y, input_relation, X[0], X[1], y, limit)
        query += " UNION (SELECT {}, {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} = 1 LIMIT {})".format(
            X[0], X[1], y, input_relation, X[0], X[1], y, limit)
        all_points = executeSQL(query, method="fetchall", print_time_sql=False)
        if not (ax):
            fig, ax = plt.subplots()
            if isnotebook():
                fig.set_size_inches(8, 6)
            ax.set_axisbelow(True)
            ax.grid()
        x0, x1, y0, y1 = [], [], [], []
        for item in all_points:
            if item[2] == 0:
                x0.append(float(item[0]))
                y0.append(float(item[1]))
            else:
                x1.append(float(item[0]))
                y1.append(float(item[1]))
        min_svm, max_svm = min(x0 + x1), max(x0 + x1)
        # Boundary: c0 + c1 * x + c2 * y = 0, solved for y at both ends of
        # the range of x.
        x_svm, y_svm = (
            [min_svm, max_svm],
            [
                -(coefficients[0] + coefficients[1] * min_svm) /
                coefficients[2],
                -(coefficients[0] + coefficients[1] * max_svm) /
                coefficients[2],
            ],
        )
        ax.plot(x_svm, y_svm, alpha=1, color="black")
        all_scatter = [
            ax.scatter(x0, y0, **updated_dict(param1, style_kwds, 1)),
            ax.scatter(x1, y1, **updated_dict(param0, style_kwds, 0)),
        ]
        ax.set_xlabel(X[0])
        ax.set_ylabel(X[1])
        ax.legend(
            all_scatter,
            [0, 1],
            scatterpoints=1,
            loc="center left",
            bbox_to_anchor=[1, 0.5],
        )
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    elif len(X) == 3:
        query = "(SELECT {}, {}, {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} IS NOT NULL AND {} = 0 LIMIT {})".format(
            X[0],
            X[1],
            X[2],
            y,
            input_relation,
            X[0],
            X[1],
            X[2],
            y,
            limit,
        )
        query += " UNION (SELECT {}, {}, {}, {} FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} IS NOT NULL AND {} = 1 LIMIT {})".format(
            X[0],
            X[1],
            X[2],
            y,
            input_relation,
            X[0],
            X[1],
            X[2],
            y,
            limit,
        )
        all_points = executeSQL(query, method="fetchall", print_time_sql=False)
        x0, x1, y0, y1, z0, z1 = [], [], [], [], [], []
        for item in all_points:
            if item[3] == 0:
                x0.append(float(item[0]))
                y0.append(float(item[1]))
                z0.append(float(item[2]))
            else:
                x1.append(float(item[0]))
                y1.append(float(item[1]))
                z1.append(float(item[2]))
        min_svm_x, max_svm_x = min(x0 + x1), max(x0 + x1)
        step_x = (max_svm_x - min_svm_x) / 40.0
        min_svm_y, max_svm_y = min(y0 + y1), max(y0 + y1)
        step_y = (max_svm_y - min_svm_y) / 40.0
        # The grid extends 5 steps beyond the data on both sides; a single
        # value is used when all the points share the same coordinate.
        X_svm = (arange(min_svm_x - 5 * step_x, max_svm_x +
                        5 * step_x, step_x) if (step_x > 0) else [max_svm_x])
        Y_svm = (arange(min_svm_y - 5 * step_y, max_svm_y +
                        5 * step_y, step_y) if (step_y > 0) else [max_svm_y])
        X_svm, Y_svm = np.meshgrid(X_svm, Y_svm)
        # Boundary: c0 + c1 * x + c2 * y + c3 * z = 0, solved for z, in
        # the same way as the 1- and 2-predictor cases solve for their
        # last variable.
        Z_svm = -(coefficients[0] + coefficients[1] * X_svm +
                  coefficients[2] * Y_svm) / coefficients[3]
        if not (ax):
            if isnotebook():
                plt.figure(figsize=(8, 6))
            ax = plt.axes(projection="3d")
        ax.plot_surface(X_svm,
                        Y_svm,
                        Z_svm,
                        rstride=1,
                        cstride=1,
                        alpha=0.5,
                        color="gray")
        param0["alpha"] = 0.8
        all_scatter = [
            ax.scatter(x0, y0, z0, **updated_dict(param1, style_kwds, 1)),
            ax.scatter(x1, y1, z1, **updated_dict(param0, style_kwds, 0)),
        ]
        ax.set_xlabel(X[0])
        ax.set_ylabel(X[1])
        ax.set_zlabel(X[2])
        ax.legend(
            all_scatter,
            [0, 1],
            scatterpoints=1,
            title=y,
            loc="center left",
            bbox_to_anchor=[1.1, 0.5],
            ncol=1,
            fontsize=8,
        )
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    else:
        raise ParameterError("The number of predictors is too big.")
    return ax
示例#16
0
    def plot_var(self,
                 dimensions: tuple = (1, 2),
                 method: str = "auto",
                 ax=None,
                 **style_kwds):
        """
        ---------------------------------------------------------------------------
        Draws the MCA (multiple correspondence analysis) graph: the model's
        variables placed on two of its components.

        Parameters
        ----------
        dimensions: tuple, optional
            Tuple of two IDs of the model's components.
        method: str, optional
            Method used to draw the plot.
                auto   : Only the variables are displayed.
                cos2   : The sum of the cos2 of the two components is used
                         to colour the variables (CMAP).
                contrib: The average contribution (in percent) to the two
                         components is used to colour the variables (CMAP).
        ax: Matplotlib axes object, optional
            The axes to plot on.
        **style_kwds
            Any optional parameter to pass to the Matplotlib functions.

        Returns
        -------
        ax
            Matplotlib axes object
        """
        check_types([
            ("dimensions", dimensions, [tuple]),
            ("method", method, ["auto", "cos2", "contrib"]),
        ])
        first_pc = "PC{}".format(dimensions[0])
        x = self.components_[first_pc]
        second_pc = "PC{}".format(dimensions[1])
        y = self.components_[second_pc]
        n = len(self.cos2_[first_pc])
        # Value attached to each variable, later mapped to a colour.
        if method == "cos2":
            c = []
            for i in range(n):
                c.append(self.cos2_[first_pc][i] + self.cos2_[second_pc][i])
        elif method == "contrib":
            sum_1 = sum(self.cos2_[first_pc])
            sum_2 = sum(self.cos2_[second_pc])
            c = []
            for i in range(n):
                c.append(0.5 * 100 *
                         (self.cos2_[first_pc][i] / sum_1 +
                          self.cos2_[second_pc][i] / sum_2))
        if method in ("cos2", "contrib"):
            style_kwds["c"] = c
            # A default colour map is only built when none was supplied.
            if "cmap" not in style_kwds:
                from verticapy.plot import gen_cmap, gen_colors

                style_kwds["cmap"] = gen_cmap(
                    color=[gen_colors()[k] for k in range(3)])
        explained_variance = self.explained_variance_["explained_variance"]
        # Component IDs start at 1 whereas the list is 0-indexed.
        variance_pair = (
            explained_variance[dimensions[0] - 1],
            explained_variance[dimensions[1] - 1],
        )
        return plot_var(x, y, self.X, variance_pair, dimensions, method, ax,
                        **style_kwds)