Пример #1
0
    def outlier(self, x_df, y_se, ntop):
        """Draw diagnostic plots and print the most extreme samples.

        The descriptor table goes through ``self.prepare`` and
        ``st.normalize`` and is checked with ``self.mc_check``.  The values
        returned by the ``st`` helpers are then handed to the ``plots``
        helpers.  Finally the samples are ranked twice, once by the values
        reported as "Leverages" and once by |DFFITS| (both descending), and
        the first ``ntop`` rows of that table are printed.

        Args:
            x_df: Descriptor table accepted by ``self.prepare``.
            y_se: Target values; only ``y_se.values`` is read.
            ntop: Number of ranking rows to print.
        """
        x, _labs, m, n = self.prepare(x_df)
        x, _norm = st.normalize(x)
        y = np.array(y_se.values.copy(), dtype="float64")
        self.mc_check(x)

        # Only the first element of each residual pair is plotted here.
        resid, _ = st.res(x, y)
        resid_pre, _ = st.res_pre(x, y)
        _, lev = st.hat(x)
        plots.lev_plot(lev, m, n)
        plots.res_plot(resid, resid_pre)
        stud_int = st.int_student(x, y)
        stud_ext = st.ext_student(x, y)
        dff = st.dffits(x, y)

        plots.res_plot2(stud_int, stud_ext, dff, m, n)

        # argsort is ascending, so negative positions walk from the largest
        # value downwards.
        abs_dff = np.abs(dff)
        lev_order = np.argsort(lev)
        dff_order = np.argsort(abs_dff)

        print(f"Outliers (top {ntop!s})")
        ranking = [
            [
                np.round(lev[lev_order[-k]], 4),
                lev_order[-k],
                np.round(abs_dff[dff_order[-k]], 4),
                dff_order[-k],
            ]
            for k in range(1, m + 1)
        ]

        # The two "index" columns belong to the value column on their left.
        results_df = pd.DataFrame(
            ranking,
            columns=["Leverages", "index", "|DFFITS|", "index"],
            index=range(m),
        )
        print(results_df[:ntop])
Пример #2
0
def _xty(x, y):
    """Rank the columns of ``x`` by the magnitude of their product with ``y``.

    ``x`` is normalized with ``st.normalize`` and the product
    ``x_norm.T . y`` is formed.  The result is a table with one row per
    column of ``x``: ``[column index, product, |product|]``.

    Args:
        x: 2-D descriptor matrix (its shape is unpacked as two values).
        y: Target vector multiplied against the normalized columns.

    Returns:
        numpy.ndarray: The table with rows ordered by ``|product|``,
        largest first.
    """
    _, n = x.shape
    x_norm, _ = st.normalize(x)
    xty = np.dot(x_norm.T, y)

    table = np.array([[col, xty[col], np.abs(xty[col])] for col in range(n)])

    # argsort is ascending; reversing it gives the descending order.
    order = np.argsort(table[:, 2])
    return table[order[::-1], :]
Пример #3
0
def ex_plot(x, y, cmb_list, r_or_q):
    """Plot R2/Q2 curves and a coefficient heatmap for a list of models.

    Each entry of ``cmb_list`` selects columns of the normalized ``x``.  For
    every entry the coefficients from ``st.beta`` are scattered into a
    length-``n`` row (zero for unselected columns) and the ``st.r2`` and
    ``st.q2`` scores are recorded.  Two figures are shown: the score curves,
    and a heatmap of the coefficient rows.

    Args:
        x: 2-D descriptor matrix; normalized with ``st.normalize`` first.
        y: Target values passed to the ``st`` helpers.
        cmb_list: Sequence of column-index collections, one per model.
        r_or_q: Text inserted into the x-axis captions.
    """
    x, _ = st.normalize(x)
    n = x.shape[1]

    coef_rows = []
    r2_scores = []
    q2_scores = []
    for cmb in cmb_list:
        coefs = st.beta(x[:, cmb], y)
        row = [0] * n
        for pos, col in enumerate(cmb):
            row[col] = coefs[pos]
        # Rows are stored reversed to match the reversed column labels below.
        coef_rows.append(row[::-1])
        r2_scores.append(st.r2(x[:, cmb], y))
        q2_scores.append(st.q2(x[:, cmb], y))

    # Figure 1: R2 and Q2 against model rank.
    ranks = range(len(cmb_list))
    plt.figure(figsize=(8, 5))
    plt.rcParams["font.size"] = 15
    plt.plot(ranks, r2_scores, "o-", alpha=0.8, label="R2")
    plt.plot(ranks, q2_scores, "o-", alpha=0.8, label="Q2")
    plt.legend(loc="best", prop={"size": 12})
    plt.xlabel(f"Ranking of models (sorted by {r_or_q} score)")
    plt.ylabel("Score")
    plt.grid()
    plt.show()

    # Figure 2: heatmap with descriptors as rows and models as columns.
    df = pd.DataFrame(coef_rows, columns=list(range(n - 1, -1, -1)))

    plt.figure(figsize=(10, 5))
    sb.heatmap(
        df.T,
        square=False,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        xticklabels=10,
        linecolor="black",
        cbar=True,
    )
    plt.xlabel(
        f"Ranking of models (sorted by {r_or_q} score. beta value (color) is normalized.)"
    )
    plt.ylabel("Descriptors")

    plt.show()
Пример #4
0
    def vector_projection(self, x_df, y_se, sort, normalize):
        """Fit the model on normalized descriptors and print a summary.

        The descriptor table goes through ``self.prepare`` and
        ``st.normalize`` and is checked with ``self.mc_check``.  Fit
        statistics from the ``st`` helpers are printed, followed by a
        per-descriptor table with the columns "Label", "b", "|b|",
        "-log10(p)", "G2" and "TRi2".

        Side effects:
            ``self.beta`` is set to the coefficients divided by the norms
            returned by ``st.normalize`` (regardless of ``normalize``).
            ``self.hy`` is set to a Series named "hy", indexed like
            ``y_se``, holding the product of the normalized descriptors and
            the normalized coefficients.

        Args:
            x_df: Descriptor table accepted by ``self.prepare``.
            y_se: Target values; ``y_se.values`` and ``y_se.index`` are read.
            sort: Column name(s) passed to ``DataFrame.sort_values`` to order
                the table in descending order, or ``None`` to keep the
                descriptor order.
            normalize: If truthy the table shows the coefficients for the
                normalized descriptors; otherwise the rescaled ones.
        """
        x, labs_list, m, n = self.prepare(x_df)
        y = np.array(y_se.values.copy(), dtype="float64")
        x, norm = st.normalize(x)
        self.mc_check(x)

        # Only the second element of each pair is reported below.
        _, e2 = st.res(x, y)
        _, eq2 = st.res_pre(x, y)
        b_norm = st.beta(x, y)
        b = b_norm / norm

        beta = b_norm if normalize else b

        p_val = st.p_val(x, y)
        log10p = np.log10(p_val)

        # Per-descriptor statistics computed with descriptor i left out.
        giy2_list = [
            st.giy2(x, np.delete(x, i, axis=1), y) for i in range(n)
        ]
        tri2_list = [
            st.tri2(np.delete(x, i, axis=1), x[:, i]) for i in range(n)
        ]

        print(f"  e2 = {e2: .6f}")
        print(f"  mse = {e2/m: .6f}")
        print(f"  rmse = {np.sqrt(e2/m): .6f}")
        print(f"  R2 = {st.r2(x,y): .6f}")
        print(f"  eq2 = {eq2: .6f}")
        print(f"  mseq = {eq2/m: .6f}")
        print(f"  rmseq = {np.sqrt(eq2/m): .6f}")
        print(f"  Q2 = {st.q2(x,y): .6f}")
        print()
        print(
            f"  TR2, TQ2, AIC = {st.tr2(x,y): .6f},  {st.tq2(x,y): .6f},  {st.aic(x,y): .6f}"
        )
        print()

        nr = 5
        results_df = pd.DataFrame(
            {
                "Label": labs_list,
                "b": np.round(beta, nr),
                "|b|": np.round(np.abs(beta), nr),
                "-log10(p)": np.round(-log10p, nr),
                "G2": np.round(giy2_list, nr),
                "TRi2": np.round(tri2_list, 10)
            },
            columns=["Label", "b", "|b|", "-log10(p)", "G2", "TRi2"],
            index=range(1, n + 1))
        # Identity test: "== None" would be evaluated element-wise for
        # array-like values and is discouraged by PEP 8.
        if sort is not None:
            results_df = results_df.sort_values(by=sort,
                                                ascending=False).reset_index()
        print(results_df)
        self.beta = b
        hy = np.dot(x, b_norm)
        self.hy = pd.Series(hy, name="hy", index=y_se.index)
Пример #5
0
def hypervolume(x):
    """Return ``sqrt(det(Xn.T . Xn))`` for the normalized matrix ``Xn``.

    Args:
        x: Descriptor matrix; normalized with ``st.normalize`` before use.

    Returns:
        The square root of the determinant of the Gram matrix of the
        normalized columns.
    """
    x_norm, _ = st.normalize(x)
    gram = np.dot(x_norm.T, x_norm)
    return np.sqrt(np.linalg.det(gram))
Пример #6
0
def find_multicollinearity(x_df, normalize, tole1, tole2):
    """Report linear dependencies among descriptors and return a basis.

    The descriptor matrix is normalized with ``st.normalize`` and handed to
    ``find_ints``, which returns a solution matrix, a column ordering and a
    rank ``rk``.  The first ``rk`` entries of the ordering are treated as
    basis descriptors and the rest as extra ones.  One relation per extra
    descriptor is formatted by ``model_form`` and printed, ``find_subspace``
    is called, and the labels of the basis descriptors are returned.

    Args:
        x_df: DataFrame of descriptors; its column names are the labels.
        normalize: If truthy, coefficients are reported for the normalized
            descriptors; otherwise they are rescaled for the original ones.
        tole1: Forwarded unchanged to ``find_ints``.
        tole2: Forwarded unchanged to ``find_ints``.

    Returns:
        list: Labels of the ``rk`` basis descriptors.
    """
    labs = x_df.columns.tolist()
    x = np.array(x_df.values.copy(), dtype="float64")
    # The descriptors are always normalized once, whatever `normalize` says,
    # to improve the accuracy of the calculation.
    x_norm, norm = st.normalize(x)
    m, n = x.shape
    print(f"Shape: X ({m}, {n})")

    c_mat, new_order, rk = find_ints(x_norm, tole1, tole2)

    base_list, extr_list = new_order[:rk], new_order[rk:]
    spac_list = extr_list.copy()
    spac_list.append(base_list)
    print("Space index: [ extra basis, [ basis ] ]")
    print(spac_list)

    # Notation used below:
    #   X' = X D            X' (m,n): normalized descriptor matrix,
    #                       D  (n,n): normalizing operator.
    #   X'_rref = R X' Q    X'_rref (m,n): reduced row echelon form of X',
    #                       R (m,m): elementary row operations,
    #                       Q (n,n): elementary column operations
    #                                (assumed to be an orthogonal matrix).
    #   X'_rref C_rref = 0  C_rref: solution matrix.
    # From (R^-1) X'_rref (Q^-1) Q C_rref = 0 it follows that X' Q C_rref = 0,
    # so Q C_rref solves X' C' = 0 and D Q C_rref solves X C = 0.
    # This program works with the solutions for XQ or X'Q (instead of X or
    # X'), so the result is C_rref for normalized coefficients and
    # (Q^-1) D Q C_rref for original coefficients.

    if normalize:
        # C_rref^T: solution vectors for X'Q.
        dc_mat_t = c_mat.T
    else:
        q_mat = q_matrix(new_order)  # Q
        qc_mat = np.dot(q_mat, c_mat)  # Q C_rref
        d_mat = np.linalg.inv(np.diag(norm))  # D
        # Q^-1 is taken as Q^T because Q is assumed to be orthogonal.
        # ((Q^-1) D Q C_rref)^T: solution vectors for XQ.
        dc_mat_t = np.dot(q_mat.T, np.dot(d_mat, qc_mat)).T

    # Print one relation per extra descriptor.
    print("\nThe form of multi-correlated descriptors")
    for i in range(n - rk):
        dep = rk + i
        y_lab = labs[new_order[dep]]
        x_labs = [labs[new_order[j]] for j in range(rk)]  # columns of X'Q
        form = model_form(y_lab, x_labs, -dc_mat_t[i] / dc_mat_t[i, dep])
        print(f"{i+1} : {form}")
    print("")

    # Build the subspace list.
    find_subspace(labs, c_mat.T, new_order, rk)

    lid_list = [labs[bi] for bi in base_list]
    print(f"Temporal linearly independent descriptors (LIDs): {rk}")
    print(f"{lid_list}\n")
    return lid_list