def outlier(self, x_df, y_se, ntop):
    """Plot outlier diagnostics and print the top-ranked observations.

    The descriptor matrix obtained from ``self.prepare`` is normalized,
    checked with ``self.mc_check`` and then passed to the ``st`` helpers
    to obtain residuals, leverages, studentized residuals and DFFITS,
    which are handed to the ``plots`` helpers.  Finally a table ranking
    the observations by leverage and by |DFFITS| (largest first) is
    printed, truncated to ``ntop`` rows.

    Args:
        x_df: Descriptor table; forwarded to ``self.prepare``.
        y_se: Target values; must expose ``.values`` (presumably a
            pandas Series -- confirm against callers).
        ntop: Number of leading rows of the ranking table to print.
    """
    x, _labels, m, n = self.prepare(x_df)
    x, _norm = st.normalize(x)
    y = np.array(y_se.values.copy(), dtype="float64")
    self.mc_check(x)

    # Only the first element of each residual pair is plotted here.
    resid, _ = st.res(x, y)
    resid_pre, _ = st.res_pre(x, y)
    _hat, lev = st.hat(x)
    plots.lev_plot(lev, m, n)
    plots.res_plot(resid, resid_pre)

    stud_int = st.int_student(x, y)
    stud_ext = st.ext_student(x, y)
    dff = st.dffits(x, y)
    plots.res_plot2(stud_int, stud_ext, dff, m, n)

    # Ascending sort orders; negative positions below walk them from the
    # largest value downwards.
    lev_order = np.argsort(lev)
    dff_order = np.argsort(np.abs(dff))

    print(f"Outliers (top {ntop})")
    rows = [
        [
            np.round(lev[lev_order[-rank]], 4),
            lev_order[-rank],
            np.round(np.abs(dff[dff_order[-rank]]), 4),
            dff_order[-rank],
        ]
        for rank in range(1, m + 1)
    ]
    results_df = pd.DataFrame(
        rows,
        columns=["Leverages", "index", "|DFFITS|", "index"],
        index=range(m),
    )
    print(results_df[:ntop])
def _xty(x, y):
    """Rank the columns of ``x`` by the magnitude of ``x_norm.T @ y``.

    ``x`` is normalized with ``st.normalize`` and the product of its
    transpose with ``y`` is computed.

    Returns:
        A 2-D array with one row per column of ``x`` holding
        ``[column index, product, |product|]``, ordered by ``|product|``
        from largest to smallest.
    """
    _, n_cols = x.shape
    x_norm, _norm = st.normalize(x)
    proj = np.dot(x_norm.T, y)

    table = np.array([[j, proj[j], np.abs(proj[j])] for j in range(n_cols)])
    # argsort is ascending; reversing it gives the descending order.
    descending = np.argsort(table[:, 2])[::-1]
    return table[descending, :]
def ex_plot(x, y, cmb_list, r_or_q):
    """Plot R2/Q2 curves and a coefficient heatmap for a list of models.

    Each entry of ``cmb_list`` is a collection of column indices of ``x``
    defining one model.  For every model the coefficients (``st.beta``),
    R2 (``st.r2``) and Q2 (``st.q2``) are computed on the normalized
    descriptor matrix.  Two figures are shown: the R2/Q2 scores against
    the model's position in ``cmb_list``, and a heatmap of the
    coefficients (descriptors on the vertical axis, models on the
    horizontal axis).

    Args:
        x: 2-D descriptor matrix.
        y: Target values passed through to the ``st`` helpers.
        cmb_list: Sequence of column-index combinations, one per model.
        r_or_q: Text inserted into the axis labels naming the score the
            models were sorted by.
    """
    x, _norm = st.normalize(x)
    n_desc = x.shape[1]

    coef_rows, r2_scores, q2_scores = [], [], []
    for cmb in cmb_list:
        coefs = st.beta(x[:, cmb], y)
        # Scatter the model's coefficients into a full-width row; the
        # descriptors that are not part of the model stay at 0.
        row = [0] * n_desc
        for pos, col in enumerate(cmb):
            row[col] = coefs[pos]
        row.reverse()
        coef_rows.append(row)
        r2_scores.append(st.r2(x[:, cmb], y))
        q2_scores.append(st.q2(x[:, cmb], y))

    # Score curves.
    plt.figure(figsize=(8, 5))
    plt.rcParams["font.size"] = 15
    ranks = range(len(cmb_list))
    for scores, curve_label in ((r2_scores, "R2"), (q2_scores, "Q2")):
        plt.plot(ranks, scores, "o-", alpha=0.8, label=curve_label)
    plt.legend(loc="best", prop={"size": 12})
    plt.xlabel(f"Ranking of models (sorted by {r_or_q} score)")
    plt.ylabel("Score")
    plt.grid()
    plt.show()

    # Coefficient heatmap.  The rows were reversed above, so the column
    # labels run from n_desc - 1 down to 0 to match.
    coef_df = pd.DataFrame(coef_rows, columns=list(range(n_desc - 1, -1, -1)))
    plt.figure(figsize=(10, 5))
    sb.heatmap(
        coef_df.T,
        square=False,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        xticklabels=10,
        linecolor="black",
        cbar=True,
    )
    plt.xlabel(
        f"Ranking of models (sorted by {r_or_q} score. beta value (color) is normalized.)"
    )
    plt.ylabel("Descriptors")
    plt.show()
def vector_projection(self, x_df, y_se, sort, normalize):
    """Fit the model, print summary statistics and a per-descriptor table.

    The descriptor matrix from ``self.prepare`` is normalized and checked
    with ``self.mc_check``.  Coefficients, p-values and the per-descriptor
    quantities returned by ``st.giy2`` and ``st.tri2`` are computed, a
    summary of error/score statistics is printed, and a results table is
    printed (optionally sorted).

    Side effects:
        ``self.beta`` is set to the coefficients divided by the
        normalization factors (regardless of ``normalize``), and
        ``self.hy`` to a Series named ``"hy"`` holding ``x_norm @ b_norm``
        indexed like ``y_se``.

    Args:
        x_df: Descriptor table; forwarded to ``self.prepare``.
        y_se: Target values; must expose ``.values`` and ``.index``
            (presumably a pandas Series -- confirm against callers).
        sort: Column label(s) of the results table to sort by in
            descending order, or ``None`` to keep the original order.
        normalize: If truthy, the table shows the coefficients for the
            normalized descriptors; otherwise the rescaled ones.
    """
    x, labs_list, m, n = self.prepare(x_df)
    y = np.array(y_se.values.copy(), dtype="float64")
    x, norm = st.normalize(x)
    self.mc_check(x)

    # Only the second element of each residual pair is reported below.
    _, e2 = st.res(x, y)
    _, eq2 = st.res_pre(x, y)

    b_norm = st.beta(x, y)
    b = b_norm / norm
    beta = b_norm if normalize else b
    log10p = np.log10(st.p_val(x, y))

    # Per-descriptor statistics, each computed from x with column i removed.
    giy2_list = [st.giy2(x, np.delete(x, i, axis=1), y) for i in range(n)]
    tri2_list = [st.tri2(np.delete(x, i, axis=1), x[:, i]) for i in range(n)]

    print(f" e2 = {e2: .6f}")
    print(f" mse = {e2/m: .6f}")
    print(f" rmse = {np.sqrt(e2/m): .6f}")
    print(f" R2 = {st.r2(x,y): .6f}")
    print(f" eq2 = {eq2: .6f}")
    print(f" mseq = {eq2/m: .6f}")
    print(f" rmseq = {np.sqrt(eq2/m): .6f}")
    print(f" Q2 = {st.q2(x,y): .6f}")
    print()
    print(
        f" TR2, TQ2, AIC = {st.tr2(x,y): .6f}, {st.tq2(x,y): .6f}, {st.aic(x,y): .6f}"
    )
    print()

    nr = 5
    results_df = pd.DataFrame(
        {
            "Label": labs_list,
            "b": np.round(beta, nr),
            "|b|": np.round(np.abs(beta), nr),
            "-log10(p)": np.round(-log10p, nr),
            "G2": np.round(giy2_list, nr),
            "TRi2": np.round(tri2_list, 10),
        },
        columns=["Label", "b", "|b|", "-log10(p)", "G2", "TRi2"],
        index=range(1, n + 1),
    )
    # Identity test: `sort == None` would compare elementwise for
    # array-like values and make the truth test ambiguous.
    if sort is not None:
        results_df = results_df.sort_values(by=sort, ascending=False).reset_index()
    print(results_df)

    self.beta = b
    self.hy = pd.Series(np.dot(x, b_norm), name="hy", index=y_se.index)
def hypervolume(x):
    """Return ``sqrt(det(Xn.T @ Xn))`` for the normalized matrix ``Xn``.

    ``Xn`` is the first value returned by ``st.normalize(x)``.
    """
    x_norm, _ = st.normalize(x)
    gram = np.dot(x_norm.T, x_norm)
    return np.sqrt(np.linalg.det(gram))
def find_multicollinearity(x_df, normalize, tole1, tole2):
    """Report linear dependencies among descriptors and return a basis.

    The descriptor matrix is normalized and passed to ``find_ints``, which
    yields a solution matrix, a column ordering and a rank ``rk``.  The
    first ``rk`` entries of the ordering are treated as basis descriptors
    and the remainder as extra descriptors.  For every extra descriptor a
    relation built by ``model_form`` is printed, ``find_subspace`` is
    called, and the labels of the basis descriptors are printed and
    returned.

    Args:
        x_df: Descriptor table exposing ``.columns`` and ``.values``.
        normalize: If truthy, the printed relations use the coefficients
            for the normalized descriptors; otherwise they are converted
            back to the original descriptor scale.
        tole1: Tolerance forwarded to ``find_ints``.
        tole2: Tolerance forwarded to ``find_ints``.

    Returns:
        List of labels of the ``rk`` basis descriptors.
    """
    labs = x_df.columns.tolist()
    x = np.array(x_df.values.copy(), dtype="float64")
    # The descriptors are always normalized once, whatever `normalize`
    # says, to improve the accuracy of the calculation.
    x_norm, norm = st.normalize(x)
    m, n = x.shape
    print(f"Shape: X ({m}, {n})")

    c_mat, new_order, rk = find_ints(x_norm, tole1, tole2)
    base_list = new_order[:rk]
    extr_list = new_order[rk:]
    spac_list = [*extr_list, base_list]
    print("Space index: [ extra basis, [ basis ] ]")
    print(spac_list)

    # Notation:
    #   X' = XD          X' (m,n): normalized descriptor matrix,
    #                    D (n,n): normalization operator.
    #   X'_rref = RX'Q   X'_rref (m,n): reduced row echelon form (rref),
    #                    R (m,m): elementary row operation,
    #                    Q (n,n): elementary column operation
    #                             (treated as an orthogonal matrix).
    #   X'_rref C_rref = 0, where C_rref is a solution matrix.
    # From (R^-1) X'_rref (Q^-1) Q C_rref = 0 it follows that
    # X' Q C_rref = 0, hence
    #   the solution of X'C' = 0 is Q C_rref, and
    #   the solution of XC   = 0 is D Q C_rref.
    # This routine reports solutions for XQ or X'Q (not X or X'), i.e.
    # C_rref for normalized coefficients, or (Q^-1) D Q C_rref for the
    # original coefficients.
    if normalize:
        # Normalized coefficients: C_rref^T, solution vectors for X'Q.
        dc_mat_t = c_mat.T
    else:
        # Original coefficients: ((Q^-1) D Q C_rref)^T, solution vectors
        # for XQ.
        perm = q_matrix(new_order)  # Q
        perm_inv = perm.T  # Q^-1 (Q^T because Q is orthogonal)
        permuted = np.dot(perm, c_mat)  # Q C_rref
        scale = np.linalg.inv(np.diag(norm))  # D
        scaled = np.dot(scale, permuted)  # D Q C_rref
        dc_mat_t = np.dot(perm_inv, scaled).T

    ns = n - rk

    # Print the correlation forms.
    print("\nThe form of multi-correlated descriptors")
    for i in range(ns):
        y_lab = labs[new_order[rk + i]]
        x_labs = [labs[idx] for idx in base_list]  # labels in X'Q order
        # Scale the solution vector by the negated coefficient of the
        # dependent descriptor before formatting.
        form = model_form(y_lab, x_labs, -dc_mat_t[i] / dc_mat_t[i, rk + i])
        print(f"{i+1} : {form}")
    print("")

    # Make the subspace list.
    find_subspace(labs, c_mat.T, new_order, rk)

    lid_list = [labs[idx] for idx in base_list]
    print(f"Temporal linearly independent descriptors (LIDs): {rk}")
    print(f"{lid_list}\n")
    return lid_list