# Keep the string-typed columns (column names taken from tipos3t's header).
df_str = df_v2[list(tipos3t.head(0))]

# In[]
# Standardize the numeric data for PCA (zero mean, unit variance per column).
df_num_norm = StandardScaler().fit_transform(df_num)
df_num_norm = pd.DataFrame(df_num_norm, columns=list(tipos2t.head()))

# In[]
# Covariance matrix, correlations, linear-dependence heatmap and condition number.
cov_df = df_num_norm.cov()
# Total (global) variance = trace of the covariance matrix.
var_global = sum(np.diag(cov_df))
# Determinant near 0 signals (multi)collinearity among the variables.
det = np.linalg.det(cov_df)
corr_df = df_num_norm.corr()
sns.heatmap(corr_df, center=0, cmap='Blues_r')
# Condition number of the covariance matrix: large values indicate
# ill-conditioning / strong linear dependence.
cond_cov = np.linalg.cond(cov_df)

# In[]
# Outlier identification (robust Mahalanobis distance, centered on the
# median instead of the mean) and removal of the top 10%.
a_rob = []
media_num_norm = np.array(df_num_norm.mean())
mediana_num_norm = np.array(df_num_norm.median())
inv_cov = np.linalg.inv(np.array(cov_df))
for i in range(len(df_num_norm.index)):
    # Robust distance of row i from the column-wise median.
    b_rob = distance.mahalanobis(
        np.array(df_num_norm.iloc[i, :]), mediana_num_norm, inv_cov)
    # NOTE(review): the source chunk ends here mid-loop; presumably the
    # original continues with a_rob.append(b_rob) — confirm against the
    # full file before relying on a_rob.
# Assemble the full frame: target column joined with the feature columns,
# then attach the human-readable target names by merging on the target code.
df = y_var_df.join(X_vars_df)
df = df.merge(target_names_df, left_on=df.target, right_index=True)

# %%
"""
Standardize X Data
"""
# Zero mean / unit variance per feature, keeping the original column labels.
X_normalized = StandardScaler().fit_transform(X_vars_df)
X_normalized = pd.DataFrame(X_normalized, columns=X_vars_df.columns)

# %%
"""
Exploratory analysis
"""
# Obtain the correlation matrix.
correlation_matrix = X_normalized.corr()

# Extract eigenvalues and eigenvectors from the correlation matrix.
# Instantiate PCA object selecting all possible dimensions,
# i.e. len(original_variables), so no variance is discarded yet.
pca = PCA(n_components=len(X_vars_df.columns))
pc_matrix = pca.fit_transform(X_normalized)

# Extract eigenvalues and eigenvectors from class properties and store in dfs.
# --Get index for eigenvalues (PC_1 ... PC_k, one per original variable).
eigenvalues_index = \
    ['PC_{}'.format(i) for i in range(1, len(X_normalized.columns) + 1)]
# --Get actual eigenvalues df.
# NOTE(review): the source chunk ends mid-call here; only the closing paren
# was added. The original may pass further arguments (e.g.
# index=eigenvalues_index) — confirm against the full file.
eigenvalues_df = \
    pd.DataFrame(data=pca.explained_variance_, columns=['Eigenvalues'])