import numpy as np
from sklearn.mixture import GaussianMixture
from factor_analyzer import FactorAnalyzer


def get_MFA_params(zl, kl, rl_nextl):
    ''' Determine clusters with a GMM and then adjust a Factor Model over each cluster
    zl (ndarray): The lth layer latent variable
    kl (int): The number of components of the lth layer
    rl_nextl (1darray): The dimensions of the lth layer and the (l+1)th layer
    -----------------------------------------------------
    returns (dict): Dict with the parameters of the MFA approximated by GMM + FA.
    '''
    #======================================================
    # Fit a GMM in the continuous space
    #======================================================
    numobs = zl.shape[0]

    not_all_groups = True
    max_trials = 100
    empty_count_counter = 0

    while not_all_groups:
        # If there are not enough observations per group, the MFA diverges...
        gmm = GaussianMixture(n_components=kl)
        s = gmm.fit_predict(zl)

        clusters_found, count = np.unique(s, return_counts=True)

        if len(clusters_found) == kl:  # & (count >= 5).all():
            not_all_groups = False

        empty_count_counter += 1
        if empty_count_counter >= max_trials:
            raise RuntimeError('Could not find a GMM init that presents the '
                               'proper number of groups:', kl)

    psi = np.full((kl, rl_nextl[0], rl_nextl[0]), 0).astype(float)
    psi_inv = np.full((kl, rl_nextl[0], rl_nextl[0]), 0).astype(float)
    H = np.full((kl, rl_nextl[0], rl_nextl[1]), 0).astype(float)
    eta = np.full((kl, rl_nextl[0]), 0).astype(float)
    z_nextl = np.full((numobs, rl_nextl[1]), np.nan).astype(float)

    #========================================================
    # And then an MFA on each of those groups
    #========================================================
    for j in range(kl):
        indices = (s == j)
        fa = FactorAnalyzer(rotation=None, method='ml', n_factors=rl_nextl[1])
        fa.fit(zl[indices])

        psi[j] = np.diag(fa.get_uniquenesses())
        H[j] = fa.loadings_
        psi_inv[j] = np.diag(1 / fa.get_uniquenesses())
        z_nextl[indices] = fa.transform(zl[indices])

        eta[j] = np.mean(zl[indices], axis=0)

    params = {'H': H, 'psi': psi, 'z_nextl': z_nextl, 'eta': eta,
              'classes': s}
    return params
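# A minimal sketch of how get_MFA_params might be exercised. The data,
# component count, and layer dimensions below are illustrative assumptions,
# not values taken from the original code.

rng = np.random.default_rng(0)
zl = rng.normal(size=(300, 4))  # 300 observations of a 4-dim latent layer

# Summarise the layer with kl = 3 clusters and 2 factors per cluster.
params = get_MFA_params(zl, kl=3, rl_nextl=[4, 2])

print(params['H'].shape)        # (3, 4, 2): one loading matrix per cluster
print(params['eta'].shape)      # (3, 4):    one mean vector per cluster
print(params['z_nextl'].shape)  # (300, 2):  next-layer latent scores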
# NOTE: this snippet begins mid-function; the signature and the rename call
# of dropInsignificantX are reconstructed from the closing brace and the
# usage further down.
def dropInsignificantX(dfX):
    dfX = dfX.rename(columns={
        0: 'Purchase',
        1: 'Marking',
        2: 'Post Purchase',
        3: 'Product Position'
    })
    del dfX['Post Purchase']
    return dfX


# Prepare significant X values for regression.
trainDF = dropInsignificantX(X_train_transformed)
testDF = dropInsignificantX(X_test_tranformed)

# Train the regression model on the training data.
model = LinearRegression()
model.fit(trainDF, y_train)

# Show model statistics.
showModelSummary(model, y_test, testDF)

# Check coefficient significance.
showCoefficientPValues(y_train, trainDF.values)

print("FA Factor Variance")
print(fa.get_factor_variance())
print("FA Communalities")
print(fa.get_communalities())
print("FA Uniquenesses")
print(fa.get_uniquenesses())
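# The helpers showModelSummary and showCoefficientPValues are not defined in
# this snippet. A minimal sketch of the p-value check, assuming a statsmodels
# OLS refit under the hood; the name and signature come from the call above,
# everything else is an assumption.

import statsmodels.api as sm

def showCoefficientPValues(y, X):
    # Hypothetical implementation: refit with statsmodels OLS, which reports
    # a t-statistic and p-value for every coefficient.
    ols = sm.OLS(y, sm.add_constant(X)).fit()
    print(ols.pvalues)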
from os.path import join

import pandas as pd
from factor_analyzer import FactorAnalyzer


def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        use_corr_matrix=False,
                        top_dir=None):
    """
    Use the `FactorAnalyzer()` class to perform the factor analysis
    and return a dictionary with relevant output for the given scenario.

    Parameters
    ----------
    test_name : str
        The name of the test.
    factors : int
        The number of factors.
    method : str
        The fitting method.
    rotation : str
        The type of rotation.
    use_corr_matrix : bool, optional
        Whether to use the correlation matrix.
        Defaults to False.
    top_dir : str, optional
        The top directory for test data.
        Defaults to `DATA_DIR`.

    Returns
    -------
    output : dict
        A dictionary containing the outputs
        for all `OUTPUT_TYPES`.
    """
    if top_dir is None:
        top_dir = DATA_DIR

    filename = join(top_dir, test_name + '.csv')
    data = pd.read_csv(filename)

    if use_corr_matrix:
        X = data.corr()
    else:
        X = data.copy()

    rotation = None if rotation == 'none' else rotation
    method = {'uls': 'minres'}.get(method, method)

    fa = FactorAnalyzer(n_factors=factors,
                        method=method,
                        rotation=rotation,
                        is_corr_matrix=use_corr_matrix)
    fa.fit(X)

    evalues, values = fa.get_eigenvalues()

    return {'value': values,
            'evalues': evalues,
            'structure': fa.structure_,
            'loading': fa.loadings_,
            'uniquenesses': fa.get_uniquenesses(),
            'communalities': fa.get_communalities(),
            'scores': fa.transform(data)}
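# A hedged example of calling this helper. The test name is a placeholder,
# and DATA_DIR / the CSV fixture are assumed to be defined by the
# surrounding test suite; none of these come from the original code.

output = calculate_py_output('test01',
                             factors=3,
                             method='uls',
                             rotation='promax')
print(output['loading'].shape)  # (n_variables, 3) rotated loading matrix
print(output['communalities'])  # variance of each variable explained by the factors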
import pandas as pd
import plotly.graph_objects as go
from factor_analyzer import (FactorAnalyzer,
                             calculate_bartlett_sphericity,
                             calculate_kmo)

# `Plotting` (LabeledHeatmap, dfTable) is a project-local helper module,
# assumed importable alongside this function.


def FactorAnalysis(df, rotation="varimax", n_factors=10, transform=False):
    """
    You want "varimax" rotation if you want orthogonal (highly differentiated)
    factors with very high and very low variable loadings. It is the most
    common choice.
    You want "oblimin" for non-orthogonal loadings. It increases eigenvalues
    but reduces interpretability.
    You want "promax" if you want oblimin-style rotation on large datasets.

    See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for a fuller
    explanation.
    """
    assert not df.isnull().values.any(), "Data must not contain any nan or inf values"
    assert all(df.std().values > 0), "Columns used in Factor Analysis must have a non-zero Std. Dev. (aka more than a single value)"

    def data_suitable(df, kmo_value=False, ignore=False):
        # Test to ensure the data is not an identity matrix
        chi_square_value, p_value = calculate_bartlett_sphericity(df)
        # Test to ensure the observed data is adequate for FA. Must be > 0.6
        kmo_all, kmo_model = calculate_kmo(df)

        if (p_value > 0.1 or kmo_model < 0.6) and not ignore:
            raise Exception(
                "Data is not suitable for Factor Analysis!: "
                "Identity test P value: {}. KMO model Score: {}".format(p_value, kmo_model))

        if kmo_value:
            return kmo_model
        return

    print("KMO Value: {}.".format(data_suitable(df, kmo_value=True)))

    fa = FactorAnalyzer(method="minres", rotation=rotation, n_factors=n_factors)
    fa.fit(df)

    def eigenplot(df):
        df = pd.DataFrame(df)
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=df.index.values,
                y=df[0].values,
                mode='lines'
            )
        )
        fig.add_shape(
            type="line",
            y0=1, x0=0,
            y1=1, x1=len(df),
            line=dict(color='red', dash='dash')
        )
        fig.update_layout(
            title="Factor Eigenvalues",
            yaxis_title="Eigenvalue",
            xaxis_title="Factor",
            xaxis=dict(range=[0, df[df[0] > 0].index.values[-1]])
        )
        fig.show()
        return

    eigenplot(fa.get_eigenvalues()[1])

    Plotting.LabeledHeatmap(fa.loadings_, y=list(df.columns),
                            title="Factor Loading",
                            expand=True, height=2000, width=2000)

    tmp = pd.DataFrame(fa.get_factor_variance()[1:])
    tmp.index = ["Proportional Variance", "Cumulative Variance"]
    Plotting.dfTable(tmp)

    if rotation == 'promax':
        Plotting.LabeledHeatmap(fa.phi_, title="Factor Correlation",
                                expand=True, height=2000, width=2000)
        Plotting.LabeledHeatmap(fa.structure_, y=list(df.columns),
                                title="Variable-Factor Correlation",
                                expand=True, height=2000, width=2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T,
                            title="Variance Explained",
                            x=list(df.columns),
                            description="The proportion of each variable's variance that can be explained by the factors.",
                            expand=True, height=300, width=2000)
    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T,
                            title="Variable Uniqueness",
                            x=list(df.columns),
                            expand=True, height=300, width=2000)

    if transform:
        return fa.transform(df)
    return
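# A minimal sketch of invoking this wrapper. The CSV path and the factor
# count are assumptions for illustration, not part of the original code.

df = pd.read_csv('survey_items.csv')  # hypothetical numeric survey data

# Orthogonal 5-factor solution, returning per-row factor scores.
scores = FactorAnalysis(df, rotation="varimax", n_factors=5, transform=True)
print(scores.shape)  # (n_rows, 5)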