    # Keep only widows
    marital_idx = np.argmax(varnames == 'marital.status')
    widowed_idx = np.argmax(le_dict['marital.status'].classes_ == 'Widowed')
    authorized_ranges[:, 0, marital_idx] = [widowed_idx, widowed_idx] # Only widowed
else:
    raise RuntimeError('Not implemented yet')

#*****************************************************************
# Run MIAMI
#*****************************************************************

init = dim_reduce_init(train, n_clusters, k, r, nj, var_distrib, seed = None,\
                       use_famd=True)
out = MIAMI(train, n_clusters, r, k, init, var_distrib, nj, authorized_ranges, nb_pobs, it,\
            eps, maxstep, seed, perform_selec = False, dm = dm, max_patience = 0)

print('MIAMI has kept one observation over', round(1 / out['share_kept_pseudo_obs']),\
      'observations generated')
acceptance_rate[design].append(out['share_kept_pseudo_obs'])

pred = pd.DataFrame(out['y_all'], columns=train.columns)

#================================================================
# Inverse transform the datasets
#================================================================
for j, colname in enumerate(train.columns):
    if colname in le_dict.keys():
r = np.array([2, 1])
numobs = len(full_contra)
k = [n_clusters]

seed = 1
init_seed = 2

eps = 1E-05
it = 10
maxstep = 100

#===========================================#
# MI2AMI initialisation
#===========================================#
init_full = dim_reduce_init(full_contra, n_clusters, k, r, nj, var_distrib, seed = None,\
                            use_famd=True)
out_full = MI2AMI(full_contra.astype(float), n_clusters, r, k, init_full, var_distrib, nj, nan_mask,\
                  nb_pobs, it, eps, maxstep, seed, dm = dm3, perform_selec = False)

completed_y3 = pd.DataFrame(out_full['completed_y'].round(0), columns = full_contra.columns)

#===========================================#
# Comparison
#===========================================#

#======================================
# mu_s
#======================================
out['mu'][0]
out2['mu'][0]
# Defining distances over the features
cat_features = pd.Series(var_distrib).isin(['categorical', 'bernoulli']).to_list()
dm = gower_matrix(train.astype(object), cat_features=cat_features)

dtype = {train.columns[j]: np.float64 if (var_distrib[j] != 'bernoulli') and \
         (var_distrib[j] != 'categorical') else str for j in range(p_new)}
train = train.astype(dtype, copy=True)
numobs = len(train)

#*****************************************************************
# Run MIAMI
#*****************************************************************
prince_init = dim_reduce_init(train, 2, k, r, nj, var_distrib, seed = None,\
                              use_famd=True)
out = MIAMI(train_np, 'auto', r, k, prince_init, var_distrib, nj, authorized_ranges, nb_pobs, it,\
            eps, maxstep, seed, perform_selec = False, dm = dm, max_patience = 0)

print('MIAMI has kept one observation over', round(1 / out['share_kept_pseudo_obs']),\
      'observations generated')
acceptance_rate[design].append(out['share_kept_pseudo_obs'])

#*****************************************************************
# Visualisation result
#*****************************************************************
train_new_np = out['y_all'][len(train):]
train_new = pd.DataFrame(train_new_np, columns=train.columns)

le_dict = {**categ_dict, **bin_dict, **k_dict}
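# Illustrative sketch (not part of the original script): the label-encoder
# dictionary built above can map the generated pseudo-observations back to
# their original categories, assuming each entry of le_dict is a fitted
# sklearn LabelEncoder and train_new holds the encoded draws.
for colname in train_new.columns:
    if colname in le_dict:
        train_new[colname] = le_dict[colname].inverse_transform(
            train_new[colname].astype(int))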
def MDGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \
          eps = 1E-05, maxstep = 100, seed = None, perform_selec = True):

    ''' Fit a Mixed Deep Gaussian Mixture Model (MDGMM)

    y (numobs x p ndarray): The observations containing mixed variables
    n_clusters (int or str): The number of clusters to look for in the data,
                             or the use mode of the MDGMM ('auto' or 'multi')
    r (dict): The dimensions of the latent variables of each head ('c', 'd')
              and of the common tail ('t')
    k (dict): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables in y
    nj (p 1darray): For binary/count data: the maximum value the variable can take.
                    For ordinal and categorical data: the number of existing
                    categories for each variable
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increases by less than eps, the algorithm stops
    maxstep (int): The maximum number of optimisation steps for each variable
    seed (int): The random state seed to set (only for numpy generated data for the moment)
    perform_selec (Bool): Whether to perform architecture selection or not
    ------------------------------------------------------------------------------------------------
    returns (dict): The predicted classes, the likelihood through the EM steps
                    and a continuous representation of the data.
                    A minimal call sketch is given after the function definition.
    '''

    # Break the reference link
    k = deepcopy(k)
    r = deepcopy(r)
    best_k = deepcopy(k)
    best_r = deepcopy(r)

    # Add other checks for the other variables
    check_inputs(k, r)

    prev_lik = - 1E15
    best_lik = -1E15
    tol = 0.01
    max_patience = 1
    patience = 0

    #====================================================
    # Initialize the parameters
    #====================================================
    eta_c, eta_d, H_c, H_d, psi_c, psi_d = dispatch_dgmm_init(init)
    lambda_bin, lambda_ord, lambda_categ = dispatch_gllvm_init(init)
    w_s_c, w_s_d = dispatch_paths_init(init)

    numobs = len(y)
    likelihood = []
    it_num = 0
    ratio = 1000
    np.random.seed(seed)

    #====================================================
    # Dispatch variables between categories
    #====================================================
    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',\
                               var_distrib == 'binomial')]
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',\
                              var_distrib == 'binomial')]
    nj_bin = nj_bin.astype(int)
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal']
    nj_ord = nj[var_distrib == 'ordinal']
    nj_ord = nj_ord.astype(int)
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical'].astype(int)
    nb_categ = len(nj_categ)

    yc = y[:, var_distrib == 'continuous']
    ss = StandardScaler()
    yc = ss.fit_transform(yc)
    nb_cont = yc.shape[1]

    # *_1L stands for quantities going through all the network (head + tail)
    k_1L, L_1L, L, bar_L, S_1L = nb_comps_and_layers(k)
    r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}

    best_sil = [-1.1 for l in range(L['t'] - 1)] if n_clusters == 'multi' else -1.1
    new_sil = [-1.1 for l in range(L['t'] - 1)] if n_clusters == 'multi' else -1.1

    M = M_growth(1, r_1L, numobs)

    if nb_bin + nb_ord + nb_categ == 0: # Create the InputError class and change this
        raise ValueError('Input does not contain discrete variables,\
                         consider using a regular DGMM')
    if nb_cont == 0: # Create the InputError class and change this
        raise ValueError('Input does not contain continuous values,\
                         consider using a DDGMM')
    # Compute the Gower matrix
    cat_features = np.logical_or(var_distrib == 'categorical', var_distrib == 'bernoulli')
    dm = gower_matrix(y, cat_features = cat_features)

    while (it_num < it) & ((ratio > eps) | (patience <= max_patience)):
        print(it_num)

        # The clustering layer is the one used to perform the clustering
        # i.e. the layer l such that k[l] == n_clusters
        if not(isnumeric(n_clusters)):
            if n_clusters == 'auto':
                clustering_layer = 0
            elif n_clusters == 'multi':
                clustering_layer = list(range(L['t'] - 1))
            else:
                raise ValueError('Please enter an int, auto or multi for n_clusters')
        else:
            assert (np.array(k['t']) == n_clusters).any()
            clustering_layer = np.argmax(np.array(k['t']) == n_clusters)

        #####################################################################################
        ################################# MC step ###########################################
        #####################################################################################

        #=====================================================================
        # Draw from f(z^{l} | s, Theta) for both heads and tail
        #=====================================================================
        mu_s_c, sigma_s_c = compute_path_params(eta_c, H_c, psi_c)
        sigma_s_c = ensure_psd(sigma_s_c)

        mu_s_d, sigma_s_d = compute_path_params(eta_d, H_d, psi_d)
        sigma_s_d = ensure_psd(sigma_s_d)

        z_s_c, zc_s_c, z_s_d, zc_s_d = draw_z_s_all_network(mu_s_c, sigma_s_c,\
                            mu_s_d, sigma_s_d, yc, eta_c, eta_d, S_1L, L, M)

        #========================================================================
        # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1
        #========================================================================
        # Create wrapper as before and after
        chsi_c = compute_chsi(H_c, psi_c, mu_s_c, sigma_s_c)
        chsi_c = ensure_psd(chsi_c)
        rho_c = compute_rho(eta_c, H_c, psi_c, mu_s_c, sigma_s_c, zc_s_c, chsi_c)

        chsi_d = compute_chsi(H_d, psi_d, mu_s_d, sigma_s_d)
        chsi_d = ensure_psd(chsi_d)
        rho_d = compute_rho(eta_d, H_d, psi_d, mu_s_d, sigma_s_d, zc_s_d, chsi_d)

        # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively
        z2_z1s_c, z2_z1s_d = draw_z2_z1s_network(chsi_c, chsi_d, rho_c, \
                                                 rho_d, M, r_1L, L)

        #=======================================================================
        # Compute the p(y^D| z1) for all discrete variables
        #=======================================================================
        py_zl1_d = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord,\
                          lambda_categ, y_categ, nj_categ, z_s_d[0])

        #========================================================================
        # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s
        #========================================================================
        zl1_ys_d = draw_zl1_ys(z_s_d, py_zl1_d, M['d'])

        #####################################################################################
        ################################# E step ############################################
        #####################################################################################

        #=====================================================================
        # Compute quantities necessary for E steps of both heads and tail
        #=====================================================================

        # Discrete head quantities
        pzl1_ys_d, ps_y_d, py_d = E_step_GLLVM(z_s_d[0], mu_s_d[0], sigma_s_d[0], w_s_d, py_zl1_d)
        py_s_d = ps_y_d * py_d / w_s_d[n_axis]

        # Continuous head quantities
        ps_y_c, py_s_c, py_c = continuous_lik(yc, mu_s_c[0], sigma_s_c[0], w_s_c)

        pz_s_d = fz_s(z_s_d, mu_s_d, sigma_s_d)
        pz_s_c = fz_s(z_s_c, mu_s_c, sigma_s_c)
        #=====================================================================
        # Compute p(z^{(l)}| s, y). Equation (5) of the paper
        #=====================================================================
        # pz2_z1s_d and pz_ys_d are also computed over the tail indices,
        # although they are not needed there
        pz2_z1s_d = fz2_z1s(t(pzl1_ys_d, (1, 0, 2)), z2_z1s_d, chsi_d, rho_d, S_1L['d'])
        pz_ys_d = fz_ys(t(pzl1_ys_d, (1, 0, 2)), pz2_z1s_d)

        pz2_z1s_c = fz2_z1s([], z2_z1s_c, chsi_c, rho_c, S_1L['c'])
        pz_ys_c = fz_ys([], pz2_z1s_c)

        pz2_z1s_t = fz2_z1s([], z2_z1s_c[bar_L['c']:], chsi_c[bar_L['c']:], \
                            rho_c[bar_L['c']:], S_1L['t'])

        # Junction layer computations
        # Compute p(zC | s)
        py_zs_d = fy_zs(pz_ys_d, py_s_d)
        py_zs_c = fy_zs(pz_ys_c, py_s_c)

        # Compute p(zt | yC, yD, sC, sD)
        pzt_yCyDs = fz_yCyDs(py_zs_c, pz_ys_d, py_s_c, M, S_1L, L)

        #=====================================================================
        # Compute MFA expectations
        #=====================================================================

        # Discrete head
        Ez_ys_d, E_z1z2T_ys_d, E_z2z2T_ys_d, EeeT_ys_d = \
            E_step_DGMM_d(zl1_ys_d, H_d, z_s_d, zc_s_d, z2_z1s_d, pz_ys_d,\
                          pz2_z1s_d, S_1L['d'], L['d'])

        # Continuous head
        Ez_ys_c, E_z1z2T_ys_c, E_z2z2T_ys_c, EeeT_ys_c = \
            E_step_DGMM_c(H_c, z_s_c, zc_s_c, z2_z1s_c, pz_ys_c,\
                          pz2_z1s_c, S_1L['c'], L['c'])

        # Junction layers
        Ez_ys_t, E_z1z2T_ys_t, E_z2z2T_ys_t, EeeT_ys_t = \
            E_step_DGMM_t(H_c[bar_L['c']:], \
                          z_s_c[bar_L['c']:], zc_s_c[bar_L['c']:], z2_z1s_c[bar_L['c']:],\
                          pzt_yCyDs, pz2_z1s_t, S_1L, L, k_1L)

        # Error here for the first two terms: p(y^h | z^t, s^C) != p(y^h | z^t, s^{1C:L})
        pst_yCyD = fst_yCyD(py_zs_c, py_zs_d, pz_s_d, w_s_c, w_s_d, k_1L, L)

        ###########################################################################
        ############################ M step #######################################
        ###########################################################################

        #=======================================================
        # Compute DGMM Parameters
        #=======================================================

        # Discrete head
        w_s_d = np.mean(ps_y_d, axis = 0)
        eta_d_barL, H_d_barL, psi_d_barL = M_step_DGMM(Ez_ys_d, E_z1z2T_ys_d, E_z2z2T_ys_d, \
                                                       EeeT_ys_d, ps_y_d, H_d, k_1L['d'][:-1],\
                                                       L_1L['d'], r_1L['d'])

        # Add dispatching function here
        eta_d[:bar_L['d']] = eta_d_barL
        H_d[:bar_L['d']] = H_d_barL
        psi_d[:bar_L['d']] = psi_d_barL

        # Continuous head
        w_s_c = np.mean(ps_y_c, axis = 0)
        eta_c_barL, H_c_barL, psi_c_barL = M_step_DGMM(Ez_ys_c, E_z1z2T_ys_c, E_z2z2T_ys_c, \
                                                       EeeT_ys_c, ps_y_c, H_c, k_1L['c'][:-1],\
                                                       L_1L['c'] + 1, r_1L['c'])

        eta_c[:bar_L['c']] = eta_c_barL
        H_c[:bar_L['c']] = H_c_barL
        psi_c[:bar_L['c']] = psi_c_barL

        # Common tail
        eta_t, H_t, psi_t, Ezst_y = M_step_DGMM_t(Ez_ys_t, E_z1z2T_ys_t, E_z2z2T_ys_t, \
                                                  EeeT_ys_t, ps_y_c, ps_y_d, pst_yCyD, \
                                                  H_c[bar_L['c']:], S_1L, k_1L, \
                                                  L_1L, L, r_1L['t'])

        eta_d[bar_L['d']:] = eta_t
        H_d[bar_L['d']:] = H_t
        psi_d[bar_L['d']:] = psi_t

        eta_c[bar_L['c']:] = eta_t
        H_c[bar_L['c']:] = H_t
        psi_c[bar_L['c']:] = psi_t

        #=======================================================
        # Identifiability conditions
        #=======================================================
        w_s_t = np.mean(pst_yCyD, axis = 0)
        eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(eta_d, \
                        H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L)

        #=======================================================
        # Compute GLLVM Parameters
        #=======================================================
        # We optimize each column separately as it is faster than optimizing
        # all columns jointly
        # (and more relevant with the independence hypothesis)
        lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y_d, \
                    pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep)

        lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y_d, \
                    pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep)

        lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y_d,\
                    pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep)

        ###########################################################################
        ################## Clustering parameters updating #########################
        ###########################################################################

        new_lik = np.sum(np.log(py_d) + np.log(py_c))
        likelihood.append(new_lik)
        ratio = (new_lik - prev_lik)/abs(prev_lik)

        if n_clusters == 'multi':
            temp_classes = []
            z_tail = []
            classes = [[] for l in range(L['t'] - 1)]

            for l in clustering_layer:
                idx_to_sum = tuple(set(range(1, L['t'] + 1)) -\
                                   set([clustering_layer[l] + 1]))
                psl_y = pst_yCyD.reshape(numobs, *k['t'],\
                                         order = 'C').sum(idx_to_sum)

                temp_class_l = np.argmax(psl_y, axis = 1)
                sil_l = silhouette_score(dm, temp_class_l, metric = 'precomputed')

                temp_classes.append(temp_class_l)
                #z_tail.append(Ezst_y[l].sum(1))
                new_sil[l] = sil_l

            #z_tail = []
            for l in range(L['t'] - 1):
                zl = Ezst_y[l].sum(1)
                z_tail.append(zl)

                if best_sil[l] < new_sil[l]:
                    # Update the quantity if the silhouette score is better
                    best_sil[l] = deepcopy(new_sil[l])
                    classes[l] = deepcopy(temp_classes[l])

                    if zl.shape[-1] == 3:
                        plot_3d(zl, classes[l])
                    elif zl.shape[-1] == 2:
                        plot_2d(zl, classes[l])

        else:
            idx_to_sum = tuple(set(range(1, L['t'] + 1)) - set([clustering_layer + 1]))
            psl_y = pst_yCyD.reshape(numobs, *k['t'], order = 'C').sum(idx_to_sum)
            temp_classes = np.argmax(psl_y, axis = 1)

            try:
                new_sil = silhouette_score(dm, temp_classes, metric = 'precomputed')
            except: # The silhouette score is undefined when a single cluster is found
                new_sil = -1

            z_tail = [Ezst_y[l].sum(1) for l in range(L['t'] - 1)]

            if best_sil < new_sil:
                # Update the quantity if the silhouette score is better
                zl = z_tail[clustering_layer]
                best_sil = deepcopy(new_sil)
                classes = deepcopy(temp_classes)

                if zl.shape[-1] == 3:
                    plot_3d(zl, classes)
                elif zl.shape[-1] == 2:
                    plot_2d(zl, classes)

        # Refresh the best likelihood
        if best_lik < new_lik:
            best_lik = deepcopy(new_lik)

        if prev_lik < new_lik:
            patience = 0
            M = M_growth(it_num + 1, r_1L, numobs)
        else:
            patience += 1

        ###########################################################################
        ######################## Parameter selection #############################
        ###########################################################################

        min_nb_clusters = 2
        is_not_min_specif = not(is_min_architecture_reached(k, r, min_nb_clusters))

        if look_for_simpler_network(it_num) & perform_selec & is_not_min_specif:

            # To add: selection according to categ
            r_to_keep = r_select(y_bin, y_ord, y_categ, yc, zl1_ys_d,\
                                 z2_z1s_d[:bar_L['d']], w_s_d, z2_z1s_c[:bar_L['c']],\
                                 z2_z1s_c[bar_L['c']:], n_clusters)

            # Check layer deletion
            is_c_layer_deletion = np.any([len(rl) == 0 for rl in r_to_keep['c']])
            is_d_layer_deletion = np.any([len(rl) == 0 for rl in r_to_keep['d']])
            is_head_layer_deletion = np.any([is_c_layer_deletion, is_d_layer_deletion])

            if is_head_layer_deletion:
                # Restart the algorithm
                if is_c_layer_deletion:
                    r['c'] = [len(rl) for rl in r_to_keep['c'][:-1]]
                    k['c'] = k['c'][:-1]
                if is_d_layer_deletion:
                    r['d'] = [len(rl) for rl in r_to_keep['d'][:-1]]
                    k['d'] = k['d'][:-1]

                init = dim_reduce_init(pd.DataFrame(y), n_clusters, k, r, nj, var_distrib,\
                                       seed = None)
                eta_c, eta_d, H_c, H_d, psi_c, psi_d = dispatch_dgmm_init(init)
                lambda_bin, lambda_ord, lambda_categ = dispatch_gllvm_init(init)
                w_s_c, w_s_d = dispatch_paths_init(init)

                # *_1L stands for quantities going through all the network (head + tail)
                k_1L, L_1L, L, bar_L, S_1L = nb_comps_and_layers(k)
                r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}

                M = M_growth(it_num + 1, r_1L, numobs)

                prev_lik = deepcopy(new_lik)
                it_num = it_num + 1
                print(likelihood)

                print('Restarting the algorithm')
                continue

            # If r_l == 0, delete the last l + 1: layers
            new_Lt = np.sum([len(rl) != 0 for rl in r_to_keep['t']]) #- 1

            #w_s_t = pst_yCyD.mean(0)
            k_to_keep = k_select(w_s_c, w_s_d, w_s_t, k, new_Lt, clustering_layer, n_clusters)

            is_selection = check_if_selection(r_to_keep, r, k_to_keep, k, L, new_Lt)

            assert new_Lt > 0 # > 1 ?
            if n_clusters == 'multi':
                assert new_Lt == L['t']

            if is_selection:
                # Part to change when also updating the number of layers on each head
                nb_deleted_layers_tail = L['t'] - new_Lt
                L['t'] = new_Lt
                L_1L = {keys: values - nb_deleted_layers_tail for keys, values in L_1L.items()}

                eta_c, eta_d, H_c, H_d, psi_c, psi_d = dgmm_coeff_selection(eta_c,\
                        H_c, psi_c, eta_d, H_d, psi_d, L, r_to_keep, k_to_keep)

                lambda_bin, lambda_ord, lambda_categ = gllvm_coeff_selection(lambda_bin, lambda_ord,\
                                                        lambda_categ, r, r_to_keep)

                w_s_c, w_s_d = path_proba_selection(w_s_c, w_s_d, k, k_to_keep, new_Lt)

                k = {h: [len(k_to_keep[h][l]) for l in range(L[h])] for h in ['d', 't']}
                k['c'] = [len(k_to_keep['c'][l]) for l in range(L['c'] + 1)]

                r = {h: [len(r_to_keep[h][l]) for l in range(L[h])] for h in ['d', 't']}
                r['c'] = [len(r_to_keep['c'][l]) for l in range(L['c'] + 1)]

                k_1L, _, L, bar_L, S_1L = nb_comps_and_layers(k)
                r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}

                patience = 0
                best_r = deepcopy(r)
                best_k = deepcopy(k)

                #=======================================================
                # Identifiability conditions
                #=======================================================
                eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(eta_d, \
                        H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L)

            print('New architecture:')
            print('k', k)
            print('r', r)
            print('L', L)
            print('S_1L', S_1L)
            print("w_s_c", len(w_s_c))
            print("w_s_d", len(w_s_d))

            M = M_growth(it_num + 1, r_1L, numobs)

        prev_lik = deepcopy(new_lik)
        print(likelihood)
        print('Silhouette score:', new_sil)

        it_num = it_num + 1

    out = dict(likelihood = likelihood, classes = classes, \
               best_r = best_r, best_k = best_k)
    if n_clusters == 'multi':
        out['z'] = z_tail
    else:
        out['z'] = z_tail[clustering_layer]
    return(out)
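# Minimal call sketch (illustrative only). It assumes y_np, var_distrib, nj,
# n_clusters, nb_cont and the precomputed Gower matrix dm are available, as in
# the driver scripts below; the r/k dictionaries follow the head ('c', 'd') /
# tail ('t') convention documented in the docstring.
'''
r = {'c': [nb_cont], 'd': [3], 't': [2, 1]}
k = {'c': [1], 'd': [2], 't': [n_clusters, 1]}
init = dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, seed = None)
out = MDGMM(y_np, n_clusters, r, k, init, var_distrib, nj, it = 30,\
            eps = 1E-05, maxstep = 100, seed = 1, perform_selec = False)
out['classes']     # hard partition at the clustering layer
out['z']           # continuous representation of the data
out['likelihood']  # likelihood trace over the MCEM iterations
'''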
#===========================================#
# Running the algorithm
#===========================================#
r = np.array([2, 1])
numobs = len(y)
k = [n_clusters]

seed = 1
init_seed = 2

eps = 1E-05
it = 50
maxstep = 100

prince_init = dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, seed = None,\
                              use_famd=True)
m, pred = misc(labels_oh, prince_init['classes'], True)
print(m)
print(confusion_matrix(labels_oh, pred))
print(silhouette_score(dm, pred, metric='precomputed'))

'''
init = prince_init
seed = None
y = y_np
perform_selec = False
os.chdir('C:/Users/rfuchs/Documents/GitHub/M1DGMM')
'''

out = M1DGMM(y_np, 'auto', r, k, prince_init, var_distrib, nj, it,\
             eps, maxstep, seed, perform_selec = False)
seed = 1
init_seed = 2

eps = 1E-05
it = 10
maxstep = 100

nb_runs = 4
for run_idx in range(nb_runs):

    #===========================================#
    # MI2AMI initialisation
    #===========================================#
    init = dim_reduce_init(complete_y, n_clusters, k, r, nj, var_distrib, seed = None,\
                           use_famd=False)
    out = MI2AMI(y, n_clusters, r, k, init, var_distrib, nj, nan_mask,\
                 nb_pobs, it, eps, maxstep, seed, dm = dm, perform_selec = False)
    completed_y = pd.DataFrame(out['completed_y'].round(0), columns=full_contra.columns)

    #===========================================#
    # MI2AMI full
    #===========================================#
    dm2 = gower_matrix(completed_y, cat_features=cat_features)
    init2 = dim_reduce_init(completed_y.astype(dtype), n_clusters, k, r, nj, var_distrib, seed = None,\
                            use_famd=False)
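    # Illustrative sketch (not part of the original run): a second MI2AMI pass
    # could be fitted on the completed data using the refreshed distance matrix
    # dm2 and initialisation init2; its output (out2) is what the comparison
    # section contrasts with out (e.g. out['mu'][0] vs out2['mu'][0]).
    '''
    out2 = MI2AMI(completed_y.astype(float), n_clusters, r, k, init2, var_distrib, nj, nan_mask,\
                  nb_pobs, it, eps, maxstep, seed, dm = dm2, perform_selec = False)
    '''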
#===========================================#
# Running the algorithm
#===========================================#
n_clusters = 2
r = {'c': [nb_cont], 'd': [3], 't': [2, 1]}
k = {'c': [1], 'd': [2], 't': [n_clusters, 1]}

seed = 1
init_seed = 2

eps = 1E-05
it = 15
maxstep = 100

# MCA init
prince_init = dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, seed=None)
out = MDGMM(y_np, n_clusters, r, k, prince_init, var_distrib, nj, it, eps,\
            maxstep, seed, perform_selec = False)

m, pred = misc(labels_oh, out['classes'], True)
micro = precision_score(labels_oh, pred, average='micro')
macro = precision_score(labels_oh, pred, average='macro')

print('Silhouette', silhouette_score(dm, pred, metric='precomputed'))
print('Micro', micro)
print('Macro', macro)

#===========================================#
# Final plotting
#===========================================#
cat_features = pd.Series(var_distrib).isin(['categorical', 'bernoulli']).to_list()
dtype = {y.columns[j]: str if cat_features[j] else np.float64 for j in range(p)}
y = y.astype(dtype, copy=True)
dm = gower_matrix(y, cat_features=cat_features)

#===========================================#
# Running the M1DGMM
#===========================================#
for i in range(nb_trials):
    prince_init = dim_reduce_init(y, nb_clusters_start, k, r, nj, var_distrib, seed = None,\
                                  use_famd=True)
    out = M1DGMM(y_np, 'auto', r, k, prince_init, var_distrib, nj, it,\
                 eps, maxstep, seed, perform_selec = True, dm = dm)

    mdgmm_res = mdgmm_res.append({'dataset': dataset, 'it_id': i + 1,\
                                  'n_clusters_found': len(set(out['classes'])),\
                                  'r': r, 'k': k,\
                                  'best_r': out['best_r'], 'best_k': out['best_k']},\
                                 ignore_index=True)

#===========================================#
# Running the hierarchical clustering
#===========================================#
hierarch_res = pd.DataFrame(
#===========================================#
# Running the algorithm
#===========================================#
r = [3, 2, 1]
numobs = len(y)
k = [n_clusters, 2]

seed = 1
init_seed = 2

eps = 1E-05
it = 15
maxstep = 100

# MCA init
prince_init = dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, seed=None)
m, pred = misc(labels_oh, prince_init['classes'], True)
print(m)
print(confusion_matrix(labels_oh, pred))

'''
init = prince_init
y = y_np
seed = None
'''

out = DDGMM(y_np, n_clusters, r, k, prince_init, var_distrib, nj, it,\
            eps, maxstep, seed, perform_selec = False)

m, pred = misc(labels_oh, out['classes'], True)
print(m)
print(confusion_matrix(labels_oh, pred))