results['oracle'] = add_gs_results(pd.DataFrame(),
                                   model=models['oracle'],
                                   model_name='oracle',
                                   run_biptsp_on_full=False,  # or true?
                                   dataset='full',
                                   view='both',
                                   **kws)

##################
# classification #
##################
clf_results = clf_fit_and_score(LinearDiscriminantAnalysis(),
                                X_tr=np.hstack(X_tr),
                                y_tr=view_labs_to_overall(Y_tr),
                                X_tst=np.hstack(X_tst),
                                y_tst=view_labs_to_overall(Y_tst))

################
# save results #
################
dump({'results': results,
      'models': models,
      'runtimes': runtimes,
      'data': {'X_tr': X_tr,
               'Y_tr': Y_tr,
def add_gs_results(results_df, sim_stub, model, model_name, dataset, view,
                   X_tr, Y_tr, X_tst, Y_tst,
                   Pi_true, view_params_true, zero_thresh=0,
                   run_biptsp_on_full=False,
                   X_tr_precomp_dists=None):
    """
    Extracts results after fitting the model.

    Parameters
    ----------
    results_df: pd.DataFrame
        The dataframe containing the results.

    sim_stub: dict
        Dict containing simulation information.

    model:
        Grid search clustering model.

    model_name: str
        Name of model.

    dataset: str, ['full', 'view']
        Which dataset we are looking at -- the full dataset or only a
        single view.

    view: int, str
        Either 'both' or which view we are looking at.

    X_tr, Y_tr, X_tst, Y_tst:
        Training and test data and view labels.

    run_biptsp_on_full: bool
        Whether to run bipartite spectral partitioning for a full MVMM.

    X_tr_precomp_dists: array-like, None
        Precomputed pairwise distances for the training data.
    """
    # identifying information for results
    identif_stub = {'dataset': dataset,
                    'view': view,
                    'model': model_name}

    res = {}
    res.update(sim_stub)
    res.update(identif_stub)

    assert dataset in ['full', 'view']
    assert type(view) == int or view == 'both'
    # assert model_name in ['ts_gs_mvmm', 'gmm_cat'] or \
    #     'marginal_view' in model_name

    n_views = len(X_tr)
    n_samples = Y_tr.shape[0]  # TODO

    is_mvmm_model = is_mvmm(model)
    is_gs_model = isinstance(model, BaseGridSearch) or \
        isinstance(model, SpectralPenSearchByBlockMVMM)
    # isinstance(model, SpectralPenSearchMVMM) or \

    # format X data
    if not is_mvmm_model and dataset == 'full':
        X_tr = np.hstack(X_tr)
        X_tst = np.hstack(X_tst)
    elif dataset == 'view':
        X_tr = X_tr[view]
        X_tst = X_tst[view]
        X_tr_precomp_dists = multi_view_safe_pairwise_distances(X=X_tr)

    model_sel_measures = ['aic', 'bic', 'silhouette', 'calinski_harabasz',
                          'davies_bouldin', 'dunn']

    # format y data
    y_tr_overall = view_labs_to_overall(Y_tr)
    y_tst_overall = view_labs_to_overall(Y_tst)

    # if dataset == 'full':
    #     y_tr = view_labs_to_overall(Y_tr)
    #     y_tst = view_labs_to_overall(Y_tst)
    # else:
    #     y_tr = Y_tr[:, view]
    #     y_tst = Y_tst[:, view]

    # get true number of components
    n_comp_tot_true, n_comp_views_true = get_n_comp(Pi_true)

    # true communities
    # comm_mat_true[np.isnan(comm_mat_true)] = -1
    comm_mat_true = get_block_mat(Pi_true > 0)
    # res['n_blocks_true'] = int(np.nanmax(comm_mat_true) + 1)
    res['n_blocks_true'] = get_n_blocks(comm_mat_true)

    # comm_true_tr = [comm_mat_true[Y_tr[i, 0], Y_tr[i, 1]]
    #                 for i in range(Y_tr.shape[0])]
    # comm_true_tst = [comm_mat_true[Y_tst[i, 0], Y_tst[i, 1]]
    #                  for i in range(Y_tst.shape[0])]

    # add tuning parameter data
    if is_gs_model:
        n_tune_values = len(model.param_grid_)
        res['best_tuning_idx'] = model.best_idx_
        all_estimators = [model.estimators_[tune_idx]
                          for tune_idx in range(n_tune_values)]
    else:
        n_tune_values = 1
        res['best_tuning_idx'] = 0
        all_estimators = [model]

    # TODO-DEBUG
    # if res['best_tuning_idx'] is None:
    #     1/0

    # get results for every estimator
    for tune_idx in range(n_tune_values):

        #################
        # get model out #
        #################
        estimator = all_estimators[tune_idx]

        # get final est for two stage
        if isinstance(estimator, TwoStage):
            is_two_stage = True
            estimator = estimator.final_
            start_est = all_estimators[tune_idx].start_
        else:
            is_two_stage = False
            start_est = None

        # if it's a MVMM model and a marginal view dataset,
        # get the view marginal model
        if is_mvmm_model and dataset == 'view':
            # for view datasets use the view marginal part of the MVMM
            estimator = estimator.view_models_[view]

            if is_two_stage:
                start_est = start_est.view_models_[view]

        ###############
        # Get results #
        ###############

        # bureaucracy
        res['tune_idx'] = int(tune_idx)

        # if grid search, get the tuning parameter values
        if is_gs_model:
            res['tuning_param_values'] = model.param_grid_[tune_idx]
        else:
            res['tuning_param_values'] = None

        # total number of mixture components
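        # NOTE (assumption): for a BlockDiagMVMM the effective number of
        # components is counted as the number of entries of bd_weights_
        # above the estimator's zero threshold, i.e. the number of joint
        # components with non-zero block-diagonal weight; all other
        # estimators just report their n_components attribute.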
        if isinstance(estimator, BlockDiagMVMM):
            n_comp_est = (estimator.bd_weights_ > estimator.zero_thresh).sum()
        else:
            n_comp_est = estimator.n_components
        res['n_comp_est'] = n_comp_est

        if dataset == 'full':
            n_comp_true = n_comp_tot_true
        else:
            n_comp_true = n_comp_views_true[view]
        res['n_comp_resid'] = n_comp_true - n_comp_est

        if is_mvmm_model and dataset == 'full':
            res['n_comp_tot_est'] = estimator.n_components
            res['n_comp_views_est'] = estimator.n_view_components
        else:
            if dataset == 'full':
                res['n_comp_tot_est'] = n_comp_est
                res['n_comp_views_est'] = [None for _ in range(n_views)]
            else:
                res['n_comp_tot_est'] = n_comp_est
                res['n_comp_views_est'] = [None for _ in range(n_views)]
                res['n_comp_views_est'][view] = n_comp_est

        # fit time
        if hasattr(estimator, 'metadata_'):
            res['fit_time'] = estimator.metadata_['fit_time']

        ###################
        # Fitting results #
        ###################

        # model fitting measures
        # res['bic'] = estimator.bic(X_tr)
        # res['aic'] = estimator.aic(X_tr)
        model_sel_scores = \
            unsupervised_cluster_scores(X=X_tr, estimator=estimator,
                                        measures=model_sel_measures,
                                        metric='precomputed',
                                        precomp_dists=X_tr_precomp_dists,
                                        dunn_kws={'diameter_method':
                                                  'farthest',
                                                  'cdist_method':
                                                  'nearest'})
        for k in model_sel_measures:
            res[k] = model_sel_scores[k]

        res['log_lik'] = estimator.log_likelihood(X_tr)
        res['n_params'] = estimator._n_parameters()

        # compute metrics on training data
        y_tr_pred = estimator.predict(X_tr)
        y_tst_pred = estimator.predict(X_tst)

        # compare to overall labeling
        res = add_to_res(res, cluster_report(y_tr_overall, y_tr_pred),
                         stub='train_overall')
        res = add_to_res(res, cluster_report(y_tst_overall, y_tst_pred),
                         stub='test_overall')

        # TODO: remove these
        # compare overall labels to community labels
        # TODO: not sure if this is valuable
        # res = add_cluster_report(res, y_tr_pred, comm_true_tr,
        #                          stub='train_overall_vs_community')
        # res = add_cluster_report(res, y_tst_pred, comm_true_tst,
        #                          stub='test_overall_vs_community')

        # res['train_ars_overall'] = adjusted_rand_score(y_tr_overall,
        #                                                y_tr_pred)
        # res['test_ars_overall'] = adjusted_rand_score(y_tst_overall,
        #                                               y_tst_pred)

        # compare to view specific labels
        for v in range(n_views):
            res = add_to_res(res, cluster_report(Y_tr[:, v], y_tr_pred),
                             stub='train_view_{}'.format(v))
            res = add_to_res(res, cluster_report(Y_tst[:, v], y_tst_pred),
                             stub='test_view_{}'.format(v))

            # res['train_ars_view_{}'.format(v)] = \
            #     adjusted_rand_score(Y_tr[:, v], y_tr_pred)
            # res['test_ars_view_{}'.format(v)] = \
            #     adjusted_rand_score(Y_tst[:, v], y_tst_pred)

        ############
        # TwoStage #
        ############
        if is_two_stage:
            y_tr_pred_start = start_est.predict(X_tr)
            y_tst_pred_start = start_est.predict(X_tst)

            res = add_to_res(res,
                             cluster_report(y_tr_overall, y_tr_pred_start),
                             stub='start_train_overall')
            res = add_to_res(res,
                             cluster_report(y_tst_overall, y_tst_pred_start),
                             stub='start_test_overall')

        # if MVMM on full data, get pi accuracy
        if is_mvmm_model and dataset == 'full':
            Pi_est = estimator.weights_mat_

            # get estimated community structure
            if is_block_diag_mvmm(estimator):
                # for bd_mvmm the communities come from D
                D = estimator.bd_weights_
                comm_mat_est = get_block_mat(D > zero_thresh)
                res['n_blocks_est'] = get_n_blocks(comm_mat_est)

            elif type(model) == MVMM and run_biptsp_on_full:
                # for the full MVMM run spectral bipartite partitioning
                # with the true number of blocks
                n_blocks = res['n_blocks_true']
                comm_mat_est = run_bipt_spect_partitioning(Pi_est, n_blocks)
                res['n_blocks_est'] = np.nan

            else:
                # otherwise the communities come from just Pi_est
                comm_mat_est = get_block_mat(Pi_est > zero_thresh)
                res['n_blocks_est'] = get_n_blocks(comm_mat_est)
                # int(np.nanmax(comm_mat_est) + 1)

            #####################
            # Pi graph accuracy #
            #####################

            # TODO: we might want to use D or a normalized D for these
            if is_block_diag_mvmm(estimator):
                # for bd_weights use a normalized version of D
                # for graph accuracy
                Pi_est_tilde = D / D.sum()
            else:
                Pi_est_tilde = Pi_est

            res['pi_graph_acc_norm'] = \
                get_pi_acc(Pi_est_tilde, Pi_true,
                           method='random_walk',
                           normalize=True,
                           method_type='fast',
                           kernel_type='exponential')

            res['pi_graph_acc_unnorm'] = \
                get_pi_acc(Pi_est_tilde, Pi_true,
                           method='random_walk',
                           normalize=False,
                           method_type='fast',
                           kernel_type='exponential')

            #######################
            # Pi aligned accuracy #
            #######################
            means_true = [view_params_true[v]['means']
                          for v in range(n_views)]
            means_est = [estimator.view_models_[v].means_
                         for v in range(n_views)]

            res['pi_aligned_dist'] = \
                get_aligned_pi_distance(Pi_true, Pi_est,
                                        means_true, means_est)

            ######################
            # community accuracy #
            ######################

            # predicted communities
            # comm_mat_est = get_block_mat(Pi_est > zero_thresh)
            # comm_mat_est[np.isnan(comm_mat_est)] = -1
            # res['n_blocks_est'] = int(np.nanmax(comm_mat_est) + 1)

            cl_report_no_out_tr, cl_report_restr_tr, n_out_tr = \
                get_comm_pred_summary(Pi_true=Pi_true, Y_true=Y_tr,
                                      mvmm=estimator, view_data=X_tr,
                                      comm_mat_est=comm_mat_est)

            cl_report_no_out_tst, cl_report_restr_tst, n_out_tst = \
                get_comm_pred_summary(Pi_true=Pi_true, Y_true=Y_tst,
                                      mvmm=estimator, view_data=X_tst,
                                      comm_mat_est=comm_mat_est)

            res = add_to_res(res, cl_report_no_out_tr,
                             stub='train_community')
            res = add_to_res(res, cl_report_no_out_tst,
                             stub='test_community')
            res = add_to_res(res, cl_report_restr_tst,
                             stub='test_community_restr')

            res['comm_est_train_n_out'] = n_out_tr
            res['comm_est_test_n_out'] = n_out_tst

            # comm_est_tr = []
            # comm_est_tst = []
            # for i in range(len(y_tr_pred)):
            #     y0, y1 = estimator._get_view_clust_idx(y_tr_pred[i])
            #     comm_est_tr.append(comm_mat_est[y0, y1])
            # for i in range(len(y_tst_pred)):
            #     y0, y1 = estimator._get_view_clust_idx(y_tst_pred[i])
            #     comm_est_tst.append(comm_mat_est[y0, y1])

            # res = add_cluster_report(res, comm_true_tr, comm_est_tr,
            #                          stub='train_community')
            # res = add_cluster_report(res, comm_true_tst, comm_est_tst,
            #                          stub='test_community')

            # res['train_ars_comm'] = adjusted_rand_score(comm_true_tr,
            #                                             comm_est_tr)
            # res['test_ars_comm'] = adjusted_rand_score(comm_true_tst,
            #                                            comm_est_tst)

            ############
            # TwoStage #
            ############
            if is_two_stage:
                Pi_est = start_est.weights_mat_

                # aligned accuracy
                means_true = [view_params_true[v]['means']
                              for v in range(n_views)]
                means_est = [start_est.view_models_[v].means_
                             for v in range(n_views)]

                res['start_pi_aligned_dist'] = \
                    get_aligned_pi_distance(Pi_true, Pi_est,
                                            means_true, means_est)

                # n_blocks
                comm_mat_est = get_block_mat(Pi_est > zero_thresh)
                comm_mat_est[np.isnan(comm_mat_est)] = -1
                # res['start_n_blocks_est'] = int(np.nanmax(comm_mat_est) + 1)
                res['start_n_blocks_est'] = get_n_blocks(comm_mat_est)

        # to_add.append()
        results_df = results_df.append(res, ignore_index=True)

    # # do model selection for each measure
    # for sel_measure in model_sel_measures:
    #     if MEASURE_MIN_GOOD[sel_measure]:
    #         best_pd_idx = results_df[sel_measure].idxmin()
    #     else:
    #         best_pd_idx = results_df[sel_measure].idxmax()
    #     best_tune_idx = results_df.loc[best_pd_idx]['tune_idx']
    #     results_df['{}_best_tune_idx'.format(sel_measure)] = best_tune_idx

    # bic_best_pd_idx = results_df['bic'].idxmin()
    # bic_best_tune_idx = results_df.loc[bic_best_pd_idx]['tune_idx']
    # results_df['bic_best_tune_idx'] = bic_best_tune_idx

    return results_df
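
# ---------------------------------------------------------------------------
# Hedged sketch (illustration only, not used by the pipeline above): a
# minimal version of what the `view_labs_to_overall` helper is assumed to do,
# namely collapse the (n_samples, n_views) matrix of per-view cluster labels
# into a single joint label per sample, with one label per observed
# combination of view labels. The real helper lives elsewhere in the package.
def _example_view_labs_to_overall(Y):
    import numpy as np

    # each distinct row of Y (one cluster label per view) becomes one
    # overall cluster label
    _, overall = np.unique(Y, axis=0, return_inverse=True)
    return overall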
def add_gs_results(results_df, sim_stub, model, model_name, dataset, view,
                   X_tr, Y_tr, X_tst, Y_tst,
                   Pi_true, view_params_true, zero_thresh=0):
    """
    Extracts results after fitting the model.

    Parameters
    ----------
    results_df: pd.DataFrame
        The dataframe containing the results.

    sim_stub: dict
        Dict containing simulation information.

    model:
        Grid search clustering model.

    model_name: str, ['mvmm', 'gmm']
        Name of model.

    dataset: str, ['full', 'view']
        Which dataset we are looking at -- the full dataset or only a
        single view.

    view: int, str
        Either 'both' or which view we are looking at.

    X_tr, Y_tr, X_tst, Y_tst:
        Training and test data and view labels.
    """
    # identifying information for results
    identif_stub = {'dataset': dataset,
                    'view': view,
                    'model': model_name}

    res = {}
    res.update(sim_stub)
    res.update(identif_stub)

    assert dataset in ['full', 'view']
    assert type(view) == int or view == 'both'
    # assert model_name in ['ts_gs_mvmm', 'gmm_cat'] or \
    #     'marginal_view' in model_name

    n_views = len(X_tr)
    n_samples = Y_tr.shape[0]  # TODO

    is_mvmm_model = is_mvmm(model)
    is_gs_model = isinstance(model, BaseGridSearch)

    # format X data
    if not is_mvmm_model and dataset == 'full':
        X_tr = np.hstack(X_tr)
        X_tst = np.hstack(X_tst)
    elif dataset == 'view':
        X_tr = X_tr[view]
        X_tst = X_tst[view]

    # format y data
    y_tr_overall = view_labs_to_overall(Y_tr)
    y_tst_overall = view_labs_to_overall(Y_tst)

    # if dataset == 'full':
    #     y_tr = view_labs_to_overall(Y_tr)
    #     y_tst = view_labs_to_overall(Y_tst)
    # else:
    #     y_tr = Y_tr[:, view]
    #     y_tst = Y_tst[:, view]

    # get true number of components
    n_comp_tot_true, n_comp_views_true = get_n_comp(Pi_true)

    # true communities
    # comm_mat_true[np.isnan(comm_mat_true)] = -1
    comm_mat_true = get_block_mat(Pi_true > 0)
    res['true_n_communities'] = int(np.nanmax(comm_mat_true) + 1)

    # comm_true_tr = [comm_mat_true[Y_tr[i, 0], Y_tr[i, 1]]
    #                 for i in range(Y_tr.shape[0])]
    # comm_true_tst = [comm_mat_true[Y_tst[i, 0], Y_tst[i, 1]]
    #                  for i in range(Y_tst.shape[0])]

    # add tuning parameter data
    if is_gs_model:
        n_tune_values = len(model.param_grid_)
        res['best_tuning_idx'] = model.best_idx_
        all_estimators = [model.estimators_[tune_idx]
                          for tune_idx in range(n_tune_values)]
    else:
        n_tune_values = 1
        res['best_tuning_idx'] = 0
        all_estimators = [model]

    # get results for every estimator
    for tune_idx in range(n_tune_values):

        #################
        # get model out #
        #################
        estimator = all_estimators[tune_idx]

        # get final est for two stage
        if isinstance(estimator, TwoStage):
            is_two_stage = True
            estimator = estimator.final_
            start_est = all_estimators[tune_idx].start_
        else:
            is_two_stage = False
            start_est = None

        # if it's a MVMM model and a marginal view dataset,
        # get the view marginal model
        if is_mvmm_model and dataset == 'view':
            # for view datasets use the view marginal part of the MVMM
            estimator = estimator.view_models_[view]

            if is_two_stage:
                start_est = start_est.view_models_[view]

        ###############
        # Get results #
        ###############

        # bureaucracy
        res['tune_idx'] = tune_idx

        # if grid search, get the tuning parameter values
        if is_gs_model:
            res['tuning_param_values'] = model.param_grid_[tune_idx]
        else:
            res['tuning_param_values'] = None

        # number of components this model is trying to estimate
        n_comp_est = estimator.n_components
        res['n_comp_est'] = n_comp_est

        if dataset == 'full':
            n_comp_true = n_comp_tot_true
        else:
            n_comp_true = n_comp_views_true[view]
        res['n_comp_resid'] = n_comp_true - n_comp_est

        if is_mvmm_model and dataset == 'full':
            res['n_comp_tot_est'] = estimator.n_components
            res['n_comp_views_est'] = estimator.n_view_components
        else:
            if dataset == 'full':
                res['n_comp_tot_est'] = n_comp_est
                res['n_comp_views_est'] = [None for _ in range(n_views)]
            else:
                res['n_comp_tot_est'] = n_comp_est
                res['n_comp_views_est'] = [None for _ in range(n_views)]
                res['n_comp_views_est'][view] = n_comp_est

        # fit time
        if hasattr(estimator, 'metadata_'):
            res['fit_time'] = estimator.metadata_['fit_time']

        ###################
        # Fitting results #
        ###################

        # model fitting measures
        res['bic'] = estimator.bic(X_tr)
        res['aic'] = estimator.aic(X_tr)

        # compute metrics on training data
        y_tr_pred = estimator.predict(X_tr)
        y_tst_pred = estimator.predict(X_tst)

        # compare to overall labeling
        res = add_to_res(res, cluster_report(y_tr_overall, y_tr_pred),
                         stub='train_overall')
        res = add_to_res(res, cluster_report(y_tst_overall, y_tst_pred),
                         stub='test_overall')

        # TODO: remove these
        # compare overall labels to community labels
        # TODO: not sure if this is valuable
        # res = add_cluster_report(res, y_tr_pred, comm_true_tr,
        #                          stub='train_overall_vs_community')
        # res = add_cluster_report(res, y_tst_pred, comm_true_tst,
        #                          stub='test_overall_vs_community')

        # res['train_ars_overall'] = adjusted_rand_score(y_tr_overall,
        #                                                y_tr_pred)
        # res['test_ars_overall'] = adjusted_rand_score(y_tst_overall,
        #                                               y_tst_pred)

        # compare to view specific labels
        for v in range(n_views):
            res = add_to_res(res, cluster_report(Y_tr[:, v], y_tr_pred),
                             stub='train_view_{}'.format(v))
            res = add_to_res(res, cluster_report(Y_tst[:, v], y_tst_pred),
                             stub='test_view_{}'.format(v))

            # res['train_ars_view_{}'.format(v)] = \
            #     adjusted_rand_score(Y_tr[:, v], y_tr_pred)
            # res['test_ars_view_{}'.format(v)] = \
            #     adjusted_rand_score(Y_tst[:, v], y_tst_pred)

        ############
        # TwoStage #
        ############
        if is_two_stage:
            y_tr_pred_start = start_est.predict(X_tr)
            y_tst_pred_start = start_est.predict(X_tst)

            res = add_to_res(res,
                             cluster_report(y_tr_overall, y_tr_pred_start),
                             stub='start_train_overall')
            res = add_to_res(res,
                             cluster_report(y_tst_overall, y_tst_pred_start),
                             stub='start_test_overall')

        # if MVMM on full data, get pi accuracy
        if is_mvmm_model and dataset == 'full':

            # get estimated Pi matrix
            if is_block_diag_mvmm(estimator):
                Pi_est = estimator.bd_weights_
                Pi_est /= Pi_est.sum()  # normalize
            else:
                Pi_est = estimator.weights_mat_

            # get community matrix for the block diagonal model
            if is_two_stage:
                comm_mat_est = get_block_mat(Pi_est > zero_thresh)
                res['est_n_communities'] = int(np.nanmax(comm_mat_est) + 1)
            else:  # full MVMM
                # for the full MVMM run spectral bipartite partitioning
                # with the true number of blocks
                n_blocks = res['true_n_communities']
                comm_mat_est = run_bipt_spect_partitioning(Pi_est, n_blocks)
                res['est_n_communities'] = np.nan

            #####################
            # Pi graph accuracy #
            #####################
            res['pi_graph_acc_norm'] = \
                get_pi_acc(Pi_est, Pi_true,
                           method='random_walk',
                           normalize=True,
                           method_type='fast',
                           kernel_type='exponential')

            res['pi_graph_acc_unnorm'] = \
                get_pi_acc(Pi_est, Pi_true,
                           method='random_walk',
                           normalize=False,
                           method_type='fast',
                           kernel_type='exponential')

            #######################
            # Pi aligned accuracy #
            #######################
            means_true = [view_params_true[v]['means']
                          for v in range(n_views)]
            means_est = [estimator.view_models_[v].means_
                         for v in range(n_views)]

            res['pi_aligned_dist'] = \
                get_aligned_pi_distance(Pi_true, Pi_est,
                                        means_true, means_est)

            ######################
            # community accuracy #
            ######################

            # predicted communities
            # comm_mat_est = get_block_mat(Pi_est > zero_thresh)
            # comm_mat_est[np.isnan(comm_mat_est)] = -1
            # res['est_n_communities'] = int(np.nanmax(comm_mat_est) + 1)
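            # Community accuracy: compare the true block labels implied by
            # Pi_true with the blocks implied by comm_mat_est.
            # get_comm_pred_summary is assumed to return a cluster report
            # that drops points whose predicted component falls outside
            # every estimated block, a report restricted to within-block
            # points, and the count of those "outside" points.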
            cl_report_no_out_tr, cl_report_restr_tr, n_out_tr = \
                get_comm_pred_summary(Pi_true=Pi_true, Y_true=Y_tr,
                                      mvmm=estimator, view_data=X_tr,
                                      comm_mat_est=comm_mat_est)

            cl_report_no_out_tst, cl_report_restr_tst, n_out_tst = \
                get_comm_pred_summary(Pi_true=Pi_true, Y_true=Y_tst,
                                      mvmm=estimator, view_data=X_tst,
                                      comm_mat_est=comm_mat_est)

            res = add_to_res(res, cl_report_no_out_tr,
                             stub='train_community')
            res = add_to_res(res, cl_report_no_out_tst,
                             stub='test_community')
            res = add_to_res(res, cl_report_restr_tst,
                             stub='test_community_restr')

            res['comm_est_train_n_out'] = n_out_tr
            res['comm_est_test_n_out'] = n_out_tst

            # comm_est_tr = []
            # comm_est_tst = []
            # for i in range(len(y_tr_pred)):
            #     y0, y1 = estimator._get_view_clust_idx(y_tr_pred[i])
            #     comm_est_tr.append(comm_mat_est[y0, y1])
            # for i in range(len(y_tst_pred)):
            #     y0, y1 = estimator._get_view_clust_idx(y_tst_pred[i])
            #     comm_est_tst.append(comm_mat_est[y0, y1])

            # res = add_cluster_report(res, comm_true_tr, comm_est_tr,
            #                          stub='train_community')
            # res = add_cluster_report(res, comm_true_tst, comm_est_tst,
            #                          stub='test_community')

            # res['train_ars_comm'] = adjusted_rand_score(comm_true_tr,
            #                                             comm_est_tr)
            # res['test_ars_comm'] = adjusted_rand_score(comm_true_tst,
            #                                            comm_est_tst)

            ############
            # TwoStage #
            ############
            if is_two_stage:
                Pi_est = start_est.weights_mat_

                # aligned accuracy
                means_true = [view_params_true[v]['means']
                              for v in range(n_views)]
                means_est = [start_est.view_models_[v].means_
                             for v in range(n_views)]

                res['start_pi_aligned_dist'] = \
                    get_aligned_pi_distance(Pi_true, Pi_est,
                                            means_true, means_est)

                # n_communities
                comm_mat_est = get_block_mat(Pi_est > zero_thresh)
                comm_mat_est[np.isnan(comm_mat_est)] = -1
                res['start_est_n_communities'] = int(
                    np.nanmax(comm_mat_est) + 1)

        results_df = results_df.append(res, ignore_index=True)

    return results_df
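
# ---------------------------------------------------------------------------
# Hedged sketch (illustration only): `get_block_mat` is assumed to label the
# connected blocks of the bipartite graph defined by the support of Pi, so
# that entry (j, k) gives the block containing view-0 cluster j and view-1
# cluster k (NaN off the support). A minimal version using scipy could look
# like the following; the real helper lives elsewhere in the package.
def _example_get_block_mat(support):
    import numpy as np
    from scipy.sparse import csr_matrix
    from scipy.sparse.csgraph import connected_components

    n_rows, n_cols = support.shape

    # adjacency matrix of the bipartite graph whose nodes are the view-0
    # and view-1 clusters, with an edge wherever Pi has support
    adj = np.zeros((n_rows + n_cols, n_rows + n_cols))
    adj[:n_rows, n_rows:] = support
    adj[n_rows:, :n_rows] = support.T
    _, node_labels = connected_components(csr_matrix(adj), directed=False)

    # entry (j, k) is the block containing edge (j, k); NaN off the support
    block_mat = np.full((n_rows, n_cols), np.nan)
    rows, cols = np.where(support)
    block_mat[rows, cols] = node_labels[rows]
    return block_mat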
def run_sim(models, data_dist, Pi, view_params, n_samples_tr, data_seed,
            n_samples_tst=2000, zero_thresh=0, reg_covar_mult=1e-2,
            mc_index=None, to_exclude=None, log_fpath=None):
    """
    Parameters
    ----------
    models: dict
        The models to fit (e.g. 'cat_gmm', 'view_gmms', 'full_mvmm',
        'log_pen_mvmm', 'bd_mvmm', 'sp_mvmm', 'oracle', 'clf').

    data_dist: callable(n_samples, seed)
        Function to generate data.

    n_samples_tr: int
        Number of training samples.

    data_seed: int
        Seed for sampling train/test observations.

    n_samples_tst: int
        Number of samples to get for test data.

    zero_thresh: float
        Threshold for treating an entry of Pi as zero.

    reg_covar_mult: float
        Multiplier for the default covariance regularization.

    mc_index: int, None
        Index of this Monte Carlo repetition.

    to_exclude: list, None
        Names of models to skip fitting.

    log_fpath: str, None
        Where to write the progress log.
    """
    res_writer = ResultsWriter(log_fpath, delete_if_exists=True)
    res_writer.write("Beginning simulation at {}".format(get_current_time()))
    overall_start_time = time()

    seeds = get_seeds(random_state=data_seed, n_seeds=2)

    # sample data
    X_tr, Y_tr = data_dist(n_samples=n_samples_tr, random_state=seeds[0])
    X_tst, Y_tst = data_dist(n_samples=n_samples_tst, random_state=seeds[1])
    n_views = len(X_tr)

    Pi_empirical = get_empirical_pi(Y_tr, Pi.shape, scale='counts')

    runtimes = {}

    if to_exclude is None:
        to_exclude = []
    for m in to_exclude:
        assert m in ['bd_mvmm', 'sp_mvmm', 'log_pen_mvmm']

    #############################
    # covariance regularization #
    #############################
    n_views = len(X_tr)
    reg_covar = {}

    # set cov reg for each view
    for v in range(n_views):
        reg = default_cov_regularization(X=X_tr[v], mult=reg_covar_mult)

        models['view_gmms'][v].base_estimator.set_params(reg_covar=reg)

        models['full_mvmm'].base_view_models[v].set_params(reg_covar=reg)

        models['bd_mvmm'].base_estimator.base_start.base_view_models[v].\
            set_params(reg_covar=reg)
        models['bd_mvmm'].base_estimator.base_final.base_view_models[v].\
            set_params(reg_covar=reg)

        models['log_pen_mvmm'].base_estimator.base_start.base_view_models[v].\
            set_params(reg_covar=reg)
        models['log_pen_mvmm'].base_estimator.base_final.base_view_models[v].\
            set_params(reg_covar=reg)

        models['sp_mvmm'].base_mvmm_0.base_view_models[v].\
            set_params(reg_covar=reg)
        models['sp_mvmm'].base_wbd_mvmm.base_view_models[v].\
            set_params(reg_covar=reg)

        # print and save
        reg_covar[v] = reg
        res_writer.write(
            "\nCovariance regularization for view {} is {}".format(v, reg))
        stds = X_tr[v].std(axis=0)
        res_writer.write("Smallest variance: {}".format(stds.min() ** 2))
        res_writer.write("Largest variance: {}".format(stds.max() ** 2))

    # for cat GMM
    reg = default_cov_regularization(X=np.hstack(X_tr), mult=reg_covar_mult)
    models['cat_gmm'].base_estimator.set_params(reg_covar=reg)
    reg_covar['cat_gmm'] = reg

    ##############
    # fit models #
    ##############

    # get classification results
    clf_results = {}
    start_time = time()
    clf_results['cat'] = clf_fit_and_score(clone(models['clf']),
                                           X_tr=np.hstack(X_tr),
                                           y_tr=view_labs_to_overall(Y_tr),
                                           X_tst=np.hstack(X_tst),
                                           y_tst=view_labs_to_overall(Y_tst))
    runtimes['cat'] = time() - start_time

    for v in range(n_views):
        start_time = time()
        clf_results['view_{}'.format(v)] = \
            clf_fit_and_score(clone(models['clf']),
                              X_tr=X_tr[v], y_tr=Y_tr[:, v],
                              X_tst=X_tst[v], y_tst=Y_tst[:, v])
        runtimes['clf_view_{}'.format(v)] = time() - start_time

    # fit clustering
    simplefilter('ignore', ConvergenceWarning)

    results_df = pd.DataFrame()
    sim_stub = {'mc_index': mc_index, 'n_samples': n_samples_tr}

    dists_cat = pairwise_distances(X=np.hstack(X_tr))
    dists_views = [pairwise_distances(X=X_tr[v]) for v in range(n_views)]

    kws = {'sim_stub': sim_stub,
           'X_tr': X_tr,
           'Y_tr': Y_tr,
           'X_tst': X_tst,
           'Y_tst': Y_tst,
           'Pi_true': Pi,
           'view_params_true': view_params,
           'zero_thresh': zero_thresh}

    ###########
    # cat-GMM #
    ###########
    # print('start fitting cat-GMM at {}'.
    #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
    start_time = time()
    models['cat_gmm'].fit(np.hstack(X_tr))
    runtimes['cat_gmm'] = time() - start_time
    res_writer.write('fitting grid search cat-GMM took {:1.2f} seconds'.
                     format(runtimes['cat_gmm']))

    results_df = add_gs_results(results_df=results_df,
                                model=models['cat_gmm'],
                                model_name='gmm_cat',
                                dataset='full', view='both',
                                X_tr_precomp_dists=dists_cat,
                                **kws)

    #############
    # View GMMs #
    #############
    # print('start fitting view marginal GMMs at {}'.
    #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
    for v in range(n_views):
        start_time = time()
        models['view_gmms'][v].fit(X_tr[v])
        runtimes['gmm_view_{}'.format(v)] = time() - start_time
        res_writer.write(
            'fitting marginal view {} GMM took {:1.2f} seconds'.format(
                v, runtimes['gmm_view_{}'.format(v)]))

        # gmm fit on this view
        results_df = add_gs_results(results_df=results_df,
                                    model=models['view_gmms'][v],
                                    model_name='marginal_view_{}'.format(v),
                                    dataset='view', view=v,
                                    X_tr_precomp_dists=dists_views[v],
                                    **kws)

    #############
    # Full MVMM #
    #############
    # print('start fitting full MVMM at {}'.
    #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
    start_time = time()
    models['full_mvmm'].fit(X_tr)
    runtimes['full_mvmm'] = time() - start_time
    res_writer.write('fitting full mvmm took {:1.2f} seconds'.format(
        runtimes['full_mvmm']))

    results_df = add_gs_results(results_df=results_df,
                                model=models['full_mvmm'],
                                model_name='full_mvmm',
                                run_biptsp_on_full=True,
                                dataset='full', view='both',
                                X_tr_precomp_dists=dists_cat,
                                **kws)

    for v in range(n_views):
        # add MVMM results for this view
        results_df = add_gs_results(
            results_df=results_df,
            model=models['full_mvmm'],
            model_name='full_mvmm',
            dataset='view', view=v,
            X_tr_precomp_dists=dists_views[v],  # TODO: is this what we want
            **kws)

    ################
    # log pen MVMM #
    ################
    if 'log_pen_mvmm' not in to_exclude:
        # print('start fitting log pen grid search MVMM at {}'.
        #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
        start_time = time()
        models['log_pen_mvmm'].fit(X_tr)
        runtimes['log_pen_mvmm'] = time() - start_time
        res_writer.write('fitting grid search for log pen '
                         'mvmm took {:1.2f} seconds'.format(
                             runtimes['log_pen_mvmm']))

        results_df = add_gs_results(results_df=results_df,
                                    model=models['log_pen_mvmm'],
                                    model_name='log_pen_mvmm',
                                    dataset='full', view='both',
                                    X_tr_precomp_dists=dists_cat,
                                    **kws)

        for v in range(n_views):
            # add log pen MVMM results for this view
            results_df = add_gs_results(
                results_df=results_df,
                model=models['log_pen_mvmm'],
                model_name='log_pen_mvmm',
                dataset='view', view=v,
                X_tr_precomp_dists=dists_views[v],  # TODO: is this what we want
                **kws)

    #######################
    # block diagonal MVMM #
    #######################
    if 'bd_mvmm' not in to_exclude:
        # print('start fitting block diag grid search MVMM at {}'.
        #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
        start_time = time()
        models['bd_mvmm'].fit(X_tr)
        runtimes['bd_mvmm'] = time() - start_time
        res_writer.write('fitting grid search for block '
                         'diag mvmm took {:1.2f} seconds'.format(
                             runtimes['bd_mvmm']))

        results_df = add_gs_results(results_df=results_df,
                                    model=models['bd_mvmm'],
                                    model_name='bd_mvmm',
                                    dataset='full', view='both',
                                    X_tr_precomp_dists=dists_cat,
                                    **kws)

        for v in range(n_views):
            # add bd MVMM results for this view
            results_df = add_gs_results(
                results_df=results_df,
                model=models['bd_mvmm'],
                model_name='bd_mvmm',
                dataset='view', view=v,
                X_tr_precomp_dists=dists_views[v],  # TODO: is this what we want
                **kws)

    #########################
    # spectral penalty MVMM #
    #########################
    if 'sp_mvmm' not in to_exclude:
        # print('start fitting spectral penalty MVMM at {}'.
        #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
        start_time = time()
        models['sp_mvmm'].fit(X_tr)
        runtimes['sp_mvmm'] = time() - start_time
        res_writer.write('fitting grid search for spect pen '
                         'mvmm took {:1.2f} seconds'.format(
                             runtimes['sp_mvmm']))

        results_df = add_gs_results(results_df=results_df,
                                    model=models['sp_mvmm'],
                                    model_name='sp_mvmm',
                                    dataset='full', view='both',
                                    X_tr_precomp_dists=dists_cat,
                                    **kws)

        for v in range(n_views):
            # add sp MVMM results for this view
            results_df = add_gs_results(
                results_df=results_df,
                model=models['sp_mvmm'],
                model_name='sp_mvmm',
                dataset='view', view=v,
                X_tr_precomp_dists=dists_views[v],  # TODO: is this what we want
                **kws)

    ##########
    # oracle #
    ##########
    results_df = add_gs_results(results_df,
                                model=models['oracle'],
                                model_name='oracle',
                                run_biptsp_on_full=False,  # or true?
                                dataset='full', view='both',
                                **kws)

    # Some formatting:
    # ensure these columns are saved as integers
    int_cols = ['mc_index', 'best_tuning_idx', 'n_comp_est', 'n_comp_resid',
                'n_comp_tot_est', 'n_samples', 'tune_idx']
    results_df[int_cols] = results_df[int_cols].astype(int)

    # block diagonal summary of Pi estimates
    bd_summary = {}
    if 'log_pen_mvmm' not in to_exclude:
        _sim_stub = deepcopy(sim_stub)
        _sim_stub.update({'model_name': 'log_pen_mvmm'})
        bd_summary['log_pen_mvmm'] = \
            get_bd_summary_for_gs(_sim_stub, models['log_pen_mvmm'],
                                  zero_thresh=zero_thresh)

    if 'bd_mvmm' not in to_exclude:
        _sim_stub = deepcopy(sim_stub)
        _sim_stub.update({'model_name': 'bd_mvmm'})
        bd_summary['bd_mvmm'] = \
            get_bd_summary_for_gs(_sim_stub, models['bd_mvmm'],
                                  zero_thresh=zero_thresh)

    if 'sp_mvmm' not in to_exclude:
        _sim_stub = deepcopy(sim_stub)
        _sim_stub.update({'model_name': 'sp_mvmm'})
        bd_summary['sp_mvmm'] = \
            get_bd_summary_for_gs(_sim_stub, models['sp_mvmm'],
                                  zero_thresh=zero_thresh)

    res_writer.write("Entire simulation took {:1.2f} seconds".format(
        time() - overall_start_time))

    tr_data = {'X_tr': X_tr, 'Y_tr': Y_tr}

    return results_df, clf_results, models, bd_summary, Pi_empirical, \
        tr_data, runtimes
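
# ---------------------------------------------------------------------------
# Hedged sketch (illustration only): `get_empirical_pi` used in run_sim is
# assumed to tabulate how often each pair of true view labels co-occurs in
# the training sample, returning a matrix with the same shape as Pi; with
# scale='counts' the entries are raw co-occurrence counts. Two views are
# assumed here for simplicity.
def _example_get_empirical_pi(Y, shape):
    import numpy as np

    counts = np.zeros(shape)
    for y0, y1 in Y:
        # Y[i] = (view-0 cluster label, view-1 cluster label) of sample i
        counts[y0, y1] += 1
    return counts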
def run_sim(full_mvmm, ts_gs_mvmm, cat_gmm, view_gmms, clf,
            data_dist, Pi, view_params, n_samples_tr, data_seed,
            mc_index=None, n_samples_tst=2000, zero_thresh=0):
    """
    Parameters
    ----------
    full_mvmm:
        Multi-view mixture model fit on the full data.

    ts_gs_mvmm:
        Multi-view mixture model grid search.

    cat_gmm:
        GMM grid search for concatenated data.

    view_gmms: list
        GMMs for each view.

    clf:
        Classifier for concatenated data.

    data_dist: callable(n_samples, seed)
        Function to generate data.

    n_samples_tr: int
        Number of training samples.

    data_seed: int
        Seed for sampling train/test observations.

    n_samples_tst: int
        Number of samples to get for test data.
    """
    seeds = get_seeds(random_state=data_seed, n_seeds=2)

    # sample data
    X_tr, Y_tr = data_dist(n_samples=n_samples_tr, random_state=seeds[0])
    X_tst, Y_tst = data_dist(n_samples=n_samples_tst, random_state=seeds[1])
    n_views = len(X_tr)

    Pi_empirical = get_empirical_pi(Y_tr, Pi.shape, scale='counts')

    runtimes = {}

    # get classification results
    clf_results = {}
    start_time = time()
    clf_results['cat'] = clf_fit_and_score(clone(clf),
                                           X_tr=np.hstack(X_tr),
                                           y_tr=view_labs_to_overall(Y_tr),
                                           X_tst=np.hstack(X_tst),
                                           y_tst=view_labs_to_overall(Y_tst))
    runtimes['clf_cat'] = time() - start_time

    for v in range(n_views):
        start_time = time()
        clf_results['view_{}'.format(v)] = \
            clf_fit_and_score(clone(clf),
                              X_tr=X_tr[v], y_tr=Y_tr[:, v],
                              X_tst=X_tst[v], y_tst=Y_tst[:, v])
        runtimes['clf_view_{}'.format(v)] = time() - start_time

    # fit clustering
    simplefilter('ignore', ConvergenceWarning)

    # print('start fitting full MVMM at {}'.
    #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
    start_time = time()
    full_mvmm.fit(X_tr)
    runtimes['full_mvmm'] = time() - start_time
    print('fitting full mvmm took {:1.2f} seconds'.format(
        runtimes['full_mvmm']))

    # print('start fitting two stage grid search MVMM at {}'.
    #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
    start_time = time()
    ts_gs_mvmm.fit(X_tr)
    runtimes['ts_gs_mvmm'] = time() - start_time
    print('fitting grid search mvmm took {:1.2f} seconds'.format(
        runtimes['ts_gs_mvmm']))

    # print('start fitting cat-GMM at {}'.
    #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
    start_time = time()
    cat_gmm.fit(np.hstack(X_tr))
    runtimes['cat_gmm'] = time() - start_time
    print('fitting grid search cat-GMM took {:1.2f} seconds'.format(
        runtimes['cat_gmm']))

    # print('start fitting view marginal GMMs at {}'.
    #       format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))
    for v in range(n_views):
        start_time = time()
        view_gmms[v].fit(X_tr[v])
        runtimes['gmm_view_{}'.format(v)] = time() - start_time
        print('fitting marginal view {} GMM took {:1.2f} seconds'.format(
            v, runtimes['gmm_view_{}'.format(v)]))

    fit_models = {'full_mvmm': full_mvmm,
                  'ts_gs_mvmm': ts_gs_mvmm,
                  'cat_gmm': cat_gmm,
                  'view_gmms': view_gmms}

    results_df = pd.DataFrame()
    sim_stub = {'mc_index': mc_index, 'n_samples': n_samples_tr}

    kws = {'sim_stub': sim_stub,
           'X_tr': X_tr,
           'Y_tr': Y_tr,
           'X_tst': X_tst,
           'Y_tst': Y_tst,
           'Pi_true': Pi,
           'view_params_true': view_params,
           'zero_thresh': zero_thresh}

    start_time = time()

    # add MVMM
    results_df = add_gs_results(results_df=results_df,
                                model=full_mvmm,
                                model_name='full_mvmm',
                                dataset='full', view='both',
                                **kws)

    results_df = add_gs_results(results_df=results_df,
                                model=ts_gs_mvmm,
                                model_name='ts_gs_mvmm',
                                dataset='full', view='both',
                                **kws)
    # pi_ests = get_pi_ests(mvmm)

    # add GMM on concat data
    results_df = add_gs_results(results_df=results_df,
                                model=cat_gmm,
                                model_name='gmm_cat',
                                dataset='full', view='both',
                                **kws)

    for v in range(n_views):
        # add MVMM results for this view
        results_df = add_gs_results(results_df=results_df,
                                    model=full_mvmm,
                                    model_name='full_mvmm',
                                    dataset='view', view=v,
                                    **kws)

        # add MVMM results for this view
        results_df = add_gs_results(results_df=results_df,
                                    model=ts_gs_mvmm,
                                    model_name='ts_gs_mvmm',
                                    dataset='view', view=v,
                                    **kws)

        # gmm fit on this view
        results_df = add_gs_results(results_df=results_df,
                                    model=view_gmms[v],
                                    model_name='marginal_view_{}'.format(v),
                                    dataset='view', view=v,
                                    **kws)

    # ensure these columns are saved as integers
    int_cols = ['mc_index', 'best_tuning_idx', 'n_comp_est', 'n_comp_resid',
                'n_comp_tot_est', 'n_samples', 'tune_idx']
    results_df[int_cols] = results_df[int_cols].astype(int)

    if is_block_diag_mvmm(ts_gs_mvmm):
        bd_results = get_bd_results(sim_stub, ts_gs_mvmm,
                                    zero_thresh=zero_thresh)
    else:
        bd_results = None

    print('getting the results took {:1.2f} seconds'.format(
        time() - start_time))

    return results_df, clf_results, fit_models, bd_results, Pi_empirical, \
        runtimes
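
# ---------------------------------------------------------------------------
# Hedged usage sketch (illustration only): how a Monte Carlo driver might
# call this version of run_sim. The model objects and the `sample_data`
# generator are placeholders for whatever the surrounding experiment
# scripts construct.
#
#     for mc_index in range(n_mc_reps):
#         out = run_sim(full_mvmm=full_mvmm, ts_gs_mvmm=ts_gs_mvmm,
#                       cat_gmm=cat_gmm, view_gmms=view_gmms, clf=clf,
#                       data_dist=sample_data, Pi=Pi,
#                       view_params=view_params,
#                       n_samples_tr=500, data_seed=mc_index,
#                       mc_index=mc_index)
#         (results_df, clf_results, fit_models,
#          bd_results, Pi_empirical, runtimes) = out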