def fit_penalized(self, train_set, penalty_params, max_em_iters, val_set_evaluator=None, init_theta=None, reference_pen_param=None, pool=None):
    """
    Fit a penalized model to `train_set` via EM and package it as a MethodResults.

    @param train_set: the data the EM algorithm trains on
    @param penalty_params: penalty parameter for fitting penalized model
    @param max_em_iters: maximum number of EM iterations to run
    @param val_set_evaluator: LikelihoodComparer with a given reference model
    @param init_theta: optional warm-start theta; freshly initialized when None
    @param reference_pen_param: the penalty parameters for the reference model
    @param pool: optional worker pool forwarded to the EM algorithm
    @return the fitted model after the 2-step procedure
    """
    if init_theta is None:
        # No warm start supplied: build an initial theta from the configured
        # shape and the possible/zero masks.
        init_theta = initialize_theta(self.theta_shape, self.possible_theta_mask, self.zero_theta_mask)
    # Run the penalized EM fit; only the fitted theta is kept here.
    penalized_theta, _, _, _ = self.em_algo.run(
        train_set,
        self.feat_generator,
        theta=init_theta,
        possible_theta_mask=self.possible_theta_mask,
        zero_theta_mask=self.zero_theta_mask,
        burn_in=self.burn_in,
        penalty_params=penalty_params,
        max_em_iters=max_em_iters,
        # E-step sampling is capped at 4x the configured per-step sample count.
        max_e_samples=self.num_e_samples * 4,
        pool=pool,
    )
    curr_model_results = MethodResults(penalty_params)
    #### Calculate validation log likelihood (EM surrogate), use to determine if model is any good.
    log_lik_ratio_lower_bound, log_lik_ratio = self._do_validation_set_checks(
        penalized_theta,
        val_set_evaluator,
    )
    # Record the fitted theta together with its validation ratios and the
    # support mask derived from the fitted values.
    curr_model_results.set_penalized_theta(
        penalized_theta,
        log_lik_ratio_lower_bound,
        log_lik_ratio,
        model_masks=ModelTruncation(penalized_theta, self.feat_generator),
        reference_penalty_param=reference_pen_param,
    )
    log.info("==== Penalized theta, %s, nonzero %d ====" % (penalty_params, curr_model_results.penalized_num_nonzero))
    log.info(get_nonzero_theta_print_lines(penalized_theta, self.feat_generator))
    return curr_model_results
def main(argv): num_threads = 1 num_runs = 30 try: opts, args = getopt.getopt(argv,"f:z:a:b:c:s:m:r:t:") except getopt.GetoptError: print "Bad Arguments to python script" sys.exit(2) settings = Elastic_Net_Settings() for opt, arg in opts: if opt == '-f': settings.num_features = int(arg) elif opt == '-z': settings.num_nonzero_features = int(arg) elif opt == '-a': settings.train_size = int(arg) elif opt == '-b': settings.validate_size = int(arg) elif opt == '-c': settings.test_size = int(arg) elif opt == "-s": settings.snr = float(arg) elif opt == "-m": assert(arg in METHODS) settings.method = arg elif opt == "-t": num_threads = int(arg) elif opt == "-r": num_runs = int(arg) settings.print_settings() sys.stdout.flush() data_gen = DataGenerator(settings) run_data = [] for i in range(num_runs): observed_data = data_gen.make_correlated(settings.num_features, settings.num_nonzero_features) run_data.append(Iteration_Data(i, observed_data, settings)) if settings.method not in ["SP", "SP0"] and num_threads > 1: print "Do multiprocessing" pool = Pool(num_threads) results = pool.map(fit_data_for_iter_safe, run_data) else: print "Avoiding multiprocessing" results = map(fit_data_for_iter_safe, run_data) method_results = MethodResults(settings.method, settings.method_result_keys) num_crashes = 0 for r in results: if r is not None: method_results.append(r) else: num_crashes += 1 print "==========TOTAL RUNS %d============" % method_results.get_num_runs() method_results.print_results() print "num crashes %d" % num_crashes
def main(argv):
    """Real-data evaluation driver: fit the chosen method on shuffled
    gene-expression replicates and print the aggregated results.

    Flags: -m method ("HC" or "GS"), -t worker threads, -r number of runs.
    """
    # Fixed seed so replicates are reproducible across invocations.
    seed = 10
    print "seed", seed
    np.random.seed(seed)
    num_threads = 1
    num_runs = 1
    try:
        opts, args = getopt.getopt(argv, "m:t:r:")
    except getopt.GetoptError:
        print "Bad argument given to realdata_eval.py"
        sys.exit(2)
    settings = RealDataSettings()
    for opt, arg in opts:
        if opt == "-m":
            assert (arg in ["HC", "GS"])
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
    print "TOTAL NUM RUNS %d" % num_runs
    sys.stdout.flush()
    # Load the gene-expression data (one matrix per geneset), then normalize.
    geneset_dict = read_geneset_file()
    X_genesets, y, genesets = read_gene_expr_data(geneset_dict)
    print "num features", sum(
        [X_genesets[i].shape[1] for i in range(0, len(X_genesets))])
    print "total genesets ever", len(X_genesets)
    X_genesets = normalize_data(X_genesets)
    # Each replicate gets its own shuffled copy of the data.
    run_data = []
    for i in range(num_runs):
        data = Shuffled_Gene_Data(X_genesets, y, genesets)
        run_data.append(Iteration_Data(i, data, settings))
    if num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)
    # Aggregate successful fits; a None result marks a crashed replicate.
    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
def main(argv): num_threads = 1 num_runs = 1 try: opts, args = getopt.getopt(argv,"m:t:r:") except getopt.GetoptError: print "Bad argument given to realdata_eval.py" sys.exit(2) settings = Simulation_Settings() settings.results_folder = "results/realdata" for opt, arg in opts: if opt == "-m": assert(arg in ["HC", "GS"]) settings.method = arg elif opt == "-t": num_threads = int(arg) elif opt == "-r": num_runs = int(arg) print "TOTAL NUM RUNS %d" % num_runs sys.stdout.flush() geneset_dict = read_geneset_file() X_genesets, y, genesets = read_gene_expr_data(geneset_dict) print "num features", sum([X_genesets[i].shape[1] for i in range(0, len(X_genesets))]) print "total genesets ever", len(X_genesets) X_genesets = normalize_data(X_genesets) run_data = [] for i in range(num_runs): data = Shuffled_Gene_Data(X_genesets, y, genesets) run_data.append(Iteration_Data(i, data, settings)) if num_threads > 1: print "Do multiprocessing" pool = Pool(num_threads) results = pool.map(fit_data_for_iter_safe, run_data) else: print "Avoiding multiprocessing" results = map(fit_data_for_iter_safe, run_data) method_results = MethodResults(settings.method, settings.method_result_keys) num_crashes = 0 for r in results: if r is not None: method_results.append(r) else: num_crashes += 1 print "==========TOTAL RUNS %d============" % method_results.get_num_runs() method_results.print_results() print "num crashes %d" % num_crashes
def main(argv):
    """Sparse-group-lasso evaluation driver: simulate grouped sparse data,
    fit each replicate with the chosen method, and print aggregated results.
    """
    # Fixed seed for reproducible simulations.
    seed = 10
    print "seed", seed
    np.random.seed(seed)
    num_threads = 1
    num_runs = 1
    try:
        # NOTE(review): the optstring also accepts a bare "-i" flag, but no
        # branch below consumes it — confirm whether it was meant to set an
        # init option or is dead.
        opts, args = getopt.getopt(argv, "g:f:a:b:c:s:m:t:r:i")
    except getopt.GetoptError:
        print "Bad argument given to sgl_eval.py"
        sys.exit(2)
    settings = SGL_Settings()
    for opt, arg in opts:
        if opt == '-g':
            settings.expert_num_groups = int(arg)
        elif opt == '-f':
            settings.num_features = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert (arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()
    # One simulated grouped-sparse dataset per replicate.
    data_gen = DataGenerator(settings)
    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.sparse_groups()
        run_data.append(Iteration_Data(i, observed_data, settings))
    # The "SP" method is always run single-process.
    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)
    # Aggregate successful fits; a None result marks a crashed replicate.
    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
def main(argv):
    """Matrix-completion evaluation driver: parse flags, simulate observed
    matrices, fit each replicate with the chosen method, and print results.

    Comma-separated flags: -d rows,cols; -z nonzero row,col features;
    -f row,col feature counts; -a train,validate,test fractions.
    """
    # Fixed seed for reproducible simulations.
    seed = 10
    print "seed", seed
    np.random.seed(seed)
    num_threads = 1
    num_runs = 1
    try:
        opts, args = getopt.getopt(argv, "d:z:f:a:v:s:m:t:r:i")
    except getopt.GetoptError:
        print "Bad argument given to Matrix_Completion_eval.py"
        sys.exit(2)
    settings = Matrix_Completion_Settings()
    for opt, arg in opts:
        if opt == '-d':
            arg_split = arg.split(",")
            settings.num_rows = int(arg_split[0])
            settings.num_cols = int(arg_split[1])
        elif opt == '-z':
            arg_split = arg.split(",")
            settings.num_nonzero_row_features = int(arg_split[0])
            settings.num_nonzero_col_features = int(arg_split[1])
        elif opt == '-f':
            arg_split = arg.split(",")
            settings.num_row_features = int(arg_split[0])
            settings.num_col_features = int(arg_split[1])
        elif opt == '-a':
            arg_split = arg.split(",")
            settings.train_perc = float(arg_split[0])
            settings.validate_perc = float(arg_split[1])
            settings.test_perc = float(arg_split[2])
            # Fractions must leave some of the matrix unobserved.
            assert (settings.train_perc + settings.validate_perc +
                    settings.test_perc < 1)
        elif opt == "-v":
            settings.num_nonzero_s = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert (arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
        elif opt == "-i":
            settings.big_init_set = True
    # Rank of the signal cannot exceed either matrix dimension.
    assert (settings.num_nonzero_s <= settings.num_rows
            and settings.num_nonzero_s <= settings.num_cols)
    # SP does not care about initialization
    assert (not (settings.big_init_set == True
                 and settings.method in ["SP", "SP0"]))
    # Convert the observed fractions into absolute entry counts.
    settings.matrix_size = settings.num_rows * settings.num_cols
    settings.train_size = int(settings.train_perc * settings.matrix_size)
    settings.validate_size = int(settings.validate_perc * settings.matrix_size)
    settings.test_size = int(settings.test_perc * settings.matrix_size)
    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()
    # One simulated observed matrix per replicate.
    data_gen = DataGenerator(settings)
    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.matrix_completion()
        run_data.append(Iteration_Data(i, observed_data, settings))
    # The "SP" method is always run single-process.
    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)
    # Aggregate successful fits; a None result marks a crashed replicate.
    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
import gridsearch_interaction_effects GENERATE_PLOT = False NUM_RUNS = 15 # TRAIN_SIZE = 70 # NUM_EFFECTS = 40 # NUM_NONZERO_EFFECTS = 3 # NUM_NONZERO_INTERACTIONS = 300 TRAIN_SIZE = 40 NUM_EFFECTS = 20 NUM_NONZERO_EFFECTS = 3 NUM_NONZERO_INTERACTIONS = 40 hc_results = MethodResults("Hillclimb") mu_results = MethodResults("MU") gs_results = MethodResults("Gridsearch") for i in range(0, NUM_RUNS): beta_real, theta_real, X_train, W_train, y_train, X_validate, W_validate, y_validate, X_test, W_test, y_test = \ effects_and_interactions(TRAIN_SIZE, NUM_EFFECTS, NUM_NONZERO_EFFECTS, NUM_NONZERO_INTERACTIONS) def _get_test_beta_theta_err(beta_guess, theta_guess): test_err = testerror_interactions(X_test, W_test, y_test, beta_guess, theta_guess) / y_test.size * 2 beta_err = betaerror(beta_real, beta_guess) theta_err = betaerror(theta_guess, theta_real) return (test_err, beta_err, theta_err) hc_beta_guess, hc_theta_guess, hc_costpath = hillclimb_interaction_effects.run(X_train, W_train, y_train, X_validate, W_validate, y_validate) hc_results.append_test_beta_theta_err(_get_test_beta_theta_err(hc_beta_guess, hc_theta_guess))
def main(argv):
    """Seeded elastic-net evaluation driver: simulate correlated datasets,
    fit each replicate with the chosen method, and print aggregated results.
    """
    # Fixed seed for reproducible simulations.
    seed = 10
    print "seed", seed
    np.random.seed(seed)
    num_threads = 1
    num_runs = 30
    try:
        opts, args = getopt.getopt(argv, "f:z:a:b:c:s:m:r:t:")
    except getopt.GetoptError:
        print "Bad Arguments to python script"
        sys.exit(2)
    settings = Elastic_Net_Settings()
    for opt, arg in opts:
        if opt == '-f':
            settings.num_features = int(arg)
        elif opt == '-z':
            settings.num_nonzero_features = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert(arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
    settings.print_settings()
    sys.stdout.flush()
    # One simulated correlated dataset per replicate.
    data_gen = DataGenerator(settings)
    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.make_correlated(settings.num_features, settings.num_nonzero_features)
        run_data.append(Iteration_Data(i, observed_data, settings))
    # The "SP"/"SP0" methods are always run single-process.
    if settings.method not in ["SP", "SP0"] and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)
    # Aggregate successful fits; a None result marks a crashed replicate.
    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
def main(argv):
    """Grouped matrix-completion evaluation driver: simulate grouped observed
    matrices, fit each replicate with the chosen method, and print results.

    Comma-separated flags: -d rows,cols; -z nonzero row,col groups;
    -f row,col feature counts; -g row,col group counts;
    -a train,validate fractions.
    """
    num_threads = 1
    num_runs = 1
    try:
        opts, args = getopt.getopt(argv, "d:z:f:g:a:v:s:m:t:r:i:")
    except getopt.GetoptError:
        print "Bad argument given"
        sys.exit(2)
    settings = Matrix_Completion_Group_Settings()
    for opt, arg in opts:
        if opt == '-d':
            arg_split = arg.split(",")
            settings.num_rows = int(arg_split[0])
            settings.num_cols = int(arg_split[1])
        elif opt == '-z':
            arg_split = arg.split(",")
            settings.num_nonzero_row_groups = int(arg_split[0])
            settings.num_nonzero_col_groups = int(arg_split[1])
        elif opt == '-f':
            arg_split = arg.split(",")
            settings.num_row_features = int(arg_split[0])
            settings.num_col_features = int(arg_split[1])
        elif opt == '-g':
            arg_split = arg.split(",")
            settings.num_row_groups = int(arg_split[0])
            settings.num_col_groups = int(arg_split[1])
        elif opt == '-a':
            arg_split = arg.split(",")
            settings.train_perc = float(arg_split[0])
            settings.validate_perc = float(arg_split[1])
            assert (settings.train_perc + settings.validate_perc <= 1.0)
        elif opt == "-v":
            settings.num_nonzero_s = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert (arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
        elif opt == "-i":
            settings.gamma_to_row_col_m = float(arg)
    # Rank of the signal cannot exceed either matrix dimension.
    assert (settings.num_nonzero_s <= settings.num_rows
            and settings.num_nonzero_s <= settings.num_cols)
    settings.matrix_size = settings.num_rows * settings.num_cols
    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()
    # One simulated grouped observed matrix per replicate.
    data_gen = DataGenerator(settings)
    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.matrix_completion_groups(
            gamma_to_row_col_m=settings.gamma_to_row_col_m,
            feat_factor=settings.feat_factor)
        run_data.append(Iteration_Data(i, observed_data, settings))
    # The "SP" method is always run single-process.
    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)
    # Aggregate successful fits; a None result marks a crashed replicate.
    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
def main(): SMOOTH_FCNS = [big_sin, identity_fcn, big_cos_sin, crazy_down_sin, pwr_small] smooth_fcn_list = SMOOTH_FCNS[:NUM_FUNCS] hc_results = MethodResults("Hillclimb") hc_nesterov_results = MethodResults("Hillclimb_nesterov") gs_results = MethodResults("Gridsearch") for i in range(0, NUM_RUNS): # Generate dataset X_train, y_train, X_validate, y_validate, X_test, y_test = multi_smooth_features( TRAIN_SIZE, smooth_fcn_list, desired_snr=SNR, feat_range=[f * TRAIN_SIZE/60 for f in FEATURE_RANGE], train_to_validate_ratio=VALIDATE_RATIO, test_size=NUM_TEST ) X_full, train_idx, validate_idx, test_idx = GenAddModelHillclimb.stack((X_train, X_validate, X_test)) def _create_method_result(best_thetas, runtime): test_err = testerror_multi_smooth(y_test, test_idx, best_thetas) validate_err = testerror_multi_smooth(y_validate, validate_idx, best_thetas) print "create_method_result", test_err return MethodResult(test_err=test_err, validation_err=validate_err, runtime=runtime) def _run_hc(results, nesterov): hillclimb_prob = GenAddModelHillclimb(X_train, y_train, X_validate, y_validate, X_test, nesterov=nesterov) thetas, cost_path, runtime = _hillclimb_coarse_grid_search(hillclimb_prob, smooth_fcn_list) results.append(_create_method_result(thetas, runtime)) if PLOT_RUNS: _plot_res( thetas[test_idx], smooth_fcn_list, X_test, y_test, outfile="%s/test_%s_f%d.png" % (FIGURE_DIR, hillclimb_prob.method_label, NUM_FUNCS), ) _plot_res( thetas[validate_idx], smooth_fcn_list, X_validate, y_validate, outfile="%s/validation_%s_f%d.png" % (FIGURE_DIR, hillclimb_prob.method_label, NUM_FUNCS), ) _plot_res( thetas[train_idx], smooth_fcn_list, X_train, y_train, outfile="%s/train_%s_f%d.png" % (FIGURE_DIR, hillclimb_prob.method_label, NUM_FUNCS), ) return thetas, cost_path hc_thetas, hc_cost_path = _run_hc(hc_results, nesterov=False) # hc_nesterov_thetas, hc_nesterov_cost_path = _run_hc(hc_nesterov_results, nesterov=True) if PLOT_RUNS: _plot_cost_paths( cost_path_list=[hc_cost_path, 
hc_nesterov_cost_path], labels=["HC", "HC_Nesterov"], num_funcs=NUM_FUNCS, ) print "==================================================" start_time = time.time() gs_thetas, best_lambdas = gs.run( y_train, y_validate, X_full, train_idx, validate_idx, num_lambdas=NUM_GS_LAMBDAS, max_lambda=MAX_LAMBDA ) gs_runtime = time.time() - start_time gs_results.append(_create_method_result(gs_thetas, gs_runtime)) if PLOT_RUNS: _plot_res( gs_thetas[test_idx], smooth_fcn_list, X_test, y_test, outfile="%s/test_gs_f%d.png" % (FIGURE_DIR, NUM_FUNCS), ) _plot_gs_v_hc( gs_thetas[train_idx], hc_thetas[train_idx], smooth_fcn_list, X_train, y_train, outfile_prefix="%s/train_gs_v_hc_f%d" % (FIGURE_DIR, NUM_FUNCS), ) print "===========RUN %d ============" % i hc_results.print_results() hc_nesterov_results.print_results() gs_results.print_results()
def main(argv): num_threads = 1 num_runs = 1 try: opts, args = getopt.getopt(argv, "f:z:a:b:c:s:m:t:r:") except getopt.GetoptError: sys.exit(2) settings = Sparse_Add_Models_Settings() for opt, arg in opts: if opt == '-f': settings.num_funcs = int(arg) elif opt == '-z': settings.num_zero_funcs = int(arg) elif opt == '-a': settings.train_size = int(arg) elif opt == '-b': settings.validate_size = int(arg) elif opt == '-c': settings.test_size = int(arg) elif opt == "-s": settings.snr = float(arg) elif opt == "-m": assert (arg in METHODS) settings.method = arg elif opt == "-t": num_threads = int(arg) elif opt == "-r": num_runs = int(arg) print "TOTAL NUM RUNS %d" % num_runs settings.print_settings() sys.stdout.flush() assert (settings.num_funcs <= len(settings.smooth_fcns)) smooth_fcn_list = settings.smooth_fcns[:settings.num_funcs] + [ const_zero ] * settings.num_zero_funcs data_gen = DataGenerator(settings) run_data = [] for i in range(num_runs): observed_data = data_gen.make_additive_smooth_data(smooth_fcn_list) run_data.append(Iteration_Data(i, observed_data, settings)) if settings.method != "SP" and num_threads > 1: print "Do multiprocessing" pool = Pool(num_threads) results = pool.map(fit_data_for_iter_safe, run_data) else: print "Avoiding multiprocessing" results = map(fit_data_for_iter_safe, run_data) method_results = MethodResults(settings.method, settings.method_result_keys) num_crashes = 0 for r in results: if r is not None: method_results.append(r) else: num_crashes += 1 print "==========TOTAL RUNS %d============" % method_results.get_num_runs() method_results.print_results() print "num crashes %d" % num_crashes
def main():
    """Compare hillclimbing, grouped grid search, and lasso grid search on the
    gene-expression data over NUM_ITERS shuffled train/validate/test splits,
    printing per-method sparsity, validation, and test metrics each iteration.
    """
    # Random seed drawn fresh each invocation, printed for reproducibility.
    seed = int(np.random.rand() * 1e15)
    print "seed", seed
    np.random.seed(seed)
    # Load the gene-expression data (one matrix per geneset), then normalize.
    geneset_dict = read_geneset_file()
    X_genesets, y, genesets = read_gene_expr_data(geneset_dict)
    print "num features", sum([X_genesets[i].shape[1] for i in range(0, len(X_genesets))])
    print "total genesets ever", len(X_genesets)
    X_genesets = normalize_data(X_genesets)
    hc_results = MethodResults("HC")
    gs_grouped_results = MethodResults("GS_Grouped")
    gs_results = MethodResults("GS_Lasso")
    for i in range(0, NUM_ITERS):
        # Fresh shuffled split each iteration; validate/test blocks are
        # stacked into dense matrices for the error functions.
        X_groups_train, y_train, X_groups_validate, y_validate, X_groups_test, y_test = shuffle_and_split_data(
            X_genesets, y, TRAIN_SIZE, VALIDATE_SIZE)
        X_validate = np.hstack(X_groups_validate)
        X_test = np.hstack(X_groups_test)
        # --- Hillclimbing fit (timed) ---
        start = time.time()
        hc_betas, hc_cost_path = hc.run_for_lambdas(X_groups_train, y_train, X_groups_validate, y_validate, init_lambdas=INIT_LAMBDAS)
        hc_runtime = time.time() - start
        # Report support size at several zero thresholds.
        print "hc 1e-6", get_num_nonzero_betas(hc_betas, genesets, threshold=1e-6)
        print "hc 1e-8", get_num_nonzero_betas(hc_betas, genesets, threshold=1e-8)
        print "hc 1e-10", get_num_nonzero_betas(hc_betas, genesets, threshold=1e-10)
        hc_validate_cost, hc_validate_rate = testerror_logistic_grouped(X_validate, y_validate, hc_betas)
        print "hc_validate_cost", hc_validate_cost
        # --- Grouped grid-search fit (timed) ---
        start = time.time()
        gs_grouped_betas, gs_grouped_cost = gs_grouped.run_classify(X_groups_train, y_train, X_groups_validate, y_validate)
        gs_grouped_runtime = time.time() - start
        print "gs_grouped 1e-6", get_num_nonzero_betas(gs_grouped_betas, genesets, threshold=1e-6)
        print "gs_grouped 1e-8", get_num_nonzero_betas(gs_grouped_betas, genesets, threshold=1e-8)
        print "gs_grouped 1e-10", get_num_nonzero_betas(gs_grouped_betas, genesets, threshold=1e-10)
        gs_grouped_validate_cost, gs_grouped_validate_rate = testerror_logistic_grouped(X_validate, y_validate, gs_grouped_betas)
        print "gs_grouped_validate_cost", gs_grouped_validate_cost
        # --- Lasso grid-search fit (timed) ---
        start = time.time()
        gs_betas, gs_cost = gs.run_classify(X_groups_train, y_train,
            X_groups_validate, y_validate)
        gs_runtime = time.time() - start
        print "gs 1e-6", get_num_nonzero_betas(gs_betas, genesets, threshold=1e-6)
        print "gs 1e-8", get_num_nonzero_betas(gs_betas, genesets, threshold=1e-8)
        print "gs 1e-10", get_num_nonzero_betas(gs_betas, genesets, threshold=1e-10)
        gs_validate_cost, gs_validate_rate = testerror_logistic_grouped(X_validate, y_validate, gs_betas)
        print "gs_validate_cost", gs_validate_cost
        # --- Test-set scoring and aggregation per method ---
        print "================= hc ======================"
        hc_test, hc_rate = testerror_logistic_grouped(X_test, y_test, hc_betas)
        print "hc_test", hc_test, "hc_rate", hc_rate
        hc_results.append(MethodResult(test_err=hc_test, validation_err=hc_validate_cost, sensitivity=hc_rate, runtime=hc_runtime))
        print "================= gs grouped ======================"
        gs_grouped_test, gs_grouped_rate = testerror_logistic_grouped(X_test, y_test, gs_grouped_betas)
        print "gs_grouped_test", gs_grouped_test, "gs_grouped_rate", gs_grouped_rate
        gs_grouped_results.append(MethodResult(test_err=gs_grouped_test, validation_err=gs_grouped_validate_cost, sensitivity=gs_grouped_rate, runtime=gs_grouped_runtime))
        print "================= gs ======================"
        gs_test, gs_rate = testerror_logistic_grouped(X_test, y_test, gs_betas)
        print "gs_test", gs_test, "gs_rate", gs_rate
        gs_results.append(MethodResult(test_err=gs_test, validation_err=gs_validate_cost, sensitivity=gs_rate, runtime=gs_runtime))
        print "ITERATION", i
        hc_results.print_results()
        gs_grouped_results.print_results()
        gs_results.print_results()
        if i == 0:
            # Persist the first iteration's split and fitted betas for later inspection.
            pickle_data(PICKLE_DATA_FILENAME, X_groups_train, y_train, X_groups_validate, y_validate, X_groups_test, y_test, genesets)
            pickle_betas(PICKLE_BETAS_FILENAME, hc_betas, gs_grouped_betas, gs_betas)
def main(argv): seed = 10 print "seed", seed np.random.seed(seed) num_threads = 1 num_runs = 1 try: opts, args = getopt.getopt(argv,"g:f:a:b:c:s:m:t:r:") except getopt.GetoptError: print "Bad argument given to sgl_eval.py" sys.exit(2) settings = SGL_Settings() for opt, arg in opts: if opt == '-g': settings.expert_num_groups = int(arg) elif opt == '-f': settings.num_features = int(arg) elif opt == '-a': settings.train_size = int(arg) elif opt == '-b': settings.validate_size = int(arg) elif opt == '-c': settings.test_size = int(arg) elif opt == "-s": settings.snr = float(arg) elif opt == "-m": assert(arg in METHODS) settings.method = arg elif opt == "-t": num_threads = int(arg) elif opt == "-r": num_runs = int(arg) print "TOTAL NUM RUNS %d" % num_runs settings.print_settings() sys.stdout.flush() data_gen = DataGenerator(settings) run_data = [] for i in range(num_runs): observed_data = data_gen.sparse_groups() run_data.append(Iteration_Data(i, observed_data, settings)) if settings.method != "SP" and num_threads > 1: print "Do multiprocessing" pool = Pool(num_threads) results = pool.map(fit_data_for_iter_safe, run_data) else: print "Avoiding multiprocessing" results = map(fit_data_for_iter_safe, run_data) method_results = MethodResults(settings.method) num_crashes = 0 for r in results: if r is not None: method_results.append(r) else: num_crashes += 1 print "==========TOTAL RUNS %d============" % method_results.get_num_runs() method_results.print_results() print "num crashes %d" % num_crashes
for init_lambda1 in COARSE_LAMBDA_GRID: kwargs["initial_lambda1"] = init_lambda1 kwargs["initial_lambda2"] = init_lambda1 beta_guess, cost_path = optimization_func(*args, **kwargs) validation_cost = testerror(X_validate, y_validate, beta_guess) if best_cost > validation_cost: best_start_lambdas = [kwargs["initial_lambda1"], kwargs["initial_lambda2"]] best_cost = validation_cost best_beta = beta_guess best_cost_path = cost_path end_time = time.time() print "HC: BEST best_cost", best_cost, "best_start_lambdas", best_start_lambdas return beta_guess, cost_path, end_time - start_time hc_results = MethodResults(HC_LAMBDA12_LABEL) hc_results1 = MethodResults(HC_LAMBDA12_LABEL + "_SHRINK") hc_dim_results = MethodResults(HC_LAMBDA12_DIM_LABEL) hc_nesterov_results = MethodResults(HC_LAMBDA12_NESTEROV_LABEL) hc_lambda_alpha_results = MethodResults(HC_LAMBDA_ALPHA_LABEL) hc_lambda_alpha_results1 = MethodResults(HC_LAMBDA_ALPHA_LABEL + "_SHRINK") hc_lambda_alpha_dim_results = MethodResults(HC_LAMBDA_ALPHA_DIM_LABEL) hc_lambda_alpha_nesterov_results = MethodResults(HC_LAMBDA_ALPHA_NESTEROV_LABEL) nm_results = MethodResults("NELDER-MEAD") bs_results = MethodResults("BAYES_SPEARMINT") gs_results = MethodResults(GS_LAMBDA12_LABEL) for i in range(0, NUM_RUNS): beta_real, X_train, y_train, X_validate, y_validate, X_test, y_test = data_generation.correlated( TRAIN_SIZE, NUM_FEATURES, NUM_NONZERO_FEATURES, signal_noise_ratio=SIGNAL_NOISE_RATIO) def _create_method_result(beta_guess, runtime):
def main(argv):
    """Seeded sparse-additive-models evaluation driver with an optional
    big-initialization flag (-i); simulates one dataset per replicate, fits
    each with the chosen method, and prints aggregated results.
    """
    # Fixed seed for reproducible simulations.
    seed = 10
    print "seed", seed
    np.random.seed(seed)
    num_threads = 1
    num_runs = 1
    try:
        opts, args = getopt.getopt(argv, "f:z:a:b:c:s:m:t:r:i")
    except getopt.GetoptError:
        sys.exit(2)
    settings = Sparse_Add_Models_Settings()
    for opt, arg in opts:
        if opt == '-f':
            settings.num_funcs = int(arg)
        elif opt == '-z':
            settings.num_zero_funcs = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert(arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
        elif opt == "-i":
            settings.big_init_set = True
    # SP does not care about initialization
    assert(not (settings.big_init_set == True and settings.method in ["SP", "SP0"]))
    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()
    # Take the requested smooth functions and pad with zero functions.
    assert(settings.num_funcs <= len(settings.smooth_fcns))
    smooth_fcn_list = settings.smooth_fcns[:settings.num_funcs] + [const_zero] * settings.num_zero_funcs
    data_gen = DataGenerator(settings)
    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.make_additive_smooth_data(smooth_fcn_list)
        run_data.append(Iteration_Data(i, observed_data, settings))
    # The "SP" method is always run single-process.
    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)
    # Aggregate successful fits; a None result marks a crashed replicate.
    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
def main(argv):
    """Grouped-lasso experiment driver comparing hillclimbing (optionally
    pooled), Nelder-Mead, and grid search over NUM_RUNS simulated datasets.

    Flags: -d selects a preset problem size (0-3), -p runs the pooled
    hillclimb variant.
    """
    try:
        opts, args = getopt.getopt(argv, "d:p")
    except getopt.GetoptError:
        print "BAD REQUEST"
        print "accepts a folder name. reads the XML files inside"
        sys.exit(2)
    RUN_HC_POOLED = False
    for opt, arg in opts:
        if opt == '-d':
            # Preset problem sizes; NOTE(review): if -d is omitted or given an
            # unknown value, TRAIN_SIZE/TOTAL_FEATURES/NUM_GROUPS stay unbound
            # and the code below raises — confirm -d is effectively required.
            data_type = int(arg)
            if data_type == 0:
                TRAIN_SIZE = 10
                TOTAL_FEATURES = 30
                NUM_GROUPS = 3
            elif data_type == 1:
                TRAIN_SIZE = 60
                TOTAL_FEATURES = 300
                NUM_GROUPS = 30
            elif data_type == 2:
                TRAIN_SIZE = 90
                TOTAL_FEATURES = 900
                NUM_GROUPS = 60
            elif data_type == 3:
                TRAIN_SIZE = 90
                TOTAL_FEATURES = 1200
                NUM_GROUPS = 100
        elif opt == '-p':
            RUN_HC_POOLED = True
    # True group sizes for the generator vs. the (finer) expert grouping used
    # by the fitters. TRUE_NUM_GROUPS is presumably a module-level constant —
    # it is not defined in this function.
    TRUE_GROUP_FEATURE_SIZES = [TOTAL_FEATURES / TRUE_NUM_GROUPS] * TRUE_NUM_GROUPS
    EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES = [TOTAL_FEATURES / NUM_GROUPS] * NUM_GROUPS
    COARSE_LAMBDA1S = [1, 1e-1]
    if RUN_HC_POOLED:
        print "RUN POOLED FOR GS and HC"
    else:
        print "UNPOOLED VS. POOLED"
    # The random draw is immediately overridden by the fixed seed 10.
    seed = np.random.randint(0, 1e5)
    seed = 10
    np.random.seed(seed)
    print "RANDOM SEED", seed
    print "TRAIN_SIZE", TRAIN_SIZE
    print "TOTAL_FEATURES", TOTAL_FEATURES
    print "NUM_GROUPS", NUM_GROUPS
    print "COARSE_LAMBDA1S", COARSE_LAMBDA1S

    def _hillclimb_coarse_grid_search(optimization_func, *args, **kwargs):
        # Try each coarse starting lambda and keep the run with the lowest
        # final cost; returns (best_beta, best_cost_path, runtime).
        start_time = time.time()
        best_cost = 1e10
        best_beta = []
        best_cost_path = []
        best_lambda = 0
        for init_lambda in COARSE_LAMBDA1S:
            kwargs["initial_lambda1"] = init_lambda
            beta_guess, cost_path = optimization_func(*args, **kwargs)
            if best_cost > cost_path[-1]:
                best_cost = cost_path[-1]
                best_cost_path = cost_path
                best_beta = beta_guess
                best_lambda = init_lambda
                print "init_lambda better!", init_lambda
            print "HC: best_cost", best_cost
            sys.stdout.flush()
        print "HC_FINAL: best_cost", best_cost, "best_lambda", best_lambda
        end_time = time.time()
        print "runtime", end_time - start_time
        return best_beta, best_cost_path, end_time - start_time

    hc_results = MethodResults(HC_GROUPED_LASSO_LABEL)
    hc_nesterov_results = MethodResults("NESTEROV")
    hc_pooled_results = MethodResults(HC_GROUPED_LASSO_LABEL + "_POOLED")
    hc_pooled_nesterov_results = MethodResults("NESTEROV_POOLED")
    nm_results = MethodResults("NELDER_MEAD")
    gs_results = MethodResults(GS_GROUPED_LASSO_LABEL)
    for i in range(0, NUM_RUNS):
        # Fresh simulated grouped-sparse dataset per run.
        beta_reals, X_train, y_train, X_validate, y_validate, X_test, y_test = sparse_groups(TRAIN_SIZE, TRUE_GROUP_FEATURE_SIZES)

        def _create_method_result(beta_guesses, runtime):
            # Score a fitted beta: test/validation error, beta error, and the
            # percentage of guessed nonzeros that are truly nonzero.
            test_err = testerror_grouped(X_test, y_test, beta_guesses)
            validation_err = testerror_grouped(X_validate, y_validate, beta_guesses)
            beta_guesses_all = np.concatenate(beta_guesses)
            beta_reals_all = np.concatenate(beta_reals)
            beta_err = betaerror(beta_reals_all, beta_guesses_all)
            guessed_nonzero_elems = np.where(get_nonzero_indices(beta_guesses_all, threshold=ZERO_THRESHOLD))
            true_nonzero_elems = np.where(get_nonzero_indices(beta_reals_all, threshold=ZERO_THRESHOLD))
            intersection = np.intersect1d(np.array(guessed_nonzero_elems), np.array(true_nonzero_elems))
            sensitivity = intersection.size / float(guessed_nonzero_elems[0].size) * 100
            print "test_err", test_err, "beta_err", beta_err, "sensitivity", sensitivity
            sys.stdout.flush()
            return MethodResult(test_err=test_err, validation_err=validation_err, beta_err=beta_err, sensitivity=sensitivity, runtime=runtime)

        if RUN_HC_POOLED:
            hc_pooled_beta_guesses, hc_pooled_costpath, runtime = _hillclimb_coarse_grid_search(hc_pooled.run, X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
            hc_pooled_results.append(_create_method_result(hc_pooled_beta_guesses, runtime))
            # hc_pooled_nesterov_beta_guesses, hc_pooled_nesterov_costpath, runtime = _hillclimb_coarse_grid_search(hc_pooled.run_nesterov, X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
            # hc_pooled_nesterov_results.append(_create_method_result(hc_pooled_nesterov_beta_guesses, runtime))
        else:
            hc_beta_guesses, hc_costpath, runtime = _hillclimb_coarse_grid_search(hc.run, X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
            hc_results.append(_create_method_result(hc_beta_guesses, runtime))
            # hc_nesterov_beta_guesses, hc_nesterov_costpath, runtime = _hillclimb_coarse_grid_search(hc.run_nesterov, X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
            # hc_nesterov_results.append(_create_method_result(hc_nesterov_beta_guesses, runtime))
        # Nelder-Mead baseline.
        nm_beta_guesses, runtime = nm.run(X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
        nm_results.append(_create_method_result(nm_beta_guesses, runtime))
        # Grid-search baseline (timed here, unlike the helpers above).
        start = time.time()
        gs_beta_guesses, gs_lowest_cost = gridsearch_grouped_lasso.run(X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
        runtime = time.time() - start
        gs_results.append(_create_method_result(gs_beta_guesses, runtime))
        print "NUM RUN", i
        print "FEATURE GROUPS", TRUE_GROUP_FEATURE_SIZES
        print "NUM_GROUPS", NUM_GROUPS
        print "TRAIN SIZE", TRAIN_SIZE
        if RUN_HC_POOLED:
            hc_pooled_results.print_results()
            nm_results.print_results()
            hc_pooled_nesterov_results.print_results()
        else:
            hc_results.print_results()
            nm_results.print_results()
            hc_nesterov_results.print_results()
        gs_results.print_results()
        if GENERATE_PLOT and i == 0:
            # NOTE(review): both plot branches reference the *_nesterov_costpath
            # variables that are only assigned by the commented-out runs above,
            # so this block raises a NameError as written — confirm whether the
            # nesterov runs should be re-enabled before plotting.
            plt.clf()
            if RUN_HC_POOLED:
                plt.plot(hc_pooled_costpath, label="Gradient Descent", color="red")
                plt.plot(hc_pooled_nesterov_costpath, label="Nesterov's Gradient Descent", color="blue")
                # Integer ticks only
                plt.xticks(np.arange(0, max(len(hc_pooled_costpath), len(hc_pooled_nesterov_costpath)), 1.0))
            else:
                plt.plot(hc_costpath, label=HC_GROUPED_LASSO_LABEL, color=HC_GROUPED_LASSO_COLOR)
                plt.plot(hc_nesterov_costpath, label="Nesterov", color="purple")
                # Integer ticks only
                plt.xticks(np.arange(0, max(len(hc_costpath), len(hc_nesterov_costpath)), 1.0))
            plt.axhline(gs_lowest_cost, label="Grid Search", color=GS_COLOR)
            plt.legend(fontsize="x-small")
            # plt.title("Train=%d p=%d, g=%d, m=%d" % (TRAIN_SIZE, TOTAL_FEATURES, TRUE_NUM_GROUPS, NUM_GROUPS))
            plt.xlabel("Number of iterations")
            plt.ylabel("Validation test error")
            plt.savefig("figures/grouped_lasso_%d_%d_%d_%d.png" % (TRAIN_SIZE, TOTAL_FEATURES, TRUE_NUM_GROUPS, NUM_GROUPS))
    sys.stdout.flush()