import logging
import multiprocessing as mp
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler

# LogisticRegression here is the randomized-optimization variant that takes an
# `algorithm` kwarg and exposes fitness_curve / fitted_weights, as in
# mlrose_hiive; adjust the import if a different fork is used.
from mlrose_hiive import LogisticRegression

idx = pd.IndexSlice

# STEM, log_filename, load_data, get_param_grids, get_problems, get_optimizers,
# get_runs_per_problem, get_func_kwargs_per_problem, kwarg_str, file_join and
# run_opt are defined elsewhere in this project.
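# Hypothetical sketches of the small string helpers assumed above; the real
# definitions live elsewhere. They only need to produce stable,
# filesystem-safe names for log files and plot files.


def _example_log_filename(problems, stem):
    # e.g. _example_log_filename(['four_peaks'], 'a2') -> 'a2_four_peaks'
    return '_'.join([stem] + [str(p) for p in problems])


def _example_kwarg_str(kwargs):
    # Deterministic rendering of a kwargs dict, e.g. {'pop_size': 200} -> 'pop_size-200'
    return '_'.join(f'{k}-{v}' for k, v in sorted(kwargs.items()))


def _example_file_join(parts):
    return '_'.join(str(p) for p in parts)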
def run_experiments(problems):
    lfname = log_filename(problems, STEM)
    logging.basicConfig(filename='./nn_logs/' + lfname + '.txt',
                        level=logging.DEBUG)
    algo_runs = {
        'gradient_descent': 3,
        'random_hill_climb': 3,
        'simulated_annealing': 3,
        'genetic_alg': 3,
    }
    param_grids = get_param_grids()
    results = {}
    for problem in problems:
        results[problem] = {}
        X_train, y_train = load_data()

        # Scatter plot of the two deterministic features, one marker per class.
        markers = 'xo'
        colors = ['purple', 'yellow']
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        for k, label in enumerate(np.unique(y_train)):
            plot_mask = (y_train == label).astype(bool)
            ax.scatter(X_train[plot_mask, 0],
                       X_train[plot_mask, 1],
                       marker=markers[k],
                       c=colors[k],
                       edgecolor='k',
                       alpha=0.6)
        plt.xlabel('x1')
        plt.ylabel('x2')
        plt.suptitle('The Two Deterministic Features of the Synthetic Data')
        plt.savefig('./nn_plots/' + STEM + '_data.png')
        plt.close(fig)

        ss = StandardScaler()
        X_train_scaled = ss.fit_transform(X_train)

        for algo in algo_runs:
            results[problem][algo] = {}
            pg = ParameterGrid(param_grids[algo])
            for i, kwargs in enumerate(pg):
                logging.info('problem %s algo %s i %d %s',
                             problem, algo, i, kwargs)
                results[problem][algo][i] = {}
                print(algo)
                for run in range(algo_runs[algo]):
                    results[problem][algo][i][run] = {}
                    mdl = LogisticRegression(
                        algorithm=algo,
                        bias=True,
                        early_stopping=True,
                        max_attempts=500,
                        random_state=run,  # vary the seed so repeated runs differ
                        curve=True,
                        **kwargs)
                    start_time = time.time()
                    logging.info('start time %s', start_time)
                    mdl.fit(X_train_scaled, y_train)
                    duration = time.time() - start_time
                    logging.info('duration %s', duration)

                    # Training metrics only; no held-out test split is used here.
                    y_pred_train = mdl.predict(X_train_scaled)
                    train_acc = balanced_accuracy_score(y_train, y_pred_train)
                    train_f1 = f1_score(y_train, y_pred_train)
                    res = results[problem][algo][i][run]
                    res['fit_time'] = duration
                    res['curve'] = mdl.fitness_curve
                    res['weights'] = mdl.fitted_weights
                    res['balanced_accuracy'] = train_acc
                    res['F1_score'] = train_f1
                    logging.info('train ba %s', train_acc)

    with open('./nn_output/' + STEM + '_nn_results_dict.pkl', 'wb') as f:
        pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)
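# Hypothetical sketches of the data/grid helpers the driver above relies on;
# the real implementations live elsewhere. load_data() must return a training
# (features, labels) pair, and get_param_grids() must map each algorithm name
# in algo_runs to a grid that sklearn's ParameterGrid accepts, using only
# valid LogisticRegression keyword arguments.


def _example_load_data():
    # Minimal synthetic stand-in: two deterministic features, binary labels.
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 2))
    y = (X[:, 0] + X[:, 1] > 0).astype(int)
    return X, y


def _example_param_grids():
    return {
        'gradient_descent': {'learning_rate': [0.0001, 0.001]},
        'random_hill_climb': {'learning_rate': [0.1], 'restarts': [0, 5]},
        'simulated_annealing': {'learning_rate': [0.1]},
        'genetic_alg': {'pop_size': [200], 'mutation_prob': [0.1]},
    }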
def run_experiments(problems):
    lfname = log_filename(problems, STEM)
    logging.basicConfig(filename='./logs/' + lfname + '.txt',
                        level=logging.DEBUG)
    problems_dict = get_problems()
    optimizers = get_optimizers()
    runs_per_problem = get_runs_per_problem()
    func_kwargs_per_problem = get_func_kwargs_per_problem()

    best_state_collection = {}
    best_fitness_collection = {}
    curves = {}
    eval_curves = {}
    times = {}
    iters = {}
    for problem in problems:
        print('problem', problem)
        best_state_collection[problem] = {}
        best_fitness_collection[problem] = {}
        curves[problem] = {}
        eval_curves[problem] = {}
        times[problem] = {}
        iters[problem] = {}
        prob_dict = problems_dict[problem]
        for exp_id, experiment in enumerate(prob_dict['experiments']):
            print('exp_id', exp_id)
            kwargs = experiment['kwargs']
            problem_kwargs = kwarg_str(kwargs)
            logging.info('problem %s experiment id %d for %s',
                         problem, exp_id, problem_kwargs)
            best_state_collection[problem][exp_id] = {}
            best_fitness_collection[problem][exp_id] = {}
            curves[problem][exp_id] = {}
            eval_curves[problem][exp_id] = {}
            times[problem][exp_id] = {}
            iters[problem][exp_id] = {}
            fitness = prob_dict['class']
            opt_prob = prob_dict['opt_prob']
            for opt_name, optimizer in optimizers.items():
                best_state_collection[problem][exp_id][opt_name] = {}
                best_fitness_collection[problem][exp_id][opt_name] = {}
                curves[problem][exp_id][opt_name] = {}
                eval_curves[problem][exp_id][opt_name] = {}
                times[problem][exp_id][opt_name] = {}
                iters[problem][exp_id][opt_name] = {}
                func = optimizer['function']
                # Each problem pins a single kwargs setting per optimizer.
                f_kwargs_id = func_kwargs_per_problem[problem][opt_name]
                func_kwargs = optimizer['kwargs'][f_kwargs_id]
                print('fitness', fitness.__name__)
                print('kwargs', kwargs)
                print('func', func)
                print('func_kwargs', func_kwargs)
                func_kwargs_str = kwarg_str(func_kwargs)
                logging.info('problem %s experiment id %d fkwargs_id %d is %s',
                             problem, exp_id, f_kwargs_id, func_kwargs_str)
                fit_func = fitness(**kwargs)
                problem_fit = opt_prob(experiment['length'], fit_func)
                print('problem_fit', problem_fit)
                best_state_collection[problem][exp_id][opt_name][f_kwargs_id] = {}
                best_fitness_collection[problem][exp_id][opt_name][f_kwargs_id] = {}
                curves[problem][exp_id][opt_name][f_kwargs_id] = {}
                eval_curves[problem][exp_id][opt_name][f_kwargs_id] = {}
                times[problem][exp_id][opt_name][f_kwargs_id] = {}
                iters[problem][exp_id][opt_name][f_kwargs_id] = {}

                # One parameter dict per run; the runs execute in parallel.
                param_list = []
                for run in range(runs_per_problem[problem]):
                    param_list.append({
                        'problem_fit': problem_fit,
                        'max_attempts': 200,
                        'run': run,
                        'func_kwargs': func_kwargs,
                        'func': func,
                    })
                logging.info('time %s', time.time())
                with mp.Pool() as pool:
                    outcome = pool.map(run_opt, param_list)
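                # run_opt (defined elsewhere) is the parallel worker. The
                # unpacking below assumes each returned dict carries
                # 'best_state', 'best_fitness', 'curve', 'eval_curve' and
                # 'duration'; a hypothetical sketch appears after this function.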
                for run in range(runs_per_problem[problem]):
                    best_state_collection[problem][exp_id][opt_name][
                        f_kwargs_id][run] = outcome[run]['best_state']
                    best_fitness_collection[problem][exp_id][opt_name][
                        f_kwargs_id][run] = outcome[run]['best_fitness']
                    curves[problem][exp_id][opt_name][f_kwargs_id][
                        run] = outcome[run]['curve']
                    eval_curves[problem][exp_id][opt_name][f_kwargs_id][
                        run] = outcome[run]['eval_curve']
                    times[problem][exp_id][opt_name][f_kwargs_id][
                        run] = outcome[run]['duration']
                    iters[problem][exp_id][opt_name][f_kwargs_id][run] = len(
                        outcome[run]['curve'])

    # Flatten the five-level nested dicts into MultiIndexed DataFrames.
    # best_state_collection is gathered per run but not exported.
    best_fitness_df = pd.DataFrame.from_dict(
        {(i, j, k, l, m): best_fitness_collection[i][j][k][l][m]
         for i in best_fitness_collection
         for j in best_fitness_collection[i]
         for k in best_fitness_collection[i][j]
         for l in best_fitness_collection[i][j][k]
         for m in best_fitness_collection[i][j][k][l]},
        orient='index')
    best_fitness_df.columns = ['fitness']
    best_fitness_df.index = pd.MultiIndex.from_tuples(
        best_fitness_df.index,
        names=('problem', 'experiment', 'algo', 'params', 'run'))

    for problem in problems:
        for exp_id, experiment in enumerate(
                problems_dict[problem]['experiments']):
            fig, ax = plt.subplots(figsize=(6, 4))
            plt.subplots_adjust(bottom=.2)
            summary = best_fitness_df.loc[idx[(problem, exp_id)], :].unstack(
            ).droplevel(1)
            summary.plot(kind='bar',
                         rot=45,
                         ax=ax,
                         legend=False,
                         color='C0',
                         edgecolor='black')
            plt.xlabel('Each Run Grouped by Algorithm')
            plt.ylabel('fitness')
            plt.suptitle('Fitness Result on ' + problem + '\nfor ' +
                         str(runs_per_problem[problem]) +
                         ' runs of each algorithm')
            plt.savefig('./plots/' + STEM + '_end_fitness_by_run_' +
                        file_join([problem, exp_id]) + '.png')
            plt.close(fig)

    curves_df = pd.DataFrame.from_dict(
        {(i, j, k, l, m): curves[i][j][k][l][m]
         for i in curves
         for j in curves[i]
         for k in curves[i][j]
         for l in curves[i][j][k]
         for m in curves[i][j][k][l]},
        orient='index')
    # Runs stop at different iterations; forward-fill so shorter runs hold
    # their final fitness when averaged.
    curves_df = curves_df.ffill(axis=1)
    curves_df.index = pd.MultiIndex.from_tuples(
        curves_df.index,
        names=('problem', 'experiment', 'algo', 'params', 'run'))
    avg_curves = curves_df.groupby(level=list(range(4))).mean()

    for problem in problems:
        for exp_id, experiment in enumerate(
                problems_dict[problem]['experiments']):
            fig, ax = plt.subplots(figsize=(6, 4))
            plt.subplots_adjust(bottom=.15)
            plot_df = avg_curves.loc[idx[problem, exp_id, :, :], :].droplevel(
                level=[0, 1, 3])
            plot_df.T.plot(ax=ax)
            plt.xlabel('iteration')
            plt.ylabel('fitness')
            plt.suptitle('Fitness on ' + problem + ' by Iteration')
            plt.legend(loc='lower right')
            plt.savefig('./plots/' + STEM + '_fitness_curves_' +
                        file_join([problem, exp_id]) + '.png')
            plt.close(fig)
    eval_curves_df = pd.DataFrame.from_dict(
        {(i, j, k, l, m): eval_curves[i][j][k][l][m]
         for i in eval_curves
         for j in eval_curves[i]
         for k in eval_curves[i][j]
         for l in eval_curves[i][j][k]
         for m in eval_curves[i][j][k][l]},
        orient='index')
    eval_curves_df.index = pd.MultiIndex.from_tuples(
        eval_curves_df.index,
        names=('problem', 'experiment', 'algo', 'params', 'run'))
    avg_eval_curves = eval_curves_df.groupby(level=list(range(4))).mean()

    for problem in problems:
        for exp_id, experiment in enumerate(
                problems_dict[problem]['experiments']):
            fig, ax = plt.subplots(figsize=(6, 4))
            plt.subplots_adjust(bottom=.15)
            plot_df = avg_eval_curves.loc[idx[problem, exp_id, :, :], :].droplevel(
                level=[0, 1, 3])
            plot_df.T.plot(ax=ax)
            plt.xlabel('iteration')
            plt.ylabel('evaluations')
            plt.yscale('log')
            plt.suptitle('Evaluations on ' + problem + ' by Iteration')
            plt.legend(loc='lower right')
            plt.savefig('./plots/' + STEM + '_eval_curves_' +
                        file_join([problem, exp_id]) + '.png')
            plt.close(fig)

    # Fitness against cumulative evaluations: one dotted line per run, one
    # color (and legend entry) per optimizer.
    colors = ['C0', 'C1', 'C2', 'C3']
    for problem in problems:
        for exp_id, experiment in enumerate(
                problems_dict[problem]['experiments']):
            fig, ax = plt.subplots(figsize=(6, 4))
            plt.subplots_adjust(bottom=.15)
            for i, (opt_name, optimizer) in enumerate(optimizers.items()):
                color = colors[i]
                f_kwargs_id = func_kwargs_per_problem[problem][opt_name]
                for run in range(runs_per_problem[problem]):
                    fitness = curves_df.loc[idx[problem, exp_id, opt_name,
                                                f_kwargs_id, run], :]
                    evals = eval_curves_df.loc[idx[problem, exp_id, opt_name,
                                                   f_kwargs_id, run], :]
                    ax.plot(evals,
                            fitness,
                            color=color,
                            linestyle=':',
                            label=opt_name if run == 0 else '_nolegend_')
            ax.set_xlabel('evaluations')
            ax.set_ylabel('fitness')
            ax.set_xscale('log')
            ax.set_title('Fitness by Evaluation for ' + problem)
            ax.legend()
            plt.savefig('./plots/' + STEM + '_fitness_by_eval_' +
                        file_join([problem, exp_id]) + '.png')
            plt.close(fig)

    times_df = pd.DataFrame.from_dict(
        {(i, j, k, l, m): times[i][j][k][l][m]
         for i in times
         for j in times[i]
         for k in times[i][j]
         for l in times[i][j][k]
         for m in times[i][j][k][l]},
        orient='index')
    times_df.columns = ['time']
    times_df.index = pd.MultiIndex.from_tuples(
        times_df.index,
        names=('problem', 'experiment', 'algo', 'params', 'run'))

    iters_df = pd.DataFrame.from_dict(
        {(i, j, k, l, m): iters[i][j][k][l][m]
         for i in iters
         for j in iters[i]
         for k in iters[i][j]
         for l in iters[i][j][k]
         for m in iters[i][j][k][l]},
        orient='index')
    iters_df.columns = ['iterations']
    iters_df.index = pd.MultiIndex.from_tuples(
        iters_df.index,
        names=('problem', 'experiment', 'algo', 'params', 'run'))

    combined = pd.merge(left=times_df,
                        right=iters_df,
                        how='left',
                        left_index=True,
                        right_index=True)

    avgtimes = {}
    for problem in problems:
        avgtimes[problem] = {}
        for exp_id, experiment in enumerate(
                problems_dict[problem]['experiments']):
            avgtimes[problem][exp_id] = {}
            fig, ax = plt.subplots(figsize=(6, 4))
            plt.subplots_adjust(bottom=.15)
            plot_df = combined.loc[idx[(problem, exp_id)], :]
            for opt_name, optimizer in optimizers.items():
                df = plot_df.loc[idx[opt_name], :]
                # Seconds per iteration, averaged over every run of this optimizer.
                avgtime = df['time'].sum() / df['iterations'].sum()
                avgtimes[problem][exp_id][opt_name] = avgtime
                ax.scatter(df['iterations'],
                           df['time'],
                           label=f'{opt_name}: {avgtime:.2e} s/iter')
            ax.set_yscale('log')
            plt.xlabel('iterations')
            plt.ylabel('completion time (seconds)')
            plt.suptitle('Time vs Iterations for Each Algorithm on ' + problem)
            plt.legend()
            plt.savefig('./plots/' + STEM + '_time_iterations_' +
                        file_join([problem, exp_id]) + '.png')
            plt.close(fig)

    combined = pd.merge(left=times_df,
                        right=best_fitness_df,
                        how='left',
                        left_index=True,
                        right_index=True)
    for problem in problems:
        for exp_id, experiment in enumerate(
                problems_dict[problem]['experiments']):
            fig, ax = plt.subplots(figsize=(6, 4))
            plt.subplots_adjust(bottom=.15)
            plot_df = combined.loc[idx[(problem, exp_id)], :]
            for opt_name, optimizer in optimizers.items():
                avgtime = avgtimes[problem][exp_id][opt_name]
                df = plot_df.loc[idx[opt_name], :]
                ax.scatter(df['time'],
                           df['fitness'],
                           label=f'{opt_name}: {avgtime:.2e} s/iter')
            ax.set_xscale('log')
            plt.xlabel('time (seconds)')
            plt.ylabel('fitness')
            plt.suptitle('Fitness vs Time for Each Algorithm on ' + problem)
            plt.legend()
            plt.savefig('./plots/' + STEM + '_fitness_time_' +
                        file_join([problem, exp_id]) + '.png')
            plt.close(fig)

    best_fitness_df.to_csv('./results/' + lfname + '_best_fitness.csv')
    curves_df.to_csv('./results/' + lfname + '_curves.csv')
    eval_curves_df.to_csv('./results/' + lfname + '_eval_curves.csv')
    times_df.to_csv('./results/' + lfname + '_times.csv')
    iters_df.to_csv('./results/' + lfname + '_iterations.csv')
    logging.info(str(best_fitness_collection))
    logging.info(str(curves))
    logging.info(str(times))
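# Hypothetical sketch of the run_opt worker consumed by mp.Pool above; the real
# definition lives elsewhere. It assumes the mlrose_hiive convention that, with
# curve=True, each optimizer returns (best_state, best_fitness, curve) where
# the curve's second column holds cumulative function evaluations.


def _example_run_opt(params):
    start_time = time.time()
    best_state, best_fitness, curve = params['func'](
        params['problem_fit'],
        max_attempts=params['max_attempts'],
        curve=True,
        random_state=params['run'],  # per-run seed, so repeated runs differ
        **params['func_kwargs'])
    duration = time.time() - start_time
    return {
        'best_state': best_state,
        'best_fitness': best_fitness,
        'curve': curve[:, 0],       # fitness at each iteration
        'eval_curve': curve[:, 1],  # cumulative evaluations at each iteration
        'duration': duration,
    }


if __name__ == '__main__':
    # Example invocation; 'four_peaks' is a placeholder that must match a key
    # returned by get_problems().
    run_experiments(['four_peaks'])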