def perform(self):
    # Policy iteration: grid-search the discount factor and record solver
    # convergence plus the performance of each resulting optimal policy.
    self._details.env.reset()
    map_desc = self._details.env.unwrapped.desc

    grid_file_name = '{}/PI/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    with open(grid_file_name, 'w') as f:
        f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std,steps_new\n")

    discount_factors = np.round(np.linspace(0, 0.9, num=10), 2)
    dims = len(discount_factors)
    self.log("Searching PI in {} dimensions".format(dims))

    runs = 1
    for discount_factor in discount_factors:
        # Monotonic timer (time.clock() was removed in Python 3.8).
        t = time.perf_counter()
        self.log("{}/{} Processing PI with discount factor {}".format(runs, dims, discount_factor))

        p = solvers.PolicyIterationSolver(self._details.env, discount_factor=discount_factor,
                                          max_policy_eval_steps=3000, verbose=self._verbose)

        stats = self.run_solver_and_collect(p, self.convergence_check_fn)
        self.log("Took {} steps".format(len(stats.steps)))
        stats.to_csv('{}/PI/{}_{}.csv'.format(OUTPUT_DIRECTORY, self._details.env_name, discount_factor))
        stats.pickle_results('{}/PI/pkl/{}_{}_{}.pkl'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                             discount_factor, '{}'),
                             map_desc.shape)
        stats.plot_policies_on_map('{}/images/PI/{}_{}_{}.png'.format(OUTPUT_DIRECTORY,
                                                                      self._details.env_name,
                                                                      discount_factor, '{}_{}'),
                                   map_desc, self._details.env.colors(), self._details.env.directions(),
                                   'Policy Iteration', 'Step', self._details, only_last=True)

        optimal_policy_stats = self.run_policy_and_collect(p, stats.optimal_policy)
        self.log('{}'.format(optimal_policy_stats))
        optimal_policy_stats.to_csv('{}/PI/{}_{}_optimal.csv'.format(OUTPUT_DIRECTORY,
                                                                     self._details.env_name,
                                                                     discount_factor))

        with open(grid_file_name, 'a') as f:
            # In this variant both the "steps" and "steps_new" columns record
            # the number of iterations the solver took to converge.
            f.write('"{}",{},{},{},{},{},{},{},{}\n'.format(
                json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                time.perf_counter() - t,
                len(stats.converged_values),
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
                len(stats.converged_values)
            ))
        runs += 1
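# For context, the following is a minimal, self-contained sketch of the policy
# iteration loop that a solver like solvers.PolicyIterationSolver presumably
# runs under the hood. The tabular MDP interface (a transition tensor P and a
# reward matrix R) and the parameter names here are illustrative assumptions,
# not the actual solver API.
import numpy as np

def policy_iteration_sketch(P, R, discount_factor=0.9, max_eval_steps=3000, theta=1e-8):
    """P: (S, A, S) transition probabilities; R: (S, A) expected rewards."""
    n_states, n_actions, _ = P.shape
    policy = np.zeros(n_states, dtype=int)
    V = np.zeros(n_states)
    while True:
        # Policy evaluation: apply the Bellman expectation backup until the
        # value function stops changing, or the step budget runs out.
        for _ in range(max_eval_steps):
            P_pi = P[np.arange(n_states), policy]  # (S, S) dynamics under the current policy
            V_new = R[np.arange(n_states), policy] + discount_factor * P_pi @ V
            converged = np.max(np.abs(V_new - V)) < theta
            V = V_new
            if converged:
                break
        # Policy improvement: act greedily with respect to the current values.
        Q = R + discount_factor * np.einsum('sat,t->sa', P, V)
        new_policy = Q.argmax(axis=1)
        if np.array_equal(new_policy, policy):  # policy stable => optimal
            return policy, V
        policy = new_policy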
def perform(self):
    # Policy iteration: same sweep as above, but with the discount range,
    # evaluation budget, and trial count taken from experiment parameters.
    self._details.env.reset()
    map_desc = self._details.env.unwrapped.desc

    grid_file_name = os.path.join(PI_DIR, '{}_grid.csv'.format(self._details.env_name))
    with open(grid_file_name, 'w') as f:
        f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

    discount_factors = np.round(np.linspace(self._discount_min,
                                            max(self._discount_min, self._discount_max),
                                            num=self._num_discounts), 2)
    dims = len(discount_factors)
    self.log("Searching PI in {} dimensions".format(dims))

    runs = 1
    for discount_factor in discount_factors:
        t = time.perf_counter()
        self.log("{}/{} Processing PI with discount factor {}".format(runs, dims, discount_factor))

        p = solvers.PolicyIterationSolver(self._details.env, discount_factor=discount_factor,
                                          max_policy_eval_steps=self._max_steps,
                                          theta=self._theta, verbose=self._verbose)

        stats = self.run_solver_and_collect(p, self.convergence_check_fn)
        self.log("Took {} steps".format(len(stats.steps)))
        stats.to_csv(os.path.join(PI_DIR, '{}_{}.csv'.format(self._details.env_name, discount_factor)))
        stats.pickle_results(
            os.path.join(PKL_DIR, '{}_{}_{}.pkl'.format(self._details.env_name, discount_factor, '{}')),
            map_desc.shape)
        stats.plot_policies_on_map(
            os.path.join(IMG_DIR, '{}_{}_{}.png'.format(self._details.env_name, discount_factor, '{}_{}')),
            map_desc, self._details.env.colors(), self._details.env.directions(),
            'Policy Iteration', 'Step', self._details, only_last=True)

        optimal_policy_stats = self.run_policy_and_collect(p, stats.optimal_policy, self._num_trials)
        self.log('{}'.format(optimal_policy_stats))
        optimal_policy_stats.to_csv(
            os.path.join(PI_DIR, '{}_{}_optimal.csv'.format(self._details.env_name, discount_factor)))

        with open(grid_file_name, 'a') as f:
            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                time.perf_counter() - t,
                len(optimal_policy_stats.rewards),
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))
        runs += 1
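# run_policy_and_collect is defined on the experiment base class and is not
# shown here; this is a rough sketch of what such a rollout helper might do,
# assuming a classic Gym-style env (reset()/step() returning
# (obs, reward, done, info)) and a deterministic tabular policy. All names
# below are illustrative assumptions.
import numpy as np

def rollout_policy_sketch(env, policy, num_trials=200, max_episode_steps=10000):
    """Run the policy for num_trials episodes and aggregate episode rewards."""
    episode_rewards = []
    for _ in range(num_trials):
        state = env.reset()
        total = 0.0
        for _ in range(max_episode_steps):
            state, reward, done, _ = env.step(int(policy[state]))
            total += reward
            if done:
                break
        episode_rewards.append(total)
    r = np.array(episode_rewards)
    # Mirrors the reward_* columns written to the grid CSV above.
    return {'reward_mean': r.mean(), 'reward_median': np.median(r),
            'reward_min': r.min(), 'reward_max': r.max(), 'reward_std': r.std()}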
def perform(self):
    # Policy iteration: sweep a fixed set of discount factors, timing each
    # run in wall-clock milliseconds.
    self._details.env.reset()

    grid_file_name = '{}/PI/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    with open(grid_file_name, 'w') as f:
        f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

    discount_factors = np.array([0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99])
    dims = len(discount_factors)
    self.log("Searching PI in {} dimensions".format(dims))

    runs = 1
    for discount_factor in discount_factors:
        t = int(round(time.time() * 1000))  # wall-clock timestamp in milliseconds
        self.log("{}/{} Processing PI with discount factor {}".format(runs, dims, discount_factor))

        p = solvers.PolicyIterationSolver(self._details.env, discount_factor=discount_factor,
                                          max_policy_eval_steps=3000, verbose=self._verbose)

        stats = self.run_solver_and_collect(p, self.convergence_check_fn, self._details.state_to_track)
        self.log("Took {} steps".format(len(stats.steps)))
        stats.to_csv('{}/PI/{}_{}_episodes.csv'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                       discount_factor))

        optimal_policy_stats = self.run_policy_and_collect(p, stats.optimal_policy, times=100)

        with open(grid_file_name, 'a') as f:
            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                int(round(time.time() * 1000)) - t,
                len(optimal_policy_stats.rewards),
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))
        runs += 1
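# The params field is written as JSON with its double quotes doubled ('""'),
# which is standard CSV quote escaping; the csv module undoes it on read, so
# the field can be passed straight back to json.loads. A small sketch of
# reading a grid file back:
import csv
import json

def load_grid_results(grid_file_name):
    rows = []
    with open(grid_file_name, newline='') as f:
        for row in csv.DictReader(f):
            row['params'] = json.loads(row['params'])  # e.g. {'discount_factor': 0.95}
            rows.append(row)
    return rows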
def perform(self):
    # Policy iteration: sweep the discount factor and record both the standard
    # grid results and an extended per-run convergence summary.
    self._details.env.reset()
    map_desc = self._details.env.unwrapped.desc

    grid_file_name = '{}/PI/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    my_results_name = '{}/PI/{}_modified_result.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    with open(grid_file_name, 'w') as f:
        f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")
    with open(my_results_name, 'w') as f:
        f.write("discount,time_to_converge,time_per_iteration,num_iterations_to_converge,"
                "physical_steps_taken,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

    # array([0., 0.1, 0.2, ..., 0.9])
    discount_factors = np.round(np.linspace(0.0, 0.9, num=10), 1)
    dims = len(discount_factors)
    self.log("Searching PI in {} dimensions".format(dims))

    result = []  # per-run convergence summary, kept in memory alongside the CSVs
    runs = 1
    for discount_factor in discount_factors:
        t = time.perf_counter()
        self.log("{}/{} Processing PI with discount factor {}".format(runs, dims, discount_factor))

        p = solvers.PolicyIterationSolver(self._details.env, discount_factor=discount_factor,
                                          max_policy_eval_steps=3000, verbose=self._verbose)

        stats = self.run_solver_and_collect(p, self.convergence_check_fn)
        result.append({'d': discount_factor,
                       'total_time_to_convergence': stats.elapsed_time,
                       'time_per_iteration': stats.step_times[-1],
                       'num_iterations_to_convergence': stats.total_iteration_step})
        self.log("Took {} steps".format(len(stats.steps)))
        stats.to_csv('{}/PI/{}_{}.csv'.format(OUTPUT_DIRECTORY, self._details.env_name, discount_factor))
        stats.pickle_results('{}/PI/pkl/{}_{}_{}.pkl'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                             discount_factor, '{}'),
                             map_desc.shape)
        stats.plot_policies_on_map('{}/images/PI/{}_{}_{}.png'.format(OUTPUT_DIRECTORY,
                                                                      self._details.env_name,
                                                                      discount_factor, '{}_{}'),
                                   map_desc, self._details.env.colors(), self._details.env.directions(),
                                   'Policy Iteration', 'Step', self._details, only_last=True)

        optimal_policy_stats = self.run_policy_and_collect(p, discount_factor, stats.optimal_policy)
        self.log('{}'.format(optimal_policy_stats))
        optimal_policy_stats.to_csv('{}/PI/{}_{}_optimal.csv'.format(OUTPUT_DIRECTORY,
                                                                     self._details.env_name,
                                                                     discount_factor))

        with open(grid_file_name, 'a') as f:
            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                time.perf_counter() - t,
                len(optimal_policy_stats.rewards),
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))

        with open(my_results_name, 'a') as f:
            f.write('"{}",{},{},{},{},{},{},{},{},{}\n'.format(
                discount_factor,
                stats.elapsed_time,
                stats.step_times[-1],
                stats.total_iteration_step,
                optimal_policy_stats.num_of_steps,
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))
        runs += 1
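# One way to consume the modified_result CSV written above: rank discount
# factors by mean reward, breaking ties with convergence cost. A minimal
# sketch assuming pandas is available (it is not imported by this module);
# the column names match the header written in perform().
import pandas as pd

def summarize_pi_results(my_results_name):
    df = pd.read_csv(my_results_name)
    df = df.sort_values(['reward_mean', 'num_iterations_to_converge'],
                        ascending=[False, True])
    return df[['discount', 'reward_mean', 'num_iterations_to_converge', 'time_to_converge']]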