def perform(self):
    # Q-Learner
    self._details.env.reset()
    map_desc = self._details.env.unwrapped.desc

    grid_file_name = os.path.join(
        QL_DIR, '{}_grid.csv'.format(self._details.env_name))
    with open(grid_file_name, 'w') as f:
        f.write(
            "params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n"
        )

    alphas = ALPHAS
    q_inits = Q_INITS
    epsilons = EPSILONS
    discount_factors = np.round(
        np.linspace(DISCOUNT_MIN, max(DISCOUNT_MIN, DISCOUNT_MAX),
                    num=NUM_DISCOUNTS), 2)

    dims = len(discount_factors) * len(alphas) * len(q_inits) * \
        len(epsilons) * len(self._epsilon_decays)
    self.log("Searching Q in {} dimensions".format(dims))

    runs = 1
    for alpha in alphas:
        for q_init in q_inits:
            for epsilon in epsilons:
                for epsilon_decay in self._epsilon_decays:
                    for discount_factor in discount_factors:
                        t = time.clock()
                        self.log(
                            "{}/{} Processing QL with alpha {}, q_init {}, epsilon {}, epsilon_decay {},"
                            " discount_factor {}".format(
                                runs, dims, alpha, q_init, epsilon,
                                epsilon_decay, discount_factor))

                        qs = solvers.QLearningSolver(
                            self._details.env,
                            self._max_episodes,
                            discount_factor=discount_factor,
                            alpha=alpha,
                            epsilon=epsilon,
                            epsilon_decay=epsilon_decay,
                            q_init=q_init,
                            min_consecutive_sub_theta_episodes=self._min_sub_thetas,
                            verbose=self._verbose,
                            theta=self._theta)

                        stats = self.run_solver_and_collect(
                            qs, self.convergence_check_fn)

                        self.log("Took {} episodes".format(len(stats.steps)))
                        stats.to_csv(
                            os.path.join(
                                QL_DIR, '{}_{}_{}_{}_{}_{}.csv'.format(
                                    self._details.env_name, alpha, q_init,
                                    epsilon, epsilon_decay, discount_factor)))
                        stats.pickle_results(
                            os.path.join(
                                PKL_DIR, '{}_{}_{}_{}_{}_{}_{}.pkl'.format(
                                    self._details.env_name, alpha, q_init,
                                    epsilon, epsilon_decay, discount_factor,
                                    '{}')),
                            map_desc.shape,
                            step_size=self._max_episodes / 20.0)
                        stats.plot_policies_on_map(
                            os.path.join(
                                IMG_DIR, '{}_{}_{}_{}_{}_{}_{}.png'.format(
                                    self._details.env_name, alpha, q_init,
                                    epsilon, epsilon_decay, discount_factor,
                                    '{}_{}')),
                            map_desc,
                            self._details.env.colors(),
                            self._details.env.directions(),
                            'Q-Learner',
                            'Episode',
                            self._details,
                            step_size=self._max_episodes / 20.0,
                            only_last=True)

                        # We have extra stats about the episode we might want to look at later
                        episode_stats = qs.get_stats()
                        episode_stats.to_csv(
                            os.path.join(
                                QL_DIR, '{}_{}_{}_{}_{}_{}_episode.csv'.format(
                                    self._details.env_name, alpha, q_init,
                                    epsilon, epsilon_decay, discount_factor)))

                        optimal_policy_stats = self.run_policy_and_collect(
                            qs, stats.optimal_policy, self._num_trials)
                        self.log('{}'.format(optimal_policy_stats))
                        optimal_policy_stats.to_csv(
                            os.path.join(
                                QL_DIR, '{}_{}_{}_{}_{}_{}_optimal.csv'.format(
                                    self._details.env_name, alpha, q_init,
                                    epsilon, epsilon_decay, discount_factor)))

                        with open(grid_file_name, 'a') as f:
                            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                                json.dumps({
                                    'alpha': alpha,
                                    'q_init': q_init,
                                    'epsilon': epsilon,
                                    'epsilon_decay': epsilon_decay,
                                    'discount_factor': discount_factor,
                                }).replace('"', '""'),
                                time.clock() - t,
                                len(optimal_policy_stats.rewards),
                                optimal_policy_stats.reward_mean,
                                optimal_policy_stats.reward_median,
                                optimal_policy_stats.reward_min,
                                optimal_policy_stats.reward_max,
                                optimal_policy_stats.reward_std,
                            ))
                        runs += 1

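# The perform() variant above relies on several module-level constants defined
# elsewhere in this runner. The values below are illustrative only (the search
# spaces mirror the hard-coded grids in the older variant further down); treat
# them as an assumption, not the project's actual settings.
QL_DIR = os.path.join(OUTPUT_DIRECTORY, 'Q')              # grid/episode/optimal CSVs
PKL_DIR = os.path.join(QL_DIR, 'pkl')                     # pickled per-episode results
IMG_DIR = os.path.join(OUTPUT_DIRECTORY, 'images', 'Q')   # policy plots

ALPHAS = [0.1, 0.3, 0.5, 0.7, 0.9]
Q_INITS = ['random', 0]
EPSILONS = [0.1, 0.3, 0.5, 0.7, 0.9]
DISCOUNT_MIN = 0.1
DISCOUNT_MAX = 0.9
NUM_DISCOUNTS = 5
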
def run_q(self, params):
    grid_file_name = '{}/Q/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    alpha, q_init, epsilon, epsilon_decay, discount_factor, runs, dims, map_desc = params
    print("Processing run {}".format(runs))

    t = time.clock()
    self.log("{}/{} Processing Q with alpha {}, q_init {}, epsilon {}, epsilon_decay {},"
             " discount_factor {}".format(runs, dims, alpha, q_init, epsilon,
                                          epsilon_decay, discount_factor))

    qs = solvers.QLearningSolver(self._details.env, self.max_episodes,
                                 discount_factor=discount_factor,
                                 alpha=alpha,
                                 epsilon=epsilon,
                                 epsilon_decay=epsilon_decay,
                                 q_init=q_init,
                                 verbose=self._verbose,
                                 theta=0.001)

    stats = self.run_solver_and_collect(qs, self.convergence_check_fn)

    self.log("Took {} episodes".format(len(stats.steps)))
    stats.to_csv('{}/Q/{}_{}_{}_{}_{}_{}.csv'.format(
        OUTPUT_DIRECTORY, self._details.env_name, alpha, q_init, epsilon,
        epsilon_decay, discount_factor))
    stats.pickle_results('{}/Q/pkl/{}_{}_{}_{}_{}_{}_{}.pkl'.format(
        OUTPUT_DIRECTORY, self._details.env_name, alpha, q_init, epsilon,
        epsilon_decay, discount_factor, '{}'),
        map_desc.shape, step_size=self.max_episodes / 20.0)
    stats.plot_policies_on_map('{}/images/Q/{}_{}_{}_{}_{}_{}_{}.png'.format(
        OUTPUT_DIRECTORY, self._details.env_name, alpha, q_init, epsilon,
        epsilon_decay, discount_factor, '{}_{}'),
        map_desc,
        self._details.env.colors(),
        self._details.env.directions(),
        'Q-Learner',
        'Episode',
        self._details,
        step_size=self.max_episodes / 20.0,
        only_last=True)

    # We have extra stats about the episode we might want to look at later
    episode_stats = qs.get_stats()
    episode_stats.to_csv('{}/Q/{}_{}_{}_{}_{}_{}_episode.csv'.format(
        OUTPUT_DIRECTORY, self._details.env_name, alpha, q_init, epsilon,
        epsilon_decay, discount_factor))

    optimal_policy_stats = self.run_policy_and_collect(qs, stats.optimal_policy)
    self.log('{}'.format(optimal_policy_stats))
    optimal_policy_stats.to_csv('{}/Q/{}_{}_{}_{}_{}_{}_optimal.csv'.format(
        OUTPUT_DIRECTORY, self._details.env_name, alpha, q_init, epsilon,
        epsilon_decay, discount_factor))

    with open(grid_file_name, 'a') as f:
        f.write('"{}",{},{},{},{},{},{},{}\n'.format(
            json.dumps({
                'alpha': alpha,
                'q_init': q_init,
                'epsilon': epsilon,
                'epsilon_decay': epsilon_decay,
                'discount_factor': discount_factor,
            }).replace('"', '""'),
            time.clock() - t,
            len(optimal_policy_stats.rewards),
            optimal_policy_stats.reward_mean,
            optimal_policy_stats.reward_median,
            optimal_policy_stats.reward_min,
            optimal_policy_stats.reward_max,
            optimal_policy_stats.reward_std,
        ))

    return runs

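# A minimal driver sketch (not part of the original source) showing how the
# run_q worker above might be invoked: write the grid CSV header once,
# enumerate the hyperparameter grid, pack each combination into the tuple
# run_q expects, and dispatch the runs (sequentially here; a process pool
# could dispatch the same picklable tuples). The grid values below are
# illustrative only, and run_q_grid itself is a hypothetical name.
def run_q_grid(self):
    self._details.env.reset()
    map_desc = self._details.env.unwrapped.desc

    grid_file_name = '{}/Q/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    with open(grid_file_name, 'w') as f:
        f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

    # Illustrative search spaces only
    alphas = [0.1, 0.5, 0.9]
    q_inits = ['random', 0]
    epsilons = [0.1, 0.5, 0.9]
    epsilon_decays = [0.001]
    discount_factors = np.round(np.linspace(0.1, 0.9, num=5), 2)

    grid = [(a, q, e, ed, df)
            for a in alphas
            for q in q_inits
            for e in epsilons
            for ed in epsilon_decays
            for df in discount_factors]
    dims = len(grid)

    for runs, (alpha, q_init, epsilon, epsilon_decay, discount_factor) in enumerate(grid, start=1):
        self.run_q((alpha, q_init, epsilon, epsilon_decay, discount_factor,
                    runs, dims, map_desc))
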
def perform(self):
    # Q-Learner
    self._details.env.reset()
    map_desc = self._details.env.unwrapped.desc

    grid_file_name = '{}/Q/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    with open(grid_file_name, 'w') as f:
        f.write(
            "params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n"
        )

    alphas = [0.1, 0.3, 0.5, 0.7, 0.9]
    q_inits = ['random', 0]
    epsilons = [0.1, 0.3, 0.5, 0.7, 0.9]
    # epsilon_decays = [0.0001]
    epsilon_decays = [0.001]
    # discount_factors = np.round(np.linspace(0, 0.9, num=10), 2)
    discount_factors = np.round(np.linspace(0.1, 0.9, num=5), 2)

    dims = len(discount_factors) * len(alphas) * len(q_inits) * \
        len(epsilons) * len(epsilon_decays)
    self.log("Searching Q in {} dimensions".format(dims))

    runs = 1
    for alpha in alphas:
        for q_init in q_inits:
            for epsilon in epsilons:
                for epsilon_decay in epsilon_decays:
                    for discount_factor in discount_factors:
                        t = time.clock()
                        self.log(
                            "{}/{} Processing Q with alpha {}, q_init {}, epsilon {}, epsilon_decay {},"
                            " discount_factor {}".format(
                                runs, dims, alpha, q_init, epsilon,
                                epsilon_decay, discount_factor))

                        qs = solvers.QLearningSolver(
                            self._details.env,
                            self.max_episodes,
                            discount_factor=discount_factor,
                            alpha=alpha,
                            epsilon=epsilon,
                            epsilon_decay=epsilon_decay,
                            q_init=q_init,
                            verbose=self._verbose)

                        stats = self.run_solver_and_collect(
                            qs, self.convergence_check_fn)

                        self.log("Took {} episodes".format(len(stats.steps)))
                        stats.to_csv('{}/Q/{}_{}_{}_{}_{}_{}.csv'.format(
                            OUTPUT_DIRECTORY, self._details.env_name, alpha,
                            q_init, epsilon, epsilon_decay, discount_factor))
                        stats.pickle_results(
                            '{}/Q/pkl/{}_{}_{}_{}_{}_{}_{}.pkl'.format(
                                OUTPUT_DIRECTORY, self._details.env_name,
                                alpha, q_init, epsilon, epsilon_decay,
                                discount_factor, '{}'),
                            map_desc.shape,
                            step_size=self.max_episodes / 20.0)
                        stats.plot_policies_on_map(
                            '{}/images/Q/{}_{}_{}_{}_{}_{}_{}.png'.format(
                                OUTPUT_DIRECTORY, self._details.env_name,
                                alpha, q_init, epsilon, epsilon_decay,
                                discount_factor, '{}_{}'),
                            map_desc,
                            self._details.env.colors(),
                            self._details.env.directions(),
                            'Q-Learner',
                            'Episode',
                            self._details,
                            step_size=self.max_episodes / 20.0,
                            only_last=True)

                        # We have extra stats about the episode we might want to look at later
                        episode_stats = qs.get_stats()
                        episode_stats.to_csv(
                            '{}/Q/{}_{}_{}_{}_{}_{}_episode.csv'.format(
                                OUTPUT_DIRECTORY, self._details.env_name,
                                alpha, q_init, epsilon, epsilon_decay,
                                discount_factor))

                        optimal_policy_stats = self.run_policy_and_collect(
                            qs, stats.optimal_policy)
                        self.log('{}'.format(optimal_policy_stats))
                        optimal_policy_stats.to_csv(
                            '{}/Q/{}_{}_{}_{}_{}_{}_optimal.csv'.format(
                                OUTPUT_DIRECTORY, self._details.env_name,
                                alpha, q_init, epsilon, epsilon_decay,
                                discount_factor))

                        with open(grid_file_name, 'a') as f:
                            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                                json.dumps({
                                    'alpha': alpha,
                                    'q_init': q_init,
                                    'epsilon': epsilon,
                                    'epsilon_decay': epsilon_decay,
                                    'discount_factor': discount_factor,
                                }).replace('"', '""'),
                                time.clock() - t,
                                len(optimal_policy_stats.rewards),
                                optimal_policy_stats.reward_mean,
                                optimal_policy_stats.reward_median,
                                optimal_policy_stats.reward_min,
                                optimal_policy_stats.reward_max,
                                optimal_policy_stats.reward_std,
                            ))
                        runs += 1

def perform(self):
    # Q-Learner
    self._details.env.reset()
    map_desc = self._details.env.unwrapped.desc

    grid_file_name = os.path.join(
        QL_DIR, '{}_grid.csv'.format(self._details.env_name))
    with open(grid_file_name, 'w') as f:
        f.write(
            "params,q_init,alpha_initial,alpha_min,alpha_decay,epsilon_initial,epsilon_min,epsilon_decay,"
            "discount_factor,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n"
        )

    dims = len(self._discount_factors) * len(self._alphas) * \
        len(self._q_inits) * len(self._epsilons)
    self.log("Searching Q in {} dimensions".format(dims))

    runs = 1
    for alpha in self._alphas:
        for q_init in self._q_inits:
            for epsilon in self._epsilons:
                for discount_factor in self._discount_factors:
                    t = time.clock()
                    self.log(
                        f"{runs}/{dims} Processing QL with alpha {alpha['initial']}->{alpha['min']} "
                        f"(decay={alpha['decay']}), q_init {q_init}, epsilon {epsilon['initial']}->"
                        f"{epsilon['min']} (decay={epsilon['decay']}), discount_factor {discount_factor}"
                    )

                    # Build a QLearningSolver object
                    qs = solvers.QLearningSolver(
                        self._details.env,
                        self._max_episodes,
                        self._min_episodes,
                        max_steps_per_episode=self._max_episode_steps,
                        discount_factor=discount_factor,
                        alpha_initial=alpha['initial'],
                        alpha_decay=alpha['decay'],
                        alpha_min=alpha['min'],
                        epsilon_initial=epsilon['initial'],
                        epsilon_decay=epsilon['decay'],
                        epsilon_min=epsilon['min'],
                        q_init=q_init,
                        min_consecutive_sub_theta_episodes=self._min_sub_thetas,
                        verbose=self._verbose,
                        theta=self._theta)

                    # Run the solver to generate an optimal policy. The stats object holds
                    # per-episode details as well as the resulting best policy.
                    stats = self.run_solver_and_collect(
                        qs, self.convergence_check_fn)

                    self.log("Took {} episodes".format(len(stats.steps)))

                    filename_base = params_to_filename_base(
                        self._details.env_name, alpha["initial"],
                        alpha["min"], alpha["decay"], q_init,
                        epsilon["initial"], epsilon["min"], epsilon["decay"],
                        discount_factor)

                    stats.to_csv(
                        os.path.join(QL_DIR, f'{filename_base}.csv'))
                    stats.pickle_results(
                        os.path.join(PKL_DIR, f'{filename_base}_{{}}.pkl'),
                        map_desc.shape,
                        step_size=self._max_episodes / 20.0)
                    stats.plot_policies_on_map(
                        os.path.join(IMG_DIR, f'{filename_base}_{{}}_{{}}.png'),
                        map_desc,
                        self._details.env.colors(),
                        self._details.env.directions(),
                        'Q-Learner',
                        'Episode',
                        self._details,
                        step_size=self._max_episodes / 20.0,
                        only_last=True)

                    # We have extra stats about the episode we might want to look at later
                    episode_stats = qs.get_stats()
                    episode_stats.to_csv(
                        os.path.join(QL_DIR, f'{filename_base}_episode.csv'))

                    optimal_policy_stats = self.run_policy_and_collect(
                        qs, stats.best_policy, self._num_trials)
                    self.log('{}'.format(optimal_policy_stats))
                    optimal_policy_stats.to_csv(
                        os.path.join(QL_DIR, f'{filename_base}_optimal.csv'))

                    with open(grid_file_name, 'a') as f:
                        # Data as an iterable of numbers and such
                        # TODO: Replace these instances where headers are written above and
                        #  numbers written down here with a csv or pandas to_csv call?
                        #  (See the sketch after this function.)

                        # Single group version (for legacy support)
                        params = json.dumps({
                            'q_init': q_init,
                            'alpha_initial': alpha['initial'],
                            'alpha_min': alpha['min'],
                            'alpha_decay': alpha['decay'],
                            'epsilon_initial': epsilon['initial'],
                            'epsilon_min': epsilon['min'],
                            'epsilon_decay': epsilon['decay'],
                            'discount_factor': discount_factor,
                        }).replace('"', '""')
                        data = [
                            f'"{params}"',
                            q_init,
                            alpha['initial'],
                            alpha['min'],
                            alpha['decay'],
                            epsilon['initial'],
                            epsilon['min'],
                            epsilon['decay'],
                            discount_factor,
                            time.clock() - t,
                            len(optimal_policy_stats.rewards),
                            optimal_policy_stats.reward_mean,
                            optimal_policy_stats.reward_median,
                            optimal_policy_stats.reward_min,
                            optimal_policy_stats.reward_max,
                            optimal_policy_stats.reward_std,
                        ]
                        # Convert to a single csv string
                        data_as_string = ",".join([str(d) for d in data])
                        f.write(f'{data_as_string}\n')

                    runs += 1

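# A possible answer to the TODO above (a sketch, not the original implementation):
# let csv.DictWriter own both the header and the per-run rows so the field order
# is defined in exactly one place. The field names mirror the header written by
# the perform() variant above; `row` is assumed to be a dict assembled inside the
# run loop, and append_grid_row is a hypothetical helper name. Note DictWriter
# handles quoting itself, so the manual '"' -> '""' escaping of the JSON params
# blob would no longer be needed.
def append_grid_row(grid_file_name, row, write_header=False):
    import csv  # local import so the sketch stays self-contained

    fieldnames = [
        'params', 'q_init', 'alpha_initial', 'alpha_min', 'alpha_decay',
        'epsilon_initial', 'epsilon_min', 'epsilon_decay', 'discount_factor',
        'time', 'steps', 'reward_mean', 'reward_median', 'reward_min',
        'reward_max', 'reward_std',
    ]
    mode = 'w' if write_header else 'a'
    with open(grid_file_name, mode, newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerow(row)

# params_to_filename_base is defined elsewhere in this repo; the sketch below is
# only a plausible reconstruction, matching the underscore-separated file naming
# used by the older variants above. Treat it as an assumption, not the project's
# actual helper.
def params_to_filename_base(env_name, alpha_initial, alpha_min, alpha_decay,
                            q_init, epsilon_initial, epsilon_min,
                            epsilon_decay, discount_factor):
    # Join every hyperparameter into a single flat name, e.g.
    # "frozen_lake_0.5_0.01_0.9_random_0.9_0.1_0.001_0.9"
    return '_'.join(str(p) for p in (
        env_name, alpha_initial, alpha_min, alpha_decay, q_init,
        epsilon_initial, epsilon_min, epsilon_decay, discount_factor))
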