def write_test_results(self):
    helper.write_stats_file(self.run_stats_file,
                            self.current_episode,
                            sum(self.test_steps),
                            np.mean(self.test_steps),
                            sum(self.test_rewards),
                            np.mean(self.test_rewards),
                            float("{0:.5f}".format(self.learner.last_epsilon)),
                            self.run_steps)
    self._write_test_results()
def _init_run(self):
    self.learner.init_Q(states=self.env.get_all_states(), how='zero')
    self.run_lib_probs_file = os.path.join(self.run_dir,
                                           'stats_policy_probs.csv')
    self.run_lib_absolute_file = os.path.join(self.run_dir,
                                              'stats_policy_absolute.csv')
    policy_usage_header = self.task_policies[:]
    policy_usage_header.insert(0, 'episode')
    helper.write_stats_file(self.run_lib_probs_file, policy_usage_header)
    helper.write_stats_file(self.run_lib_absolute_file, policy_usage_header)
    self.tau_policy = self.params['tau_policy']
    self.importance_limit = self.params['policy_importance_limit']
    for policy_name in self.current_library:
        self.current_library[policy_name]['active'] = True
    self.active_policies = len(self.task_policies)
    # Scale the exploration rate down as the policy library fills up.
    epsilon = (self.params['epsilon'] *
               ((1 + self.params['task_library_size'] -
                 len(self.current_library)) /
                self.params['task_library_size']))
    self.learner.set_epsilon(epsilon)
    # if len(self.current_library) > 1:
    #     self.learner.set_epsilon(epsilon)
    # else:
    #     self.learner.set_epsilon(self.params['epsilon'])
    self.evaluate_current_library()
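# Illustrative sketch (not part of the class): how the exploration rate in
# _init_run() above shrinks with the number of stored policies. The names
# 'base_epsilon', 'capacity' and 'library_size' are placeholders for
# params['epsilon'], params['task_library_size'] and len(current_library).
def scaled_epsilon(base_epsilon, capacity, library_size):
    """Return base_epsilon scaled by the remaining library capacity."""
    return base_epsilon * float(1 + capacity - library_size) / capacity

# e.g. with base_epsilon=0.3 and capacity=4: one stored policy keeps the
# full 0.3, while a full library of four reduces it to 0.075.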
def init_run(self):
    _logger.info("..... Starting run %s" % str(self.current_run))
    run_dir = os.path.join(self.task_dir, 'run_' + str(self.current_run))
    self.run_dir = helper.create_dir(run_dir)
    # Create run stats file: stats_run.csv
    self.run_stats_file = os.path.join(self.run_dir, 'stats_run.csv')
    self.run_steps = 0
    helper.write_stats_file(self.run_stats_file,
                            'episode', 'steps_total', 'steps_mean',
                            'reward_total', 'reward_mean',
                            'epsilon', 'step_count')
    self._init_run()
def _init_run(self):
    self.learner.init_Q(states=self.env.get_all_states(), how='zero')
    self.learner.set_epsilon(self.params['epsilon'])
    self.run_lib_W_file = os.path.join(self.run_dir, 'stats_policy_W.csv')
    self.run_lib_W_mean_file = os.path.join(self.run_dir,
                                            'stats_policy_W_mean.csv')
    self.run_lib_U_file = os.path.join(self.run_dir, 'stats_policy_U.csv')
    self.run_lib_P_file = os.path.join(self.run_dir, 'stats_policy_P.csv')
    policy_usage_header = []
    for policy_name in self.library:
        policy_usage_header.append(policy_name)
    policy_usage_header.insert(0, 'episode')
    helper.write_stats_file(self.run_lib_W_file, policy_usage_header)
    helper.write_stats_file(self.run_lib_W_mean_file, ['episode', 'W_mean'])
    helper.write_stats_file(self.run_lib_U_file, policy_usage_header)
    helper.write_stats_file(self.run_lib_P_file, policy_usage_header)
    self.tau_policy = self.params['tau_policy']
def save_best_episode(self):
    df = pd.read_csv(os.path.join(self.run_dir, 'stats_run.csv'))
    # .ix is deprecated; .loc works with the label returned by idxmin()
    least_steps_row = df.loc[df['steps_mean'].idxmin()]
    run_best_file = os.path.join(self.run_dir, 'stats_run_best.csv')
    headers = ['run']
    content = [int(self.current_run)]
    for column in df:
        headers.append(str(column))
        content.append(least_steps_row[column])
    helper.write_stats_file(run_best_file, headers)
    helper.write_stats_file(run_best_file, content)
    helper.copy_file(
        os.path.join(self.run_dir,
                     'episode_' + str(int(least_steps_row['episode'])),
                     'Qs.npy'),
        os.path.join(self.run_dir, 'best_Qs.npy'))
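# Illustrative sketch: how save_best_episode() picks the row with the lowest
# mean step count. The DataFrame content here is made up; only the column
# name 'steps_mean' mirrors stats_run.csv.
import pandas as pd

demo = pd.DataFrame({'episode': [1, 2, 3], 'steps_mean': [12.0, 7.5, 9.1]})
best_row = demo.loc[demo['steps_mean'].idxmin()]   # the row for episode 2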
def evaluate_current_library(self):
    self.set_status('policy_eval')
    removed_policy = True
    while removed_policy:
        removed_policy = False
        divider = 0.0
        absolute = [self.current_episode]
        # First pass: raw weights from the mean test results of each policy.
        for policy_name in self.current_library:
            if self.current_library[policy_name]['active']:
                if self.active_policies > 1:
                    policy_results_mean = self.get_mean_test_results(
                        policy_name)
                    self.current_library[policy_name]['weight'] = \
                        self.get_policy_weight(policy_results_mean)
                    absolute.append(policy_results_mean)
                else:
                    absolute.append(0.0)
            else:
                self.current_library[policy_name]['weight'] = 0.0
                absolute.append(0.0)
            divider += self.current_library[policy_name]['weight']
        weights = [self.current_episode]
        # Second pass: normalise the weights and deactivate any policy whose
        # relative importance falls below the limit; the current task's own
        # policy is never removed. A removal restarts the evaluation.
        for policy_name in self.current_library:
            if self.current_library[policy_name]['active']:
                if self.active_policies > 1:
                    self.current_library[policy_name]['weight'] /= divider
                    if self.current_library[policy_name]['weight'] < \
                            self.importance_limit and not \
                            policy_name == self.current_task['name']:
                        self.current_library[policy_name]['active'] = False
                        self.active_policies -= 1
                        _logger.debug(
                            "Episode %s: Deactivated policy %s" %
                            (str(self.current_episode), policy_name))
                        removed_policy = True
                        break
                else:
                    self.current_library[policy_name]['weight'] = 1.0
            weights.append(self.current_library[policy_name]['weight'])
            _logger.debug(
                "%s: weight = %s" %
                (policy_name,
                 str(self.current_library[policy_name]['weight'])))
    helper.write_stats_file(self.run_lib_absolute_file, absolute)
    helper.write_stats_file(self.run_lib_probs_file, weights)
    self.update_train_settings()
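# Illustrative sketch: the normalise-and-prune loop of
# evaluate_current_library(), reduced to a plain dictionary. 'weights',
# 'importance_limit' and 'protected' are placeholders for the library
# weights, params['policy_importance_limit'] and the current task's policy.
def prune_policies(weights, importance_limit, protected):
    active = dict(weights)
    removed = True
    while removed and len(active) > 1:
        removed = False
        total = sum(active.values())
        for name, weight in list(active.items()):
            if weight / total < importance_limit and name != protected:
                del active[name]   # deactivate this policy
                removed = True
                break              # re-normalise over the survivors
    return active

# e.g. prune_policies({'a': 5.0, 'b': 0.01, 'task': 2.0}, 0.05, 'task')
# drops 'b' and keeps {'a': 5.0, 'task': 2.0}.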
def save_best_run(self):
    # Save best Q-table for current task
    df = pd.read_csv(
        os.path.join(self.task_dir, 'run_' + str(1), 'stats_run_best.csv'))
    # append() returns a new DataFrame, so the result must be reassigned;
    # runs are assumed to be numbered 1..params['runs'].
    for i in range(2, self.params['runs'] + 1):
        df = df.append(
            pd.read_csv(
                os.path.join(self.task_dir, 'run_' + str(i),
                             'stats_run_best.csv')),
            ignore_index=True)
    least_steps_row = df.loc[df['steps_mean'].idxmin()]
    task_best_file = os.path.join(self.task_dir, 'stats_task_best.csv')
    headers = ['task']
    content = [str(self.current_task['name'])]
    for column in df:
        headers.append(str(column))
        content.append(least_steps_row[column])
    helper.write_stats_file(task_best_file, headers)
    helper.write_stats_file(task_best_file, content)
    helper.copy_file(
        os.path.join(self.task_dir,
                     'run_' + str(int(least_steps_row['run'])),
                     'best_Qs.npy'),
        os.path.join(self.task_dir, 'best_Qs.npy'))
def _cleanup_episode(self):
    if self.status == 'training':
        # if self.learner.epsilon > -1 * self.learner.epsilon_change:
        #     self.learner.set_epsilon(self.learner.epsilon +
        #                              self.learner.epsilon_change)
        # Discounted return of the finished episode ...
        self.current_W = \
            ((self.params['gamma'] ** self.steps_in_episode) *
             self.reward_in_episode)
        # ... folded into the running mean W over the U uses of the policy.
        self.library[self.current_policy]['W'] = \
            (((self.library[self.current_policy]['W'] *
               self.library[self.current_policy]['U']) +
              self.current_W) /
             (self.library[self.current_policy]['U'] + 1))
        self.library[self.current_policy]['U'] += 1
        self.tau_policy += self.params['tau_policy_delta']
        _logger.debug('%s: current_W=%s, W=%s, U=%s, tau=%s' %
                      (str(self.current_policy),
                       str(self.current_W),
                       str(self.library[self.current_policy]['W']),
                       str(self.library[self.current_policy]['U']),
                       str(self.tau_policy)))
        Ws = [self.current_episode]
        W_mean = [self.current_episode]
        Us = [self.current_episode]
        Ps = [self.current_episode]
        W_sum = 0
        for policy in self.library:
            W_sum += self.library[policy]['W']
            Ws.append(self.library[policy]['W'])
            Us.append(self.library[policy]['U'])
            Ps.append(self.library[policy]['P'])
        W_mean.append(W_sum / len(self.library))
        helper.write_stats_file(self.run_lib_W_file, Ws)
        helper.write_stats_file(self.run_lib_W_mean_file, W_mean)
        helper.write_stats_file(self.run_lib_U_file, Us)
        helper.write_stats_file(self.run_lib_P_file, Ps)
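# Illustrative sketch: the per-policy statistics update performed by
# _cleanup_episode(). W tracks the running mean of the discounted episode
# return gamma**steps * reward, and U counts how often the policy was used.
# All argument names are placeholders for the corresponding library fields.
def update_policy_stats(W, U, gamma, steps, reward):
    current_W = (gamma ** steps) * reward
    W = (W * U + current_W) / (U + 1)   # incremental mean over U + 1 uses
    return W, U + 1

# e.g. update_policy_stats(W=0.5, U=4, gamma=0.9, steps=10, reward=1.0)
# returns roughly (0.47, 5).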