def take_action(self, state, is_train, is_start):
    if is_train:
        if is_start:
            self.train_ep_count += 1
        if self.use_external_exploration:
            _, greedy_action = self.hydra_network.predict_action(np.expand_dims(state, 0), False)
            chosen_action = self.exploration_policy.generate(greedy_action[0], self.train_global_steps)
        else:
            # single state so first idx
            chosen_action = self.hydra_network.sample_action(np.expand_dims(state, 0), False, is_single_sample=True)[0][0]
        self.train_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.train_global_steps, chosen_action[0], tag='train/action_taken')

        if self.write_plot:
            alpha, mean, sigma = self.hydra_network.getModalStats()
            func1 = self.hydra_network.getQFunction(state)
            func2 = self.hydra_network.getPolicyFunction(alpha, mean, sigma)

            old_greedy_action, greedy_action = self.hydra_network.predict_action(np.expand_dims(state, 0), False)
            if self.hydra_network.use_better_q_gd:
                greedy_action = self.hydra_network.q_gradient_ascent(np.expand_dims(state, 0), greedy_action, True, is_better_q_gd=True)
            old_greedy_action = old_greedy_action[0]
            greedy_action = greedy_action[0]

            utils.plot_utils.plotFunction("ActorExpert", [func1, func2], state,
                                          [greedy_action, old_greedy_action, mean], chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='Actor-Expert+, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
    else:
        old_greedy_action, greedy_action = self.hydra_network.predict_action(np.expand_dims(state, 0), False)
        if self.hydra_network.use_better_q_gd:
            greedy_action = self.hydra_network.q_gradient_ascent(np.expand_dims(state, 0), greedy_action, True, is_better_q_gd=True)
        old_greedy_action = old_greedy_action[0]
        greedy_action = greedy_action[0]

        if is_start:
            self.eval_ep_count += 1
        chosen_action = greedy_action
        self.eval_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')

    return chosen_action

def take_action(self, state, is_train, is_start):
    greedy_action, action_points = self.network.predict_action(state.reshape(-1, self.state_dim))
    # train
    if is_train:
        if is_start:
            self.train_ep_count += 1
        self.train_global_steps += 1

        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
        else:
            chosen_action = greedy_action

        if self.write_log:
            write_summary(self.writer, self.train_global_steps, chosen_action[0], tag='train/action_taken')

        if self.write_plot:
            func1 = self.network.getQFunction(state)
            utils.plot_utils.plotFunction("WireFitting", [func1], state, greedy_action, chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='WireFitting, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
        return chosen_action
    # eval
    else:
        if is_start:
            self.eval_ep_count += 1
        self.eval_global_steps += 1
        chosen_action = greedy_action

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')
        return chosen_action

def take_action(self, state, is_train, is_start):
    # Train
    if is_train:
        if is_start:
            self.train_ep_count += 1
        self.train_global_steps += 1

        greedy_action = self.network.take_action(np.expand_dims(state, 0))[0]
        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
        else:
            # Get action from network
            chosen_action = greedy_action
        # print('train', chosen_action)

        if self.write_log:
            raise NotImplementedError

        if self.write_plot:
            q_func = self.network.getQFunction(state)
            utils.plot_utils.plotFunction("SoftQlearning", [q_func], state, greedy_action, chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='SoftQlearning, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
    # Eval
    else:
        # greedy action (mean)
        chosen_action = self.network.take_action(np.expand_dims(state, 0))[0]
        if is_start:
            self.eval_ep_count += 1
        self.eval_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')

    return chosen_action

def take_action(self, state, is_train, is_start):
    greedy_action = self.hydra_network.predict_action(np.expand_dims(state, 0), False)[0]
    if is_train:
        if is_start:
            self.train_ep_count += 1
        self.train_global_steps += 1

        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
        else:
            chosen_action = greedy_action

        if self.write_log:
            write_summary(self.writer, self.train_global_steps, chosen_action[0], tag='train/action_taken')

        if self.write_plot:
            func1 = self.hydra_network.getQFunction(state)
            utils.plot_utils.plotFunction("DDPG", [func1], state, greedy_action, chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='DDPG, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
    else:
        if is_start:
            self.eval_ep_count += 1
        self.eval_global_steps += 1
        chosen_action = greedy_action

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')

    return chosen_action

def take_action(self, state, is_train, is_start):
    _, max_action_batch_target = self.q_network.get_max_action(np.expand_dims(state, 0), use_target=False, is_train=False)
    greedy_action = max_action_batch_target[0]
    if is_train:
        if is_start:
            self.train_ep_count += 1
        self.train_global_steps += 1

        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
        else:
            chosen_action = greedy_action

        if self.write_log:
            write_summary(self.writer, self.train_global_steps, chosen_action[0], tag='train/action_taken')

        if self.write_plot:
            func1 = self.q_network.getQFunction(state)
            raise NotImplementedError
            # utils.plot_utils.plotFunction("OptimalQ", [func1], state, greedy_action, chosen_action,
            #                               self.action_min, self.action_max,
            #                               display_title='OptimalQ, steps: ' + str(self.train_global_steps),
            #                               save_title='steps_' + str(self.train_global_steps),
            #                               save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
            #                               show=False)
    else:
        if is_start:
            self.eval_ep_count += 1
        self.eval_global_steps += 1
        chosen_action = greedy_action

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')

    return chosen_action

def take_action(self, state, is_train, is_start):
    if is_train:
        sample, greedy_action, weight_mean_var = self.qt_opt_network.sample_action(np.expand_dims(state, 0))
        greedy_action = greedy_action[0]
        means = weight_mean_var[0][1]

        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
        else:
            chosen_action = np.clip(sample[0][0], self.action_min, self.action_max)

        if is_start:
            self.train_ep_count += 1
        self.train_global_steps += 1

        if self.write_log:
            # only valid for 1-dimensional actions
            write_summary(self.writer, self.train_global_steps, chosen_action[0], tag='train/action_taken')

        if self.write_plot:
            func1 = self.qt_opt_network.getQFunction(state)
            func2 = self.qt_opt_network.getPolicyFunction(weight_mean_var[0])

            utils.plot_utils.plotFunction("QT_OPT", [func1, func2], state, [greedy_action, means], chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='QT-Opt, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
        return chosen_action
    else:
        greedy_action = self.qt_opt_network.predict_action(np.expand_dims(state, 0))[0]
        if is_start:
            self.eval_ep_count += 1
        self.eval_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, greedy_action[0], tag='eval/action_taken')
        return greedy_action

def take_action(self, state, is_train, is_start):
    greedy_action = self.hydra_network.predict_action(np.expand_dims(state, 0), False)
    greedy_action = greedy_action[0]
    if is_train:
        if is_start:
            self.train_ep_count += 1

        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
        else:
            # single state so first idx
            # single sample so first idx
            _, chosen_action = self.hydra_network.sample_action(np.expand_dims(state, 0), False, is_single_sample=True)
            chosen_action = chosen_action[0][0]
        self.train_global_steps += 1

        if self.write_log:
            raise NotImplementedError

        if self.write_plot:
            alpha, mean, sigma = self.hydra_network.getModalStats()
            if self.use_true_q:
                func1 = self.hydra_network.getTrueQFunction(state)
            else:
                func1 = self.hydra_network.getQFunction(state)
            func2 = self.hydra_network.getPolicyFunction(alpha, mean, sigma)

            utils.plot_utils.plotFunction("ActorCritic_unimodal", [func1, func2], state,
                                          [greedy_action, mean], chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='Actor-Critic, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
    else:
        if is_start:
            self.eval_ep_count += 1

        if self.sample_for_eval:
            # single state so first idx
            # single sample so first idx
            _, chosen_action = self.hydra_network.sample_action(np.expand_dims(state, 0), False, is_single_sample=True)
            chosen_action = chosen_action[0][0]
        else:
            chosen_action = greedy_action
        self.eval_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')

    # print('chosen_action: {}'.format(chosen_action))
    return chosen_action

def take_action(self, state, is_train, is_start):
    greedy_action = self.actor_network.predict_action(np.expand_dims(state, 0), False)
    greedy_action = greedy_action[0]
    if is_train:
        if is_start:
            self.train_ep_count += 1

        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
        else:
            # single state so first idx
            # single sample so first idx
            chosen_action = self.actor_network.sample_action(np.expand_dims(state, 0), False, is_single_sample=True)[0]
        self.train_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.train_global_steps, chosen_action[0], tag='train/action_taken')

            # per-mode mixture stats; this logging assumes two mixture components
            alpha, mean, sigma = self.actor_network.getModalStats()
            write_summary(self.writer, self.train_global_steps, alpha[0], tag='train/alpha0')
            write_summary(self.writer, self.train_global_steps, alpha[1], tag='train/alpha1')
            write_summary(self.writer, self.train_global_steps, mean[0], tag='train/mean0')
            write_summary(self.writer, self.train_global_steps, mean[1], tag='train/mean1')
            write_summary(self.writer, self.train_global_steps, sigma[0], tag='train/sigma0')
            write_summary(self.writer, self.train_global_steps, sigma[1], tag='train/sigma1')

        if self.write_plot:
            alpha, mean, sigma = self.actor_network.getModalStats()
            func1 = self.critic_network.getQFunction(state)
            func2 = self.actor_network.getPolicyFunction(alpha, mean, sigma)

            utils.plot_utils.plotFunction("ActorCritic", [func1, func2], state,
                                          [greedy_action, mean], chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='Actor-Critic, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
    else:
        if is_start:
            self.eval_ep_count += 1

        if self.sample_for_eval:
            # single state so first idx
            # single sample so first idx
            chosen_action = self.actor_network.sample_action(np.expand_dims(state, 0), False, is_single_sample=True)[0]
        else:
            chosen_action = greedy_action
        self.eval_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')

    return chosen_action

def take_action(self, state, is_train, is_start):
    # Train
    if is_train:
        if is_start:
            self.train_ep_count += 1
        self.train_global_steps += 1

        if self.use_external_exploration:
            greedy_action = self.network.predict_action(np.expand_dims(state, 0))
            chosen_action = self.exploration_policy.generate(greedy_action[0], self.train_global_steps)
        else:
            # Get action from network
            chosen_action = self.network.sample_action(np.expand_dims(state, 0))[0]
        # print('train', chosen_action)

        if self.write_log:
            raise NotImplementedError

        if self.write_plot:
            if self.use_true_q:
                # Loaded almost True Q
                q_func = self.network.getQFunction(state)
                # q_func = self.network.getTrueQFunction(state)
                # raise NotImplementedError
            else:
                q_func = self.network.getQFunction(state)
            pi_func = self.network.getPolicyFunction(state)
            greedy_action = self.network.predict_action(np.expand_dims(state, 0))[0]

            utils.plot_utils.plotFunction("SoftCEM", [q_func, pi_func], state, greedy_action, chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='SoftCEM, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
    # Eval
    else:
        if self.sample_for_eval:
            # sample action
            chosen_action = self.network.sample_action(np.expand_dims(state, 0))[0]
        else:
            # greedy action (mean)
            chosen_action = self.network.predict_action(np.expand_dims(state, 0))[0]
        if is_start:
            self.eval_ep_count += 1
        self.eval_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')

    return chosen_action

def take_action(self, state, is_train, is_start):
    greedy_action = self.network.predict_action(state.reshape(-1, self.state_dim))
    # train
    if is_train:
        if is_start:
            self.train_ep_count += 1
        self.train_global_steps += 1

        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
            covmat = None
        else:
            chosen_action, covmat = self.network.sample_action(np.expand_dims(state, 0), greedy_action)

        if self.write_log:
            write_summary(self.writer, self.train_global_steps, chosen_action[0], tag='train/action_taken')

        # plotting currently doesn't handle external exploration (covmat is None on that path)
        if self.write_plot:
            assert covmat is not None
            func1 = self.network.getQFunction(state)
            func2 = self.network.getPolicyFunction(greedy_action, covmat)

            utils.plot_utils.plotFunction("NAF", [func1, func2], state, greedy_action, chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='NAF, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
        return chosen_action
    # eval
    else:
        if is_start:
            self.eval_ep_count += 1
        chosen_action = greedy_action.reshape(-1)
        self.eval_global_steps += 1

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')
        return chosen_action

def take_action(self, state, is_train, is_start):
    # initialize the action for the inner optimization over actions
    if self.inference == 'bundle_entropy':
        # bundle entropy operates on actions rescaled to (0, 1)
        action_init = np.expand_dims(
            (self.rng.uniform(self.action_min, self.action_max) - self.action_min) * 1.0
            / (self.action_max - self.action_min), 0)
        action_init = np.clip(action_init, 0.0001, 0.9999)
    elif self.inference == 'adam':
        action_init = np.expand_dims(self.rng.uniform(self.action_min, self.action_max), 0)
    else:
        raise ValueError('Unknown inference method: {}'.format(self.inference))

    greedy_action = self.entropy_network.alg_opt(np.expand_dims(state, 0), action_init,
                                                 self.inference_max_steps, False)[0]
    if is_train:
        if is_start:
            self.train_ep_count += 1
        self.train_global_steps += 1

        if self.use_external_exploration:
            chosen_action = self.exploration_policy.generate(greedy_action, self.train_global_steps)
        else:
            chosen_action = greedy_action

        if self.write_log:
            write_summary(self.writer, self.train_global_steps, chosen_action[0], tag='train/action_taken')

        if self.write_plot:
            func1 = self.entropy_network.getQFunction(state)
            utils.plot_utils.plotFunction("PICNN", [func1], state, greedy_action, chosen_action,
                                          self.action_min, self.action_max,
                                          display_title='PICNN, steps: ' + str(self.train_global_steps),
                                          save_title='steps_' + str(self.train_global_steps),
                                          save_dir=self.writer.get_logdir(), ep_count=self.train_ep_count,
                                          show=False)
    else:
        if is_start:
            self.eval_ep_count += 1
        self.eval_global_steps += 1
        chosen_action = greedy_action

        if self.write_log:
            write_summary(self.writer, self.eval_global_steps, chosen_action[0], tag='eval/action_taken')

    return chosen_action
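
# --- Usage sketch (illustrative, not part of any agent above) ---
# A minimal driver loop showing how these take_action implementations are
# invoked. The run_episode helper, the agent/env objects, and the Gym-style
# reset/step API are assumptions for illustration only; is_start is True
# exactly once per episode, matching how the episode counters above are
# incremented.
def run_episode(agent, env, is_train):
    state = env.reset()
    is_start = True   # first step of the episode
    done = False
    total_reward = 0.0
    while not done:
        action = agent.take_action(state, is_train, is_start)
        is_start = False  # subsequent steps are not episode starts
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward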