def __init__(self, args, env, load_flag=False, explor_rate=None):
    super().__init__()
    # Observation layout: [TWN status, ray sensor, EB status] sizes.
    self.n_size_twn_status = env.obs_size_list[0]
    self.num_ray = env.obs_size_list[1]
    self.n_size_eb_status = env.obs_size_list[2]
    self.update_interval = 10
    self.target_update_interval = 200
    self.replay_start_size = 1000
    self.minibatch_size = 256
    gamma = 0.99
    alpha = 0.5  # currently unused
    n_clasfy_ray = 32

    # CNN autoencoder that compresses the ray observation into
    # n_clasfy_ray features.
    # self.q_func = Qfunc_FC_TWN2_Vision(env.obs_size_list[0], env.obs_size_list[1], env.obs_size_list[2], env.action_space.n)
    self.cnn_ae = Qfunc_FC_TWN2_Vision(self.num_ray, n_clasfy_ray)
    self.cnn_ae_opts = self.cnn_ae.gen_setup_optimizer(chainer.optimizers.Adam)
    self.replay_buffer_cnn_ae = success_buffer_replay.SuccessPrioReplayBuffer(
        capacity=10 ** 6)

    # Q-function input: raw status vectors plus the compressed ray code.
    self.q_func = Qfunc_FC_TWN_RL(
        self.n_size_twn_status + n_clasfy_ray + self.n_size_eb_status,
        env.action_space.n)
    self.q_func_opt = chainer.optimizers.Adam(eps=1e-2)
    self.q_func_opt.setup(self.q_func)

    if load_flag:
        if explor_rate is None:
            # A restored agent keeps a small constant exploration rate.
            explorer = chainerrl.explorers.ConstantEpsilonGreedy(
                epsilon=0.05, random_action_func=env.action_space.sample)
        else:
            explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
                start_epsilon=explor_rate, end_epsilon=0.05,
                decay_steps=50000,
                random_action_func=env.action_space.sample)
    else:
        explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
            start_epsilon=0.5, end_epsilon=0.05, decay_steps=50000,
            random_action_func=env.action_space.sample)

    #replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
    #replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10 ** 6)
    #replay_buffer = chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(capacity=10 ** 6)
    replay_buffer_q_func = success_buffer_replay.SuccessPrioReplayBuffer(
        capacity=10 ** 6)

    # Cast observations to float32 before they enter the network.
    phi = lambda x: x.astype(np.float32, copy=False)

    self.agent = chainerrl.agents.DoubleDQN(
        self.q_func, self.q_func_opt, replay_buffer_q_func, gamma, explorer,
        average_q_decay=0.01, average_loss_decay=0.01,
        update_interval=self.update_interval,
        target_update_interval=self.target_update_interval,
        phi=phi,
        replay_start_size=self.replay_start_size,
        minibatch_size=self.minibatch_size,
        #replay_start_size=5, minibatch_size=3,
        episodic_update=True, episodic_update_len=64)

    self.t = 0
    self.last_losses = None
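# Worked sketch of the exploration schedule configured above, with the
# constructor's default values. chainerrl's LinearDecayEpsilonGreedy moves
# epsilon linearly from start_epsilon to end_epsilon over decay_steps and
# then holds it at end_epsilon; this helper only illustrates that arithmetic
# and is not used by the agent.
def epsilon_at(t, start_epsilon=0.5, end_epsilon=0.05, decay_steps=50000):
    if t >= decay_steps:
        return end_epsilon
    return start_epsilon + (end_epsilon - start_epsilon) * t / decay_steps

# e.g. epsilon_at(0) == 0.5, epsilon_at(25000) == 0.275, epsilon_at(50000) == 0.05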
def func_agent_generation(args, env, load_flag=False):
    gamma = 0.99
    alpha = 0.5  # currently unused
    # q_func = QFunction(env.observation_space.low.size, env.action_space.n)
    q_func = Qfunc_FC_TWN(env.obs_size_list[0], env.obs_size_list[1],
                          env.obs_size_list[2], env.action_space.n)
    optimizer = chainer.optimizers.Adam(eps=1e-2)
    optimizer.setup(q_func)

    if load_flag:
        # A restored agent keeps a small constant exploration rate.
        explorer = chainerrl.explorers.ConstantEpsilonGreedy(
            epsilon=0.05, random_action_func=env.action_space.sample)
    else:
        # A fresh agent anneals epsilon from 0.5 to 0.05 over 50000 steps.
        explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
            start_epsilon=0.5, end_epsilon=0.05, decay_steps=50000,
            random_action_func=env.action_space.sample)

    #replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
    #replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10 ** 6)
    #replay_buffer = chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(capacity=10 ** 6)
    replay_buffer = success_buffer_replay.SuccessPrioReplayBuffer(
        capacity=10 ** 6)

    # Cast observations to float32 before they enter the network.
    phi = lambda x: x.astype(np.float32, copy=False)

    agent = chainerrl.agents.DoubleDQN(
        q_func, optimizer, replay_buffer, gamma, explorer,
        average_q_decay=0.01, average_loss_decay=0.01,
        update_interval=10, target_update_interval=200, phi=phi,
        replay_start_size=1500, minibatch_size=500,
        #replay_start_size=5, minibatch_size=3,
        episodic_update=True, episodic_update_len=64)

    #if len(args.load) > 0:
    if load_flag:
        #agent.load(args.load)
        agent.load('agent_ddqn')
        #logger.debug('load: {}'.format(args.load))
        print('load: {}'.format(args.load))
    return agent
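# Minimal usage sketch for func_agent_generation. Assumptions: `env` follows
# the gym-style episode API (reset()/step() returning the flat observation
# this agent expects) and `args` carries the `.load` attribute read above;
# `func_train_sketch` and `n_episodes` are hypothetical names for
# illustration. It wires the agent into the standard chainerrl training loop.
def func_train_sketch(args, env, n_episodes=10):
    agent = func_agent_generation(args, env, load_flag=False)
    for ep in range(n_episodes):
        obs = env.reset()
        reward, done = 0.0, False
        while not done:
            # act_and_train stores the transition and updates on schedule
            # (every update_interval steps once replay_start_size is reached).
            action = agent.act_and_train(obs, reward)
            obs, reward, done, _ = env.step(action)
        # Flush the terminal transition of the episode into the buffer.
        agent.stop_episode_and_train(obs, reward, done)
    return agent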
def __init__(self, args, env, load_flag=False, explor_rate=None):
    super().__init__()
    # Observation layout: [TWN status, ray sensor, EB status] sizes.
    self.n_size_twn_status = env.obs_size_list[0]
    self.num_ray = env.obs_size_list[1]
    self.n_size_eb_status = env.obs_size_list[2]
    self.update_interval = 10
    self.target_update_interval = 200
    self.replay_start_size = 1000
    self.minibatch_size = 512
    self.history_num = 30
    self.history_update_interval = 15
    self.history_append_count = 0
    self.history_data = []
    self.success_rate = 1.0
    gamma = 0.985
    alpha = 0.5  # currently unused
    n_clasfy_ray = 16

    # Feature sizes: the CNN autoencoder compresses the rays to
    # n_clasfy_ray values, the history autoencoder compresses the
    # concatenated per-step features to one third of their size, and the
    # RL layer sees the raw status vectors plus both compressed codes.
    self.cnn_ae_output_elements = n_clasfy_ray
    self.hist_ana_ae_output_elements = (
        self.n_size_twn_status + n_clasfy_ray + self.n_size_eb_status) // 3
    self.rl_layer_input_elements = (
        self.n_size_twn_status + self.n_size_eb_status
        + self.cnn_ae_output_elements + self.hist_ana_ae_output_elements)

    # self.q_func = Qfunc_FC_TWN2_Vision(env.obs_size_list[0], env.obs_size_list[1], env.obs_size_list[2], env.action_space.n)
    self.cnn_ae = Qfunc_FC_TWN2_Vision(self.num_ray, n_clasfy_ray)
    self.cnn_ae_opt = chainer.optimizers.Adam()
    self.cnn_ae_opt.setup(self.cnn_ae)
    self.replay_buffer_cnn_ae = success_buffer_replay.SuccessPrioReplayBuffer(
        capacity=10 ** 6)
    # self.replay_buffer_cnn_ae = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
    self.cnn_ae_last_loss = None

    # Autoencoder over the last history_num steps of per-step features.
    self.hist_ana_ae = Qfunc_FC_TWN2_History(
        self.n_size_twn_status + n_clasfy_ray + self.n_size_eb_status,
        self.history_num, self.hist_ana_ae_output_elements)
    self.hist_ana_ae_opt = chainer.optimizers.Adam()
    self.hist_ana_ae_opt.setup(self.hist_ana_ae)
    self.replay_buffer_hist_ana_ae = chainerrl.replay_buffer.ReplayBuffer(
        capacity=5000)
    self.hist_ana_ae_last_out = None

    self.q_func = Qfunc_FC_TWN_RL(self.rl_layer_input_elements,
                                  env.action_space.n)
    self.q_func_opt = chainer.optimizers.Adam(eps=1e-3)
    self.q_func_opt.setup(self.q_func)

    self.explorer = None
    if load_flag:
        if explor_rate is None:
            # A restored agent keeps a small constant exploration rate.
            self.explorer = chainerrl.explorers.ConstantEpsilonGreedy(
                epsilon=0.05, random_action_func=env.action_space.sample)
        else:
            self.explorer = SuccessRateEpsilonGreedy.SuccessRateEpsilonGreedy(
                start_epsilon=explor_rate, end_epsilon=0.0,
                decay_steps=50000,
                random_action_func=env.action_space.sample)
    else:
        self.explorer = SuccessRateEpsilonGreedy.SuccessRateEpsilonGreedy(
            start_epsilon=0.5, end_epsilon=0.0, decay_steps=50000,
            random_action_func=env.action_space.sample)

    #replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
    #replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10 ** 6)
    #replay_buffer = chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(capacity=10 ** 6)
    replay_buffer_q_func = success_buffer_replay.ActionFareSamplingReplayBuffer(
        capacity=10 ** 6)

    # Cast observations to float32 before they enter the network.
    phi = lambda x: x.astype(np.float32, copy=False)

    self.agent = chainerrl.agents.DoubleDQN(
        self.q_func, self.q_func_opt, replay_buffer_q_func, gamma,
        self.explorer,
        average_q_decay=0.01, average_loss_decay=0.01,
        update_interval=self.update_interval,
        target_update_interval=self.target_update_interval,
        phi=phi,
        replay_start_size=self.replay_start_size,
        minibatch_size=self.minibatch_size,
        #replay_start_size=5, minibatch_size=3,
        episodic_update=True, episodic_update_len=64)

    self.t = 0
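# Worked sketch of the feature-size bookkeeping in the constructor above.
# The observation sizes are hypothetical (obs_size_list = [10, 64, 4]);
# only the formulas mirror the constructor, and this helper is for
# illustration rather than part of the agent.
def rl_input_size_sketch(n_twn=10, n_ray_code=16, n_eb=4):
    # History-autoencoder code: one third of the concatenated per-step
    # features, e.g. (10 + 16 + 4) // 3 == 10.
    hist_code = (n_twn + n_ray_code + n_eb) // 3
    # Q-function input: both raw status vectors plus both codes,
    # e.g. 10 + 4 + 16 + 10 == 40.
    return n_twn + n_eb + n_ray_code + hist_code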