Example #1
    def __init__(self, args, env, load_flag=False, explor_rate=None):
        super().__init__()

        self.n_size_twn_status = env.obs_size_list[0]
        self.num_ray           = env.obs_size_list[1]
        self.n_size_eb_status  = env.obs_size_list[2]

        self.update_interval = 10
        self.target_update_interval = 200
        self.replay_start_size = 1000
        self.minibatch_size = 256

        gamma = 0.99
        alpha = 0.5
        
        n_clasfy_ray = 32

        #        self.q_func = Qfunc_FC_TWN2_Vision(env.obs_size_list[0], env.obs_size_list[1], env.obs_size_list[2], env.action_space.n)
        self.cnn_ae = Qfunc_FC_TWN2_Vision(self.num_ray, n_clasfy_ray)
        self.cnn_ae_opts = self.cnn_ae.gen_setup_optimizer(chainer.optimizers.Adam)
        self.replay_buffer_cnn_ae = success_buffer_replay.SuccessPrioReplayBuffer(
            capacity=10 ** 6)

        self.q_func = Qfunc_FC_TWN_RL(
            self.n_size_twn_status + n_clasfy_ray + self.n_size_eb_status,
            env.action_space.n)
        self.q_func_opt = chainer.optimizers.Adam(eps=1e-2)
        self.q_func_opt.setup(self.q_func)
        if load_flag:
            if explor_rate is None:
                explorer = chainerrl.explorers.ConstantEpsilonGreedy(
                    epsilon=0.05, random_action_func=env.action_space.sample)
            else:
                explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
                    start_epsilon=explor_rate,
                    end_epsilon=0.05,
                    decay_steps=50000,
                    random_action_func=env.action_space.sample)
        else:
            explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
                start_epsilon=0.5,
                end_epsilon=0.05,
                decay_steps=50000,
                random_action_func=env.action_space.sample)

        #replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
        #replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10 ** 6)
        #replay_buffer = chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(capacity=10 ** 6)
        replay_buffer_q_func = success_buffer_replay.SuccessPrioReplayBuffer(
            capacity=10 ** 6)

        phi = lambda x: x.astype(np.float32, copy=False)
        self.agent = chainerrl.agents.DoubleDQN(
            self.q_func, self.q_func_opt, replay_buffer_q_func, gamma, explorer,
            average_q_decay=0.01, average_loss_decay=0.01,
            update_interval=self.update_interval,
            target_update_interval=self.target_update_interval,
            phi=phi,
            replay_start_size=self.replay_start_size,
            minibatch_size=self.minibatch_size,
            #replay_start_size=5, minibatch_size=3, episodic_update=True, episodic_update_len=64
            )
        
        self.t = 0
        self.last_losses = None
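
The class above only wires up the networks, explorer, and replay buffers; a driver loop is still needed to step the environment. A rough sketch of one training episode, assuming a Gym-style env and an instance model of this class (the names model and run_episode are illustrative, not from the source):

def run_episode(model, env, max_steps=2000):
    obs = env.reset()
    reward, done = 0.0, False
    for _ in range(max_steps):
        # act_and_train() applies the epsilon-greedy explorer, stores the
        # previous transition in the replay buffer and, every
        # update_interval steps, performs a Double-DQN update.
        action = model.agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        if done:
            break
    # Feed in the terminal transition and reset the agent's episode state.
    model.agent.stop_episode_and_train(obs, reward, done)
    return model.agent.get_statistics()
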
Example #2
def func_agent_generation(args, env, load_flag=False):
    gamma = 0.99
    alpha = 0.5

    # q_func = QFunction(env.observation_space.low.size, env.action_space.n)
    q_func = Qfunc_FC_TWN(env.obs_size_list[0], env.obs_size_list[1],
                          env.obs_size_list[2], env.action_space.n)
    optimizer = chainer.optimizers.Adam(eps=1e-2)
    optimizer.setup(q_func)
    if load_flag:
        explorer = chainerrl.explorers.ConstantEpsilonGreedy(
            epsilon=0.05, random_action_func=env.action_space.sample)
    else:
        explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
            start_epsilon=0.5,
            end_epsilon=0.05,
            decay_steps=50000,
            random_action_func=env.action_space.sample)
    #replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
    #replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10 ** 6)
    #replay_buffer = chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(capacity=10 ** 6)
    replay_buffer = success_buffer_replay.SuccessPrioReplayBuffer(
        capacity=10**6)
    phi = lambda x: x.astype(np.float32, copy=False)
    agent = chainerrl.agents.DoubleDQN(
        q_func,
        optimizer,
        replay_buffer,
        gamma,
        explorer,
        average_q_decay=0.01,
        average_loss_decay=0.01,
        update_interval=10,
        target_update_interval=200,
        phi=phi,
        replay_start_size=1500,
        minibatch_size=500,
        #replay_start_size=5, minibatch_size=3, episodic_update=True, episodic_update_len=64
    )

    #if len(args.load) > 0:
    if load_flag:
        #agent.load(args.load)
        agent.load('agent_ddqn')
        #logger.debug('load: {}'.format(args.load) )
        print('load: agent_ddqn')  # report the directory that was actually loaded

    return agent
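
For completeness, a sketch of how a saved agent could be restored and evaluated greedily (assuming the same Gym-style env; the name evaluate_agent is illustrative). With load_flag=True the factory restores the weights saved under 'agent_ddqn'; agent.act() then picks greedy actions without exploration or training updates, and agent.stop_episode() clears the episodic state between runs. The counterpart agent.save('agent_ddqn') is what would produce that directory after training.

def evaluate_agent(args, env, n_episodes=20):
    agent = func_agent_generation(args, env, load_flag=True)
    returns = []
    for _ in range(n_episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action = agent.act(obs)      # greedy action, no training update
            obs, reward, done, _ = env.step(action)
            total += reward
        agent.stop_episode()             # reset the agent's episode state
        returns.append(total)
    return sum(returns) / len(returns)
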
Example #3
    def __init__(self, args, env, load_flag=False, explor_rate=None):
        super().__init__()

        self.n_size_twn_status = env.obs_size_list[0]
        self.num_ray = env.obs_size_list[1]
        self.n_size_eb_status = env.obs_size_list[2]

        self.update_interval = 10
        self.target_update_interval = 200
        self.replay_start_size = 1000
        self.minibatch_size = 512

        self.history_num = 30
        self.history_update_interval = 15
        self.history_append_count = 0
        self.history_data = []

        self.success_rate = 1.0

        gamma = 0.985
        alpha = 0.5

        n_clasfy_ray = 16

        self.cnn_ae_output_elements = n_clasfy_ray
        self.hist_ana_ae_output_elements = (
            self.n_size_twn_status + n_clasfy_ray + self.n_size_eb_status) // 3
        self.rl_layer_input_elements = (
            self.n_size_twn_status + self.n_size_eb_status +
            self.cnn_ae_output_elements + self.hist_ana_ae_output_elements)

        #        self.q_func = Qfunc_FC_TWN2_Vision(env.obs_size_list[0], env.obs_size_list[1], env.obs_size_list[2], env.action_space.n)
        self.cnn_ae = Qfunc_FC_TWN2_Vision(self.num_ray, n_clasfy_ray)
        self.cnn_ae_opt = chainer.optimizers.Adam()
        self.cnn_ae_opt.setup(self.cnn_ae)
        self.replay_buffer_cnn_ae = success_buffer_replay.SuccessPrioReplayBuffer(
            capacity=10**6)
        #        self.replay_buffer_cnn_ae = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
        self.cnn_ae_last_loss = None

        self.hist_ana_ae = Qfunc_FC_TWN2_History(
            self.n_size_twn_status + n_clasfy_ray + self.n_size_eb_status,
            self.history_num, self.hist_ana_ae_output_elements)
        self.hist_ana_ae_opt = chainer.optimizers.Adam()
        self.hist_ana_ae_opt.setup(self.hist_ana_ae)
        self.replay_buffer_hist_ana_ae = chainerrl.replay_buffer.ReplayBuffer(
            capacity=5000)
        self.hist_ana_ae_last_out = None

        self.q_func = Qfunc_FC_TWN_RL(self.rl_layer_input_elements,
                                      env.action_space.n)
        self.q_func_opt = chainer.optimizers.Adam(eps=1e-3)
        self.q_func_opt.setup(self.q_func)
        self.explorer = None
        if load_flag:
            if explor_rate is None:
                self.explorer = chainerrl.explorers.ConstantEpsilonGreedy(
                    epsilon=0.05, random_action_func=env.action_space.sample)
            else:
                self.explorer = SuccessRateEpsilonGreedy.SuccessRateEpsilonGreedy(
                    start_epsilon=explor_rate,
                    end_epsilon=0.0,
                    decay_steps=50000,
                    random_action_func=env.action_space.sample)
        else:
            self.explorer = SuccessRateEpsilonGreedy.SuccessRateEpsilonGreedy(
                start_epsilon=0.5,
                end_epsilon=0.0,
                decay_steps=50000,
                random_action_func=env.action_space.sample)

        #replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
        #replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10 ** 6)
        #replay_buffer = chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(capacity=10 ** 6)
        replay_buffer_q_func = success_buffer_replay.ActionFareSamplingReplayBuffer(
            capacity=10**6)

        phi = lambda x: x.astype(np.float32, copy=False)
        self.agent = chainerrl.agents.DoubleDQN(
            self.q_func,
            self.q_func_opt,
            replay_buffer_q_func,
            gamma,
            self.explorer,
            average_q_decay=0.01,
            average_loss_decay=0.01,
            update_interval=self.update_interval,
            target_update_interval=self.target_update_interval,
            phi=phi,
            replay_start_size=self.replay_start_size,
            minibatch_size=self.minibatch_size,
            #replay_start_size=5, minibatch_size=3, episodic_update=True, episodic_update_len=64
        )

        self.t = 0
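
All three examples construct chainerrl.agents.DoubleDQN rather than plain DQN. The only difference is how the bootstrap target is formed: the online network chooses the next action and the target network evaluates it, which reduces the overestimation bias of the max operator. A minimal NumPy sketch of that target computation (illustrative only, not the library internals verbatim):

import numpy as np

def double_dqn_target(rewards, next_q_online, next_q_target, dones, gamma=0.985):
    """Double-DQN bootstrap target for a batch of transitions.

    next_q_online / next_q_target: arrays of shape (batch, n_actions) holding
    the online and target networks' Q-values at the next states.
    """
    # The online network selects the greedy next action ...
    best_actions = np.argmax(next_q_online, axis=1)
    # ... and the target network evaluates it.
    evaluated = next_q_target[np.arange(len(best_actions)), best_actions]
    # Terminal transitions (dones == 1) contribute only the immediate reward.
    return rewards + gamma * (1.0 - dones) * evaluated
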