Example #1
0
    # DDPG setup: main/target pairs for both actor and critic.
    # NOTE(review): truncated snippet — the enclosing function's signature
    # (observation_dim, action_dim, actor_lr, critic_lr, device, env, ...) is
    # not visible here; do not assume anything beyond what is used below.
    actor_main = DDPGActor(observation_dim, action_dim, actor_lr, device)
    actor_target = DDPGActor(observation_dim, action_dim, actor_lr, device)
    critic_main = DDPGCritic(observation_dim, action_dim, critic_lr, device)
    critic_target = DDPGCritic(observation_dim, action_dim, critic_lr, device)

    # Copy main-network weights into the corresponding target networks.
    target_initialize(actor_main, actor_target)
    target_initialize(critic_main, critic_target)

    iter_i = 0          # global iteration counter (compared against max_iteration)
    epi_i = 0           # episode counter
    save_flag = False   # presumably gates checkpoint saving — set elsewhere; verify

    while iter_i < max_iteration:

        # Fresh stateful OU exploration noise per episode.
        # NOTE(review): sigma is passed as-is here, while the sibling snippet
        # below uses sigma * np.ones([action_dim]) — confirm sigma's shape.
        noise = Noise.OrnsteinUhlenbeckActionNoise(mu=np.zeros([action_dim]),
                                                   sigma=sigma)

        noise.reset()
        timestep = env.reset()
        ep_reward = 0.0
        # No previous action at the start of an episode.
        prev_action = np.zeros([action_dim])

        # dm_env-style TimeStep: (step_type, reward, discount, observation)
        _, _, _, s = timestep

        # Flatten the raw observation into a 1-D array.
        s = utils.state_1d_flat(s)

        # Network input is the state augmented with the previous action.
        s_a = np.append(s, prev_action)
        s_a = torch.FloatTensor(s_a).to(device)

        step_i = 0  # step counter within the current episode
Example #2
0
    # DDPG setup: main/target pairs for both actor and critic.
    # Here the networks take the concatenated state+action dimension
    # (state_action_dim) as input size, matching the s_a input built below.
    # NOTE(review): truncated snippet — the enclosing function's signature is
    # not visible here.
    actor_main = DDPGActor(state_action_dim, action_dim, actor_lr, device)
    actor_target = DDPGActor(state_action_dim, action_dim, actor_lr, device)
    critic_main = DDPGCritic(state_action_dim, action_dim, critic_lr, device)
    critic_target = DDPGCritic(state_action_dim, action_dim, critic_lr, device)

    # Copy main-network weights into the corresponding target networks.
    target_initialize(actor_main, actor_target)
    target_initialize(critic_main, critic_target)

    # start training agent
    for epi_i in range(1, max_episode + 1):

        # Resample the exploration-noise scale for each episode.
        sigma = np.random.uniform(sigma_min, sigma_max)

        # NOTE(review): assert is stripped under `python -O`; an explicit
        # raise would be more robust, but is left unchanged here.
        assert noise_type in ["ou", "gaussian"]
        if noise_type == "ou":
            # OU noise wants a per-dimension sigma vector.
            noise = Noise.OrnsteinUhlenbeckActionNoise(
                mu=np.zeros([action_dim]), sigma=sigma * np.ones([action_dim]))
        else:
            noise = Noise.GaussianNoise(action_dim=action_dim, sigma=sigma)

        noise.reset()
        timestep = env.reset()
        ep_reward = 0.0
        # No previous action at the start of an episode.
        prev_action = np.zeros([action_dim])

        # dm_env-style TimeStep: (step_type, reward, discount, observation)
        _, _, _, s = timestep
        # Flatten the raw observation into a 1-D array.
        s = utils.state_1d_flat(s)

        # Network input is the state augmented with the previous action.
        s_a = np.append(s, prev_action)
        s_a = torch.FloatTensor(s_a).to(device)
    # NOTE(review): truncated snippet — the matching actor_main/actor_target
    # construction is above this view, and the body is cut off after the
    # np.append below. Variant where one "control" bundles several actions
    # (actions_per_control), with critics sized on state_control_dim/control_dim.
    critic_main = DDPGCritic(state_control_dim, control_dim, critic_lr, device)
    critic_target = DDPGCritic(state_control_dim, control_dim, critic_lr,
                               device)

    # Copy main-network weights into the corresponding target networks.
    target_initialize(actor_main, actor_target)
    target_initialize(critic_main, critic_target)

    # start training agent
    for epi_i in range(1, max_episode + 1):

        # Resample the exploration-noise scale for each episode.
        sigma = np.random.uniform(sigma_min, sigma_max)

        # NOTE(review): assert is stripped under `python -O`; an explicit
        # raise would be more robust, but is left unchanged here.
        assert noise_type in ["ou", "gaussian"]
        if noise_type == "ou":
            noise = Noise.OrnsteinUhlenbeckActionNoise(
                mu=np.zeros([action_dim]),
                sigma=sigma,
                actions_per_control=actions_per_control)
            # this noise is only for single action, for a control you need to repeat sampling
        else:
            # Gaussian noise is drawn over the full control dimension at once.
            noise = Noise.GaussianNoise(action_dim=control_dim, sigma=sigma)

        noise.reset()
        timestep = env.reset()
        ep_reward = 0.0
        # Previous control: one action row per step of the control bundle.
        prev_action = np.zeros([actions_per_control, action_dim])

        # dm_env-style TimeStep: (step_type, reward, discount, observation)
        _, _, _, s = timestep
        # Flatten the raw observation into a 1-D array.
        s = utils.state_1d_flat(s)

        # Network input: state concatenated with the flattened previous control.
        s_a = np.append(s, prev_action.reshape([-1]))