Example #1
ac_high = env.action_space.high
num_state = env.observation_space.shape[0]
num_action = env.action_space.shape[0]
# load the pretrained agent
agent = torch.load("LunarLander_agent.pkl")
reward_history = []
noise = OUNoise(env.action_space,theta=0.15, max_sigma=0.2, min_sigma=0.2)

for e in tqdm(range(1,101)):
    state = env.reset()
    done = False
    reward_sum = 0.0
    step = 0
    while not done:
        env.render()
        action = agent.take_action(state)
        action = noise.get_action(action.detach().numpy(),step)
        next_state , reward , done , _ = env.step(action)
        reward_sum += reward
        # reward shaping based on the next observation
        reward = reward - abs(next_state[0]) + 1 / (abs(next_state[1]) + 0.1) - abs(next_state[4])
        agent.store_transition( state , action , reward , next_state , done )
        state = next_state[:]
        step += 1
    reward_history.append(reward_sum)        

env.close()

plt.plot(reward_history)
plt.savefig("test_reward_history")
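
Note: every example on this page calls an OUNoise helper that is not shown. Below is a minimal sketch, assuming the widely used Ornstein-Uhlenbeck action-noise wrapper: the constructor arguments seen here (theta, max_sigma, min_sigma, decay_period) and a get_action(action, t) method that adds mean-reverting noise, decays sigma from max_sigma to min_sigma over decay_period steps, and clips the result to the action space.

import numpy as np

# Minimal sketch of the assumed OUNoise helper (not the authors' exact code).
class OUNoise:
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3,
                 min_sigma=0.3, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        # restart the process at the mean
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        # Ornstein-Uhlenbeck step: mean reversion plus Gaussian noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

    def get_action(self, action, t=0):
        # add the noise, decay sigma with t, and clip to the valid action range
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)

With this interface, noise.reset() restarts the process at the start of each episode, and the step argument drives the sigma decay, which is why Examples #2 and #3 pass i_step.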
Example #2
def train(cfg):
    print('Start to train!\n')
    env = NormalizedActions(gym.make("Pendulum-v0"))

    # add exploration noise to the actions
    ou_noise = OUNoise(env.action_space)

    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states,
                 n_actions,
                 device="cpu",
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action,
                                         i_step)  # the random process from the paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    ''' Save the model '''
    if not os.path.exists(SAVED_MODEL_PATH):  # create the folder if it does not exist
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH + 'checkpoint.pth')
    ''' Save rewards and related results '''
    if not os.path.exists(RESULT_PATH):  # create the folder if it does not exist
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_train.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_train.npy',
            moving_average_rewards)
    np.save(RESULT_PATH + 'steps_train.npy', ep_steps)
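
Note: Example #2 wraps the environment in a NormalizedActions wrapper that is not defined here. A minimal sketch, assuming the common gym.ActionWrapper that rescales actions from [-1, 1] into the environment's action bounds:

import gym
import numpy as np

# Minimal sketch of the assumed NormalizedActions wrapper (not the authors' exact code).
class NormalizedActions(gym.ActionWrapper):
    def action(self, action):
        # map the agent's action from [-1, 1] to [low, high]
        low = self.action_space.low
        high = self.action_space.high
        action = low + (action + 1.0) * 0.5 * (high - low)
        return np.clip(action, low, high)

    def reverse_action(self, action):
        # map an environment action from [low, high] back to [-1, 1]
        low = self.action_space.low
        high = self.action_space.high
        action = 2.0 * (action - low) / (high - low) - 1.0
        return np.clip(action, -1.0, 1.0)

DDPG actors typically end in tanh, so a wrapper like this lets the same agent act in Pendulum-v0's [-2, 2] torque range.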
Example #3
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128)

    rewards = []
    moving_average_rewards = []
    for i_episode in range(1, cfg.max_episodes + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.max_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action,
                                         i_step)  # the random process from the paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward))
        rewards.append(ep_reward)
        # exponential moving average of the episode reward
        if i_episode == 1:
Example #4
    actor_lr = 1e-4
    tau = 1e-3

    agent = ddpg(state_dim, action_dim, gamma, tau, buffer_maxlen, batch_size,
                 critic_lr, actor_lr)
    noise = OUNoise(env.action_space)

    step = 0

    for episode in range(max_episodes):
        state = env.reset()
        total = 0
        done = False
        while True:
            action = agent.act(state)
            action = noise.get_action(action, step)

            next_state, reward, done, _ = env.step(action)
            total += reward

            if next_state[0] > 0.0:
                reward += 10
            reward += 50 * abs(next_state[1])

            transition = state, action, reward, next_state, done

            agent.update(transition)

            # advance the environment state and the noise-decay step counter
            state = next_state
            step += 1

            if done:
                print(f'episode {episode} done. reward: {total}')
                agent.save()
Example #5
class DDPGAgent:
    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate,
                 actor_learning_rate, train, decay):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.train = train  # True to train the agent, False to only run a trained agent
        self.decay = decay

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim,
                                    self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # copy the online network weights into both target networks
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        # use exploration noise only during training
        if self.train:
            self.noise = OUNoise(self.env.action_space,
                                 decay_period=self.decay)

    def get_action(self, obs, t=0):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        # add exploration noise only during training
        if self.train:
            action = self.noise.get_action(action, t=t)

        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(
            batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch,
                                            next_actions.detach())
        expected_Q = reward_batch + self.gamma * next_Q

        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update actor
        policy_loss = -self.critic.forward(
            state_batch, self.actor.forward(state_batch)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
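
Note: Example #5 stores transitions in a BasicBuffer replay memory that is not shown. A minimal sketch, assuming a plain deque-backed uniform replay buffer whose sample() returns stacked arrays of states, actions, rewards, next states, and done flags:

import random
from collections import deque

import numpy as np

# Minimal sketch of the assumed BasicBuffer replay memory (not the authors' exact code).
class BasicBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        # store one transition
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # uniform random minibatch, returned as stacked arrays
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.array(states), np.array(actions),
                np.array(rewards, dtype=np.float32),
                np.array(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)

agent.memory.push(...) in Examples #2 and #3 and self.replay_buffer.sample(batch_size) in Example #5 both match this interface.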