Example #1
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np

# ReplayBuffer and the agent object are assumed to be defined elsewhere in the project.


def train_agent(path,
                env,
                agent,
                seed=0,
                num_episodes=100,
                num_steps=100,
                batch_size=128,
                replay_buffer_size=1000000):

    if not os.path.isdir(path):
        os.makedirs(path)
    os.chdir(path)

    env.seed(seed)
    random.seed(seed)

    with open('first_policy.pickle', 'wb') as f:
        pickle.dump(agent.policy_net, f)

    replay_buffer = ReplayBuffer(replay_buffer_size)

    rewards = []
    max_angle = []
    ave_angle = []
    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        max_th = 0
        ave_th = 0
        for step in range(num_steps):
            # np.array([0.0]) makes sure the action is passed to env.step as a float ndarray
            action = agent.policy_net.get_action(state) + np.array([0.0])
            next_state, reward, done, _ = env.step(action)
            replay_buffer.push(state, action, reward, next_state, done)

            if len(replay_buffer) > batch_size:
                agent.train_step(replay_buffer=replay_buffer,
                                 batch_size=batch_size)

            state = next_state
            episode_reward += reward
            # recover the signed angle from the (cos, sin) components of the observation
            th = np.arccos(state[0]) * np.sign(state[1])
            max_th = max(max_th, abs(th))
            ave_th += abs(th)

        rewards.append(episode_reward)
        max_angle.append(max_th)
        ave_angle.append(ave_th / num_steps)

    with open('last_policy.pickle', 'wb') as f:
        pickle.dump(agent.policy_net, f)
    with open('rewards.pickle', 'wb') as f:
        pickle.dump(rewards, f)
    with open('max_angle.pickle', 'wb') as f:
        pickle.dump(max_angle, f)
    with open('ave_angle.pickle', 'wb') as f:
        pickle.dump(ave_angle, f)

    plt.figure(figsize=(10, 6))
    plt.plot(rewards)
    plt.title('Reward vs Episode')
    plt.savefig('rewards.png', dpi=100)
    plt.close()
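
The loop above only assumes that ReplayBuffer exposes push, sample and __len__. The original class is not shown, so the following is a minimal sketch of that interface; the deque-based storage and the five-field transition layout are assumptions taken from the calls in Examples #1 and #3.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Minimal FIFO transition buffer matching the push/sample/len usage above."""

    def __init__(self, capacity):
        # oldest transitions are discarded once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)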
Example #2
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam

# mlp, ReplayBuffer and get_batch are project helpers defined elsewhere.


class DQNAgent(nn.Module):
    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 hidden_sizes: list = [128, 128],
                 activation=nn.ReLU,
                 buffer_size: int = 1000000,
                 batch_size: int = 32,
                 lr: float = 1e-4,
                 gamma: float = 0.95,
                 theta: float = 0.05):
        super(DQNAgent, self).__init__()
        self.q_net = mlp([state_dim] + hidden_sizes + [action_dim],
                         activation=activation)
        self.target_net = mlp([state_dim] + hidden_sizes + [action_dim],
                              activation=activation)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        self.optimizer = Adam(self.q_net.parameters(), lr=lr)
        self.gamma = gamma
        self.theta = theta

    def forward(self, x):
        return self.q_net(x)

    def save_memory(self, ex):
        self.buffer.push(ex)

    def train(self, k=4, max_norm=5.):
        losses = []
        for _ in range(k):
            experiences = self.buffer.sample(self.batch_size)
            s, a, r, t, mask = get_batch(experiences)
            next_q = self.target_net(t).max(-1, keepdim=True)[0]
            target = r + self.gamma * mask * next_q.detach()
            pred = self.q_net(s).gather(-1, a)
            loss = F.mse_loss(pred, target)
            self.optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(self.q_net.parameters(), max_norm)
            self.optimizer.step()
            losses.append(loss.item())
        self.target_update()
        return np.mean(losses)

    def train_start(self):
        return (len(self.buffer) >= self.batch_size)

    def target_update(self):
        for target, param in zip(self.target_net.parameters(),
                                 self.q_net.parameters()):
            target.data = (1 -
                           self.theta) * target.data + self.theta * param.data


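DQNAgent relies on an mlp builder and a get_batch helper that are not part of the example. Below is a minimal sketch of what they could look like for the discrete-action case, assuming mlp returns an nn.Sequential and get_batch stacks transitions into (state, action, reward, next_state, mask) tensors; the exact names, shapes and dtypes are assumptions.

import numpy as np
import torch
import torch.nn as nn


def mlp(sizes, activation=nn.ReLU, output_activation=nn.Identity):
    # fully connected stack: Linear -> activation for each pair of sizes,
    # with output_activation after the last layer
    layers = []
    for i in range(len(sizes) - 1):
        act = activation if i < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[i], sizes[i + 1]), act()]
    return nn.Sequential(*layers)


def get_batch(experiences):
    # experiences: iterable of (state, action, reward, next_state, done) tuples
    s, a, r, t, done = zip(*experiences)
    s = torch.as_tensor(np.array(s), dtype=torch.float32)
    a = torch.as_tensor(np.array(a), dtype=torch.int64).unsqueeze(-1)  # index tensor for gather
    r = torch.as_tensor(np.array(r), dtype=torch.float32).unsqueeze(-1)
    t = torch.as_tensor(np.array(t), dtype=torch.float32)
    mask = 1.0 - torch.as_tensor(np.array(done), dtype=torch.float32).unsqueeze(-1)  # 0 at terminal states
    return s, a, r, t, mask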
Example #3
num_frames = 100000
batch_size = 32
gamma = 0.99

losses = []
all_rewards = []
episode_reward = 0

# current_model = torch.load("data/rainbow.pt")

state = env.reset()
for frame_idx in range(1, num_frames + 1):
    action = current_model.act(state)

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        losses.append(loss.item())

    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
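
The loop calls compute_td_loss, which is defined outside the snippet. A minimal one-step TD loss over a uniform replay sample is sketched below; it assumes current_model is a Q-network over discrete actions and that optimizer, replay_buffer and gamma live in the enclosing scope, all of which are assumptions (a Rainbow model would use a distributional loss instead).

import numpy as np
import torch


def compute_td_loss(batch_size):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.as_tensor(np.float32(state))
    next_state = torch.as_tensor(np.float32(next_state))
    action = torch.as_tensor(action, dtype=torch.int64)
    reward = torch.as_tensor(reward, dtype=torch.float32)
    done = torch.as_tensor(done, dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken
    q_value = current_model(state).gather(1, action.unsqueeze(1)).squeeze(1)
    # bootstrap target from the greedy action in the next state
    next_q_value = current_model(next_state).max(1)[0]
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value.detach()).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss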
Example #4
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam

# Actor, Critic, ReplayBuffer and get_batch are project helpers defined elsewhere.


class DDPGAgent(nn.Module):
    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 action_min: float,
                 action_max: float,
                 q_hidden_sizes: list = [128, 128],
                 p_hidden_sizes: list = [128, 128],
                 activation=nn.ReLU,
                 buffer_size: int = 1000000,
                 batch_size: int = 32,
                 q_lr: float = 1e-4,
                 p_lr: float = 1e-3,
                 gamma: float = 0.95,
                 theta: float = 0.01,
                 eps: float = 0.3):
        super(DDPGAgent, self).__init__()

        # Actor and frozen (target) actor.
        # The action is bounded to [action_min, action_max] via a tanh rescaling.
        loc = (action_min + action_max) / 2
        scale = (action_max - action_min) / 2
        self.policy = Actor(loc,
                            scale,
                            state_dim,
                            action_dim,
                            p_hidden_sizes,
                            activation=activation,
                            bn=True)
        self.policy_target = Actor(loc,
                                   scale,
                                   state_dim,
                                   action_dim,
                                   p_hidden_sizes,
                                   activation=activation,
                                   bn=True)
        self.policy_target.load_state_dict(self.policy.state_dict())

        # Critic, frozen critic
        self.q_net = Critic(state_dim, action_dim, q_hidden_sizes, activation)
        self.q_target = Critic(state_dim, action_dim, q_hidden_sizes,
                               activation)
        self.q_target.load_state_dict(self.q_net.state_dict())

        # Replay buffer
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size

        # Learner
        self.q_optimizer = Adam(self.q_net.parameters(), lr=q_lr)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=p_lr)
        self.gamma = gamma

        # Polyak averaging parameter
        self.theta = theta

        # Exploration coefficient
        self.eps = eps

        # Batchnorm: keep the actor in eval mode except during policy updates
        self.policy.eval()
        self.policy_target.eval()

    # Get action: returns the deterministic action and an exploration-noised copy
    def forward(self, x, step):
        x = self.policy(x)
        x_exp = x.clone() + self.eps * torch.randn(x.shape)
        return x, x_exp

    def save_memory(self, ex):
        self.buffer.push(ex)

    def train(self, k=1, q_max_norm=5., policy_max_norm=5.):
        q_losses = []

        # q update
        # To stabilize learning, the critic is updated several times
        # per training step and its gradients are clipped.
        for _ in range(k):
            experiences = self.buffer.sample(self.batch_size)
            s, a, r, t, mask = get_batch(experiences)
            mu_t = self.policy_target(t)
            next_q = self.q_target(t, mu_t)
            target = r + self.gamma * mask * next_q.detach()
            pred = self.q_net(s, a)
            q_loss = F.mse_loss(pred, target)
            self.q_optimizer.zero_grad()
            q_loss.backward()
            clip_grad_norm_(self.q_net.parameters(), q_max_norm)
            self.q_optimizer.step()
            q_losses.append(q_loss.item())

        # policy update
        # To stabilize learning, batchnorm and gradient clipping are used.
        self.policy.train()
        mu = self.policy(s)
        policy_loss = torch.mean(-self.q_net(s, mu))
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        clip_grad_norm_(self.policy.parameters(), policy_max_norm)
        self.policy_optimizer.step()

        # Polyak averaging
        self.target_update()

        # put batchnorm layers back into eval mode for action selection
        self.policy.eval()
        return np.mean(q_losses), policy_loss.item()

    def train_start(self):
        return (len(self.buffer) >= self.batch_size)

    def target_update(self):
        for target, param in zip(self.q_target.parameters(),
                                 self.q_net.parameters()):
            target.data = (1 -
                           self.theta) * target.data + self.theta * param.data
        for target, param in zip(self.policy_target.parameters(),
                                 self.policy.parameters()):
            target.data = (1 -
                           self.theta) * target.data + self.theta * param.data
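
A minimal interaction loop matching the DDPGAgent interface above. The environment name, the episode and step counts, and the exact tuple layout passed to save_memory are assumptions; the noisy action returned by forward is used for exploration.

import gym
import torch

env = gym.make('Pendulum-v0')  # any continuous-control task; the name is an assumption
agent = DDPGAgent(state_dim=env.observation_space.shape[0],
                  action_dim=env.action_space.shape[0],
                  action_min=float(env.action_space.low[0]),
                  action_max=float(env.action_space.high[0]))

for episode in range(100):
    state = env.reset()
    for step in range(200):
        with torch.no_grad():
            s = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            action, action_exp = agent(s, step)  # deterministic and noisy action
        a = action_exp.squeeze(0).numpy()
        next_state, reward, done, _ = env.step(a)
        # store the transition; the mask is 0 at terminal states
        agent.save_memory((state, a, reward, next_state, 1.0 - float(done)))
        state = next_state
        if agent.train_start():
            q_loss, p_loss = agent.train()
        if done:
            break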