Example #1
import gym
import numpy as np
import torch

# OUNoise and the torch `device` are assumed to be defined elsewhere in this repository.

def test_random_action():
    env = gym.make('gym_kinova_gripper:kinovagripper-v0')
    obs, done = env.reset(), False
    noise = OUNoise(3)
    max_action = float(env.action_space.high[0])
    correct = 0
    noise.reset()
    cum_reward = 0.0
    for i in range(100):
        finger_actions = noise.noise().clip(-max_action, max_action)
        # actions = np.array([0.0, finger_actions[0], finger_actions[1], finger_actions[2]])
        actions = np.array([0.4, 0.5, 0.5, 0.5])
        obs, reward, done, _ = env.step(actions)
        inputs = torch.FloatTensor(np.array(obs)).to(device)
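
All five examples on this page construct an OUNoise object without showing its definition. As a point of reference, here is a minimal sketch of the interface they appear to assume (a constructor taking the action dimension plus optional mu/theta/sigma keywords, a mutable scale attribute, reset(), and noise() returning a NumPy array); it mirrors the common Ornstein-Uhlenbeck implementation found in PyTorch DDPG repositories, and the default values are illustrative rather than taken from any of these examples:

import numpy as np

class OUNoise:
    """Temporally correlated Ornstein-Uhlenbeck exploration noise."""

    def __init__(self, action_dim, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.scale = scale      # overall magnitude; Examples #3 and #5 anneal this per episode
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process from its mean at the beginning of an episode
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the scaled state is returned as a NumPy array
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state * self.scale
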
Example #2
import numpy as np
import torch
import torch.nn.functional as F
from torch.optim import Adam

# Actor, Critic, OUNoise, and the torch `device` are assumed to be defined elsewhere in this repository.

class DDPGAgent:
    def __init__(self, config, state_size, action_size):
        super(DDPGAgent, self).__init__()
        l1 = config['network']['hidden']
        l2 = int(config['network']['hidden'] / 2)
        self.actor = Actor(state_size, action_size, config['seed']['agent'],
                           l1, l2).to(device)
        self.critic = Critic(state_size, action_size, config['seed']['agent'],
                             l1, l2).to(device)
        self.target_actor = Actor(state_size, action_size,
                                  config['seed']['agent'], l1, l2).to(device)
        self.target_critic = Critic(state_size, action_size,
                                    config['seed']['agent'], l1, l2).to(device)

        self.noise = OUNoise(action_size,
                             mu=config['noise']['mu'],
                             sigma=config['noise']['sigma'],
                             theta=config['noise']['theta'])

        # initialize targets same as original networks
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=config['LR_ACTOR'])
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=config['LR_CRITIC'])

    def resetNoise(self):
        self.noise.reset()

    def act(self, obs, noise=0.0):
        # add scaled OU noise to the deterministic policy output, then clip to the action bounds
        action = self.actor(obs) + noise * self.noise.noise()
        action = np.clip(action.detach().cpu().numpy(), -1, 1)
        return action

    def target_act(self, obs, noise=0.0):
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action

    def learn(self, experiences, gamma, tau):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            tau (float): soft-update interpolation parameter
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.target_actor(next_states)
        Q_targets_next = self.target_critic(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)
        cl = critic_loss.cpu().detach().item()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #from https://github.com/hortovanyi/DRLND-Continuous-Control/blob/master/ddpg_agent.py
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()
        al = actor_loss.cpu().detach().item()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic, self.target_critic, tau)
        self.soft_update(self.actor, self.target_actor, tau)

        return [al, cl]

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    # https://github.com/ikostrikov/pytorch-ddpg-naf/blob/master/ddpg.py#L15
    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
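
A minimal usage sketch for this agent, assuming the Actor, Critic, OUNoise, and device referenced above are importable; the configuration keys mirror what the constructor reads, while the concrete values and sizes are purely illustrative:

import torch

# Hypothetical configuration; key names match what DDPGAgent.__init__ reads.
config = {
    'network': {'hidden': 256},
    'seed': {'agent': 0},
    'noise': {'mu': 0.0, 'sigma': 0.2, 'theta': 0.15},
    'LR_ACTOR': 1e-4,
    'LR_CRITIC': 1e-3,
}

agent = DDPGAgent(config, state_size=33, action_size=4)
agent.resetNoise()

obs = torch.zeros(33)               # placeholder observation (assuming a CPU device here)
action = agent.act(obs, noise=1.0)  # deterministic action plus scaled OU noise, clipped to [-1, 1]
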
Example #3
import os
import random
from datetime import datetime

import numpy as np
import torch

# Actor, Critic, and OUNoise are assumed to be defined elsewhere in this repository.

class ddpg_agent:
    def __init__(self, args, env):
        self.args = args
        self.env = env
        # get the number of inputs...
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        self.action_scale = self.env.action_space.high[0]
        # build up the network
        self.actor_net = Actor(num_inputs, num_actions)
        self.critic_net = Critic(num_inputs, num_actions)
        # get the target network...
        self.actor_target_net = Actor(num_inputs, num_actions)
        self.critic_target_net = Critic(num_inputs, num_actions)
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()
        # copy the parameters..
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())
        # setup the optimizer...
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                lr=self.args.actor_lr)
        self.optimizer_critic = torch.optim.Adam(
            self.critic_net.parameters(),
            lr=self.args.critic_lr,
            weight_decay=self.args.critic_l2_reg)
        # setting up the noise
        self.ou_noise = OUNoise(num_actions)
        # make sure the save directories exist
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        self.model_path = self.args.save_dir + self.args.env_name + '/'
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    # start to train the network..
    def learn(self):
        # init the replay buffer
        replay_buffer = []
        total_timesteps = 0
        running_reward = None
        for episode_idx in range(self.args.max_episode):
            state = self.env.reset()
            # get the scale of the ou noise...
            self.ou_noise.scale = (self.args.noise_scale - self.args.final_noise_scale) * max(0, self.args.exploration_length - episode_idx) / \
                                self.args.exploration_length + self.args.final_noise_scale
            self.ou_noise.reset()
            # start the training
            reward_total = 0
            while True:
                state_tensor = torch.tensor(state,
                                            dtype=torch.float32).unsqueeze(0)
                if self.args.cuda:
                    state_tensor = state_tensor.cuda()
                with torch.no_grad():
                    policy = self.actor_net(state_tensor)
                # start to select the actions...
                actions = self._select_actions(policy)
                # step
                state_, reward, done, _ = self.env.step(actions *
                                                        self.action_scale)
                total_timesteps += 1
                reward_total += reward
                # start to store the samples...
                replay_buffer.append((state, reward, actions, done, state_))
                # drop the oldest sample if the buffer is over capacity
                if len(replay_buffer) > self.args.replay_size:
                    replay_buffer.pop(0)
                if len(replay_buffer) > self.args.batch_size:
                    mini_batch = random.sample(replay_buffer,
                                               self.args.batch_size)
                    # start to update the network
                    _, _ = self._update_network(mini_batch)
                if done:
                    break
                state = state_
            running_reward = reward_total if running_reward is None else running_reward * 0.99 + reward_total * 0.01
            if episode_idx % self.args.display_interval == 0:
                torch.save(self.actor_net.state_dict(),
                           self.model_path + 'model.pt')
                print('[{}] Episode: {}, Frames: {}, Rewards: {}'.format(
                    datetime.now(), episode_idx, total_timesteps,
                    running_reward))

        self.env.close()

    # select actions
    def _select_actions(self, policy):
        actions = policy.detach().cpu().numpy()[0]
        actions = actions + self.ou_noise.noise()
        actions = np.clip(actions, -1, 1)
        return actions

    # update the network
    def _update_network(self, mini_batch):
        state_batch = np.array([element[0] for element in mini_batch])
        state_batch = torch.tensor(state_batch, dtype=torch.float32)
        # reward batch
        reward_batch = np.array([element[1] for element in mini_batch])
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float32).unsqueeze(1)
        # done batch
        done_batch = np.array([int(element[3]) for element in mini_batch])
        done_batch = 1 - done_batch
        done_batch = torch.tensor(done_batch, dtype=torch.float32).unsqueeze(1)
        # action batch
        actions_batch = np.array([element[2] for element in mini_batch])
        actions_batch = torch.tensor(actions_batch, dtype=torch.float32)
        # next state batch
        state_next_batch = np.array([element[4] for element in mini_batch])
        state_next_batch = torch.tensor(state_next_batch, dtype=torch.float32)
        # move the batch to the GPU if enabled
        if self.args.cuda:
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            done_batch = done_batch.cuda()
            actions_batch = actions_batch.cuda()
            state_next_batch = state_next_batch.cuda()

        # update the critic network...
        with torch.no_grad():
            actions_out = self.actor_target_net(state_next_batch)
            expected_q_value = self.critic_target_net(state_next_batch,
                                                      actions_out)
        # get the target value
        target_value = reward_batch + self.args.gamma * expected_q_value * done_batch
        target_value = target_value.detach()
        values = self.critic_net(state_batch, actions_batch)
        critic_loss = (target_value - values).pow(2).mean()
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        self.optimizer_critic.step()
        # start to update the actor network
        actor_loss = -self.critic_net(state_batch,
                                      self.actor_net(state_batch)).mean()
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()
        # then soft-update the target networks...
        self._soft_update_target_network(self.critic_target_net,
                                         self.critic_net)
        self._soft_update_target_network(self.actor_target_net, self.actor_net)

        return actor_loss.item(), critic_loss.item()

    # soft update the network
    def _soft_update_target_network(self, target, source):
        # blend the source parameters into the target network
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(self.args.tau * param.data +
                                    (1 - self.args.tau) * target_param.data)

    # functions to test the network
    def test_network(self):
        model_path = self.args.save_dir + self.args.env_name + '/model.pt'
        self.actor_net.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        self.actor_net.eval()
        # start to test
        for _ in range(5):
            state = self.env.reset()
            reward_sum = 0
            while True:
                self.env.render()
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    actions = self.actor_net(state)
                actions = actions.detach().numpy()[0]
                state_, reward, done, _ = self.env.step(self.action_scale *
                                                        actions)
                reward_sum += reward
                if done:
                    break
                state = state_
            print('The reward of this episode is {}.'.format(reward_sum))
        self.env.close()
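
Both this example and Example #5 below anneal the OU noise scale linearly from noise_scale down to final_noise_scale over the first exploration_length (here) or exploration_end (Example #5) episodes. Pulled out as a standalone helper, the schedule used inside those loops is equivalent to:

def annealed_noise_scale(episode_idx, noise_scale, final_noise_scale, exploration_length):
    # Linear decay from noise_scale to final_noise_scale over exploration_length episodes,
    # then held constant at final_noise_scale.
    frac = max(0, exploration_length - episode_idx) / exploration_length
    return (noise_scale - final_noise_scale) * frac + final_noise_scale
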
Example #4
        # Select action from the pretrained network during warm-up, otherwise from the policy with exploration noise
        if t < args.start_timesteps:
            # action = env.action_space.sample()
            # action = noise.noise().clip(-max_action, max_action)
            obs = torch.FloatTensor(np.array(state).reshape(1, -1)).to(device)
            action = pretrained_network(obs).cpu().data.numpy().flatten()

        else:
            # action = (
            # 	policy.select_action(np.array(state))
            # 	+ np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            # ).clip(-max_action, max_action)
            # print("action", action)
            action = (policy.select_action(np.array(state)) +
                      expl_noise.noise()).clip(-max_action, max_action)
            # print("training action", action)
        # Perform action
        next_state, reward, done, _ = env.step(action)
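        # A time-limit truncation is not a true terminal state, so done_bool is forced to 0
        # in that case and the critic can keep bootstrapping from next_state.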
        done_bool = float(
            done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        # print(action)
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # writer.add_scalar("Episode_reward", episode_reward, t)
        # print("run...")
Example #5
for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()]).to(device)

    if args.ou_noise:
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end -
            i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()

    if args.param_noise and args.algo == "DDPG":
        agent.perturb_actor_parameters(param_noise)

    episode_reward = 0
    while True:
        action_noise = torch.Tensor(ounoise.noise())[0]
        action = agent.select_action(state, action_noise, param_noise)
        next_state, reward, done, _ = env.step(
            np.append(args.heading_speed,
                      action.numpy()[0]))
        # TODO: the heading command could also use some noise
        total_numsteps += 1
        episode_reward += reward

        action = torch.Tensor(action).to(device)
        mask = torch.Tensor([not done]).to(device)
        next_state = torch.Tensor([next_state]).to(device)
        reward = torch.Tensor([reward]).to(device)

        memory.push(state, action, mask, next_state, reward)
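
The mask pushed into memory here plays the same role as (1 - dones) in Example #2 and done_batch in Example #3: it zeroes the bootstrap term for terminal transitions when the TD target is later computed. As a standalone sketch, with a hypothetical td_target helper:

def td_target(reward, mask, next_q, gamma=0.99):
    # mask is 1.0 for non-terminal transitions and 0.0 at episode termination,
    # so no value is bootstrapped past the end of an episode.
    return reward + gamma * mask * next_q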