class Actor:

    def __init__(self,
        device,
        key,
        state_size, action_size, random_seed,
        memory, noise,
        lr, weight_decay,
        checkpoint_folder = './Saved_Model/'):

        self.DEVICE = device

        self.KEY = key

        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed

        # Hyperparameters
        self.LR = lr
        self.WEIGHT_DECAY = weight_decay

        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

        self.checkpoint_full_name = self.CHECKPOINT_FOLDER + 'checkpoint_actor_' + str(self.KEY) + '.pth'
        if os.path.isfile(self.checkpoint_full_name):
            # Resume from an existing checkpoint, mapping the weights onto the configured device.
            self.local.load_state_dict(torch.load(self.checkpoint_full_name, map_location=self.DEVICE))
            self.target.load_state_dict(torch.load(self.checkpoint_full_name, map_location=self.DEVICE))

        # Replay memory
        self.memory = memory

        # Noise process
        self.noise = noise

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.DEVICE)

        self.local.eval()
        with torch.no_grad():
            action = self.local(state).cpu().data.numpy()
        self.local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def reset(self):
        self.noise.reset()

    def checkpoint(self):
        torch.save(self.local.state_dict(), self.checkpoint_full_name)
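
The `noise` and `memory` objects passed to this Actor are only required to expose sample()/reset() and add() respectively. As a point of reference, a minimal Ornstein-Uhlenbeck noise process matching that interface might look like the sketch below; the class name and the mu/theta/sigma defaults are illustrative, not taken from this repository.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Restart the internal state at the long-run mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx
        return self.state
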
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, args):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Q-Network
        self.actor_network = ActorNetwork(state_size, action_size,
                                          args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        # Training takes too long to rerun (> 24 hours on the author's machine),
        # so load model weights saved from a previous run for this agent.
        weights_path = args['agent_p0_path'] if not agent_id else args['agent_p1_path']
        self.actor_network.load_state_dict(
            torch.load(weights_path, map_location=self.device), strict=False)
        self.actor_target.load_state_dict(
            torch.load(weights_path, map_location=self.device), strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):
        """Returns the action for the given state as per the current policy, with exploration noise added."""
        input_state = torch.from_numpy(current_state).float().to(self.device)

        self.actor_network.eval()
        with torch.no_grad():
            action = self.actor_network(input_state).cpu().data.numpy()
        self.actor_network.train()

        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def train(self, experiences):
        """Update the actor and the shared critic from a batch of experiences.

        `mCritic`, `GAMMA` and `TAU` are expected to be defined at module level.
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
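
The ReplayBuffer used by the Agent above is defined elsewhere in the repository; a minimal sketch compatible with how it is called here (the (action_size, buffer_size, batch_size, seed) constructor, add(), sample() returning a tuple of batched tensors, and len()) is given below. Uniform random sampling and CPU tensors are assumptions, not necessarily what the original code uses.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        # Append a single transition.
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Draw a uniform random batch and stack each field into a float tensor.
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)
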
Example #3
class Actor(object):
    def __init__(self, opt, actor_id, q_trace, learner):
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed + actor_id)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.n_episodes = 0
        self.n_steps = 0
        self.gamma = opt.gamma

        # Per-actor exploration rate: epsilon_i = 0.4 ** (1 + 7 * i / (n_actors - 1))
        self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
            if opt.n_actors > 1 else 0.4
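        # With opt.n_actors = 8, for example, this schedule gives roughly 0.4 for
        # actor_id = 0 and 0.4 ** 8 (about 0.00066) for actor_id = 7, so low-numbered
        # actors explore heavily while high-numbered actors act almost greedily
        # (the value 8 is only an illustration of the formula).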

        # Models
        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)

    def performing(self):
        torch.manual_seed(self.opt.seed)

        while True:
            self.load_model()
            self.train_episode()
            if self.n_episodes % 100 == 0:
                rewards = self.evaluation(self.env)
                rewards_mu = np.array(
                    [np.sum(np.array(l_i), 0) for l_i in rewards]).mean()
                print("Episode %d, Average Reward %.2f" %
                      (self.n_episodes, rewards_mu))

    def _softmax_action(self, state):
        state = torch.FloatTensor([state]).to(self.device)
        softmax_action = torch.exp(self.actor(state))  # exponentiate the log-probabilities to get action probabilities
        softmax_action = softmax_action.cpu().detach().numpy()
        return softmax_action

    def exploration_action(self, state):
        softmax_action = self._softmax_action(state)

        if np.random.rand() > self.eps_greedy:
            return np.argmax(softmax_action)
        else:
            return np.random.choice(self.n_act)

    def train_episode(self):
        done = False
        state = self.env.reset()
        self.env_state = state
        self.next_done = done

        while not done:
            self.n_steps += 1
            states = np.zeros((self.opt.n_step, self.n_state))
            actions = np.zeros(self.opt.n_step)
            rewards = np.zeros(self.opt.n_step)
            log_probs = np.zeros((self.opt.n_step, self.n_act))
            dones = np.ones(self.opt.n_step)
            for i in range(self.opt.n_step):
                states[i] = self.env_state
                dones[i] = self.next_done
                # Use the current environment state (not the initial reset state)
                # for both the policy output and the exploration action.
                log_prob = self.actor(
                    torch.FloatTensor([self.env_state]).to(
                        self.device)).detach().cpu().numpy()[0]
                action = self.exploration_action(self.env_state)
                next_state, reward, done, info = self.env.step(action)

                # Reward shaping: discard the per-step env reward and give +1 at
                # termination if the episode lasted long enough, otherwise -1.
                reward = 0
                if done:
                    if self.n_steps > 190:
                        reward = 1
                    else:
                        reward = -1

                log_probs[i] = log_prob
                actions[i] = action
                rewards[i] = reward
                self.env_state = next_state
                self.next_done = done
                if done:
                    self.env_state = self.env.reset()
                    break

            # the n_step rollout is finished
            if done:
                self.n_steps = 0
                self.n_episodes += 1
                self.episode_done = True
            else:
                self.episode_done = False

            self.q_trace.put((states, actions, rewards, dones, log_probs),
                             block=True)

    # choose an action based on state for execution
    def action(self, state):
        softmax_action = self._softmax_action(state)
        action = np.argmax(softmax_action)
        return action

    def value(self, state):  # output the critic's value estimate
        state_var = torch.FloatTensor([state]).to(self.device)
        q_var = self.critic(state_var)  # value estimate from the critic
        q = q_var.cpu().detach().numpy()
        return q

    def _discount_reward(self, rewards, final_value):
        discounted_r = np.zeros_like(rewards)
        R = final_value  # Q(s_t, a_t)
        for t in reversed(range(0, len(rewards))):
            R = rewards[t] + self.gamma * R
            discounted_r[t] = R
        return discounted_r
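    # A quick numeric check of _discount_reward: with gamma = 0.9, rewards = [1, 1, 1]
    # and final_value = 0, the recursion R_t = r_t + gamma * R_{t+1} yields
    # [1 + 0.9 * 1.9, 1 + 0.9 * 1.0, 1.0] = [2.71, 1.9, 1.0].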

    def evaluation(self, env_eval):
        rewards = []
        for i in range(10):
            rewards_i = []
            state = env_eval.reset()
            done = False

            while not done:
                action = self.action(state)
                state, reward, done, _ = env_eval.step(action)
                rewards_i.append(reward)
            rewards.append(rewards_i)

        return rewards

    def load_model(self):
        try:
            self.actor.load_state_dict(self.learner.actor.state_dict())
            self.critic.load_state_dict(self.learner.critic.state_dict())
        except Exception as e:
            print('load error:', e)