Example #1
class DDPGHedgingAgent:
    """DDPGAgent interacting with environment.
    
    Attributes:
        env (gym.Env): openAI Gym environment
        actor (nn.Module): actor model to select actions
        actor_target (nn.Module): target actor model to predict next actions
        actor_optimizer (Optimizer): optimizer for training actor
        critic (nn.Module): critic model to predict state values
        critic_target (nn.Module): target critic model to predict state values
        critic_optimizer (Optimizer): optimizer for training critic
        memory (ReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        gamma (float): discount factor
        tau (float): parameter for soft target update
        initial_random_episode (int): number of initial episodes collected with random actions
        noise (OUNoise): noise generator for exploration
        device (torch.device): cpu / gpu
        transition (list): temporary storage for the most recent transition
        total_step (int): total step count
        is_test (bool): flag to show the current mode (train / test)
    """
    def __init__(self,
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 ou_noise_theta: float,
                 ou_noise_sigma: float,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 initial_random_episode: int = 10000,
                 name_cases='myproject'):
        """ Initialize. """

        # Logger
        self.wandb = wandb.init(project=name_cases)

        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.env = env
        self.memory = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.initial_random_episode = initial_random_episode

        # noise
        self.noise = OUNoise(
            action_dim,
            theta=ou_noise_theta,
            sigma=ou_noise_sigma,
        )

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        print(self.device)

        # networks
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # transition to store in memory
        self.transition = list()

        # total steps count
        self.total_step = 0
        # mode: train / test
        self.is_test = False
        self.populate(self.initial_random_episode)

    def populate(self, eps: int = 100) -> None:
        """
        Carries out several random steps through the environment to initially fill
        up the replay buffer with experiences

        Args:
            steps: number of random steps to populate the buffer with
        """

        if not self.is_test:
            print("Populate Replay Buffer... ")
            kbar = pkbar.Kbar(target=eps, width=20)
            state = self.env.reset()

            for i in range(eps):
                while True:
                    # Get action from sample space
                    selected_action = self.env.action_space.sample()
                    # selected_action = 0
                    noise = self.noise.sample()
                    selected_action = np.clip(selected_action + noise, -1.0,
                                              1.0)

                    next_state, reward, done, _ = self.env.step(
                        selected_action)
                    self.transition = [
                        state, selected_action, reward, next_state,
                        int(done)
                    ]
                    self.memory.append(Experience(*self.transition))

                    state = next_state
                    if done:
                        state = self.env.reset()
                        break

                kbar.add(1)

            # Fit the state scaler used by select_action()/update_model();
            # the ReplayBuffer is assumed to expose this helper.
            self.scaler = self.memory.standar_scaler()

    @torch.no_grad()
    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        state_s = self.scaler.transform([state])
        selected_action = self.actor(
            torch.FloatTensor(state_s).to(self.device)).item()
        # add noise for exploration during training
        if not self.is_test:
            noise = self.noise.sample()
            selected_action = np.clip(selected_action + noise, -1.0, 1.0)

        self.transition = [state, selected_action]
        return selected_action

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action)

        if not self.is_test:
            self.transition += [reward, next_state, int(done)]
            self.memory.append(Experience(*self.transition))

        return next_state, reward, done

    def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Update the actor and critic by gradient descent.

        The actor loss could alternatively be swapped for a mean-variance
        objective (see the commented-out line below).
        """
        device = self.device  # for shortening the following lines

        state, action, reward, next_state, done = self.memory.sample(
            self.batch_size, self.device)

        state = torch.FloatTensor(self.scaler.transform(state)).to(device)
        next_state = torch.FloatTensor(
            self.scaler.transform(next_state)).to(device)
        # state = state.to(device)
        # next_state = next_state.to(device)
        action = action.to(device)
        reward = reward.to(device)
        done = done.to(device)

        masks = 1 - done
        next_action = self.actor_target(next_state)
        next_value = self.critic_target(next_state, next_action)
        curr_return = reward.reshape(
            -1, 1) + self.gamma * next_value * masks.reshape(-1, 1)

        # train critic
        values = self.critic(state, action)
        critic_loss = F.mse_loss(values, curr_return)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in self.critic.parameters():
            p.requires_grad = False

        # train actor
        q_values = self.critic(state, self.actor(state))
        actor_loss = -q_values.mean()
        # actor_loss = 0.5 * q_values.std() ** 2

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for p in self.critic.parameters():
            p.requires_grad = True

        # target update
        self._target_soft_update()

        return actor_loss.data, critic_loss.data

    def train(self, num_frames: int, plotting_interval: int = 200):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        actor_losses = []
        critic_losses = []
        scores = []
        score = 0

        print("Training...")
        kbar = pkbar.Kbar(target=num_frames, width=20)

        for self.total_step in range(1, num_frames + 1):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # if episode ends
            if done:
                state = self.env.reset()
                scores.append(score)
                score = 0

                self._plot(
                    self.total_step,
                    scores,
                    actor_losses,
                    critic_losses,
                )

            # if training is ready
            if len(self.memory) >= self.batch_size:
                actor_loss, critic_loss = self.update_model()
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            kbar.add(1)

        self.env.close()

    def test(self):
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        while not done:
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

        self.env.close()

        return score

    def _target_soft_update(self):
        """Soft-update: target = tau*local + (1-tau)*target."""
        tau = self.tau

        for t_param, l_param in zip(self.actor_target.parameters(),
                                    self.actor.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

        for t_param, l_param in zip(self.critic_target.parameters(),
                                    self.critic.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

    def _plot(
        self,
        frame_idx: int,
        scores: List[float],
        actor_losses: List[float],
        critic_losses: List[float],
    ):
        """Plot the training progresses."""

        self.wandb.log({
            'frame': frame_idx,
            'score': scores[-1],
            'actor_loss': actor_losses[-1],
            'critic_loss': critic_losses[-1]
        })
Example #2
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 action_sigma=0.1,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2,
                 seed=0):
        '''
        TD3 Agent
        :param state_size: State Dimension
        :param action_size: Action dimension
        :param action_sigma: standard deviation of the noise to be added to the action
        :param memory_size:
        :param batch:
        :param sigma: standard deviation of the noise added to the target action (Section 5.3 of the TD3 paper)
        :param noise_clip: maximum absolute value of that smoothing noise
        :param gamma:
        :param update_frequency:
        :param seed:
        '''

        self.state_size = state_size
        self.action_size = action_size

        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # Second critic as described in the TD3 paper:
        # https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        # Second critic as described in the TD3 paper:
        # https://arxiv.org/pdf/1802.09477.pdf
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor.forward(state).cpu().data.numpy()
        self.actor.train()

        if epsilon:
            #if we want to inject some noise
            noise = np.random.normal(0, self.action_sigma, action.shape[0])
            action += noise

        return action

    def update(self, step):
        '''
        TD3 update (https://arxiv.org/pdf/1802.09477.pdf). It is very similar
        to the standard DDPG update, except that:
        1) there are two critics to update,
        2) the minimum of the two critics' outputs is used as the target,
        3) the target action is smoothed with clipped noise (Section 5.3 of the paper),
        4) the actor update is delayed by a number of steps.

        :param step: current training step; the actor and target networks are
            updated every `update_frequency` steps
        :return:
        '''

        state, action, reward, next_state, done = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        next_state_action = self.target_actor(next_state)

        #sample a random noise
        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss

        #as mentioned in the paper, we delay updating the actor network.

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # ----------------------- update target networks ------------------- #
            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
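The update() docstring above lists the TD3 modifications; the self-contained sketch below reproduces just the target computation (clipped smoothing noise plus the minimum of two target critics), with throwaway linear layers and random tensors standing in for the real networks and replay batch.

import torch
import torch.nn as nn

state_dim, action_dim, batch = 4, 2, 8
sigma, noise_clip, gamma = 0.2, 0.5, 0.99

# Placeholder target networks; a real setup would use the Actor/Critic classes above.
target_actor = nn.Linear(state_dim, action_dim)
target_critic0 = nn.Linear(state_dim + action_dim, 1)
target_critic1 = nn.Linear(state_dim + action_dim, 1)

next_state = torch.randn(batch, state_dim)
reward = torch.randn(batch, 1)
done = torch.zeros(batch, 1)

with torch.no_grad():
    next_action = target_actor(next_state)
    # Target-policy smoothing: clipped Gaussian noise on the target action.
    noise = (torch.randn_like(next_action) * sigma).clamp(-noise_clip, noise_clip)
    next_action = next_action + noise

    q0 = target_critic0(torch.cat([next_state, next_action], dim=1))
    q1 = target_critic1(torch.cat([next_state, next_action], dim=1))
    target_q = torch.min(q0, q1)                      # clipped double-Q
    target_value = reward + gamma * target_q * (1.0 - done)

print(target_value.shape)                             # torch.Size([8, 1])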
Example #3
class Agent():
    def __init__(self,
                 env,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2):

        self.states = env.observation_space
        self.state_size = env.observation_space.shape[0]
        self.actions = env.action_space
        self.action_size = env.action_space.shape[0]
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, step, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.actor.forward(state)
        action = action.detach().cpu().numpy()

        if epsilon:
            noise = np.random.normal(0, 0.1, action.shape[0])
            action += noise

        return action

    def update(self, step):

        state, action, reward, next_state, done = self.memory.sample()

        next_state_action = self.target_actor(next_state)

        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
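A hypothetical driver loop for the env-based TD3 agent above. It assumes the module-level names the class relies on (device, ACTOR_LR, VALUE0_LR, VALUE1_LR, TRANSFER_RATE, Actor, Critic, ReplayBuffer) are already defined; the environment id, warm-up length and episode count are illustrative.

import gym
import numpy as np

env = gym.make("Pendulum-v1")
agent = Agent(env, memory_size=100_000, batch=128)

warmup = 1_000                 # transitions collected before updates start
total_steps = 0
for episode in range(50):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, step=total_steps)
        action = np.clip(action, env.action_space.low, env.action_space.high)
        next_state, reward, done, _ = env.step(action)
        agent.add_to_memory(state, action, reward, next_state, float(done))
        state = next_state
        total_steps += 1
        if total_steps > warmup:
            agent.update(total_steps)   # actor/targets refresh every update_frequency steps
env.close()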
Example #4
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(
        self,
        state_size=24,
        action_size=2,
        BATCH_SIZE=128,
        BUFFER_SIZE=int(1e6),
        discount_factor=1,
        tau=1e-2,
        noise_coefficient_start=5,
        noise_coefficient_decay=0.99,
        LR_ACTOR=1e-3,
        LR_CRITIC=1e-3,
        WEIGHT_DECAY=1e-3,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
        """
			state_size (int): dimension of each state
			action_size (int): dimension of each action
			BATCH_SIZE (int): mini batch size
			BUFFER_SIZE (int): experience buffer length; keep it as high as possible
			discount_factor (float): discount factor for calculating Q_target
			tau (float): interpolation parameter for updating target network
			noise_coefficient_start (float): value to be multiplied to OUNoise sample
			noise_coefficient_decay (float): exponential decay factor for value to be multiplied to OUNoise sample
			LR_ACTOR (float): learning rate for actor network
			LR_CRITIC (float): learning rate for critic network
			WEIGHT_DECAY (float): Weight decay for critic network optimizer
			device : "cuda:0" if torch.cuda.is_available() else "cpu"
		"""

        self.state_size = state_size
        print(device)
        self.action_size = action_size
        self.BATCH_SIZE = BATCH_SIZE
        self.BUFFER_SIZE = BUFFER_SIZE
        self.discount_factor = discount_factor
        self.tau = tau
        self.noise_coefficient = noise_coefficient_start
        self.noise_coefficient_decay = noise_coefficient_decay
        self.steps_completed = 0
        self.device = device
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(self.device)
        self.actor_target = Actor(state_size, action_size).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(self.device)
        self.critic_target = Critic(state_size, action_size).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((1, action_size))

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)
        self.steps_completed += 1
        # If number of memory data > Batch_Size then learn
        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample(self.device)
            self.learn(experiences, self.discount_factor, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((1, self.action_size))  # shape will be (1,2)
        self.actor_local.eval()
        with torch.no_grad():
            actions[0, :] = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise_coefficient * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, discount_factor, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
		Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
		where:
			actor_target(state) -> action
			critic_target(state, action) -> Q-value
		Params
		======
			experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
			discount_factor (float): discount factor
		"""
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        # The critic takes the joint action of both agents, so for agent_number == 0
        # we concatenate agent 0's predicted action (actions_next) with agent 1's
        # stored action (actions[:, 2:]), and vice versa for agent 1.
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (discount_factor * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        # Update noise_coefficient value
        # self.noise_coefficient = self.noise_coefficient*self.noise_coefficient_decay

        self.noise_coefficient = max(
            self.noise_coefficient - (1 / self.noise_coefficient_decay), 0)
        # print(self.steps_completed,': ',self.noise_coefficient)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
		θ_target = τ*θ_local + (1 - τ)*θ_target
			local_model: PyTorch model (weights will be copied from)
			target_model: PyTorch model (weights will be copied to)
			tau (float): interpolation parameter 
		"""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
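The comment in learn() above explains how the updating agent's predicted action is spliced into the stored joint action; the toy snippet below shows that splice in isolation for an assumed two-agent, two-dimensional-action layout matching the actions[:, :2] / actions[:, 2:] slicing.

import torch

batch = 5
stored_actions = torch.randn(batch, 4)   # columns 0:2 -> agent 0, columns 2:4 -> agent 1
my_pred = torch.randn(batch, 2)          # action predicted by the agent being updated

agent_number = 0
if agent_number == 0:
    joint = torch.cat((my_pred, stored_actions[:, 2:]), dim=1)
else:
    joint = torch.cat((stored_actions[:, :2], my_pred), dim=1)

print(joint.shape)                       # torch.Size([5, 4]) -- fed to the joint-action critic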
Example #5
class DDPGAGENT:
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPS

        #--- actor -----#

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=1e-3)

        #---- critic -----#

        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=1e-3,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        #self.timestep = 0

    def step(self, state, action, reward, next_state, done, timestep):
        self.memory.add_experience(state, action, reward, next_state, done)

        #self.timestep = (self.timestep + 1) % UPDATE_EVERY

        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0:
            for _ in range(LEARN_NUM):
                xp = self.memory.sample()
                self.learn(xp, GAMMA)  #GAMMA VALUE 0.99

    def act(self, state, noise_accumulate=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # Scale the exploration noise by the decaying epsilon coefficient
        if noise_accumulate:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_internal_state()

    def learn(self, xp, gamma):
        states, actions, rewards, next_states, dones = xp

        # ---- critic update: compute the TD target and minimize the MSE loss ----

        actions_nxt = self.actor_target(next_states)

        q_target_next = self.critic_target(next_states, actions_nxt)

        q_target = rewards + (gamma * q_target_next * (1 - dones))

        q_expected = self.critic_local(states, actions)

        #MSE LOSS
        critic_loss = F.mse_loss(q_expected, q_target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clips gradient norm of an iterable of parameters
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---- actor update: maximize the critic's Q by minimizing its negative mean ----
        actor_predicted = self.actor_local(states)
        actor_loss = -self.critic_local(states, actor_predicted).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        self.epsilon -= 1e-6
        self.noise.reset_internal_state()

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
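learn() above clips the critic's gradient norm before the optimizer step; this stand-alone snippet shows what torch.nn.utils.clip_grad_norm_ does on a toy network (the layer, batch and loss are placeholders).

import torch
import torch.nn as nn

net = nn.Linear(8, 1)
loss = net(torch.randn(32, 8)).pow(2).mean() * 100.0    # deliberately large loss

loss.backward()
pre_norm = torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
post_norm = torch.sqrt(sum(p.grad.pow(2).sum() for p in net.parameters()))
print("pre-clip norm:", float(pre_norm))                 # norm before rescaling
print("post-clip norm:", float(post_norm))               # <= 1.0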
Example #6
class Agent():
    """ Interacts with and learns from the environment """
    def __init__(self, state_size, action_size, num_agents, seed):
        """
        Initialize an Agent object
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.eps = eps_start
        self.t_step = 0

        # Actor Network (with Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (with Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """ Save experience in replay memory, and use random sample from buffer to learn """
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > BATCH_SIZE:
            if self.t_step % UPDATE_EVERY == 0:
                for _ in range(N_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """ Returns actions for given state as per current policy """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """
        Update policy and value parameters using given batch of experience tuples

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update epsilon noise value
        self.eps = self.eps - (1 / eps_decay)
        if self.eps < eps_end:
            self.eps = eps_end

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
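Several of these agents rely on the soft update θ_target = τ*θ_local + (1-τ)*θ_target quoted in the docstring above; the short check below verifies the rule numerically on two toy linear layers.

import torch
import torch.nn as nn

tau = 0.1
local, target = nn.Linear(3, 3), nn.Linear(3, 3)

before = target.weight.detach().clone()
with torch.no_grad():
    for t_param, l_param in zip(target.parameters(), local.parameters()):
        t_param.copy_(tau * l_param + (1.0 - tau) * t_param)

expected = tau * local.weight.detach() + (1.0 - tau) * before
print(torch.allclose(target.weight, expected))   # True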
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, n, state_size, action_size, random_seed, params):
        """Initialize an Agent object.
        
        Params
        ======
            n (int): number of agents in env
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            params (dict): dictionary with hyperparameters name-value pairs
        """
        self.n = n
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.BUFFER_SIZE = params["BUFFER_SIZE"]
        self.BATCH_SIZE = params["BATCH_SIZE"]
        self.GAMMA = params["GAMMA"]
        self.TAU = params["TAU"]
        self.LR_ACTOR = params["LR_ACTOR"]
        self.LR_CRITIC = params["LR_CRITIC"]
        self.WEIGHT_DECAY = params["WEIGHT_DECAY"]
        self.N_UPDATES = params["N_UPDATES"]
        self.UPDATE_STEP = params["UPDATE_STEP"]

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(self.n, action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)

        #Count timesteps
        self.timestep = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(self.n):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        self.timestep += 1
        # Learn, if enough samples are available in memory
        if self.timestep % self.UPDATE_STEP == 0 and len(
                self.memory) > self.BATCH_SIZE:
            for _ in range(self.N_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, self.GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
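A hypothetical way to assemble the params dictionary consumed by __init__ above; the keys mirror the ones the constructor reads, while the values, state/action sizes and agent count are illustrative placeholders (device, Actor, Critic, OUNoise and ReplayBuffer are assumed to be defined as in the rest of the example).

params = {
    "BUFFER_SIZE": int(1e6),
    "BATCH_SIZE": 128,
    "GAMMA": 0.99,
    "TAU": 1e-3,
    "LR_ACTOR": 1e-4,
    "LR_CRITIC": 1e-3,
    "WEIGHT_DECAY": 0.0,
    "N_UPDATES": 10,
    "UPDATE_STEP": 20,
}

agent = Agent(n=20, state_size=33, action_size=4, random_seed=0, params=params)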
Example #8
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, num_agents):
        """Initialize an Agent object.
         """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, sigma=0.1)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.num_agents = num_agents

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        #self.memory.add(state, action, reward, next_state, done)
        for i in range(self.num_agents):
            self.memory.add(state[i], action[i], reward[i], next_state[i],
                            done[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # update critic
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #9
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """
    def __init__(self,
                 env,
                 act_dim,
                 state_dim,
                 goal_dim,
                 act_range,
                 buffer_size=int(1e6),
                 gamma=0.98,
                 lr=0.001,
                 tau=0.95):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = state_dim + goal_dim
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.env = env

        # Create actor and critic networks
        self.actor_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())

        self.critic_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # Optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=lr)

        # Replay buffer
        # self.buffer = MemoryBuffer(buffer_size)
        self.buffer = ReplayMemory(buffer_size)

        # Normalizers
        self.goal_normalizer = Normalizer(
            goal_dim, default_clip_range=5)  # Clip between [-5, 5]
        self.state_normalizer = Normalizer(state_dim, default_clip_range=5)

    def policy_action(self, s, g):
        """ Use the actor to predict value
        """
        input = self.preprocess_inputs(s, g)
        return self.actor_network(input)

    def memorize(self, experiences):
        """ Store experience in memory buffer
        """
        for exp in experiences:
            self.buffer.push(exp)

    def sample_batch(self, batch_size):
        return deepcopy(self.buffer.sample(batch_size))

    def clip_states_goals(self, state, goal):
        state = np.clip(state, -200, 200)
        goal = np.clip(goal, -200, 200)
        return state, goal

    def preprocess_inputs(self, state, goal):
        """Normalize and concatenate state and goal"""
        #state, goal = self.clip_states_goals(state, goal)
        state_norm = self.state_normalizer.normalize(state)
        goal_norm = self.goal_normalizer.normalize(goal)
        inputs = np.concatenate([state_norm, goal_norm])
        return torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)

    def select_actions(self, pi):
        # add the gaussian
        action = pi.cpu().numpy().squeeze()
        action += 0.2 * self.act_range * np.random.randn(*action.shape)
        action = np.clip(action, -self.act_range, self.act_range)
        # random actions...
        random_actions = np.random.uniform(low=-self.act_range,
                                           high=self.act_range,
                                           size=self.act_dim)
        # choose if use the random actions
        action += np.random.binomial(1, 0.3, 1)[0] * (random_actions - action)
        action = np.clip(action, -self.act_range, self.act_range)

        return action

    def update_network(self, batch_size):
        s, actions, rewards, ns, _, g = self.sample_batch(batch_size)

        states, goals = self.clip_states_goals(s, g)
        new_states, new_goals = self.clip_states_goals(ns, g)

        norm_states = self.state_normalizer.normalize(states)
        norm_goals = self.goal_normalizer.normalize(goals)
        inputs_norm = np.concatenate([norm_states, norm_goals], axis=1)

        norm_new_states = self.state_normalizer.normalize(new_states)
        norm_new_goals = self.goal_normalizer.normalize(new_goals)
        inputs_next_norm = np.concatenate([norm_new_states, norm_new_goals],
                                          axis=1)

        # To tensor
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm,
                                               dtype=torch.float32)
        actions_tensor = torch.tensor(actions, dtype=torch.float32)
        r_tensor = torch.tensor(rewards, dtype=torch.float32)

        with torch.no_grad():
            # compute the bootstrap target from the target networks
            actions_next = self.actor_target_network(inputs_next_norm_tensor)
            q_next_value = self.critic_target_network(inputs_next_norm_tensor,
                                                      actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + self.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # clip the target to [-1/(1 - gamma), 0], the achievable range of the
            # sparse (0 / -1) return
            clip_return = 1 / (1 - self.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
        # the q loss
        real_q_value = self.critic_network(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss
        actions_real = self.actor_network(inputs_norm_tensor)
        actor_loss = -self.critic_network(inputs_norm_tensor,
                                          actions_real).mean()
        actor_loss += 1.0 * (actions_real / self.act_range).pow(2).mean()
        # start to update the network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()
        # update the critic_network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_network)
        self.critic_optim.step()

    def soft_update_target_network(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_((1 - self.tau) * param.data +
                                    self.tau * target_param.data)

    def train(self, args):
        if MPI.COMM_WORLD.Get_rank() == 0:
            self.create_save_dir(args["save_dir"], args["env_name"],
                                 args["HER_strat"])

        success_rates = []
        for ep_num in range(NUM_EPOCHS):
            start = time.time()
            for _ in range(NUM_CYCLES):
                for _ in range(ROLLOUT_PER_WORKER):
                    # Reset episode
                    observation = self.env.reset()
                    current_state = observation['observation']
                    goal = observation['desired_goal']
                    old_achieved_goal = observation['achieved_goal']
                    episode_exp = []
                    episode_exp_her = []
                    for _ in range(self.env._max_episode_steps):
                        if args['render']: self.env.render()
                        with torch.no_grad():
                            pi = self.policy_action(current_state, goal)
                            action = self.select_actions(pi)
                        obs, reward, _, _ = self.env.step(action)
                        new_state = obs['observation']
                        new_achieved_goal = obs['achieved_goal']
                        # Add outputs to memory buffer
                        episode_exp.append([
                            current_state, action, reward, new_state,
                            old_achieved_goal, goal
                        ])
                        if reward == 0: break

                        old_achieved_goal = new_achieved_goal
                        current_state = new_state

                    if args["HER_strat"] == "final":
                        experience = episode_exp[-1]
                        # set g' to achieved goal
                        experience[-1] = np.copy(experience[-2])
                        reward = self.env.compute_reward(
                            experience[-2], experience[-1],
                            None)  # set reward of success
                        experience[2] = reward
                        episode_exp_her.append(experience)

                    elif args["HER_strat"] in ["future", "episode"]:
                        # For each transition of the episode trajectory
                        for t in range(len(episode_exp)):
                            # Add K random states which come from the same episode as the transition
                            for _ in range(args["HER_k"]):
                                if args["HER_strat"] == "future":
                                    # Select a future exp from the same episode
                                    selected = np.random.randint(
                                        t, len(episode_exp))
                                elif args["HER_strat"] == "episode":
                                    # Select an exp from the same episode
                                    selected = np.random.randint(
                                        0, len(episode_exp))
                                # Take the achieved goal of the selected transition
                                # (index 4 of [s, a, r, s', achieved_goal, goal])
                                ag_selected = np.copy(episode_exp[selected][4])
                                s, a, _, ns, ag, _ = episode_exp[t]
                                r = self.env.compute_reward(
                                    ag_selected, ag, None)
                                # New transition where the achieved goal of the selected is the new goal
                                her_transition = [s, a, r, ns, ag, ag_selected]
                                episode_exp_her.append(her_transition)

                    self.memorize(deepcopy(episode_exp))
                    self.memorize(deepcopy(episode_exp_her))

                    # Update Normalizers with the observations of this episode
                    self.update_normalizers(deepcopy(episode_exp),
                                            deepcopy(episode_exp_her))

                for _ in range(OPTIMIZATION_STEPS):
                    # Sample a batch from the replay buffer and update the actor/critic networks
                    self.update_network(args["batch_size"])

                # Soft update the target networks
                self.soft_update_target_network(self.actor_target_network,
                                                self.actor_network)
                self.soft_update_target_network(self.critic_target_network,
                                                self.critic_network)

            success_rate = self.eval()
            success_rates.append(success_rate)
            if MPI.COMM_WORLD.Get_rank() == 0:
                print("Epoch:", ep_num + 1, " -- success rate:",
                      success_rates[-1], " -- duration:",
                      time.time() - start)
                torch.save([
                    self.state_normalizer.mean, self.state_normalizer.std,
                    self.goal_normalizer.mean, self.goal_normalizer.std,
                    self.actor_network.state_dict()
                ], self.model_path + '/model.pt')

        return success_rates

    def create_save_dir(self, save_dir, env_name, her_strat):
        # Path to save the model: <save_dir>/<env_name>/<her_strat>
        self.model_path = os.path.join(save_dir, env_name, her_strat)
        os.makedirs(self.model_path, exist_ok=True)

    def update_normalizers(self, episode_exp, episode_exp_her):
        # Update Normalizers
        # Transitions are [state, action, reward, next_state, achieved_goal, goal];
        # stack the states (index 0) and goals (index 5) without building ragged object arrays
        episode_exp_states = np.vstack([t[0] for t in episode_exp])
        episode_exp_goals = np.vstack([t[5] for t in episode_exp])
        if len(episode_exp_her) != 0:
            episode_exp_her_states = np.vstack([t[0] for t in episode_exp_her])
            episode_exp_her_goals = np.vstack([t[5] for t in episode_exp_her])
            states = np.concatenate(
                [episode_exp_states, episode_exp_her_states])
            goals = np.concatenate([episode_exp_goals, episode_exp_her_goals])
        else:
            states = np.copy(episode_exp_states)
            goals = np.copy(episode_exp_goals)

        states, goals = self.clip_states_goals(states, goals)

        self.state_normalizer.update(deepcopy(states))
        self.goal_normalizer.update(deepcopy(goals))
        self.state_normalizer.recompute_stats()
        self.goal_normalizer.recompute_stats()

    def eval(self):
        total_success_rate = []
        for _ in range(NUM_TEST):
            per_success_rate = []
            observation = self.env.reset()
            state = observation['observation']
            goal = observation['desired_goal']
            for _ in range(self.env._max_episode_steps):
                # self.env.render()
                with torch.no_grad():
                    inputs = self.preprocess_inputs(state, goal)
                    pi = self.actor_network(inputs)
                    action = pi.detach().cpu().numpy().squeeze()
                new_observation, _, _, info = self.env.step(action)
                state = new_observation['observation']
                per_success_rate.append(info['is_success'])
            total_success_rate.append(per_success_rate)

        total_success_rate = np.array(total_success_rate)
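        # Success is judged by the final step of each evaluation episode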
        local_success_rate = np.mean(total_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate,
                                                       op=MPI.SUM)
        return global_success_rate / MPI.COMM_WORLD.Get_size()
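
A note on the relabeling logic in the training loop above: the short sketch below replays the "final" and "future" strategies on a toy three-step episode, using a hypothetical distance-based compute_reward. The toy goals, the 0.05 threshold, and the reward values are illustrative assumptions rather than values from the example's codebase; the sketch only makes the goal substitution concrete.

import numpy as np

def compute_reward(achieved_goal, desired_goal, info=None):
    # Hypothetical sparse reward: 0 on success, -1 otherwise
    return 0.0 if np.linalg.norm(achieved_goal - desired_goal) < 0.05 else -1.0

# Toy episode; each transition is [state, action, reward, next_state, achieved_goal, goal]
goal = np.array([1.0, 1.0])
episode_exp = [
    [np.array([0.0, 0.0]), 0, -1.0, np.array([0.3, 0.3]), np.array([0.1, 0.1]), goal],
    [np.array([0.3, 0.3]), 1, -1.0, np.array([0.6, 0.6]), np.array([0.4, 0.4]), goal],
    [np.array([0.6, 0.6]), 0, -1.0, np.array([0.9, 0.9]), np.array([0.7, 0.7]), goal],
]

# "final" strategy: relabel the last transition with its own achieved goal
final_exp = list(episode_exp[-1])
final_exp[5] = np.copy(final_exp[4])                       # g' := achieved goal
final_exp[2] = compute_reward(final_exp[4], final_exp[5])  # success by construction
print("final relabel -> reward", final_exp[2])

# "future" strategy: for each step t, pick achieved goals from later steps as new goals
K = 2
episode_exp_her = []
for t in range(len(episode_exp)):
    for _ in range(K):
        selected = np.random.randint(t, len(episode_exp))
        ag_selected = np.copy(episode_exp[selected][4])
        s, a, _, ns, ag, _ = episode_exp[t]
        episode_exp_her.append(
            [s, a, compute_reward(ag, ag_selected), ns, ag, ag_selected])
print(len(episode_exp_her), "relabelled transitions")
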
Example #10
class DDPG:
    def __init__(self,
                 n_state,
                 n_action,
                 a_limit,
                 model_folder=None,
                 memory_size=10000,
                 batch_size=32,
                 tau=0.01,
                 gamma=0.99,
                 var=3.0):
        # Record the parameters
        self.n_state = n_state
        self.n_action = n_action
        self.a_limit = a_limit
        self.memory_size = memory_size
        self.model_folder = model_folder
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.var = var

        # Create the network and related objects
        self.memory = np.zeros(
            [self.memory_size, 2 * self.n_state + self.n_action + 1],
            dtype=np.float32)
        self.memory_counter = 0
        self.eval_actor = Actor(self.n_state, self.n_action, self.a_limit)
        self.eval_critic = Critic(self.n_state, self.n_action)
        self.target_actor = Actor(self.n_state,
                                  self.n_action,
                                  self.a_limit,
                                  trainable=False)
        self.target_critic = Critic(self.n_state,
                                    self.n_action,
                                    trainable=False)

        self.actor_optimizer = Adam(self.eval_actor.parameters(), lr=0.001)
        self.critic_optimizer = Adam(self.eval_critic.parameters(), lr=0.002)
        self.criterion = nn.MSELoss()

        # Make sure the target networks start with the same parameters as the evaluation networks
        self.hardCopy()

    def load(self):
        if os.path.exists(self.model_folder):
            self.eval_actor.load_state_dict(
                torch.load(os.path.join(self.model_folder, 'actor.pth')))
            self.eval_critic.load_state_dict(
                torch.load(os.path.join(self.model_folder, 'critic.pth')))
        self.hardCopy()

    def save(self):
        if not os.path.exists(self.model_folder):
            os.mkdir(self.model_folder)
        torch.save(self.eval_actor.state_dict(),
                   os.path.join(self.model_folder, 'actor.pth'))
        torch.save(self.eval_critic.state_dict(),
                   os.path.join(self.model_folder, 'critic.pth'))

    def chooseAction(self, s):
        """
            給定輸入state,透過evaluate actor輸出[-1, 1]之間的實數動作值
        """
        s = to_var(s)
        a = self.eval_actor(s)
        a = a.cpu().data.numpy()
        if self.var > 0:
            a = np.clip(np.random.normal(a, self.var), -2, 2)
        return a

    def store_path(self, s, a, r, s_):
        """
            儲存state transition相關資訊
        """
        transition = np.hstack((s, a, [r], s_))
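        # Ring-buffer indexing: overwrite the oldest transition once the memory is full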
        idx = self.memory_counter % self.memory_size
        self.memory[idx, :] = transition
        self.memory_counter += 1

    def softCopy(self):
        for ta, ea in zip(self.target_actor.parameters(),
                          self.eval_actor.parameters()):
            ta.data.copy_((1.0 - self.tau) * ta.data + self.tau * ea.data)
        for tc, ec in zip(self.target_critic.parameters(),
                          self.eval_critic.parameters()):
            tc.data.copy_((1.0 - self.tau) * tc.data + self.tau * ec.data)

    def hardCopy(self):
        for ta, ea in zip(self.target_actor.parameters(),
                          self.eval_actor.parameters()):
            ta.data.copy_(ea.data)
        for tc, ec in zip(self.target_critic.parameters(),
                          self.eval_critic.parameters()):
            tc.data.copy_(ec.data)

    def update(self):
        # Skip the update until enough transitions have been collected
        if self.memory_counter <= 5000:
            return

        # Soft-update the target networks toward the evaluation networks
        self.softCopy()

        # Choose which stored transitions to sample for the batch
        if self.memory_counter > self.memory_size:
            sample_idx = np.random.choice(self.memory_size,
                                          size=self.batch_size)
        else:
            sample_idx = np.random.choice(self.memory_counter,
                                          size=self.batch_size)

        # Fetch the sampled rows and split the columns back into (s, a, r, s_)
        batch_data = self.memory[sample_idx, :]
        batch_s = batch_data[:, :self.n_state]
        batch_a = batch_data[:, self.n_state:self.n_state + self.n_action]
        batch_r = batch_data[:, -self.n_state - 1:-self.n_state]
        batch_s_ = batch_data[:, -self.n_state:]

        # Wrap the batch as PyTorch tensors
        batch_s = to_var(batch_s)
        batch_a = to_var(batch_a)
        batch_r = to_var(batch_r)
        batch_s_ = to_var(batch_s_)

        # Compute the target Q value with the target networks
        next_q_target = self.target_critic(batch_s_,
                                           self.target_actor(batch_s_))
        # Detach so no gradient flows back through the target networks
        q_target = (batch_r + self.gamma * next_q_target).detach()

        # Update the critic by minimizing the TD error
        self.critic_optimizer.zero_grad()
        q_batch = self.eval_critic(batch_s, batch_a)
        value_loss = F.mse_loss(input=q_batch, target=q_target)
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor by maximizing the estimated Q value
        self.actor_optimizer.zero_grad()
        policy_loss = -self.eval_critic(batch_s,
                                        self.eval_actor(batch_s)).mean()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Decay the exploration noise scale
        self.var *= .9995
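
For context, a minimal driver loop for the DDPG class above might look like the sketch below. It assumes the Actor, Critic and to_var helpers from this snippet are importable, uses the classic 4-tuple gym step API, and the environment name, episode counts and checkpoint folder are placeholder assumptions rather than values from the original example.

import gym

# Hypothetical training driver for the DDPG class above
env = gym.make('Pendulum-v0')  # placeholder environment with a continuous action space
agent = DDPG(n_state=env.observation_space.shape[0],
             n_action=env.action_space.shape[0],
             a_limit=float(env.action_space.high[0]),
             model_folder='./checkpoints')

for episode in range(200):
    s = env.reset()
    episode_reward = 0.0
    for _ in range(200):
        a = agent.chooseAction(s)       # noisy action for exploration
        s_, r, done, _ = env.step(a)
        agent.store_path(s, a, r, s_)   # push the transition into the flat replay memory
        agent.update()                  # no-op until enough transitions are stored
        s = s_
        episode_reward += r
        if done:
            break
    print('episode %d: reward %.1f' % (episode, episode_reward))

agent.save()
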