Example #1
    def __init__(self, state_size, action_size, num_agents, random_seed=1,
                 learn_interval=4, learn_num=1, lr_actor=1e-4, lr_critic=1e-3,
                 gamma=0.99, weight_decay=0, tau=0.001, batch_size=128,
                 buffer_size=int(1e5)):
        """Initialize an Agents object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.STATE_SIZE = state_size
        self.ACTION_SIZE = action_size
        self.NUM_AGENTS = num_agents
        self.seed = random.seed(random_seed)

        # Hyperparameters (held fixed for this agent):
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.GAMMA = gamma
        self.WEIGHT_DECAY = weight_decay
        self.TAU = tau
        self.BATCH_SIZE = batch_size
        self.BUFFER_SIZE = buffer_size

        # Actor network with target network
        self.actor_local = Actor(self.STATE_SIZE, self.ACTION_SIZE,
                                 random_seed).to(device)
        self.actor_target = Actor(self.STATE_SIZE, self.ACTION_SIZE,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic network with target network
        self.critic_local = Critic(self.STATE_SIZE, self.ACTION_SIZE,
                                   random_seed).to(device)
        self.critic_target = Critic(self.STATE_SIZE, self.ACTION_SIZE,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((self.NUM_AGENTS, self.ACTION_SIZE), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(self.ACTION_SIZE, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)
Example #2
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize the actor and critic networks
        # along with their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()
Example #3
# Imports used by the class below (Actor, Critic, OUNoise, ReplayBuffer and device
# are provided by the project's own modules, which are not shown in this snippet).
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed=1,
                 learn_interval=4, learn_num=1, lr_actor=1e-4, lr_critic=1e-3,
                 gamma=0.99, weight_decay=0, tau=0.001, batch_size=128,
                 buffer_size=int(1e5)):
        """Initialize an Agents object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.STATE_SIZE = state_size
        self.ACTION_SIZE = action_size
        self.NUM_AGENTS = num_agents
        self.seed = random.seed(random_seed)

        # Hyperparameters (held fixed for this agent):
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.GAMMA = gamma
        self.WEIGHT_DECAY = weight_decay
        self.TAU = tau
        self.BATCH_SIZE = batch_size
        self.BUFFER_SIZE = buffer_size

        # Actor network with target network
        self.actor_local = Actor(self.STATE_SIZE, self.ACTION_SIZE,
                                 random_seed).to(device)
        self.actor_target = Actor(self.STATE_SIZE, self.ACTION_SIZE,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic network with target network
        self.critic_local = Critic(self.STATE_SIZE, self.ACTION_SIZE,
                                   random_seed).to(device)
        self.critic_target = Critic(self.STATE_SIZE, self.ACTION_SIZE,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((self.NUM_AGENTS, self.ACTION_SIZE), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(self.ACTION_SIZE, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)

    def reset(self):
        self.noise.reset()

    def step(self, step, states, actions, rewards, next_states, dones):
        """Save experience in replay memory and use random sample from buffer to learn."""
        for n in range(self.NUM_AGENTS):
            self.memory.add(states[n, :], actions[n, :], rewards[n],
                            next_states[n, :], dones[n])

        # Learn every LEARN_INTERVAL time steps
        if step % self.LEARN_INTERVAL == 0:

            # Learn, if we have enough samples to learn
            if len(self.memory) > self.BATCH_SIZE:

                # run LEARN_NUM learning passes per update (independent of the batch size)
                for _ in range(self.LEARN_NUM):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.GAMMA)

    def action(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""

        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.NUM_AGENTS, self.ACTION_SIZE))

        # switch the actor network to evaluation mode
        self.actor_local.eval()
        with torch.no_grad():
            # predict an action for each agent's state
            for n, state in enumerate(states):
                actions[n, :] = self.actor_local(state).cpu().data.numpy()

        # switch the actor network back to training mode
        self.actor_local.train()

        # add some noise
        if add_noise:
            actions += self.noise.sample()

        # clip the action
        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    # Load and Save data
    def save_agent(self, checkpoint_name):
        torch.save(self.actor_local.state_dict(),
                   f'./checkpoints/{checkpoint_name}_actor.pth')
        torch.save(self.critic_local.state_dict(),
                   f'./checkpoints/{checkpoint_name}_critic.pth')

    def load_agent(self, checkpoint_name):
        self.actor_local.load_state_dict(
            torch.load(f'./checkpoints/{checkpoint_name}_actor.pth'))
        self.critic_local.load_state_dict(
            torch.load(f'./checkpoints/{checkpoint_name}_critic.pth'))
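
A minimal training-loop sketch for the Agent class above. It is illustrative only: the environment interface (env, brain_name, vector_observations, rewards, local_done) follows a Unity ML-Agents style wrapper and is an assumption, as are the episode counts and the state/action/agent sizes.

# Usage sketch for the Agent class (hypothetical Unity ML-Agents style environment).
import numpy as np

agent = Agent(state_size=33, action_size=4, num_agents=20)    # illustrative sizes

for episode in range(1, 201):
    env_info = env.reset(train_mode=True)[brain_name]          # hypothetical env API
    states = env_info.vector_observations                      # shape (num_agents, state_size)
    agent.reset()
    scores = np.zeros(agent.NUM_AGENTS)
    for t in range(1000):
        actions = agent.action(states)                          # clipped to [-1, 1]
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        agent.step(t, states, actions, rewards, next_states, dones)
        states = next_states
        scores += rewards
        if np.any(dones):
            break

agent.save_agent('ddpg_example')                                # writes into ./checkpoints/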
Example #4
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.mu = 0
        self.theta = 0.15
        self.sigmaStart = 0.5
        self.sigmaEnd = 0.1
        self.decayExponent = 0.01
        self.noise = OUNoise(self.action_size, self.mu, self.theta,
                             self.sigmaStart, self.sigmaEnd,
                             self.decayExponent)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0001  # for soft update of target parameters
        self.learningRateActor = 0.00005
        self.learningRateCritic = 0.0005
        self.dropoutActor = 0.1
        self.dropoutCritic = 0.1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learningRate=self.learningRateActor,
                                 dropoutRate=self.dropoutActor)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learningRate=self.learningRateActor,
                                  dropoutRate=self.dropoutActor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learningRate=self.learningRateCritic,
                                   dropoutRate=self.dropoutCritic,
                                   l2Lambda=1e-2)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learningRate=self.learningRateCritic,
                                    dropoutRate=self.dropoutCritic,
                                    l2Lambda=1e-2)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.rewardSum = 0
Example #5
# Imports used by the class below (Actor, Critic, OUNoise and ReplayBuffer come from
# the project's own Keras-based modules, which are not shown in this snippet).
import numpy as np


class AgentDDPG:
    """DDPG agent: Keras actor-critic models, OU exploration noise, and replay memory."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.mu = 0
        self.theta = 0.15
        self.sigmaStart = 0.5
        self.sigmaEnd = 0.1
        self.decayExponent = 0.01
        self.noise = OUNoise(self.action_size, self.mu, self.theta,
                             self.sigmaStart, self.sigmaEnd,
                             self.decayExponent)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0001  # for soft update of target parameters
        self.learningRateActor = 0.00005
        self.learningRateCritic = 0.0005
        self.dropoutActor = 0.1
        self.dropoutCritic = 0.1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learningRate=self.learningRateActor,
                                 dropoutRate=self.dropoutActor)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learningRate=self.learningRateActor,
                                  dropoutRate=self.dropoutActor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learningRate=self.learningRateCritic,
                                   dropoutRate=self.dropoutCritic,
                                   l2Lambda=1e-2)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learningRate=self.learningRateCritic,
                                    dropoutRate=self.dropoutCritic,
                                    l2Lambda=1e-2)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.rewardSum = 0

    def reset_episode(self):
        self.rewardSum = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.rewardSum += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        noise = self.noise.sample()
        return list(action + noise), noise  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local) using the action gradients obtained from the critic.
        # The trailing 0 / 1 arguments are most likely the Keras learning-phase flag
        # (0 = inference when querying gradients, 1 = training when applying the update).
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
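
A short episode-loop sketch for AgentDDPG above. The task interface is an assumption: task.step(action) is taken to return (next_state, reward, done), consistent with the task.reset() and size attributes already used in __init__.

# Usage sketch for AgentDDPG (assumes task.step(action) -> (next_state, reward, done)).
agent = AgentDDPG(task)

for episode in range(1, 501):
    state = agent.reset_episode()             # resets the OU noise and the reward sum
    done = False
    while not done:
        action, noise = agent.act(state)      # noisy action for exploration
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print('Episode {:4d}  reward sum: {:.2f}'.format(episode, agent.rewardSum))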
Example #6
# Imports used by the class below (ActorNetwork, CriticNetwork, ReplayBuffer, OUNoise
# and the upper-case constants come from the project's own modules, not shown here).
import numpy as np
import tensorflow as tf


class DDPGController(object):
    """DDPG controller: actor and critic networks with targets, replay buffer, and OU exploration noise."""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize the actor and critic networks
        # along with their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the replay buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def initial_train(self, mini_batch):
        state_batch = np.asarray([data[0] for data in mini_batch])
        action_batch = np.asarray([data[1] for data in mini_batch])
        action_label_batch = np.asarray([data[2] for data in mini_batch])
        value_label_batch = np.asarray([data[3] for data in mini_batch])
        done_batch = np.asarray([data[4] for data in mini_batch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        action_label_batch = np.resize(action_label_batch,
                                       [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        y_batch = []
        for i in range(len(mini_batch)):
            y_batch.append(value_label_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        critic_cost = self.critic_network.train(y_batch, state_batch,
                                                action_label_batch)

        # Update the actor policy using the sampled gradient:
        # action_batch_for_gradients = self.actor_network.actions(state_batch)
        # q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        # self.actor_network.train(q_gradient_batch, state_batch)
        action_cost = self.actor_network.initial_train(
            action_label_batch=action_label_batch, state_batch=state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost, action_cost

    def save_model(self, path, check_point):
        self.model_saver.save(self.sess,
                              path + 'DDPGControllerModel.ckpt',
                              global_step=check_point)
        print("Model saved at " + path + 'model.ckpt')

    def load_model(self, path):
        self.model_saver.restore(self.sess, path)
        print("Model loaded from " + path)
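
An interaction-loop sketch for DDPGController above. It assumes env exposes state_dim and action_dim (as used in __init__) plus a gym-style reset()/step() API; the episode and step limits are illustrative.

# Usage sketch for DDPGController (gym-style env API is an assumption).
EPISODES, MAX_STEPS = 1000, 200

controller = DDPGController(env)
for episode in range(EPISODES):
    state = env.reset()
    for step in range(MAX_STEPS):
        action = controller.noise_action(state)                        # policy action + OU noise
        next_state, reward, done, _ = env.step(action)
        controller.perceive(state, action, reward, next_state, done)   # store, train, reset noise on done
        state = next_state
        if done:
            break
controller.save_model('./checkpoints/', check_point=EPISODES)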