class Christophers_Agent():
    def __init__(self, task):
        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Weights for a simple linear policy (state_space x action_space),
        # scaled so that initial actions fall in a reasonable range.
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),
            scale=(self.action_range / (2 * self.state_size)))

        self.actor = Actor(self.state_size, self.action_size, self.action_low,
                           self.action_high)
        self.critic = Critic(self.state_size, self.action_size)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic.model.get_weights())
        self.actor_target.model.set_weights(self.actor.model.get_weights())

        self.gamma = 0.95
        self.tau = 0.001

        self.best_w = None
        self.best_score = -np.inf

        self.exploration_mu = 0.5
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.4
        self.noise = Noise(self.action_size, self.exploration_mu,
                           self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 32
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.num_steps = 0

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        # Track the best average reward per step seen so far
        score = self.get_score()
        if score > self.best_score:
            self.best_score = score
        self.total_reward = 0.0
        self.num_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.num_steps += 1

        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor.model.predict(state)[0]
        # Add some noise for exploration
        return list(action + self.noise.sample())

    def get_score(self):
        return -np.inf if self.num_steps == 0 else self.total_reward / self.num_steps

    def learn(self, experiences):
        # Unpack the batch of experience tuples
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q-targets for the current states
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Train the local critic on the computed Q-targets
        self.critic.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the local actor using the critic's action gradients
        action_gradients = np.reshape(
            self.critic.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor.train_fn([states, action_gradients, 1])

        # Soft-update the target models
        self.soft_update(self.critic.model, self.critic_target.model)
        self.soft_update(self.actor.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        # Soft update: theta_target = tau * theta_local + (1 - tau) * theta_target
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
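For reference, a minimal episode loop that could drive an agent like the one above is sketched below. It assumes a Udacity-style task object whose step(action) returns (next_state, reward, done); that object is not part of this snippet, so treat this as an illustrative sketch only.

# Illustrative sketch only: `task` is assumed to expose state_size, action_size,
# action_low/high, reset(), and step(action) -> (next_state, reward, done).
agent = Christophers_Agent(task)
num_episodes = 500

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()      # resets the task, the noise process, and counters
    done = False
    while not done:
        action = agent.act(state)      # policy output plus exploration noise
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d} | avg reward per step: {:.3f}".format(
        i_episode, agent.get_score()))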
Example #2
num_steps = env.episode_len
batch_size = 64
std = 0.1
#agent = Agent(state_dim, action_dim, hidden_dim=64, tau=0.001)
noise = Noise(action_dim, mean=0., std=std)
#replay = ReplayMemory(memory_size)

gamma = torch.tensor([0.99], requires_grad=True)
rewards = []
times = []
agent = load_agent(file='pretrained/model_3.0.pth.tar', gamma=gamma)
for episode in range(20):
    state = torch.Tensor([env.reset()])
    episode_reward = 0.
    #std *= 0.9985
    noise.reset(0., std)
    for t in range(num_steps):
        action = agent.select_action(state, noise)
        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
        episode_reward += reward
        #action = torch.Tensor(action)
        mask = torch.Tensor([not done])
        next_state = torch.Tensor([next_state])
        reward = torch.Tensor([reward])
        agent.memory.push(state, action, mask, next_state, reward)
        state = next_state
        if len(agent.memory) > batch_size * 2:
            print("True")
            agent.learn(epochs=2, batch_size=batch_size)
        if done:
            #env.goal_radius -= 2e-4
            break
    rewards.append(episode_reward)
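The loop above assumes a Noise helper constructed as Noise(action_dim, mean, std) and re-seeded each episode via reset(mean, std); its internals are not shown, and how agent.select_action consumes it is project-specific. A minimal Gaussian version matching that interface might look like the following sketch (an assumption, not the original implementation).

import numpy as np

class Noise:
    """ Minimal Gaussian exploration noise (illustrative sketch). """
    def __init__(self, action_dim, mean=0., std=0.1):
        self.action_dim = action_dim
        self.reset(mean, std)

    def reset(self, mean, std):
        # Allows the caller to anneal the noise scale between episodes
        self.mean = mean
        self.std = std

    def sample(self):
        # One noise value per action dimension
        return np.random.normal(self.mean, self.std, self.action_dim)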
Example #3
class Agent():
    """ Class implementation of a so-called "intelligent" agent.
        This agent interacts with and learns from the environment.
        This agent employs the DDPG algorithm to solve this problem.
    """

    # Class-level Actor properties:
    # actor_local = None
    # actor_target = None
    # actor_optimizer = None

    # Class-level Critic properties:
    # critic_local = None
    # critic_target = None
    # critic_optimizer = None

    # Class-level memory variable:
    # memory = None

    def __init__(self, state_size, action_size, seed, add_noise=True):
        """ Initialize an Agent instance.
        
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            add_noise (bool): Toggle for using the stochastic process
        """

        # Set the parameters.
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Setting the Actor network (with the Target Network).
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)

        # Optimize the Actor using Adam.
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Setting the Critic network (with the Target Network).
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)

        # Optimize the Critic using Adam.
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Set up the noise process (one row of noise per parallel agent),
        # or disable it entirely.
        self.noise = Noise((20, action_size), seed) if add_noise else None

        # Use the Replay memory buffer (once per class).
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   device)

        # Initialize the time step (until max NUM_TIME_STEPS is reached).
        # self.t_step = 0

    def step(self, time_step, states, actions, rewards, next_states, dones):
        """ Update the network on each step.
            In other words, save the experience in replay memory,
            and then use random sampling from the buffer to learn.
        """

        # Save experience in replay memory.
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every time step till NUM_TIME_STEPS is reached.
        # if time_step % NUM_TIME_STEPS != 0:
        #     return

        # Save the experience in replay memory, then use random sampling from the buffer to learn.
        self.sample_and_learn()

    def sample_and_learn(self):
        """ For a specified number of agents,
            use random sampling from the buffer to learn.
        """

        # If enough samples are available in memory, get random subset and learn.
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

            # for _ in range(NUM_LEARN_UPDATES):
            #     experiences = Agent.memory.sample()
            #     self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """ Return the actions for a given state as per current policy.
        
        Params
        ======
            state (array_like): Current state
            add_noise (bool): Toggle for using the stochastic process
        """

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # If the stochastic process is enabled, add exploration noise.
        if add_noise and self.noise is not None:
            action += self.noise.sample()

        # Return the action.
        return np.clip(action, -1, 1)

    def reset(self):
        """ Reset the internal state (noise) to the mean (mu).
        """

        if self.noise is not None:
            self.noise.reset()

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples.
            i.e.,
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where
                actor_target(state) -> action, and
                critic_target(state, action) -> Q-value.
        
        Params
        ======
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples
            gamma (float): Discount factor
        """

        # Set the parameters.
        states, actions, rewards, next_states, dones = experiences
        """ Update the Critic.
        """
        # Get the predicted next-state actions and Q-values from the target models.
        # Calculate the pair action/reward for each of the next_states.
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q-targets for the current states, (y_i).
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute the Critic loss.
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        """ Update the Actor.
        """
        # Compute the Actor loss.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss.
        self.actor_optimizer.zero_grad()
        # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        actor_loss.backward()
        self.actor_optimizer.step()
        """ Update the target networks.
        """
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters.
            i.e.,
            θ_target = τ * θ_local + (1 - τ) * θ_target.

        Params
        ======
            local_model (PyTorch model): Weights will be copied from
            target_model (PyTorch model): Weights will be copied to
            tau (float): Interpolation parameter 
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1. - tau) * target_param.data)
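The Agent class above references module-level constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY) and a device object that are defined elsewhere and not shown here. The block below is a plausible sketch using typical DDPG settings, not the original configuration.

import torch

# Illustrative hyperparameters only; the original values are not shown in this snippet.
BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0         # L2 weight decay for the critic optimizer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")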
class PolicySearch_Agent():
    def __init__(self, task):
        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (policy) and Critic (value) models, plus their target copies
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Exploration noise process
        self.mu = 0
        self.theta = 0.2
        self.sigma = 0.005  # random noise
        self.noise = Noise(self.action_size, self.mu, self.theta, self.sigma)

        # Algorithm parameters
        self.gamma = 0.9  # discount factor
        self.tau = 0.1    # soft-update rate
        self.best_score = -np.inf
        self.score = 0

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save the experience, then learn from a random sample once enough is stored
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state

        # Track the best episode score
        self.score += reward
        if done and self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        # Return the action for the given state under the current policy, plus exploration noise
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        # Unpack the batch of experience tuples
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_values_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q-targets for the current states and train the local critic
        Q_values = rewards + self.gamma * Q_values_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_values)

        # Train the local actor using the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update the target models
        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)

    def update(self, local_model, target_model):
        # Soft update: theta_target = tau * theta_local + (1 - tau) * theta_target
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
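Both Keras-based agents above depend on Noise and ReplayBuffer helpers that are not included in this listing. The sketch below matches the interfaces they call (an Ornstein-Uhlenbeck process with reset() and sample(), and a fixed-size buffer with add(), sample(), and __len__()); the internals are assumptions based on common DDPG implementations, not the original code.

import random
from collections import deque, namedtuple

import numpy as np


class Noise:
    """ Ornstein-Uhlenbeck exploration noise (illustrative sketch). """
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the mean
        self.state = np.copy(self.mu)

    def sample(self):
        # Update the internal state and return it as a noise sample
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state


class ReplayBuffer:
    """ Fixed-size buffer of experience tuples (illustrative sketch). """
    Experience = namedtuple(
        "Experience", ["state", "action", "reward", "next_state", "done"])

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.Experience(state, action, reward, next_state, done))

    def sample(self):
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)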