Example #1
    def __init__(self, state_size, action_size, agent_num, random_seed):
        """
        Initialize an Agent object.
        :param state_size (int): dimension of each state
        :param action_size (int): dimension of each action
        :param random_seed (int): random seed
        """

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size, agent_num,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, agent_num,
                                    random_seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale=0.1)
Example #2
    def __init__(self, config):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.config = config
        self.seed = self.config.seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.config.state_size,
                                 self.config.action_size, self.seed).to(device)
        self.actor_target = Actor(self.config.state_size,
                                  self.config.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.config.state_size,
                                   self.config.action_size, self.seed).to(device)
        self.critic_target = Critic(self.config.state_size,
                                    self.config.action_size, self.seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.config.lr_critic)

        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1.0)
        self.soft_update(self.actor_local, self.actor_target, 1.0)

        # Noise process
        self.noise = OUNoise(self.config.action_size, self.seed)
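Example #2 calls self.soft_update(...) with tau=1.0 to hard-copy the local weights into the freshly created target networks, but the method itself is not part of the snippet. A minimal sketch of such an update for PyTorch modules, written here as a standalone function (inside the agent it would be a method), could look as follows; the blending rule is the standard DDPG soft update, while this concrete implementation is an assumption.

def soft_update(local_model, target_model, tau):
    """Blend target parameters toward local parameters (torch.nn.Module args):
    theta_target = tau * theta_local + (1 - tau) * theta_target.
    With tau=1.0, as used for initialization above, this is a hard copy.
    """
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)
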
Example #3
    def __init__(self, task):
        self.task = task

        # For mountain car task
        self.state_size = 2
        self.action_size = 1
        self.action_low = task.action_space.low
        self.action_high = task.action_space.high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.best_w_actor = None
        self.best_w_critic = None
        self.score = 0
Example #4
class DDPGAgent():
    """
    Agent that interacts with and learns from the environment.
    
    """
    def __init__(self, state_size, action_size, agent_num, random_seed):
        """
        Initialize an Agent object.
        :param state_size (int): dimension of each state
        :param action_size (int): dimension of each action
        :param random_seed (int): random seed
        """

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size, agent_num,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, agent_num,
                                    random_seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale=0.1)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor_local(obs) + noise * self.noise.sample()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor_target(obs) + noise * self.noise.sample()
        return action
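DDPGAgent.act() above returns the raw actor output plus scaled OU noise; it neither detaches the tensor nor clips it to the valid action range. A hypothetical call site, not part of the example (env, the [-1, 1] range and the tensor handling are assumptions), might look like this:

import numpy as np
import torch

# Hypothetical usage: `agent` is a DDPGAgent, `env` a gym environment.
obs = torch.as_tensor(env.reset(), dtype=torch.float32)
action = agent.act(obs, noise=1.0)
action = np.clip(action.detach().cpu().numpy(), -1.0, 1.0)  # assumed [-1, 1] range
next_obs, reward, done, info = env.step(action)
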
Example #5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
Example #6
""" Visualization of OU noise """
from OUnoise import OUNoise
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

x = 250
noise = OUNoise(size=(1,), seed=0, mu=0.1, theta=0.7, sigma=0.7)
noise_list_ou = []
noise_list_tf = []
for i in range(x):
    noise_list_ou.append(noise.sample().clip(-0.5, 0.5))
    noise_list_tf.append(tf.random.normal(shape=(1,), stddev=0.3, mean=0.0))

noise_list_ou = np.asarray(noise_list_ou).clip(-0.5, 0.5)
noise_list_tf = np.asarray(noise_list_tf).clip(-0.5, 0.5)

plt.style.use('seaborn')
f, (ax1, ax2) = plt.subplots(1, 2)
ax1.plot(noise_list_tf)
ax2.plot(noise_list_ou)
plt.show()
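The OUnoise module imported above is not shown in these examples. Below is a minimal sketch of an Ornstein-Uhlenbeck noise class compatible with the constructor and sample() calls used in Examples #6, #7 and #9 (size, seed, mu, theta, sigma); the discretized update x <- x + theta*(mu - x) + sigma*N(0, 1) is the usual choice, but the exact formulation of the original module is an assumption. Examples #1 and #4 construct OUNoise with a scale argument instead, which this sketch does not cover.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (minimal sketch, parameter names as above)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean mu."""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the process one step and return the new state."""
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * self.rng.standard_normal(self.mu.shape))
        self.state = self.state + dx
        return self.state
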
Example #7
#                           Parameters:
# =============================================================== #

agent = Agent(
    lr_actor=0.0001,  # Learning rate of actor
    lr_critic=0.0003,  # Learning rate of critic
    num_actions=2,  # Number of actions the agent can perform
    num_states=8,  # Number of state inputs
    gamma=0.99,  # Gamma coefficient / discount factor
    tau=0.001,  # Target network update parameter
    delay_frequency=2,  # Delay rate of actor update
    batch_size=64)  # Batch size for networks / buffer

noise_ = OUNoise(
    size=(1, 2),  # Size of noise output - matches action
    seed=2,  # Seed for noise
    mu=0,  # Parameters of OU-noise
    theta=0.15,
    sigma=0.2)

buffer = Buffer(buffer_size=1000000,
                batch_size=agent.batch_size,
                num_action=agent.num_actions,
                num_states=agent.num_states)

env = gym.make('LunarLanderContinuous-v2')
env.seed(88)
num_episodes = 2500  # Number of episodes to run
tf.random.set_seed(88)  # Seed for TensorFlow randomness (e.g. the noise)
start_timestep = 100  # Number of time steps the agent acts randomly
total_timestep = 0  # Total time step counter
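The snippet above only wires up the objects; the training loop itself is not shown. As a hypothetical illustration of how start_timestep is typically used, the environment can be driven with random actions until the replay buffer holds some transitions. The buffer.add(...) signature below is an assumption, since the Buffer class is not part of the example.

# Hypothetical warm-up phase: sample random actions for the first
# `start_timestep` steps so the replay buffer is filled before learning.
state = env.reset()
for _ in range(start_timestep):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    buffer.add(state, action, reward, next_state, done)  # assumed Buffer API
    state = env.reset() if done else next_state
    total_timestep += 1
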
Example #8
class DDPG_Mountain_Car():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task

        # For mountain car task
        self.state_size = 2
        self.action_size = 1
        self.action_low = task.action_space.low
        self.action_high = task.action_space.high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.best_w_actor = None
        self.best_w_critic = None
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Keep track of score, best score, and best weights
        self.score += reward
        if self.score > self.best_score:
            self.best_score = self.score
            # save_weights() returns None, so keep in-memory copies via
            # get_weights() and write the checkpoints to disk separately
            self.best_w_actor = self.actor_local.model.get_weights()
            self.best_w_critic = self.critic_local.model.get_weights()
            self.actor_local.model.save_weights('actor_weights.h5')
            self.critic_local.model.save_weights('critic_weights.h5')

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters:
        theta_target = tau * theta_local + (1 - tau) * theta_target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        # Blend each weight array individually (avoids building a ragged
        # numpy object array from differently shaped layers)
        new_weights = [self.tau * local + (1 - self.tau) * target
                       for local, target in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
Example #9
""" Visualization of OU noise """
from OUnoise import OUNoise
import numpy as np
import matplotlib.pyplot as plt

x = 250
noise = OUNoise(size=(1, 4), seed=0, mu=0, theta=0.3, sigma=0.4)
noise_list = []
for i in range(x):
    noise_list.append(noise.sample())

noise_list = np.reshape(noise_list, newshape=(x, 4))
# noise_list = noise_list.clip(-1, 1)
# print(np.shape(noise_list))
# print(noise_list[:, 1])
plt.style.use('seaborn')
plt.plot(noise_list[:, 1])
# plt.plot(noise_list[:, 2])
plt.show()