class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """

        :state_size: size of the state vector
        :action_size: size of the action vector
        """

        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001
        self.batch_size = 128
        self.update_every = 1

        # Instances of the policy function or actor and the value function or critic
        # Actor critic with Advantage

        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.seed).to(device)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.seed).to(device)

        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)
        # Actor Optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)

        # Critic Optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the Replay Memory
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001  # Soft update rate for the target network parameters

    # The agent interacts with the environment through the step method
    def step(self, state, action, reward, next_state, done):
        # Add this time step's reward to the total reward
        self.total_reward += reward
        # Increase the count of rewards received in the episode
        self.count += 1
        # Store the experience tuple in the replay buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:

            # Check to see if you have enough to produce a batch
            # and learn from it

            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                # Train the networks using the experiences
                self.learn(experiences)

        # Roll over last state action (not needed)
        # self.last_state = next_state

    # Actor determines what to do based on the policy
    def act(self, state):
        # Given a state return the action recommended by the policy
        # Reshape the state to fit the torch tensor input
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Pass the state to the actor_local model to get the action
        # recommended by the policy for this state.
        # Set the actor_local model to evaluation mode (predict, not train).
        self.actor_local.eval()
        # Disable gradient tracking so this forward pass is not counted in
        # the gradient calculation.
        with torch.no_grad():
            actions = self.actor_local(state)
        # set the model back to training mode
        self.actor_local.train()

        # Because we are exploring we add some noise to the
        # action vector
        return list(actions.cpu().detach().numpy().reshape(self.action_size,) +
                    self.noise.sample())

    # This is the learning logic called when the agent
    # takes a step to learn
    def learn(self, experiences):
        """
        Learning means that the networks parameters needs to be updated
        Using the experineces batch.
        Network learns from experiences not form interaction with the
        environment
        """

        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states, dones.
        # Each member of the tuple is converted into a column vector.
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Now convert the numpy arrays for states, actions and next_states to torch tensors;
        # rewards and dones do not need to be tensors.
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)
        actions = torch.from_numpy(actions).float().unsqueeze(0).to(device)
        next_states = torch.from_numpy(next_states).float().unsqueeze(0).to(
            device)

        # First we pass a batch of next states to the actor so it tells us which actions
        # to execute; we use the actor_target network instead of the actor_local network
        # because of the advantage principle.

        # Set the target network to eval mode because this is not part of the training; this
        # model's weights are altered by a soft update, not by an optimizer.
        self.actor_target.eval()
        with torch.no_grad():
            next_state_actions = self.actor_target(next_states).detach()
        self.actor_target.train()

        # The critic evaluates the actions taken by the actor in the next state and generates the
        # Q(s, a) value of the next state given those actions. These (action, next_state) tuples come from the
        # ReplayBuffer, not from interacting with the environment.
        # Remember that the Critic (Q-value function) takes states and actions as inputs.
        # We calculate the Q-targets of the next state; we will use this to calculate the current
        # state Q-value via the Bellman equation.

        # Set the target network to eval mode because this is not part of the training; this
        # model's weights are altered by a soft update, not by an optimizer.
        self.critic_target.eval()
        with torch.no_grad():
            q_targets_next_state_action_values = self.critic_target(
                next_states, next_state_actions).detach()
        self.critic_target.train()

        # With the next-state Q-values, a vector of action values Q(s, a) for randomly selected
        # next_states from the replay buffer, we calculate the CURRENT state's target Q(s, a)
        # using the one-step TD equation and the q_targets_next value we got from the critic_target net.
        # We make the target Q(s, a) of terminal states 0 and use the Q-targets value for non-terminal states.
        # This is done to train the critic_local model in a supervised-learning fashion; these are the target values.
        q_targets = torch.from_numpy(
            rewards + self.gamma *
            q_targets_next_state_action_values.cpu().numpy() *
            (1 - dones)).float().to(device)

        # --- Optimize the local Critic Model ----#

        # Here we start the supervised training process of the critic_local network:
        # we pass a batch of (state, action) samples and it produces the expected
        # Q-value for each pair we passed.
        q_expected = self.critic_local(states, actions)

        # Clear grad buffer values in preparation.
        self.critic_optimizer.zero_grad()

        # Loss function for the critic_local model: smooth L1 (Huber) loss between
        # the q_expected value and the q_target value.
        critic_loss = F.smooth_l1_loss(q_expected, q_targets)
        critic_loss.backward(retain_graph=True)

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        # optimize the critic_local model using the optimizer defined for the critic
        # In the init function of this class
        self.critic_optimizer.step()

        # --- Optimize the local Actor Model ---#

        # Get the actor actions using the experience buffer states
        actor_actions = self.actor_local(states)

        # Use as the loss the negative sum of the Q-values produced by the optimized critic_local model for the
        # actions of the actor_local model obtained from the states of the sampled buffer.
        loss_actor = -1 * torch.sum(
            self.critic_local.forward(states, actor_actions))

        # Set the model gradients to zero in preparation
        self.actor_optimizer.zero_grad()

        # Back propagate
        loss_actor.backward()

        # optimize the actor_local model using the optimizer defined for the actor
        # In the init function of this class
        self.actor_optimizer.step()

        # Soft-update target models
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
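# The agent above relies on an external OUNoise class with the constructor
# signature OUNoise(size, mu, theta, sigma). Below is a minimal sketch of a
# compatible Ornstein-Uhlenbeck noise process; it is an assumption about the
# missing class, not the original implementation.
import copy
import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Update the internal state and return it as a noise sample.
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state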
Example #2
class ActorCritic(Model):
    def __init__(self,
                 observation_space_size,
                 action_space_size,
                 name=None,
                 env_name=None,
                 model_config=None,
                 play_mode=False):

        if name is None:
            name = "Unnamed-ActorCritic"
        super(ActorCritic,
              self).__init__(observation_space_size, action_space_size, name,
                             env_name, model_config, play_mode)

    def build_model(self):

        self.policy_net = Actor(self.observation_space_size,
                                self.action_space_size)
        self.critic_net = Critic(self.observation_space_size)

        if self.model_config is None:
            self.gamma = 0.99

            self.actor_optimizer = optim.Adam(self.policy_net.parameters())
            self.actor_loss = nn.MSELoss()

            self.critic_optimizer = optim.Adam(self.critic_net.parameters())
            self.critic_loss = nn.MSELoss()

            self.get_epsilon = self.get_epsilon_default
        else:
            pass

    def save_checkpoint(self, n=0, filepath=None):
        """
        n - number of epoch / episode or whatever is used for enumeration
        """

        # TO DO: ADD OTHER RELEVANT PARAMETERS
        checkpoint = {
            'policy': self.policy_net.state_dict(),
            'critic': self.critic_net.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict()
        }
        super(ActorCritic, self).save_checkpoint(n, filepath, checkpoint)

    def load_checkpoint(self, filepath):
        # TO DO: ADD OTHER RELEVANT parameters
        checkpoint = torch.load(filepath)
        self.policy_net.load_state_dict(checkpoint['policy'])
        self.critic_net.load_state_dict(checkpoint['critic'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer'])

    def prepare_sample(self, sample):
        sample = np.array(sample)
        states = torch.tensor(sample[:, 0], dtype=torch.float32)
        actions = torch.tensor(sample[:, 1], dtype=torch.float32)
        rewards = torch.tensor(sample[:, 2], dtype=torch.float32)
        next_states = torch.tensor(sample[:, 3], dtype=torch.float32)
        dones = torch.tensor(sample[:, 4], dtype=torch.int32)

        return states, actions, rewards, next_states, dones

    def critic_update(self, V, V_target):
        self.critic_optimizer.zero_grad()
        critic_loss = self.critic_loss(V, V_target)
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    def actor_update(self, advantages, actions, mus):
        self.actor_optimizer.zero_grad()
        actor_loss = self.actor_loss(actions, mus)
        gradient_term = advantages * actor_loss
        gradient_term.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    def update(self, sample, prepare_state=None):
        actor_running_loss = []
        critic_running_loss = []

        for state, action, reward, next_state, done in sample:
            if prepare_state is not None:
                state = prepare_state(state)
                next_state = prepare_state(next_state)

            state = torch.tensor(state, dtype=torch.float32)
            next_state = torch.tensor(next_state, dtype=torch.float32)
            action = torch.tensor(action, dtype=torch.float32)

            # Update Critic
            V = self.critic_net.forward(state)
            V_target = torch.tensor([reward], dtype=torch.float32)
            if not done:
                V_target += self.gamma * self.critic_net.forward(next_state)

            critic_loss = self.critic_update(V, V_target)
            critic_running_loss.append(critic_loss)

            # Update Actor
            advantage = (V_target - V).detach()
            mu = self.policy_net(state)

            actor_loss = self.actor_update(advantage, action, mu)
            actor_running_loss.append(actor_loss)

        return actor_running_loss, critic_running_loss

    def batch_update(self, sample, prepare_state=None):
        actor_running_loss = []
        critic_running_loss = []

        states, actions, rewards, next_states, dones = self.prepare_sample(
            sample)

        # Update Critic
        V = self.critic_net.forward(states)
        V_target = rewards + self.gamma * self.critic_net.forward(
            next_states) * (1 - dones)

        critic_loss = self.critic_update(V, V_target)
        critic_running_loss.append(critic_loss)

        # Update Actor
        advantage = (V_target - V).detach()
        mu = self.policy_net(states)

        actor_loss = self.actor_update(advantage, actions, mu)
        actor_running_loss.append(actor_loss)

        return actor_running_loss, critic_running_loss
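# A minimal sketch of how the per-transition update() above might be driven.
# Everything below is illustrative: the environment name, the classic Gym-style
# reset()/step() API returning a 4-tuple, and the constructor sizes are
# assumptions, not part of the original example.
import gym

env = gym.make("Pendulum-v1")  # hypothetical environment
model = ActorCritic(observation_space_size=3, action_space_size=1)
model.build_model()

state = env.reset()
transitions, done = [], False
while not done:
    with torch.no_grad():
        mu = model.policy_net(torch.tensor(state, dtype=torch.float32))
    action = mu.numpy()  # deterministic action, just for the sketch
    next_state, reward, done, _ = env.step(action)
    transitions.append((state, action, reward, next_state, done))
    state = next_state

actor_losses, critic_losses = model.update(transitions)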
Example #3
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, ), (0.5, )),
])

dataloader = DataLoader(MNIST('.', download=False, transform=transform),
                        batch_size=batch_size,
                        shuffle=True)
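
# get_input_dimensions() is not shown in this snippet. For a conditional GAN on
# MNIST, a common definition (an assumption about the missing helper, not
# necessarily the original) concatenates a one-hot class vector to both the
# generator's noise input and the critic's image channels:
def get_input_dimensions(z_dim, mnist_shape, n_classes):
    generator_input_dim = z_dim + n_classes              # noise vector + one-hot label
    discriminator_im_chan = mnist_shape[0] + n_classes   # image channels + one label map per class
    return generator_input_dim, discriminator_im_chan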

generator_input_dim, discriminator_im_chan = get_input_dimensions(
    z_dim, mnist_shape, n_classes)

gen = Generator(z_dim=generator_input_dim).to(device)
gen_opt = torch.optim.Adam(gen.parameters(), lr=lr)
disc = Critic(im_chan=discriminator_im_chan).to(device)
disc_opt = torch.optim.Adam(disc.parameters(), lr=lr)

cur_step = 0
generator_losses = []
discriminator_losses = []

noise_and_labels = False
fake = False

fake_image_and_labels = False
real_image_and_labels = False
disc_fake_pred = False
disc_real_pred = False

for epoch in range(n_epochs):
    # Dataloader returns the batches and the labels
Example #4
class DDPG():
    """DDPG agent"""
    def __init__(self, state_size, action_size, params, seed):
        """Initialize a DDPG agent
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            params (Params): hyperparameters 
            seed (int): random seed
        """

        self.gamma = params.gamma
        self.tau = params.tau
        self.seed = np.random.seed(seed)

        # actor networks
        self.actor_local = Actor(state_size, action_size, params.units_actor,
                                 seed).to(device)
        self.actor_target = Actor(state_size, action_size, params.units_actor,
                                  seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          params.lr_actor)

        # critic networks
        self.critic_local = Critic(state_size, action_size,
                                   params.units_critic, seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    params.units_critic, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           params.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, seed, params.mu, params.theta,
                             params.sigma)

    def noise_reset(self):
        self.noise.reset()

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).data.cpu().numpy()
        self.actor_local.train()
        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples
        
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
            
         Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """

        states, actions, rewards, next_states, dones = experiences

        #### Update critic
        # Get predicted next-state actions from actor_target model
        next_actions = self.actor_target(next_states)

        # Get predicted next-state Q-Values from critic_target model
        next_q_targets = self.critic_target(next_states, next_actions)

        # Compute Q targets for current states
        Q_targets = rewards + self.gamma * next_q_targets * (1.0 - dones)

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        predicted_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, predicted_actions).mean()

        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
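# A minimal sketch of how this DDPG agent is usually driven together with a
# replay buffer. The environment, the ReplayBuffer API (add/sample/__len__, with
# sample() returning a tuple of torch tensors), and the hyperparameter values
# below are assumptions for illustration, not part of the original example.
agent = DDPG(state_size=33, action_size=4, params=params, seed=0)  # hypothetical sizes
memory = ReplayBuffer(buffer_size=int(1e6), batch_size=128)        # hypothetical buffer

for episode in range(1000):
    state = env.reset()
    agent.noise_reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)
        if len(memory) >= 128:
            # learn() expects (states, actions, rewards, next_states, dones) tensors
            agent.learn(memory.sample())
        state = next_state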
class TD3(object):
    """Agent class that handles the training of the networks
    and provides outputs as actions.
    """

    def __init__(self):
        state_dim = cons.STATE_DIM.flatten().shape[0]
        action_dim = cons.ACTION_DIM
        self.actor = Actor(state_dim, action_dim, cons.MAX_ACTION).to(cons.DEVICE)
        self.actor_target = Actor(state_dim,  action_dim, cons.MAX_ACTION).to(cons.DEVICE)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)  # or 1e-3

        self.critic = Critic(state_dim,  action_dim).to(cons.DEVICE)
        self.critic_target = Critic(state_dim,  action_dim).to(cons.DEVICE)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)  # or 1e-3

        self.total_it = 0
        self.critic_loss_plot = []
        self.actor_loss_plot = []

    def select_action(self, state, noise=cons.POLICY_NOISE):
        """Select an appropriate action from the agent policy
            Args:
                state (array): current state of environment
                noise (float): how much noise to add to actions
            Returns:
                action (list): nn action results
        """
        state = torch.FloatTensor(state).to(cons.DEVICE)
        action = self.actor(state)
        # action space noise introduces noise to change the likelihoods of each action the agent might take
        if noise != 0:
            # create a tensor of clipped Gaussian noise matching the action dimension
            noise = torch.clamp(torch.randn(cons.ACTION_DIM, dtype=torch.float32, device=cons.DEVICE) * noise,
                                min=-cons.NOISE_CLIP, max=cons.NOISE_CLIP)
        action = action + noise
        action = torch.clamp(action, min=cons.MIN_ACTION, max=cons.MAX_ACTION)
        return action

    def train(self, replay_buffer, iterations):
        """Train and update actor and critic networks
            Args:
                replay_buffer (ReplayBuffer): buffer for experience replay
                iterations (int): how many times to run training
            Return:
                actor_loss (float): loss from actor network
                critic_loss (float): loss from critic network
        """
        for it in range(iterations):
            self.total_it += 1  # keep track of the total training iterations
            # Sample replay buffer (priority replay)
            # choose type of replay
            if cons.PRIORITY:
                state, action, reward, next_state, done, weights, indexes = replay_buffer.sample(cons.BATCH_SIZE,
                                                                                     beta=cons.BETA_SCHED.value(it))
            else:
                state, action, reward, next_state, done = replay_buffer.sample(cons.BATCH_SIZE)

            state = torch.from_numpy(state).float().to(cons.DEVICE)                 # torch.Size([100, 14])
            next_state = torch.from_numpy(next_state).float().to(cons.DEVICE)       # torch.Size([100, 14])
            action = torch.from_numpy(action).float().to(cons.DEVICE)               # torch.Size([100, 14])
            reward = torch.as_tensor(reward, dtype=torch.float32).to(cons.DEVICE)   # torch.Size([100])
            done = torch.as_tensor(done, dtype=torch.float32).to(cons.DEVICE)       # torch.Size([100])

            with torch.no_grad():
                # select an action according to the policy and add clipped noise
                next_action = self.actor_target(next_state)
                noise = torch.clamp(torch.randn((cons.BATCH_SIZE, cons.ACTION_DIM), dtype=torch.float32,
                                                device=cons.DEVICE) * cons.POLICY_NOISE,
                                    min=-cons.NOISE_CLIP, max=cons.NOISE_CLIP)
                next_action = torch.clamp((next_action + noise), min=cons.MIN_ACTION, max=cons.MAX_ACTION)

                # Compute the target Q value
                target_q1, target_q2 = self.critic_target(next_state.float(), next_action.float())
                target_q = torch.min(target_q1, target_q2)
                gamma = torch.full((cons.BATCH_SIZE, 1), cons.GAMMA, dtype=torch.float32, device=cons.DEVICE)
                target_q = reward.unsqueeze(1) + (done.unsqueeze(1) * gamma * target_q).detach()

            # get current Q estimates
            current_q1, current_q2 = self.critic(state.float(), action.float())

            # compute critic loss
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
            cons.TD3_REPORT.write_critic_loss(self.total_it, it, critic_loss)

            # optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # using the minimum of the q values as the weight, use min to prevent overestimation
            if cons.PRIORITY:
                new_priorities = torch.flatten(torch.min(current_q1, current_q2))
                # convert any negative priorities to a minimum value, can't have a negative priority
                new_priorities = torch.clamp(new_priorities, min=0.0000001).tolist()  # convert to a list for storage
                replay_buffer.update_priorities(indexes, new_priorities)

            # delayed policy updates
            if it % cons.POLICY_FREQ == 0:  # update the actor policy less frequently

                # compute the actor loss
                q_action = self.actor(state).float().detach()
                actor_loss = -self.critic.get_q(state, q_action).mean()
                cons.TD3_REPORT.write_actor_loss(self.total_it, it, actor_loss, 1)

                # optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()
                self.actor_loss_plot.append(actor_loss.item())

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(cons.TAU * param.data + (1 - cons.TAU) * target_param.data)

                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(cons.TAU * param.data + (1 - cons.TAU) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename="best_avg", directory="td3/saves/shared_agent"):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
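# The train() loop above expects replay_buffer.sample(batch_size) to return
# numpy arrays for state, action, reward, next_state and done. The sketch below
# is a minimal uniform (non-priority) buffer compatible with that call; it is an
# assumption about the external class, not the project's actual ReplayBuffer.
import random
from collections import deque

import numpy as np


class UniformReplayBufferSketch:
    def __init__(self, capacity=int(1e6)):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        # Store one transition.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a batch and return column-wise numpy arrays.
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)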
Example #6
# training config
MAX_EPISODE = 450
Actor_lr = 1e-3
Critic_lr = 1e-3

# problem setting
grid = Grid()
grid.draw_board()
state_dim = 2
action_dim = 4

# init models
actor = Actor(input_dim=state_dim, output_dim=action_dim)
critic = Critic(input_dim=state_dim)
actor_opt = AdamW(actor.parameters(), lr=Actor_lr)
critic_opt = AdamW(critic.parameters(), lr=Critic_lr)

# init loss
a_loss = Actor_Loss()
c_loss = Critic_Loss()

for i_episode in range(MAX_EPISODE):
    s = grid.reset()
    t = 0
    total_action = []
    done = False
    while not done and t < 200:
        # step 1
        s = torch.Tensor(s)
        pai = actor(s[None, :])
        # step 2
Example #7
class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """

        :state_size: size of the state vector
        :action_size: size of the action vector
        """

        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001

        # Instances of the policy function or actor and the value function or critic
        # Actor critic with Advantage

        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.seed).to(device)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.seed).to(device)

        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)
        # Actor Optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)

        # Critic Optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001  # Soft update rate for the target network parameters

    # Actor determines what to do based on the policy
    def act_local(self, state):
        # Given a state return the action recommended by the policy actor_local
        # Reshape the state to fit the torch tensor input
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Pass the state to the actor_local model to get the action
        # recommended by the policy for this state.
        # Set the actor_local model to evaluation mode (predict, not train).
        self.actor_local.eval()
        # Disable gradient tracking so this forward pass is not counted in
        # the gradient calculation.
        with torch.no_grad():
            actions = self.actor_local(state)
        # set the model back to training mode
        self.actor_local.train()

        # Return actions tensor
        return actions.detach()

    def act_target(self, states):
        # Pass the states to the actor_target model to get the actions
        # recommended by the target policy.
        # Set the actor_target model to evaluation mode (predict, not train).
        self.actor_target.eval()
        # Disable gradient tracking so this operation is not counted in
        # the gradient calculation.
        with torch.no_grad():
            actions = self.actor_target(states)
        # set the model back to training mode
        self.actor_target.train()

        # Return actions tensor
        return actions.detach()

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
Example #8
File: trainer.py  Project: HSShin0/rl
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    elif args.cuda == 'cpu':
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.cuda))
    print('Use {}'.format(device))
    time.sleep(0.5)

    actor = Actor(env, actor_params)
    critic = Critic(critic_params, q=True)  # Q-function
    actor.to(device)
    critic.to(device)

    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=actor.params['learning_rate'])
    critic_optim = torch.optim.Adam(critic.parameters(),
                                    lr=critic.params['learning_rate'])

    # create the actor-critic agent
    agent_params.update(params)
    memory = ReplayBuffer(CAPACITY)
    agent = Agent(env, actor, critic, memory, actor_optim, critic_optim,
                  agent_params, device)

    # Initialize or load the model
    if not LOAD:
        agent.actor._initialize()
        agent.critic._initialize()
    else:
        agent.load(LOAD)
class MAMLFewShotClassifier(nn.Module):
    def __init__(self, im_shape, device, args):
        """
        Initializes a MAML few shot learning system
        :param im_shape: The images input size, in batch, c, h, w shape
        :param device: The device to run the model on.
        :param args: A namedtuple of arguments specifying various hyperparameters.
        """
        super(MAMLFewShotClassifier, self).__init__()
        self.args = args
        self.device = device
        self.batch_size = args.batch_size
        self.use_cuda = args.use_cuda
        self.im_shape = im_shape
        self.current_epoch = 0

        self.rng = set_torch_seed(seed=args.seed)
        if args.high_end:
            self.embedding = HighEndEmbedding(device, args, 3).to(device)
            self.classifier = HighEndClassifier(
                device, args, self.embedding.n_out_channels).to(device)
        else:
            self.classifier = VGGReLUNormNetwork(
                im_shape=self.im_shape,
                num_output_classes=self.args.num_classes_per_set,
                args=args,
                device=device,
                meta_classifier=True).to(device=self.device)
        self.task_learning_rate = args.task_learning_rate

        self.inner_loop_optimizer = LSLRGradientDescentLearningRule(
            device=device,
            init_learning_rate=self.task_learning_rate,
            total_num_inner_loop_steps=self.args.
            number_of_training_steps_per_iter + self.args.num_critic_updates,
            use_learnable_learning_rates=self.args.
            learnable_per_layer_per_step_inner_loop_learning_rate)
        self.inner_loop_optimizer.initialise(
            names_weights_dict=self.get_inner_loop_parameter_dict(
                params=self.classifier.named_parameters()))

        print("Inner Loop parameters")
        for key, value in self.inner_loop_optimizer.named_parameters():
            print(key, value.shape)

        if args.use_critic:
            print(
                sum([
                    reduce(mul, p.size(), 1) for p in list(
                        self.get_inner_loop_parameter_dict(
                            self.classifier.named_parameters()).values())
                ]))
            self.critic = Critic(n_theta=sum([
                reduce(mul, p.size(), 1) for p in list(
                    self.get_inner_loop_parameter_dict(
                        self.classifier.named_parameters()).values())
            ]))

        self.use_cuda = args.use_cuda
        self.device = device
        self.args = args
        self.to(device)
        print("Outer Loop parameters")
        for name, param in self.named_parameters():
            if param.requires_grad:
                print(name, param.shape, param.device, param.requires_grad)

        if args.high_end:
            self.optimizer = optim.SGD(self.trainable_parameters(), lr=1e-4)
        else:
            self.optimizer = optim.Adam(self.trainable_parameters(),
                                        lr=args.meta_learning_rate,
                                        amsgrad=False)
        if args.use_critic:
            self.critic_optimizer = optim.SGD(self.critic.parameters(),
                                              lr=1e-6)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer=self.optimizer,
            T_max=self.args.total_epochs,
            eta_min=self.args.min_learning_rate)

    def get_per_step_loss_importance_vector(self):
        """
        Generates a tensor of dimensionality (num_inner_loop_steps) indicating the importance of each step's target
        loss towards the optimization loss.
        :return: A tensor to be used to compute the weighted average of the loss, useful for
        the MSL (Multi Step Loss) mechanism.
        """
        loss_weights = np.ones(
            shape=(self.args.number_of_training_steps_per_iter)) * (
                1.0 / self.args.number_of_training_steps_per_iter)
        decay_rate = 1.0 / self.args.number_of_training_steps_per_iter / self.args.multi_step_loss_num_epochs
        min_value_for_non_final_losses = 0.03 / self.args.number_of_training_steps_per_iter
        for i in range(len(loss_weights) - 1):
            curr_value = np.maximum(
                loss_weights[i] - (self.current_epoch * decay_rate),
                min_value_for_non_final_losses)
            loss_weights[i] = curr_value

        curr_value = np.minimum(
            loss_weights[-1] +
            (self.current_epoch *
             (self.args.number_of_training_steps_per_iter - 1) * decay_rate),
            1.0 - ((self.args.number_of_training_steps_per_iter - 1) *
                   min_value_for_non_final_losses))
        loss_weights[-1] = curr_value
        loss_weights = torch.Tensor(loss_weights).to(device=self.device)
        return loss_weights

    def get_inner_loop_parameter_dict(self, params):
        """
        Returns a dictionary with the parameters to use for inner loop updates.
        :param params: A dictionary of the network's parameters.
        :return: A dictionary of the parameters to use for the inner loop optimization process.
        """
        param_dict = dict()
        for name, param in params:
            if param.requires_grad:
                if self.args.enable_inner_loop_optimizable_bn_params:
                    param_dict[name] = param.to(device=self.device)
                else:
                    if "norm_layer" not in name:
                        param_dict[name] = param.to(device=self.device)

        return param_dict

    def apply_inner_loop_update(self, loss, names_weights_copy,
                                use_second_order, current_step_idx):
        """
        Applies an inner loop update given current step's loss, the weights to update, a flag indicating whether to use
        second order derivatives and the current step's index.
        :param loss: Current step's loss with respect to the support set.
        :param names_weights_copy: A dictionary with names to parameters to update.
        :param use_second_order: A boolean flag of whether to use second order derivatives.
        :param current_step_idx: Current step's index.
        :return: A dictionary with the updated weights (name, param)
        """
        self.classifier.zero_grad(names_weights_copy)

        grads = torch.autograd.grad(loss,
                                    names_weights_copy.values(),
                                    create_graph=use_second_order)
        names_grads_wrt_params = dict(zip(names_weights_copy.keys(), grads))

        names_weights_copy = self.inner_loop_optimizer.update_params(
            names_weights_dict=names_weights_copy,
            names_grads_wrt_params_dict=names_grads_wrt_params,
            num_step=current_step_idx)

        return names_weights_copy

    def get_across_task_loss_metrics(self, total_losses, total_accuracies):
        losses = dict()

        losses['loss'] = torch.mean(torch.stack(total_losses))
        losses['accuracy'] = np.mean(total_accuracies)

        return losses

    def forward(self, data_batch, epoch, use_second_order,
                use_multi_step_loss_optimization, num_steps, training_phase):
        """
        Runs a forward outer loop pass on the batch of tasks using the MAML/++ framework.
        :param data_batch: A data batch containing the support and target sets.
        :param epoch: Current epoch's index
        :param use_second_order: A boolean saying whether to use second order derivatives.
        :param use_multi_step_loss_optimization: Whether to use the multi step loss, which improves the stability of
        the system (True), or to optimize the outer loop using just the last step's target loss (False)
        :param num_steps: Number of inner loop steps.
        :param training_phase: Whether this is a training phase (True) or an evaluation phase (False)
        :return: A dictionary with the collected losses of the current outer forward propagation.
        """
        x_support_set, x_target_set, y_support_set, y_target_set = data_batch

        [b, ncs, spc] = y_support_set.shape

        self.num_classes_per_set = ncs

        total_losses = []
        total_accuracies = []
        per_task_target_preds = [[] for i in range(len(x_target_set))]
        if self.args.high_end:
            self.embedding.zero_grad()
        self.classifier.zero_grad()
        for task_id, (x_support_set_task, y_support_set_task, x_target_set_task, y_target_set_task) in \
                enumerate(zip(x_support_set,
                              y_support_set,
                              x_target_set,
                              y_target_set)):
            task_losses = []
            task_accuracies = []
            per_step_loss_importance_vectors = self.get_per_step_loss_importance_vector(
            )

            # this is theta_0
            names_weights_copy = self.get_inner_loop_parameter_dict(
                self.classifier.named_parameters())

            n, s, c, h, w = x_target_set_task.shape

            x_support_set_task = x_support_set_task.view(-1, c, h, w)
            y_support_set_task = y_support_set_task.view(-1)
            x_target_set_task = x_target_set_task.view(-1, c, h, w)
            y_target_set_task = y_target_set_task.view(-1)

            # Inner loop starts
            if self.args.high_end:
                x_support_set_task = self.embedding(x_support_set_task,
                                                    0,
                                                    training=training_phase)
                x_target_set_task = self.embedding(x_target_set_task,
                                                   0,
                                                   training=training_phase)
            for num_step in range(num_steps):

                # operates on the support set
                support_loss, support_preds = self.net_forward(
                    x=x_support_set_task,
                    y=y_support_set_task,
                    weights=names_weights_copy,
                    backup_running_statistics=True if
                    (num_step == 0) else False,
                    training=True,
                    num_step=num_step)

                # this is update of theta from the copy of current theta_0 and onward
                # i.e. inner loop optimization wrt support set
                names_weights_copy = self.apply_inner_loop_update(
                    loss=support_loss,
                    names_weights_copy=names_weights_copy,
                    use_second_order=use_second_order,
                    current_step_idx=num_step)

                # TODO: inner loop OPTIMIZATION wrt target set???
                if use_multi_step_loss_optimization and training_phase and epoch < self.args.multi_step_loss_num_epochs:
                    # this is MAML++ way
                    target_loss, target_preds = self.net_forward(
                        x=x_target_set_task,
                        y=y_target_set_task,
                        weights=names_weights_copy,
                        backup_running_statistics=False,
                        training=True,
                        num_step=num_step)

                    task_losses.append(
                        per_step_loss_importance_vectors[num_step] *
                        target_loss)
                else:
                    if num_step == (
                            self.args.number_of_training_steps_per_iter - 1):
                        target_loss, target_preds = self.net_forward(
                            x=x_target_set_task,
                            y=y_target_set_task,
                            weights=names_weights_copy,
                            backup_running_statistics=False,
                            training=True,
                            num_step=num_step)
                        task_losses.append(target_loss)

            if self.args.use_critic:
                for i in range(self.args.num_critic_updates):
                    # TODO: here must be an update using the Critic (start without g)
                    # F = {f(x^b_T, θ_{N+j}), θ_{N+j}, g(xS, xn)}
                    # θ_{N+j+1} = θ_{N+j} − \gamma * \nabla_{θ_{N+j}} C(F,W)
                    critic_loss, target_preds = self.net_forward_critic(
                        x=x_target_set_task,
                        y=y_target_set_task,
                        weights=names_weights_copy,
                        backup_running_statistics=False,
                        training=True,
                        num_step=num_step + i)

                    names_weights_copy = self.apply_inner_loop_update(
                        loss=critic_loss,
                        names_weights_copy=names_weights_copy,
                        use_second_order=use_second_order,
                        current_step_idx=num_step + i)

                target_loss, target_preds = self.net_forward(
                    x=x_target_set_task,
                    y=y_target_set_task,
                    weights=names_weights_copy,
                    backup_running_statistics=False,
                    training=True,
                    num_step=num_step)
                task_losses.append(target_loss)

            per_task_target_preds[task_id] = target_preds.detach().cpu().numpy(
            )
            _, predicted = torch.max(target_preds.data, 1)

            accuracy = predicted.float().eq(
                y_target_set_task.data.float()).cpu().float()
            task_losses = torch.sum(torch.stack(task_losses))
            total_losses.append(task_losses)
            total_accuracies.extend(accuracy)

            if not training_phase:
                self.classifier.restore_backup_stats()

        losses = self.get_across_task_loss_metrics(
            total_losses=total_losses, total_accuracies=total_accuracies)

        for idx, item in enumerate(per_step_loss_importance_vectors):
            losses['loss_importance_vector_{}'.format(
                idx)] = item.detach().cpu().numpy()

        return losses, per_task_target_preds

    def net_forward(self, x, y, weights, backup_running_statistics, training,
                    num_step):
        """
        A base model forward pass on some data points x. Using the parameters in the weights dictionary. Also requires
        boolean flags indicating whether to reset the running statistics at the end of the run (if at evaluation phase).
        A flag indicating whether this is the training session and an int indicating the current step's number in the
        inner loop.
        :param x: A data batch of shape b, c, h, w
        :param y: A data targets batch of shape b, n_classes
        :param weights: A dictionary containing the weights to pass to the network.
        :param backup_running_statistics: A flag indicating whether to reset the batch norm running statistics to their
         previous values after the run (only for evaluation)
        :param training: A flag indicating whether the current process phase is a training or evaluation.
        :param num_step: An integer indicating the number of the step in the inner loop.
        :return: the crossentropy losses with respect to the given y, the predictions of the base model.
        """
        preds = self.classifier.forward(
            x=x,
            params=weights,
            training=training,
            backup_running_statistics=backup_running_statistics,
            num_step=num_step)

        loss = F.cross_entropy(input=preds, target=y)

        return loss, preds

    def net_forward_critic(self, x, y, weights, backup_running_statistics,
                           training, num_step):
        """
        A base model forward pass on some data points x. Using the parameters in the weights dictionary. Also requires
        boolean flags indicating whether to reset the running statistics at the end of the run (if at evaluation phase).
        A flag indicating whether this is the training session and an int indicating the current step's number in the
        inner loop.
        :param x: A data batch of shape b, c, h, w
        :param y: A data targets batch of shape b, n_classes
        :param weights: A dictionary containing the weights to pass to the network.
        :param backup_running_statistics: A flag indicating whether to reset the batch norm running statistics to their
         previous values after the run (only for evaluation)
        :param training: A flag indicating whether the current process phase is a training or evaluation.
        :param num_step: An integer indicating the number of the step in the inner loop.
        :return: the crossentropy losses with respect to the given y, the predictions of the base model.
        """
        preds = self.classifier.forward(
            x=x,
            params=weights,
            training=training,
            backup_running_statistics=backup_running_statistics,
            num_step=num_step)
        print(weights.keys())
        params1d = torch.cat(
            [torch.reshape(p, (1, -1)) for p in list(weights.values())], dim=1)
        print(params1d.shape)

        loss = self.critic(preds, params1d)

        return loss, preds

    def trainable_parameters(self):
        """
        Returns an iterator over the trainable parameters of the model.
        """
        for param in self.parameters():
            if param.requires_grad:
                yield param

    def train_forward_prop(self, data_batch, epoch):
        """
        Runs an outer loop forward prop using the meta-model and base-model.
        :param data_batch: A data batch containing the support set and the target set input, output pairs.
        :param epoch: The index of the current epoch.
        :return: A dictionary of losses for the current step.
        """
        losses, per_task_target_preds = self.forward(
            data_batch=data_batch,
            epoch=epoch,
            use_second_order=self.args.second_order
            and epoch > self.args.first_order_to_second_order_epoch,
            use_multi_step_loss_optimization=self.args.
            use_multi_step_loss_optimization,
            num_steps=self.args.number_of_training_steps_per_iter,
            training_phase=True)
        return losses, per_task_target_preds

    def evaluation_forward_prop(self, data_batch, epoch):
        """
        Runs an outer loop evaluation forward prop using the meta-model and base-model.
        :param data_batch: A data batch containing the support set and the target set input, output pairs.
        :param epoch: The index of the current epoch.
        :return: A dictionary of losses for the current step.
        """
        losses, per_task_target_preds = self.forward(
            data_batch=data_batch,
            epoch=epoch,
            use_second_order=False,
            use_multi_step_loss_optimization=True,
            num_steps=self.args.number_of_evaluation_steps_per_iter,
            training_phase=False)

        return losses, per_task_target_preds

    def meta_update(self, loss):
        """
        Applies an outer loop update on the meta-parameters of the model.
        :param loss: The current crossentropy loss.
        """
        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        if 'imagenet' in self.args.dataset_name:
            for name, param in self.classifier.named_parameters():
                if param.requires_grad:
                    param.grad.data.clamp_(
                        -10, 10
                    )  # not sure if this is necessary, more experiments are needed
            if self.args.high_end:
                for name, param in self.embedding.named_parameters():
                    if param.requires_grad:
                        param.grad.data.clamp_(
                            -10, 10
                        )  # not sure if this is necessary, more experiments are needed
        self.optimizer.step()

    def critic_meta_update(self, loss):
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

    def run_train_iter(self, data_batch, epoch):
        """
        Runs an outer loop update step on the meta-model's parameters.
        :param data_batch: input data batch containing the support set and target set input, output pairs
        :param epoch: the index of the current epoch
        :return: The losses of the iteration that was run.
        """
        epoch = int(epoch)
        self.scheduler.step(epoch=epoch)
        if self.current_epoch != epoch:
            self.current_epoch = epoch

        if not self.training:
            self.train()

        x_support_set, x_target_set, y_support_set, y_target_set = data_batch

        x_support_set = torch.Tensor(x_support_set).float().to(
            device=self.device)
        x_target_set = torch.Tensor(x_target_set).float().to(
            device=self.device)
        y_support_set = torch.Tensor(y_support_set).long().to(
            device=self.device)
        y_target_set = torch.Tensor(y_target_set).long().to(device=self.device)

        data_batch = (x_support_set, x_target_set, y_support_set, y_target_set)

        losses, per_task_target_preds = self.train_forward_prop(
            data_batch=data_batch, epoch=epoch)

        self.meta_update(loss=losses['loss'])
        if self.args.use_critic:
            self.critic_meta_update(loss=losses['loss'])
        losses['learning_rate'] = self.scheduler.get_lr()[0]
        self.optimizer.zero_grad()
        self.zero_grad()

        return losses, per_task_target_preds

    def run_validation_iter(self, data_batch):
        """
        Runs an outer loop evaluation step on the meta-model's parameters.
        :param data_batch: input data batch containing the support set and target set input, output pairs
        :param epoch: the index of the current epoch
        :return: The losses of the iteration that was run.
        """

        if self.training:
            self.eval()

        x_support_set, x_target_set, y_support_set, y_target_set = data_batch

        x_support_set = torch.Tensor(x_support_set).float().to(
            device=self.device)
        x_target_set = torch.Tensor(x_target_set).float().to(
            device=self.device)
        y_support_set = torch.Tensor(y_support_set).long().to(
            device=self.device)
        y_target_set = torch.Tensor(y_target_set).long().to(device=self.device)

        data_batch = (x_support_set, x_target_set, y_support_set, y_target_set)

        losses, per_task_target_preds = self.evaluation_forward_prop(
            data_batch=data_batch, epoch=self.current_epoch)

        # losses['loss'].backward() # uncomment if you get the weird memory error
        # self.zero_grad()
        # self.optimizer.zero_grad()

        return losses, per_task_target_preds

    def save_model(self, model_save_dir, state):
        """
        Save the network parameter state and experiment state dictionary.
        :param model_save_dir: The directory to store the state at.
        :param state: The state containing the experiment state and the network. It's in the form of a dictionary
        object.
        """
        state['network'] = self.state_dict()
        torch.save(state, f=model_save_dir)

    def load_model(self, model_save_dir, model_name, model_idx):
        """
        Load checkpoint and return the state dictionary containing the network state params and experiment state.
        :param model_save_dir: The directory from which to load the files.
        :param model_name: The model_name to be loaded from the directory.
        :param model_idx: The index of the model (i.e. epoch number or 'latest' for the latest saved model of the current
        experiment)
        :return: A dictionary containing the experiment state and the saved model parameters.
        """
        filepath = os.path.join(model_save_dir,
                                "{}_{}".format(model_name, model_idx))
        state = torch.load(filepath)
        state_dict_loaded = state['network']
        self.load_state_dict(state_dict=state_dict_loaded)
        return state
Example #10
class TD3(object):
    """Agent class that handles the training of the networks
    and provides outputs as actions.

    Args:
        state_dim (array): state size
        action_dim (array): action size
        max_action (array): action range bounds used to clip the actions
        discount (float): discount factor
        tau (float): soft update rate for the target networks
        policy_noise (float): std of the noise added to the target policy's actions
        noise_clip (float): clipping range for the target policy noise
        policy_freq (int): frequency of delayed policy updates
        device (device): cuda or cpu to process the tensors

    """
    def __init__(self, state_dim, action_dim, max_action, discount, tau,
                 policy_noise, noise_clip, policy_freq, device):

        self.state_dim = len(state_dim[0])
        self.action_dim = len(action_dim)
        self.max_action = max_action[2]
        self.actor = Actor(self.state_dim, self.action_dim,
                           self.max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor).float()
        # self.actor_target = Actor(state_dim, action_dim, self.max_action).to(device)
        # self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=3e-4)  # or 1e-3

        self.critic = Critic(self.state_dim, self.action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic).float()
        # self.critic_target = Critic(state_dim, action_dim).to(device)
        # self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=3e-4)  # or 1e-2

        self.device = device
        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.total_it = 0

    def select_action(self, state):
        """Select an appropriate action from the agent policy
            Args:
                state (array): current state of environment

            Returns:
                action (float): action clipped within action range
        """

        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)

        #  if noise != 0:
        #      action_dim = len(self.env.action_space())
        #      action = (action + np.random.normal(0, noise, size=action_dim))
        #  action_space_low, _, action_space_high = self.env.action_domain()
        #  return action.clip(action_space_low, action_space_high)

        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=100):
        """Train and update actor and critic networks
            Args:
                replay_buffer (ReplayBuffer): buffer for experience replay
                batch_size(int): batch size to sample from replay buffer
            Return:
                actor_loss (float): loss from actor network
                critic_loss (float): loss from critic network
        """
        self.total_it += 1
        # Sample replay buffer
        state, next_state, action, reward, done = replay_buffer.sample(
            batch_size)

        state = torch.from_numpy(
            np.asarray([np.array(i.item().values()) for i in state]))

        next_state = np.asarray(
            [np.array(i.item().values()) for i in next_state])
        reward = torch.as_tensor(reward, dtype=torch.float32)
        done = torch.as_tensor(done, dtype=torch.float32)

        with torch.no_grad():
            # select an action according to the target policy and add clipped noise
            # need to select set of actions
            noise = (torch.randn_like(torch.as_tensor(action, dtype=torch.float32))
                     * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)

            next_state = torch.tensor(next_state, dtype=torch.float32)
            next_action = (self.actor_target(next_state) + noise).clamp(
                self.max_action[0], self.max_action[2])
            # next_action_d = torch.as_tensor(next_action, dtype=torch.double)
            # Compute the target Q value using the target critic networks
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + done * self.discount * target_Q

        # convert the actions to a float tensor (kept as a NumPy array until now)
        action = torch.as_tensor(action, dtype=torch.float32)

        # get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q[:1, :].transpose(
            0, 1)) + F.mse_loss(current_Q2, target_Q[:1, :].transpose(0, 1))

        # optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # compute the actor loss
            actor_loss = -self.critic.get_q(state, self.actor(state)).mean()

            # optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(),
                   '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(),
                   '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename="best_avg", directory="./saves"):
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename)))
Example #11
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM
                              )  # set decay rate based on epsilon end target
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Construct action prediction vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
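
The Agent above refers to module-level hyperparameters (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, LEARN_EVERY, LEARN_NUM, EPS_START, EPS_EP_END, EPS_FINAL) that are defined outside this snippet. A plausible set of definitions is sketched below; the values are common defaults for this kind of DDPG setup, not necessarily the original author's settings.

# Assumed hyperparameters -- illustrative values only
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation parameter
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
LEARN_EVERY = 1          # learn every N time steps
LEARN_NUM = 5            # learning passes per learning step
EPS_START = 1.0          # initial scale of the exploration noise
EPS_EP_END = 300         # episode index by which the noise scale should reach its floor
EPS_FINAL = 0.0          # final scale of the exploration noise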
Example #12
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
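
Both DDPG agents above construct an OUNoise exploration process whose class is not included in this snippet. Below is a minimal sketch assuming the usual Ornstein-Uhlenbeck formulation and the constructor signature used above (size, seed); the mu, theta, and sigma defaults are common choices rather than the original values.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (illustrative sketch)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)  # seed kept for interface parity
        self.reset()

    def reset(self):
        # Reset the internal state to the long-running mean mu.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) \
            + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state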
Example #13
File: td3.py  Project: TomoyaAkiyama/TD3
class TD3:
    def __init__(self,
                 device,
                 state_dim,
                 action_dim,
                 action_max,
                 gamma=0.99,
                 tau=0.005,
                 lr=3e-4,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 exploration_noise=0.1,
                 policy_freq=2):

        self.actor = Actor(state_dim, 256, action_dim, action_max).to(device)
        self.target_actor = copy.deepcopy(self.actor)
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(),
                                          lr=lr)
        self.critic = Critic(state_dim, 256, action_dim).to(device)
        self.target_critic = copy.deepcopy(self.critic)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(),
                                           lr=lr)

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.rollout_actor = TD3RolloutActor(state_dim, action_dim, action_max,
                                             exploration_noise)
        self.sync_rollout_actor()

        self.iteration_num = 0

    def train(self, replay_buffer, batch_size=256):
        self.iteration_num += 1

        st, nx_st, ac, rw, mask = replay_buffer.sample(batch_size)
        with torch.no_grad():
            noise = (torch.randn_like(ac) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            nx_ac = self.target_actor.forward(nx_st, noise)

            target_q1, target_q2 = self.target_critic.forward(nx_st, nx_ac)
            min_q = torch.min(target_q1, target_q2)
            target_q = rw + mask * self.gamma * min_q

        q1, q2 = self.critic.forward(st, ac)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)
        self.critic.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        if self.iteration_num % self.policy_freq == 0:
            actor_loss = -self.critic.q1(st, self.actor.forward(st)).mean()
            self.actor.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param, target_param in zip(self.critic.parameters(),
                                           self.target_critic.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.target_actor.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        self.sync_rollout_actor()

    def sync_rollout_actor(self):
        for param, target_param in zip(self.actor.parameters(),
                                       self.rollout_actor.parameters()):
            target_param.data.copy_(param.data.cpu())

    def save(self, path):
        torch.save(self.critic.state_dict(), os.path.join(path, 'critic.pth'))
        torch.save(self.target_critic.state_dict(),
                   os.path.join(path, 'target_critic.pth'))
        torch.save(self.critic_optimizer.state_dict(),
                   os.path.join(path, 'critic_optimizer.pth'))

        torch.save(self.actor.state_dict(), os.path.join(path, 'actor.pth'))
        torch.save(self.target_actor.state_dict(),
                   os.path.join(path, 'target_actor.pth'))
        torch.save(self.actor_optimizer.state_dict(),
                   os.path.join(path, 'actor_optimizer.pth'))

    def load(self, path):
        self.critic.load_state_dict(
            torch.load(os.path.join(path, 'critic.pth')))
        self.target_critic.load_state_dict(
            torch.load(os.path.join(path, 'target_critic.pth')))
        self.critic_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'critic_optimizer.pth')))

        self.actor.load_state_dict(torch.load(os.path.join(path, 'actor.pth')))
        self.target_actor.load_state_dict(
            torch.load(os.path.join(path, 'target_actor.pth')))
        self.actor_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'actor_optimizer.pth')))
        self.sync_rollout_actor()
Example #14
trainloader = data.DataLoader(train_data, shuffle=True)

latent_distr = torch.distributions.normal.Normal(0, 1)
uniform_distr = torch.distributions.uniform.Uniform(0, 1)

# Networks
crit = Critic()
gen = Generator()
rep = Representator()
rep_gen = Representator()
crit.cuda()
gen.cuda()
rep.cuda()
rep_gen.cuda()

optimizer = torch.optim.Adam(crit.parameters(), lr=0.0004,
                             betas=(0.0, 0.9))  # TTUR
optimizer_gen = torch.optim.Adam(gen.parameters(), lr=0.001, betas=(0.0, 0.9))
optimizer_rep = torch.optim.Adam(rep.parameters(), lr=0.0001, betas=(0.0, 0.9))
optimizer_rep_gen = torch.optim.Adam(rep_gen.parameters(),
                                     lr=0.001,
                                     betas=(0.0, 0.9))

# exponentially decaying learning rate
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.999)
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optimizer_gen,
                                                     gamma=0.999)
scheduler_r = torch.optim.lr_scheduler.ExponentialLR(optimizer_rep,
                                                     gamma=0.999)
scheduler_r_gen = torch.optim.lr_scheduler.ExponentialLR(optimizer_rep_gen,
                                                         gamma=0.999)
Example #15
class TD3(object):
    # Twin Delayed DDPG (TD3)
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.max_action = max_action

    def select_action(self, state):
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self,
              replay_buffer,
              iterations,
              batch_size=100,
              discount=0.99,
              tau=0.005,
              policy_noise=0.2,
              noise_clip=0.5,
              policy_freq=2):
        for it in range(iterations):
            # step 4: we sample a batch of transitions (s, s', a, r) from the memory
            batch_state, batch_next_state, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(
                batch_size)
            state = torch.Tensor(batch_state).to(device)
            next_state = torch.Tensor(batch_next_state).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # step 5: from the next state s', the actor target plays the next action a'
            next_action = self.actor_target(next_state)

            # step 6: we add Gaussian noise to the next action a' and we clamp it in a range of values supported by the environment
            noise = torch.Tensor(batch_actions).data.normal_(
                0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action,
                                                      self.max_action)

            # step 7: the two critic targets each take the pair (s', a') as input and return 2 Q-values, Qt1(s', a') and Qt2(s', a'), as output
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # step 8: we keep the minimum of these Q-values min(Qt1, Qt2)
            target_Q = torch.min(target_Q1, target_Q2)

            # step 9: we get the final target of the 2 critic models, which is : Qt = r + gamma * target_Q
            target_Q = reward + (1 - done) * discount * target_Q

            # step 10: the 2 critic models each take the pair (s, a) as input and return 2 Q-values, Q1(s, a) and Q2(s, a), as outputs
            current_Q1, current_Q2 = self.critic(state, action)

            # step 11: we compute the loss from the 2 critic models: critic loss = mse_loss(Q1(s,a), Qt) + mse_loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
                current_Q2, target_Q)

            # step 12: we backpropagate the critic loss and update the parameters of the 2 critic models with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # step 13: once every 2 iterations, we update our Actor model by performing gradient ascent on the output of the first critic model
            if it % policy_freq == 0:
                # deterministic policy gradient DPG
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Delay
                # step 14: still once every 2 iterations, we update the weights of the actor target by polyak averaging
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)

                # step 15: still once every 2 iterations, we update the weights of the critic target by Polyak averaging
                for param, target_param in zip(
                        self.critic.parameters(),
                        self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(),
                   '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(),
                   '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename)))
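
The train() method above samples (state, next_state, action, reward, done) batches from a replay buffer that is not shown here. A minimal buffer with that sample() layout might look like the following sketch; it is an assumption about the interface, not the author's implementation.

import numpy as np


class ReplayBuffer:
    """Fixed-size experience replay buffer (illustrative sketch)."""

    def __init__(self, max_size=1000000):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        # transition = (state, next_state, action, reward, done)
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        idx = np.random.randint(0, len(self.storage), size=batch_size)
        states, next_states, actions, rewards, dones = [], [], [], [], []
        for i in idx:
            s, s2, a, r, d = self.storage[i]
            states.append(np.array(s, copy=False))
            next_states.append(np.array(s2, copy=False))
            actions.append(np.array(a, copy=False))
            rewards.append(r)
            dones.append(d)
        return (np.array(states), np.array(next_states), np.array(actions),
                np.array(rewards).reshape(-1, 1), np.array(dones).reshape(-1, 1))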
Example #16
# Networks
crit = Critic()
gen = Generator(latent_size)
classifier = models.vgg13(pretrained=False)
# adjust final layer to handle 10 classes
classifier.classifier._modules['6'] = torch.nn.Linear(4096, 10)
classifier.train()
crit.cuda()
gen.cuda()
classifier.cuda()

adversarial_loss = torch.nn.BCELoss()
neg_logl = torch.nn.NLLLoss()

optimizer = torch.optim.Adam(crit.parameters(), lr=0.0001, betas=(0.5, 0.999))
optimizer_gen = torch.optim.Adam(gen.parameters(),
                                 lr=0.0001,
                                 betas=(0.5, 0.999))
optimizer_classifier = torch.optim.Adam(classifier.parameters(),
                                        lr=0.0001,
                                        betas=(0.5, 0.999))

scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.999)
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optimizer_gen,
                                                     gamma=0.999)
scheduler_c = torch.optim.lr_scheduler.ExponentialLR(optimizer_classifier,
                                                     gamma=0.999)

t = 0
Example #17
device = 'cuda'

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

dataloader = DataLoader(
    MNIST('.', download=True, transform=transform),
    batch_size=batch_size,
    shuffle=True)

gen = Generator(z_dim).to(device)
gen_opt = torch.optim.Adam(gen.parameters(), lr=lr, betas=(beta_1, beta_2))
crit = Critic().to(device) 
crit_opt = torch.optim.Adam(crit.parameters(), lr=lr, betas=(beta_1, beta_2))

cur_step = 0
generator_losses = []
critic_losses = []
for epoch in range(n_epochs):
    # Dataloader returns the batches
    for real, _ in tqdm(dataloader):
        cur_batch_size = len(real)
        real = real.to(device)

        mean_iteration_critic_loss = 0
        for _ in range(crit_repeats):
            ### Update critic ###
            crit_opt.zero_grad()
            fake_noise = get_noise(cur_batch_size, z_dim, device=device)
Example #18
class Agent:
    def __init__(self,
                 env,
                 alpha: float = 1e-3,
                 gamma: float = 0.99,
                 hidden_size: int = 32,
                 tau: float = 1e-3):
        self.env = env
        self.gamma = gamma
        self.alpha = alpha
        self.tau = tau
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.actor = Actor(2, hidden_size, 1)
        self.actor_target = deepcopy(self.actor)

        self.critic = Critic(3, hidden_size, 1)
        self.critic_target = deepcopy(self.critic)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=alpha)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=alpha)

        self.critic.to(self.device)
        self.critic_target.to(self.device)
        self.actor.to(self.device)
        self.actor_target.to(self.device)

    def update_critic(self, batch):
        state, action, next_state, reward, done = batch

        state, next_state, action, reward = map(
            lambda item: torch.tensor(item).to(self.device).float(),
            (state, next_state, action, reward))
        done = torch.tensor(done).to(self.device)

        with torch.no_grad():
            # bootstrapped Q value of the next state under the target policy
            q_next = self.critic_target(next_state,
                                        self.actor_target(next_state))
            q_next[done] = 0  # no bootstrapping past terminal states
            q_target = reward.reshape(q_next.shape) + self.gamma * q_next

        loss = F.mse_loss(self.critic(state, action), q_target)

        self.critic_optimizer.zero_grad()
        loss.backward()
        grad_clamp(self.critic)

        self.critic_optimizer.step()

        self.soft_update(self.critic, self.critic_target)

    def update_actor(self, batch):
        state, *_ = batch
        state = torch.tensor(state).to(self.device).float()

        loss = -torch.mean(self.critic(state, self.actor(state)))

        self.actor_optimizer.zero_grad()
        loss.backward()
        grad_clamp(self.actor)

        self.actor_optimizer.step()

        self.soft_update(self.actor, self.actor_target)

    def act(self, state):
        with torch.no_grad():
            state_ = torch.tensor(state).to(self.device).float()
            return self.actor(state_).cpu().numpy()

    def reset(self):
        return self.env.reset()

    def train(self,
              transitions: int,
              sigma_max: float = 1.,
              sigma_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              progress_upd_step: int = None,
              start_training: int = 1000,
              shaping_coef: float = 300.):
        history = ReplayBuffer(buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "sigma_max": sigma_max,
            "sigma_min": sigma_min,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()
        t = tqdm(range(transitions))
        for i in t:
            sigma = sigma_max - (sigma_max - sigma_min) * i / transitions
            action = self.act(state)
            noise = np.random.normal(scale=sigma, size=action.shape)
            action = np.clip(action + noise, -1, 1)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                batch = history.sample(batch_size)
                self.update_critic(batch)
                self.update_actor(batch)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log

    def soft_update(self, model, target):
        with torch.no_grad():
            for param, param_target in zip(model.parameters(),
                                           target.parameters()):
                param_target.data.mul_(1 - self.tau)
                param_target.data.add_(self.tau * param.data)

    def rollout(self, to_render: bool = False):
        done = False
        state = self.reset()
        total_reward = 0

        while not done:
            state, reward, done, _ = self.env.step(self.act(state))
            total_reward += reward
            if to_render:
                self.env.render()

        self.env.close()
        return total_reward

    def evaluate_policy(self, episodes: int = 5, to_render: bool = False):
        rewards = []
        for _ in range(episodes):
            rewards.append(self.rollout(to_render=to_render))
        return np.mean(rewards), np.std(rewards)
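
This agent relies on two helpers that do not appear in the snippet: grad_clamp and a ReplayBuffer that stores (state, action, next_state, reward, done) tuples and returns per-field arrays from sample(). The stand-ins below match that usage; they are illustrative assumptions, not the original utilities.

import random
from collections import deque

import numpy as np


def grad_clamp(model, limit=1.0):
    # Clamp each parameter's gradient in place to [-limit, limit].
    for p in model.parameters():
        if p.grad is not None:
            p.grad.data.clamp_(-limit, limit)


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, transition):
        # transition = (state, action, next_state, reward, done)
        self.buffer.append(transition)

    def __len__(self):
        return len(self.buffer)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into per-field arrays.
        states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        return states, actions, next_states, rewards, dones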
Example #19
class DDPGAgent():
    def __init__(self,
                 seed,
                 n_state,
                 n_action,
                 batch_size=64,
                 buffer=1e5,
                 gamma=0.99,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0,
                 tau=1e-3):
        self.batch_size = batch_size

        #init actor
        self.local_actor = Actor(n_state, n_action, seed).to(device)
        self.target_actor = Actor(n_state, n_action, seed).to(device)
        self.optim_actor = torch.optim.Adam(self.local_actor.parameters(),
                                            lr=lr_actor)
        #init critic
        self.local_critic = Critic(n_state, n_action, seed).to(device)
        self.target_critic = Critic(n_state, n_action, seed).to(device)
        self.optim_critic = torch.optim.Adam(self.local_critic.parameters(),
                                             lr=lr_critic,
                                             weight_decay=weight_decay)

        #init memory
        self.memory = memory(int(buffer), device, seed)
        self.tau = tau
        self.gamma = gamma
        self.noise = noise(n_action, seed=seed)

    def step(self, state, action, reward, next_state, done):
        event = Event(state, action, reward, next_state, done)
        self.memory.add(event)
        self.learn()

    def act(self, state):
        state = torch.from_numpy(state).float().to(device)
        self.local_actor.eval()
        with torch.no_grad():
            action = self.local_actor(state).cpu().data.numpy()
        self.local_actor.train()

        action += self.noise.make()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self):
        """
        Update both actor and critic networks
        """
        event_batch = self.memory.sample(self.batch_size)

        if event_batch is None:
            return

        event_batch = self.memory.deserialize(event_batch)
        self.update_critic(event_batch)
        self.update_actor(event_batch)
        self.update_target(self.local_actor, self.target_actor)
        self.update_target(self.local_critic, self.target_critic)

    def update_critic(self, batch):
        ## TD step
        # t
        expected_Q = self.local_critic(batch.states, batch.actions)

        # t+1
        actions_pred = self.target_actor(batch.states_next)
        target_Q_next = self.target_critic(batch.states_next, actions_pred)
        #only learning from positives? negatives are good source of learning too
        target_Q = batch.rewards + (self.gamma * target_Q_next *
                                    (1 - batch.dones))
        loss = nn.functional.mse_loss(expected_Q, target_Q)

        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()

    def update_actor(self, batch):
        actions_predicted = self.local_actor(batch.states)  #fixthis
        loss = -self.local_critic(batch.states, actions_predicted).mean()  #rms

        self.optim_actor.zero_grad()
        loss.backward()
        self.optim_actor.step()

    def update_target(self, local, target):
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
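
The DDPGAgent above depends on Event, memory, and noise helpers defined elsewhere. The sketch below shows one way Event and memory could be implemented to match the calls made above (add, sample, deserialize, and batch fields such as .states and .states_next); treat it as an assumption, and note that an Ornstein-Uhlenbeck process like the one sketched earlier would fit the noise helper.

import random
from collections import deque, namedtuple

import numpy as np
import torch

Event = namedtuple('Event', ['state', 'action', 'reward', 'next_state', 'done'])
Batch = namedtuple('Batch', ['states', 'actions', 'rewards', 'states_next', 'dones'])


class memory:
    def __init__(self, buffer_size, device, seed):
        random.seed(seed)
        self.device = device
        self.events = deque(maxlen=buffer_size)

    def add(self, event):
        self.events.append(event)

    def sample(self, batch_size):
        # Return None until enough experience has been collected.
        if len(self.events) < batch_size:
            return None
        return random.sample(self.events, batch_size)

    def deserialize(self, events):
        # Stack the sampled events into batched float tensors on the device.
        def to_tensor(rows):
            return torch.from_numpy(np.vstack(rows)).float().to(self.device)

        return Batch(states=to_tensor([e.state for e in events]),
                     actions=to_tensor([e.action for e in events]),
                     rewards=to_tensor([e.reward for e in events]),
                     states_next=to_tensor([e.next_state for e in events]),
                     dones=to_tensor([float(e.done) for e in events]))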
Example #20
class Agent:
    def __init__(self, device, state_size, action_size, buffer_size=10,
                 batch_size=10,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 discount_rate=0.99,
                 tau=0.1,
                 steps_per_update=4,
                 action_range=None,
                 dropout_p=0.0,
                 weight_decay=0.0001,
                 noise_max=0.2,
                 noise_decay=1.0,
                 n_agents=1
                 ):
        self.device: torch.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.critic_control = Critic(state_size, action_size).to(device)
        self.critic_control.dropout.p = dropout_p
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.eval()
        self.critic_optimizer = torch.optim.Adam(
            self.critic_control.parameters(),
            weight_decay=weight_decay,
            lr=critic_learning_rate)

        self.actor_control = Actor(state_size, action_size, action_range).to(
            device)
        self.actor_control.dropout.p = dropout_p
        self.actor_target = Actor(state_size, action_size, action_range).to(
            device)
        self.actor_target.eval()
        self.actor_optimizer = torch.optim.Adam(
            self.actor_control.parameters(),
            weight_decay=weight_decay,
            lr=actor_learning_rate)

        self.batch_size = batch_size
        self.min_buffer_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size,
                                          buffer_size)

        self.discount_rate = discount_rate

        self.tau = tau

        self.step_count = 0
        self.steps_per_update = steps_per_update

        self.noise_max = noise_max
        self.noise = OUNoise([n_agents, action_size], 15071988, sigma=self.noise_max)
        self.noise_decay = noise_decay
        self.last_score = float('-inf')

    def policy(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_control.eval()
        with torch.no_grad():
            action = self.actor_control(state).cpu().numpy()
        self.actor_control.train()
        if add_noise:
            noise = self.noise.sample()
            action += noise
        return action

    def step(self, state, action, reward, next_state, done):
        p = self.calculate_p(state, action, reward, next_state, done)

        for i in range(state.shape[0]):
            self.replay_buffer.add(state[i, :], action[i, :], reward[i],
                                   next_state[i, :], done[i], p[i])
        if self.step_count % self.steps_per_update == 0:
            self.learn()
        self.step_count += 1

    def learn(self):
        if len(self.replay_buffer) < self.min_buffer_size:
            return
        indicies, (states, actions, rewards, next_states, dones, p) = \
            self.replay_buffer.sample(self.batch_size)

        self.actor_control.eval()
        error = self.bellman_eqn_error(
            states, actions, rewards, next_states, dones)
        self.actor_control.train()

        importance_scaling = (self.replay_buffer.buffer_size * p) ** -1
        importance_scaling /= importance_scaling.max()
        self.critic_optimizer.zero_grad()
        loss = (importance_scaling * (error ** 2)).sum() / self.batch_size
        loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        expected_actions = self.actor_control(states)
        critic_score = self.critic_control(states, expected_actions)
        loss = -1 * (importance_scaling * critic_score).sum() / self.batch_size
        loss.backward()
        self.actor_optimizer.step()

        self.update_target(self.critic_control, self.critic_target)
        self.update_target(self.actor_control, self.actor_target)

        self.replay_buffer.update(indicies, error.detach().abs().cpu() + 1e-3)

    def bellman_eqn_error(self, states, actions, rewards, next_states, dones):
        """Double DQN error - use the control network to get the best action
        and apply the target network to it to get the target reward which is
        used for the bellman eqn error.
        """
        next_actions = self.actor_control(next_states)

        target_action_values = self.critic_target(next_states, next_actions)

        target_rewards = (
                rewards
                + self.discount_rate * (1 - dones) * target_action_values
        )

        current_rewards = self.critic_control(states, actions)
        error = current_rewards - target_rewards
        return error

    def calculate_p(self, state, action, reward, next_state, done):
        next_state = torch.from_numpy(next_state).float().to(
            self.device)
        state = torch.from_numpy(state).float().to(self.device)
        action = torch.from_numpy(action).float().to(self.device)
        reward = torch.from_numpy(reward).float().to(self.device)
        done = torch.from_numpy(done).float().to(
            self.device)

        done = done.unsqueeze(1)
        reward = reward.unsqueeze(1)

        self.actor_control.eval()
        self.critic_control.eval()

        with torch.no_grad():
            retval = abs(
                self.bellman_eqn_error(state, action, reward, next_state,
                                       done)) + 1e-3
        self.critic_control.train()
        self.actor_control.train()
        return retval

    def update_target(self, control, target):
        for target_param, control_param in zip(
                target.parameters(),
                control.parameters()):
            target_param.data.copy_(
                self.tau * control_param.data + (1.0 - self.tau) *
                target_param.data)

    def end_of_episode(self, final_score):
        self.step_count = 0

        self.noise.sigma *= self.noise_decay
        self.last_score = final_score
        self.noise.reset()

    def save(self, path):
        torch.save(self.critic_control.state_dict(), path + '-critic.p')
        torch.save(self.actor_control.state_dict(), path + '-actor.p')

    def restore(self, path):
        self.critic_control.load_state_dict(
            torch.load(path + '-critic.p', map_location='cpu'))
        self.actor_control.load_state_dict(
            torch.load(path + '-actor.p', map_location='cpu'))
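
The Agent in this example expects a prioritized replay buffer with a specific interface: add() takes a per-sample priority, sample() returns (indices, (states, actions, rewards, next_states, dones, p)) where p is the sampling probability used for the importance weights, update() refreshes priorities, and buffer_size is exposed as an attribute. One way such a buffer could look, using simple proportional sampling, is sketched below as an illustrative assumption rather than the original implementation.

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, device, state_size, action_size, buffer_size):
        self.device = device
        self.buffer_size = buffer_size
        self.states = np.zeros((buffer_size, state_size), dtype=np.float32)
        self.actions = np.zeros((buffer_size, action_size), dtype=np.float32)
        self.rewards = np.zeros((buffer_size, 1), dtype=np.float32)
        self.next_states = np.zeros((buffer_size, state_size), dtype=np.float32)
        self.dones = np.zeros((buffer_size, 1), dtype=np.float32)
        self.priorities = np.zeros(buffer_size, dtype=np.float64)
        self.pos = 0
        self.full = False

    def __len__(self):
        return self.buffer_size if self.full else self.pos

    def add(self, state, action, reward, next_state, done, priority):
        i = self.pos
        self.states[i] = state
        self.actions[i] = action
        self.rewards[i] = reward
        self.next_states[i] = next_state
        self.dones[i] = done
        self.priorities[i] = float(priority)
        self.pos = (self.pos + 1) % self.buffer_size
        self.full = self.full or self.pos == 0

    def sample(self, batch_size):
        n = len(self)
        probs = self.priorities[:n] / self.priorities[:n].sum()
        idx = np.random.choice(n, batch_size, p=probs)

        def to_tensor(array):
            return torch.from_numpy(array[idx]).to(self.device)

        p = torch.from_numpy(probs[idx].astype(np.float32)).unsqueeze(1).to(
            self.device)
        return idx, (to_tensor(self.states), to_tensor(self.actions),
                     to_tensor(self.rewards), to_tensor(self.next_states),
                     to_tensor(self.dones), p)

    def update(self, indices, priorities):
        # Refresh the priorities of the sampled transitions.
        self.priorities[indices] = np.asarray(priorities,
                                              dtype=np.float64).reshape(-1)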
Example #21
def train():
    experiences_buffer = deque(maxlen=config.MAX_EXPERIENCES_SIZE)
    word2vec = LightWord2Vec()
    lang = Lang(word2vec.get_vocab())
    actor = ActorCopy(config.EMBEDDING_SIZE, config.STATE_SIZE, lang, word2vec)
    critic = Critic(config.STATE_SIZE, config.EMBEDDING_SIZE,
                    config.CRITIC_HIDDEN_SIZE)
    reader = DataSetReader('train')
    critic_optimizer = torch.optim.Adam(critic.parameters())
    critic_criterion = torch.nn.MSELoss()
    actor_optimizer = torch.optim.Adam(actor.parameters())

    if LOAD_INDEX > -1:
        actor, critic, critic_optimizer, critic_criterion, actor_optimizer, lang = load_model(
            LOAD_INDEX)

    if torch.cuda.is_available():
        actor.cuda()
        critic.cuda()

    for epoch in range(LOAD_INDEX + 1, config.EPOCHS):
        # training actor
        for x, y in reader.read(config.TRAIN_BATCH_SIZE):
            for sentence, target_sentence in zip(x, y):
                states, actions, probs = actor(
                    sentence, get_possible_actions(lang, sentence))
                predicted_sentence = actions[:-1]  # Skip None

                rewards = [
                    sari_reward(sentence[:i + 1], predicted_sentence[:i + 1],
                                target_sentence[:i + 1])
                    for i in range(
                        max(len(target_sentence), len(predicted_sentence)))
                ] + [0]

                for i in range(len(states) - 1):
                    experiences_buffer.insert(
                        0,
                        Experience(states[i], actions[i], states[i + 1],
                                   rewards[i], probs[i], sentence))

        q_estimated = []
        q_s = torch.zeros(config.Q_BATCH_SIZE, 1)

        # training q function
        exp_length = min(len(experiences_buffer), config.Q_BATCH_SIZE)

        for idx in range(exp_length):
            exp = experiences_buffer[random.randint(0, exp_length - 1)]
            action_emb = word2vec[exp.action]
            q_estimated.append(critic(exp.state, action_emb)[0, 0])
            q_s[idx] = exp.reward
            if exp.next_state is not None:
                with torch.no_grad():
                    q_s[idx] += (config.GAMMA * max([
                        critic(exp.next_state, word2vec[action])
                        for action in get_possible_actions(lang, exp.sentence)
                    ]))[0][0]

        q_estimated = torch.cat(q_estimated).view(-1, 1)
        q_estimated = q_estimated[:config.Q_BATCH_SIZE]

        critic_optimizer.zero_grad()
        loss = critic_criterion(q_s, q_estimated)

        loss.backward(retain_graph=True)
        critic_optimizer.step()

        # updating seq2seq model
        actor_optimizer.zero_grad()
        loss = shared_loss(experiences_buffer, q_estimated[:exp_length])
        loss.backward()
        actor_optimizer.step()

        experiences_buffer.clear()
        with torch.no_grad():
            actor.zero_grad()
            critic.zero_grad()

        if epoch % 100 == 0:
            save_model(epoch, actor, critic, critic_optimizer,
                       critic_criterion, actor_optimizer, lang)

        print("Finished epoch:", epoch, " loss is ", torch.sum(loss))
Example #22
class Agent():
    """Main DDPG agent that extracts experiences and learns from them"""
    def __init__(self, state_size, action_size):
        """
        Initializes Agent object.
        @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        
        #Actor network
        self.actor_local = Actor(self.state_size, self.action_size).to(device) #local model
        self.actor_target = Actor(self.state_size, self.action_size).to(device) #target model, TD-target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #Adam optimizer for the Actor network

        #Critic network
        self.critic_local = Critic(self.state_size, self.action_size).to(device) #local model
        self.critic_target = Critic(self.state_size, self.action_size).to(device) #target model, TD-target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) #Adam optimizer for the Critic network (with L2 weight decay)

        #Noise process
        self.noise = OUNoise(action_size) #define Ornstein-Uhlenbeck process

        #Replay memory
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, MINI_BATCH) #define experience replay buffer object

    def step(self, state, action, reward, next_state, done):
        """
        Saves an experience in the replay memory to learn from using random sampling.
        @Param:
        1. state: current state, S.
        2. action: action taken based on current state.
        3. reward: immediate reward from state, action.
        4. next_state: next state, S', from action, a.
        5. done: (bool) has the episode terminated?
        Extracted version of the trajectory used in calculating the value for an action, a."""

        self.memory.add(state, action, reward, next_state, done) #append to memory buffer

        #check if enough samples in buffer. if so, learn from experiences, otherwise, keep collecting samples.
        if(len(self.memory) > MINI_BATCH):
            experience = self.memory.sample()
            self.learn(experience)

    def reset(self):
        """Resets the noise process to mean"""
        self.noise.reset()

    def act(self, state, add_noise=True):
        """
        Returns a deterministic action given current state.
        @Param:
        1. state: current state, S.
        2. add_noise: (bool) add exploration noise to the action, default = True (training mode)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device) #typecast to torch.Tensor
        self.actor_local.eval() #set in evaluation mode
        with torch.no_grad(): #disable gradient tracking
            action = self.actor_local(state).cpu().data.numpy() #deterministic action based on Actor's forward pass.
        self.actor_local.train() #set training mode

        #If training mode, i.e. add_noise = True, add exploration noise to the action.
        if(add_noise):
            action += self.noise.sample()
        return action
    
    def learn(self, experiences, gamma=GAMMA):
        """
        Learn from a set of experiences picked up from a random sampling of even frequency (not prioritized)
        of experiences when buffer_size = MINI_BATCH.
        Updates policy and value parameters accordingly
        @Param:
        1. experiences: (Tuple[torch.Tensor]) set of experiences, trajectory, tau. tuple of (s, a, r, s', done)
        2. gamma: discount factor, 0.99 by default.
        """
        #Source from: Udacity/DRL
        
        #Extrapolate experience into (state, action, reward, next_state, done) tuples
        states, actions, rewards, next_states, dones = experiences

        #Update Critic network
        actions_next = self.actor_target(next_states) # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) #  r + γ * Q-values(a,s)

        # Compute critic loss using MSE
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        #Update Actor Network

        # Compute actor loss
        actions_pred = self.actor_local(states) #gets mu(s)
        actor_loss = -self.critic_local(states, actions_pred).mean() #negative mean Q(s,a), minimized to perform gradient ascent on Q
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters. Copies model τ every experience.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #23
class DDPG:
    def __init__(self, state_size, action_size, random_seed, hyperparams):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.hyperparams = hyperparams

        self.actor = Actor(state_size, action_size, random_seed).to(device)
        self.actor_noise = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=hyperparams.alpha_actor)

        self.critic = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=hyperparams.alpha_critic,
            weight_decay=hyperparams.weight_decay,
        )

        self.replay_buffer = ReplayBuffer(hyperparams.buffer_size,
                                          hyperparams.batch_size, random_seed)

        self.noise = OUNoise(
            action_size,
            random_seed,
            self.hyperparams.mu,
            self.hyperparams.theta,
            self.hyperparams.sigma,
        )

    def step(self, state, action, reward, next_state, done):

        self.replay_buffer.add(state, action, reward, next_state, done)
        if len(self.replay_buffer) > self.hyperparams.batch_size:
            observations = self.replay_buffer.sample()
            self.update_params(observations)

    def select_action(self, state, train=True, nn_noise=False):
        state = torch.from_numpy(state).to(dtype=torch.float32, device=device)
        self.actor.eval()
        if nn_noise:
            action = self.actor_noise(state).cpu().data.numpy()
        else:
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if train:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_state()

    def update_params(self, observations):

        states, actions, rewards, next_states, dones = observations
        # Bellman target: r + γ * Q_target(s', a') * (1 - done)
        next_actions = self.actor_target(next_states)
        next_Q_values = self.critic_target(next_states, next_actions)
        Q_values = rewards + (self.hyperparams.gamma * next_Q_values *
                              (1 - dones))

        expected_Q = self.critic(states, actions)
        # Critic loss: this example uses L1 (MAE) loss rather than the more common MSE
        Q_values_loss = F.l1_loss(expected_Q, Q_values)
        self.critic_optim.zero_grad()
        Q_values_loss.backward()
        self.critic_optim.step()

        # Actor loss: maximize Q(s, μ(s)) by minimizing its negative
        policy_loss = -self.critic(states, self.actor(states))
        policy_loss = policy_loss.mean()
        self.actor_optim.zero_grad()
        policy_loss.backward()
        self.actor_optim.step()

        # Soft-update target networks: θ_target ← τ·θ_local + (1 − τ)·θ_target
        for qtarget_param, qlocal_param in zip(self.critic_target.parameters(),
                                               self.critic.parameters()):
            qtarget_param.data.copy_(self.hyperparams.tau * qlocal_param.data +
                                     (1.0 - self.hyperparams.tau) *
                                     qtarget_param.data)

        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor.parameters()):
            target_param.data.copy_(self.hyperparams.tau * local_param.data +
                                    (1.0 - self.hyperparams.tau) *
                                    target_param.data)
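
# The DDPG class above constructs an OUNoise helper that is not defined in this listing.
# A minimal sketch of an Ornstein-Uhlenbeck noise process matching the constructor arguments
# (size, seed, mu, theta, sigma) and the sample()/reset_state() calls used above; the original
# implementation may differ in details.
import numpy as np

class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)              # long-run mean the process reverts to
        self.theta = theta                        # mean-reversion rate
        self.sigma = sigma                        # scale of the random perturbation
        self.rng = np.random.default_rng(seed)    # private RNG so the noise is reproducible
        self.reset_state()

    def reset_state(self):
        # Reset the internal state to the mean (typically at the start of an episode).
        self.state = self.mu.copy()

    def sample(self):
        # dx = θ·(μ − x) + σ·N(0, 1); update the internal state and return it.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx
        return self.state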
Example #24
class Agent:
    def __init__(self, state_size, action_size):
        self._state_size = state_size
        self._action_size = action_size

        # Actor network
        self._actor_local = Actor(state_size, action_size).to(device)
        self._actor_target = Actor(state_size, action_size).to(device)
        self._actor_optimizer = optim.Adam(self._actor_local.parameters())

        # Critic network
        self._critic_local = Critic(state_size, action_size).to(device)
        self._critic_target = Critic(state_size, action_size).to(device)
        self._critic_optimizer = optim.Adam(self._critic_local.parameters())

        # Memory
        self._memory = Memory(BUFFER_SIZE)

        # Start the target networks with the same weights as the local networks
        self.hard_update(self._actor_local, self._actor_target)
        self.hard_update(self._critic_local, self._critic_target)

    def step(self, state, action, reward, next_state, done):
        self._memory.push((state, action, reward, next_state, done))

        if len(self._memory) > BATCH_SIZE:
            for _ in range(UPDATES_PER_STEP):
                samples = self._memory.sample(BATCH_SIZE)
                self.learn(samples)

    def act(self, state):
        state = torch.from_numpy(state).float().to(device)

        # With probability PROBABILITY_RAND_STEP take a uniformly random action in [-1, 1]
        if binom.rvs(1, PROBABILITY_RAND_STEP):
            action = uniform(loc=-1, scale=2).rvs(size=self._action_size)  # loc=-1, scale=2 spans the full range
        else:
            self._actor_local.eval()
            with torch.no_grad():
                action = self._actor_local(state).cpu().data.numpy()
            self._actor_local.train()

        return np.clip(action, -1, 1)

    def hard_update(self, local, target):
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local, target, tau):
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def learn(self, samples):

        states, actions, rewards, next_states, dones = samples

        # Bellman target from the target networks: r + γ * Q_target(s', a') * (1 - done)
        actions_next = self._actor_target(next_states)
        Q_targets_next = self._critic_target(next_states, actions_next)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        Q_expected = self._critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

        actions_pred = self._actor_local(states)
        actor_loss = -self._critic_local(states, actions_pred).mean()

        self._actor_optimizer.zero_grad()
        actor_loss.backward()
        self._actor_optimizer.step()

        self.soft_update(self._critic_local, self._critic_target, TAU)
        self.soft_update(self._actor_local, self._actor_target, TAU)

    def save(self):
        torch.save(self._actor_local.state_dict(), ACTOR_PATH)
        torch.save(self._critic_local.state_dict(), CRITIC_PATH)

    def load(self):
        self._actor_local.load_state_dict(torch.load(ACTOR_PATH))
        self._actor_local.eval()
        self._critic_local.load_state_dict(torch.load(CRITIC_PATH))
        self._critic_local.eval()
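
# Example #24 relies on a Memory replay buffer (push/sample/__len__) that is not defined in this
# listing. A minimal sketch that matches how it is used above and returns float32 tensors; the
# `device` line mirrors the module-level device the examples assume, and the original
# implementation may differ.
import random
from collections import deque

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # the listing defines its own `device`

class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest experiences are dropped when full

    def push(self, experience):
        # experience is a (state, action, reward, next_state, done) tuple
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniform random sampling (no prioritization), stacked into float32 tensors on `device`
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.vstack, zip(*batch))
        as_tensor = lambda a: torch.from_numpy(a.astype(np.float32)).to(device)
        return (as_tensor(states), as_tensor(actions), as_tensor(rewards),
                as_tensor(next_states), as_tensor(dones))

    def __len__(self):
        return len(self.buffer)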