Python Critic示例，agents.ddpg_critic.Critic Python示例

示例#1

0

显示文件

文件： agent.py 项目： cweidinger/RL-Quadcopter-2

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

示例#2

0

显示文件

    def __init__(self, task, params={}):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 params=params)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  params=params)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, params)
        self.critic_target = Critic(self.state_size, self.action_size, params)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  # same direction
        self.exploration_sigma = 0.001  # random noise
        if (params.get("sigma")):
            self.exploration_sigma = params.get("sigma")

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        if (params.get("batch_size")):
            self.batch_size = params.get("batch_size")
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # for soft update of target parameters
        #self.gamma = 0.9
        #self.tau = 0.05

        # Statistics
        self.best_score = -np.inf
        self.score = 0

示例#3

0

显示文件

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # AE: Although OUNoise gives me a convenient set of randomness for each of the rotors, I still need
        # AE: to make a decision myself on how to apply the randomness and how to manage its magnitude
        # AE: (i.e. my eplore vs exploit strategy). These variables will do that.
        self.explore_start = 1.0  # AE: exploration probability at start
        self.explore_stop = 0.001  # AE: minimum exploration probability
        self.decay_rate = 0.003  # AE: exponential decay rate for exploration prob
        self.magnitude_coeff = 0.1  # AE: a coefficient to limit randomness

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0  # AE: additive to the noise. mu * theta will be directly added

        self.exploration_theta = 0.15  # AE: old noise will be multiplied by this
        self.exploration_sigma = 0.2  # AE: new noise will be multiplied by this
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        # AE: The learning rate. How much we trust the new values compared to the old ones.
        self.tau = 0.0001  # for soft update of target parameters

        # AE: current reward in learning procedure (for statistics)
        self.score = -np.inf

        # Episode variables
        self.reset_episode()

示例#4

0

显示文件

    def __init__(self, task):
        """Initialize DDPG Agent instance."""
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Initializing local and target Actor Models
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Initializing local and target Critic Models
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Additional Parameters
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

示例#5

0

显示文件

文件： ddpg_agent.py 项目： xadahiya/RL-Quadcopter-2

    def __init__(self, task):
        """Initialize DDPG Agent instance."""
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Initializing local and target Actor Models
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Initializing local and target Critic Models
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Additional Parameters
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

示例#6

0

显示文件

class DDPG_Agent:
    """Reinforcement learning agent that learns through DDPG."""
    def __init__(self, task):
        """Initialize DDPG Agent instance."""
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Initializing local and target Actor Models
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Initializing local and target Critic Models
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Additional Parameters
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

    def reset_episode(self):
        """Reset episode to initial state."""
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Take a step."""
        self.total_reward += reward
        self.count += 1
        # Save experience/reward
        self.memory.memorize(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are available in memory.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state):
        """Returns actions for state(s) according to given policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        # Add some noise to action for exploration and return
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience
        tuples."""

        self.score = self.total_reward / \
            float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.vstack([e.action for e in experiences
                             if e is not None]).astype(np.float32).reshape(
                                 -1, self.action_size)
        rewards = np.vstack([e.reward for e in experiences if e is not None
                             ]).astype(np.float32).reshape(-1, 1)
        dones = np.vstack([e.done for e in experiences
                           if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        # [states, actions, 0] 0 is for No learning Phase
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights), "Local and target model parameters must \
            have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights

        target_model.set_weights(new_weights)

示例#7

0

显示文件

文件： agent.py 项目： situo-liu/RL-Quadcopter-2

class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            #             self.experiences.append(experiences)
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
#         return experiences

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        #         print("Learning")
        #         print(len(experiences))
        states = np.vstack([e.state for e in experiences if e is not None])
        #         print(len(states))
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        #         print(len(actions))
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        #         print("States/Actions: {} {}".format(len(states),len(actions)))
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
#         return len(states), len(actions)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

示例#8

0

显示文件

class AE_DDPG_Agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # AE: Although OUNoise gives me a convenient set of randomness for each of the rotors, I still need
        # AE: to make a decision myself on how to apply the randomness and how to manage its magnitude
        # AE: (i.e. my eplore vs exploit strategy). These variables will do that.
        self.explore_start = 1.0  # AE: exploration probability at start
        self.explore_stop = 0.001  # AE: minimum exploration probability
        self.decay_rate = 0.003  # AE: exponential decay rate for exploration prob
        self.magnitude_coeff = 0.1  # AE: a coefficient to limit randomness

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0  # AE: additive to the noise. mu * theta will be directly added

        self.exploration_theta = 0.15  # AE: old noise will be multiplied by this
        self.exploration_sigma = 0.2  # AE: new noise will be multiplied by this
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        # AE: The learning rate. How much we trust the new values compared to the old ones.
        self.tau = 0.0001  # for soft update of target parameters

        # AE: current reward in learning procedure (for statistics)
        self.score = -np.inf

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state

        self.best_score = -np.inf
        self.score = -np.inf
        self.total_reward = 0.0
        self.count = 0

        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        self.total_reward += reward
        self.count += 1

        # AE: Score (average reward in this episode so far) and best score for statistics
        self.score = self.total_reward / float(self.count)
        if self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])

        # AE: directly sampling approximated value from learned action-value function.
        action = self.actor_local.model.predict(state)[0]

        # AE: and adding some noise to that for unpredictability.
        # AE: The magnitude of noise has to drop over time.
        explore_p = self.explore_stop + (self.explore_start -
                                         self.explore_stop) * np.exp(
                                             -self.decay_rate * self.count)
        #self.noise.update_mu(explore_p)
        noise_sample = self.magnitude_coeff * explore_p * self.noise.sample()
        #noise_sample = explore_p * np.random.randn(self.action_size)
        #print("Noi=", s)

        return list(
            action +
            noise_sample * self.action_size)  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        # AE: Updating NN weights directly in the passed model (actor or critic).
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

示例#9

0

显示文件

文件： ddpg_agent.py 项目： xadahiya/RL-Quadcopter-2

class DDPG_Agent:
    """Reinforcement learning agent that learns through DDPG."""

    def __init__(self, task):
        """Initialize DDPG Agent instance."""
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Initializing local and target Actor Models
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Initializing local and target Critic Models
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Additional Parameters
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

    def reset_episode(self):
        """Reset episode to initial state."""
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Take a step."""
        self.total_reward += reward
        self.count += 1
        # Save experience/reward
        self.memory.memorize(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are available in memory.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state):
        """Returns actions for state(s) according to given policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        # Add some noise to action for exploration and return
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience
        tuples."""

        self.score = self.total_reward / \
            float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.vstack(
            [e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.vstack(
            [e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.vstack(
            [e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local)
        # [states, actions, 0] 0 is for No learning Phase
        action_gradients = np.reshape(self.critic_local.get_action_gradients(
            [states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights), "Local and target model parameters must \
            have the same size"

        new_weights = self.tau*local_weights + (1-self.tau)*target_weights

        target_model.set_weights(new_weights)