Example No. 1
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
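
The agents in these examples rely on an OUNoise helper that is not shown. As a point of reference, a minimal Ornstein-Uhlenbeck noise process compatible with how it is used above (an assumed constructor OUNoise(action_dimension, mu, theta, sigma) plus noise() and reset() methods) could look like this sketch:

import numpy as np


class OUNoise:
    """Minimal Ornstein-Uhlenbeck process sketch (assumed interface: noise(), reset())."""
    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # Called at the end of an episode to re-center the process
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated exploration noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
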
Example No. 2
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.getBatch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def saveNetwork(self):
        self.saver.save(self.sess,
                        save_location + self.env_name + 'network' + '-ddpg',
                        global_step=self.time_step)

    def action(self, state):
        action = self.actor_network.action(state)
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)
        #print "Action:", action
        return action

    def noise_action(self, state, epsilon):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        #print action.shape
        #print "Action_No_Noise:", action
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.OU.function(action[0], 0.0, 0.60, 0.80)
        noise_t[1] = epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
        noise_t[2] = epsilon * self.OU.function(action[2], -0.1, 1.00, 0.05)

        if random.random() <= 0.01:  # 0.1
            print("********Stochastic brake***********")
            noise_t[2] = epsilon * self.OU.function(action[2], 0.2, 1.00, 0.10)

        action = action + noise_t
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)

        #print "Action_Noise:", action
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer

        if (not (math.isnan(reward))):
            self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step = self.time_step + 1
        # Store transitions until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
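
The agent above builds per-dimension exploration noise for its steering, acceleration and brake outputs through self.OU.function(x, mu, theta, sigma), which is not defined in the excerpt. A minimal sketch of the Ornstein-Uhlenbeck step such a helper typically computes (the parameter order is an assumption inferred from how it is called above):

import numpy as np


class OU:
    """Sketch of the OU helper assumed by Example No. 2: one Ornstein-Uhlenbeck step per call."""
    def function(self, x, mu, theta, sigma):
        # Drift toward mu plus Gaussian diffusion; the caller scales the result by epsilon
        return theta * (mu - x) + sigma * np.random.randn()
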
Example No. 3
class RDPG:
    """docstring for RDPG"""
    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations,
                                  [self.pad(i) for i in actions])

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(
            observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "modle.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observation and action
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history, done=True):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Store histories until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def pad(self, seq):
        # Zero-pad a history to a fixed length of 1000 steps so it can be batched
        dim = len(seq[0])
        return seq + [[0] * dim] * (1000 - len(seq))
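
The pad helper above zero-pads variable-length histories to a fixed length so they can be stacked into a batch. A small standalone illustration of the same idea (the length 5 and dimension 2 below are arbitrary values chosen for the illustration):

def pad(seq, max_len=1000):
    # Zero-pad a list of fixed-dimension vectors to max_len entries
    dim = len(seq[0])
    return seq + [[0] * dim] * (max_len - len(seq))

history = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]
padded = pad(history, max_len=5)
# padded == [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0, 0], [0, 0]]
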
Example No. 4
class Worker:
    """docstring for DDPG"""
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')

    def start(self, setting=0):
        self.env = RunEnv(visualize=True)
        self.setting = setting

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(
            self.sess, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess,
                                                     next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(
            self.sess, state_batch)
        q_gradient_batch = self.critic_network.gradients(
            self.sess, state_batch, action_batch_for_gradients)

        self.actor_network.train(self.sess, q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)

    def save_model(self, saver, episode):
        #if self.episode % 10 == 1:
        if self.name == 'worker_0':
            saver.save(self.sess,
                       self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self, state, decay):
        # Select action a_t according to the current policy and exploration noise which gradually vanishes
        action = self.actor_network.action(self.sess, state)
        return action + self.exploration_noise.noise() * decay

    def action(self, state):
        action = self.actor_network.action(self.sess, state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE and self.training:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def work(self, coord, saver):
        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print("Starting worker_" + str(self.number))

        with self.sess.as_default(), self.sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():
                returns = []
                rewards = []
                episode_reward = 0

                if np.random.rand() < 0.9:  # change Aug20: stochastically apply noise
                    noisy = True
                    self.decay -= 1. / self.explore
                else:
                    noisy = False

                self.sess.run(self.update_local_ops_actor)
                self.sess.run(self.update_local_ops_critic)

                state = self.env.reset(difficulty=self.setting)
                #print(observation)
                s = process_frame(state)

                print "episode:", episode_count
                # Train

                for step in xrange(self.env.spec.timestep_limit):
                    state = process_frame(state)
                    if noisy:
                        action = np.clip(
                            self.noise_action(state, np.maximum(self.decay,
                                                                0)), 0.0, 1.0
                        )  # change Aug20, decay noise (no noise after ep>=self.explore)
                    else:
                        action = self.action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    #print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done))
                    next_state = process_frame(next_state)
                    self.perceive(state, action, reward * 100, next_state,
                                  done)
                    state = next_state
                    episode_reward += reward
                    if done:
                        break

                if episode_count % 5 == 0:
                    print("episode reward:", episode_reward)

                # Testing:
                #if episode % 1 == 0:
                if self.name == 'worker_0' and episode_count % 50 == 0 and episode_count > 1:  # change Aug19
                    self.save_model(saver, episode_count)
                    total_return = 0
                    ave_reward = 0
                    for i in range(TEST):
                        state = self.env.reset()
                        reward_per_step = 0
                        for j in range(self.env.spec.timestep_limit):
                            action = self.action(
                                process_frame(state))  # direct action for test
                            state, reward, done, _ = self.env.step(action)
                            total_return += reward
                            # running average of the per-step reward
                            reward_per_step += (reward -
                                                reward_per_step) / (j + 1)
                            if done:
                                break
                        ave_reward += reward_per_step

                    ave_return = total_return / TEST
                    ave_reward = ave_reward / TEST
                    returns.append(ave_return)
                    rewards.append(ave_reward)

                    print('episode:', episode_count,
                          'Evaluation Average Return:', ave_return,
                          'Evaluation Average Reward:', ave_reward)

                if self.name == 'worker_0' and self.training:
                    self.sess.run(self.increment)
                episode_count += 1

            # All done; stop the trial and confirm exit
            print('Done ' + self.name)
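
The worker above synchronizes its local actor and critic with a global network through update_target_graph('global/actor', self.name + '/actor'), which is not defined in the excerpt. A common way such a helper is written in TF1-style code (a sketch, assuming the variables live under the named variable scopes) is:

import tensorflow as tf


def update_target_graph(from_scope, to_scope):
    # Build ops that copy every trainable variable in from_scope onto the
    # variable at the same position in to_scope; run them with sess.run().
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var)
            for from_var, to_var in zip(from_vars, to_vars)]
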
Example No. 5
state_batch = np.random.rand(batch_size, state_dim)
# with tf.Session() as sess:
#     actor = ActorNetwork(sess,state_dim,action_dim,agent_name,1)
#     print(actor.actions(state_batch))
#     actor.update_target()
#     print('\n')
#     print(actor.target_actions(state_batch))
#
#     actor.train(y_grad,state_batch)
#     actor.update_target()
#     print(actor.target_actions(state_batch))

# test create multiple agents
# agents = []
# with tf.Session() as sess:
#     for ii in range(10):
#         agent_name = 'agent'+str(ii)
#         print(agent_name)
#         agents.append(ActorNetwork(sess, state_dim, action_dim, agent_name))
#
#     print(agents)

# test the copy works
with tf.Session() as sess:
    agent1 = ActorNetwork(sess, state_dim, action_dim, 'agent1')
    agent1.train(y_grad, state_batch)

    agent2 = ActorNetwork(sess, state_dim, action_dim, 'agent2', agent1.nets)
    print('agent 1', agent1.actions(state_batch))
    print('agent 2', agent2.actions(state_batch))
Example No. 6
class DDPG:
    """docstring for DDPG"""
    def __init__(self):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # print(minibatch)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        # print(q_value_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("action_batch[0]", file=f)
            print(action_batch[0], file=f)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("q_gradient_batch[0]", file=f)
            print(q_gradient_batch[0], file=f)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action2(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def noise_action(self, state):
        action = self.actor_network.action(state)
        random_action = np.zeros(10, float)
        random_action[random.randint(0, 3)] = 1
        random_action[4] = random.uniform(-100, 100)  #DASH POWER
        random_action[5] = random.uniform(-180, 180)  #DASH DEGREES
        random_action[6] = random.uniform(-180, 180)  #TURN DEGREES
        random_action[7] = random.uniform(-180, 180)  #TACKLE DEGREES
        random_action[8] = random.uniform(0, 100)  #KICK POWER
        random_action[9] = random.uniform(-180, 180)  #KICK DEGREES
        if np.random.uniform() < EPSILON:
            return action
        else:
            return random_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example No. 7
class DDPG(object):
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (
                self.epsilon_expert_range[0] -
                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert,
                                      self.epsilon_expert_range[1])
            self.epsilon_random -= (
                self.epsilon_random_range[0] -
                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random,
                                      self.epsilon_random_range[1])
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0

    def train(self):
        self.time_step = self.time_step + 1
        self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                self.epsilon_expert_range[1]) / EXPLORE_COUNT
        self.epsilon_expert = max(self.epsilon_expert,
                                  self.epsilon_expert_range[1])
        self.epsilon_random -= (self.epsilon_random_range[0] -
                                self.epsilon_random_range[1]) / EXPLORE_COUNT
        self.epsilon_random = max(self.epsilon_random,
                                  self.epsilon_random_range[1])
        logger.debug(
            "step: %d, epsilon_expert: %s, epsilon_random: %s" %
            (self.time_step, self.epsilon_expert, self.epsilon_random))
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
            # if done_batch[i]:
            #     y_batch.append(reward_batch[i])
            # else :
            #     y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_cost = self.critic_network.train(y_batch, state_batch,
                                                     action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    # def noise_action(self,state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = self.exploration_noise.noise(action)
    #     noise_action = action + noise
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    # def noise_action(self,state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = np.zeros(self.action_dim)
    #     noise[0] = self.epsilon * self.OU.function(action[0], 0.5, 1.00, 0.10)
    #     noise[1] = self.epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
    #     noise[2] = self.epsilon * self.OU.function(action[2], 0.5, 1.00, 0.10)
    #     noise_action = action + noise
    #     logger.debug("action: %s, noise: %s" % (action, noise))
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        return action

    def opposite_action(self, state):
        logger.debug("state: %s" % (state))
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        action[0] = 1 - action[0]
        logger.debug("opposite action: %s" % (action))
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # self.time_step = self.time_step + 1

        # Store transitions until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() >= REPLAY_START_SIZE:
            # logger.debug("train...")
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'DDPG')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
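Example No. 7 linearly anneals epsilon_expert and epsilon_random from their starting values toward their floors over EXPLORE_COUNT training steps, both when restoring a checkpoint and once per train() call. The schedule reduces to the small helper below (a sketch; the 1,000,000-step horizon is just an illustrative value, not taken from the original code):

def annealed_epsilon(eps_range, step, explore_count):
    # Linear decay from eps_range[0] to eps_range[1], clamped at the floor
    start, end = eps_range
    return max(start - (start - end) * step / explore_count, end)

# e.g. with the (1.0, 0.1) expert range over 1,000,000 steps:
assert annealed_epsilon((1.0, 0.1), 0, 1_000_000) == 1.0
assert abs(annealed_epsilon((1.0, 0.1), 500_000, 1_000_000) - 0.55) < 1e-9
assert annealed_epsilon((1.0, 0.1), 2_000_000, 1_000_000) == 0.1
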
Example No. 8
File: ddpg.py Project: ivychill/ltr
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")

    def train(self):
        # my_config.logger.debug("......enter tain......")
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise(action)
        # if random.random() <= 0.5:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5])
        # else:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75])
        noise_action = action + noise
        clipped_noise_action = np.clip(noise_action, 0, 1)
        # if (self.time_step < 5):
        #     my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action))
        return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step = self.time_step + 1

        # Store transitions until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        # my_config.logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'ltr')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
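
Every example builds y_batch with an explicit Python loop over the minibatch. The same Bellman target, y_i = r_i + GAMMA * (1 - done_i) * Q'(s_{i+1}, mu'(s_{i+1})), can be computed in one vectorized NumPy expression; the sketch below uses small synthetic arrays and assumes the target critic output has shape [N, 1]:

import numpy as np

GAMMA = 0.99
# Illustrative batch of 4 transitions
reward_batch = np.array([1.0, 0.5, -0.2, 0.0])
done_batch = np.array([False, False, True, False])
q_value_batch = np.array([[2.0], [1.5], [3.0], [0.7]])  # target critic output, shape [N, 1]

# y_i = r_i + GAMMA * Q'_i for non-terminal transitions, y_i = r_i otherwise
y_batch = reward_batch[:, None] + GAMMA * (
    1.0 - done_batch[:, None].astype(np.float32)) * q_value_batch
# y_batch already has shape [N, 1], so no np.resize is needed
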
Example No. 9
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        action = action + noise_t
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        #print(a_linear, a_angular)

        return [a_linear, a_angular]

    def action(self, state):
        action = self.actor_network.action(state)
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)

        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Store transitions until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step
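
All of these agents finish a training step by calling actor_network.update_target() and critic_network.update_target(). In DDPG those calls usually perform a soft update of the target parameters, theta_target <- tau * theta + (1 - tau) * theta_target. A minimal NumPy illustration of that rule (tau = 0.001 is the value used in the original DDPG paper; the parameter lists here are placeholders):

import numpy as np

TAU = 0.001  # soft-update rate


def soft_update(online_params, target_params, tau=TAU):
    # Move each target parameter a small step toward its online counterpart
    return [tau * w + (1.0 - tau) * w_target
            for w, w_target in zip(online_params, target_params)]


# toy example with two "layers"
online = [np.ones((2, 2)), np.zeros(2)]
target = [np.zeros((2, 2)), np.ones(2)]
target = soft_update(online, target)  # target weights drift slightly toward the online weights
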
Example No. 10
class ddpg:
    def __init__(self, env_name, sess, state_dim, action_dim, models_dir,
                 img_dim):
        self.name = 'DDPG'
        self.env_name = env_name
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.img_dim = img_dim
        self.models_dir = models_dir

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = sess

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.img_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.img_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        self.saver = tf.train.Saver()

    def train(self):
        minibatch = self.replay_buffer.getBatch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        img_batch = np.asarray([data[5] for data in minibatch])
        next_img_batch = np.asarray([data[6] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(
            next_state_batch, next_img_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch,
                                                     next_img_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        critic_cost = self.critic_network.train(y_batch, state_batch,
                                                action_batch, img_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(
            state_batch, img_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients, img_batch)

        self.actor_network.train(q_gradient_batch, state_batch, img_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost

    def save_network(self, step):
        self.saver.save(self.sess,
                        self.models_dir + self.env_name + '-network-ddpg.ckpt',
                        global_step=step)

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    '''
    def action(self,state):

        action = self.actor_network.action(state)

        action[0][0] = np.clip( action[0][0], -1 , 1 )
        action[0][1] = np.clip( action[0][1], 0 , 1 )
        action[0][2] = np.clip( action[0][2], 0 , 1 )
        #print "Action:", action
        return action[0]

    def noise_action(self,state,epsilon):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        print action.shape
        print "Action_No_Noise:", action
        noise_t = np.zeros([1,self.action_dim])
        noise_t[0][0] = epsilon * self.OU.function(action[0][0],  0.0 , 0.60, 0.80)
        noise_t[0][1] = epsilon * self.OU.function(action[0][1],  0.5 , 1.00, 0.10)
        noise_t[0][2] = epsilon * self.OU.function(action[0][2], -0.1 , 1.00, 0.05)
        
        action = action+noise_t
        action[0][0] = np.clip( action[0][0], -1 , 1 )
        action[0][1] = np.clip( action[0][1], 0 , 1 )
        action[0][2] = np.clip( action[0][2], 0 , 1 )
        
        print "Action_Noise:", action
        return action[0]
    '''

    def action(self, state, img):
        action = self.actor_network.action(state, img)

        action[0] = np.clip(action[0], -1, 1)
        # action[1] = np.clip( action[1], 0 , 1 )
        # action[2] = np.clip( action[2], 0 , 1 )

        return action

    def noise_action(self, state, epsilon, img):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state, img)
        noise_t = np.zeros(self.action_dim)

        if self.time_step < 100000:
            noise_t[0] = epsilon * ornstein_uhlenbeck_process(
                action[0], 0.0, 0.60, 0.80)
            # noise_t[1] = epsilon * ornstein_uhlenbeck_process(action[1],  0.5 , 1.00, 0.10)
            # noise_t[2] = epsilon * ornstein_uhlenbeck_process(action[2], -0.1 , 1.00, 0.05)
        elif self.time_step < 200000:
            if np.random.random() < 0.1:
                noise_t[0] = 0.1 * ornstein_uhlenbeck_process(
                    action[0], 0.0, 0.60, 0.80)

        action = action + noise_t
        action[0] = np.clip(action[0], -1, 1)
        # action[1] = np.clip( action[1], 0 , 1)
        # action[2] = np.clip( action[2], 0 , 1)

        return action

    def perceive(self, state, action, reward, next_state, done, img, next_img):
        if not (math.isnan(reward)):
            self.replay_buffer.add(state, action, reward, next_state, done,
                                   img, next_img)
        self.time_step = self.time_step + 1

        # Return critic cost
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            return self.train()
        else:
            return 0
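
Each example draws minibatches from a ReplayBuffer through get_batch/getBatch, count and add, and Example No. 10 extends the stored tuple with img and next_img. A minimal buffer compatible with that interface might look like the following sketch (the method names mirror how the buffer is called above; everything else is an assumption):

import random
from collections import deque


class ReplayBuffer:
    """Sketch of a fixed-size experience replay buffer."""
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, *experience):
        # Store one transition tuple, e.g. (state, action, reward, next_state, done[, img, next_img])
        self.buffer.append(experience)

    def get_batch(self, batch_size):
        # Uniformly sample a minibatch (or the whole buffer while it is still small)
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    getBatch = get_batch  # alias matching the spelling used in Examples No. 2 and No. 10

    def count(self):
        return len(self.buffer)
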
Example No. 11
class DDPG_TF:
    """docstring for DDPG"""
    def __init__(self, env, loadfilename=None, printVars=False):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer (used by train() and perceive() below)
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        #print 'init complete'
        self.all_vars = tf.global_variables()
        if printVars:
            for v in self.all_vars:
                print(v.name.ljust(30), v.shape)

        self.saver = tf.train.Saver(self.all_vars)
        if loadfilename is not None:
            self.saver.restore(self.sess, loadfilename)
            #print 'restore complete'

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def actions(self, states):
        actions = self.actor_network.actions_no_training(states)
        return actions

    def target_actions(self, states):
        actions = self.actor_network.target_actions(states)
        return actions

    def value(self, states):
        actions = self.actor_network.actions_no_training(states)
        values = self.critic_network.q_value(states, actions)
        return values

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example No. 12
class DDPG:
    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma,
                 tau, batch_size, n_train, n_episode):
        # Gym environment
        self.env = env

        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        #self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get replay buffer and function get a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        batch = self.sample_batch(self.batch_size)

        state_batch = np.asarray([data[0] for data in batch])
        action_batch = np.asarray([data[1] for data in batch])
        reward_batch = np.asarray([data[2] for data in batch])
        next_state_batch = np.asarray([data[3] for data in batch])
        done_batch = np.asarray([data[4] for data in batch])

        next_action_batch = self.actor.target_actions(next_state_batch)
        q_value_batch = self.critic.target_q(next_state_batch,
                                             next_action_batch)
        y_batch = []
        for i in range(len(batch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + self.gamma * q_value_batch[i])
        y_batch = np.resize(y_batch, [self.batch_size, 1])
        # Update critic by minimizing the loss L
        self.critic.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor.actions(state_batch)
        q_gradient_batch = self.critic.gradients(state_batch,
                                                 action_batch_for_gradients)

        self.actor.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor.update_target()
        self.critic.update_target()

    def noise_action(self, state):
        action = self.actor.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        return self.actor.action(state)

    def reset_noise(self):
        self.exploration_noise.reset()

    def save_policy(self, save_path):
        self.actor.save_network(save_path)
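
Example No. 12 leaves the episode loop to the caller: something has to fill the replay buffer, call train() repeatedly, and reset the noise between episodes. A hedged sketch of how such a driver could look, assuming the ReplayBuffer sketch shown after Example No. 10 (the environment name, hyperparameter values, and loop structure below are placeholders, not part of the original code):

import gym
import numpy as np

# Placeholder setup; DDPG is the class from Example No. 12, ReplayBuffer is the sketch above
env = gym.make('FetchReach-v1')          # any goal-based Gym environment with dict observations
replay_buffer = ReplayBuffer(1000000)
sample_batch = replay_buffer.get_batch

agent = DDPG(env, replay_buffer, sample_batch,
             train_iter=40, gamma=0.98, tau=0.05,
             batch_size=128, n_train=50, n_episode=1000)

for episode in range(agent.n_episode):
    obs = env.reset()
    agent.reset_noise()
    done = False
    while not done:
        # Flatten the dict observation in the same key order the agent uses internally
        state = np.concatenate([obs['observation'],
                                obs['achieved_goal'],
                                obs['desired_goal']])
        action = agent.noise_action(state)
        next_obs, reward, done, _ = env.step(action)
        next_state = np.concatenate([next_obs['observation'],
                                     next_obs['achieved_goal'],
                                     next_obs['desired_goal']])
        replay_buffer.add(state, action, reward, next_state, done)
        obs = next_obs
    # A few gradient steps per episode
    for _ in range(agent.n_train):
        agent.train()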