Python ActorNetwork.train示例

class DDPG:
    """DDPG agent: actor/critic networks with targets, replay buffer and logging.

    States are fed through placeholders of shape
    ``[None, state_dim, state_dim, state_channel]`` and actions through
    ``[None, action_dim]``.  Checkpoints and TensorBoard summaries are written
    to a ``models_ddpg`` directory located next to this source file.
    """

    def __init__(self, state_dim, state_channel, action_dim):
        """Build the graph, initialise variables and restore any checkpoint.

        Args:
            state_dim: Size used for both spatial axes of the state placeholder.
            state_channel: Size of the last axis of the state placeholder.
            action_dim: Number of action components.
        """
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # Online networks: the critic is wired to the actor's action output.
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # Target networks share the same wiring but read the target state input.
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # Training ops: the actor's training method is built from the critic's Q output.
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # Summary placeholders so plain Python scalars can be logged.
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        # Resume the step counter and the weights from the latest checkpoint, if any.
        self.load_time_step()
        self.load_network()

    def train(self):
        """Run one actor update and one critic update on a sampled minibatch."""
        action_dim = self.action_dim

        # Sample BATCH_SIZE transitions: (state, action, reward, next_state, done).
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # With action_dim == 1 each stored action may be a bare number, so force
        # a 2-D [BATCH_SIZE, action_dim] layout.
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # Bootstrap targets from the target networks.
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                # Terminal transition: no bootstrapped future value.
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # The actor is trained first, then the critic, then both targets are updated.
        self.actor_network.train(state_batch)
        self.critic_network.train(y_batch, state_batch, action_batch)

        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        """Return the actor's action for ``state`` plus exploration noise."""
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        """Return the actor's action for ``state`` without exploration noise."""
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        """Write the reward and living-time summaries at the current time step."""
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)

    def perceive(self, state, action, reward, next_state, done):
        """Store one transition, then log, train and checkpoint as scheduled.

        Summaries are written every 1000 steps and at episode end, training
        starts once the buffer holds more than REPLAY_START_SIZE transitions,
        and a checkpoint is saved every 100000 steps.
        """
        self.replay_buffer.add(state, action, reward, next_state, done)
        # 0.0 marks "no episode timed yet"; start the clock on first use.
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()

        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print('===============reset noise=========================')
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1

    def load_time_step(self):
        """Resume ``self.time_step`` from the highest step in checkpoint file names.

        File names are expected to end in ``-<step>``, optionally followed by
        an ``.index`` or ``.data-XXXXX-of-YYYYY`` suffix.  Names containing
        ``meta``, names without a dash and names whose step part is not a
        plain integer are ignored instead of raising ``ValueError``.
        """
        if not os.path.exists(self.dir_path):
            return
        step_list = []
        for filename in os.listdir(self.dir_path):
            if ('meta' in filename) or ('-' not in filename):
                continue
            # Strip known checkpoint suffixes so the step is the last dash field.
            stem = filename
            if stem.endswith('.index'):
                stem = stem[:-len('.index')]
            elif '.data-' in stem:
                stem = stem.split('.data-')[0]
            step_text = stem.split('-')[-1]
            if step_text.isdigit():
                step_list.append(int(step_text))
        if not step_list:
            return
        self.time_step = max(step_list) + 1

    def load_network(self):
        """Restore weights from the latest checkpoint in ``self.dir_path``, if any."""
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            # Single-argument print keeps the output identical on Python 2 and 3.
            print('Successfully loaded: %s' % checkpoint.model_checkpoint_path)
        else:
            print('Could not find old network weights')

    def save_network(self):
        """Save a checkpoint named ``ddpg-<time_step>`` in ``self.dir_path``."""
        print('save actor-critic network... %s' % self.time_step)
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)

示例#2

显示文件

class DDPG:
    """DDPG agent with actor/critic networks, a replay buffer and OU noise.

    Actions are treated as three components that are clipped to
    ``[-1, 1]``, ``[0, 1]`` and ``[0, 1]`` respectively before being returned.
    """

    # (low, high) clipping bounds applied to action components 0, 1 and 2.
    _ACTION_BOUNDS = ((-1, 1), (0, 1), (0, 1))

    def __init__(self, env_name, state_dim, action_dim):
        """Create networks, buffer and noise process, then try to restore weights.

        Args:
            env_name: Name used as part of the checkpoint file prefix.
            state_dim: State size handed to both networks.
            action_dim: Action size handed to both networks.
        """
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.time_step = 0

        self.sess = tf.InteractiveSession()

        # Actor and critic each receive the shared session and both dimensions.
        self.actor_network = ActorNetwork(
            self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(
            self.sess, self.state_dim, self.action_dim)

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Random process object used to perturb actions during exploration.
        self.OU = OU()

        # Restore previously saved weights when a checkpoint is available.
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if not (checkpoint and checkpoint.model_checkpoint_path):
            print("Could not find old network weights")
            return
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)

    @staticmethod
    def _clip_action(action):
        """Clip the first three components of ``action`` in place and return it."""
        for idx, (low, high) in enumerate(DDPG._ACTION_BOUNDS):
            action[idx] = np.clip(action[idx], low, high)
        return action

    def train(self):
        """Update critic, then actor, then both target networks on one minibatch."""
        transitions = self.replay_buffer.getBatch(BATCH_SIZE)

        # Split the sampled transitions column-wise:
        # (state, action, reward, next_state, done).
        states = np.asarray([item[0] for item in transitions])
        actions = np.asarray([item[1] for item in transitions])
        rewards = np.asarray([item[2] for item in transitions])
        next_states = np.asarray([item[3] for item in transitions])
        dones = np.asarray([item[4] for item in transitions])

        # Force a 2-D layout so action_dim == 1 is handled as well.
        actions = np.resize(actions, [BATCH_SIZE, self.action_dim])

        # Targets: reward alone for terminal transitions, otherwise reward plus
        # the discounted target-network Q value of the next state.
        target_actions = self.actor_network.target_actions(next_states)
        target_q = self.critic_network.target_q(next_states, target_actions)
        targets = [
            rewards[k] if dones[k] else rewards[k] + GAMMA * target_q[k]
            for k in range(len(transitions))
        ]
        targets = np.resize(targets, [BATCH_SIZE, 1])

        self.critic_network.train(targets, states, actions)

        # Actor update from the critic's gradients at the actor's own actions.
        policy_actions = self.actor_network.actions(states)
        q_gradients = self.critic_network.gradients(states, policy_actions)
        self.actor_network.train(q_gradients, states)

        self.actor_network.update_target()
        self.critic_network.update_target()

    def saveNetwork(self):
        """Save a checkpoint whose prefix includes the environment name."""
        prefix = save_location + self.env_name + 'network' + '-ddpg'
        self.saver.save(self.sess, prefix, global_step=self.time_step)

    def action(self, state):
        """Return the clipped actor action for ``state`` (no exploration noise)."""
        return self._clip_action(self.actor_network.action(state))

    def noise_action(self, state, epsilon):
        """Return the actor action plus ``epsilon``-scaled OU noise, clipped.

        With probability 0.01 the noise on component 2 is re-drawn with a
        different parameter set and a "Stochastic brake" banner is printed.
        """
        action = self.actor_network.action(state)

        # The three trailing numbers are passed to OU.function unchanged;
        # presumably (mu, theta, sigma) -- TODO confirm against OU.
        ou = self.OU.function
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * ou(action[0], 0.0, 0.60, 0.80)
        noise_t[1] = epsilon * ou(action[1], 0.5, 1.00, 0.10)
        noise_t[2] = epsilon * ou(action[2], -0.1, 1.00, 0.05)

        if random.random() <= 0.01:  # 0.1
            print("********Stochastic brake***********")
            noise_t[2] = epsilon * ou(action[2], 0.2, 1.00, 0.10)

        return self._clip_action(action + noise_t)

    def perceive(self, state, action, reward, next_state, done):
        """Store the transition (unless its reward is NaN) and train when ready."""
        reward_is_valid = not math.isnan(reward)
        if reward_is_valid:
            self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step += 1

        # Training only starts once the buffer exceeds the warm-up size.
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

示例#3

显示文件

文件： ddpg.py 项目： Ivehui/DDPG

class DDPG:
    """DDPG agent driven through set_init_observation/get_action/set_feedback.

    The agent keeps the current state itself: callers hand in the first
    observation, ask for an action, and report the outcome of each step.
    """

    def __init__(self, environment):
        """Create actor/critic networks, replay buffer and exploration noise.

        Args:
            environment: Object exposing ``observation_space.shape`` and
                ``action_space`` with ``shape``, ``low`` and ``high``.
        """
        self.name = 'DDPG' # name for uploading results
        self.environment = environment
        # Both networks are sized from the environment's spaces.
        self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        # Replay buffer of (state, action, reward, next_state, done) tuples.
        self.replay_buffer = deque()

        # Random process used to perturb actions for exploration.
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        self.time_step = 0

    def set_init_observation(self,observation):
        """Record the initial observation as the current state."""
        self.state = observation

    def train(self):
        """Update critic, actor and both target networks on one minibatch."""
        # Same action size the networks were built with; the previous
        # hard-coded width of 1 truncated multi-dimensional actions.
        action_dim = self.environment.action_space.shape[0]

        # Sample BATCH_SIZE transitions from the replay buffer.
        minibatch = random.sample(self.replay_buffer,BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        # Force a 2-D [BATCH_SIZE, action_dim] layout (also covers scalar actions).
        action_batch = np.resize(action_batch,[BATCH_SIZE,action_dim])

        # Targets: reward alone for terminal transitions, otherwise reward plus
        # the discounted target-network Q value of the next state.
        y_batch = []
        next_action_batch = self.actor_network.target_evaluate(next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch)
        for i in range(0,BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic towards the targets.
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor with the critic's gradients, divided by the batch size.
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks.
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        """Return a noisy action for the current state, clipped to the action bounds."""
        action = self.actor_network.get_action(self.state)
        return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high)

    def set_feedback(self,observation,action,reward,done):
        """Store the transition, advance the state and train/save as scheduled.

        Training runs on every call once ``time_step`` exceeds
        REPLAY_START_SIZE; both networks are saved every 10000 steps.
        """
        # Store transition (s_t, a_t, r_t, s_{t+1}, done) in the replay buffer.
        next_state = observation
        self.replay_buffer.append((self.state,action,reward,next_state,done))
        self.state = next_state
        self.time_step += 1

        # Drop the oldest transition once the buffer is over capacity.
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        if self.time_step >  REPLAY_START_SIZE:
            self.train()

        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialise the random process when an episode ends.
        if done:
            self.exploration_noise.reset()

示例#4

显示文件

文件： ddpg.py 项目： titi2338432/RDPG-Biped

class DDPG:
    """Recurrent DDPG agent trained on traces sampled from a replay buffer.

    The actor and critic are driven with explicit initial hidden states
    (pairs of arrays sized by each network's ``rnn_size``), and training
    optimises at most ``OPT_LENGTH`` trailing steps of each sampled trace.
    """

    def __init__(self, env, DIRECTORY):
        """Create networks, replay buffer and exploration noise.

        Args:
            env: Environment exposing ``observation_space``, ``action_space``
                and ``spec.timestep_limit``.
            DIRECTORY: Passed through to the networks and the replay buffer.
        """
        self.batch_size = BATCH_SIZE
        self.replay_start_size = REPLAY_START_SIZE

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.trace_length = TRACE_LENGTH
        self.temp_abstract = TEMP_ABSTRACT
        self.actor_network = ActorNetwork(self.sess, BATCH_SIZE,
                                          self.state_dim, self.action_dim,
                                          self.temp_abstract, DIRECTORY)
        self.critic_network = CriticNetwork(self.sess, BATCH_SIZE,
                                            self.state_dim, self.action_dim,
                                            self.temp_abstract, DIRECTORY)

        # The buffer is told the longest possible trajectory length.
        max_len_trajectory = self.environment.spec.timestep_limit + 1
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                          max_len_trajectory,
                                          self.actor_network.last_epi)

        # Random process used to perturb actions for exploration.
        self.exploration_noise = OUNoise(self.action_dim)

        self.diff = 0.
        # Cache of discounting matrices keyed by update length.
        self.discounting_mat_dict = {}

    def state_initialiser(self, shape, mode='g'):
        """Return an initial hidden-state array of the given ``shape``.

        Args:
            shape: Array shape; ``shape[1]`` sets the scale in Gaussian mode.
            mode: ``'z'`` for zeros, ``'g'`` for zero-mean Gaussian samples
                with standard deviation ``1 / shape[1]``.

        Raises:
            NotImplementedError: For any other ``mode``.
        """
        if mode == 'z':  # Zero
            initial = np.zeros(shape=shape)
        elif mode == 'g':  # Gaussian
            initial = np.random.normal(loc=0.,
                                       scale=1. / float(shape[1]),
                                       size=shape)
        else:  # An adaptive initialiser may be added later.
            raise NotImplementedError
        return initial

    def train(self, time_step):
        """Run one critic update and one actor update on a sampled trace batch.

        Args:
            time_step: Forwarded to ``replay_buffer.get_batch``.

        Returns:
            Tuple ``(actor_grad, critic_grad)`` holding whatever the actor's
            and critic's ``train`` methods return.

        Side effects:
            ``self.dt_list`` is overwritten with the per-stage elapsed times
            of this call, and ``self.discounting_mat_dict`` may gain an entry.
        """
        ### 1) Get batch data for optimisation.
        minibatch, trace_length = self.replay_buffer.get_batch(
            self.batch_size, self.trace_length, time_step)
        try:
            # Column indices of each stored step as read here:
            # 2 = state, 3 = action, 4 = reward, 6 = next state, 7 = done.
            state_trace_batch = np.stack(minibatch[:, :, 2].ravel()).reshape(
                self.batch_size, trace_length, self.state_dim)
            action_trace_batch = np.stack(minibatch[:, :, 3].ravel()).reshape(
                self.batch_size, trace_length, self.action_dim)

            # Only the next state of the final step is needed; it is appended
            # to the state trace to form the "next state" trace.
            next_state_batch = np.stack(minibatch[:, -1, 6].ravel()).reshape(
                self.batch_size, 1, self.state_dim)
            next_state_trace_batch = np.concatenate(
                [state_trace_batch, next_state_batch], axis=1)

            reward_trace_batch = np.stack(minibatch[:, :, 4].ravel()).reshape(
                self.batch_size, trace_length, 1)
            done_trace_batch = np.stack(minibatch[:, :, 7].ravel()).reshape(
                self.batch_size, trace_length, 1)

        except Exception as e:
            print(str(e))
            raise

        ### 2) Explicit zero initial hidden states for both networks (not the
        ### most efficient approach, but it avoids tf's None-type zero-state
        ### problem).
        init_actor_hidden1_cORm_batch = self.state_initialiser(
            shape=(self.batch_size, self.actor_network.rnn_size), mode='z')
        actor_init_h_batch = (
            init_actor_hidden1_cORm_batch, init_actor_hidden1_cORm_batch
        )

        init_critic_hidden1_cORm_batch = self.state_initialiser(
            shape=(self.batch_size, self.critic_network.rnn_size), mode='z')
        critic_init_h_batch = (
            init_critic_hidden1_cORm_batch, init_critic_hidden1_cORm_batch
        )

        # Timing bookkeeping: the last slot holds the start time, so
        # "now - sum(dt_list)" is the time elapsed since the previous mark.
        self.dt_list = np.zeros(shape=(15, ))
        self.dt_list[-1] = time.time()
        if trace_length <= OPT_LENGTH:
            target_actor_init_h_batch = actor_init_h_batch
            target_critic_init_h_batch = critic_init_h_batch
        else:
            # Run the networks over the leading part of the trace to obtain
            # the hidden state at the start of the optimised window.
            actor_init_h_batch = self.actor_network.action(
                state_trace_batch[:, :-OPT_LENGTH, :],
                actor_init_h_batch,
                mode=1)
            target_actor_init_h_batch = actor_init_h_batch
            critic_init_h_batch = self.critic_network.evaluation(
                state_trace_batch[:, :-OPT_LENGTH, :],
                action_trace_batch[:, :-OPT_LENGTH, :],
                critic_init_h_batch,
                mode=1)
            target_critic_init_h_batch = critic_init_h_batch

            # Keep only the trailing OPT_LENGTH steps (OPT_LENGTH + 1 for the
            # next-state trace).
            state_trace_batch = state_trace_batch[:, -OPT_LENGTH:, :]
            next_state_trace_batch = next_state_trace_batch[:, -(OPT_LENGTH +
                                                                 1):, :]
            action_trace_batch = action_trace_batch[:, -OPT_LENGTH:, :]
            reward_trace_batch = reward_trace_batch[:, -OPT_LENGTH:, :]
            done_trace_batch = done_trace_batch[:, -OPT_LENGTH:, :]
        self.dt_list[0] = time.time() - np.sum(self.dt_list)

        ### 3) Obtain target output.
        next_action_batch = self.actor_network.target_action(
            next_state_trace_batch,
            init_temporal_hidden_cm_batch=target_actor_init_h_batch)
        self.dt_list[1] = time.time() - np.sum(self.dt_list)
        next_action_trace_batch = np.concatenate(
            [action_trace_batch,
             np.expand_dims(next_action_batch, axis=1)],
            axis=1)
        self.dt_list[2] = time.time() - np.sum(self.dt_list)
        target_lastQ_batch = self.critic_network.target_q_trace(
            next_state_trace_batch,
            next_action_trace_batch,
            init_temporal_hidden_cm_batch=target_critic_init_h_batch)
        self.dt_list[3] = time.time() - np.sum(self.dt_list)

        # Number of trailing time steps that receive a training target.
        if trace_length <= OPT_LENGTH:
            update_length = np.minimum(trace_length, OPT_LENGTH // 1)
        else:
            update_length = OPT_LENGTH // 1

        # Zero the bootstrapped value when the last step of the trace is done.
        target_lastQ_batch_masked = target_lastQ_batch * (
            1. - done_trace_batch[:, -1])
        # Per-sample row: [r_0, ..., r_{update_length - 1}, Q_last].
        rQ = np.concatenate([
            np.squeeze(reward_trace_batch[:, -update_length:], axis=-1),
            target_lastQ_batch_masked
        ],
                            axis=1)
        self.dt_list[4] = time.time() - np.sum(self.dt_list)

        try:
            discounting_mat = self.discounting_mat_dict[update_length]
        except KeyError:
            # Before the transpose, row i holds GAMMA**0, GAMMA**1, ... from
            # column i onward, so rQ @ discounting_mat gives the discounted
            # return from step i.  The builtin float replaces the np.float
            # alias, which recent NumPy releases no longer provide.
            discounting_mat = np.zeros(shape=(update_length,
                                              update_length + 1),
                                       dtype=float)
            for i in range(update_length):
                discounting_mat[i,
                                i:] = GAMMA**np.arange(0.,
                                                       -i + update_length + 1)
            discounting_mat = np.transpose(discounting_mat)
            self.discounting_mat_dict[update_length] = discounting_mat
        try:
            y_trace_batch = np.expand_dims(np.matmul(rQ, discounting_mat),
                                           axis=-1)
        except Exception:
            print('?')
            raise
        self.dt_list[5] = time.time() - np.sum(self.dt_list)

        ### 4) Train critic on the discounted targets.
        critic_grad = self.critic_network.train(
            y_trace_batch,
            update_length,
            state_trace_batch,
            action_trace_batch,
            init_temporal_hidden_cm_batch=critic_init_h_batch)
        self.dt_list[6] = time.time() - np.sum(self.dt_list)

        ### 5) Train actor.  Step through the window one time step at a time,
        ### collecting the hidden state that precedes each step and the
        ### actor's action for it.
        for i in range(update_length):
            actor_init_h_batch_trace = (np.expand_dims(actor_init_h_batch[0],
                                                       axis=1),
                                        np.expand_dims(actor_init_h_batch[1],
                                                       axis=1))
            critic_init_h_batch_trace = (np.expand_dims(critic_init_h_batch[0],
                                                        axis=1),
                                         np.expand_dims(critic_init_h_batch[1],
                                                        axis=1))
            if i == 0:
                actor_init_h_batch_stack = actor_init_h_batch_trace
                critic_init_h_batch_stack = critic_init_h_batch_trace
            else:
                actor_init_h_batch_stack = (np.concatenate(
                    (actor_init_h_batch_stack[0], actor_init_h_batch_trace[0]),
                    axis=1),
                                            np.concatenate(
                                                (actor_init_h_batch_stack[1],
                                                 actor_init_h_batch_trace[1]),
                                                axis=1))
                critic_init_h_batch_stack = (
                    np.concatenate((critic_init_h_batch_stack[0],
                                    critic_init_h_batch_trace[0]),
                                   axis=1),
                    np.concatenate((critic_init_h_batch_stack[1],
                                    critic_init_h_batch_trace[1]),
                                   axis=1))
            # Advance both networks by one step; the returned hidden states
            # seed the next iteration.
            action_trace_batch_for_gradients, actor_init_h_batch = self.actor_network.action_trace(
                np.expand_dims(state_trace_batch[:, i], 1),
                init_temporal_hidden_cm_batch=actor_init_h_batch)
            critic_init_h_batch = self.critic_network.evaluation_trace(
                np.expand_dims(state_trace_batch[:, i], 1),
                np.expand_dims(action_trace_batch[:, i], 1),
                init_temporal_hidden_cm_batch=critic_init_h_batch)
            if i == 0:
                action_trace_batch_for_gradients_stack = action_trace_batch_for_gradients
            else:
                action_trace_batch_for_gradients_stack = np.concatenate(
                    (action_trace_batch_for_gradients_stack,
                     action_trace_batch_for_gradients),
                    axis=1)

        self.dt_list[7] = time.time() - np.sum(self.dt_list)
        # Fold the time axis into the batch axis so every step becomes an
        # independent length-1 trace with its own initial hidden state.
        state_trace_batch_stack = np.reshape(
            state_trace_batch,
            (self.batch_size * update_length, 1, self.state_dim))
        action_trace_batch_stack = np.reshape(
            action_trace_batch,
            (self.batch_size * update_length, 1, self.action_dim))
        action_trace_batch_for_gradients_stack = np.reshape(
            action_trace_batch_for_gradients_stack,
            (self.batch_size * update_length, 1, self.action_dim))
        actor_init_h_batch_stack = (np.reshape(
            actor_init_h_batch_stack[0],
            (self.batch_size * update_length, self.actor_network.rnn_size)),
                                    np.reshape(
                                        actor_init_h_batch_stack[1],
                                        (self.batch_size * update_length,
                                         self.actor_network.rnn_size)))
        critic_init_h_batch_stack = (np.reshape(
            critic_init_h_batch_stack[0],
            (self.batch_size * update_length, self.critic_network.rnn_size)),
                                     np.reshape(
                                         critic_init_h_batch_stack[1],
                                         (self.batch_size * update_length,
                                          self.critic_network.rnn_size)))

        q_gradient_trace_batch = self.critic_network.gradients(
            1,
            state_trace_batch_stack,
            action_trace_batch_for_gradients_stack,
            init_temporal_hidden_cm_batch=critic_init_h_batch_stack)
        self.dt_list[8] = time.time() - np.sum(self.dt_list)

        # Update the actor policy using the sampled gradient.
        actor_grad = self.actor_network.train(
            q_gradient_trace_batch,
            1,
            state_trace_batch_stack,
            action_trace_batch_stack,
            init_temporal_hidden_cm_batch=actor_init_h_batch_stack)
        self.dt_list[9] = time.time() - np.sum(self.dt_list)

        # Target-network updates and diff queries are not called from here;
        # the timing slots are still filled so dt_list keeps its layout.
        self.dt_list[10] = time.time() - np.sum(self.dt_list)
        self.dt_list[11] = time.time() - np.sum(self.dt_list)
        self.dt_list[12] = time.time() - np.sum(self.dt_list)
        self.dt_list[13] = time.time() - np.sum(self.dt_list)

        # Drop the start-time slot so only elapsed times remain.
        self.dt_list = np.delete(self.dt_list, -1)
        return actor_grad, critic_grad

    def action(self, state_trace, init_hidden_cm, epi, noisy=True):
        """Return ``(action, last_hidden_cm)`` for a single state trace.

        Args:
            state_trace: Trace of states; wrapped in a list before the call.
            init_hidden_cm: Initial hidden state passed to the actor.
            epi: Unused by this method.
            noisy: When true, exploration noise is added to the action.
        """
        action, last_hidden_cm = self.actor_network.action([state_trace],
                                                           init_hidden_cm,
                                                           mode=2)
        if noisy:
            noise = self.exploration_noise.noise()
            return action + noise, last_hidden_cm
        else:
            return action, last_hidden_cm

    def evaluation(self, state_trace, action_trace, action_last,
                   init_hidden_cm):
        """Return the critic's ``evaluation`` result (mode 2) for one trace."""
        return self.critic_network.evaluation([state_trace], [action_trace],
                                              action_last,
                                              init_hidden_cm,
                                              mode=2)

    def perceive(self, state, action, reward, next_state, done, time_step,
                 epi):
        """Store one transition and train once the buffer is warm.

        Sets ``self.actor_grad`` and ``self.critic_grad`` to the values
        returned by ``train`` or to ``0.`` when no training happened, and
        resets the exploration noise when ``done`` is truthy.
        """
        # The buffer stores the done flag as a float.
        done = float(done)
        self.replay_buffer.add(state, action, reward, next_state, done, epi,
                               time_step)

        # Training only starts once the buffer exceeds the warm-up size.
        if (self.replay_buffer.num_experiences > REPLAY_START_SIZE):
            self.actor_grad, self.critic_grad = self.train(time_step)
        else:
            # Not trained on this step, so report zero gradients.
            self.actor_grad = 0.
            self.critic_grad = 0.

        # Re-initialise the random process when an episode ends.
        if done:
            self.exploration_noise.reset()

示例#5

显示文件

文件： ddpg.py 项目： ZhichenML/IPPS

def run_ddpg(amodel,
             cmodel,
             train_indicator=0,
             seeded=1337,
             track_name='practgt2.xml'):
    """Train or evaluate a DDPG agent in a TORCS environment.

    Args:
        amodel: Path of the actor weight file.  Only read when
            ``train_indicator`` is falsy; it is then loaded into both the
            actor model and its target model.
        cmodel: Path of the critic weight file, handled like ``amodel``.
        train_indicator: Truthy to train (600 episodes, exploration noise and
            network updates enabled); falsy to run 3 episodes with the loaded
            weights, no noise and no network updates.
        seeded: Seed handed to ``np.random.seed``.
        track_name: Track file name forwarded to ``TorcsEnv``.

    Returns:
        None.  Progress is printed and logged; while training, weights are
        written to ``ddpg_actor_weights_periodic.h5`` and
        ``ddpg_critic_weights_periodic.h5``.

    If the weights cannot be loaded in evaluation mode, a message is printed
    and the interpreter exits.
    """
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    episode_count = 600 if train_indicator else 3
    max_steps = 20000
    step = 0
    epsilon = 1

    def _state_from(obs):
        """Flatten a TORCS observation into the vector fed to the networks."""
        return np.hstack((obs.speedX, obs.angle, obs.trackPos, obs.speedY,
                          obs.speedZ, obs.rpm, obs.wheelSpinVel / 100.0,
                          obs.track))

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   gear_change=False,
                   track_name=track_name)

    if not train_indicator:
        # Evaluation needs pre-trained weights in both online and target nets.
        print("Now we load the weight")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
        except Exception:
            # Deliberately not a bare ``except``: Ctrl-C and SystemExit must
            # propagate instead of being reported as a missing weight file.
            print("Cannot find the weight")
            exit()
        else:
            print("Weight load successfully")

    print("TORCS Experiment Start.")
    # Sentinel upper bound; replaced by the fastest positive lap time seen.
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " +
              str(buff.count()))
        if np.mod(i_episode, 3) == 0:
            # relaunch TORCS every 3 episode because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = _state_from(ob)

        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # Exploration noise is scaled by the decaying epsilon and is
            # switched off entirely when train_indicator is 0.
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = _state_from(ob)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # NOTE(review): the target buffer is seeded from the sampled
            # actions, so it has one column per action dimension and each
            # row is overwritten below.  Presumably this matches the critic's
            # output width -- confirm against CriticNetwork.
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t, "Reward",
                  r_t, "Loss", loss)

            # Lap statistics are only sampled every 1000 global steps.
            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
                # A lap time of 0 means no lap has been completed yet; the
                # best lap is the smallest positive time observed.
                if 0 < ob.lastLapTime < best_lap:
                    best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5",
                                         overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5",
                                          overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) +
                     "-th Episode  : Reward " + str(total_reward))
        logging.info("Best Lap {}".format(best_lap))
    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")

示例#6

显示文件

class DDPG:
    """DDPG agent tying together actor, critic, replay buffer and OU noise."""
    def __init__(self, env, results_file):
        """Build the networks and helpers for ``env``.

        State and action sizes are taken from the first dimension of the
        environment's observation and action spaces.  The actor network's
        settings (``ActorNetwork.get_settings()``) are written to
        ``results_file``.
        """
        n_states = env.observation_space.shape[0]
        n_actions = env.action_space.shape[0]

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.state_dim = n_states
        self.action_dim = n_actions

        session = tf.InteractiveSession()
        self.sess = session

        # Actor and critic, each constructed on the shared session.
        self.actor_network = ActorNetwork(session, n_states, n_actions)
        self.critic_network = CriticNetwork(session, n_states, n_actions)

        # Experience storage and the Ornstein-Uhlenbeck exploration process.
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(n_actions)

        settings = ActorNetwork.get_settings()
        results_file.write(settings)

    def train(self):
        """Run one actor/critic update on a sampled minibatch."""
        samples = self.replay_buffer.get_batch(BATCH_SIZE)

        def column(index):
            # Gather field ``index`` of every sampled transition.
            return np.asarray([sample[index] for sample in samples])

        state_batch = column(0)
        # The resize keeps a 2-D action batch even when action_dim is 1.
        action_batch = np.resize(column(1), [BATCH_SIZE, self.action_dim])
        reward_batch = column(2)
        next_state_batch = column(3)
        done_batch = column(4)

        # Targets: the reward alone for terminal transitions, otherwise the
        # reward plus the discounted target-network Q value.
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        targets = [
            reward_batch[i]
            if finished else reward_batch[i] + GAMMA * q_value_batch[i]
            for i, finished in enumerate(done_batch)
        ]
        y_batch = np.resize(targets, [BATCH_SIZE, 1])

        # Critic step on the targets.
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Actor step along the critic's gradients at the policy's own actions.
        policy_actions = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, policy_actions)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Move both target networks towards the online networks.
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        """Return the policy action for ``state`` plus exploration noise."""
        greedy = self.actor_network.action(state)
        return greedy + self.exploration_noise.noise()

    def action(self, state):
        """Return the policy action for ``state`` without noise."""
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        """Store a transition, train once warmed up, reset noise on ``done``."""
        buffer = self.replay_buffer
        buffer.add(state, action, reward, next_state, done)

        # Training starts once the buffer exceeds the warm-up size.
        if buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Periodic network checkpointing is disabled in this example.

        # A finished episode restarts the random process.
        if done:
            self.exploration_noise.reset()

示例#7

显示文件

class NeuralAgent():
    def __init__(self, track_name='practgt2.xml'):
        """Create the actor/critic networks, replay buffer and result log.

        Args:
            track_name: Track file name, stored on the instance for the
                methods that create a TORCS environment.
        """
        buffer_capacity = 100000
        tau = 0.001  # Target Network HyperParameters
        actor_lr = 0.0001  # Learning rate for Actor
        critic_lr = 0.001  # Learning rate for Critic
        n_sensors = 29  # of sensors input

        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering/Acceleration/Brake

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)

        self.actor = ActorNetwork(sess, n_sensors, self.action_dim,
                                  self.batch_size, tau, actor_lr)
        self.critic = CriticNetwork(sess, n_sensors, self.action_dim,
                                    self.batch_size, tau, critic_lr)
        self.buff = ReplayBuffer(buffer_capacity)  # Create replay buffer
        self.track_name = track_name

        # One list per metric; every metric exists once for training episodes
        # and once more with a 'test_' prefix for test rollouts.
        metric_names = ('total_reward', 'total_step', 'ave_reward',
                        'distRaced', 'distFromStart', 'lastLapTime',
                        'curLapTime', 'lapTimes', 'avelapTime', 'ave_sp',
                        'max_sp', 'min_sp')
        self.save = {
            prefix + metric: []
            for prefix in ('', 'test_') for metric in metric_names
        }

    def rollout(self, env):
        """Run one test episode with the actor's un-noised, clipped actions.

        Args:
            env: An already constructed environment; it is relaunched via
                ``env.reset(relaunch=True)`` and is not shut down here.

        Returns:
            A tuple ``(total_reward, steps, info, ave_sp, max_sp, min_sp,
            lastLapTime)`` where ``info`` is the last step's info dict, the
            speed statistics are taken over ``info['speed']`` of every step,
            and ``lastLapTime`` lists the distinct positive lap times seen.
        """
        max_steps = 10000

        vision = False  # not used below; only the disabled env setup read it

        # zhichen: it is not stable to have two torcs env and UDP connections,
        # so the caller's environment is reused instead of building a new one.

        def to_state(obs):
            # Flatten the observation into the actor's input vector.
            return np.hstack(
                (obs.speedX, obs.angle, obs.trackPos, obs.speedY, obs.speedZ,
                 obs.rpm, obs.wheelSpinVel / 100.0, obs.track))

        ob = env.reset(relaunch=True)
        s_t = to_state(ob)

        total_reward = 0.
        sp = []
        lastLapTime = []

        for j_iter in range(max_steps):
            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))[0]
            # Steering is clipped to [-1, 1], acceleration and brake to [0, 1].
            for idx, lower in enumerate((-1, 0, 0)):
                a_t[idx] = clip(a_t[idx], lower, 1)

            ob, r_t, done, info = env.step(a_t)

            sp.append(info['speed'])

            # Record a lap time once: it must be positive and differ from the
            # most recently recorded one.
            lap = info['lastLapTime']
            if lap > 0 and (not lastLapTime or lastLapTime[-1] != lap):
                lastLapTime.append(lap)

            steps_so_far = j_iter + 1
            if steps_so_far % 20 == 0:
                logging.info('step: ' + str(steps_so_far))
                print('\n ob: ', ob)

            s_t = to_state(ob)
            total_reward += r_t

            if done:
                break

        steps_taken = j_iter + 1
        ave_sp = np.mean(sp)
        max_sp = np.max(sp)
        min_sp = np.min(sp)

        summary = ("Test Episode Reward: " + str(total_reward) +
                   " Episode Length: " + str(steps_taken) + " Ave Reward: " +
                   str(total_reward / steps_taken) + "\n Distance: " +
                   str(info['distRaced']) + ' ' + str(info['distFromStart']) +
                   "\n Last Lap Times: " + str(info['lastLapTime']) +
                   " Cur Lap Times: " + str(info['curLapTime']) +
                   " lastLaptime: " + str(lastLapTime) + "\n ave sp: " +
                   str(ave_sp) + " max sp: " + str(max_sp))
        logging.info(summary)

        return (total_reward, steps_taken, info, ave_sp, max_sp, min_sp,
                lastLapTime)

    def update_neural(self,
                      controllers,
                      episode_count=200,
                      tree=False,
                      seed=1337):
        """Train the actor/critic while acting with a prior-mixed policy.

        At every step the programmatic prior action (from ``controllers``)
        and the noisy actor action are blended as
        ``a / (1 + lambda_mix) + lambda_mix / (1 + lambda_mix) * prior`` and
        the blend is sent to TORCS.  The transition is stored, one critic and
        one actor update are performed on a sampled batch, and per-episode
        statistics are appended to ``self.save``.

        Args:
            controllers: When ``tree`` is False, a 3-item sequence unpacked as
                (steer, accel, brake) programs, each offering
                ``pid_execute(window_list)``.  When ``tree`` is True, a single
                object offering ``predict([obs])``.
            episode_count: Number of training episodes.
            tree: Selects how ``controllers`` is used (see above).
            seed: Only used to name the saved weight and pickle files; it is
                not passed to any random number generator in this method.

        Returns:
            None.

        Side effects visible here: creates and finally ends a ``TorcsEnv``,
        overwrites ``self.lambda_mix`` after every episode, writes weight,
        JSON and pickle files to the working directory, ``./model`` and
        ``./Fig``, and runs ``self.rollout`` every 10th episode.
        """
        OU = FunctionOU()
        vision = False
        GAMMA = 0.99
        EXPLORE = 100000.
        max_steps = 10000
        # ``reward`` is never read below; ``done`` is reassigned by env.step
        # before its first read.
        reward = 0
        done = False
        step = 0
        epsilon = 1

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision,
                       throttle=True,
                       gear_change=False,
                       track_name=self.track_name)

        window = 5
        # NOTE(review): allocated once and never cleared between episodes, and
        # the episode-end mean below runs over all max_steps slots, including
        # slots not written in the current episode -- confirm this is intended.
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Experiment Start with Lambda = " +
                     str(self.lambda_mix))

        for i_episode in range(episode_count):
            logging.info("Episode : " + str(i_episode) + " Replay Buffer " +
                         str(self.buff.count()))
            if np.mod(i_episode, 3) == 0:
                logging.info('relaunch TORCS')
                ob = env.reset(
                    relaunch=True
                )  # relaunch TORCS every 3 episode because of the memory leak error
            else:
                logging.info('reset TORCS')
                ob = env.reset()

            #[ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, list(ob.wheelSpinVel / 100.0), list(ob.track)]
            s_t = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward = 0.
            # Grouped observation used by the prior controllers; the last
            # group holds the previous prior action (zeros at episode start).
            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), [0, 0, 0]]
            # History window, initially filled with copies of the first
            # observation.
            window_list = [tempObs[:] for _ in range(window)]

            sp = []

            lastLapTime = []

            for j_iter in range(max_steps):
                # Prior action, computed from the observation groups built in
                # the previous iteration (tree) or from the window (programs).
                if tree:
                    tree_obs = [
                        sensor for obs in tempObs[:-1] for sensor in obs
                    ]
                    act_tree = controllers.predict([tree_obs])
                    steer_action = clip_to_range(act_tree[0][0], -1, 1)
                    accel_action = clip_to_range(act_tree[0][1], 0, 1)
                    brake_action = clip_to_range(act_tree[0][2], 0, 1)
                else:
                    steer_action = clip_to_range(
                        steer_prog.pid_execute(window_list), -1, 1)
                    accel_action = clip_to_range(
                        accel_prog.pid_execute(window_list), 0, 1)
                    brake_action = clip_to_range(
                        brake_prog.pid_execute(window_list), 0, 1)
                action_prior = [steer_action, accel_action, brake_action]

                # Slide the window: drop the oldest entry, append the current
                # observation together with the prior action just computed.
                tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                           [ob.speedZ], [ob.rpm],
                           list(ob.wheelSpinVel / 100.0),
                           list(ob.track), action_prior]
                window_list.pop(0)
                window_list.append(tempObs[:])

                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, self.action_dim])
                noise_t = np.zeros([1, self.action_dim])

                # Actor action plus exploration noise scaled by the decaying
                # epsilon (clamped at 0).
                a_t_original = self.actor.model.predict(
                    s_t.reshape(1, s_t.shape[0]))
                noise_t[0][0] = max(epsilon, 0) * OU.function(
                    a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = max(epsilon, 0) * OU.function(
                    a_t_original[0][1], 0.5, 1.00, 0.10)
                noise_t[0][2] = max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0, 1.00, 0.05)

                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
                a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

                # Blend the noisy actor action with the prior; a larger
                # lambda_mix gives the prior more weight.
                mixed_act = [
                    a_t[0][k_iter] / (1 + self.lambda_mix) +
                    (self.lambda_mix /
                     (1 + self.lambda_mix)) * action_prior[k_iter]
                    for k_iter in range(3)
                ]

                ob, r_t, done, info = env.step(mixed_act)

                sp.append(info['speed'])

                # Record each distinct positive lap time once.
                if lastLapTime == []:
                    if info['lastLapTime'] > 0:
                        lastLapTime.append(info['lastLapTime'])
                elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                        'lastLapTime']:
                    lastLapTime.append(info['lastLapTime'])

                s_t1 = np.hstack(
                    (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                     ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

                # Note: the stored action is the noisy actor action a_t[0],
                # not the mixed action that was actually executed.
                self.buff.add(s_t, a_t[0], r_t, s_t1,
                              done)  # Add replay buffer

                # Do the batch update
                batch = self.buff.getBatch(self.batch_size)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.zeros((states.shape[0], 1))

                target_q_values = self.critic.target_model.predict(
                    [new_states,
                     self.actor.target_model.predict(new_states)])

                # Targets: reward only for terminal transitions, otherwise
                # reward plus the discounted target-network Q value.
                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                loss += self.critic.model.train_on_batch([states, actions],
                                                         y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads)
                self.actor.target_train()
                self.critic.target_train()

                total_reward += r_t
                s_t = s_t1

                # Control prior mixing term
                # base_q is assigned at the end of every iteration, so the
                # j_iter > 0 guard guarantees it exists when read here.
                if j_iter > 0 and i_episode > 50:
                    lambda_track = lambda_max * (1 - np.exp(-factor * np.abs(
                        r_t +
                        GAMMA * np.mean(target_q_values[-1] - base_q[-1]))))
                    lambda_track = np.squeeze(lambda_track)
                else:
                    lambda_track = 10.
                lambda_store[j_iter] = lambda_track
                base_q = copy.deepcopy(target_q_values)

                if np.mod(step, 2000) == 0:
                    logging.info("Episode " + str(i_episode) + " Distance " +
                                 str(ob.distRaced) + " Lap Times " +
                                 str(ob.lastLapTime))

                step += 1
                if done:
                    break

            #else:
            #    env.end()

            # The next episode mixes with the mean of the whole lambda buffer.
            self.lambda_mix = np.mean(lambda_store)

            logging.info('Episode ends! \n' + "Total Steps: " + str(step) +
                         " " + str(i_episode) + "-th Episode Reward: " +
                         str(total_reward) + " Episode Length: " +
                         str(j_iter + 1) + " Ave Reward: " +
                         str(total_reward / (j_iter + 1)) + "\n Distance: " +
                         str(info['distRaced']) + ' ' +
                         str(info['distFromStart']) + "\n Last Lap Times: " +
                         str(info['lastLapTime']) + " Cur Lap Times: " +
                         str(info['curLapTime']) + " lastLaptime: " +
                         str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                         " max sp: " + str(np.max(sp)))

            #logging.info(" Lambda Mix: " + str(self.lambda_mix))

            # Per-episode training statistics.
            self.save['total_reward'].append(total_reward)
            self.save['total_step'].append(j_iter + 1)
            self.save['ave_reward'].append(total_reward / (j_iter + 1))

            self.save['distRaced'].append(info['distRaced'])
            self.save['distFromStart'].append(info['distFromStart'])

            self.save['lastLapTime'].append(info['lastLapTime'])
            self.save['curLapTime'].append(info['curLapTime'])
            self.save['lapTimes'].append(lastLapTime)
            if lastLapTime == []:
                self.save['avelapTime'].append(0)
            else:
                self.save['avelapTime'].append(np.mean(lastLapTime))

            self.save['ave_sp'].append(np.mean(sp))
            self.save['max_sp'].append(np.max(sp))
            self.save['min_sp'].append(np.min(sp))

            # test
            # Every 10th episode: one noise-free rollout on the same env.
            if np.mod(i_episode + 1, 10) == 0:
                logging.info("Start Testing!")
                test_total_reward, test_step, test_info, test_ave_sp, test_max_sp, test_min_sp, test_lastLapTime = self.rollout(
                    env)
                self.save['test_total_reward'].append(test_total_reward)
                self.save['test_total_step'].append(test_step)
                self.save['test_ave_reward'].append(test_total_reward /
                                                    test_step)

                self.save['test_distRaced'].append(test_info['distRaced'])
                self.save['test_distFromStart'].append(
                    test_info['distFromStart'])

                self.save['test_lastLapTime'].append(test_info['lastLapTime'])
                self.save['test_curLapTime'].append(test_info['curLapTime'])
                self.save['test_lapTimes'].append(test_lastLapTime)

                if test_lastLapTime == []:
                    self.save['test_avelapTime'].append(0)
                else:
                    self.save['test_avelapTime'].append(
                        np.mean(test_lastLapTime))

                self.save['test_ave_sp'].append(test_ave_sp)
                self.save['test_max_sp'].append(test_max_sp)
                self.save['test_min_sp'].append(test_min_sp)

            # Every 5th episode: overwrite the latest weights and also keep a
            # per-episode snapshot under ./model.
            if np.mod(i_episode + 1, 5) == 0:
                print("Now we save model")
                #os.remove("actormodel.h5")
                self.actor.model.save_weights("actormodel_" + str(seed) +
                                              ".h5",
                                              overwrite=True)
                # NOTE(review): to_json() presumably already returns a JSON
                # string, so json.dump would store it as a quoted string --
                # confirm against the loader.
                with open("actormodel.json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)

                #os.remove("criticmodel.h5")
                self.critic.model.save_weights("criticmodel_" + str(seed) +
                                               ".h5",
                                               overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)

                filename = "./model/actormodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.actor.model.save_weights(filename, overwrite=True)
                filename = "./model/criticmodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.critic.model.save_weights(filename, overwrite=True)

            # Every 10th episode: pickle the accumulated statistics.
            if np.mod(i_episode + 1, 10) == 0:
                filename = "./Fig/iprl_save_" + str(seed)
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(filename, 'wb') as f:
                    pickle.dump(self.save, f)

            # Early stop when the last (up to) 20 episode rewards are all
            # below 20; only reachable when episode_count exceeds 1001.
            if i_episode > 1000 and all(
                    np.array(self.save['total_reward'][-20:]) < 20):
                print('model degenerated. Stop at Epsisode ' + str(i_episode))
                break

        env.end()  # This is for shutting down TORCS
        logging.info("Neural Policy Update Finish.")
        return None

    def collect_data(self, controllers, tree=False):
        """Drive one episode with the mixed policy and record what it saw/did.

        A fresh ``TorcsEnv`` is created, relaunched, driven for at most 10000
        steps with the blend of the actor's action and the programmatic prior
        (weighted by ``self.lambda_mix``), and shut down again.  No exploration
        noise is added and no network is trained.

        Args:
            controllers: When ``tree`` is False, a 3-item sequence unpacked as
                (steer, accel, brake) programs, each offering
                ``pid_execute(window_list)``.  When ``tree`` is True, a single
                object offering ``predict([obs])``.
            tree: Selects how ``controllers`` is used and what is recorded.

        Returns:
            ``(observation_list, actions_list)``.  Each observation is the
            flattened sensor groups of the current step when ``tree`` is True,
            otherwise a shallow copy of the 5-entry history window.  Each
            action is the mixed action sent to the environment at that step.
        """
        max_steps = 10000

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=False,
                       throttle=True,
                       gear_change=False,
                       track_name=self.track_name)
        ob = env.reset(relaunch=True)
        print("S0=", ob)

        window = 5

        logging.info("TORCS Collection started with Lambda = " +
                     str(self.lambda_mix))

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.
        # Grouped observation for the prior controllers; the last group holds
        # the previous prior action (zeros before the first step).
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        window_list = [tempObs[:] for _ in range(window)]

        observation_list = []
        actions_list = []

        lastLapTime = []
        sp = []

        for j_iter in range(max_steps):
            # Prior action from the tree (previous step's sensor groups) or
            # from the three programs (history window).
            if tree:
                tree_obs = [sensor for obs in tempObs[:-1] for sensor in obs]
                act_tree = controllers.predict([tree_obs])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)

            action_prior = [steer_action, accel_action, brake_action]

            # Slide the history window forward by one observation.
            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            window_list.pop(0)
            window_list.append(tempObs[:])

            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix /
                 (1 + self.lambda_mix)) * action_prior[k_iter]
                for k_iter in range(3)
            ]
            if tree:
                newobs = [item for sublist in tempObs[:-1] for item in sublist]
                observation_list.append(newobs[:])
            else:
                observation_list.append(window_list[:])
            actions_list.append(mixed_act[:])
            ob, r_t, done, info = env.step(mixed_act)

            sp.append(info['speed'])

            # Record each distinct positive lap time once.
            lap = info['lastLapTime']
            if lap > 0 and (not lastLapTime or lastLapTime[-1] != lap):
                lastLapTime.append(lap)

            s_t = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward += r_t

            if done:
                break

        steps_taken = j_iter + 1
        logging.info("Data Collection Finished!")
        logging.info('Episode ends! \n' + "Episode Reward: " +
                     str(total_reward) + " Episode Length: " +
                     str(steps_taken) + " Ave Reward: " +
                     str(total_reward / steps_taken) + "\n Distance: " +
                     str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLaptime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        env.end()

        return observation_list, actions_list

    def label_data(self, controllers, observation_list, tree=False):
        """Relabel recorded observations with the current mixed policy.

        For every recorded observation the prior action is recomputed from
        ``controllers`` and blended with the actor's prediction using
        ``self.lambda_mix``.

        Args:
            controllers: When ``tree`` is False, a 3-item sequence unpacked as
                (steer, accel, brake) programs, each offering
                ``pid_execute(window_list)``.  When ``tree`` is True, a single
                object offering ``predict([obs])``.
            observation_list: Recorded observations.  With ``tree`` False each
                entry is a history window whose last element is a list of
                sensor groups; with ``tree`` True each entry is passed to
                ``predict`` as is and its first 29 values feed the actor.
            tree: Selects how ``controllers`` and the observations are used.

        Returns:
            ``(net_obs_list, observation_list, actions_list)``:
            the per-step network observations (the first 29 flattened values
            of the latest window entry, or the unmodified entry in tree mode),
            the input list unchanged, and the relabelled mixed actions.
        """
        if not tree:
            steer_prog, accel_prog, brake_prog = controllers
        actions_list = []
        net_obs_list = []
        logging.info("Data labelling started with Lambda = " +
                     str(self.lambda_mix))
        for window_list in observation_list:
            if tree:
                act_tree = controllers.predict([window_list])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
                # In tree mode the entry itself is the network observation.
                # Binding it here fixes the unbound ``net_obs`` that made
                # this branch fail on the first observation.
                net_obs = window_list
                net_obs_list.append(window_list)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)
                # Flatten the newest window entry; the slice to 29 drops the
                # trailing prior-action group.
                net_obs = [sensor for obs in window_list[-1] for sensor in obs]
                net_obs_list.append(net_obs[:29])

            action_prior = [steer_action, accel_action, brake_action]

            s_t = np.hstack([[net_obs[:29]]])
            a_t = self.actor.model.predict(s_t.reshape(1, 29))
            # Same blend as used while acting: a larger lambda_mix gives the
            # prior more weight.
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix /
                 (1 + self.lambda_mix)) * action_prior[k_iter]
                for k_iter in range(3)
            ]

            actions_list.append(mixed_act[:])

        return net_obs_list, observation_list, actions_list

示例#8

显示文件

class DDPG(object):
    """DDPG agent with two linearly decayed exploration rates.

    The agent owns a TensorFlow session, an actor and a critic network (both
    queried through their target variants during training), a replay buffer
    and a checkpoint saver.  State and action sizes are fixed to 16 and 3.
    """

    def __init__(self, env):
        """Build the networks and restore the newest checkpoint, if any.

        Args:
            env: environment handle; it is only stored on ``self.environment``.
        """
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # (start, floor) of each exploration rate.
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Sizes are hard-coded instead of being read from the env spaces.
        self.state_dim = 16
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(
            self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(
            self.sess, self.state_dim, self.action_dim)

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.OU = OU()

        # Resume from the latest checkpoint under MODEL_PATH when present.
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if not (checkpoint and checkpoint.model_checkpoint_path):
            logger.warn("Could not find old network weights")
        else:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            # The step counter is the suffix after the last '-' of the path.
            self.time_step = int(path[path.rindex('-') + 1:])
            # Replay the decay for all steps taken before the checkpoint.
            self._decay_epsilons(self.time_step)
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))

        self.critic_cost = 0

    def _decay_epsilons(self, steps):
        """Lower both epsilons by ``steps`` decay increments, down to their floor."""
        expert_start, expert_floor = self.epsilon_expert_range
        self.epsilon_expert = max(
            self.epsilon_expert -
            (expert_start - expert_floor) * steps / EXPLORE_COUNT,
            expert_floor)
        random_start, random_floor = self.epsilon_random_range
        self.epsilon_random = max(
            self.epsilon_random -
            (random_start - random_floor) * steps / EXPLORE_COUNT,
            random_floor)

    def train(self):
        """Run one update of critic, actor and target networks on a minibatch.

        Also advances ``time_step``, decays both epsilons by one increment and
        stores the value returned by the critic update in ``critic_cost``.
        """
        self.time_step += 1
        self._decay_epsilons(1)
        logger.debug(
            "step: %d, epsilon_expert: %s, epsilon_random: %s" %
            (self.time_step, self.epsilon_expert, self.epsilon_random))

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)

        def column(index):
            # One field of every sampled transition, stacked into an array.
            return np.asarray([item[index] for item in minibatch])

        state_batch = column(0)
        action_batch = column(1)
        reward_batch = column(2)
        next_state_batch = column(3)
        # Terminal flags are extracted but not used: every target bootstraps
        # from the target critic, terminal transitions included.
        done_batch = column(4)

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Targets: y_i = r_i + GAMMA * Q'(s_{i+1}, mu'(s_{i+1})).
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        targets = [
            reward_batch[idx] + GAMMA * q_value_batch[idx]
            for idx in range(len(minibatch))
        ]
        y_batch = np.resize(targets, [BATCH_SIZE, 1])

        # Critic update.
        self.critic_cost = self.critic_network.train(y_batch, state_batch,
                                                     action_batch)

        # Actor update from the critic's gradients at the current policy.
        policy_actions = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, policy_actions)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Target network updates.
        self.actor_network.update_target()
        self.critic_network.update_target()

    def action(self, state):
        """Return the actor network's action for ``state``."""
        chosen = self.actor_network.action(state)
        logger.debug("action: %s" % (chosen))
        return chosen

    def opposite_action(self, state):
        """Return the actor's action with component 0 replaced by ``1 - a[0]``."""
        logger.debug("state: %s" % (state))
        chosen = self.actor_network.action(state)
        logger.debug("action: %s" % (chosen))
        chosen[0] = 1 - chosen[0]
        logger.debug("opposite action: %s" % (chosen))
        return chosen

    def perceive(self, state, action, reward, next_state, done):
        """Store a transition and train once the buffer is large enough."""
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer.
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Training starts when REPLAY_START_SIZE transitions are stored.
        if self.replay_buffer.count() < REPLAY_START_SIZE:
            return
        self.train()

    def saveNetwork(self):
        """Write a checkpoint named 'DDPG-<time_step>' under MODEL_PATH."""
        logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'DDPG')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)

示例#9

显示文件

文件： model.py 项目： snowfeet/-NIPS-2017-Learning-to-Run

class Worker:
    """One DDPG worker with its own networks, replay buffer and environment.

    Each worker is named ``worker_<number>`` and builds actor and critic
    networks under ``<name>/actor`` and ``<name>/critic``.  At the start of
    every episode it runs the ops built by
    ``update_target_graph('global/...', '<name>/...')``.
    NOTE(review): presumably these copy the global weights into the local
    networks -- confirm against ``update_target_graph``.
    """

    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        """Create the worker's networks, replay buffer and exploration noise.

        Args:
            sess: session shared with the networks.
            number: worker index, used to build the worker name.
            model_path: directory prefix for checkpoints.
            global_episodes: variable exposing ``assign_add``; incremented by
                ``worker_0`` after every training episode.
            explore: the noise scale shrinks by ``1 / explore`` per noisy
                episode.
            decay: initial noise scale.
            training: when false, ``perceive`` never trains.
        """
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Ornstein-Uhlenbeck process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')

    def start(self, setting=0):
        """Create the environment; ``setting`` is passed to ``reset`` as difficulty."""
        self.env = RunEnv(visualize=True)
        self.setting = setting

    def train(self):
        """Run one update of critic, actor and target networks on a minibatch."""
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Targets: r for terminal transitions, r + GAMMA * Q' otherwise.
        next_action_batch = self.actor_network.target_actions(
            self.sess, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess,
                                                     next_state_batch,
                                                     next_action_batch)
        y_batch = [
            reward_batch[i] if done_batch[i] else
            reward_batch[i] + GAMMA * q_value_batch[i]
            for i in range(len(minibatch))
        ]
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L.
        # NOTE(review): every other network call in this class passes
        # self.sess first; this one does not.  Left unchanged because the
        # signature of CriticNetwork.train is not visible here -- confirm.
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient.
        # Fix: the argument was the undefined name ``selfstate_batch``.
        action_batch_for_gradients = self.actor_network.actions(
            self.sess, state_batch)
        q_gradient_batch = self.critic_network.gradients(
            self.sess, state_batch, action_batch_for_gradients)

        self.actor_network.train(self.sess, q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)

    def save_model(self, saver, episode):
        """Save a checkpoint named after ``episode``; only ``worker_0`` saves."""
        if self.name == 'worker_0':
            saver.save(self.sess,
                       self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self, state, decay):
        """Return the policy action plus exploration noise scaled by ``decay``."""
        action = self.actor_network.action(self.sess, state)
        return action + self.exploration_noise.noise() * decay

    def action(self, state):
        """Return the policy action for ``state`` without exploration noise."""
        return self.actor_network.action(self.sess, state)

    def perceive(self, state, action, reward, next_state, done):
        """Store a transition, train if enabled, and reset noise on episode end."""
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE and self.training:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def _evaluate(self):
        """Run TEST noise-free episodes; return (average return, average reward)."""
        total_return = 0
        ave_reward = 0
        for _ in range(TEST):
            state = self.env.reset()
            reward_per_step = 0
            for j in range(self.env.spec.timestep_limit):
                # direct action for test
                action = self.action(process_frame(state))
                state, reward, done, _info = self.env.step(action)
                total_return += reward
                # Fix: this check used to sit one level out, where it ended
                # the whole evaluation after the first episode and made the
                # running-mean update below unreachable.
                if done:
                    break
                reward_per_step += (reward - reward_per_step) / (j + 1)
            ave_reward += reward_per_step
        return total_return / TEST, ave_reward / TEST

    def work(self, coord, saver):
        """Run episodes until ``coord.should_stop()`` returns true.

        Args:
            coord: coordinator exposing ``should_stop()``.
            saver: checkpoint saver handed to ``save_model``.

        ``worker_0`` saves a checkpoint and evaluates the policy whenever the
        episode counter is a multiple of 50 (and greater than 1), and
        increments the shared episode counter after each training episode.
        """
        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        print("Starting worker_" + str(self.number))

        with self.sess.as_default(), self.sess.graph.as_default():
            while not coord.should_stop():
                episode_reward = 0

                # Noise is applied in about 90% of the episodes, and its scale
                # shrinks only on those episodes.
                if np.random.rand() < 0.9:
                    noisy = True
                    self.decay -= 1. / self.explore
                else:
                    noisy = False

                self.sess.run(self.update_local_ops_actor)
                self.sess.run(self.update_local_ops_critic)

                state = self.env.reset(difficulty=self.setting)
                # NOTE(review): the result is never used; the call is kept
                # because process_frame is defined outside this block --
                # confirm it is pure, then drop this line.
                s = process_frame(state)

                print("episode: %s" % (episode_count,))

                for step in range(self.env.spec.timestep_limit):
                    # NOTE(review): from the second step on, ``state`` is the
                    # already processed next_state, so it passes through
                    # process_frame twice -- confirm that is intended.
                    state = process_frame(state)
                    if noisy:
                        # Noise scale is clamped at zero once decay is used up.
                        action = np.clip(
                            self.noise_action(state,
                                              np.maximum(self.decay, 0)),
                            0.0, 1.0)
                    else:
                        action = self.action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = process_frame(next_state)
                    # The stored reward is scaled by 100; the logged episode
                    # reward is not.
                    self.perceive(state, action, reward * 100, next_state,
                                  done)
                    state = next_state
                    episode_reward += reward
                    if done:
                        break

                # Fix: used the undefined names ``episode``/``reward_episode``.
                if episode_count % 5 == 0:
                    print("episode reward: %s" % (episode_reward,))

                # Testing
                if (self.name == 'worker_0' and episode_count % 50 == 0
                        and episode_count > 1):
                    self.save_model(saver, episode_count)
                    ave_return, ave_reward = self._evaluate()
                    print('episode:  %s Evaluation Average Return: %s '
                          '   Evaluation Average Reward:  %s' %
                          (episode_count, ave_return, ave_reward))

                if self.name == 'worker_0' and self.training:
                    # Fix: used the undefined bare name ``sess``.
                    self.sess.run(self.increment)
                episode_count += 1

            print('Done ' + self.name)

示例#10

显示文件

文件： ddpg.py 项目： ivychill/ltr

class DDPG:
    """DDPG agent sized from the environment's observation and action spaces.

    Holds a TensorFlow session, actor and critic networks, a replay buffer,
    an exploration-noise process and a checkpoint saver.
    """

    def __init__(self, env):
        """Build the networks and restore the newest checkpoint, if any.

        Args:
            env: environment exposing ``observation_space`` and
                ``action_space``; their first shape entries give the state
                and action sizes.
        """
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(
            self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(
            self.sess, self.state_dim, self.action_dim)

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Noise process for action exploration; built without arguments here.
        self.exploration_noise = OUNoise()

        # Resume from the latest checkpoint under MODEL_PATH when present.
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if not (checkpoint and checkpoint.model_checkpoint_path):
            my_config.logger.error("Could not find old network weights")
        else:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))

    def train(self):
        """Run one update of critic, actor and target networks on a minibatch."""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)

        def column(index):
            # One field of every sampled transition, stacked into an array.
            return np.asarray([item[index] for item in minibatch])

        state_batch = column(0)
        action_batch = column(1)
        reward_batch = column(2)
        next_state_batch = column(3)
        done_batch = column(4)

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Targets: r for terminal transitions, r + GAMMA * Q' otherwise.
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        targets = [
            reward_batch[idx] if done_batch[idx] else
            reward_batch[idx] + GAMMA * q_value_batch[idx]
            for idx in range(len(minibatch))
        ]
        y_batch = np.resize(targets, [BATCH_SIZE, 1])

        # Critic update.
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Actor update from the critic's gradients at the current policy.
        policy_actions = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, policy_actions)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Target network updates.
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        """Return the policy action plus exploration noise, clipped to [0, 1]."""
        policy_action = self.actor_network.action(state)
        perturbation = self.exploration_noise.noise(policy_action)
        return np.clip(policy_action + perturbation, 0, 1)

    def action(self, state):
        """Return the policy action for ``state`` without exploration noise."""
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        """Store a transition, advance the step counter and train when ready."""
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer.
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.time_step += 1

        # Training starts once the buffer holds more than REPLAY_START_SIZE.
        if self.replay_buffer.count() <= REPLAY_START_SIZE:
            return
        self.train()

    def saveNetwork(self):
        """Write a checkpoint named 'ltr-<time_step>' under MODEL_PATH."""
        ckpt_file = os.path.join(MODEL_PATH, 'ltr')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)

示例#11

显示文件

文件： ddpg.py 项目： ruizhao13/HFO

class DDPG:
    """DDPG agent with a fixed 12-value state and 10-value action.

    Holds a TensorFlow session, actor and critic networks, a checkpoint saver
    that keeps one checkpoint, a replay buffer and an exploration-noise
    process.
    """

    def __init__(self):
        """Create the session, networks, saver, replay buffer and noise."""
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Ornstein-Uhlenbeck process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        """Run one update of critic, actor and target networks on a minibatch.

        The first sampled action and the first critic gradient are logged at
        DEBUG level.  They used to be appended to a file under a hard-coded
        home directory, which made every call fail on machines without that
        directory.
        """
        import logging  # local import: the file's import block is not visible here

        debug_log = logging.getLogger(__name__)

        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Targets: r for terminal transitions, r + GAMMA * Q' otherwise.
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = [
            reward_batch[i] if done_batch[i] else
            reward_batch[i] + GAMMA * q_value_batch[i]
            for i in range(len(minibatch))
        ]
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        debug_log.debug("action_batch[0]: %s", action_batch[0])
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        debug_log.debug("q_gradient_batch[0]: %s", q_gradient_batch[0])
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action2(self, state):
        """Return the policy action plus exploration noise."""
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def noise_action(self, state):
        """Return the policy action or a freshly drawn random action.

        The policy action is returned when a uniform draw falls below
        EPSILON, the random action otherwise.  The random action is drawn on
        every call, even when it is not returned.
        NOTE(review): this makes EPSILON the probability of acting greedily --
        confirm that is the intended meaning of the constant.
        """
        action = self.actor_network.action(state)
        random_action = np.zeros(10, float)
        # One-hot choice among the first four slots.
        random_action[random.randint(0, 3)] = 1
        random_action[4] = random.uniform(-100, 100)  # DASH POWER
        random_action[5] = random.uniform(-180, 180)  # DASH DEGREES
        random_action[6] = random.uniform(-180, 180)  # TURN DEGREES
        random_action[7] = random.uniform(-180, 180)  # TACKLE DEGREES
        random_action[8] = random.uniform(0, 100)  # KICK POWER
        random_action[9] = random.uniform(-180, 180)  # KICK DEGREES
        if np.random.uniform() < EPSILON:
            return action
        return random_action

    def action(self, state):
        """Return the policy action for ``state`` without exploration noise."""
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        """Store a transition, train when ready, reset noise on episode end."""
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

示例#12

显示文件

class DDPG:
    """DDPG agent producing a two-component (linear, angular) action.

    Both components are clipped and rounded to one decimal before they are
    returned: the linear one to [0, 1], the angular one to [-1, 1].
    """

    def __init__(self, env, state_dim, action_dim):
        """Create the session, networks, replay buffer and noise processes.

        Args:
            env: environment handle; only stored on ``self.environment``.
            state_dim: state size handed to both networks.
            action_dim: action size handed to both networks.
        """
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(
            self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(
            self.sess, self.state_dim, self.action_dim)

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # One exploration-noise process per action component.
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)

    @staticmethod
    def _quantize(value, low, high):
        """Clip ``value`` to [low, high] and round it to one decimal."""
        return round(np.clip(value, low, high), 1)

    def train(self):
        """Run one update of critic, actor and target networks on a minibatch."""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)

        def column(index):
            # One field of every sampled transition, stacked into an array.
            return np.asarray([item[index] for item in minibatch])

        state_batch = column(0)
        action_batch = column(1)
        reward_batch = column(2)
        next_state_batch = column(3)
        done_batch = column(4)
        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Targets: r for terminal transitions, r + GAMMA * Q' otherwise.
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        targets = [
            reward_batch[idx] if done_batch[idx] else
            reward_batch[idx] + GAMMA * q_value_batch[idx]
            for idx in range(len(minibatch))
        ]
        y_batch = np.resize(targets, [BATCH_SIZE, 1])

        # Critic update.
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Actor update from the critic's gradients at the current policy.
        policy_actions = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, policy_actions)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Target network updates.
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        """Return the quantized policy action with ``epsilon``-scaled noise."""
        policy_action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        noisy = policy_action + noise_t
        a_linear = self._quantize(noisy[0], 0, 1)
        a_angular = self._quantize(noisy[1], -1, 1)
        return [a_linear, a_angular]

    def action(self, state):
        """Return the quantized policy action without exploration noise."""
        raw = self.actor_network.action(state)
        a_linear = self._quantize(raw[0], 0, 1)
        a_angular = self._quantize(raw[1], -1, 1)
        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        """Store a transition, train when ready and return the step counter.

        The step counter only advances on calls that train.  Both networks
        are saved whenever it is a positive multiple of 10000, and both noise
        processes are reset when ``done`` is true.
        """
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer.
        self.replay_buffer.add(state, action, reward, next_state, done)
        stored = self.replay_buffer.count()
        if stored == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Training starts once the buffer holds more than REPLAY_START_SIZE.
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step > 0 and self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step

示例#13

显示文件

class ddpg:
    """DDPG agent whose actor and critic take a state vector plus an image.

    Wraps an ``ActorNetwork``/``CriticNetwork`` pair, a replay buffer and a
    ``tf.train.Saver``. Only the first action component is clipped and
    perturbed by exploration noise.
    """

    def __init__(self, env_name, sess, state_dim, action_dim, models_dir,
                 img_dim):
        """Build the networks, the replay buffer and the checkpoint saver.

        Args:
            env_name: Becomes part of the checkpoint file name.
            sess: TensorFlow session shared by both networks and the saver.
            state_dim: Forwarded to the actor and critic constructors.
            action_dim: Forwarded to the networks; also the length of the
                exploration-noise vector.
            models_dir: Checkpoint location. It is concatenated directly
                with ``env_name``, so it is expected to end with a path
                separator.
            img_dim: Forwarded to the actor and critic constructors.
        """
        self.name = 'DDPG'
        self.env_name = env_name
        self.models_dir = models_dir
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.img_dim = img_dim

        self.sess = sess
        self.time_step = 0

        self.actor_network = ActorNetwork(
            self.sess, self.state_dim, self.action_dim, self.img_dim)
        self.critic_network = CriticNetwork(
            self.sess, self.state_dim, self.action_dim, self.img_dim)

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Created after the networks so their variables already exist.
        self.saver = tf.train.Saver()

    def train(self):
        """Run one DDPG update on a sampled minibatch.

        Returns:
            Whatever ``self.critic_network.train`` returns (used by callers
            as the critic cost).
        """
        batch = self.replay_buffer.getBatch(BATCH_SIZE)

        # Transpose the list of 7-field transitions into one array per field.
        (states, actions, rewards, next_states, dones, imgs,
         next_imgs) = (np.asarray([transition[field] for transition in batch])
                       for field in range(7))

        # Needed so a one-dimensional action still forms a 2-D batch.
        actions = np.resize(actions, [BATCH_SIZE, self.action_dim])

        # Targets: r for terminal transitions, r + GAMMA * Q'(s', mu'(s'))
        # otherwise.
        next_actions = self.actor_network.target_actions(
            next_states, next_imgs)
        next_q_values = self.critic_network.target_q(
            next_states, next_actions, next_imgs)
        targets = [
            reward if terminal else reward + GAMMA * next_q
            for reward, terminal, next_q in zip(rewards, dones, next_q_values)
        ]
        targets = np.resize(targets, [BATCH_SIZE, 1])

        critic_cost = self.critic_network.train(targets, states, actions,
                                                imgs)

        # Actor step along the critic's gradient w.r.t. the policy actions.
        policy_actions = self.actor_network.actions(states, imgs)
        q_gradients = self.critic_network.gradients(states, policy_actions,
                                                    imgs)
        self.actor_network.train(q_gradients, states, imgs)

        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost

    def save_network(self, step):
        """Write a checkpoint tagged with ``step``."""
        checkpoint_prefix = (self.models_dir + self.env_name +
                             '-network-ddpg.ckpt')
        self.saver.save(self.sess, checkpoint_prefix, global_step=step)

    def load_network(self):
        """Restore the latest checkpoint in ``models_dir`` if one exists."""
        checkpoint = tf.train.get_checkpoint_state(self.models_dir)
        if not (checkpoint and checkpoint.model_checkpoint_path):
            print("Could not find old network weights")
            return
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)

    def action(self, state, img):
        """Return the policy action with its first component clipped to [-1, 1]."""
        policy_action = self.actor_network.action(state, img)
        policy_action[0] = np.clip(policy_action[0], -1, 1)
        return policy_action

    def noise_action(self, state, epsilon, img):
        """Return the policy action with scheduled exploration noise.

        Noise is applied to the first action component only:
        before step 100000 it is always added, scaled by ``epsilon``;
        from step 100000 up to 200000 it is added with probability 0.1 at a
        fixed scale of 0.1; afterwards no noise is added.
        """
        policy_action = self.actor_network.action(state, img)
        perturbation = np.zeros(self.action_dim)

        if self.time_step < 100000:
            perturbation[0] = epsilon * ornstein_uhlenbeck_process(
                policy_action[0], 0.0, 0.60, 0.80)
        elif self.time_step < 200000 and np.random.random() < 0.1:
            perturbation[0] = 0.1 * ornstein_uhlenbeck_process(
                policy_action[0], 0.0, 0.60, 0.80)

        noisy_action = policy_action + perturbation
        noisy_action[0] = np.clip(noisy_action[0], -1, 1)
        return noisy_action

    def perceive(self, state, action, reward, next_state, done, img, next_img):
        """Store a transition and train once the buffer is warm.

        Transitions whose reward is NaN are not stored, but the step counter
        still advances.

        Returns:
            The critic cost from ``train()``, or 0 while the buffer holds no
            more than ``REPLAY_START_SIZE`` transitions.
        """
        if not math.isnan(reward):
            self.replay_buffer.add(state, action, reward, next_state, done,
                                   img, next_img)
        self.time_step += 1

        if self.replay_buffer.count() > REPLAY_START_SIZE:
            return self.train()
        return 0

示例#14

显示文件

文件： ddpg_tf.py 项目： felixludos/mb-rl

class DDPG_TF:
    """DDPG agent built from a TensorFlow actor/critic pair.

    NOTE(review): ``train``, ``noise_action`` and ``perceive`` read
    ``self.replay_buffer`` and ``self.exploration_noise``, neither of which
    is assigned in ``__init__``; they are presumably attached by the caller
    before those methods are used -- confirm.
    """

    def __init__(self, env, loadfilename=None, printVars=False):
        """Create the session and networks, optionally restoring weights.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.shape``; their first entries become
                ``state_dim`` and ``action_dim``.
            loadfilename: Checkpoint path to restore from, or ``None`` to
                keep the freshly built variables.
            printVars: When true, print the name and shape of every global
                TensorFlow variable.
        """
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        self.all_vars = tf.global_variables()
        if printVars:
            for v in self.all_vars:
                # A single pre-formatted argument prints identically under
                # Python 2 and Python 3 (the old print statement is a
                # syntax error on Python 3).
                print("%s %s" % (v.name.ljust(30), v.shape))

        self.saver = tf.train.Saver(self.all_vars)
        if loadfilename is not None:
            self.saver.restore(self.sess, loadfilename)

    def train(self):
        """Run one DDPG update on a minibatch drawn from the replay buffer."""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Needed so a one-dimensional action still forms a 2-D batch.
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Targets: r for terminal transitions, r + GAMMA * Q'(s', mu'(s'))
        # otherwise.
        next_action_batch = self.actor_network.target_actions(
            next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = [
            reward if done else reward + GAMMA * q_value
            for reward, done, q_value in zip(reward_batch, done_batch,
                                             q_value_batch)
        ]
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss against the targets.
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient.
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks.
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        """Return the policy action for ``state`` plus exploration noise."""
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        """Return the policy action for a single ``state``."""
        return self.actor_network.action(state)

    def actions(self, states):
        """Return ``actor_network.actions_no_training`` for ``states``."""
        return self.actor_network.actions_no_training(states)

    def target_actions(self, states):
        """Return the target actor's actions for ``states``."""
        return self.actor_network.target_actions(states)

    def value(self, states):
        """Return the critic's Q-values for ``states`` under the current policy."""
        actions = self.actor_network.actions_no_training(states)
        return self.critic_network.q_value(states, actions)

    def perceive(self, state, action, reward, next_state, done):
        """Store a transition, train once the buffer is warm, reset noise on episode end."""
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer.
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Training starts only after the buffer passes the start size.
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends.
        if done:
            self.exploration_noise.reset()

示例#15

显示文件

class DDPG:
    """DDPG agent that keeps the current observation between calls.

    The caller drives it with ``set_init_observation`` -> ``get_action`` ->
    ``set_feedback``; training and checkpointing happen inside
    ``set_feedback``.
    """

    def __init__(self, environment):
        """Build actor/critic networks, replay buffer and exploration noise.

        Args:
            environment: Environment exposing ``observation_space.shape``
                and ``action_space`` (``shape``, ``low``, ``high``).
        """
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        self.critic_network = CriticNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Ornstein-Uhlenbeck process used for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self, observation):
        """Record the initial observation of an episode as the current state."""
        self.state = observation

    def train(self):
        """Run one DDPG update on a random minibatch from the replay buffer."""
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        # Shape the actions as (batch, action_dim). The width comes from the
        # environment, matching how the networks are sized in __init__; a
        # hard-coded width of 1 scrambles wider actions.
        action_dim = self.environment.action_space.shape[0]
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # Targets: r for terminal transitions, r + GAMMA * Q'(s', mu'(s'))
        # otherwise.
        next_action_batch = self.actor_network.target_evaluate(
            next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(
            next_state_batch, next_action_batch)
        y_batch = []
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic by minimizing the loss against the targets
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient, averaged over
        # the batch
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients) / BATCH_SIZE

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        """Return a noisy action for the current state, clipped to the action bounds."""
        action = self.actor_network.get_action(self.state)
        return np.clip(action + self.exploration_noise.noise(),
                       self.environment.action_space.low,
                       self.environment.action_space.high)

    def set_feedback(self, observation, action, reward, done):
        """Store the transition from the current state and advance the agent.

        Args:
            observation: Next observation; becomes the new current state.
            action: Action that was executed.
            reward: Reward received for it.
            done: Whether the episode ended with this transition.
        """
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append(
            (self.state, action, reward, next_state, done))
        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size by dropping the oldest transition
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Training starts once enough steps have been taken
        if self.time_step > REPLAY_START_SIZE:
            self.train()

        # Checkpoint both networks every 10000 steps
        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

示例#16

显示文件

文件： apl_ddpg.py 项目： mdheller/costar_plan

class APLDDPGAgent(AbstractAgent):
    """DDPG agent that trains Keras actor/critic networks on ``env``.

    ``fit`` builds the networks and runs the episode loop; every tenth
    episode (or every episode when ``ALREADY_TRAINED``) is a noise-free
    evaluation run using the target actor.
    """

    name = "apl_ddpg"

    def __init__(self, env, iter=200000, *args, **kwargs):
        """Inspect the environment and set up the exploration noise.

        Args:
            env: Environment providing ``action_space.sample()``,
                ``reset()`` and ``step()``.
            iter: Accepted for interface compatibility; not used here.
        """
        self.env = env
        # Width of the environment's control vector: the number of truthy
        # entries in one sampled action.
        self.action_dim = sum(
            sum(1 for i in row if i) for row in self.env.action_space.sample())
        self.observation = env.reset()
        self.state_dim = self.observation.shape
        # Single-argument print() behaves the same on Python 2 and 3.
        print(">>>>>>>>>>>>>>>>>>>>>state dim " + str(self.state_dim))
        # NOTE(review): the original comment said "limit ddpg network output
        # to 3 DOF" while the value is 6 -- confirm the intended meaning.
        self.nn_action_dim = 6
        self.noise = OUProcess(self.nn_action_dim,
                               mu=OU_MEAN,
                               theta=OU_THETA,
                               sigma=EPSILON_RANGE[0])

    def fit(self, *args, **kwargs):
        """Build the networks and run the training/evaluation episode loop."""
        sess = K.get_session()
        K.set_learning_phase(1)

        self.actor = ActorNetwork(sess,
                                  self.state_dim,
                                  self.nn_action_dim,
                                  BATCH_SIZE,
                                  TAU,
                                  LRA,
                                  convolutional=CONVOLUTIONAL,
                                  output_activation=ACTION_ACTIVATION)
        self.critic = CriticNetwork(sess,
                                    self.state_dim,
                                    self.nn_action_dim,
                                    BATCH_SIZE,
                                    TAU,
                                    LRC,
                                    convolutional=CONVOLUTIONAL)

        self.memory = Memory(MEM_SIZE_FCL)

        self.actor.target_model.summary()
        self.critic.target_model.summary()

        if LOAD_WEIGHTS:
            self._load_weights()

        pre_learning_episodes = STARTING_EPISODE + PRE_LEARNING_EPS
        steps = STARTING_EPISODE * EPISODE_LENGTH
        start_time = time.time()
        last_ep_time = time.time()
        if MAKE_PLOT:
            reward_graph = Grapher()

        for ep in range(STARTING_EPISODE, EPISODES):
            self.noise.reset()

            # Restart the uptime clock once the pre-learning phase ends.
            if ep == pre_learning_episodes:
                start_time = time.time()

            print("Episode: " + str(ep) + "  Frames: " +
                  str(ep * EPISODE_LENGTH) + "  Uptime: " + str(
                      (time.time() - start_time) / 3600.0) +
                  " hrs    ===========")

            state = self.env.reset()
            play_only = (ep % 10 == 0)
            total_reward = 0

            if play_only or ALREADY_TRAINED:
                # Evaluation episode: target actor, no exploration noise,
                # nothing stored or learned.
                for step in range(TEST_EPISODE_LENGTH):
                    state = np.reshape(state, state.shape + (1, ))

                    action, control_action = self.selectAction(
                        state, can_be_random=False, use_target=True)

                    nstate, reward, done, info = self.env.step(control_action)
                    total_reward += reward
                    state = nstate
            else:
                for step in range(EPISODE_LENGTH):
                    # ACT: noise scale interpolated linearly with the global
                    # step count between the two ends of EPSILON_RANGE.
                    epsilon = (float(steps) / float(EPSILON_STEPS)) * (
                        EPSILON_RANGE[1] - EPSILON_RANGE[0]) + EPSILON_RANGE[0]

                    state = np.reshape(state, state.shape + (1, ))

                    action, control_action = self.selectAction(state,
                                                               epsilon=epsilon)
                    new_state, reward, done, info = self.env.step(
                        control_action)
                    # NOTE(review): inside range(EPISODE_LENGTH) the second
                    # clause is never true, so the last step of a full-length
                    # episode is not flagged as done -- confirm whether
                    # ``EPISODE_LENGTH - 1`` was intended.
                    done = done or (step >= EPISODE_LENGTH)
                    self.memory.addMemory(state, action, reward, new_state,
                                          done)
                    state = new_state

                    # LEARN
                    if ep > pre_learning_episodes:
                        batch, idxs = self.memory.getMiniBatch(BATCH_SIZE)
                        self.learnFromBatch(batch)

                    if done:
                        break
                    steps += 1

            # Only noise-free episodes show how the policy is really doing.
            if play_only and MAKE_PLOT:
                reward_graph.addSample(total_reward)
                reward_graph.displayPlot()

            # Frames per hour over all frames since pre-learning ended.
            total_frames = (ep - pre_learning_episodes) * EPISODE_LENGTH
            elapsed = time.time() - start_time
            fps = total_frames / elapsed
            fph = fps * 3600.0

            # Frames per second is recomputed on this episode alone so it
            # updates quickly.
            fps = EPISODE_LENGTH / (time.time() - last_ep_time)
            last_ep_time = time.time()
            print("fps: " + str(fps) + "  fph: " + str(fph) + "\n")

            # Periodically save the plot and the weights.
            if (ep > 0 and ep % EPISODE_SAVE_FREQUENCY
                    == 0) and not ALREADY_TRAINED:
                if MAKE_PLOT:
                    reward_graph.savePlot(SAVE_WEIGHTS_PREFIX + "graph_" +
                                          str(ep) + ".jpg")
                self._save_models(ep)

    def _load_weights(self):
        """Load weights for all four models from the configured prefix and episode."""
        for label, model in (("actor_model_", self.actor.model),
                             ("critic_model_", self.critic.model),
                             ("actor_target_model_", self.actor.target_model),
                             ("critic_target_model_",
                              self.critic.target_model)):
            model.load_weights(LOAD_WEIGHTS_PREFIX + label +
                               LOAD_WEIGHTS_EPISODE + ".h5")
        print("Weights Loaded!")

    def _save_models(self, ep):
        """Write weights (.h5) and structure (.json) of all four models for episode ``ep``."""
        models = (
            ("actor_model_", self.actor.model),
            ("actor_target_model_", self.actor.target_model),
            ("critic_model_", self.critic.model),
            ("critic_target_model_", self.critic.target_model),
        )
        for label, model in models:
            model.save_weights(SAVE_WEIGHTS_PREFIX + label + str(ep) + ".h5",
                               overwrite=True)
        for label, model in models:
            with open(SAVE_WEIGHTS_PREFIX + label + str(ep) + ".json",
                      "w") as outfile:
                json.dump(model.to_json(), outfile)

    def learnFromBatch(self, miniBatch):
        """Run one DDPG update from a minibatch of transition dicts.

        Args:
            miniBatch: Sequence of dicts with the keys ``'state'``,
                ``'action'``, ``'reward'``, ``'newState'`` and ``'isFinal'``.
        """
        dones = np.asarray([sample['isFinal'] for sample in miniBatch])
        states = np.asarray([sample['state'] for sample in miniBatch])
        actions = np.asarray([sample['action'] for sample in miniBatch])
        new_states = np.asarray([sample['newState'] for sample in miniBatch])
        # Force a float array: with integer rewards the in-place target
        # update below would otherwise truncate the bootstrapped value.
        Y_batch = np.asarray([sample['reward'] for sample in miniBatch],
                             dtype=np.float64)

        # Stored next-states lack the trailing channel axis that ``fit``
        # adds to the current state before storing it.
        new_states = np.reshape(new_states, new_states.shape + (1, ))

        target_q_values = self.critic.target_model.predict(
            [new_states,
             self.actor.target_model.predict(new_states)])

        # Non-terminal transitions bootstrap from the target critic.
        for i in range(len(miniBatch)):
            if not dones[i]:
                Y_batch[i] = Y_batch[i] + GAMMA * target_q_values[i]

        self.critic.model.train_on_batch([states, actions], Y_batch)

        # Actor step along the critic's gradient w.r.t. the policy actions.
        temp_actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, temp_actions)
        self.actor.train(states, grads)

        # Update target networks.
        self.actor.target_train()
        self.critic.target_train()

    def clip(self, x, minx, maxx):
        """Return ``x`` limited to the closed interval [minx, maxx]."""
        return max(minx, min(maxx, x))

    def selectAction(self,
                     state,
                     can_be_random=True,
                     use_target=False,
                     epsilon=1.0,
                     permutation_num=0):
        """Pick an action for ``state``, optionally with exploration noise.

        Args:
            state: A single (unbatched) state.
            can_be_random: Add noise from ``self.noise`` when true.
            use_target: Query the target actor instead of the online actor.
            epsilon: Assigned to ``self.noise.sigma`` before sampling.
            permutation_num: Unused; kept for interface compatibility.

        Returns:
            ``(actions, control_actions)``: the network output (noisy and
            clipped to [-3.14, 3.14] when ``can_be_random``) and the same
            vector zero-padded to ``self.action_dim`` entries.
        """
        state = np.array([state])  # add dimension to make a "batch" of 1

        model = self.actor.target_model if use_target else self.actor.model
        actions = np.squeeze(model.predict(state))

        if can_be_random:
            self.noise.sigma = epsilon
            noise = self.noise.noise()
            # Write back by index so ``actions`` itself is updated.
            for i in range(len(actions)):
                actions[i] = self.clip(actions[i] + noise[i], -3.14, 3.14)

        # Fill in zeros for all outputs the network does not learn.
        control_actions = np.pad(actions, (0, self.action_dim - len(actions)),
                                 'constant')

        return actions, control_actions

    def constructImageRepresentation(self, state):
        """Render a flat state vector as a single-channel image.

        Targets are drawn as circles and agents as crosses on a mid-grey
        background; the first agent is drawn black, the others dark grey.

        Args:
            state: Sequence holding ``NUM_TARGETS`` (x, y) pairs followed by
                ``NUM_AGENTS`` (x, y) pairs. Coordinates are multiplied by
                ``IMAGE_SIDE_LENGTH`` to get pixel positions.

        Returns:
            A ``float32`` array of shape
            ``(IMAGE_SIDE_LENGTH, IMAGE_SIDE_LENGTH, 1)`` scaled to roughly
            [-1, 1].
        """
        img = np.empty([IMAGE_SIDE_LENGTH, IMAGE_SIDE_LENGTH], dtype=np.uint8)
        img.fill(128)

        # Targets: black outline with a filled disc that gets darker per
        # target.
        color = 255
        delta_color = int(math.floor(128 / NUM_TARGETS))
        for j in range(NUM_TARGETS):
            center = (int(state[2 * j] * IMAGE_SIDE_LENGTH),
                      int(state[2 * j + 1] * IMAGE_SIDE_LENGTH))
            cv2.circle(img, center, 5, 0, -1)
            cv2.circle(img, center, 4, color, -1)
            color -= delta_color

        # Agents: two overlapping rectangles forming a cross.
        color = 0
        offset = 2 * NUM_TARGETS
        for j in range(NUM_AGENTS):
            ax = int(state[offset + 2 * j] * IMAGE_SIDE_LENGTH)
            ay = int(state[offset + 2 * j + 1] * IMAGE_SIDE_LENGTH)
            cv2.rectangle(img, (ax - 4, ay - 1), (ax + 4, ay + 1), color, -1)
            cv2.rectangle(img, (ax - 1, ay - 4), (ax + 1, ay + 4), color, -1)
            # The first agent is 0 since we control it; others share a color.
            color = 64

        # Convert to float BEFORE subtracting: on the uint8 image the
        # subtraction wraps around (0 - 128 becomes 128) instead of
        # zero-centring.
        centered = img.astype(np.float32) - 128.0
        scaled = centered * np.float32(1.0 / 128.0)
        return scaled[:, :, np.newaxis]

    def getStatePermutations(self, state):
        """Return one arrangement of ``state`` per agent (co-op case).

        For agent ``i`` the target entries are copied unchanged and the
        agent entries are rotated so that agent ``i`` comes first. When
        ``CONVOLUTIONAL`` is set and ``DRAW_STATE`` is not, ``state`` is
        passed through untouched.
        """
        perms = []
        for i in range(NUM_AGENTS):
            if CONVOLUTIONAL and not DRAW_STATE:
                perms.append(state)
                continue

            # Target data is copied as is.
            pstate = [state[j] for j in range(NUM_TARGETS * 2)]

            # Agent data is copied rotated by i agents.
            for j in range(NUM_AGENTS * 2):
                rot_j = (j + (i * 2)) % (NUM_AGENTS * 2) + (NUM_TARGETS * 2)
                pstate.append(state[rot_j])

            if DRAW_STATE:
                # Must be called through ``self``; the bare name is not
                # defined at module level.
                perms.append(self.constructImageRepresentation(pstate))
            else:
                perms.append(np.asarray(pstate, dtype=np.float32))

        return perms

示例#17

显示文件

class RDPG:
    """Recurrent DPG agent that learns from stored observation/action histories.

    The agent owns an actor and a critic network (each also providing target
    outputs and an ``update_target`` step), a replay buffer of histories and an
    Ornstein-Uhlenbeck noise process for exploration.
    """

    def __init__(self, env):
        """Create networks, replay buffer, exploration noise and saver.

        Args:
            env: Environment whose ``observation_space.shape[0]`` and
                ``action_space.shape[0]`` give the state and action sizes.
        """
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        # Actor and critic; both are handed the shared session.
        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # Replay buffer of histories.
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Ornstein-Uhlenbeck process used for action exploration.
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        """Run one critic/actor update on a sampled minibatch of histories.

        Every sampled history is expanded into one training sample per prefix
        length ``i``; each sample is zero-padded with :meth:`pad`.
        """
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)

        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            last_step = len(each.observations) - 1
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                # Slice, not tuple index: `observations[1, i + 1]` raised
                # TypeError on a list.
                next_observations.append(
                    self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                dones.append(i == last_step)

        if not observations:
            # Every sampled history was too short to yield a transition.
            return

        # Calculate y_batch.  The target policy is evaluated on the successor
        # histories, as in the standard DPG target r + gamma * Q'(h', mu'(h')).
        next_action_batch = self.actor_network.target_action(
            next_observations)
        # Append the target action as ONE extra row of the action history.
        # NOTE(review): assumes target_action yields one action vector per
        # history - confirm against ActorNetwork.
        next_action_histories = [
            self.pad(list(past) + [nxt], dim=self.action_dim)
            for past, nxt in zip(actions, next_action_batch)
        ]
        q_value_batch = self.critic_network.target_q(next_observations,
                                                     next_action_histories)

        # Flatten before combining so scalars and shape-(1,) arrays cannot be
        # mixed into a ragged list.
        last_rewards = np.asarray([seq[-1] for seq in rewards]).reshape(-1)
        next_q = np.asarray(q_value_batch).reshape(-1)
        terminal = np.asarray(dones, dtype=bool)
        y_batch = np.where(terminal, last_rewards,
                           last_rewards + GAMMA * next_q)
        y_batch = np.resize(y_batch, [len(observations), 1])

        # Update critic by minimizing the loss L.  The action history is empty
        # for the shortest prefix, so the row width is passed explicitly.
        self.critic_network.train(
            y_batch, observations,
            [self.pad(past, dim=self.action_dim) for past in actions])

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(
            observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        """Save the session under ``path + "modle.ckpt"`` tagged with ``episode``.

        NOTE: the "modle.ckpt" spelling is kept on purpose so checkpoints
        written by earlier runs keep the same name.
        """
        self.saver.save(self.sess, path + "modle.ckpt", episode)

    def noise_action(self, history):
        """Return the actor's action for ``history`` plus exploration noise."""
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        """Return the actor's action for ``history`` without noise."""
        return self.actor_network.action(history)

    def perceive(self, history, done=True):
        """Store ``history`` and train once the buffer is large enough.

        Args:
            history: Sequence object added to the replay buffer.
            done: Whether the episode has ended, in which case the exploration
                noise is reset.  The original body read an undefined ``done``
                name; the default is ``True`` on the assumption that a stored
                history is a finished episode - TODO confirm with callers.
        """
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Store history to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def pad(self, input, dim=None, length=1000):
        """Return ``input`` as a list, extended with zero rows up to ``length``.

        Args:
            input: Sequence of equally sized rows.
            dim: Row width for the zero rows.  Defaults to ``len(input[0])``;
                must be given when ``input`` is empty.
            length: Number of rows to pad to.  Inputs already at least this
                long are returned unpadded (never truncated).

        Returns:
            A new list; each padding row is its own ``[0] * dim`` list.

        Raises:
            IndexError: If ``input`` is empty and ``dim`` is not given.
        """
        # list() so that ndarray input is concatenated rather than broadcast.
        rows = list(input)
        if dim is None:
            dim = len(rows[0])
        missing = length - len(rows)
        return rows + [[0] * dim for _ in range(missing)]

示例#18

显示文件

文件： test_actor.py 项目： zachkeer/scalable_maddpg

# Manual smoke test for ActorNetwork: builds a random batch of states, trains
# one actor and constructs a second actor from the first one's `nets`.
# NOTE(review): batch_size, state_dim, action_dim and y_grad are not defined in
# this excerpt; presumably they are set earlier in the original script.
state_batch  = np.random.rand(batch_size, state_dim)

# Disabled experiment: print one actor's actions and its target actions around
# a training step and target updates.
# with tf.Session() as sess:
#     actor = ActorNetwork(sess,state_dim,action_dim,agent_name,1)
#     print(actor.actions(state_batch))
#     actor.update_target()
#     print('\n')
#     print(actor.target_actions(state_batch))
#
#     actor.train(y_grad,state_batch)
#     actor.update_target()
#     print(actor.target_actions(state_batch))

# Disabled experiment: create ten actors with distinct names in one session.
# agents = []
# with tf.Session() as sess:
#     for ii in range(10):
#         agent_name = 'agent'+str(ii)
#         print(agent_name)
#         agents.append(ActorNetwork(sess, state_dim, action_dim, agent_name))
#
#     print(agents)

# Active test: check that building an actor from another actor's nets works.
with tf.Session() as sess:
    agent1 = ActorNetwork(sess,state_dim,action_dim,'agent1')
    # Train agent1 once so its weights differ from a fresh initialisation.
    agent1.train(y_grad,state_batch)

    # agent2 receives agent1.nets as an extra constructor argument;
    # presumably this copies agent1's networks - verify in ActorNetwork.
    agent2 = ActorNetwork(sess, state_dim, action_dim, 'agent2', agent1.nets)
    # Print both agents' actions on the same batch for manual comparison.
    print('agent 1', agent1.actions(state_batch))
    print('agent 2', agent2.actions(state_batch))

示例#19

显示文件

class DDPGAgent():
    """
    Deep deterministic policy gradient agent as described in
    https://arxiv.org/abs/1509.02971.

    This agent is meant to operate on low dimensional inputs, not raw pixels.

    To use the agent, you can get action predictions using act(), and to teach
    the agent, feed the results to learn.
    """
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Chosen first so the networks can be placed on the same device as the
        # training batches built by vectorize_experiences().
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Actor
        self.local_actor_network = ActorNetwork(
            state_size, action_size).to(self.device)
        self.target_actor_network = ActorNetwork(
            state_size, action_size).to(self.device)
        # Targets start as exact copies of the local networks.
        self.target_actor_network.load_state_dict(
            self.local_actor_network.state_dict())
        self.actor_optimizer = optim.Adam(
            self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

        # Critic
        self.local_critic_network = CriticNetwork(
            state_size, action_size).to(self.device)
        self.target_critic_network = CriticNetwork(
            state_size, action_size).to(self.device)
        self.target_critic_network.load_state_dict(
            self.local_critic_network.state_dict())
        self.critic_optimizer = optim.Adam(
            self.local_critic_network.parameters(),
            lr=CRITIC_LEARNING_RATE,
            weight_decay=CRITIC_WEIGHT_DECAY)

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.random_process = OrnsteinUhlenbeckProcess(
            (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)

    def act(self, states, noise=True):
        """
        Returns an action vector based on the current game state.

        Params
        ======
        states (array_like): A matrix of game states (each row represents the
            state of an agent)
        noise (boolean): Add random noise to the predicted action.  Aids
            exploration of the environment during training.

        Returns a numpy array of actions clipped to [-1, 1].
        """

        self.local_actor_network.eval()
        try:
            with torch.no_grad():
                state_tensor = torch.tensor(
                    states, dtype=torch.float32, device=self.device)
                # .cpu() is required before .numpy() when running on CUDA.
                actions = self.local_actor_network(state_tensor).cpu().numpy()
        finally:
            # Always restore training mode, even if the forward pass fails.
            self.local_actor_network.train()
        if noise:
            actions = actions + self.random_process.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def vectorize_experiences(self, experiences):
        """Vectorizes experience objects for use by pytorch

        Params
        ======
            experiences (array_like of Experience objects): Experiences to
                vectorize; ``None`` entries are skipped.

        Returns a tuple (states, actions, rewards, next_states, dones) of
        float tensors on ``self.device``, one row per experience.
        """
        valid = [e for e in experiences if e is not None]

        def stack(rows):
            # One row per experience, as a float tensor on the agent's device.
            return torch.from_numpy(np.vstack(rows)).float().to(self.device)

        states = stack([e.state for e in valid])
        actions = stack([e.action for e in valid])
        rewards = stack([e.reward for e in valid])
        next_states = stack([e.next_state for e in valid])
        # Booleans go through uint8 so they become 0.0 / 1.0.
        dones = torch.from_numpy(
            np.vstack([e.done for e in valid]).astype(
                np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones)

    def normalize(self, to_normalize):
        """
        Normalize each row of the input along the 0 dimension using the
        formula (value - mean)/std

        A single-row input has no defined sample standard deviation (it would
        yield NaN), so in that case only the mean is subtracted.

        Params
        ======
        to_normalize (array_like): Values to normalize
        """

        mean = to_normalize.mean(0)
        if to_normalize.shape[0] < 2:
            return to_normalize - mean
        std = to_normalize.std(0)
        # The small epsilon guards against division by zero.
        return (to_normalize - mean) / (std + 1e-5)

    def soft_update(self, target_parameters, local_parameters):
        """
        Updates the given target network parameters with the local parameters
        using a soft update strategy: tau * local + (1-tau) * target
        """

        for target, local in zip(target_parameters, local_parameters):
            target.data.copy_(TAU * local.data + (1.0 - TAU) * target.data)

    def train(self, experiences):
        """
        Trains the actor and critic networks using a minibatch of experiences

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        """
        states, actions, rewards, next_states, dones = \
            self.vectorize_experiences(experiences)
        rewards = self.normalize(rewards)

        # Use the target networks to calculate a target q value.  No gradient
        # is needed through the targets, so autograd is disabled here.
        with torch.no_grad():
            next_actions = self.target_actor_network(next_states)
            q_target = rewards + GAMMA * self.target_critic_network(
                next_states, next_actions) * (1 - dones)

        # Calculate the predicted q value
        q_predicted = self.local_critic_network(states, actions)

        # Update critic network
        critic_loss = F.mse_loss(q_predicted, q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.local_critic_network.parameters(),
                                       1)
        self.critic_optimizer.step()

        # Update predicted action using policy gradient
        actions_predicted = self.local_actor_network(states)
        policy_loss = -self.local_critic_network(states,
                                                 actions_predicted).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.target_actor_network.parameters(),
                         self.local_actor_network.parameters())
        self.soft_update(self.target_critic_network.parameters(),
                         self.local_critic_network.parameters())

    def learn(self, experience):
        """
        Tells the agent to learn from an experience.  This may not immediately
        result in training since this agent uses a replay buffer.

        Params
        ======
        experience (Experience): An experience used to teach the agent.
        """
        self.replay_buffer.add(experience)
        self.steps += 1
        if self.steps % STEPS_BETWEEN_TRAINING == 0 and len(
                self.replay_buffer) >= BATCH_SIZE:
            for _ in range(ITERATIONS_PER_TRAINING):
                self.train(self.replay_buffer.sample(BATCH_SIZE))

    def save(self, filename):
        """Saves learned params of underlying networks to a checkpoint file.

        Params
        ======
            filename (string): Target file.  actor- and critic- are prepended
                for the actor and critic network, respectively
        """
        torch.save(self.local_actor_network.state_dict(), "actor-" + filename)
        torch.save(self.local_critic_network.state_dict(),
                   "critic-" + filename)

    def load(self, filename):
        """Loads learned params generated by save() into underlying networks.

            filename (string): Path to file.  There should be an actor- and
            critic- version of this file.

        Both the local and the target network of each kind receive the same
        weights.
        """
        # Read each checkpoint once; map_location lets a checkpoint saved on a
        # different device be loaded onto this agent's device.
        actor_state = torch.load("actor-" + filename,
                                 map_location=self.device)
        critic_state = torch.load("critic-" + filename,
                                  map_location=self.device)

        self.local_actor_network.load_state_dict(actor_state)
        self.target_actor_network.load_state_dict(actor_state)

        self.local_critic_network.load_state_dict(critic_state)
        self.target_critic_network.load_state_dict(critic_state)

    def end_episode(self):
        """
        Tell the agent that an episode is complete.

        Resets the exploration noise process and the step counter.
        """
        self.random_process.reset()
        self.steps = 0

示例#20

显示文件

class DDPG:
    """DDPG agent for a dict-observation gym environment (TensorFlow networks).

    The replay buffer and the batch-sampling function are supplied by the
    caller; this class owns the session, the actor/critic networks and the
    exploration noise.
    """

    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma,
                 tau, batch_size, n_train, n_episode):
        """Store hyper parameters and build the networks.

        Args:
            env: Gym environment; its observation is flattened over the keys
                'observation', 'achieved_goal' and 'desired_goal' to obtain
                the state size.
            replay_buffer: Replay buffer object (only stored here).
            sample_batch: Callable taking a batch size and returning a
                sequence of items indexed as
                (state, action, reward, next_state, done).
            train_iter: Accepted for interface compatibility; not used in the
                visible part of this class.
            gamma: Discount factor applied to the next-state Q value.
            tau: Stored as ``self.tau``; not read in the visible methods.
            batch_size: Number of items requested from ``sample_batch``.
            n_train: Stored as ``self.n_train``.
            n_episode: Stored as ``self.n_episode``.
        """
        # Gym environment
        self.env = env

        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get replay buffer and function get a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)

    @staticmethod
    def _td_targets(reward_batch, done_batch, q_value_batch, gamma):
        """Return TD targets as an (N, 1) array.

        The target is ``reward`` for terminal transitions and
        ``reward + gamma * q`` otherwise.  All inputs are flattened first, so
        scalar rewards and shape-(1,) Q values can never be mixed into a
        ragged list, and the output row count follows the real batch size.
        """
        rewards = np.asarray(reward_batch).reshape(-1)
        next_q = np.asarray(q_value_batch).reshape(-1)
        terminal = np.asarray(done_batch).reshape(-1).astype(bool)
        targets = np.where(terminal, rewards, rewards + gamma * next_q)
        return targets.reshape(-1, 1)

    def train(self):
        """Run one critic update, one actor update and a target update."""
        batch = self.sample_batch(self.batch_size)

        state_batch = np.asarray([data[0] for data in batch])
        action_batch = np.asarray([data[1] for data in batch])
        reward_batch = np.asarray([data[2] for data in batch])
        next_state_batch = np.asarray([data[3] for data in batch])
        done_batch = np.asarray([data[4] for data in batch])

        # Target actor/critic evaluate the successor states.
        next_action_batch = self.actor.target_actions(next_state_batch)
        q_value_batch = self.critic.target_q(next_state_batch,
                                             next_action_batch)
        y_batch = self._td_targets(reward_batch, done_batch, q_value_batch,
                                   self.gamma)

        # Update critic by minimizing the loss L
        self.critic.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor.actions(state_batch)
        q_gradient_batch = self.critic.gradients(state_batch,
                                                 action_batch_for_gradients)

        self.actor.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor.update_target()
        self.critic.update_target()

    def noise_action(self, state):
        """Return the actor's action for ``state`` plus exploration noise."""
        action = self.actor.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        """Return the actor's action for ``state`` without noise."""
        return self.actor.action(state)

    def reset_noise(self):
        """Reset the exploration noise process."""
        self.exploration_noise.reset()

    def save_policy(self, save_path):
        """Ask the actor network to save itself to ``save_path``."""
        self.actor.save_network(save_path)