Example #1
    def __init__(self, env):

        # Hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 64
        self.BUFFER_SIZE = 20000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           self.TAU, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim, self.TAU,
                             self.CRITIC_LEARNING_RATE)

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # save the results
        self.save_epi_reward = []
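
Example #1 (like Examples #5 and #8 below) derives its state and action dimensions from a Gym-style continuous-control environment. A minimal instantiation sketch, assuming the `gym` package, the `Pendulum-v0` environment, and that the surrounding class is named `DDPGagent` as in Example #8; the snippet itself shows none of these, and the project's Actor, Critic, and ReplayBuffer classes must also be importable:

import gym

# Assumption: Pendulum-v0 has continuous (Box) observation and action spaces, so
# observation_space.shape[0], action_space.shape[0] and action_space.high[0] all exist.
env = gym.make("Pendulum-v0")
agent = DDPGagent(env)

print(agent.state_dim, agent.action_dim, agent.action_bound)  # 3 1 2.0 for Pendulum-v0
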
Example #2
File: ddpg.py Project: xie9187/lan_nav
    def __init__(self, flags, sess):
        self.dim_laser = [flags.dim_laser_b, flags.dim_laser_c]
        self.dim_goal = flags.dim_goal
        self.dim_action = flags.dim_action
        self.dim_emb = flags.dim_emb
        self.dim_cmd = flags.dim_cmd
        self.n_hidden = flags.n_hidden
        self.n_cmd_type = flags.n_cmd_type
        self.n_layers = flags.n_layers
        self.a_learning_rate = flags.a_learning_rate
        self.c_learning_rate = flags.c_learning_rate
        self.batch_size = flags.batch_size
        self.max_step = flags.max_step
        self.tau = flags.tau
        self.action_range = [flags.a_linear_range, flags.a_angular_range]
        self.buffer_size = flags.buffer_size
        self.gamma = flags.gamma
        self.demo_flag = flags.demo_flag

        self.actor = Actor(sess=sess,
                           dim_laser=self.dim_laser,
                           dim_cmd=self.dim_cmd,
                           dim_action=self.dim_action,
                           dim_goal=self.dim_goal,
                           dim_emb=self.dim_emb,
                           n_cmd_type=self.n_cmd_type,
                           n_hidden=self.n_hidden,
                           n_layers=self.n_layers,
                           max_step=self.max_step,
                           batch_size=self.batch_size,
                           action_range=self.action_range,
                           tau=self.tau,
                           gpu_num=1,
                           demo_flag=self.demo_flag)

        self.critic = Critic(sess=sess,
                             dim_laser=self.dim_laser,
                             dim_cmd=self.dim_cmd,
                             dim_action=self.dim_action,
                             dim_goal=self.dim_goal,
                             dim_emb=self.dim_emb,
                             n_cmd_type=self.n_cmd_type,
                             n_hidden=self.n_hidden,
                             n_layers=self.n_layers,
                             max_step=self.max_step,
                             batch_size=self.batch_size,
                             num_actor_vars=len(self.actor.network_params) +
                             len(self.actor.target_network_params),
                             tau=self.tau,
                             gpu_num=1)
        self.memory = []
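
The constructor above takes every hyperparameter from a `flags` object. Below is a stand-in built from exactly the field names it reads, with placeholder values only (the real project defines its own flags, presumably via tf.app.flags or argparse, with different values):

from types import SimpleNamespace

# Field names are those read by DDPG.__init__ above; every value here is hypothetical.
flags = SimpleNamespace(
    dim_laser_b=8, dim_laser_c=512,           # laser input dimensions (placeholders)
    dim_goal=2, dim_action=2, dim_emb=64, dim_cmd=1,
    n_hidden=256, n_cmd_type=6, n_layers=1,
    a_learning_rate=1e-4, c_learning_rate=1e-3,
    batch_size=16, max_step=300, tau=0.01,
    a_linear_range=0.3, a_angular_range=0.5,  # action_range = [linear, angular]
    buffer_size=10000, gamma=0.99, demo_flag=False)

# agent = DDPG(flags, sess)  # also requires a tf.Session and the project's Actor/Critic classes
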
Example #3
    def __init__(self, env, track, episodes=650):

        self.env = env
        self.track = track

        self.max_episodes = episodes
        self.max_steps = 3000

        self.save_model = True
        self.load_model = False

        self.restart_memory_leak = 25

        ### size of action- and state space
        self.state_size = 70
        self.action_size = 3

        ### DDPG Hyperparameters
        self.epsilon = 1.0
        self.epsilon_decay = 1 / 96000
        self.epsilon_min = 0.07
        self.batch_size = 64
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.00011
        self.lr_critic = 0.0011

        ### set OU Process
        self.ou = OU()

        ### tf gpu and session set
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        K.set_session(self.sess)

        ### actor, critic and replay memory
        self.actor = Actor(self.sess, self.state_size, self.action_size,
                           self.tau, self.lr_actor)
        self.critic = Critic(self.sess, self.state_size, self.action_size,
                             self.tau, self.lr_critic)
        self.memory = ExperienceReplayBuffer(50000)

        ### helper class to build state representation
        self.dataset_builder = DatasetBuilder()
Example #4
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score
        self.score = 0
        self.count = 0
        self.best_score = -np.inf
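
The `OUNoise` process constructed here (and reset/sampled in Example #9) is not included in these snippets. A minimal compatible sketch, matching the constructor call `OUNoise(size, mu, theta, sigma)` and the `reset()`/`sample()` usage; treat it as an assumption rather than the original helper:

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # reset the internal state to the mean
        self.state = np.copy(self.mu)

    def sample(self):
        # one OU step: drift toward mu plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state
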
Example #5
    def __init__(self, env):

        self.sess = tf.Session()
        K.set_session(self.sess)

        ## hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 128
        self.BUFFER_SIZE = 20000
        self.MIN_SAMPLES_TO_BEGIN_LEARNING = 1000
        self.ACTOR_LEARNING_RATE = 0.001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.TAU,
                           self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.sess, self.state_dim, self.action_dim,
                             self.TAU, self.CRITIC_LEARNING_RATE)

        ## initialize for later gradient calculation
        self.sess.run(
            tf.global_variables_initializer())  # initialize all TF variables before training

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # save the results
        self.save_epi_reward = []
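
The `ReplayBuffer` used in Examples #1, #5, and #8 is likewise not shown. A minimal sketch whose interface follows the calls made in Example #8 below (`add_buffer`, `sample_batch`, and a `buffer_size` count that is checked before training starts); the original implementation may differ in detail:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """FIFO experience replay; the interface mirrors the calls in Example #8."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)
        self.buffer_size = 0  # number of transitions currently stored

    def add_buffer(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
        self.buffer_size = len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, dones
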
Example #6
class DDPGAgent:
    def __init__(self, env, track, episodes=650):

        self.env = env
        self.track = track

        self.max_episodes = episodes
        self.max_steps = 3000

        self.save_model = True
        self.load_model = False

        self.restart_memory_leak = 25

        ### size of action- and state space
        self.state_size = 70
        self.action_size = 3

        ### DDPG Hyperparameters
        self.epsilon = 1.0
        self.epsilon_decay = 1 / 96000
        self.epsilon_min = 0.07
        self.batch_size = 64
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.00011
        self.lr_critic = 0.0011

        ### set OU Process
        self.ou = OU()

        ### tf gpu and session set
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        K.set_session(self.sess)

        ### actor, critic and replay memory
        self.actor = Actor(self.sess, self.state_size, self.action_size,
                           self.tau, self.lr_actor)
        self.critic = Critic(self.sess, self.state_size, self.action_size,
                             self.tau, self.lr_critic)
        self.memory = ExperienceReplayBuffer(50000)

        ### helper class to build state representation
        self.dataset_builder = DatasetBuilder()

    def saveModel(self):
        self.actor.model.save("./ddpg_weights/ddpg_actor_model.h5")
        self.critic.model.save("./ddpg_weights/ddpg_critic_model.h5")

    def lowerExploration(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

    def trainAgent(self):

        all_total_rewards = []
        all_dist_raced = []
        all_dist_percentage = []
        all_avg_speed = []
        all_car_hits = []
        all_race_pos = []

        for e in range(self.max_episodes):

            ### save weights every 10th episode
            if self.save_model:
                if (e % 10) == 0:
                    self.saveModel()

            ### relaunch TORCS every `restart_memory_leak` episodes because
            ### a memory leak would otherwise slow the thread down
            if (e % self.restart_memory_leak) == 0:
                state = self.env.reset(relaunch=True)
            else:
                state = self.env.reset()

            ### build state representation
            state, _ = self.dataset_builder.buildStateDataSet(s=state)

            total_reward = 0
            avg_speed = 0
            avg_racepos = 0

            damage = 0
            damage_hit_counter = 0

            for j in range(self.max_steps):
                ### initialize numpy matrices to hold action values with OU noise
                action_with_noise = np.zeros([1, self.action_size])
                noise = np.zeros([1, self.action_size])

                ### get action values from actor
                action = self.actor.model.predict(
                    state.reshape(1, state.shape[0]))

                ### Deriving OU parameters from
                ### https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html
                ### and our own experiments
                noise[0][0] = self.epsilon * self.ou.calc_noise(
                    action[0][0], 0.0, 0.55, 0.15)
                noise[0][1] = self.epsilon * self.ou.calc_noise(
                    action[0][1], 0.55, 1.00, 0.10)
                noise[0][2] = self.epsilon * self.ou.calc_noise(
                    action[0][2], -0.1, 1.00, 0.05)

                ### Concept of a "stochastic" brake adapted and improved from
                ### https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html
                ### The issue is that slamming the brake all the time isn't
                ### adequately represented in the reward function. Therefore we
                ### "hack" the OU process by triggering the brake with a chance
                ### of min(0.18, self.epsilon).
                if random.random() <= min(0.18, self.epsilon):
                    noise[0][2] = self.epsilon * self.ou.calc_noise(
                        action[0][2], 0.25, 1.00, 0.10)

                ### Add OU noise to actions
                action_with_noise[0][0] = action[0][0] + noise[0][0]
                action_with_noise[0][1] = action[0][1] + noise[0][1]
                action_with_noise[0][2] = action[0][2] + noise[0][2]

                next_state, reward, done, info = self.env.step(
                    action_with_noise[0])

                ### build state representation
                dist_raced = next_state.distRaced
                speedX = next_state.speedX
                pre_damage = damage
                damage = next_state.damage
                racePos = next_state.racePos
                next_state = np.hstack(
                    (next_state.angle, next_state.track, next_state.focus,
                     next_state.opponents, next_state.trackPos,
                     next_state.speedX, next_state.speedY, next_state.speedZ,
                     next_state.wheelSpinVel / 100.0, next_state.rpm))

                ### save to experience replay memory for batch selection
                self.memory.memorize(state, action_with_noise[0], reward,
                                     next_state, done)

                ### lower epsilon for less exploration
                self.lowerExploration()

                ### train the models!
                self.trainModel()

                total_reward += reward
                avg_speed += speedX
                avg_racepos += racePos

                state = next_state

                ### detect damage
                if damage - pre_damage > 0:
                    damage_hit_counter += 1

                print("Episode: " + str(e) + " Step: " + str(j) + " Action: " +
                      str(action_with_noise) + " Reward: " + str(reward) +
                      " Epsilon: " + str(self.epsilon))

                if done:
                    all_total_rewards.append(total_reward)
                    all_dist_raced.append(dist_raced)

                    ### use track length according to chosen track
                    if self.track == "eroad":
                        track_length = 3260
                    elif self.track == "cgspeedway":
                        track_length = 2057
                    elif self.track == "forza":
                        track_length = 5784

                    percentage_of_track = round(
                        ((dist_raced / track_length) * 100), 0)
                    ### in case the agent completed multiple laps, which is likely for a well-trained agent
                    if percentage_of_track > 100: percentage_of_track = 100
                    all_dist_percentage.append(percentage_of_track)

                    all_avg_speed.append((avg_speed / j))

                    all_car_hits.append(damage_hit_counter)
                    all_race_pos.append(int(avg_racepos / j))

                    break

        self.env.end()

        ### All the plotting stuff
        print("Plotting rewards!")
        plt.plot(all_total_rewards)
        plt.xlabel("Episode")
        plt.ylabel("Ertrag")
        plt.show()
        print("Plotting distances!")
        plt.plot(all_dist_raced)
        plt.xlabel("Episode")
        plt.ylabel("Distanz von Startlinie [m]")
        plt.show()

        print("Plotting completeness!")
        plt.plot(all_dist_percentage)
        plt.xlabel("Episode")
        plt.ylabel("Vollstaendigkeit Strecke [%]")
        plt.axis([0, 350, 0, 100])
        plt.show()

        print("Plotting avg speed!")
        plt.plot(all_avg_speed)
        plt.xlabel("Episode")
        plt.ylabel("Durschn. Geschwindigkeit [km/h]")
        plt.axis([0, 350, 0, 1])
        plt.show()

        print("Plotting car hits!")
        plt.plot(all_car_hits)
        plt.xlabel("Episode")
        plt.ylabel("Unfaelle des Fahrzeuges")
        plt.show()
        print("Mean car hits:")
        print(sum(all_car_hits) / len(all_car_hits))
        print("Std dev car hits:")
        print(np.std(all_car_hits))

        print("Plotting car hits per distance!")
        div = np.divide(all_car_hits, all_dist_raced)
        plt.plot(div)
        plt.xlabel("Episode")
        plt.ylabel("Unfaelle des Fahrzeuges pro Distanzeinheit")
        plt.show()

        print("Plotting avg race pos!")
        plt.plot(all_race_pos)
        plt.xlabel("Episode")
        plt.ylabel("Durschn. Position")
        plt.show()

    def trainModel(self):

        ### get random mini batch from experience replay memory
        mini_batch = self.memory.sampleRandomBatch(self.batch_size)

        ### build arrays for models from mini batch
        states = np.asarray([b[0] for b in mini_batch])
        actions = np.asarray([b[1] for b in mini_batch])
        target = np.asarray([b[1] for b in mini_batch])  # placeholder array; every entry is overwritten below
        rewards = np.asarray([b[2] for b in mini_batch])
        new_states = np.asarray([b[3] for b in mini_batch])
        dones = np.asarray([b[4] for b in mini_batch])

        ### get q values from target critic model
        ### q(s, t(s), w') in thesis
        target_q_values = self.critic.target_model.predict(
            [new_states,
             self.actor.target_model.predict(new_states)])

        ### iterate through minibatch, update target according to bellman eq.
        for k in range(0, len(mini_batch)):
            if dones[k]:
                target[k] = rewards[k]
            else:
                target[k] = rewards[k] + self.gamma * target_q_values[k]

        ### train networks
        self.critic.model.train_on_batch([states, actions], target)
        actions = self.actor.model.predict(states)
        ### nabla q(s, t(s))
        gradients = self.critic.gradients(states, actions)
        ### train actor
        self.actor.train(states, gradients)

        ### soft update
        self.actor.target_train()
        self.critic.target_train()

    def testAgent(self):
        ### set epsilon (exploration) low
        self.epsilon = self.epsilon_min

        ### Do not save weights when testing
        ### CHANGE if you want to continuously train agent
        self.save_model = False

        try:
            self.actor.model = load_model("./ddpg_weights/ddpg_actor_model.h5")
            self.critic.model = load_model(
                "./ddpg_weights/ddpg_critic_model.h5")
            print("Model loaded!")
        except Exception:
            print("Model could not be loaded! Check path or train first")
            sys.exit()

        self.trainAgent()
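
The `OU()` helper used for exploration above is not part of the snippet either. Based on the call pattern `ou.calc_noise(x, mu, theta, sigma)` and the TORCS/Keras tutorial referenced in the comments, a plausible sketch is a single Ornstein-Uhlenbeck step (an assumption, not necessarily the author's class):

import numpy as np


class OU:
    """Single Ornstein-Uhlenbeck step used as per-action exploration noise."""

    def calc_noise(self, x, mu, theta, sigma):
        # drift toward mu plus Gaussian diffusion (dt folded into theta and sigma)
        return theta * (mu - x) + sigma * np.random.randn()
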
Example #7
File: ddpg.py Project: xie9187/lan_nav
class DDPG(object):
    """docstring for DDPG"""
    def __init__(self, flags, sess):
        self.dim_laser = [flags.dim_laser_b, flags.dim_laser_c]
        self.dim_goal = flags.dim_goal
        self.dim_action = flags.dim_action
        self.dim_emb = flags.dim_emb
        self.dim_cmd = flags.dim_cmd
        self.n_hidden = flags.n_hidden
        self.n_cmd_type = flags.n_cmd_type
        self.n_layers = flags.n_layers
        self.a_learning_rate = flags.a_learning_rate
        self.c_learning_rate = flags.c_learning_rate
        self.batch_size = flags.batch_size
        self.max_step = flags.max_step
        self.tau = flags.tau
        self.action_range = [flags.a_linear_range, flags.a_angular_range]
        self.buffer_size = flags.buffer_size
        self.gamma = flags.gamma
        self.demo_flag = flags.demo_flag

        self.actor = Actor(sess=sess,
                           dim_laser=self.dim_laser,
                           dim_cmd=self.dim_cmd,
                           dim_action=self.dim_action,
                           dim_goal=self.dim_goal,
                           dim_emb=self.dim_emb,
                           n_cmd_type=self.n_cmd_type,
                           n_hidden=self.n_hidden,
                           n_layers=self.n_layers,
                           max_step=self.max_step,
                           batch_size=self.batch_size,
                           action_range=self.action_range,
                           tau=self.tau,
                           gpu_num=1,
                           demo_flag=self.demo_flag)

        self.critic = Critic(sess=sess,
                             dim_laser=self.dim_laser,
                             dim_cmd=self.dim_cmd,
                             dim_action=self.dim_action,
                             dim_goal=self.dim_goal,
                             dim_emb=self.dim_emb,
                             n_cmd_type=self.n_cmd_type,
                             n_hidden=self.n_hidden,
                             n_layers=self.n_layers,
                             max_step=self.max_step,
                             batch_size=self.batch_size,
                             num_actor_vars=len(self.actor.network_params) +
                             len(self.actor.target_network_params),
                             tau=self.tau,
                             gpu_num=1)
        self.memory = []

    def ActorPredict(self, input_laser, input_cmd, input_cmd_next,
                     input_cmd_skip, prev_action, input_goal, prev_state_2):
        a, state_2 = self.actor.PredictOnline(input_laser, input_cmd,
                                              input_cmd_next, input_cmd_skip,
                                              prev_action, input_goal,
                                              prev_state_2)
        return a[0], state_2

    def Add2Mem(self, sample):
        if len(sample) <= self.max_step:
            # sample: seq of (laser, cmd, cmd_next, cmd_skip, prev_action,
            #                 obj_goal, action, r, terminate, status, action_label)
            self.memory.append(sample)
        if len(self.memory) > self.buffer_size:
            self.memory.pop(0)

    def SampleBatch(self):
        if len(self.memory) >= self.batch_size:

            indices = np.random.randint(0,
                                        len(self.memory) - 1,
                                        size=self.batch_size)

            laser_t_batch = np.empty(
                (self.batch_size, self.dim_laser[0], self.dim_laser[1]),
                dtype=np.float32)
            cmd_t_batch = np.empty((self.batch_size, self.dim_cmd),
                                   dtype=np.int64)
            cmd_next_t_batch = np.empty((self.batch_size, self.dim_cmd),
                                        dtype=np.int64)
            cmd_skip_t_batch = np.empty((self.batch_size, self.dim_cmd),
                                        dtype=np.int64)
            prev_action_t_batch = np.empty((self.batch_size, self.dim_action),
                                           dtype=np.float32)
            goal_t_batch = np.empty((self.batch_size, self.dim_goal),
                                    dtype=np.float32)
            goal_a_t_batch = np.empty((self.batch_size, self.dim_goal),
                                      dtype=np.float32)
            prev_state_2_t_batch = [
                np.empty((self.batch_size, self.n_hidden), dtype=np.float32),
                np.empty((self.batch_size, self.n_hidden), dtype=np.float32)
            ]
            action_t_batch = np.empty((self.batch_size, self.dim_action),
                                      dtype=np.float32)

            reward_batch = np.empty((self.batch_size), dtype=np.float32)
            terminate_batch = np.empty((self.batch_size), dtype=bool)

            status_batch = np.empty((self.batch_size, 1), dtype=np.int64)
            action_batch = np.empty((self.batch_size, self.dim_action),
                                    dtype=np.float32)

            laser_t1_batch = np.empty(
                (self.batch_size, self.dim_laser[0], self.dim_laser[1]),
                dtype=np.float32)
            cmd_t1_batch = np.empty((self.batch_size, self.dim_cmd),
                                    dtype=np.int64)
            cmd_next_t1_batch = np.empty((self.batch_size, self.dim_cmd),
                                         dtype=np.int64)
            cmd_skip_t1_batch = np.empty((self.batch_size, self.dim_cmd),
                                         dtype=np.int64)
            prev_action_t1_batch = np.empty((self.batch_size, self.dim_action),
                                            dtype=np.float32)
            goal_t1_batch = np.empty((self.batch_size, self.dim_goal),
                                     dtype=np.float32)
            goal_a_t1_batch = np.empty((self.batch_size, self.dim_goal),
                                       dtype=np.float32)
            prev_state_2_t1_batch = [
                np.empty((self.batch_size, self.n_hidden), dtype=np.float32),
                np.empty((self.batch_size, self.n_hidden), dtype=np.float32)
            ]
            action_t1_batch = np.empty((self.batch_size, self.dim_action),
                                       dtype=np.float32)

            for i, idx in enumerate(indices):
                laser_t_batch[i] = self.memory[idx][0]
                cmd_t_batch[i] = self.memory[idx][1]
                cmd_next_t_batch[i] = self.memory[idx][2]
                cmd_skip_t_batch[i] = self.memory[idx][3]
                prev_action_t_batch[i] = self.memory[idx][4]
                goal_t_batch[i] = self.memory[idx][5]
                prev_state_2_t_batch[0][i] = self.memory[idx][6][0][0]
                prev_state_2_t_batch[1][i] = self.memory[idx][6][1][0]
                action_t_batch[i] = self.memory[idx][7]

                reward_batch[i] = self.memory[idx][8]
                terminate_batch[i] = self.memory[idx][9]

                status_batch[i] = self.memory[idx][10]
                action_batch[i] = self.memory[idx][11]

                laser_t1_batch[i] = self.memory[idx + 1][0]
                cmd_t1_batch[i] = self.memory[idx + 1][1]
                cmd_next_t1_batch[i] = self.memory[idx + 1][2]
                # prev_action_t1_batch[i] = self.memory[idx+1][4]
                goal_t1_batch[i] = self.memory[idx + 1][5]
                prev_state_2_t1_batch[0][i] = self.memory[idx + 1][6][0][0]
                prev_state_2_t1_batch[1][i] = self.memory[idx + 1][6][1][0]
                action_t1_batch[i] = self.memory[idx + 1][8]

                if cmd_t_batch[i] == 5:
                    goal_a_t_batch[i] = self.memory[idx][5]
                else:
                    goal_a_t_batch[i] = [0., 0.]
                if cmd_t1_batch[i] == 5:
                    goal_a_t1_batch[i] = self.memory[idx + 1][5]
                else:
                    goal_a_t1_batch[i] = [0., 0.]

            return [
                laser_t_batch, cmd_t_batch, cmd_next_t_batch, cmd_skip_t_batch,
                prev_action_t_batch, goal_a_t_batch, goal_a_t_batch,
                prev_state_2_t_batch, action_t_batch, reward_batch,
                terminate_batch, status_batch, action_batch, laser_t1_batch,
                cmd_t1_batch, cmd_next_t1_batch, action_t_batch,
                goal_a_t1_batch, goal_a_t1_batch, action_t1_batch
            ], indices
        else:
            print('samples are not enough')
            return None, None

    def Train(self):
        start_time = time.time()

        batch, indices = self.SampleBatch()

        sample_time = time.time() - start_time

        if not batch:
            return 0.
        else:
            [
                laser_t_batch, cmd_t_batch, cmd_next_t_batch, cmd_skip_t_batch,
                prev_action_t_batch, goal_t_batch, goal_a_t_batch,
                prev_state_2_t_batch, action_t_batch, reward_batch,
                terminate_batch, status_batch, action_batch, laser_t1_batch,
                cmd_t1_batch, cmd_next_t1_batch, prev_action_t1_batch,
                goal_t1_batch, goal_a_t1_batch, action_t1_batch
            ] = batch

            #compute target y
            target_a_pred = self.actor.PredictTarget(
                laser=laser_t1_batch,
                cmd=cmd_t1_batch,
                cmd_next=cmd_next_t1_batch,
                prev_action=prev_action_t1_batch,
                obj_goal=goal_a_t1_batch)

            target_q_pred = self.critic.PredictTarget(
                laser=laser_t1_batch,
                cmd=cmd_t1_batch,
                cmd_next=cmd_next_t1_batch,
                prev_action=prev_action_t1_batch,
                obj_goal=goal_t1_batch,
                action=action_t1_batch)
            y = []
            for i in range(self.batch_size):
                if terminate_batch[i]:
                    y.append(reward_batch[i])
                else:
                    y.append(reward_batch[i] +
                             self.gamma * target_q_pred[i, 0])

            y = np.expand_dims(np.stack(y), axis=1)

            y_time = time.time() - start_time - sample_time

            # critic update
            q, _ = self.critic.Train(laser=laser_t_batch,
                                     cmd=cmd_t_batch,
                                     cmd_next=cmd_next_t_batch,
                                     prev_action=prev_action_t_batch,
                                     obj_goal=goal_t_batch,
                                     action=action_t_batch,
                                     y=y)

            # actions for a_gradients from critic
            actions, states_2 = self.actor.PredictOnline(
                laser=laser_t_batch,
                cmd=cmd_t_batch,
                cmd_next=cmd_next_t_batch,
                cmd_skip=cmd_skip_t_batch,
                prev_action=prev_action_t_batch,
                obj_goal=goal_a_t_batch,
                prev_state_2=prev_state_2_t_batch)

            # a_gradients
            a_gradients = self.critic.ActionGradients(
                laser=laser_t_batch,
                cmd=cmd_t_batch,
                cmd_next=cmd_next_t_batch,
                prev_action=prev_action_t_batch,
                obj_goal=goal_t_batch,
                action=actions)

            # actor update
            self.actor.Train(laser=laser_t_batch,
                             cmd=cmd_t_batch,
                             cmd_next=cmd_next_t_batch,
                             cmd_skip=cmd_skip_t_batch,
                             prev_action=prev_action_t_batch,
                             obj_goal=goal_a_t_batch,
                             prev_state_2=prev_state_2_t_batch,
                             a_gradient=a_gradients[0],
                             status_label=status_batch,
                             action_label=action_batch)

            train_time = time.time() - start_time - sample_time - y_time

            # target networks update
            self.critic.UpdateTarget()
            self.actor.UpdateTarget()

            target_time = (time.time() - start_time - sample_time - y_time -
                           train_time)

            # print 'sample_time:{:.3f}, y_time:{:.3f}, train_time:{:.3f}, target_time:{:.3f}'.format(sample_time,
            #                                                                                         y_time,
            #                                                                                         train_time,
            #                                                                                         target_time)

            return q
Example #8
class DDPGagent(object):
    def __init__(self, env):

        self.sess = tf.Session()
        K.set_session(self.sess)

        ## hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 64
        self.BUFFER_SIZE = 20000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.TAU,
                           self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.sess, self.state_dim, self.action_dim,
                             self.TAU, self.CRITIC_LEARNING_RATE)

        ## initialize for later gradient calculation
        self.sess.run(
            tf.global_variables_initializer())  # initialize all TF variables before training

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # save the results
        self.save_epi_reward = []

    ## Ornstein Uhlenbeck Noise
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        return x + rho * (
            mu - x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)

    ## computing TD target: y_k = r_k + gamma*Q(s_k+1, a_k+1)
    def td_target(self, rewards, q_values, dones):
        y_k = np.asarray(q_values)
        for i in range(q_values.shape[0]):  # loop over the batch
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.GAMMA * q_values[i]
        return y_k

    ## train the agent
    def train(self, max_episode_num):

        # initially copy the model weights to the target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        for ep in range(int(max_episode_num)):
            # reset OU noise
            pre_noise = np.zeros(self.action_dim)
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()
            while not done:
                # visualize the environment
                #self.env.render()
                # pick an action: shape = (1,)
                action = self.actor.predict(state)
                noise = self.ou_noise(pre_noise, dim=self.action_dim)
                # clip continuous action to be within action_bound
                action = np.clip(action + noise, -self.action_bound,
                                 self.action_bound)
                # observe reward, new_state
                next_state, reward, done, _ = self.env.step(action)
                # scale the reward (Pendulum-specific shaping to roughly [-1, 1])
                train_reward = (reward + 8) / 8
                # add transition to replay buffer
                self.buffer.add_buffer(state, action, train_reward, next_state,
                                       done)

                if self.buffer.buffer_size > 1000:  # start training once the buffer holds enough samples

                    # sample transitions from replay buffer
                    states, actions, rewards, next_states, dones = self.buffer.sample_batch(
                        self.BATCH_SIZE)
                    # predict target Q-values
                    target_qs = self.critic.target_predict(
                        [next_states,
                         self.actor.target_predict(next_states)])
                    # compute TD targets
                    y_i = self.td_target(rewards, target_qs, dones)
                    # train critic using sampled batch
                    self.critic.train_on_batch(states, actions, y_i)
                    # Q gradient wrt current policy
                    s_actions = self.actor.model.predict(
                        states)  # shape=(batch, 1),
                    # caution: NOT self.actor.predict !
                    # self.actor.model.predict(state) -> shape=(1,1)
                    # self.actor.predict(state) -> shape=(1,) -> type of gym action
                    s_grads = self.critic.dq_da(states, s_actions)
                    dq_das = np.array(s_grads).reshape((-1, self.action_dim))
                    # train actor
                    self.actor.train(states, dq_das)
                    # update both target network
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                # update current state
                pre_noise = noise
                state = next_state
                episode_reward += reward
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ',
                  episode_reward)

            self.save_epi_reward.append(episode_reward)

            ## save weights every episode
            #print('Now save')
            self.actor.save_weights("./save_weights/pendulum_actor.h5")
            self.critic.save_weights("./save_weights/pendulum_critic.h5")

        np.savetxt('./save_weights/pendulum_epi_reward.txt',
                   self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot the episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
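
A minimal sketch of driving Example #8's agent end to end. The weight file names and the `(reward + 8) / 8` shaping point to the Pendulum-v0 environment, but neither `gym` nor an environment id appears in the snippet, so both are assumptions:

import gym

env = gym.make("Pendulum-v0")
agent = DDPGagent(env)

# ./save_weights/ must already exist: train() writes pendulum_actor.h5 / pendulum_critic.h5 there
agent.train(max_episode_num=200)
agent.plot_result()
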
Example #9
class DDPG_Agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score
        self.score = 0
        self.count = 0
        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        self.total_reward += reward
        self.count += 1

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.count > 0:
            self.score = self.total_reward / float(self.count)
            if self.score > self.best_score:
                self.best_score = self.score
        else:
            self.score = 0

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
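
Example #9's agent is written against a `task` object rather than a Gym environment. Below is a sketch of the episode loop it expects, with a hypothetical stand-in task; the assumed interface (`reset()` plus `step(action)` returning `(next_state, reward, done)`) follows how the agent's `reset_episode` and `step` methods use it:

import numpy as np


class DummyTask:
    """Hypothetical stand-in; replace with the real task implementation."""
    state_size, action_size = 6, 3
    action_low, action_high = 0.0, 10.0

    def reset(self):
        return np.zeros(self.state_size)

    def step(self, action):
        next_state = np.random.randn(self.state_size)
        reward = -float(np.sum(np.square(action)))  # arbitrary placeholder reward
        done = np.random.rand() < 0.02              # ~50-step episodes on average
        return next_state, reward, done


task = DummyTask()
agent = DDPG_Agent(task)

for episode in range(500):
    state = agent.reset_episode()      # also resets the OU noise
    done = False
    while not done:
        action = agent.act(state)                     # noisy action from the local actor
        next_state, reward, done = task.step(action)  # assumed task interface
        agent.step(action, reward, next_state, done)  # store experience and learn when possible
        state = next_state
    print("Episode {:4d} | avg reward {:9.3f} | best {:9.3f}".format(
        episode, agent.score, agent.best_score))
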