Example #1
class Execute:
    def __init__(self, path):
        self.config = Configuration.construct(path)
        self.env = Environment(self.config)
        self.memory = ReplayMemory(self.config)
        self.model = Model(self.config)
        self.ep = None

    def get_epsilon(self, is_play):
        # In play mode return a fixed epsilon; in training mode decay it
        # linearly from ep_start to ep_final over ep_num_frames calls.
        if is_play:
            return self.config.play.ep
        ep_start = self.config.train.ep.start
        ep_final = self.config.train.ep.final
        ep_num_frames = self.config.train.ep.num_frames
        decay = (ep_start - ep_final) / ep_num_frames
        if self.ep is None:
            self.ep = ep_start
        self.ep = max(self.ep - decay, ep_final)
        return self.ep

    def log(self, **kwargs):
        print(", ".join(f"{name}: {value}" for name, value in kwargs.items()))

    # Run one episode: choose epsilon-greedy actions and, in training mode,
    # store transitions and optimize the model.
    def run_episode(self, episode=1, steps=0, is_play=True, debug=False):
        config = self.config

        self.env.reset()
        action = 1
        _, _, curr_state, is_done = self.env.step(action)
        total_reward = 0
        update_net = 0
        C = config.train.network_update_freq  # how often to sync the target (q-hat) network
        t = 0
        T = config.max_episode_length

        while not is_done and t < T:
            if t % config.action_repeat == 0:
                ep = self.get_epsilon(is_play)
                action = self.model.choose_action(curr_state, ep)
            prev_state, reward, curr_state, is_done = self.env.step(action)
            total_reward += reward
            t += 1

            if is_play:
                self.env.render("human")
                if debug and t % config.play.debug.time == 0:
                    self.log(ftype=self.env.get_frame_type(), action=action, reward=total_reward)
                continue

            self.memory.add((prev_state, action, reward, curr_state, is_done))
            if self.memory.get_size() > config.train.replay_start_size:
                for i in range(config.train.batch_run):
                    batch = self.memory.sample()
                    self.model.optimize(batch)
                    steps = (steps + 1) % C
                if steps % C == 0:
                    self.model.update_qhat()
                    update_net += 1

        if not is_play and debug and episode % config.train.debug.time == 0:
            self.log(ftype=self.env.get_frame_type(), total_reward=total_reward, network_update_steps=update_net, episode_time=t, ep=ep)

        return total_reward, steps

    def load_model(self):
        ftype = self.env.get_frame_type()
        in_size = self.env.get_in_size()
        num_actions = self.env.get_num_actions()
        self.model.load_model(ftype, in_size, num_actions)

    def play(self, debug=False):
        self.load_model()
        for ep in range(1):
            self.run_episode(is_play=True, debug=debug)

    def train(self, debug=False):
        self.load_model()
        optimize_steps = 0
        episodes = self.config.train.episodes
        for episode in range(1, episodes+1):
            reward, steps = self.run_episode(episode=episode, steps=optimize_steps, is_play=False, debug=debug)
            optimize_steps += steps
            if episode % self.config.train.save_model_episode == 0:
                self.model.save_model()
        self.model.update_qhat()
        self.model.save_model()

    def close(self):
        self.env.close()
        self.memory.close()
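
A minimal usage sketch for the Execute class above. The configuration path is a hypothetical placeholder, since the Configuration, Environment, ReplayMemory and Model classes are defined elsewhere.

# Usage sketch (assumption: "config.yml" is a placeholder path readable by Configuration.construct).
if __name__ == "__main__":
    executor = Execute("config.yml")
    try:
        executor.train(debug=True)   # training loop defined above
        executor.play(debug=False)   # render one episode with the trained model
    finally:
        executor.close()             # release the environment and replay memory
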
Example #2
class Agent:
    
    def __init__(self, args):

        # id of the Atari environment to load from the OpenAI Gym registry
        self.env_id = "PongNoFrameskip-v4"
        # create the environment
        self.env = Environment(self.env_id)

        # discount factor (gamma) used in the Q-value target
        self.discount_factor = 0.99
        self.batch_size = 64
        # how often (in steps) to update the main network (backpropagation)
        self.update_frequency = 4
        # how often (in steps) to synchronize the target network with the main network
        self.target_network_update_freq = 1000

        # stores transitions (stacked frames) for training and retrieves them in batches
        self.agent_history_length = 4
        self.memory = ReplayMemory(capacity=10000, batch_size=self.batch_size)

        # two neural networks: the online (main) network and the target network
        self.main_network = PongNetwork(num_actions=self.env.get_action_space_size(), agent_history_length=self.agent_history_length)
        self.target_network = PongNetwork(num_actions=self.env.get_action_space_size(), agent_history_length=self.agent_history_length)
        
        # Adam optimizer with standard hyperparameters
        self.optimizer = Adam(learning_rate=1e-4, epsilon=1e-6)
        # we start with a high exploration rate then slowly decrease it
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.replay_start_size = 10000

        # Huber loss on the TD error
        self.loss = tf.keras.losses.Huber()
        # running mean of the training loss
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        # running mean of the predicted Q-values
        self.q_metric = tf.keras.metrics.Mean(name="Q_value")

        # maximum number of frames to train for; training usually stops earlier
        # via the mean-score check in train()
        self.training_frames = int(1e7)

        # path to save the checkpoints, logs and the weights
        self.checkpoint_path = "./checkpoints/" + args.run_name
        self.tensorboard_writer = tf.summary.create_file_writer(self.checkpoint_path + "/runs/")
        self.print_log_interval = 10
        self.save_weight_interval = 10
        self.env.reset()
           

    # calculate the network loss on the replay buffer (Q-learning)
    def update_main_q_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
       
        with tf.GradientTape() as tape:
            ## THIS IS WHERE THE MAGIC HAPPENS!
            ## loss = Huber( Q(s, a), r + discount_factor * max_a' Q_target(s', a') )
            next_state_q = self.target_network(next_state_batch)
            next_state_max_q = tf.math.reduce_max(next_state_q, axis=1)
            expected_q = reward_batch + self.discount_factor * next_state_max_q * (1.0 - tf.cast(terminal_batch, tf.float32))
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)

        return loss

    
    # calculate the network loss on the replay buffer (Double Q-learning)
    def update_main_dq_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
        
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            ## Double DQN: the online network selects the best next action,
            ## the target network evaluates it
            q_online = self.main_network(next_state_batch)  # Use q values from online network
            action_q_online = tf.math.argmax(q_online, axis=1)  # optimal actions from the q_online
            q_target = self.target_network(next_state_batch)  # q values from the target network
            ddqn_q = tf.reduce_sum(q_target * tf.one_hot(action_q_online, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            expected_q = reward_batch + self.discount_factor * ddqn_q * (1.0 - tf.cast(terminal_batch, tf.float32))  # Corresponds to equation (4) in ddqn paper
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)

        return loss



    # get the next action index based on the state (84,84,4) and exploration rate
    def get_action(self, state, exploration_rate):
        recent_state = tf.expand_dims(state, axis=0)
        if tf.random.uniform((), minval=0, maxval=1, dtype=tf.float32) < exploration_rate:
            action = tf.random.uniform((), minval=0, maxval=self.env.get_action_space_size(), dtype=tf.int32)
        else:
            q_value = self.main_network(tf.cast(recent_state, tf.float32))
            action = tf.cast(tf.squeeze(tf.math.argmax(q_value, axis=1)), dtype=tf.int32)
        return action
        
    
    # get the epsilon value for the current step; schedule similar to https://openai.com/blog/openai-baselines-dqn/
    def get_eps(self, current_step, terminal_eps=0.01, terminal_frame_factor=25):
    
        terminal_eps_frame = self.final_explr_frame * terminal_frame_factor

        if current_step < self.replay_start_size:
            eps = self.init_explr
        elif self.replay_start_size <= current_step and current_step < self.final_explr_frame:
            eps = (self.final_explr - self.init_explr) / (self.final_explr_frame - self.replay_start_size) * (current_step - self.replay_start_size) + self.init_explr
        elif self.final_explr_frame <= current_step and current_step < terminal_eps_frame:
            eps = (terminal_eps - self.final_explr) / (terminal_eps_frame - self.final_explr_frame) * (current_step - self.final_explr_frame) + self.final_explr
        else:
            eps = terminal_eps
        return eps
    
        
    # copy over the weights between the main and target network to synchronize
    def update_target_network(self):
        main_vars = self.main_network.trainable_variables
        target_vars = self.target_network.trainable_variables
        for main_var, target_var in zip(main_vars, target_vars):
            target_var.assign(main_var)

    def train(self, algorithm='q'):
    
        total_step = 0
        episode = 0
        latest_mean_score = -99.99
        latest_100_score = deque(maxlen=100)
        # somewhat arbitrary, but well-trained agents reach a score of about 20 in Pong
        max_reward = 20.0

        # train until the mean reward reaches 20
        while latest_mean_score < max_reward:
            
            # reset the variable for the upcoming episode
            state = self.env.reset()
            episode_step = 0
            episode_score = 0.0
            done = False


            while not done:
                # while the episode is not done, calculate the epsilon and get the next action
                eps = self.get_eps(tf.constant(total_step, tf.float32))
                action = self.get_action(tf.constant(state), tf.constant(eps, tf.float32))
            
                next_state, reward, done, info = self.env.step(action)
                episode_score += reward

                self.memory.push(state, action, reward, next_state, done)
                state = next_state

                # update the main network from a sampled minibatch
                if (total_step % self.update_frequency == 0) and (total_step > self.replay_start_size):
                    indices = self.memory.get_minibatch_indices()
                    state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.generate_minibatch_samples(indices)
                    if algorithm == 'q':
                        self.update_main_q_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)
                    else:
                        self.update_main_dq_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)

                if (total_step % self.target_network_update_freq == 0) and (total_step > self.replay_start_size):
                    self.update_target_network()
                
                total_step += 1
                episode_step += 1

                if done:
                    latest_100_score.append(episode_score)
                    latest_mean_score = np.mean(latest_100_score)
                    self.write_summary(episode, latest_100_score, episode_score, total_step, eps)
                    episode += 1

                    if episode % self.print_log_interval == 0:
                        print("Episode: ", episode)
                        print("Latest 100 avg: {:.4f}".format(latest_mean_score))
                        print("Progress: {} / {} ( {:.2f} % )".format(
                            total_step, self.training_frames,
                            total_step / self.training_frames * 100))

                    if episode % self.save_weight_interval == 0:
                        print("Saving weights...")
                        self.main_network.save_weights(self.checkpoint_path + "/weights/episode_{}".format(episode))


    # write the summaries back to the tensorboard
    def write_summary(self, episode, latest_100_score, episode_score, total_step, eps):

        with self.tensorboard_writer.as_default():
            tf.summary.scalar("Reward", episode_score, step=episode)
            tf.summary.scalar("Latest 100 avg rewards", np.mean(latest_100_score), step=episode)
            tf.summary.scalar("Loss", self.loss_metric.result(), step=episode)
            tf.summary.scalar("Average Q", self.q_metric.result(), step=episode)
            tf.summary.scalar("Total Frames", total_step, step=episode)
            tf.summary.scalar("Epsilon", eps, step=episode)

        self.loss_metric.reset_states()
        self.q_metric.reset_states()
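
The epsilon schedule in get_eps above is piecewise linear in the frame count. A standalone sketch of the same schedule shows the three phases; eps_at is a hypothetical helper name, and the constants simply mirror the Agent fields above.

# Standalone sketch of the piecewise-linear epsilon schedule used in Agent.get_eps.
# eps_at is a hypothetical helper; the constants mirror the Agent fields above.
def eps_at(step, init_explr=1.0, final_explr=0.1, final_explr_frame=1_000_000,
           replay_start_size=10_000, terminal_eps=0.01, terminal_frame_factor=25):
    terminal_eps_frame = final_explr_frame * terminal_frame_factor
    if step < replay_start_size:        # pure exploration while the replay memory fills up
        return init_explr
    if step < final_explr_frame:        # anneal 1.0 -> 0.1
        return init_explr + (final_explr - init_explr) * (step - replay_start_size) / (final_explr_frame - replay_start_size)
    if step < terminal_eps_frame:       # anneal 0.1 -> 0.01
        return final_explr + (terminal_eps - final_explr) * (step - final_explr_frame) / (terminal_eps_frame - final_explr_frame)
    return terminal_eps                 # stay at 0.01 afterwards

for step in (0, 10_000, 500_000, 1_000_000, 10_000_000, 30_000_000):
    print(step, round(eps_at(step), 3))
# prints approximately 1.0, 1.0, 0.555, 0.1, 0.066, 0.01
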
Example #3
    encoding_specific = args.frozen_model
elif args.spatial_encoding == 'pc-gauss' or args.spatial_encoding == 'pc-gauss-softmax':
    encoding_specific = args.pc_gauss_sigma
elif args.spatial_encoding == 'pc-dog':
    encoding_specific = '{}-{}'.format(args.pc_gauss_sigma, args.pc_diff_sigma)
elif args.spatial_encoding == 'hex-trig':
    encoding_specific = args.hex_freq_coef
elif args.spatial_encoding == 'tile-coding':
    encoding_specific = '{}tiles_{}bins'.format(args.n_tiles, args.n_bins)

if not os.path.exists('data'):
    os.makedirs('data')

fname = 'data/random_walk_{}_{}_{}to{}_{}dim_{}steps.npz'.format(
    args.spatial_encoding,
    encoding_specific,
    args.limit_low,
    args.limit_high,
    args.dim,
    args.n_steps,
)

positions = np.zeros((args.n_steps, 2))
activations = np.zeros((args.n_steps, dim))

for n in range(args.n_steps):
    print('\x1b[2K\r Step {} of {}'.format(n + 1, args.n_steps), end="\r")
    activations[n, :], positions[n, :] = env.step()

np.savez(fname, positions=positions, activations=activations)
    ax = plt.axes(xlim=(args.limit_low, args.limit_high),
                  ylim=(args.limit_low, args.limit_high))
    line, = ax.plot([], [], lw=3)
    x = []
    y = []

    fig_pca = plt.figure()
    # ax_pca = plt.axes(xlim=(0, args.n_components), ylim=(0, args.n_components))
    ax_pca = plt.axes()
    image = ax_pca.imshow(
        np.random.uniform(0, 1, size=(args.n_components, args.n_components)))

    plt.show(block=False)

    for n in range(n_steps):
        activations, pos = env.step()

        if all_activations is None:
            all_activations = activations.reshape((1, dim)).copy()
        else:
            all_activations = np.append(all_activations,
                                        activations.reshape((1, dim)),
                                        axis=0)

        if all_pos is None:
            all_pos = pos.reshape((1, 2)).copy()
        else:
            all_pos = np.append(all_pos, pos.reshape((1, 2)), axis=0)

        if show_pca:
            # pca.fit(all_activations)
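
The first part of the fragment above saves the collected random walk with np.savez. A short sketch of reading that archive back, assuming the same fname constructed in the script; the array names follow the keywords passed to np.savez.

# Sketch: reading back the archive written by np.savez(fname, ...) above.
import numpy as np

data = np.load(fname)
positions = data['positions']      # (n_steps, 2) coordinates of the walk
activations = data['activations']  # (n_steps, dim) encoder activations per step
print(positions.shape, activations.shape)
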
Example #5
class Game(object):
    """A single episode of interaction with the environment."""
    def __init__(self, action_space_size: int, discount: float):
        self.environment = Environment()  # Game specific environment.
        self.history = []
        self.rewards = []
        self.child_visits = []
        self.root_values = []
        self.action_space_size = action_space_size
        self.discount = discount

    def terminal(self) -> bool:
        # Game specific termination rules.
        pass

    def legal_actions(self) -> List[Action]:
        # Game specific calculation of legal actions.
        return []

    def apply(self, action: Action):
        reward = self.environment.step(action)
        self.rewards.append(reward)
        self.history.append(action)

    def store_search_statistics(self, root: Node):
        sum_visits = sum(child.visit_count for child in root.children.values())
        action_space = (Action(index)
                        for index in range(self.action_space_size))
        self.child_visits.append([
            root.children[a].visit_count /
            sum_visits if a in root.children else 0 for a in action_space
        ])
        self.root_values.append(root.value())

    def make_image(self, state_index: int):
        # Game specific feature planes.
        return []

    def make_target(self, state_index: int, num_unroll_steps: int,
                    td_steps: int, to_play: Player):
        # The value target is the discounted root value of the search tree N steps
        # into the future, plus the discounted sum of all rewards until then.
        targets = []
        for current_index in range(state_index,
                                   state_index + num_unroll_steps + 1):
            bootstrap_index = current_index + td_steps
            if bootstrap_index < len(self.root_values):
                value = self.root_values[
                    bootstrap_index] * self.discount**td_steps
            else:
                value = 0

            if self.rewards:
                for i, reward in enumerate(
                        self.rewards[current_index:bootstrap_index]):
                    value += reward * self.discount**i  # pytype: disable=unsupported-operands

            if current_index < len(self.root_values):
                if self.rewards:
                    targets.append((value, self.rewards[current_index],
                                    self.child_visits[current_index]))
                else:
                    targets.append(
                        (value, 0, self.child_visits[current_index]))
            else:
                # States past the end of games are treated as absorbing states.
                targets.append((0, 0, []))
        return targets

    def to_play(self) -> Player:
        return Player()

    def action_history(self) -> ActionHistory:
        return ActionHistory(self.history, self.action_space_size)
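
The value target assembled in make_target is an n-step return: the discounted root value td_steps into the future plus the discounted rewards collected along the way. A small numeric sketch of that formula follows; the rewards and the bootstrap root value are made-up illustration numbers, not taken from any game.

# Numeric sketch of the n-step value target computed in make_target above.
discount = 0.99
td_steps = 3
rewards = [1.0, 0.0, 1.0]       # r_t, r_{t+1}, r_{t+2}
bootstrap_root_value = 5.0      # search value stored at step t + td_steps

value = bootstrap_root_value * discount ** td_steps
for i, reward in enumerate(rewards):
    value += reward * discount ** i
print(value)  # 5.0 * 0.99**3 + 1.0 + 0.99**2 ≈ 6.83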