class Execute:
    def __init__(self, path):
        self.config = Configuration.construct(path)
        self.env = Environment(self.config)
        self.memory = ReplayMemory(self.config)
        self.model = Model(self.config)
        self.ep = None

    def get_epsilon(self, is_play):
        # epsilon-greedy schedule: linear decay from ep_start to ep_final over num_frames
        if is_play:
            return self.config.play.ep
        ep_start = self.config.train.ep.start
        ep_final = self.config.train.ep.final
        ep_num_frames = self.config.train.ep.num_frames
        decay = (ep_start - ep_final) / ep_num_frames
        if self.ep is None:
            self.ep = ep_start
        self.ep = max(self.ep - decay, ep_final)
        return self.ep

    def log(self, **kwargs):
        log = ""
        for name, value in kwargs.items():
            log += f"{name}: {value}, "
        print(log)

    def run_episode(self, episode=1, steps=0, is_play=True, debug=False):
        config = self.config
        self.env.reset()
        action = 1
        _, _, curr_state, is_done = self.env.step(action)
        total_reward = 0
        update_net = 0
        C = config.train.network_update_freq
        t = 0
        T = config.max_episode_length

        while not is_done and t < T:
            # pick a new action only every action_repeat frames
            if t % config.action_repeat == 0:
                ep = self.get_epsilon(is_play)
                action = self.model.choose_action(curr_state, ep)
            prev_state, reward, curr_state, is_done = self.env.step(action)
            total_reward += reward
            t += 1

            if is_play:
                self.env.render("human")
                if debug and t % config.play.debug.time == 0:
                    self.log(ftype=self.env.get_frame_type(), action=action, reward=total_reward)
                continue

            self.memory.add((prev_state, action, reward, curr_state, is_done))
            # only start optimizing once the replay buffer is warm
            if self.memory.get_size() > config.train.replay_start_size:
                for i in range(config.train.batch_run):
                    batch = self.memory.sample()
                    self.model.optimize(batch)
                    steps = (steps + 1) % C
                    # synchronize the target network every C optimization steps
                    if steps % C == 0:
                        self.model.update_qhat()
                        update_net += 1

        if not is_play and debug and episode % config.train.debug.time == 0:
            self.log(ftype=self.env.get_frame_type(), total_reward=total_reward,
                     network_update_steps=update_net, episode_time=t, ep=ep)

        return total_reward, steps

    def load_model(self):
        ftype = self.env.get_frame_type()
        in_size = self.env.get_in_size()
        num_actions = self.env.get_num_actions()
        self.model.load_model(ftype, in_size, num_actions)

    def play(self, debug=False):
        self.load_model()
        for ep in range(1):
            self.run_episode(is_play=True, debug=debug)

    def train(self, debug=False):
        self.load_model()
        optimize_steps = 0
        episodes = self.config.train.episodes
        for episode in range(1, episodes + 1):
            reward, steps = self.run_episode(episode=episode, steps=optimize_steps,
                                             is_play=False, debug=debug)
            optimize_steps += steps
            if episode % self.config.train.save_model_episode == 0:
                self.model.save_model()
        self.model.update_qhat()
        self.model.save_model()

    def close(self):
        self.env.close()
        self.memory.close()
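To see how this class is meant to be driven, here is a minimal, hypothetical driver: the config path, the CLI flags, and the train/play switch are illustrative assumptions, not part of the original code.

# Hypothetical driver for Execute; the flag names and config path are illustrative only.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="configs/pong.yaml")  # assumed config location
    parser.add_argument("--mode", choices=["train", "play"], default="train")
    parser.add_argument("--debug", action="store_true")
    cli = parser.parse_args()

    runner = Execute(cli.config)
    try:
        if cli.mode == "train":
            runner.train(debug=cli.debug)
        else:
            runner.play(debug=cli.debug)
    finally:
        # release the environment and replay memory even on interruption
        runner.close()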
class Agent:
    def __init__(self, args):
        # which Gym environment to load
        self.env_id = "PongNoFrameskip-v4"
        # create the environment
        self.env = Environment(self.env_id)
        # discount factor in the Q-value formula
        self.discount_factor = 0.99
        self.batch_size = 64
        # how often to update the main network (backpropagation)
        self.update_frequency = 4
        # how often to synchronize with the target network
        self.target_network_update_freq = 1000
        # number of stacked frames fed to the network
        self.agent_history_length = 4
        # keeps track of the frames for training, and retrieves them in batches
        self.memory = ReplayMemory(capacity=10000, batch_size=self.batch_size)
        # two neural networks: one main (online) network and one target network
        self.main_network = PongNetwork(num_actions=self.env.get_action_space_size(),
                                        agent_history_length=self.agent_history_length)
        self.target_network = PongNetwork(num_actions=self.env.get_action_space_size(),
                                          agent_history_length=self.agent_history_length)
        # Adam optimizer, a standard choice
        self.optimizer = Adam(learning_rate=1e-4, epsilon=1e-6)
        # start with a high exploration rate, then slowly decrease it
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.replay_start_size = 10000
        # Huber loss for the TD error
        self.loss = tf.keras.losses.Huber()
        # running mean of the training loss
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        # running mean of the predicted Q values
        self.q_metric = tf.keras.metrics.Mean(name="Q_value")
        # maximum number of frames to train; training usually stops well before this
        self.training_frames = int(1e7)
        # paths for checkpoints, logs and weights
        self.checkpoint_path = "./checkpoints/" + args.run_name
        self.tensorboard_writer = tf.summary.create_file_writer(self.checkpoint_path + "/runs/")
        self.print_log_interval = 10
        self.save_weight_interval = 10
        self.env.reset()

    # calculate the network loss on the replay buffer (Q-learning)
    def update_main_q_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            # L = Q(s, a) - (r + discount_factor * max_a' Q(s', a'))
            next_state_q = self.target_network(next_state_batch)
            next_state_max_q = tf.math.reduce_max(next_state_q, axis=1)
            expected_q = reward_batch + self.discount_factor * next_state_max_q * (1.0 - tf.cast(terminal_batch, tf.float32))
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return loss

    # calculate the network loss on the replay buffer (Double Q-learning)
    def update_main_dq_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            # here we maintain two Q estimates: the online network picks the best action
            # in the next state, and the target network evaluates that action
            q_online = self.main_network(next_state_batch)       # Q values from the online network
            action_q_online = tf.math.argmax(q_online, axis=1)   # optimal actions according to q_online
            q_target = self.target_network(next_state_batch)     # Q values from the target network
            ddqn_q = tf.reduce_sum(q_target * tf.one_hot(action_q_online, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            expected_q = reward_batch + self.discount_factor * ddqn_q * (1.0 - tf.cast(terminal_batch, tf.float32))  # corresponds to equation (4) in the DDQN paper
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return loss

    # get the next action index based on the state (84, 84, 4) and the exploration rate
    def get_action(self, state, exploration_rate):
        recent_state = tf.expand_dims(state, axis=0)
        if tf.random.uniform((), minval=0, maxval=1, dtype=tf.float32) < exploration_rate:
            action = tf.random.uniform((), minval=0, maxval=self.env.get_action_space_size(), dtype=tf.int32)
        else:
            q_value = self.main_network(tf.cast(recent_state, tf.float32))
            action = tf.cast(tf.squeeze(tf.math.argmax(q_value, axis=1)), dtype=tf.int32)
        return action

    # get the epsilon value for the current step. Similar to https://openai.com/blog/openai-baselines-dqn/
    def get_eps(self, current_step, terminal_eps=0.01, terminal_frame_factor=25):
        terminal_eps_frame = self.final_explr_frame * terminal_frame_factor

        if current_step < self.replay_start_size:
            eps = self.init_explr
        elif self.replay_start_size <= current_step < self.final_explr_frame:
            eps = (self.final_explr - self.init_explr) / (self.final_explr_frame - self.replay_start_size) * (current_step - self.replay_start_size) + self.init_explr
        elif self.final_explr_frame <= current_step < terminal_eps_frame:
            eps = (terminal_eps - self.final_explr) / (terminal_eps_frame - self.final_explr_frame) * (current_step - self.final_explr_frame) + self.final_explr
        else:
            eps = terminal_eps
        return eps

    # copy the weights from the main network to the target network to synchronize them
    def update_target_network(self):
        main_vars = self.main_network.trainable_variables
        target_vars = self.target_network.trainable_variables
        for main_var, target_var in zip(main_vars, target_vars):
            target_var.assign(main_var)

    def train(self, algorithm='q'):
        total_step = 0
        episode = 0
        latest_mean_score = -99.99
        latest_100_score = deque(maxlen=100)
        # somewhat arbitrary, but a well-trained agent tops out around +20 in Pong
        max_reward = 20.0

        # train until the mean reward over the last 100 episodes reaches max_reward
        while latest_mean_score < max_reward:
            # reset the variables for the upcoming episode
            state = self.env.reset()
            episode_step = 0
            episode_score = 0.0
            done = False

            while not done:
                # while the episode is not done, compute epsilon and pick the next action
                eps = self.get_eps(tf.constant(total_step, tf.float32))
                action = self.get_action(tf.constant(state), tf.constant(eps, tf.float32))

                next_state, reward, done, info = self.env.step(action)
                episode_score += reward

                self.memory.push(state, action, reward, next_state, done)
                state = next_state

                # update the network
                if (total_step % self.update_frequency == 0) and (total_step > self.replay_start_size):
                    indices = self.memory.get_minibatch_indices()
                    state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.generate_minibatch_samples(indices)
                    if algorithm == 'q':
                        self.update_main_q_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)
                    else:
                        self.update_main_dq_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)

                if (total_step % self.target_network_update_freq == 0) and (total_step > self.replay_start_size):
                    self.update_target_network()

                total_step += 1
                episode_step += 1

                if done:
                    latest_100_score.append(episode_score)
                    self.write_summary(episode, latest_100_score, episode_score, total_step, eps)
                    episode += 1

                    if episode % self.print_log_interval == 0:
                        print("Episode: ", episode)
                        print("Latest 100 avg: {:.4f}".format(np.mean(latest_100_score)))
                        print("Progress: {} / {} ( {:.2f} % )".format(total_step, self.training_frames, np.round(total_step / self.training_frames, 3) * 100))
                        latest_mean_score = np.mean(latest_100_score)

                    if episode % self.save_weight_interval == 0:
                        print("Saving weights...")
                        self.main_network.save_weights(self.checkpoint_path + "/weights/episode_{}".format(episode))

    # write the summaries to TensorBoard
    def write_summary(self, episode, latest_100_score, episode_score, total_step, eps):
        with self.tensorboard_writer.as_default():
            tf.summary.scalar("Reward", episode_score, step=episode)
            tf.summary.scalar("Latest 100 avg rewards", np.mean(latest_100_score), step=episode)
            tf.summary.scalar("Loss", self.loss_metric.result(), step=episode)
            tf.summary.scalar("Average Q", self.q_metric.result(), step=episode)
            tf.summary.scalar("Total Frames", total_step, step=episode)
            tf.summary.scalar("Epsilon", eps, step=episode)

        self.loss_metric.reset_states()
        self.q_metric.reset_states()
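Assuming the imports used above (tensorflow as tf, numpy as np, collections.deque, the Adam optimizer, and the Environment, ReplayMemory and PongNetwork helpers) are available, a minimal entry point could look like the sketch below; Agent.__init__ only reads args.run_name, and the --algorithm flag is an illustrative addition.

# Illustrative entry point; only run_name is actually read by Agent.__init__.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_name", default="pong_ddqn")                  # used for checkpoint/log paths
    parser.add_argument("--algorithm", choices=["q", "dq"], default="dq")   # 'q' = DQN, 'dq' = Double DQN
    args = parser.parse_args()

    agent = Agent(args)
    # train() loops until the 100-episode average reward reaches 20
    agent.train(algorithm=args.algorithm)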
    encoding_specific = args.frozen_model
elif args.spatial_encoding == 'pc-gauss' or args.spatial_encoding == 'pc-gauss-softmax':
    encoding_specific = args.pc_gauss_sigma
elif args.spatial_encoding == 'pc-dog':
    encoding_specific = '{}-{}'.format(args.pc_gauss_sigma, args.pc_diff_sigma)
elif args.spatial_encoding == 'hex-trig':
    encoding_specific = args.hex_freq_coef
elif args.spatial_encoding == 'tile-coding':
    encoding_specific = '{}tiles_{}bins'.format(args.n_tiles, args.n_bins)

if not os.path.exists('data'):
    os.makedirs('data')

fname = 'data/random_walk_{}_{}_{}to{}_{}dim_{}steps.npz'.format(
    args.spatial_encoding,
    encoding_specific,
    args.limit_low,
    args.limit_high,
    args.dim,
    args.n_steps,
)

positions = np.zeros((args.n_steps, 2))
activations = np.zeros((args.n_steps, dim))

for n in range(args.n_steps):
    print('\x1b[2K\r Step {} of {}'.format(n + 1, args.n_steps), end="\r")
    activations[n, :], positions[n, :] = env.step()

np.savez(fname, positions=positions, activations=activations)
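A quick way to verify the archive is to reload it with np.load; this small check reuses the fname built above and only assumes the script ran to completion.

# Sanity check on the generated dataset (fname as constructed above).
data = np.load(fname)
positions = data['positions']      # shape: (n_steps, 2) -- x, y coordinates of the walk
activations = data['activations']  # shape: (n_steps, dim) -- encoding activations per step
print(positions.shape, activations.shape)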
ax = plt.axes(xlim=(args.limit_low, args.limit_high), ylim=(args.limit_low, args.limit_high))
line, = ax.plot([], [], lw=3)

x = []
y = []

fig_pca = plt.figure()
# ax_pca = plt.axes(xlim=(0, args.n_components), ylim=(0, args.n_components))
ax_pca = plt.axes()
image = ax_pca.imshow(
    np.random.uniform(0, 1, size=(args.n_components, args.n_components)))

plt.show(block=False)

for n in range(n_steps):
    activations, pos = env.step()

    if all_activations is None:
        all_activations = activations.reshape((1, dim)).copy()
    else:
        all_activations = np.append(all_activations, activations.reshape((1, dim)), axis=0)

    if all_pos is None:
        all_pos = pos.reshape((1, 2)).copy()
    else:
        all_pos = np.append(all_pos, pos.reshape((1, 2)), axis=0)

    if show_pca:
        # pca.fit(all_activations)
class Game(object):
    """A single episode of interaction with the environment."""

    def __init__(self, action_space_size: int, discount: float):
        self.environment = Environment()  # Game specific environment.
        self.history = []
        self.rewards = []
        self.child_visits = []
        self.root_values = []
        self.action_space_size = action_space_size
        self.discount = discount

    def terminal(self) -> bool:
        # Game specific termination rules.
        pass

    def legal_actions(self) -> List[Action]:
        # Game specific calculation of legal actions.
        return []

    def apply(self, action: Action):
        reward = self.environment.step(action)
        self.rewards.append(reward)
        self.history.append(action)

    def store_search_statistics(self, root: Node):
        sum_visits = sum(child.visit_count for child in root.children.values())
        action_space = (Action(index) for index in range(self.action_space_size))
        self.child_visits.append([
            root.children[a].visit_count / sum_visits if a in root.children else 0
            for a in action_space
        ])
        self.root_values.append(root.value())

    def make_image(self, state_index: int):
        # Game specific feature planes.
        return []

    def make_target(self, state_index: int, num_unroll_steps: int, td_steps: int,
                    to_play: Player):
        # The value target is the discounted root value of the search tree N steps
        # into the future, plus the discounted sum of all rewards until then.
        targets = []
        for current_index in range(state_index, state_index + num_unroll_steps + 1):
            bootstrap_index = current_index + td_steps
            if bootstrap_index < len(self.root_values):
                value = self.root_values[bootstrap_index] * self.discount**td_steps
            else:
                value = 0

            if self.rewards:
                for i, reward in enumerate(self.rewards[current_index:bootstrap_index]):
                    value += reward * self.discount**i  # pytype: disable=unsupported-operands

            if current_index < len(self.root_values):
                if self.rewards:
                    targets.append((value, self.rewards[current_index],
                                    self.child_visits[current_index]))
                else:
                    targets.append((value, 0, self.child_visits[current_index]))
            else:
                # States past the end of games are treated as absorbing states.
                targets.append((0, 0, []))
        return targets

    def to_play(self) -> Player:
        return Player()

    def action_history(self) -> ActionHistory:
        return ActionHistory(self.history, self.action_space_size)
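As a rough illustration of how these pieces feed training, the sketch below samples one training example from a finished game; the sample_targets helper, the random starting position, and the unroll/TD settings are illustrative assumptions rather than part of the pseudocode above.

# Illustrative use of Game.make_target when building one training example.
# num_unroll_steps and td_steps are placeholder hyperparameters; the random
# starting position stands in for a replay buffer's sampling rule.
import random

def sample_targets(game: Game, num_unroll_steps: int = 5, td_steps: int = 10):
    state_index = random.randrange(len(game.root_values))
    image = game.make_image(state_index)
    actions = game.history[state_index:state_index + num_unroll_steps]
    targets = game.make_target(state_index, num_unroll_steps, td_steps, game.to_play())
    # each target is a (value, reward, policy) tuple for one unrolled step
    return image, actions, targets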