Example #1
    def __init__(self, env_id, task_id, comm, monitor_path, config, seed=None):
        super(DPPOWorker, self).__init__()
        self.comm = comm
        self.config = config
        self.env = make(env_id)
        self.task_id = task_id
        if seed is not None:
            self.env.seed(seed)
        self.writer = tf.summary.FileWriter(
            os.path.join(monitor_path, "task{}".format(task_id)))

        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        with tf.device('/cpu:0'):
            with tf.variable_scope(
                    "new_network"):  # The workers only have 1 network
                self.global_network = self.build_networks()
                self.states = self.global_network.states
                self.action = self.global_network.action
                self.value = self.global_network.value
                self.global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    tf.get_variable_scope().name)
                self._global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

            self.env_runner = EnvRunner(self.env,
                                        self, {},
                                        summary_writer=self.writer)
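
The non-trainable "global_step" variable above is a common TF 1.x pattern: because it is created with trainable=False, optimizers ignore it and it is advanced explicitly with assign_add. A minimal, self-contained sketch of that pattern (not from this library, assuming TensorFlow 1.x):

    import tensorflow as tf

    # Counter excluded from TRAINABLE_VARIABLES, so no optimizer updates it.
    global_step = tf.get_variable(
        "global_step", [], tf.int32,
        initializer=tf.constant_initializer(0, dtype=tf.int32),
        trainable=False)
    increment = global_step.assign_add(1)  # advanced explicitly

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(increment)
        print(sess.run(global_step))  # -> 1
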
Example #2
    def __init__(self, master, env, task_id, n_iter, start_at_iter=0):
        super(AKTThread, self).__init__()
        self.master = master
        self.config = self.master.config
        self.task_id = task_id
        self.nA = env.action_space.n
        self.n_iter = n_iter
        self.start_at_iter = start_at_iter
        self.add_accum_grad = None  # To be filled in later

        self.build_networks()
        self.states = self.master.states
        self.session = self.master.session
        self.task_runner = EnvRunner(env, TaskPolicy(self.action, self),
                                     self.master.config)

        # Write the summary of each task in a different directory
        self.writer = tf.summary.FileWriter(
            os.path.join(self.master.monitor_path, "task" + str(self.task_id)),
            self.master.session.graph)

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])
Example #3
    def __init__(self, envs, monitor_path, **usercfg):
        super(KnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.n_tasks = len(envs)
        self.monitor_path = monitor_path
        self.nA = envs[0].action_space.n
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=100,
                switch_at_iter=None,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.build_networks()
        self.task_runners = [
            EnvRunner(envs[i], TaskPolicy(action, self), self.config)
            for i, action in enumerate(self.action_tensors)
        ]
        if self.config["save_model"]:
            for action_tensor in self.action_tensors:
                tf.add_to_collection("action", action_tensor)
            tf.add_to_collection("states", self.states)
            self.saver = tf.train.Saver()
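
The two config.update calls above follow a defaults-then-overrides pattern: the library defaults are written first, then any keyword arguments supplied by the user replace them. A plain-Python sketch of the same idea (the names and values here are illustrative, not taken from the library):

    defaults = dict(n_iter=100, gamma=0.99, learning_rate=0.005)
    usercfg = dict(learning_rate=0.01)   # user-supplied overrides

    config = {}
    config.update(defaults)   # defaults first
    config.update(usercfg)    # user values win on conflict
    print(config["learning_rate"])  # -> 0.01
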
Example #4
    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))
        self.env_runner = EnvRunner(self.env, self, usercfg)

        self.config.update(
            dict(timesteps_per_batch=10000,
                 trajectories_per_batch=10,
                 batch_update="timesteps",
                 n_iter=100,
                 gamma=0.99,
                 actor_learning_rate=0.01,
                 critic_learning_rate=0.05,
                 actor_n_hidden=20,
                 critic_n_hidden=20,
                 repeat_n_actions=1,
                 save_model=False))
        self.config.update(usercfg)
        self.build_networks()
        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = tf.train.Saver()
        self.rewards = tf.placeholder("float", name="Rewards")
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_actor_loss = tf.summary.scalar("Actor_loss",
                                               self.summary_actor_loss)
        summary_critic_loss = tf.summary.scalar("Critic_loss",
                                                self.summary_critic_loss)
        summary_rewards = tf.summary.scalar("Rewards", self.rewards)
        summary_episode_lengths = tf.summary.scalar("Episode_lengths",
                                                    self.episode_lengths)
        self.summary_op = tf.summary.merge([
            summary_actor_loss, summary_critic_loss, summary_rewards,
            summary_episode_lengths
        ])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        return
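
The Rewards and Episode_lengths placeholders above exist only so Python-side statistics can be written to TensorBoard: the scalar summaries are merged once, then fed with concrete values after each batch. A minimal TF 1.x sketch of that feeding pattern (the path and value are illustrative):

    import tensorflow as tf

    rewards = tf.placeholder(tf.float32, name="Rewards")
    summary_op = tf.summary.merge([tf.summary.scalar("Rewards", rewards)])

    with tf.Session() as sess:
        writer = tf.summary.FileWriter("/tmp/summaries", sess.graph)
        summary = sess.run(summary_op, feed_dict={rewards: 12.5})
        writer.add_summary(summary, global_step=0)
        writer.close()
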
Example #5
    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(REINFORCE, self).__init__(**usercfg)
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))
        self.env_runner = EnvRunner(self.env, self, usercfg)
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.05,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.build_network()
        self.make_trainer()

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = tf.train.Saver()
        self.rewards = tf.placeholder("float", name="Rewards")
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_loss = tf.summary.scalar("Loss", self.summary_loss)
        summary_rewards = tf.summary.scalar("Rewards", self.rewards)
        summary_episode_lengths = tf.summary.scalar("Episode_lengths",
                                                    self.episode_lengths)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)
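
The gamma=0.99 entry above is the discount factor applied to past rewards. A pure-Python sketch (illustrative only, not library code) of the discounted-return computation a REINFORCE-style agent typically performs with it:

    def discounted_returns(rewards, gamma=0.99):
        returns = []
        running = 0.0
        for r in reversed(rewards):
            running = r + gamma * running
            returns.append(running)
        return list(reversed(returns))

    print(discounted_returns([1.0, 1.0, 1.0]))  # ~[2.9701, 1.99, 1.0]
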
Example #6
    def __init__(self,
                 env,
                 monitor_path: str,
                 video: bool = True,
                 **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))

        self.config.update(
            dict(n_iter=100,
                 gamma=0.99,
                 learning_rate=0.001,
                 n_hidden_units=20,
                 n_hidden_layers=1,
                 gradient_clip_value=0.5,
                 n_local_steps=20,
                 vf_coef=0.5,
                 entropy_coef=0.01,
                 loss_reducer="mean",
                 save_model=False))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(self.config["learning_rate"],
                                                name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(grads,
                                          self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar(
            "model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar(
            "model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss",
                                         tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env,
                                    self,
                                    usercfg,
                                    summary_writer=self.writer)
        return
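
The gradients above are rescaled with tf.clip_by_global_norm before being applied, using gradient_clip_value=0.5. A small NumPy illustration of what that rescaling does (the helper name is made up here; the behavior mirrors the TensorFlow op):

    import numpy as np

    def clip_by_global_norm(grads, clip_norm):
        # Joint L2 norm over all gradient tensors.
        global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
        if global_norm <= clip_norm:
            return grads, global_norm
        scale = clip_norm / global_norm
        return [g * scale for g in grads], global_norm

    grads = [np.array([3.0, 4.0]), np.array([0.0])]   # global norm = 5.0
    clipped, norm = clip_by_global_norm(grads, 0.5)
    print(norm, clipped[0])  # 5.0, first gradient rescaled to ~[0.3 0.4]
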
Example #7
    def __init__(self, env, monitor_path: str, video=False, **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2  # Clipped surrogate objective epsilon
            ))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        ratio = tf.exp(self.new_network.action_log_prob -
                       self.old_network.action_log_prob)
        ratio_clipped = tf.clip_by_value(ratio,
                                         1.0 - self.config["cso_epsilon"],
                                         1.0 + self.config["cso_epsilon"])
        cso_loss = tf.minimum(ratio * self.advantage,
                              ratio_clipped * self.advantage)
        self.actor_loss = -tf.reduce_mean(cso_loss)
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]
        self.session = tf.Session()
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std", adv_std)

        ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
        summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))

        summary_ret = tf.summary.scalar("model/return/mean",
                                        tf.reduce_mean(self.ret))
        summary_entropy = tf.summary.scalar("model/entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        summaries = []
        for v in tf.trainable_variables():
            if "new_network" in v.name:
                summaries.append(tf.summary.histogram(v.name, v))
        summaries += [
            summary_actor_loss, summary_critic_loss, summary_loss,
            summary_adv_mean, summary_adv_std, summary_ratio_mean,
            summary_ratio_std, summary_new_log_prob_mean,
            summary_old_log_prob_mean, summary_ret, summary_entropy,
            summary_grad_norm, summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        init = tf.global_variables_initializer()
        self.session.run(init)
        return
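
The actor loss above is PPO's clipped surrogate objective: the new-to-old probability ratio is clipped to [1 - cso_epsilon, 1 + cso_epsilon], the element-wise minimum of the clipped and unclipped terms is averaged, and the result is negated. A small NumPy sketch of that computation (illustrative only, not library code):

    import numpy as np

    def clipped_surrogate_loss(new_log_prob, old_log_prob, advantage, epsilon=0.2):
        ratio = np.exp(new_log_prob - old_log_prob)
        ratio_clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
        cso = np.minimum(ratio * advantage, ratio_clipped * advantage)
        return -np.mean(cso)

    print(clipped_surrogate_loss(np.array([-0.1]),
                                 np.array([-0.5]),
                                 np.array([1.0])))  # -> -1.2
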