Example #1
    def __init__(self, master, env, task_id, n_iter, start_at_iter=0):
        super(AKTThread, self).__init__()
        self.master = master
        self.config = self.master.config
        self.task_id = task_id
        self.nA = env.action_space.n
        self.n_iter = n_iter
        self.start_at_iter = start_at_iter
        self.add_accum_grad = None  # To be filled in later

        self.build_networks()
        self.states = self.master.states
        self.session = self.master.session
        self.task_runner = EnvRunner(env, TaskPolicy(self.action, self),
                                     self.master.config)

        # Write the summary of each task in a different directory
        self.writer = tf.summary.FileWriter(
            os.path.join(self.master.monitor_path, "task" + str(self.task_id)),
            self.master.session.graph)

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])
Example #2
    def __init__(self, envs, monitor_path, **usercfg):
        super(KnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.n_tasks = len(envs)
        self.monitor_path = monitor_path
        self.nA = envs[0].action_space.n
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=100,
                switch_at_iter=None,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.build_networks()
        self.task_runners = [
            EnvRunner(envs[i], TaskPolicy(action, self), self.config)
            for i, action in enumerate(self.action_tensors)
        ]
        if self.config["save_model"]:
            for action_tensor in self.action_tensors:
                tf.add_to_collection("action", action_tensor)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
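
The two config.update calls above establish precedence: the dict of library defaults is written first, then the caller's keyword arguments overwrite any keys they share. A minimal sketch of that pattern, using two keys that appear in the defaults above (the override value is made up):

# Sketch of the defaults-then-overrides pattern used in __init__ above.
config = {}
config.update(dict(n_iter=100, learning_rate=0.005))  # library defaults
usercfg = dict(learning_rate=0.001)                   # caller-supplied override (hypothetical value)
config.update(usercfg)                                # user settings win
assert config == {"n_iter": 100, "learning_rate": 0.001}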
Example #3
    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = True,
                 **usercfg) -> None:
        super(REINFORCE, self).__init__(**usercfg)
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                learning_rate=0.05,
                entropy_coef=1e-3,
                n_hidden_layers=2,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.states = tf.placeholder(tf.float32, [None] +
                                     list(self.env.observation_space.shape),
                                     name="states")  # Observation
        self.actions_taken = tf.placeholder(
            tf.float32, name="actions_taken")  # Discrete action
        self.advantage = tf.placeholder(tf.float32,
                                        name="advantage")  # Advantage
        self.build_network()
        self.make_trainer()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        summary_loss = tf.summary.scalar("model/loss", self.summary_loss)
        summaries = [summary_loss]
        if hasattr(self, "entropy"):
            summary_entropy = tf.summary.scalar("model/entropy", self.entropy)
            summaries += [summary_entropy]
        self.summary_op = tf.summary.merge(summaries)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)

        self.env_runner = EnvRunner(self.env,
                                    self,
                                    usercfg,
                                    summary_writer=self.writer)
Example #4
class REINFORCE(Agent):
    """
    REINFORCE with baselines
    """
    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = True,
                 **usercfg) -> None:
        super(REINFORCE, self).__init__(**usercfg)
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                learning_rate=0.05,
                entropy_coef=1e-3,
                n_hidden_layers=2,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.states = tf.placeholder(tf.float32, [None] +
                                     list(self.env.observation_space.shape),
                                     name="states")  # Observation
        self.actions_taken = tf.placeholder(
            tf.float32, name="actions_taken")  # Discrete action
        self.advantage = tf.placeholder(tf.float32,
                                        name="advantage")  # Advantage
        self.build_network()
        self.make_trainer()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        summary_loss = tf.summary.scalar("model/loss", self.summary_loss)
        summaries = [summary_loss]
        if hasattr(self, "entropy"):
            summary_entropy = tf.summary.scalar("model/entropy", self.entropy)
            summaries += [summary_entropy]
        self.summary_op = tf.summary.merge(summaries)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)

        self.env_runner = EnvRunner(self.env,
                                    self,
                                    usercfg,
                                    summary_writer=self.writer)

    def _initialize(self) -> None:
        self.session.run(self.init_op)

    def build_network(self):
        raise NotImplementedError()

    def make_trainer(self):
        raise NotImplementedError()

    def choose_action(self, state, features) -> Dict[str, np.ndarray]:
        """Choose an action."""
        action = self.session.run([self.action],
                                  feed_dict={self.states: [state]})[0]
        return {"action": action}

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory.states for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory.rewards, config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory.actions for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                sum(trajectory.rewards) for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory.rewards) for trajectory in trajectories
            ])  # episode lengths
            # TODO: deal with RNN state
            summary, _ = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.states: all_state,
                    self.actions_taken: all_action,
                    self.advantage: all_adv
                })
            self.writer.add_summary(summary, iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
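
The learn() loop above relies on discount_rewards, which is imported from elsewhere in the library and not shown in these examples. All it needs from that helper is the discounted return-to-go of a reward sequence; a minimal numpy sketch of a function with that behavior (an assumption about the helper, not the library's actual implementation):

import numpy as np

def discount_rewards_sketch(rewards, gamma):
    """Hypothetical stand-in for the library's discount_rewards:
    returns[t] = rewards[t] + gamma * rewards[t + 1] + gamma**2 * rewards[t + 2] + ..."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# With gamma = 0.99, [1, 1, 1] maps to [1 + 0.99 + 0.99**2, 1 + 0.99, 1].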
Example #5
    def __init__(self, env, monitor_path: str, video: bool = True, **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(
            env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))

        self.config.update(dict(
            n_iter=100,
            gamma=0.99,
            learning_rate=0.001,
            n_hidden_units=20,
            n_hidden_layers=1,
            gradient_clip_value=0.5,
            n_local_steps=20,
            vf_coef=0.5,
            entropy_coef=0.01,
            loss_reducer="mean",
            save_model=False
        ))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step",
            [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(
            self.config["learning_rate"], name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(
            grads, self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar("model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar("model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss", tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(os.path.join(
            self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)
        return
Example #6
class A2C(Agent):
    """Advantage Actor Critic"""

    def __init__(self, env, monitor_path: str, video: bool = True, **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(
            env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))

        self.config.update(dict(
            n_iter=100,
            gamma=0.99,
            learning_rate=0.001,
            n_hidden_units=20,
            n_hidden_layers=1,
            gradient_clip_value=0.5,
            n_local_steps=20,
            vf_coef=0.5,
            entropy_coef=0.01,
            loss_reducer="mean",
            save_model=False
        ))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step",
            [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(
            self.config["learning_rate"], name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(
            grads, self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar("model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar("model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss", tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(os.path.join(
            self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)
        return

    def _initialize(self):
        self.session.run(self.init_op)

    def build_networks(self):
        raise NotImplementedError("Abstract method")

    def make_loss(self):
        raise NotImplementedError("Abstract method")

    @property
    def global_step(self):
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, features):
        return self.session.run([self.ac_net.value], feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, features) -> dict:
        action, value = self.session.run(
            [self.ac_net.action, self.ac_net.value], feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action) -> int:
        return np.argmax(action)

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        config = self.config
        for _ in range(int(config["n_iter"])):
            # Collect n_local_steps transitions of experience
            trajectory = self.env_runner.get_steps(int(self.config["n_local_steps"]))
            v = 0 if trajectory.terminals[-1] else self.get_critic_value(
                np.asarray(trajectory.states)[None, -1], trajectory.features[-1])
            rewards_plus_v = np.asarray(trajectory.rewards + [v])
            vpred_t = np.asarray(trajectory.values + [v])
            delta_t = trajectory.rewards + \
                self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
            batch_r = discount_rewards(
                rewards_plus_v, self.config["gamma"])[:-1]
            batch_adv = discount_rewards(delta_t, self.config["gamma"])
            fetches = [self.loss_summary_op, self.train_op, self._global_step]
            states = np.asarray(trajectory.states)
            feed_dict = {
                self.states: states,
                self.actions_taken: np.asarray(trajectory.actions),
                self.advantage: batch_adv,
                self.ret: np.asarray(batch_r)
            }
            feature = trajectory.features[0]
            if feature != [] and feature is not None:
                feed_dict[self.ac_net.rnn_state_in] = feature
            summary, _, global_step = self.session.run(fetches, feed_dict)
            self.writer.add_summary(summary, global_step)
            self.writer.flush()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session, os.path.join(
                self.monitor_path, "model"))
Example #7
File: ppo.py Project: rkc007/yarll
    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = False,
                 **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2,  # Clipped surrogate objective epsilon
                save_model=False))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        self.actor_loss = -tf.reduce_mean(
            self.make_actor_loss(self.old_network, self.new_network,
                                 self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std",
                                            tf.sqrt(adv_std))

        # TODO: get from ppo_loss function
        # ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
        # summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        # summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))

        ret_mean, ret_std = tf.nn.moments(self.ret, axes=[0])
        summary_ret_mean = tf.summary.scalar("model/return/mean", ret_mean)
        summary_ret_std = tf.summary.scalar("model/return/std",
                                            tf.sqrt(ret_std))
        summary_entropy = tf.summary.scalar("model/entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        summaries: List[tf.Tensor] = []
        # Weight summaries: not turned on right now because they take too much space
        # TODO: use config to make this optional
        #for v in tf.trainable_variables():
        #    if "new_network" in v.name:
        #        summaries.append(tf.summary.histogram(v.name, v))
        summaries += self._specific_summaries()
        summaries += [
            summary_actor_loss,
            summary_critic_loss,
            summary_loss,
            summary_adv_mean,
            summary_adv_std,
            # summary_ratio_mean, summary_ratio_std,
            summary_new_log_prob_mean,
            summary_old_log_prob_mean,
            summary_ret_mean,
            summary_ret_std,
            summary_entropy,
            summary_grad_norm,
            summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        return
Example #8
File: ppo.py Project: rkc007/yarll
class PPO(Agent):
    """Proximal Policy Optimization agent."""
    RNN = False

    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = False,
                 **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2,  # Clipped surrogate objective epsilon
                save_model=False))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        self.actor_loss = -tf.reduce_mean(
            self.make_actor_loss(self.old_network, self.new_network,
                                 self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std",
                                            tf.sqrt(adv_std))

        # TODO: get from ppo_loss function
        # ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
        # summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        # summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))

        ret_mean, ret_std = tf.nn.moments(self.ret, axes=[0])
        summary_ret_mean = tf.summary.scalar("model/return/mean", ret_mean)
        summary_ret_std = tf.summary.scalar("model/return/std",
                                            tf.sqrt(ret_std))
        summary_entropy = tf.summary.scalar("model/entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        summaries: List[tf.Tensor] = []
        # Weight summaries: not turned on right now because they take too much space
        # TODO: use config to make this optional
        #for v in tf.trainable_variables():
        #    if "new_network" in v.name:
        #        summaries.append(tf.summary.histogram(v.name, v))
        summaries += self._specific_summaries()
        summaries += [
            summary_actor_loss,
            summary_critic_loss,
            summary_loss,
            summary_adv_mean,
            summary_adv_std,
            # summary_ratio_mean, summary_ratio_std,
            summary_new_log_prob_mean,
            summary_old_log_prob_mean,
            summary_ret_mean,
            summary_ret_std,
            summary_entropy,
            summary_grad_norm,
            summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        return

    def _initialize(self):
        self.session.run(self.init_op)

    def _specific_summaries(self) -> List[tf.Tensor]:
        """Summaries that are specific to the variant of the algorithm. None (empty list) for the base algorithm"""
        return []

    def make_actor_loss(self, old_network, new_network, advantage):
        return ppo_loss(old_network.action_log_prob,
                        new_network.action_log_prob,
                        self.config["cso_epsilon"], advantage)

    def build_networks(self):
        raise NotImplementedError

    @property
    def global_step(self):
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, *rest):
        return self.session.run([self.value],
                                feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, *rest):
        action, value = self.session.run([self.action, self.value],
                                         feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action):
        return np.argmax(action)

    def get_processed_trajectories(self):
        experiences = self.env_runner.get_steps(
            int(self.config["n_local_steps"]),
            stop_at_trajectory_end=False)
        T = experiences.steps
        v = 0 if experiences.terminals[-1] else self.get_critic_value(
            np.asarray(experiences.states)[None, -1], experiences.features[-1])
        vpred = np.asarray(experiences.values + [v])
        gamma = self.config["gamma"]
        lambda_ = self.config["gae_lambda"]
        gaelam = advantages = np.empty(T, 'float32')
        last_gaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - experiences.terminals[t]
            delta = experiences.rewards[t] + \
                gamma * vpred[t + 1] * nonterminal - vpred[t]
            gaelam[t] = last_gaelam = \
                delta + gamma * lambda_ * nonterminal * last_gaelam
        rs = advantages + experiences.values
        return experiences.states, experiences.actions, advantages, rs, experiences.features

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        config = self.config
        n_updates = 0
        for _ in range(int(config["n_iter"])):
            # Collect n_local_steps transitions and their GAE advantages and returns
            states, actions, advs, rs, _ = self.get_processed_trajectories()
            advs = np.array(advs)
            normalized_advs = (advs - advs.mean()) / advs.std()
            self.session.run(self.set_old_to_new)

            indices = np.arange(len(states))
            for _ in range(int(self.config["n_epochs"])):
                np.random.shuffle(indices)

                batch_size = int(self.config["batch_size"])
                for j in range(0, len(states), batch_size):
                    batch_indices = indices[j:(j + batch_size)]
                    batch_states = np.array(states)[batch_indices]
                    batch_actions = np.array(actions)[batch_indices]
                    batch_advs = np.array(normalized_advs)[batch_indices]
                    batch_rs = np.array(rs)[batch_indices]
                    fetches = [self.train_op]
                    if (n_updates % 1000) == 0:
                        fetches.append(self.model_summary_op)
                    feed_dict = {
                        self.states: batch_states,
                        self.old_network.states: batch_states,
                        self.actions_taken: batch_actions,
                        self.old_network.actions_taken: batch_actions,
                        self.advantage: batch_advs,
                        self.ret: batch_rs
                    }
                    results = self.session.run(fetches, feed_dict)
                    if (n_updates % 1000) == 0:
                        self.writer.add_summary(results[-1], n_updates)
                    n_updates += 1
                self.writer.flush()

            if self.config["save_model"]:
                self.saver.save(self.session,
                                os.path.join(self.monitor_path, "model"))
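
make_actor_loss above delegates to ppo_loss, which is imported from elsewhere in yarll and not shown here. Given its arguments (old and new action log-probabilities, cso_epsilon and the advantages) and the fact that the agent takes -reduce_mean of its output as the actor loss, it is expected to compute the per-sample clipped surrogate objective. A numpy sketch of that objective (an assumption about the helper's behavior, not its actual code):

import numpy as np

def ppo_loss_sketch(old_log_prob, new_log_prob, epsilon, advantage):
    """Hypothetical stand-in for yarll's ppo_loss: PPO's clipped surrogate
    objective, computed per sample."""
    ratio = np.exp(new_log_prob - old_log_prob)                  # pi_new(a|s) / pi_old(a|s)
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return np.minimum(ratio * advantage, clipped_ratio * advantage)

# Arbitrary sample values; epsilon matches the cso_epsilon default of 0.2.
old_lp = np.log(np.array([0.5, 0.2, 0.3]))
new_lp = np.log(np.array([0.6, 0.1, 0.3]))
adv = np.array([1.0, -0.5, 0.2])
actor_loss = -np.mean(ppo_loss_sketch(old_lp, new_lp, 0.2, adv))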
Example #9
class AKTThread(Thread):
    """Asynchronous knowledge transfer learner thread. Used to learn using one specific variation of a task."""
    def __init__(self, master, env, task_id, n_iter, start_at_iter=0):
        super(AKTThread, self).__init__()
        self.master = master
        self.config = self.master.config
        self.task_id = task_id
        self.nA = env.action_space.n
        self.n_iter = n_iter
        self.start_at_iter = start_at_iter
        self.add_accum_grad = None  # To be filled in later

        self.build_networks()
        self.states = self.master.states
        self.session = self.master.session
        self.task_runner = EnvRunner(env, TaskPolicy(self.action, self),
                                     self.master.config)

        # Write the summary of each task in a different directory
        self.writer = tf.summary.FileWriter(
            os.path.join(self.master.monitor_path, "task" + str(self.task_id)),
            self.master.session.graph)

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])

    def build_networks(self):
        with tf.variable_scope("task{}".format(self.task_id)):
            self.sparse_representation = tf.Variable(
                tf.truncated_normal(
                    [self.master.config["n_sparse_units"], self.nA],
                    mean=0.0,
                    stddev=0.02))
            self.probs = tf.nn.softmax(
                tf.matmul(
                    self.master.L1,
                    tf.matmul(self.master.knowledge_base,
                              self.sparse_representation)))

            self.action = tf.squeeze(tf.multinomial(tf.log(self.probs), 1),
                                     name="action")

            good_probabilities = tf.reduce_sum(
                tf.multiply(
                    self.probs,
                    tf.one_hot(tf.cast(self.master.action_taken, tf.int32),
                               self.nA)),
                reduction_indices=[1])
            eligibility = tf.log(good_probabilities +
                                 1e-10) * self.master.advantage
            self.loss = -tf.reduce_sum(eligibility)

    def run(self):
        """Run the appropriate learning algorithm."""
        if self.master.learning_method == "REINFORCE":
            self.learn_REINFORCE()
        else:
            self.learn_Karpathy()

    def learn_REINFORCE(self):
        """Learn using updates like in the REINFORCE algorithm."""
        reporter = Reporter()
        total_n_trajectories = 0
        iteration = self.start_at_iter
        while iteration < self.n_iter and not self.master.stop_requested:
            iteration += 1
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.task_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], self.config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            results = self.master.session.run(
                [self.loss, self.apply_grad],
                feed_dict={
                    self.master.states: all_state,
                    self.master.action_taken: all_action,
                    self.master.advantage: all_adv
                })
            print("Task:", self.task_id)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
            summary = self.master.session.run(
                [self.master.summary_op],
                feed_dict={
                    self.master.loss: results[0],
                    self.master.reward: np.mean(episode_rewards),
                    self.master.episode_length: np.mean(episode_lengths)
                })
            self.writer.add_summary(summary[0], iteration)
            self.writer.flush()

    def learn_Karpathy(self):
        """Learn using updates like in the Karpathy algorithm."""
        iteration = self.start_at_iter
        while iteration < self.n_iter and not self.master.stop_requested:  # Keep executing episodes until the master requests a stop (e.g. using SIGINT)
            iteration += 1
            trajectory = self.task_runner.get_trajectory()
            reward = sum(trajectory["reward"])
            action_taken = trajectory["action"]

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = discounted_episode_rewards

            results = self.master.session.run(
                [self.loss, self.apply_grad],
                feed_dict={
                    self.master.states: trajectory["state"],
                    self.master.action_taken: action_taken,
                    self.master.advantage: feedback
                })
            results = self.master.session.run(
                [self.master.summary_op],
                feed_dict={
                    self.master.loss: results[0],
                    self.master.reward: reward,
                    self.master.episode_length: trajectory["steps"]
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()
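
build_networks above factors each task's policy head into a shared knowledge base multiplied by a per-task sparse representation, followed by a softmax over actions. A small numpy sketch of just that forward pass, with random inputs; n_hidden_units=10 and n_sparse_units=10 match the defaults from Example #2, and nA=2 is an arbitrary action count:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

n_hidden_units, n_sparse_units, nA = 10, 10, 2
L1 = np.random.randn(1, n_hidden_units)                            # shared hidden-layer output for one state
knowledge_base = np.random.randn(n_hidden_units, n_sparse_units)   # shared across all tasks
sparse_representation = np.random.randn(n_sparse_units, nA)        # learned separately per task

# Mirrors probs = softmax(L1 @ (knowledge_base @ sparse_representation)) in build_networks.
probs = softmax(L1 @ (knowledge_base @ sparse_representation))
action = np.random.choice(nA, p=probs[0])  # sample an action, like tf.multinomial above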