def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn,
                       network='mlp',
                       nsteps=32,
                       total_timesteps=32,
                       seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test,
             model_fn=partial(MicrobatchedModel, microbatch_size=2))
    # learn_fn(env=env_test)
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)
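
A small NumPy sketch (illustrative only, not part of the baselines test) of why microbatched gradient accumulation can reproduce full-batch training: for a loss that is a mean over samples, the average of the per-microbatch gradients equals the full-batch gradient, so the trained variables should agree up to numerical tolerance, which is what the assertion above checks.

def _microbatch_gradient_sketch():
    import numpy as np
    rng = np.random.RandomState(0)
    x = rng.randn(32, 4)                            # hypothetical batch of 32 samples
    w = rng.randn(4)
    grad_full = 2 * x.T @ (x @ w) / len(x)          # gradient of mean((x @ w) ** 2)
    grads_micro = [2 * xb.T @ (xb @ w) / len(xb)    # same loss on microbatches of 2
                   for xb in np.split(x, 16)]
    assert np.allclose(grad_full, np.mean(grads_micro, axis=0))
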
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    if network_fn.endswith('lstm') and learn_fn in [
            'acer', 'acktr', 'trpo_mpi', 'deepq'
    ]:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(
                v,
                variables_dict2[k],
                atol=0.01,
                err_msg='saved and loaded variable {} value mismatch'.format(
                    k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)
Example No. 3
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
Example No. 4
def train(args):
    from model.encoder import bi_direction_lstm
    from model.action_decoder import MlpPolicy
    from model.mlp_state_decoder import MlpPolicy_state
    U.make_session(num_cpu=1).__enter__()
    env = humanoid_CMU.stand()
    obs_space = env.physics.data.qpos
    ac_space = env.action_spec()

    def encoder(name):
        return bi_direction_lstm(name=name,
                                 obs_space=obs_space,
                                 batch_size=args.lstm_batch,
                                 time_steps=args.time_steps,
                                 LSTM_size=args.LSTM_size,
                                 laten_size=args.laten_size)

    def action_decorder(name):
        return MlpPolicy(name=name,
                         obs_space=obs_space,
                         ac_space=ac_space,
                         embedding_shape=args.laten_size,
                         hid_size=args.pol_hid_size,
                         num_hid_layers=args.pol_layers)

    def state_decorder(name):
        return MlpPolicy_state(name=name,
                               obs_space=obs_space,
                               embedding_shape=args.laten_size,
                               hid_size=args.state_de_hid_size,
                               num_hid_layers=args.state_de_hid_num)

    state_dataset = load_state_dataset(args.state_dir_path, env,
                                       args.control_timestep)
    learn(encoder=encoder,
          action_decorder=action_decorder,
          state_decorder=state_decorder,
          embedding_shape=args.laten_size,
          dataset=state_dataset,
          logdir=args.logdir,
          batch_size=args.lstm_batch,
          time_steps=args.time_steps,
          epsilon=args.epsilon,
          lr_rate=args.lr_rate)
Example No. 5
def train(args):
    from model.encoder import bi_direction_lstm
    from model.action_decoder import MlpPolicy
    from model.WaveNet import WaveNetModel
    U.make_session(num_cpu=1).__enter__()
    env = humanoid_CMU.stand()
    obs_space = env.physics.data.qpos
    ac_space = env.action_spec()

    def encoder(name):
        return bi_direction_lstm(name=name,
                                 obs_space=obs_space,
                                 batch_size=args.lstm_batch,
                                 time_steps=args.time_steps,
                                 LSTM_size=args.LSTM_size,
                                 laten_size=args.laten_size)

    def action_decorder(name):
        return MlpPolicy(name=name,
                         obs_space=obs_space,
                         ac_space=ac_space,
                         embedding_shape=args.laten_size,
                         hid_size=args.pol_hid_size,
                         num_hid_layers=args.pol_layers)

    with open(args.wavenet_params, 'r') as f:
        wavenet_params = json.load(f)
    def state_decorder(name):  # this decoder also needs a name
        return WaveNetModel(
            name=name,
            obs_shape=obs_space,
            embedding_shape=args.laten_size,
            batch_size=args.time_steps,
            dilations=wavenet_params["dilations"],
            filter_width=wavenet_params["filter_width"],
            residual_channels=wavenet_params["residual_channels"],
            dilation_channels=wavenet_params["dilation_channels"],
            skip_channels=wavenet_params["skip_channels"],
            quantization_channels=wavenet_params["quantization_channels"],
            use_biases=wavenet_params["use_biases"],
            scalar_input=wavenet_params["scalar_input"],
            initial_filter_width=wavenet_params["initial_filter_width"],
            histograms=args.histograms,
            global_condition_channels=args.gc_channels)
    state_dataset = load_state_dataset(args.state_dir_path, env,
                                       args.control_timestep)
    # The data may be a bit limited; consider adding more walking data.
    optimizer = optimizer_factory[args.optimizer](
        learning_rate=args.learning_rate,
        momentum=args.momentum)
    learn(env=env,
          encoder=encoder,
          action_decorder=action_decorder,
          state_decorder=state_decorder,
          embedding_shape=args.laten_size,
          dataset=state_dataset,
          optimizer=optimizer,
          logdir=args.logdir,
          batch_size=args.lstm_batch,
          time_steps=args.time_steps)
Example No. 6
def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''

    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be useable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return

    if network_fn.endswith('lstm') and learn_fn in [
            'acktr', 'trpo_mpi', 'deepq'
    ]:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn,
                    env=env,
                    network=network_fn,
                    total_timesteps=0,
                    **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())
Example No. 7
    def load(path, num_cpu=16):
        with open(path, "rb") as f:
            model_data, act_params = dill.load(f)
        act = deepq.build_act(**act_params)
        sess = U.make_session(num_cpu=num_cpu)
        sess.__enter__()
        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            U.load_state(os.path.join(td, "model"))

        return ActWrapper(act, act_params)
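
A hedged sketch of the save-side counterpart that `load` above expects: the TensorFlow state is written into a zip archive and pickled together with `act_params` as a `(model_data, act_params)` tuple via dill. Illustrative only; it assumes the same `U` tf_util helpers used by `load`.

def save(act_params, path):
    import os
    import tempfile
    import zipfile

    import dill

    with tempfile.TemporaryDirectory() as td:
        U.save_state(os.path.join(td, "model"))         # dump the current TF variables
        arc_path = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_path:            # do not pack the archive into itself
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_path, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        dill.dump((model_data, act_params), f)           # layout unpacked by `load`
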
Example No. 8
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.compat.v1.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.compat.v1.placeholder(
                        tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.compat.v1.placeholder(tf.float32,
                                                               shape=(None, 1),
                                                               name='rewards')
                    self.actions_ph = tf.compat.v1.placeholder(
                        tf.float32,
                        shape=(None, ) + self.action_space.shape,
                        name='actions')
                    self.learning_rate_ph = tf.compat.v1.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.compat.v1.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(
                        input_tensor=self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True)

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(
                            self.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)
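                    # Illustrative example of the 'auto' heuristic above: for a
                    # 6-dimensional continuous action space, target_entropy = -6.0.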

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef,
                                  str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.compat.v1.get_variable(
                            'log_ent_coef',
                            dtype=tf.float32,
                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.compat.v1.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean(input_tensor=(q_backup -
                                                                  qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean(input_tensor=(q_backup -
                                                                  qf2)**2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            input_tensor=self.log_ent_coef *
                            tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.compat.v1.train.AdamOptimizer(
                            learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(
                        input_tensor=self.ent_coef * logp_pi - qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean(
                        input_tensor=(value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.compat.v1.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss,
                        var_list=tf_util.get_trainable_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.compat.v1.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = tf_util.get_trainable_vars(
                        'model/values_fn')

                    source_params = tf_util.get_trainable_vars(
                        "model/values_fn")
                    target_params = tf_util.get_trainable_vars(
                        "target/values_fn")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.compat.v1.assign(target, (1 - self.tau) * target +
                                            self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.compat.v1.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            'policy_loss', 'qf1_loss', 'qf2_loss',
                            'value_loss', 'entropy'
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss, qf1_loss, qf2_loss, value_loss, qf1,
                            qf2, value_fn, logp_pi, self.entropy,
                            policy_train_op, train_values_op
                        ]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(
                                    ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += [
                                    'ent_coef_loss', 'ent_coef'
                                ]
                                self.step_ops += [
                                    ent_coef_op, ent_coef_loss, self.ent_coef
                                ]

                    # Monitor losses and entropy in tensorboard
                    tf.compat.v1.summary.scalar('policy_loss', policy_loss)
                    tf.compat.v1.summary.scalar('qf1_loss', qf1_loss)
                    tf.compat.v1.summary.scalar('qf2_loss', qf2_loss)
                    tf.compat.v1.summary.scalar('value_loss', value_loss)
                    tf.compat.v1.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.compat.v1.summary.scalar('ent_coef_loss',
                                                    ent_coef_loss)
                        tf.compat.v1.summary.scalar('ent_coef', self.ent_coef)

                    tf.compat.v1.summary.scalar(
                        'learning_rate',
                        tf.reduce_mean(input_tensor=self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars(
                    "target/values_fn")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.compat.v1.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.compat.v1.summary.merge_all()
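
A minimal NumPy sketch (illustrative only) of the Polyak averaging performed by `self.target_update_op` above: each target variable is moved a fraction `tau` towards its source counterpart, so the target network slowly tracks the trained one.

import numpy as np

tau = 0.005                             # hypothetical soft-update coefficient
source = np.array([1.0, 2.0, 3.0])      # stands in for a 'model/values_fn' variable
target = np.zeros(3)                    # stands in for the matching 'target/values_fn' variable
for _ in range(1000):
    target = (1 - tau) * target + tau * source   # same form as the assign op above
print(target)                           # after many updates, target is close to source
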
Example No. 9
def main(args):
    from ppo1 import mlp_policy
    U.make_session(num_cpu=args.num_cpu).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            reuse=reuse, hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, ret_threshold=args.ret_threshold, traj_limitation=args.traj_limitation)
    pretrained_weight = None
    if (args.pretrained and args.task == 'train') or args.algo == 'bc':
        # Pretrain with behavior cloning
        from gailtf.algo import behavior_clone
        if args.algo == 'bc' and args.task == 'evaluate':
            behavior_clone.evaluate(env, policy_fn, args.load_model_path, stochastic_policy=args.stochastic_policy)
            sys.exit()
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
            max_iters=args.BC_max_iter, pretrained=args.pretrained,
            ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir, task_name=task_name)
        if args.algo == 'bc':
            sys.exit()

    from gailtf.network.adversary import TransitionClassifier
    # discriminator
    discriminator = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
    if args.algo == 'trpo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gailtf.algo import trpo_mpi
        if args.task == 'train':
            trpo_mpi.learn(env, policy_fn, discriminator, dataset,
                pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                g_step=args.g_step, d_step=args.d_step,
                timesteps_per_batch=1024,
                max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                max_timesteps=args.num_timesteps,
                entcoeff=args.policy_entcoeff, gamma=0.995, lam=0.97,
                vf_iters=5, vf_stepsize=1e-3,
                ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir,
                save_per_iter=args.save_per_iter, load_model_path=args.load_model_path,
                task_name=task_name)
        elif args.task == 'evaluate':
            trpo_mpi.evaluate(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
                number_trajs=10, stochastic_policy=args.stochastic_policy)
        else: raise NotImplementedError
    elif args.algo == 'ppo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gailtf.algo import ppo_mpi
        if args.task == 'train':
            ppo_mpi.learn(env, policy_fn, discriminator, dataset,
                          # pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                          timesteps_per_batch=1024,
                          g_step=args.g_step, d_step=args.d_step,
                          # max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                          clip_param=0.2, entcoeff=args.policy_entcoeff,
                          max_timesteps=args.num_timesteps,
                          gamma=0.99, lam=0.95,
                          # vf_iters=5, vf_stepsize=1e-3,
                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                          d_stepsize=3e-4,
                          schedule='linear', ckpt_dir=args.checkpoint_dir,
                          save_per_iter=100, task=args.task,
                          sample_stochastic=args.stochastic_policy,
                          load_model_path=args.load_model_path,
                          task_name=task_name)
        elif args.task == 'evaluate':
            ppo_mpi.evaluate(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
                              number_trajs=10, stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    else: raise NotImplementedError

    env.close()
Example No. 10
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.n_batch = self.n_envs * self.n_steps

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\
                        "the number of environments run in parallel should be a multiple of nminibatches."
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_batch // self.nminibatches

                act_model = self.policy(self.sess,
                                        self.observation_space,
                                        self.action_space,
                                        self.n_envs,
                                        1,
                                        n_batch_step,
                                        reuse=False,
                                        **self.policy_kwargs)
                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs // self.nminibatches,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.action_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.old_neglog_pac_ph = tf.placeholder(
                        tf.float32, [None], name="old_neglog_pac_ph")
                    self.old_vpred_ph = tf.placeholder(tf.float32, [None],
                                                       name="old_vpred_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")
                    self.clip_range_ph = tf.placeholder(tf.float32, [],
                                                        name="clip_range_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.action_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())

                    vpred = train_model.value_flat

                    # Value function clipping: not present in the original PPO
                    if self.cliprange_vf is None:
                        # Default behavior (legacy from OpenAI baselines):
                        # use the same clipping as for the policy
                        self.clip_range_vf_ph = self.clip_range_ph
                        self.cliprange_vf = self.cliprange
                    elif isinstance(self.cliprange_vf,
                                    (float, int)) and self.cliprange_vf < 0:
                        # Original PPO implementation: no value function clipping
                        self.clip_range_vf_ph = None
                    else:
                        # Last possible behavior: clipping range
                        # specific to the value function
                        self.clip_range_vf_ph = tf.placeholder(
                            tf.float32, [], name="clip_range_vf_ph")

                    if self.clip_range_vf_ph is None:
                        # No clipping
                        vpred_clipped = train_model.value_flat
                    else:
                        # Clip the different between old and new value
                        # NOTE: this depends on the reward scaling
                        vpred_clipped = self.old_vpred_ph + \
                            tf.clip_by_value(train_model.value_flat - self.old_vpred_ph,
                                             - self.clip_range_vf_ph, self.clip_range_vf_ph)

                    vf_losses1 = tf.square(vpred - self.rewards_ph)
                    vf_losses2 = tf.square(vpred_clipped - self.rewards_ph)
                    self.vf_loss = .5 * tf.reduce_mean(
                        tf.maximum(vf_losses1, vf_losses2))

                    ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                    pg_losses = -self.advs_ph * ratio
                    pg_losses2 = -self.advs_ph * tf.clip_by_value(
                        ratio, 1.0 - self.clip_range_ph,
                        1.0 + self.clip_range_ph)
                    self.pg_loss = tf.reduce_mean(
                        tf.maximum(pg_losses, pg_losses2))
                    self.approxkl = .5 * tf.reduce_mean(
                        tf.square(neglogpac - self.old_neglog_pac_ph))
                    self.clipfrac = tf.reduce_mean(
                        tf.cast(
                            tf.greater(tf.abs(ratio - 1.0),
                                       self.clip_range_ph), tf.float32))
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('approximate_kullback-leibler',
                                      self.approxkl)
                    tf.summary.scalar('clip_factor', self.clipfrac)
                    tf.summary.scalar('loss', loss)

                    with tf.variable_scope('model'):
                        self.params = tf.trainable_variables()
                        if self.full_tensorboard_log:
                            for var in self.params:
                                tf.summary.histogram(var.name, var)
                    grads = tf.gradients(loss, self.params)
                    if self.max_grad_norm is not None:
                        grads, _grad_norm = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))
                trainer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate_ph, epsilon=1e-5)
                self._train = trainer.apply_gradients(grads)

                self.loss_names = [
                    'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                    'clipfrac'
                ]

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_range_ph))
                    if self.clip_range_vf_ph is not None:
                        tf.summary.scalar(
                            'clip_range_vf',
                            tf.reduce_mean(self.clip_range_vf_ph))

                    tf.summary.scalar('old_neglog_action_probability',
                                      tf.reduce_mean(self.old_neglog_pac_ph))
                    tf.summary.scalar('old_value_pred',
                                      tf.reduce_mean(self.old_vpred_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        tf.summary.histogram('clip_range', self.clip_range_ph)
                        tf.summary.histogram('old_neglog_action_probability',
                                             self.old_neglog_pac_ph)
                        tf.summary.histogram('old_value_pred',
                                             self.old_vpred_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                self.train_model = train_model
                self.act_model = act_model
                self.step = act_model.step
                self.proba_step = act_model.proba_step
                self.value = act_model.value
                self.initial_state = act_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)  # pylint: disable=E1101

                self.summary = tf.summary.merge_all()
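
A short NumPy sketch (illustrative only) of the clipped surrogate objective built in the "loss" scope above: the probability ratio is clipped to [1 - clip_range, 1 + clip_range] and the element-wise maximum of the two losses gives the pessimistic bound that PPO minimizes.

import numpy as np

advs = np.array([1.0, -1.0])                           # hypothetical advantages
ratio = np.exp(np.array([0.3, -0.2]))                  # exp(old_neglogpac - neglogpac)
clip_range = 0.2
pg_losses = -advs * ratio
pg_losses2 = -advs * np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range)
pg_loss = np.mean(np.maximum(pg_losses, pg_losses2))   # mirrors self.pg_loss above
print(pg_loss)
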
Example No. 11
import gym
import tensorflow as tf
import tensorflow.contrib.layers as layers

# `U` (the tf_util session helpers) and `deepq` are assumed to come from the same
# local package as the imports below.
from deepq.replay_buffer import ReplayBuffer
from deepq.utils import ObservationInput
from common.schedules import LinearSchedule


def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(num_cpu=8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
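
        # A minimal, hypothetical sketch of the training loop that typically follows this
        # setup (in the style of the baselines custom CartPole example). It assumes
        # `itertools` and `numpy as np` are imported and the standard build_train signature.
        U.initialize()
        update_target()
        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            # Act with the current exploration rate and store the transition.
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
            # Minimize the Bellman error on a sampled minibatch and periodically
            # sync the target network.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            if t % 1000 == 0:
                update_target()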
Example No. 12
def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out,
                                     num_outputs=64,
                                     activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out,
                                     num_outputs=num_actions,
                                     activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space,
                                                      name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000,
                                     initial_p=1.0,
                                     final_p=0.02)
Example No. 13
def main(args):
    from ppo1 import mlp_policy ##for policy
    from model.encoder import bi_direction_lstm
    from dm_control.suite import humanoid_CMU

    U.make_session(num_cpu=args.num_cpu).__enter__()
    set_global_seeds(args.seed)
    env = humanoid_CMU.stand()
    obs_space = env.physics.data.qpos
    ac_space = env.action_spec()
    def policy_fn(name, ob_space, ac_space, reuse=False):  # MLP policy; should this reuse the previously trained policy? No.
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            reuse=reuse, hid_size=[300, 200, 100], num_hid_layers=3)
    def encoder(name):
        return bi_direction_lstm(name=name, obs_space=obs_space, batch_size=args.lstm_batch,
                                 time_steps=args.time_steps, LSTM_size=args.LSTM_size,
                                 laten_size=args.laten_size)

    lstm_encoder = encoder("lstm_encoder")
    saver = lstm_encoder.get_trainable_variables()
    load(saver=saver, sess=tf.get_default_session(), logdir=args.encoder_load_path)  # load the pretrained encoder parameters
    # env = bench.Monitor(env, logger.get_dir() and
    #     osp.join(logger.get_dir(), "monitor.json"))
    # env.seed(args.seed)
    # gym.logger.setLevel(logging.WARN)
    # task_name = get_task_name(args)
    task_name = "Humanoid-CMU"
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    # dataset = Mujoco_Dset(expert_path=args.expert_path, ret_threshold=args.ret_thres hold, traj_limitation=args.traj_limitation)
    # ================ Sample trajectory τj from the demonstration ================
    # This plays the role of the expert dataset; only the observations are needed.
    from model.VAE import load_state_dataset
    dataset = load_state_dataset(data_dir_path=args.expert_data_dir, env=env,
                                 control_timestep=args.control_timestep)

    pretrained_weight = None
    if (args.pretrained and args.task == 'train') or args.algo == 'bc':
        # Pretrain with behavior cloning
        from gail import behavior_clone
        if args.algo == 'bc' and args.task == 'evaluate':
            behavior_clone.evaluate(env, policy_fn, args.load_model_path, stochastic_policy=args.stochastic_policy)
            sys.exit()
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
            max_iters=args.BC_max_iter, pretrained=args.pretrained,
            ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir, task_name=task_name)
        if args.algo == 'bc':
            sys.exit()

    from network.adversary import TransitionClassifier
    # discriminator
    discriminator = TransitionClassifier(env, args.adversary_hidden_size,
                                         hidden_layers=args.adversary_hidden_layers,
                                         lr_rate=args.adversary_learning_rate,
                                         entcoeff=args.adversary_entcoeff,
                                         embedding_shape=args.laten_size)  # embedding_z is not handled yet
    observations = dataset.get_next_batch(batch_size=128)[0].transpose((1, 0))  # this part is still a bit messy
    embedding_z = lstm_encoder.get_laten_vector(observations)
    if args.algo == 'trpo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gail import trpo_mpi
        if args.task == 'train':
            trpo_mpi.learn(env, policy_fn, discriminator, dataset, embedding_z=None,  # embedding_z handling is still undecided
                pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                g_step=args.g_step, d_step=args.d_step,
                timesteps_per_batch=1024,
                max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                max_timesteps=args.num_timesteps,
                entcoeff=args.policy_entcoeff, gamma=0.995, lam=0.97,
                vf_iters=5, vf_stepsize=1e-3,
                ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir,
                save_per_iter=args.save_per_iter, load_model_path=args.load_model_path,
                task_name=task_name)
        elif args.task == 'evaluate':
            trpo_mpi.evaluate(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
                number_trajs=10, stochastic_policy=args.stochastic_policy)
        else: raise NotImplementedError
    elif args.algo == 'ppo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gail import ppo_mpi
        if args.task == 'train':
            ppo_mpi.learn(env, policy_fn, discriminator, dataset,
                          # pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                          timesteps_per_batch=1024,
                          g_step=args.g_step, d_step=args.d_step,
                          # max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                          clip_param=0.2, entcoeff=args.policy_entcoeff,
                          max_timesteps=args.num_timesteps,
                          gamma=0.99, lam=0.95,
                          # vf_iters=5, vf_stepsize=1e-3,
                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                          d_stepsize=3e-4,
                          schedule='linear', ckpt_dir=args.checkpoint_dir,
                          save_per_iter=100, task=args.task,
                          sample_stochastic=args.stochastic_policy,
                          load_model_path=args.load_model_path,
                          task_name=task_name)
        elif args.task == 'evaluate':
            ppo_mpi.evaluate(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
                              number_trajs=10, stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    else: raise NotImplementedError

    env.close()
    rollouts["vpreds"] = np.array(vpreds)
    rollouts["op_vpred"] = np.array(op_vpreds)


f = open("results/MOAC/exp_5/data/rollout_data.pkl", "rb")
p = pickle.load(f)
f.close()
horizon = 150
rolloutSize = 75
modes = 3
num_options = 9
queueSize = 5000
env = gym.make('BlockSlide2D-v1')
env.seed(1)

U.make_session(num_cpu=1).__enter__()

np.random.seed(1)
tf1.set_random_seed(1)

ob_space = env.observation_space
ac_space = env.action_space

# Initialize the model
model = partialHybridModel(env, model_learning_params, svm_grid_params,
                           svm_params_interest, svm_params_guard, horizon,
                           modes, num_options, rolloutSize)
pi = policy_fn("pi", ob_space, ac_space, model,
               num_options)  # Construct network for new policy
policy_path = "results/MOAC/exp_5/model/"
Example No. 15
def learn_continuous_tasks(env,
                           q_func,
                           env_name,
                           dir_path,
                           time_stamp,
                           total_num_episodes,
                           num_actions_pad=33,
                           lr=1e-4,
                           grad_norm_clipping=10,
                           max_timesteps=int(1e8),
                           buffer_size=int(1e6),
                           train_freq=1,
                           batch_size=64,
                           print_freq=10,
                           learning_starts=1000,
                           gamma=0.99,
                           target_network_update_freq=500,
                           prioritized_replay=False,
                           prioritized_replay_alpha=0.6,
                           prioritized_replay_beta0=0.4,
                           prioritized_replay_beta_iters=None,
                           prioritized_replay_eps=int(1e8),
                           num_cpu=16,
                           epsilon_greedy=False,
                           timesteps_std=1e6,
                           initial_std=0.4,
                           final_std=0.05,
                           eval_freq=100,
                           n_eval_episodes=10,
                           eval_std=0.01,
                           log_index=0,
                           log_prefix='q',
                           loss_type="L2",
                           model_file='./',
                           callback=None):
    """Train a branching deepq model to solve continuous control tasks via discretization.
    Current assumptions in the implementation:
    - for solving continuous control domains via discretization (can be adjusted to be compatible with naturally discrete-action domains using 'env.action_space.n')
    - uniform number of sub-actions per action dimension (can be generalized to heterogeneous number of sub-actions across branches)

    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions_pad: int
        number of sub-actions per action dimension (= num of discretization grains/bars + 1)
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
        0.1 for dqn-baselines
    exploration_final_eps: float
        final value of random action probability
        0.02 for dqn-baselines
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    grad_norm_clipping: int
        set None for no clipping
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the unified TD error for updating priorities.
        Erratum: The camera-ready copy of this paper incorrectly reported 1e-8.
        The value used to produce the results is 1e8.
    num_cpu: int
        number of cpus to use for training

    dir_path: str
        path for logs and results to be stored in
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    print('Observation shape:' + str(env.observation_space.shape))

    num_action_grains = num_actions_pad - 1
    num_action_dims = env.action_space.shape[0]
    num_action_streams = num_action_dims
    num_actions = num_actions_pad * num_action_streams  # total number of network outputs for action branching with one action dimension per branch

    print('Number of actions in total:' + str(num_actions))

    act, q_val, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        num_action_streams=num_action_streams,
        batch_size=batch_size,
        optimizer_name="Adam",
        learning_rate=lr,
        grad_norm_clipping=grad_norm_clipping,
        gamma=gamma,
        double_q=True,
        scope="deepq",
        reuse=None,
        loss_type="L2")

    print('TRAIN VARS:')
    print(tf.trainable_variables())

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
        'num_action_streams': num_action_streams,
    }

    print('Create the log writer for TensorBoard visualizations.')
    log_dir = "{}/tensorboard_logs/{}".format(dir_path, env_name)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    score_placeholder = tf.placeholder(tf.float32, [],
                                       name='score_placeholder')
    tf.summary.scalar('score', score_placeholder)
    lr_constant = tf.constant(lr, name='lr_constant')
    tf.summary.scalar('learning_rate', lr_constant)

    eval_placeholder = tf.placeholder(tf.float32, [], name='eval_placeholder')
    eval_summary = tf.summary.scalar('evaluation', eval_placeholder)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    if epsilon_greedy:
        approximate_num_iters = 2e6 / 4
        exploration = PiecewiseSchedule([(0, 1.0),
                                         (approximate_num_iters / 50, 0.1),
                                         (approximate_num_iters / 5, 0.01)],
                                        outside_value=0.01)
    else:
        exploration = ConstantSchedule(value=0.0)  # greedy policy
        std_schedule = LinearSchedule(schedule_timesteps=timesteps_std,
                                      initial_p=initial_std,
                                      final_p=final_std)
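        # Worked example of the annealing above (with the default arguments):
        # initial_std=0.4, final_std=0.05, timesteps_std=1e6 gives
        # std_schedule.value(500000) = 0.4 + 0.5 * (0.05 - 0.4) = 0.225.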

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    # Initialize the parameters used for converting branching, discrete action indices to continuous actions
    low = env.action_space.low
    high = env.action_space.high
    actions_range = np.subtract(high, low)
    print('###################################')
    print(low)
    print(high)
    print('###################################')

    episode_rewards = []
    reward_sum = 0.0
    time_steps = [0]
    time_spent_exploring = [0]

    prev_time = time.time()
    n_trainings = 0

    # Open a directory for recording results
    results_dir = "{}/results/{}".format(dir_path, env_name)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    displayed_mean_reward = None
    score_timesteps = []

    game_scores = []

    def evaluate(step, episode_number):
        global max_eval_reward_mean, model_saved
        print('Evaluate...')
        eval_reward_sum = 0.0
        # Run evaluation episodes
        for eval_episode in range(n_eval_episodes):
            obs = env.reset()
            done = False
            while not done:
                # Choose action
                action_idxes = np.array(
                    act(np.array(obs)[None],
                        stochastic=False))  # deterministic
                actions_greedy = action_idxes / num_action_grains * actions_range + low

                if eval_std == 0.0:
                    action = actions_greedy
                else:
                    action = []
                    for index in range(len(actions_greedy)):
                        a_greedy = actions_greedy[index]
                        out_of_range_action = True
                        while out_of_range_action:
                            a_stoch = np.random.normal(loc=a_greedy,
                                                       scale=eval_std)
                            a_idx_stoch = np.rint(
                                (a_stoch + high[index]) /
                                actions_range[index] * num_action_grains)
                            if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                                action.append(a_stoch)
                                out_of_range_action = False

                # Step
                obs, rew, done, _ = env.step(action)

                eval_reward_sum += rew

        # Average the rewards and log
        eval_reward_mean = eval_reward_sum / n_eval_episodes
        print(eval_reward_mean, 'over', n_eval_episodes, 'episodes')
        game_scores.append(eval_reward_mean)
        score_timesteps.append(step)

        if max_eval_reward_mean is None or eval_reward_mean > max_eval_reward_mean:
            logger.log(
                "Saving model due to mean eval increase: {} -> {}".format(
                    max_eval_reward_mean, eval_reward_mean))
            U.save_state(model_file)
            model_saved = True
            max_eval_reward_mean = eval_reward_mean
            intact = ActWrapper(act, act_params)

            intact.save(model_file + "_" + str(episode_number) + "_" +
                        str(int(np.round(max_eval_reward_mean))))
            print('Act saved to ' + model_file + "_" + str(episode_number) +
                  "_" + str(int(np.round(max_eval_reward_mean))))

    with tempfile.TemporaryDirectory() as td:
        td = './logs'
        evaluate(0, 0)
        obs = env.reset()

        t = -1
        all_means = []
        q_stats = []
        current_qs = []

        training_game_scores = []
        training_timesteps = []
        while True:
            t += 1
            # Select action and update exploration probability
            action_idxes = np.array(
                act(np.array(obs)[None], update_eps=exploration.value(t)))
            qs = np.array(q_val(np.array(obs)[None],
                                stochastic=False))  # deterministic
            # Record the spread (std) of the Q-values in each action branch for logging
            current_qs.append([np.std(val) for val in qs])

            # Convert sub-action indices (discrete sub-actions) to continuous controls
            action = action_idxes / num_action_grains * actions_range + low
            if not epsilon_greedy:  # Gaussian noise
                actions_greedy = action
                action_idx_stoch = []
                action = []
                for index in range(len(actions_greedy)):
                    a_greedy = actions_greedy[index]
                    out_of_range_action = True
                    while out_of_range_action:
                        # Sample from a Gaussian with mean at the greedy action and a std following a schedule of choice
                        a_stoch = np.random.normal(loc=a_greedy,
                                                   scale=std_schedule.value(t))
                        # Convert the sampled continuous action back to a sub-action index
                        # (note: the `+ high[index]` term assumes a symmetric action range, i.e. low == -high)
                        a_idx_stoch = np.rint(
                            (a_stoch + high[index]) / actions_range[index] *
                            num_action_grains)
                        # Check if action is in range
                        if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                            action_idx_stoch.append(a_idx_stoch)
                            action.append(a_stoch)
                            out_of_range_action = False
                action_idxes = action_idx_stoch
            new_obs, rew, done, _ = env.step(np.array(action))
            # Store the transition in the replay buffer (the discrete sub-action indices are stored, not the continuous controls)
            replay_buffer.add(obs, action_idxes, rew, new_obs, float(done))
            obs = new_obs
            reward_sum += rew
            if done:
                obs = env.reset()
                time_spent_exploring[-1] = int(100 * exploration.value(t))
                time_spent_exploring.append(0)
                episode_rewards.append(reward_sum)
                training_game_scores.append(reward_sum)
                training_timesteps.append(t)
                time_steps[-1] = t
                reward_sum = 0.0
                time_steps.append(0)
                q_stats.append(np.mean(current_qs, 0))
                current_qs = []

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)
                n_trainings += 1
            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically
                update_target()
            if len(episode_rewards) == 0:
                mean_100ep_reward = 0
            elif len(episode_rewards) < 100:
                mean_100ep_reward = np.mean(episode_rewards)
            else:
                mean_100ep_reward = np.mean(episode_rewards[-100:])
            all_means.append(mean_100ep_reward)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                current_time = time.time()
                logger.record_tabular("trainings per second",
                                      n_trainings / (current_time - prev_time))
                logger.dump_tabular()
                n_trainings = 0
                prev_time = current_time
            if t > learning_starts and num_episodes > 100:
                if displayed_mean_reward is None or mean_100ep_reward > displayed_mean_reward:
                    if print_freq is not None:
                        logger.log("Mean reward increase: {} -> {}".format(
                            displayed_mean_reward, mean_100ep_reward))
                    displayed_mean_reward = mean_100ep_reward
            # Performance evaluation with a greedy policy
            if done and num_episodes % eval_freq == 0:
                evaluate(t + 1, num_episodes)
                obs = env.reset()
            # STOP training
            if num_episodes >= total_num_episodes:
                break
        pickle.dump(q_stats,
                    open(
                        str(log_index) + "q_stat_stds99_" + log_prefix +
                        ".pkl", 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)

        pickle.dump(game_scores,
                    open(
                        str(log_index) + "q_stat_scores99_" + log_prefix +
                        ".pkl", 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)

    return ActWrapper(act, act_params)
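

# The helper below is a minimal, illustrative sketch (not part of the original
# training code) of the Gaussian exploration step used in the loop above: the
# greedy continuous action is perturbed with noise whose std follows
# std_schedule, and a sample is kept only if its discretized sub-action index
# falls inside [0, num_actions_pad - 1]. The name `gaussian_explore` is
# hypothetical; numpy is assumed to be imported as `np` at the top of this
# module, as elsewhere.
def gaussian_explore(actions_greedy, std, low, high, num_actions_pad):
    """Return (noisy continuous actions, matching sub-action indices)."""
    num_action_grains = num_actions_pad - 1
    actions_range = np.subtract(high, low)
    actions, idxes = [], []
    for i, a_greedy in enumerate(actions_greedy):
        while True:
            # Sample around the greedy action; reject samples whose index
            # would fall outside the discretized range.
            a_stoch = np.random.normal(loc=a_greedy, scale=std)
            # The training loops above compute the index as
            # (a_stoch + high[i]) / range * grains, which matches this
            # (a_stoch - low[i]) form whenever the action range is symmetric
            # (low == -high).
            a_idx = np.rint((a_stoch - low[i]) / actions_range[i] * num_action_grains)
            if 0 <= a_idx < num_actions_pad:
                actions.append(a_stoch)
                idxes.append(int(a_idx))
                break
    return np.array(actions), np.array(idxes)

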
def learn_continuous_tasks(env,
                           q_func,
                           env_name,
                           time_stamp,
                           total_num_episodes,
                           num_actions_pad=33,
                           lr=1e-4,
                           grad_norm_clipping=10,
                           max_timesteps=int(1e8),
                           buffer_size=int(1e6),
                           train_freq=1,
                           batch_size=64,
                           print_freq=10,
                           learning_starts=1000,
                           gamma=0.99,
                           target_network_update_freq=500,
                           prioritized_replay_alpha=0.6,
                           prioritized_replay_beta0=0.4,
                           prioritized_replay_beta_iters=2e6,
                           prioritized_replay_eps=int(1e8),
                           num_cpu=16,
                           timesteps_std=1e6,
                           initial_std=0.4,
                           final_std=0.05,
                           eval_freq=100,
                           n_eval_episodes=10,
                           eval_std=0.01,
                           callback=None):
    """Train a branching deepq model to solve continuous control tasks via discretization.
    Current assumptions in the implementation:
    - continuous control domains are solved via discretization (the code can be adjusted for naturally discrete-action domains using 'env.action_space.n'); see the discretization sketch after this function
    - a uniform number of sub-actions per action dimension (can be generalized to a heterogeneous number of sub-actions across branches)
    - a prioritized replay buffer and Gaussian exploration noise around the greedy action are always used (no epsilon-greedy option in this variant)

    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions_pad: int
        number of sub-actions per action dimension (= number of discretization grains + 1)
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    grad_norm_clipping: int
        set None for no clipping
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the unified TD error for updating priorities.
        Erratum: the camera-ready copy of this paper incorrectly reported 1e-8.
        The value used to produce the results is 1e8.
    num_cpu: int
        number of cpus to use for training
    timesteps_std: int
        number of env steps over which the Gaussian exploration std is annealed
    initial_std: float
        initial std of the Gaussian exploration noise
    final_std: float
        final std of the Gaussian exploration noise
    eval_freq: int
        run a greedy evaluation every `eval_freq` training episodes
    n_eval_episodes: int
        number of episodes per evaluation
    eval_std: float
        std of the Gaussian noise added to actions during evaluation (0.0 for fully greedy)
    total_num_episodes: int
        training stops once this many training episodes have been completed
    env_name, time_stamp: str
        used to name the CSV and pickle result files written under results/
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    num_action_grains = num_actions_pad - 1
    num_action_dims = env.action_space.shape[0]
    num_action_streams = num_action_dims
    num_actions = num_actions_pad * num_action_streams  # total number of network outputs for action branching with one action dimension per branch

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        num_action_streams=num_action_streams,
        batch_size=batch_size,
        learning_rate=lr,
        grad_norm_clipping=grad_norm_clipping,
        gamma=gamma,
        scope="deepq",
        reuse=None)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
        'num_action_streams': num_action_streams,
    }

    # Create the replay buffer (this variant always uses prioritized replay)
    replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                            alpha=prioritized_replay_alpha)
    beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                   initial_p=prioritized_replay_beta0,
                                   final_p=1.0)

    # No epsilon-greedy exploration in this variant: actions are greedy w.r.t. the Q-network, with Gaussian noise added in the loop below
    exploration = ConstantSchedule(value=0.0)  # greedy policy
    std_schedule = LinearSchedule(schedule_timesteps=timesteps_std,
                                  initial_p=initial_std,
                                  final_p=final_std)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    # Initialize the parameters used for converting branching, discrete action indices to continuous actions
    low = env.action_space.low
    high = env.action_space.high
    actions_range = np.subtract(high, low)

    # Make sure the results/ directory used by the CSV and pickle logging
    # below exists (mirrors the directory creation in the function above)
    if not os.path.exists('results'):
        os.makedirs('results')

    episode_rewards = []
    reward_sum = 0.0
    num_episodes = 0
    time_steps = [0]
    time_spent_exploring = [0]

    prev_time = time.time()
    n_trainings = 0

    # Set up on-demand rendering of Gym environments using keyboard controls: 'r'ender or 's'top
    import termios, fcntl, sys
    fd = sys.stdin.fileno()
    oldterm = termios.tcgetattr(fd)
    newattr = termios.tcgetattr(fd)
    newattr[3] = newattr[3] & ~termios.ICANON & ~termios.ECHO
    render = False

    displayed_mean_reward = None

    def evaluate(step, episode_number):
        global max_eval_reward_mean, model_saved
        print('Evaluate...')
        eval_reward_sum = 0.0
        # Run evaluation episodes
        for eval_episode in range(n_eval_episodes):
            obs = env.reset()
            done = False
            while not done:
                # Choose action
                action_idxes = np.array(
                    act(np.array(obs)[None],
                        stochastic=False))  # deterministic
                actions_greedy = action_idxes / num_action_grains * actions_range + low

                if eval_std == 0.0:
                    action = actions_greedy
                else:
                    action = []
                    for index in range(len(actions_greedy)):
                        a_greedy = actions_greedy[index]
                        out_of_range_action = True
                        while out_of_range_action:
                            a_stoch = np.random.normal(loc=a_greedy,
                                                       scale=eval_std)
                            a_idx_stoch = np.rint(
                                (a_stoch + high[index]) /
                                actions_range[index] * num_action_grains)
                            if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                                action.append(a_stoch)
                                out_of_range_action = False

                # Step
                obs, rew, done, _ = env.step(action)
                eval_reward_sum += rew

        # Average the rewards and log
        eval_reward_mean = eval_reward_sum / n_eval_episodes
        print(eval_reward_mean, 'over', n_eval_episodes, 'episodes')

        with open("results/{}_{}_eval.csv".format(time_stamp, env_name),
                  "a") as eval_fw:
            eval_writer = csv.writer(
                eval_fw,
                delimiter="\t",
                lineterminator="\n",
            )
            eval_writer.writerow([episode_number, step, eval_reward_mean])

        if max_eval_reward_mean is None or eval_reward_mean > max_eval_reward_mean:
            logger.log(
                "Saving model due to mean eval increase: {} -> {}".format(
                    max_eval_reward_mean, eval_reward_mean))
            U.save_state(model_file)
            model_saved = True
            max_eval_reward_mean = eval_reward_mean

    with tempfile.TemporaryDirectory() as td:
        model_file = os.path.join(td, "model")

        evaluate(0, 0)
        obs = env.reset()

        with open("results/{}_{}.csv".format(time_stamp, env_name), "w") as fw:
            writer = csv.writer(
                fw,
                delimiter="\t",
                lineterminator="\n",
            )

            t = -1
            while True:
                t += 1

                # Select action and update exploration probability
                action_idxes = np.array(
                    act(np.array(obs)[None], update_eps=exploration.value(t)))

                # Convert sub-action indices (discrete sub-actions) to continuous controls
                action = action_idxes / num_action_grains * actions_range + low

                # epsilon_greedy = False: use Gaussian noise
                actions_greedy = action
                action_idx_stoch = []
                action = []
                for index in range(len(actions_greedy)):
                    a_greedy = actions_greedy[index]
                    out_of_range_action = True
                    while out_of_range_action:
                        # Sample from a Gaussian with mean at the greedy action and a std following a schedule of choice
                        a_stoch = np.random.normal(loc=a_greedy,
                                                   scale=std_schedule.value(t))

                        # Convert the sampled continuous action back to a sub-action index
                        # (note: the `+ high[index]` term assumes a symmetric action range, i.e. low == -high)
                        a_idx_stoch = np.rint(
                            (a_stoch + high[index]) / actions_range[index] *
                            num_action_grains)

                        # Check if action is in range
                        if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                            action_idx_stoch.append(a_idx_stoch)
                            action.append(a_stoch)
                            out_of_range_action = False

                action_idxes = action_idx_stoch

                new_obs, rew, done, _ = env.step(action)

                # On-demand rendering
                if (t + 1) % 100 == 0:
                    # TODO: find a cleaner way to toggle terminal modes for keyboard input
                    termios.tcsetattr(fd, termios.TCSANOW, newattr)
                    oldflags = fcntl.fcntl(fd, fcntl.F_GETFL)
                    fcntl.fcntl(fd, fcntl.F_SETFL, oldflags | os.O_NONBLOCK)
                    try:
                        try:
                            c = sys.stdin.read(1)
                            if c == 'r':
                                print()
                                print('Rendering begins...')
                                render = True
                            elif c == 's':
                                print()
                                print('Stop rendering!')
                                render = False
                                env.render(close=True)
                        except IOError:
                            pass
                    finally:
                        termios.tcsetattr(fd, termios.TCSAFLUSH, oldterm)
                        fcntl.fcntl(fd, fcntl.F_SETFL, oldflags)

                # Visualize Gym environment on render
                if render:
                    env.render()

                # Store transition in the replay buffer
                replay_buffer.add(obs, action_idxes, rew, new_obs, float(done))
                obs = new_obs

                reward_sum += rew
                if done:
                    obs = env.reset()
                    time_spent_exploring[-1] = int(100 * exploration.value(t))
                    time_spent_exploring.append(0)
                    episode_rewards.append(reward_sum)
                    time_steps[-1] = t
                    reward_sum = 0.0
                    time_steps.append(0)
                    # Frequently log to file
                    writer.writerow(
                        [len(episode_rewards), t, episode_rewards[-1]])

                if t > learning_starts and t % train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer
                    # Sample a prioritized batch together with importance-sampling weights
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience

                    td_errors = train(obses_t, actions, rewards, obses_tp1,
                                      dones, weights)

                    # Update the priorities of the sampled transitions with the new TD errors
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

                    n_trainings += 1

                if t > learning_starts and t % target_network_update_freq == 0:
                    # Update target network periodically
                    update_target()

                if len(episode_rewards) == 0:
                    mean_100ep_reward = 0
                elif len(episode_rewards) < 100:
                    mean_100ep_reward = np.mean(episode_rewards)
                else:
                    mean_100ep_reward = np.mean(episode_rewards[-100:])

                num_episodes = len(episode_rewards)
                if done and print_freq is not None and len(
                        episode_rewards) % print_freq == 0:
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * exploration.value(t)))
                    current_time = time.time()
                    logger.record_tabular(
                        "trainings per second",
                        n_trainings / (current_time - prev_time))
                    logger.dump_tabular()
                    n_trainings = 0
                    prev_time = current_time

                if t > learning_starts and num_episodes > 100:
                    if displayed_mean_reward is None or mean_100ep_reward > displayed_mean_reward:
                        if print_freq is not None:
                            logger.log("Mean reward increase: {} -> {}".format(
                                displayed_mean_reward, mean_100ep_reward))
                        displayed_mean_reward = mean_100ep_reward

                # Performance evaluation with a greedy policy
                if done and num_episodes % eval_freq == 0:
                    evaluate(t + 1, num_episodes)
                    obs = env.reset()

                # STOP training
                if num_episodes >= total_num_episodes:
                    break

            if model_saved:
                logger.log("Restore model with mean eval: {}".format(
                    max_eval_reward_mean))
                U.load_state(model_file)

    data_to_log = {
        'time_steps': time_steps,
        'episode_rewards': episode_rewards,
        'time_spent_exploring': time_spent_exploring
    }

    # Write to file the episodic rewards, number of steps, and the time spent exploring
    with open("results/{}_{}.txt".format(time_stamp, env_name), 'wb') as fp:
        pickle.dump(data_to_log, fp)

    return ActWrapper(act, act_params)
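

# Minimal sketch (illustrative, not part of the original code) of the uniform
# discretization described in the docstring of learn_continuous_tasks: each
# action dimension is split into num_actions_pad evenly spaced sub-actions
# over [low, high], so sub-action index k maps to
# low + k / (num_actions_pad - 1) * (high - low). The helper names are
# hypothetical; numpy is assumed to be imported as `np` as elsewhere in this
# module.
def discrete_to_continuous(action_idxes, low, high, num_actions_pad):
    """Map per-dimension sub-action indices to continuous controls."""
    num_action_grains = num_actions_pad - 1
    actions_range = np.subtract(high, low)
    return np.asarray(action_idxes) / num_action_grains * actions_range + low


def continuous_to_discrete(actions, low, high, num_actions_pad):
    """Map continuous controls to the nearest sub-action indices (clipped)."""
    num_action_grains = num_actions_pad - 1
    actions_range = np.subtract(high, low)
    idxes = np.rint((np.asarray(actions) - low) / actions_range * num_action_grains)
    return np.clip(idxes, 0, num_action_grains).astype(int)


# Example with illustrative values: for low = [-1., -1.], high = [1., 1.] and
# num_actions_pad = 33, index 16 in each dimension maps to 0.0, and
# continuous_to_discrete recovers [16, 16].
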