def test_step(self):
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
    indexed_tf_env = IndexedTFEnv(tf_env, 5)

    # take first action
    a1 = PolicyStep(action=tf.convert_to_tensor([1]), state=(), info=())
    time_step_0 = indexed_tf_env.step(a1)
    self.assertEqual(time_step_0["env_id"], 5)
    self.assertEqual(time_step_0["ts_id"], 0)
    self.assertEqual(time_step_0["reward"], 0)
    self.assertEqual(time_step_0["step_type"], 0)
    self.assertEqual(time_step_0["discount"], 1.0)
    self.assertIn("ob_0", time_step_0)
    self.assertIn("ob_1", time_step_0)
    self.assertIn("ob_2", time_step_0)
    self.assertIn("ob_3", time_step_0)

    # take second action
    a2 = PolicyStep(action=tf.convert_to_tensor([0]), state=(), info=())
    time_step_1 = indexed_tf_env.step(a2)
    self.assertEqual(time_step_1["env_id"], 5)
    self.assertEqual(time_step_1["ts_id"], 1)
    self.assertEqual(time_step_1["reward"], 1)
    self.assertEqual(time_step_1["step_type"], 1)
    self.assertEqual(time_step_1["discount"], 1.0)
    self.assertIn("ob_0", time_step_1)
    self.assertIn("ob_1", time_step_1)
    self.assertIn("ob_2", time_step_1)
    self.assertIn("ob_3", time_step_1)
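# A minimal sketch, inferred from the assertions in test_step above, of what
# the IndexedTFEnv wrapper appears to do: tag each time step with the env id
# it was constructed with and a per-env step counter, and flatten the
# observation vector into ob_0..ob_N columns. The class name and internals
# below are assumptions for illustration, not the actual implementation.
import numpy as np


class IndexedTFEnvSketch:
    def __init__(self, tf_env, env_id):
        self._tf_env = tf_env
        self._env_id = env_id
        self._ts_id = 0  # increments once per step taken on this env

    def step(self, policy_step):
        ts = self._tf_env.step(policy_step.action)
        row = {
            "env_id": self._env_id,
            "ts_id": self._ts_id,
            "step_type": int(np.asarray(ts.step_type).item()),
            "reward": float(np.asarray(ts.reward).item()),
            "discount": float(np.asarray(ts.discount).item()),
        }
        # one flat column per observation dimension (4 for CartPole-v0)
        for i, ob in enumerate(np.asarray(ts.observation).flatten()):
            row["ob_%d" % i] = float(ob)
        self._ts_id += 1
        return row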
def make_batch_tfenv(make_env, config, start_dt, training_interval,
                     spark_session):
    """
    This returns a TFEnvRLApplication, which keeps 'config.env.num_envs' envs
    running in parallel.

    :param make_env: A function that returns an Environment
    :param config: An ApplicationConfig
    :param start_dt: A datetime used to simulate the first action
    :param training_interval: A timedelta indicating the lag between when an
        observation is generated and when it can be used for training. This
        simulates real-world environments where there is a delay between data
        collection and Agent updates.
    :param spark_session: A Spark session
    :return: A TFEnvRLApplication wrapping the indexed environments
    """
    envs = [IndexedTFEnv(make_env(), i) for i in range(config.env.num_envs)]

    # setup app
    training_config = {
        "num_iterations": config.training.num_iterations,
        "agent_discount": config.trajectory.agent_discount,
        "mini_batch_size": config.training.batch_size,
        "eps_start": config.policy.eps_start,
        "eps_final": config.policy.eps_final,
        "eps_steps": config.policy.eps_steps,
        "initial_collect_steps": config.policy.initial_collect_steps,
        "log_interval": config.project.log_interval
    }
    return TFEnvRLApplication(envs, spark_session, training_config,
                              config.env.num_steps_per_run, start_dt,
                              training_interval)
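# A hedged usage sketch for make_batch_tfenv. The nested config shape below is
# reverse-engineered from the attribute accesses above (config.env.*,
# config.training.*, config.trajectory.*, config.policy.*, config.project.*);
# the real ApplicationConfig may differ, and the SparkSession is assumed to be
# created by the caller.
from datetime import datetime, timedelta
from types import SimpleNamespace


def example_make_batch_tfenv(spark_session):
    config = SimpleNamespace(
        env=SimpleNamespace(num_envs=4, num_steps_per_run=50),
        training=SimpleNamespace(num_iterations=1000, batch_size=64),
        trajectory=SimpleNamespace(agent_discount=0.99),
        policy=SimpleNamespace(eps_start=1.0,
                               eps_final=0.1,
                               eps_steps=1000,
                               initial_collect_steps=500),
        project=SimpleNamespace(log_interval=100))
    make_env = lambda: tf_py_environment.TFPyEnvironment(
        suite_gym.load('CartPole-v0'))
    return make_batch_tfenv(make_env, config,
                            datetime(year=2019, month=8, day=7, hour=10),
                            timedelta(days=1), spark_session)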
def test_init_application(self, mock_dm):
    # init an RL env application
    envs = []
    for i in range(2):
        envs.append(
            IndexedTFEnv(
                tf_py_environment.TFPyEnvironment(
                    suite_gym.load('CartPole-v0')), i))
    training_config = {
        "fc_layer_params": (100, ),
        "learning_rate": 0.001,
        "agent_discount": 0.99,
        "mini_batch_size": 64,
        "num_training_iterations": 10000,
        "epsilon_greediness": 0.1,
        "gradient_clipping": 1.0
    }
    steps_num_per_run = 3
    app = TFEnvRLApplication(envs, training_config, steps_num_per_run,
                             datetime.now(), 2)
    self.assertListEqual(app.obs_cols, ['ob_0', 'ob_1', 'ob_2', 'ob_3'])
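# A small sketch of where obs_cols presumably comes from: one column name per
# dimension of the environment's observation. CartPole-v0 observations are
# 4-dimensional (cart position, cart velocity, pole angle, pole angular
# velocity), hence ob_0..ob_3. The derivation below is an assumption, not the
# verified TFEnvRLApplication internals.
obs_dim = 4
obs_cols = ["ob_%d" % i for i in range(obs_dim)]
assert obs_cols == ['ob_0', 'ob_1', 'ob_2', 'ob_3']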
def __init__(
        self,
        # Params for the Q network
        fc_layer_params=(100, ),
        # Params for training
        learning_rate=0.001,
        agent_discount=0.99,
        mini_batch_size=128,
        num_iterations=5000,
        gradient_clipping=None,
        trajectory_training_window=100,
        log_interval=200,
        # Params for simulated environments
        envs_num=10,
        runs_num=10,
        steps_num_per_run=100,
        # Params for evaluation
        eval_interval=1,
        num_eval_episodes=100,
        # Params for data collection
        eps_start=1.0,
        eps_final=0.1,
        eps_steps=10000,
        initial_collect_steps=3000,
        tb_path=None):
    # store configs used during the training run
    self._runs_num = runs_num
    self._eval_interval = eval_interval
    self._num_eval_episodes = num_eval_episodes

    # setup rl_app
    envs = [IndexedTFEnv(self._make_env(), i) for i in range(envs_num)]
    application_name = "CartPole-example"
    version = str(time.time())

    # setup training engine
    training_interval = timedelta(days=1)
    start_dt = datetime(year=2019, month=8, day=7, hour=10)
    self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                            trajectory_training_window,
                                            application_name, version)
    if tb_path:
        self._engine_config.tensorboard_path = tb_path

    # setup app
    training_config = {
        "num_iterations": num_iterations,
        "agent_discount": agent_discount,
        "mini_batch_size": mini_batch_size,
        "eps_start": eps_start,
        "eps_final": eps_final,
        "eps_steps": eps_steps,
        "initial_collect_steps": initial_collect_steps,
        "log_interval": log_interval
    }
    self._rl_app = TFEnvRLApplication(envs, training_config,
                                      steps_num_per_run, start_dt,
                                      training_interval)

    @staticmethod
    def init_agent():
        """A DQN agent is set by default in the application."""
        # get the global step
        global_step = tf.compat.v1.train.get_or_create_global_step()
        # TODO: update this to get the optimizer from tensorflow 2.0 if possible
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)
        q_net = q_network.QNetwork(self._rl_app.observation_spec,
                                   self._rl_app.action_spec,
                                   fc_layer_params=fc_layer_params)
        time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
        tf_agent = dqn_agent.DqnAgent(
            time_step_spec,
            self._rl_app.action_spec,
            q_network=q_net,
            optimizer=optimizer,
            epsilon_greedy=eps_final,
            gradient_clipping=gradient_clipping,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=global_step,
            debug_summaries=True,
            summarize_grads_and_vars=True)
        tf_agent.initialize()
        logger.info("tf_agent initialization is complete")
        # Optimize by wrapping some of the code in a graph using TF function.
        tf_agent.train = common.function(tf_agent.train)
        return tf_agent

    # staticmethod.__get__ unwraps init_agent to the plain closure before
    # attaching it to the application as its agent factory
    self._rl_app.init_agent = init_agent.__get__(object)
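# A hedged sketch of the schedule the eps_start/eps_final/eps_steps knobs above
# suggest: epsilon annealed linearly from eps_start down to eps_final over
# eps_steps collect steps, then held constant. The helper name and the linear
# shape are assumptions about the application's behavior, shown here only to
# make the three knobs concrete.
def epsilon_at(step, eps_start=1.0, eps_final=0.1, eps_steps=10000):
    frac = min(float(step) / eps_steps, 1.0)
    return eps_start + frac * (eps_final - eps_start)


assert abs(epsilon_at(0) - 1.0) < 1e-9       # fully random at the start
assert abs(epsilon_at(5000) - 0.55) < 1e-9   # halfway through the anneal
assert abs(epsilon_at(20000) - 0.1) < 1e-9   # held at eps_final afterwards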
def __init__(
        self,
        # Params for training
        learning_rate=0.001,
        discount=0.99,
        mini_batch_size=64,
        num_iterations=2000,
        gradient_clipping=None,
        trajectory_training_window=1,
        log_interval=100,
        # Params for simulated environments
        envs_num=60,
        runs_num=10,
        steps_num_per_run=50,
        # Params for evaluation
        eval_interval=1,
        num_eval_episodes=100,
        # Params for data collection
        eps_start=1.0,
        eps_final=0.1,
        eps_steps=1000,
        initial_collect_steps=1000,
        tb_path=None):
    # store configs used during the training run
    self._runs_num = runs_num
    self._eval_interval = eval_interval
    self._num_eval_episodes = num_eval_episodes

    # setup rl_app
    envs = [IndexedTFEnv(self._make_env(), i) for i in range(envs_num)]
    application_name = "MountainCar-example"
    version = str(time.time())

    # setup training engine
    training_interval = timedelta(days=1)
    start_dt = datetime(year=2019, month=8, day=7, hour=10)
    self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                            trajectory_training_window,
                                            application_name, version)
    if tb_path:
        self._engine_config.tensorboard_path = tb_path

    # setup app
    training_config = {
        "num_iterations": num_iterations,
        "mini_batch_size": mini_batch_size,
        "eps_start": eps_start,
        "eps_final": eps_final,
        "eps_steps": eps_steps,
        "initial_collect_steps": initial_collect_steps,
        "log_interval": log_interval
    }
    self._rl_app = TFEnvRLApplication(envs, training_config,
                                      steps_num_per_run, start_dt,
                                      training_interval)

    @staticmethod
    def init_agent():
        """A DDPG agent is set by default in the application."""
        # get the global step
        global_step = tf.compat.v1.train.get_or_create_global_step()
        time_step_spec = time_step.time_step_spec(
            self._rl_app.observation_spec)
        actor_net = actor_network.ActorNetwork(
            self._rl_app.observation_spec,
            self._rl_app.action_spec,
            fc_layer_params=(400, 300))
        critic_net = critic_network.CriticNetwork(
            (time_step_spec.observation, self._rl_app.action_spec),
            observation_fc_layer_params=(400, ),
            action_fc_layer_params=None,
            joint_fc_layer_params=(300, ))
        # TODO: update this to get the optimizer from tensorflow 2.0 if possible
        tf_agent = ddpg_agent.DdpgAgent(
            time_step_spec,
            self._rl_app.action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=1e-4),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=1e-3),
            ou_stddev=0.2,
            ou_damping=0.15,
            target_update_tau=0.05,
            target_update_period=5,
            dqda_clipping=None,
            td_errors_loss_fn=tf.compat.v1.losses.huber_loss,
            gamma=discount,
            reward_scale_factor=1.0,
            gradient_clipping=gradient_clipping,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            train_step_counter=global_step)
        tf_agent.initialize()
        logger.info("tf_agent initialization is complete")
        # Optimize by wrapping some of the code in a graph using TF function.
        tf_agent.train = common.function(tf_agent.train)
        return tf_agent

    # staticmethod.__get__ unwraps init_agent to the plain closure before
    # attaching it to the application as its agent factory
    self._rl_app.init_agent = init_agent.__get__(object)
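# A hedged illustration of the exploration noise that the ou_stddev/ou_damping
# arguments above configure: DDPG adds Ornstein-Uhlenbeck noise to the actor's
# actions, giving temporally correlated (rather than i.i.d.) exploration. The
# recurrence below is one common discrete form; the helper is illustrative and
# not part of the application.
import numpy as np


def ou_step(x, damping=0.15, stddev=0.2, rng=np.random.default_rng()):
    # decay the previous noise toward zero, then add a Gaussian kick
    return (1.0 - damping) * x + rng.normal(scale=stddev, size=np.shape(x))


noise = np.zeros(1)
for _ in range(5):
    noise = ou_step(noise)  # successive samples are correlated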
def __init__(
        self,
        # Params for the Q network
        fc_layer_params=(100, ),
        # Params for training
        learning_rate=0.01,
        agent_discount=0.99,
        mini_batch_size=1,
        num_iterations=10000,
        gradient_clipping=None,
        trajectory_training_window=100,
        log_interval=200,
        # Params for simulated environments
        envs_num=10,
        runs_num=10,
        steps_num_per_run=1000,
        # Params for evaluation
        eval_interval=1,
        num_eval_episodes=100,
        # Params for data collection
        eps_start=1.0,
        eps_final=0.1,
        eps_steps=10000,
        initial_collect_steps=3000,
        tb_path=None):
    # store configs used during the training run
    self._runs_num = runs_num
    self._eval_interval = eval_interval
    self._num_eval_episodes = num_eval_episodes

    # setup rl_app
    envs = [IndexedTFEnv(self._make_env(), i) for i in range(envs_num)]
    application_name = "CartPole-example"
    version = str(time.time())

    # setup training engine
    training_interval = timedelta(days=1)
    start_dt = datetime(year=2019, month=8, day=7, hour=10)
    self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                            trajectory_training_window,
                                            application_name, version)
    if tb_path:
        self._engine_config.tensorboard_path = tb_path

    # setup app
    training_config = {
        "n_step": 1,
        "num_iterations": num_iterations,
        # "agent_discount": agent_discount,
        "mini_batch_size": mini_batch_size,
        # "eps_start": eps_start,
        # "eps_final": eps_final,
        # "eps_steps": eps_steps,
        "initial_collect_steps": initial_collect_steps,
        "log_interval": log_interval
    }
    self._rl_app = TFEnvRLApplication(envs, training_config,
                                      steps_num_per_run, start_dt,
                                      training_interval)

    @staticmethod
    def init_agent():
        """A SignAgent is set by default in the application."""
        time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
        agent = SignAgent(self._rl_app.observation_spec,
                          self._rl_app.action_spec, time_step_spec)
        agent.initialize()
        logger.info("agent initialization is complete")
        # Optimize by wrapping some of the code in a graph using TF function.
        agent.train = common.function(agent.train)
        return agent

    # staticmethod.__get__ unwraps init_agent to the plain closure before
    # attaching it to the application as its agent factory
    self._rl_app.init_agent = init_agent.__get__(object)
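# A minimal demo of the init_agent.__get__(object) idiom used by all three
# constructors above: staticmethod is a descriptor, and calling __get__ on a
# staticmethod object returns the plain wrapped function. The assignment
# therefore attaches an ordinary zero-argument callable (a closure over
# __init__'s locals) to the rl_app.
def demo_staticmethod_unwrap():
    sm = staticmethod(lambda: "built")
    fn = sm.__get__(object)  # unwraps to the underlying function
    assert fn() == "built"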