Example #1
def make_batch_tfenv(make_env, config, start_dt, training_interval, spark_session):
    """
    This returns a TFEnvRLApplication, which keeps 'config.env.num_envs' envs running in parallel.

    :param make_env: A function that returns an Environment
    :param config: An ApplicationConfig
    :param start_dt: A datetime being used to simulate the first action
    :param training_interval: A datetime indicating the lag between when an observation is generated and when it can
                           be used for training. This simulates real world environments where there's a delay between
                           data collection and Agent updates.
    :param spark_session: A Spark session
    :return:
    """

    envs = [IndexedTFEnv(make_env(), i) for i in range(0, config.env.num_envs)]

    # setup app
    training_config = {
        "num_iterations": config.training.num_iterations,
        "agent_discount": config.trajectory.agent_discount,
        "mini_batch_size": config.training.batch_size,
        "eps_start": config.policy.eps_start,
        "eps_final": config.policy.eps_final,
        "eps_steps": config.policy.eps_steps,
        "initial_collect_steps": config.policy.initial_collect_steps,
        "log_interval": config.project.log_interval
    }
    return TFEnvRLApplication(envs, spark_session, training_config, config.env.num_steps_per_run, start_dt, training_interval)
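A minimal call site might look like the sketch below; the `config` object, the Spark session, and the lambda passed as `make_env` are assumptions for illustration and must match whatever ApplicationConfig your project defines.

# Hypothetical usage of make_batch_tfenv; `config` and `spark` are assumed to exist,
# and tf_py_environment / suite_gym are assumed to be imported from tf_agents.
app = make_batch_tfenv(
    make_env=lambda: tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0')),
    config=config,
    start_dt=datetime(year=2019, month=8, day=7, hour=10),
    training_interval=timedelta(days=1),
    spark_session=spark,
)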
Example #2
    def test_init_application(self, mock_dm):
        # init a rl env application
        envs = []
        for i in range(2):
            envs.append(
                IndexedTFEnv(
                    tf_py_environment.TFPyEnvironment(
                        suite_gym.load('CartPole-v0')), i))

        training_config = {
            "fc_layer_params": (100, ),
            "learning_rate": 0.001,
            "agent_discount": 0.99,
            "mini_batch_size": 64,
            "num_training_iterations": 10000,
            "epsilon_greediness": 0.1,
            "gradient_clipping": 1.0
        }

        steps_num_per_run = 3

        app = TFEnvRLApplication(envs, training_config, steps_num_per_run,
                                 datetime.now(), 2)
        self.assertListEqual(app.obs_cols, ['ob_0', 'ob_1', 'ob_2', 'ob_3'])
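The expected column names follow from CartPole-v0's 4-dimensional observation (cart position, cart velocity, pole angle, pole angular velocity): the application appears to emit one ob_<i> column per observation dimension.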
Example #3
    def __init__(
            self,
            # Params Q network
            fc_layer_params=(100, ),
            # Params for training
            learning_rate=0.001,
            agent_discount=0.99,
            mini_batch_size=128,
            num_iterations=5000,
            gradient_clipping=None,
            trajectory_training_window=100,
            log_interval=200,
            # Param for simulated environments
            envs_num=10,
            runs_num=10,
            steps_num_per_run=100,
            # Params for evaluation
            eval_interval=1,
            num_eval_episodes=100,
            # Params for data collection
            eps_start=1.0,
            eps_final=0.1,
            eps_steps=10000,
            initial_collect_steps=3000,
            tb_path=None):

        # store configs used during the training run
        self._runs_num = runs_num
        self._eval_interval = eval_interval
        self._num_eval_episodes = num_eval_episodes

        # setup rl_app
        envs = [IndexedTFEnv(self._make_env(), i) for i in range(0, envs_num)]
        application_name = "CartPole-example"
        version = "%s" % str(time.time())

        # setup training engine
        training_interval = timedelta(days=1)
        start_dt = datetime(year=2019, month=8, day=7, hour=10)
        self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                                trajectory_training_window,
                                                application_name, version)
        if tb_path:
            self._engine_config.tensorboard_path = tb_path
        # setup app
        training_config = {
            "num_iterations": num_iterations,
            "agent_discount": agent_discount,
            "mini_batch_size": mini_batch_size,
            "eps_start": eps_start,
            "eps_final": eps_final,
            "eps_steps": eps_steps,
            "initial_collect_steps": initial_collect_steps,
            "log_interval": log_interval
        }
        self._rl_app = TFEnvRLApplication(envs, training_config,
                                          steps_num_per_run, start_dt,
                                          training_interval)

        @staticmethod
        def init_agent():
            """ a DQN agent is set by default in the application"""
            # get the global step
            global_step = tf.compat.v1.train.get_or_create_global_step()

            # TODO: update this to get the optimizer from tensorflow 2.0 if possible
            optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=learning_rate)

            q_net = q_network.QNetwork(self._rl_app.observation_spec,
                                       self._rl_app.action_spec,
                                       fc_layer_params=fc_layer_params)
            time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
            tf_agent = dqn_agent.DqnAgent(
                time_step_spec,
                self._rl_app.action_spec,
                q_network=q_net,
                optimizer=optimizer,
                epsilon_greedy=eps_final,
                gradient_clipping=gradient_clipping,
                td_errors_loss_fn=common.element_wise_squared_loss,
                train_step_counter=global_step,
                debug_summaries=True,
                summarize_grads_and_vars=True)
            tf_agent.initialize()
            logger.info("tf_agent initialization is complete")

            # Optimize by wrapping some of the code in a graph using TF function.
            tf_agent.train = common.function(tf_agent.train)

            return tf_agent

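        # __get__ unwraps the staticmethod descriptor, so the application stores init_agent as a plain callable.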
        self._rl_app.init_agent = init_agent.__get__(object)
Example #4
class ExampleCartPole(object):
    def __init__(
            self,
            # Params Q network
            fc_layer_params=(100, ),
            # Params for training
            learning_rate=0.001,
            agent_discount=0.99,
            mini_batch_size=128,
            num_iterations=5000,
            gradient_clipping=None,
            trajectory_training_window=100,
            log_interval=200,
            # Param for simulated environments
            envs_num=10,
            runs_num=10,
            steps_num_per_run=100,
            # Params for evaluation
            eval_interval=1,
            num_eval_episodes=100,
            # Params for data collection
            eps_start=1.0,
            eps_final=0.1,
            eps_steps=10000,
            initial_collect_steps=3000,
            tb_path=None):

        # store configs used during the training run
        self._runs_num = runs_num
        self._eval_interval = eval_interval
        self._num_eval_episodes = num_eval_episodes

        # setup rl_app
        envs = [IndexedTFEnv(self._make_env(), i) for i in range(0, envs_num)]
        application_name = "CartPole-example"
        version = "%s" % str(time.time())

        # setup training engine
        training_interval = timedelta(days=1)
        start_dt = datetime(year=2019, month=8, day=7, hour=10)
        self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                                trajectory_training_window,
                                                application_name, version)
        if tb_path:
            self._engine_config.tensorboard_path = tb_path
        # setup app
        training_config = {
            "num_iterations": num_iterations,
            "agent_discount": agent_discount,
            "mini_batch_size": mini_batch_size,
            "eps_start": eps_start,
            "eps_final": eps_final,
            "eps_steps": eps_steps,
            "initial_collect_steps": initial_collect_steps,
            "log_interval": log_interval
        }
        self._rl_app = TFEnvRLApplication(envs, training_config,
                                          steps_num_per_run, start_dt,
                                          training_interval)

        @staticmethod
        def init_agent():
            """ a DQN agent is set by default in the application"""
            # get the global step
            global_step = tf.compat.v1.train.get_or_create_global_step()

            # TODO: update this to get the optimizer from tensorflow 2.0 if possible
            optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=learning_rate)

            q_net = q_network.QNetwork(self._rl_app.observation_spec,
                                       self._rl_app.action_spec,
                                       fc_layer_params=fc_layer_params)
            time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
            tf_agent = dqn_agent.DqnAgent(
                time_step_spec,
                self._rl_app.action_spec,
                q_network=q_net,
                optimizer=optimizer,
                epsilon_greedy=eps_final,
                gradient_clipping=gradient_clipping,
                td_errors_loss_fn=common.element_wise_squared_loss,
                train_step_counter=global_step,
                debug_summaries=True,
                summarize_grads_and_vars=True)
            tf_agent.initialize()
            logger.info("tf_agent initialization is complete")

            # Optimize by wrapping some of the code in a graph using TF function.
            tf_agent.train = common.function(tf_agent.train)

            return tf_agent

        self._rl_app.init_agent = init_agent.__get__(object)

    def run(self):
        engine = BaseEngine(self._rl_app, self._engine_config)
        self._rl_app.set_dm(engine._dm)

        engine.init(force_run=True)

        logger.info("Training started")
        eval_avg_rwd = []
        for run_id in range(1, self._runs_num):
            engine.train(run_id)

            if run_id % self._eval_interval == 0:
                avg_rwd = self._evaluate_agent(engine._dm, run_id,
                                               self._num_eval_episodes)
                eval_avg_rwd.append(avg_rwd)

        logger.info("Training is done")
        logger.info("Eval result: %s" % str(eval_avg_rwd))
        return eval_avg_rwd

    def _evaluate_agent(self, dm, run_id, num_eval_episodes):
        rl_agent = dm.get(DATANAME.MODEL, run_id)

        trained_policy = rl_agent.policy

        eval_env = ExampleCartPole._make_env()

        average_reward = ExampleCartPole._compute_avg_return(
            eval_env, trained_policy, num_eval_episodes)
        logger.info("step = {}: eval average reward = {}".format(
            run_id, average_reward))

        tb_writer = start_tensorboard_writer(
            self._engine_config.tensorboard_path,
            int(run_id / self._eval_interval))
        tf.compat.v2.summary.scalar(name="eval_avg_rwd", data=average_reward)
        close_tensorboard_writer(tb_writer)

        return average_reward

    @staticmethod
    def _make_env():
        # function to create a tf environment
        return tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))

    @staticmethod
    def _compute_avg_return(environment, policy, num_episodes=100):
        total_return = 0.0
        for _ in range(num_episodes):

            time_step = environment.reset()
            episode_return = 0.0

            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = environment.step(action_step.action)
                episode_return += time_step.reward
            total_return += episode_return

        avg_return = total_return / num_episodes
        return avg_return.numpy()[0]
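For context, running this example end to end might look like the sketch below; the argument values are illustrative, not prescribed by the source.

# Hypothetical usage: train for a handful of runs and inspect the
# per-run evaluation rewards. Argument values are only illustrative.
example = ExampleCartPole(envs_num=4, runs_num=5, num_iterations=1000, tb_path="/tmp/cartpole_tb")
rewards = example.run()
print(rewards)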
Example #5
    def __init__(
            self,
            # Params for training
            learning_rate=0.001,
            discount=0.99,
            mini_batch_size=64,
            num_iterations=2000,
            gradient_clipping=None,
            trajectory_training_window=1,
            log_interval=100,
            # Param for simulated environments
            envs_num=60,
            runs_num=10,
            steps_num_per_run=50,
            # Params for evaluation
            eval_interval=1,
            num_eval_episodes=100,
            # Params for data collection
            eps_start=1.0,
            eps_final=0.1,
            eps_steps=1000,
            initial_collect_steps=1000,
            tb_path=None):

        # store configs used during the training run
        self._runs_num = runs_num
        self._eval_interval = eval_interval
        self._num_eval_episodes = num_eval_episodes

        # setup rl_app
        envs = [IndexedTFEnv(self._make_env(), i) for i in range(0, envs_num)]
        application_name = "MountainCar-example"
        version = "%s" % str(time.time())

        # setup training engine
        training_interval = timedelta(days=1)
        start_dt = datetime(year=2019, month=8, day=7, hour=10)
        self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                                trajectory_training_window,
                                                application_name, version)
        if tb_path:
            self._engine_config.tensorboard_path = tb_path
        # setup app
        training_config = {
            "num_iterations": num_iterations,
            "mini_batch_size": mini_batch_size,
            "eps_start": eps_start,
            "eps_final": eps_final,
            "eps_steps": eps_steps,
            "initial_collect_steps": initial_collect_steps,
            "log_interval": log_interval
        }
        self._rl_app = TFEnvRLApplication(envs, training_config,
                                          steps_num_per_run, start_dt,
                                          training_interval)

        @staticmethod
        def init_agent():
            """ a DDPG agent is set by default in the application"""
            # get the global step
            global_step = tf.compat.v1.train.get_or_create_global_step()

            # TODO: update this to get the optimizer from tensorflow 2.0 if possible
            optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=learning_rate)
            time_step_spec = time_step.time_step_spec(
                self._rl_app.observation_spec)
            actor_net = actor_network.ActorNetwork(
                self._rl_app.observation_spec,
                self._rl_app.action_spec,
                fc_layer_params=(400, 300))
            value_net = critic_network.CriticNetwork(
                (time_step_spec.observation, self._rl_app.action_spec),
                observation_fc_layer_params=(400, ),
                action_fc_layer_params=None,
                joint_fc_layer_params=(300, ))
            tf_agent = ddpg_agent.DdpgAgent(
                time_step_spec,
                self._rl_app.action_spec,
                actor_network=actor_net,
                critic_network=value_net,
                actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=1e-4),
                critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=1e-3),
                ou_stddev=0.2,
                ou_damping=0.15,
                target_update_tau=0.05,
                target_update_period=5,
                dqda_clipping=None,
                td_errors_loss_fn=tf.compat.v1.losses.huber_loss,
                gamma=discount,
                reward_scale_factor=1.0,
                gradient_clipping=gradient_clipping,
                debug_summaries=True,
                summarize_grads_and_vars=True,
                train_step_counter=global_step)
            tf_agent.initialize()
            logger.info("tf_agent initialization is complete")

            # Optimize by wrapping some of the code in a graph using TF function.
            tf_agent.train = common.function(tf_agent.train)

            return tf_agent

        self._rl_app.init_agent = init_agent.__get__(object)
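The snippet above does not include `_make_env`, but since DdpgAgent requires a continuous action space, a plausible helper (an assumption, mirroring the CartPole `_make_env` in Example #4) would be:

    @staticmethod
    def _make_env():
        # Assumed helper, not shown in the original snippet: DDPG needs
        # continuous actions, so the continuous MountainCar variant is loaded.
        return tf_py_environment.TFPyEnvironment(
            suite_gym.load('MountainCarContinuous-v0'))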
Example #6
    def __init__(
            self,
            # Params Q network
            fc_layer_params=(100, ),
            # Params for training
            learning_rate=0.01,
            agent_discount=0.99,
            mini_batch_size=1,
            num_iterations=10000,
            gradient_clipping=None,
            trajectory_training_window=100,
            log_interval=200,
            # Param for simulated environments
            envs_num=10,
            runs_num=10,
            steps_num_per_run=1000,
            # Params for evaluation
            eval_interval=1,
            num_eval_episodes=100,
            # Params for data collection
            eps_start=1.0,
            eps_final=0.1,
            eps_steps=10000,
            initial_collect_steps=3000,
            tb_path=None):

        # store configs used during the training run
        self._runs_num = runs_num
        self._eval_interval = eval_interval
        self._num_eval_episodes = num_eval_episodes

        # setup rl_app
        envs = [IndexedTFEnv(self._make_env(), i) for i in range(0, envs_num)]
        application_name = "CartPole-example"
        version = "%s" % str(time.time())

        # setup training engine
        training_interval = timedelta(days=1)
        start_dt = datetime(year=2019, month=8, day=7, hour=10)
        self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                                trajectory_training_window,
                                                application_name, version)
        if tb_path:
            self._engine_config.tensorboard_path = tb_path
        # setup app
        training_config = {
            "n_step": 1,
            "num_iterations": num_iterations,
            # "agent_discount": agent_discount,
            "mini_batch_size": mini_batch_size,
            # "eps_start": eps_start,
            # "eps_final": eps_final,
            # "eps_steps": eps_steps,
            "initial_collect_steps": initial_collect_steps,
            "log_interval": log_interval
        }
        self._rl_app = TFEnvRLApplication(envs, training_config,
                                          steps_num_per_run, start_dt,
                                          training_interval)

        @staticmethod
        def init_agent():
            """ a DQN agent is set by default in the application"""
            time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
            agent = SignAgent(self._rl_app.observation_spec,
                              self._rl_app.action_spec, time_step_spec)
            agent.initialize()
            logger.info("tf_agent initialization is complete")

            # Optimize by wrapping some of the code in a graph using TF function.
            agent.train = common.function(agent.train)

            return agent

        self._rl_app.init_agent = init_agent.__get__(object)
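Examples #3 through #6 all follow the same pattern: build a list of IndexedTFEnv environments, assemble a training_config dict, construct a TFEnvRLApplication, and then override its init_agent hook with an agent factory. A stripped-down sketch of that hook, with a hypothetical build_my_agent helper standing in for the agent construction, might look like:

def init_agent():
    # Hypothetical factory; any initialized TF-Agents agent can be returned here.
    agent = build_my_agent()  # illustrative helper, not part of the source
    agent.initialize()
    # Wrap train() in a TF graph for speed, as the examples above do.
    agent.train = common.function(agent.train)
    return agent

rl_app.init_agent = init_agent  # the examples above use staticmethod + __get__ to the same effect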