def setUp(self):
        super(TickerTraderWorkerTests, self).setUp()

        self.discount_factor = 0.99
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.global_counter = itertools.count()

        self.batch_size = 16
        self.num_assets = 2
        self.num_actions = 3
        self.input_size = 1 + self.num_assets * 3  # cash + (quantity, price, vol) * n_assets
        self.temporal_size = self.num_assets * 2
        self.T = 10

        with tf.variable_scope("global"):
            self.global_policy_net = DiscreteAndContPolicyEstimator(
                self.num_assets,
                static_size=self.input_size,
                temporal_size=self.temporal_size,
                shared_layer=lambda x_t, x: rnn_graph_lstm(
                    x_t, x, 32, 1, True))
            self.global_value_net = ValueEstimator(
                static_size=self.input_size,
                temporal_size=self.temporal_size,
                shared_layer=lambda x_t, x: rnn_graph_lstm(
                    x_t, x, 32, 1, True),
                reuse=True,
                num_actions=self.num_actions)

        self.shared_layer = lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True)
Example #2
    def setUp(self):
        super(GridWorkerTests, self).setUp()

        self.discount_factor = 0.99
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.global_counter = itertools.count()

        self.batch_size = 16
        self.num_outputs = 1
        self.num_choices = 3
        self.input_size = 2
        self.temporal_size = 2
        self.T = 10

        with tf.variable_scope("global"):
            self.global_policy_net = DiscretePolicyEstimator(
                self.num_outputs,
                self.num_choices,
                static_size=self.input_size,
                temporal_size=self.temporal_size,
                shared_layer=lambda x_t, x: rnn_graph_lstm(
                    x_t, x, 32, 1, True))
            self.global_value_net = ValueEstimator(
                static_size=self.input_size,
                temporal_size=self.temporal_size,
                shared_layer=lambda x_t, x: rnn_graph_lstm(
                    x_t, x, 32, 1, True),
                reuse=True,
            )

        self.shared_layer = lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True)
Example #3
    def setUp(self):
        super(PolicyMonitorTest, self).setUp()

        self.batch_size = 16
        self.num_actions = 1
        self.input_size = 2
        self.temporal_size = 2
        self.T = 10

        self.env = make_env()
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
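        # Summaries are written to a throw-away temporary directory.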
        self.summary_writer = tf.summary.FileWriter(tempfile.mkdtemp())

        with tf.variable_scope("global"):
            self.global_policy_net = GaussianPolicyEstimator(
                self.num_actions,
                static_size=self.input_size,
                temporal_size=self.temporal_size,
                shared_layer=lambda x_t, x: rnn_graph_lstm(
                    x_t, x, 32, 1, True))
            self.global_value_net = ValueEstimator(
                static_size=self.input_size,
                temporal_size=self.temporal_size,
                shared_layer=lambda x_t, x: rnn_graph_lstm(
                    x_t, x, 32, 1, True),
                reuse=True)
Example #4
    def predict_test(self):
        global_step = tf.Variable(0, name='global_step', trainable=False)
        estimator = ValueEstimator(
            static_size=self.input_size,
            temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            learning_rate=1e-3)

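        # grads_and_vars yields (gradient, variable) pairs; keep only the gradient
        # tensors so they can be evaluated and fed back into train_op below.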
        grads = [g for g, _ in estimator.grads_and_vars]

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())

            # Run feeds
            feed_dict = {
                estimator.states: self.states,
                estimator.history: self.temporal_states,
                estimator.targets: self.targets
            }
            losses = []
            for _ in range(1000):
                loss = sess.run(estimator.loss, feed_dict)
                pred = sess.run(estimator.predictions, feed_dict)
                grads_ = sess.run(grads, feed_dict)

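                # Feeding the evaluated gradient arrays back in for their tensors
                # makes train_op apply exactly these values.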
                grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
                _ = sess.run(estimator.train_op, grad_feed_dict)
                losses.append(loss)

            # Assertions
            self.assertLess(loss, 1e-1)
            self.assertGreater(loss, 0.)
            self.assertEqual(pred['logits'].shape, (self.batch_size, ))
            self.assertLess(losses[-1], losses[0])
Example #5
    def learn_policy_test(self):

        global_step = tf.Variable(0, name='global_step', trainable=False)
        estimator = GaussianPolicyEstimator(
            self.num_actions,
            static_size=self.input_size,
            temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            learning_rate=1e-3,
            seed=1692)

        grads = [g for g, _ in estimator.grads_and_vars]

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())

            # Run feeds
            for _ in range(1000):
                feed_dict = {
                    estimator.states: self.states,
                    estimator.history: self.temporal_states,
                    estimator.advantages: np.ones_like(self.advantage),
                    estimator.actions: self.actions
                }
                pred = sess.run(estimator.predictions, feed_dict)

                grads_ = sess.run(grads, feed_dict)

                grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
                _ = sess.run(estimator.train_op, grad_feed_dict)

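        # With unit advantages the policy-gradient update reduces to maximizing the
        # likelihood of the sampled actions, so mu should end up close to them.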
        self.assertLess(np.mean(np.abs(pred['mu'] - self.actions)), 0.1)
Example #6
    def gaussian_predict_test(self):
        global_step = tf.Variable(0, name='global_step', trainable=False)
        estimator = GaussianPolicyEstimator(
            self.num_actions, static_size=self.input_size, temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True)
        )

        grads = [g for g, _ in estimator.grads_and_vars]

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())

            # Run feeds
            losses = []
            for _ in range(10):
                feed_dict = {
                    estimator.states: self.states,
                    estimator.history: self.temporal_states,
                    estimator.advantages: self.advantage,
                    estimator.actions: self.actions
                }
                loss = sess.run(estimator.loss, feed_dict)
                losses.append(loss)
                pred = sess.run(estimator.predictions, feed_dict)

                grads_ = sess.run(grads, feed_dict)

                grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
                _ = sess.run(estimator.train_op, grad_feed_dict)

            # Assertions
            self.assertLess(losses[-1], losses[0])
            np.testing.assert_array_less(0., pred['sigma'])
            self.assertEqual(pred['mu'].shape[1], self.num_actions)
            self.assertEqual(pred['sigma'].shape[1], self.num_actions)
Example #7
    def learn_policy_test(self):
        tf.Variable(0, name='global_step', trainable=False)
        estimator = DiscreteAndContPolicyEstimator(
            self.n_assets,
            static_size=self.input_size,
            temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            seed=1692,
            learning_rate=1e-3)

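        # Helper for fancy indexing: build an index tuple that selects, along `axis`,
        # the entry given by `idx` at every other coordinate.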
        def all_idx(idx, axis):
            grid = np.ogrid[tuple(map(slice, idx.shape))]
            grid.insert(axis, idx)
            return tuple(grid)

        grads = [g for g, _ in estimator.grads_and_vars]

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())

            # Run feeds
            for _ in range(1000):
                feed_dict = {
                    estimator.states: self.states,
                    estimator.history: self.temporal_states,
                    estimator.advantages: np.ones_like(self.advantage),
                    estimator.actions: self.actions,
                    estimator.discrete_actions: self.discrete_actions
                }
                pred = sess.run(estimator.predictions, feed_dict)

                grads_ = sess.run(grads, feed_dict)

                grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
                _ = sess.run(estimator.train_op, grad_feed_dict)

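        # After training with unit advantages, the discrete head should concentrate
        # its probability on the taken actions and the continuous means should track them.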
        # index 3D probs with 2D array of choices
        prob_optimal_choice = pred['probs'][all_idx(self.discrete_actions, 2)]
        cont_action_optimal_choice = pred['mu'][all_idx(
            self.discrete_actions, 2)]

        self.assertLess(0.9, prob_optimal_choice.mean())
        self.assertLess(
            np.mean(np.abs(cont_action_optimal_choice - self.actions)), 0.2)
Example #8
    def policy_monitor_worker_equal(self):
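        # Both the worker and the PolicyMonitor roll out identically seeded copies
        # of the env with the same policy, so their first ten rewards should match.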

        global_counter = itertools.count()
        worker_env = make_env()
        worker_env.seed(1692)
        worker = SolowWorker(
            'test_worker',
            env=worker_env,
            policy_net=self.global_policy_net,
            value_net=None,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            global_counter=global_counter,
        )

        env = make_env()
        pe = PolicyMonitor(env=env,
                           state_processor=SolowStateProcessor(),
                           global_policy_net=self.global_policy_net,
                           summary_writer=self.summary_writer,
                           num_actions=self.num_actions,
                           input_size=self.input_size,
                           temporal_size=self.temporal_size)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())

            worker.state = worker_env.reset()
            worker.history.append(worker.process_state(worker.state))

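            # Sync the worker's parameters with the global networks before rolling out.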
            sess.run(worker.copy_params_op)

            transitions = worker.run_n_steps(10, sess, stochastic=False)
            worker_rewards = [t.reward for t in transitions[0]]

            pe.env = make_env()
            pe.env.seed(1692)
            pe.policy_net = worker.policy_net
            total_reward, episode_length, rewards = pe.eval_once(sess)
            monitor_rewards = rewards[:10]

        np.testing.assert_almost_equal(monitor_rewards,
                                       worker_rewards,
                                       decimal=4)
Example #9
q = 1

register_solow_env(p, q)

with tf.device("/cpu:0"):

    # Keeps track of the number of updates we've performed
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # Global policy and value nets
    with tf.variable_scope("global"):
        policy_net = GaussianPolicyEstimator(
            NUM_ACTIONS,
            static_size=INPUT_SIZE,
            temporal_size=TEMPORAL_SIZE,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
        )
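        # reuse=True makes the value net reuse variables already created under the
        # "global" scope (presumably the shared rnn_graph_lstm layer) rather than
        # building a second copy.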
        value_net = ValueEstimator(
            static_size=INPUT_SIZE,
            temporal_size=TEMPORAL_SIZE,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            reuse=True,
            scale=100.,
        )

    # Global step iterator
    global_counter = itertools.count()

    # Create worker graphs
    workers = []
    for worker_id in range(NUM_WORKERS):